Compare revisions: manon.arfib/birdcall-detection
Commits on Source (2), showing 2449 additions and 0 deletions.
input/*.csv
input/audio_files/**
output/**
**/__pycache__
train/temp/train/**
train/temp/val/**
train/data_training/audio_files/**
train/data_training/*.csv
prepare:
python creation_datasets/create_csv.py France
python creation_datasets/download_files.py France.csv
rm -rf train/data_training/birds.csv
mv creation_datasets/fichiers_csv/France.csv train/data_training/birds.csv
rm -rf train/data_training/audio_files/
mv creation_datasets/audio_files_France train/data_training/audio_files/
python preload_training_data.py
train:
python train.py -m all
To use the AI:
Put in the input folder:
an audio_files folder with the audio files, and a csv file listing the audios you want to test. The format is:
audio_id
audio1
audio2
...
No extension in the audio_id column.
Note: the csv can have multiple columns, but only audio_id is considered.
To run the training:
Download and preprocess the dataset: in a terminal, go to the "birdcall-detection" folder, then type "make prepare".
Wait for the download to finish. You can then launch the training with "make train".
If you do not have enough RAM, it is better to train the models one by one. To do so, run (still in the "birdcall-detection" folder):
python train.py -m 1
python train.py -m 2
python train.py -m 3
python train.py -m 4
Note: if you want to build your own dataset, put in the train/data_training folder an audio_files folder and a csv in the following format:
cnt,en,id,length
**,class1,audio1.mp3,**
**,class1,audio2.mp3,**
...
**,class2,audio4.wav,**
...
where audio*.* is just the name of the corresponding audio file and ** are placeholder values that must be present but whose content does not matter (a minimal sketch for generating such a csv is given below).
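
For reference, a minimal, hypothetical sketch (not part of the repository) that builds such a csv with pandas, assuming the audio files already sit in train/data_training/audio_files/ and that the class name is the prefix of each file name (adapt the "en" line to your own naming scheme):

import librosa
import pandas as pd
from pathlib import Path

rows = []
for path in sorted(Path("train/data_training/audio_files").iterdir()):
    rows.append({
        "cnt": "xx",                    # placeholder column, content irrelevant
        "en": path.stem.split("_")[0],  # assumption: class name = file-name prefix
        "id": path.name,                # audio file name, extension included
        "length": librosa.get_duration(path=str(path)),
    })
pd.DataFrame(rows, columns=["cnt", "en", "id", "length"]).to_csv(
    "train/data_training/birds.csv", index=False)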
import torch
import pandas as pd
from pathlib import Path
import numpy as np
from fastprogress import progress_bar
import warnings
from contextlib import contextmanager
import time
from src.models import models, AttBlock
from src.preproc import clip_to_image
PERIOD = 30
# Arbitrary
ratio = {
"ref2_th03": 0.25/0.77,
"ref2_th04": 0.14/0.77,
"eff_th04": 0.13/0.77,
"ext": 0.25/0.77
}
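# These weights sum to (0.25 + 0.14 + 0.13 + 0.25) / 0.77 = 1, so the weighted sum
# of the sigmoid outputs computed below stays in [0, 1].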
all_time_duration = 0
# Per-class thresholds could be determined, but this is not done here
thresholds = {}
inv_bird_call = np.load('inv_bird_code.npy', allow_pickle=True)
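# inv_bird_call maps a class index to its ebird code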
@contextmanager
def timer(name: str):
t0 = time.time()
msg = f"[{name}] start"
print(msg)
yield
global all_time_duration
all_time_duration += time.time() - t0
msg = f"[{name}] done in {time.time() - t0:.2f} s"
print(msg)
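# Usage: with timer("some step"): ...
# Prints start/end messages and accumulates the total in all_time_duration.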
def prediction_for_clip(test_df: pd.DataFrame,
clip: Path, models):
"""Given a clip, the function predict the bird singing"""
images = clip_to_image(clip)
array = np.asarray(images)
tensors = torch.from_numpy(array)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
estimated_event_list = []
global_time = 0.0
audio_id = test_df["audio_id"].values[0]
for image in progress_bar(tensors):
image = image[None, :]/255.0
image = image.to(device)
outputs = {}
with torch.no_grad():
for key in models:
prediction = models[key](image)
framewise_outputs = prediction["framewise_output"].detach(
).cpu().numpy()[0]
outputs[key] = framewise_outputs
key = list(outputs.keys())[0]
framewise_outputs = np.zeros_like(outputs[key], dtype=np.float32)
for key in outputs:
framewise_outputs += ratio[key] * outputs[key]
thresholded = np.zeros_like(framewise_outputs)
for i in range(len(inv_bird_call)):
thresholded[:, i] = framewise_outputs[:, i] >= 0.01
            # thresholded[:, i] = framewise_outputs[:, i] >= thresholds[inv_bird_call[i]]  # uncomment if per-class thresholds are defined
sec_per_frame = PERIOD / thresholded.shape[0]
for target_idx in range(thresholded.shape[1]):
if thresholded[:, target_idx].mean() == 0:
pass
else:
detected = np.argwhere(thresholded[:, target_idx]).reshape(-1)
head_idx = 0
tail_idx = 0
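                # walk through the detected frame indices and split them into
                # contiguous runs; each run becomes one estimated event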
while True:
if (tail_idx + 1 == len(detected)) or (
detected[tail_idx + 1] -
detected[tail_idx] != 1):
onset = sec_per_frame * detected[
head_idx] + global_time
offset = sec_per_frame * detected[
tail_idx] + global_time
onset_idx = detected[head_idx]
offset_idx = detected[tail_idx]
                        # offset_idx + 1 avoids an empty slice when the event lasts a single frame
                        max_confidence = framewise_outputs[
                            onset_idx:offset_idx + 1, target_idx].max()
                        mean_confidence = framewise_outputs[
                            onset_idx:offset_idx + 1, target_idx].mean()
estimated_event = {
"audio_id": audio_id,
"ebird_code": inv_bird_call[target_idx],
"onset": onset,
"offset": offset,
"max_confidence": max_confidence,
"mean_confidence": mean_confidence
}
estimated_event_list.append(estimated_event)
head_idx = tail_idx + 1
tail_idx = tail_idx + 1
if head_idx >= len(detected):
break
else:
tail_idx += 1
global_time += PERIOD
prediction_df = pd.DataFrame(estimated_event_list)
return prediction_df
def prediction(test_df: pd.DataFrame,
test_audio: Path,
models):
""""given the pass of a folder containing audios and a csv corresponding, it returns a prediction for each audio which need a postprocess"""
unique_audio_id = test_df.audio_id.unique()
warnings.filterwarnings("ignore")
prediction_dfs = []
for audio_id in unique_audio_id:
clip_path = test_audio + audio_id
test_df_for_audio_id = test_df.query(
f"audio_id == '{audio_id}'").reset_index(drop=True)
with timer(f"Prediction & load on {audio_id}"):
prediction_df = prediction_for_clip(test_df_for_audio_id,
clip=clip_path,
models=models)
prediction_dfs.append(prediction_df)
prediction_df = pd.concat(prediction_dfs, axis=0,
sort=False).reset_index(drop=True)
return prediction_df
def postproc(prediction_df, test):
"""Make the postprocessing"""
labels = {}
for audio_id, sub_df in prediction_df.groupby("audio_id"):
events = sub_df[["ebird_code", "mean_confidence"]].values
n_events = len(events)
bird_max_conf = np.max(events[:, 1])
for i in range(n_events):
if events[i][1] == bird_max_conf:
row_id = f"{audio_id}"
bird = events[i][0]
                labels[row_id] = {bird}
for key in labels:
labels[key] = " ".join(sorted(list(labels[key])))
row_ids = list(labels.keys())
birds = list(labels.values())
post_processed = pd.DataFrame({
"audio_id": row_ids,
"birds": birds})
all_row_id = test[["audio_id"]]
submission = all_row_id.merge(post_processed, on="audio_id", how="left")
submission = submission.fillna("nocall")
return submission
if __name__ == '__main__':
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
for model in models: # load the model
num_ftrs = models[model].fc1.in_features
models[model].att_block = AttBlock(
num_ftrs, len(inv_bird_call), activation="sigmoid")
        models[model].load_state_dict(torch.load(
            'weights_trained/' + model + '.pth', map_location=device))
models[model].to(device)
test = pd.read_csv("input/test.csv")
test_audio = "input/audio_files/"
test["audio_id"] = test["audio_id"].map(str)
prediction_df = prediction(
test_df=test, test_audio=test_audio, models=models)
    if not prediction_df.empty:
        submission = postproc(prediction_df, test)
    else:
        # nothing detected in any clip: label every audio "nocall"
        submission = test[["audio_id"]].copy()
        submission["birds"] = "nocall"
    submission.to_csv("output/submission.csv", index=False)
print(f"all done in {all_time_duration:.2f} s")
from src.preproc import transform_all_images
if __name__ == '__main__':
# Make the preprocessing and save .png in the train/temp/ folder
transform_all_images('train/data_training/', 'audio_files/', 'birds.csv')
anaconda-client==1.11.1
anaconda-project==0.11.1
appdirs==1.4.4
astroid==2.15.3
asttokens==2.2.1
attrs==22.1.0
audioread==3.0.0
backcall==0.2.0
backports.functools-lru-cache==1.6.4
backports.tempfile==1.0
backports.weakref==1.0.post1
beautifulsoup4==4.11.1
boltons==23.0.0
Bottleneck==1.3.7
brotlipy==0.7.0
certifi==2022.9.24
cffi==1.15.1
chardet==4.0.0
charset-normalizer==2.0.4
click==8.0.4
clyent==1.2.2
colorama==0.4.6
comm==0.1.3
conda-content-trust==0.1.3
conda-pack==0.6.0
conda-package-handling==2.0.2
conda_package_streaming==0.7.0
conda-repo-cli==1.0.41
conda-verify==3.4.2
contourpy==1.0.7
cryptography==39.0.1
cycler==0.11.0
debugpy==1.6.6
decorator==5.1.1
defusedxml==0.7.1
dill==0.3.6
executing==1.2.0
fastjsonschema==2.16.2
fastprogress==1.0.0
filelock==3.9.0
flit_core==3.8.0
fonttools==4.39.3
future==0.18.3
glob2==0.7
idna==3.4
importlib-metadata==6.1.0
ipykernel==6.15.0
ipython==8.11.0
isort==5.12.0
jedi==0.18.2
Jinja2==3.1.2
joblib==1.2.0
jsonschema==4.17.3
jupyter_client==8.1.0
jupyter_core==5.3.0
keyboard==0.13.5
kiwisolver==1.4.4
lazy_loader==0.2
lazy-object-proxy==1.9.0
libarchive-c==2.9
librosa==0.10.0.post2
llvmlite==0.39.1
MarkupSafe==2.1.1
matplotlib==3.7.1
matplotlib-inline==0.1.6
mccabe==0.7.0
menuinst==1.4.19
mkl-fft==1.3.1
mkl-random==1.2.2
mkl-service==2.4.0
mpmath==1.2.1
msgpack==1.0.5
nbformat==5.7.0
nest-asyncio==1.5.6
networkx==2.8.4
numba==0.56.4
numexpr==2.8.4
numpy==1.23.5
packaging==23.0
panda==0.3.1
pandas==1.5.3
parso==0.8.3
pathlib==1.0.1
pickleshare==0.7.5
Pillow==9.4.0
pip==22.3.1
pkginfo==1.9.6
platformdirs==3.2.0
pluggy==1.0.0
ply==3.11
pooch==1.6.0
prompt-toolkit==3.0.38
psutil==5.9.0
pure-eval==0.2.2
pycosat==0.6.4
pycparser==2.21
Pygments==2.14.0
PyJWT==2.4.0
pylint==2.17.2
pyOpenSSL==23.0.0
pyparsing==3.0.9
PyQt5==5.15.7
PyQt5-sip==12.11.0
pyrsistent==0.18.0
PySocks==1.7.1
PySoundFile==0.9.0.post1
python-dateutil==2.8.2
pytz==2022.7
pywin32==306
PyYAML==6.0
pyzmq==25.0.2
QtPy==2.2.0
requests==2.28.1
resampy==0.4.2
ruamel.yaml==0.17.21
ruamel.yaml.clib==0.2.6
ruamel-yaml-conda==0.17.21
scikit-learn==1.2.2
scipy==1.10.1
setuptools==65.6.3
sip==6.6.2
six==1.16.0
soundfile==0.12.1
soupsieve==2.3.2.post1
soxr==0.3.4
stack-data==0.6.2
sympy==1.11.1
threadpoolctl==3.1.0
toml==0.10.2
tomli==2.0.1
tomlkit==0.11.7
toolz==0.12.0
torch==2.0.0
torchaudio==2.0.0
torchsummary==1.5.1
torchvision==0.15.0
tornado==6.2
tqdm==4.65.0
traitlets==5.7.1
typing_extensions==4.4.0
ujson==5.4.0
urllib3==1.26.14
wcwidth==0.2.6
wheel==0.38.4
win-inet-pton==1.1.0
wincertstore==0.2
wrapt==1.15.0
zipp==3.15.0
zstandard==0.19.0
import math
import cv2
import audioread
import collections
import logging
import os
import random
import re
import time
import warnings
import librosa
import numpy as np
import pandas as pd
import soundfile as sf
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
from contextlib import contextmanager
from functools import partial
from pathlib import Path
from typing import Optional
from fastprogress import progress_bar
from sklearn.metrics import f1_score
from torch.nn import Conv2d, Module, Linear, BatchNorm2d, ReLU
from torch.nn.modules.utils import _pair
import torch.optim as optim
def init_layer(layer):
nn.init.xavier_uniform_(layer.weight)
if hasattr(layer, "bias"):
if layer.bias is not None:
layer.bias.data.fill_(0.)
def init_bn(bn):
bn.bias.data.fill_(0.)
bn.weight.data.fill_(1.0)
def interpolate(x: torch.Tensor, ratio: int):
"""Interpolate data in time domain. This is used to compensate the
resolution reduction in downsampling of a CNN.
Args:
x: (batch_size, time_steps, classes_num)
ratio: int, ratio to interpolate
Returns:
upsampled: (batch_size, time_steps * ratio, classes_num)
"""
(batch_size, time_steps, classes_num) = x.shape
upsampled = x[:, :, None, :].repeat(1, 1, ratio, 1)
upsampled = upsampled.reshape(batch_size, time_steps * ratio, classes_num)
return upsampled
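# e.g. interpolate(x, 3) turns a (1, 4, n) tensor into (1, 12, n): every frame is repeated 3 times.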
def pad_framewise_output(framewise_output: torch.Tensor, frames_num: int):
"""Pad framewise_output to the same length as input frames. The pad value
is the same as the value of the last frame.
Args:
framewise_output: (batch_size, frames_num, classes_num)
    frames_num: int, target number of frames after padding
Outputs:
output: (batch_size, frames_num, classes_num)
"""
    # padding tensor: the last frame repeated until frames_num is reached
    pad = framewise_output[:, -1:, :].repeat(
        1, frames_num - framewise_output.shape[1], 1)
    # (batch_size, frames_num, classes_num)
    output = torch.cat((framewise_output, pad), dim=1)
    return output
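# e.g. a (1, 10, n) framewise output padded to frames_num=12 gets its last frame repeated twice.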
class ConvBlock(nn.Module):
def __init__(self, in_channels: int, out_channels: int):
super().__init__()
self.conv1 = nn.Conv2d(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=(3, 3),
stride=(1, 1),
padding=(1, 1),
bias=False)
self.conv2 = nn.Conv2d(
in_channels=out_channels,
out_channels=out_channels,
kernel_size=(3, 3),
stride=(1, 1),
padding=(1, 1),
bias=False)
self.bn1 = nn.BatchNorm2d(out_channels)
self.bn2 = nn.BatchNorm2d(out_channels)
self.init_weight()
def init_weight(self):
init_layer(self.conv1)
init_layer(self.conv2)
init_bn(self.bn1)
init_bn(self.bn2)
def forward(self, input, pool_size=(2, 2), pool_type='avg'):
x = input
x = F.relu_(self.bn1(self.conv1(x)))
x = F.relu_(self.bn2(self.conv2(x)))
if pool_type == 'max':
x = F.max_pool2d(x, kernel_size=pool_size)
elif pool_type == 'avg':
x = F.avg_pool2d(x, kernel_size=pool_size)
elif pool_type == 'avg+max':
x1 = F.avg_pool2d(x, kernel_size=pool_size)
x2 = F.max_pool2d(x, kernel_size=pool_size)
x = x1 + x2
else:
raise Exception('Incorrect argument!')
return x
class AttBlock(nn.Module):
def __init__(self,
in_features: int,
out_features: int,
activation="linear",
temperature=1.0):
super().__init__()
self.activation = activation
self.temperature = temperature
self.att = nn.Conv1d(
in_channels=in_features,
out_channels=out_features,
kernel_size=1,
stride=1,
padding=0,
bias=True)
self.cla = nn.Conv1d(
in_channels=in_features,
out_channels=out_features,
kernel_size=1,
stride=1,
padding=0,
bias=True)
self.init_weights()
def init_weights(self):
init_layer(self.att)
def forward(self, x):
# x: (n_samples, n_in, n_time)
norm_att = torch.softmax(torch.tanh(self.att(x)), dim=-1)
cla = self.nonlinear_transform(self.cla(x))
x = torch.sum(norm_att * cla, dim=2)
return x, norm_att, cla
def nonlinear_transform(self, x):
if self.activation == 'linear':
return x
elif self.activation == 'sigmoid':
return torch.sigmoid(x)
class SplAtConv2d(Module):
"""Split-Attention Conv2d
"""
def __init__(self, in_channels, channels, kernel_size, stride=(1, 1), padding=(0, 0),
dilation=(1, 1), groups=1, bias=True,
radix=2, reduction_factor=4,
rectify=False, rectify_avg=False, norm_layer=None,
dropblock_prob=0.0, **kwargs):
super(SplAtConv2d, self).__init__()
padding = _pair(padding)
self.rectify = rectify and (padding[0] > 0 or padding[1] > 0)
self.rectify_avg = rectify_avg
inter_channels = max(in_channels*radix//reduction_factor, 32)
self.radix = radix
self.cardinality = groups
self.channels = channels
self.dropblock_prob = dropblock_prob
if self.rectify:
from rfconv import RFConv2d
self.conv = RFConv2d(in_channels, channels*radix, kernel_size, stride, padding, dilation,
groups=groups*radix, bias=bias, average_mode=rectify_avg, **kwargs)
else:
self.conv = Conv2d(in_channels, channels*radix, kernel_size, stride, padding, dilation,
groups=groups*radix, bias=bias, **kwargs)
self.use_bn = norm_layer is not None
if self.use_bn:
self.bn0 = norm_layer(channels*radix)
self.relu = ReLU(inplace=True)
self.fc1 = Conv2d(channels, inter_channels, 1, groups=self.cardinality)
if self.use_bn:
self.bn1 = norm_layer(inter_channels)
self.fc2 = Conv2d(inter_channels, channels*radix,
1, groups=self.cardinality)
if dropblock_prob > 0.0:
self.dropblock = DropBlock2D(dropblock_prob, 3)
self.rsoftmax = rSoftMax(radix, groups)
def forward(self, x):
x = self.conv(x)
if self.use_bn:
x = self.bn0(x)
if self.dropblock_prob > 0.0:
x = self.dropblock(x)
x = self.relu(x)
batch, rchannel = x.shape[:2]
if self.radix > 1:
if torch.__version__ < '1.5':
splited = torch.split(x, int(rchannel//self.radix), dim=1)
else:
splited = torch.split(x, rchannel//self.radix, dim=1)
gap = sum(splited)
else:
gap = x
gap = F.adaptive_avg_pool2d(gap, 1)
gap = self.fc1(gap)
if self.use_bn:
gap = self.bn1(gap)
gap = self.relu(gap)
atten = self.fc2(gap)
atten = self.rsoftmax(atten).view(batch, -1, 1, 1)
if self.radix > 1:
if torch.__version__ < '1.5':
attens = torch.split(atten, int(rchannel//self.radix), dim=1)
else:
attens = torch.split(atten, rchannel//self.radix, dim=1)
out = sum([att*split for (att, split) in zip(attens, splited)])
else:
out = atten * x
return out.contiguous()
class rSoftMax(nn.Module):
def __init__(self, radix, cardinality):
super().__init__()
self.radix = radix
self.cardinality = cardinality
def forward(self, x):
batch = x.size(0)
if self.radix > 1:
x = x.view(batch, self.cardinality, self.radix, -1).transpose(1, 2)
x = F.softmax(x, dim=1)
x = x.reshape(batch, -1)
else:
x = torch.sigmoid(x)
return x
class DropBlock2D(object):
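    # placeholder: instantiating it raises, so dropblock_prob must stay 0.0 in this code base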
def __init__(self, *args, **kwargs):
raise NotImplementedError
class GlobalAvgPool2d(nn.Module):
def __init__(self):
"""Global average pooling over the input's spatial dimensions"""
super(GlobalAvgPool2d, self).__init__()
def forward(self, inputs):
return nn.functional.adaptive_avg_pool2d(inputs, 1).view(inputs.size(0), -1)
class Bottleneck(nn.Module):
"""ResNet Bottleneck
"""
# pylint: disable=unused-argument
expansion = 4
def __init__(self, inplanes, planes, stride=1, downsample=None,
radix=1, cardinality=1, bottleneck_width=64,
avd=False, avd_first=False, dilation=1, is_first=False,
rectified_conv=False, rectify_avg=False,
norm_layer=None, dropblock_prob=0.0, last_gamma=False):
super(Bottleneck, self).__init__()
group_width = int(planes * (bottleneck_width / 64.)) * cardinality
self.conv1 = nn.Conv2d(inplanes, group_width,
kernel_size=1, bias=False)
self.bn1 = norm_layer(group_width)
self.dropblock_prob = dropblock_prob
self.radix = radix
self.avd = avd and (stride > 1 or is_first)
self.avd_first = avd_first
if self.avd:
self.avd_layer = nn.AvgPool2d(3, stride, padding=1)
stride = 1
if dropblock_prob > 0.0:
self.dropblock1 = DropBlock2D(dropblock_prob, 3)
if radix == 1:
self.dropblock2 = DropBlock2D(dropblock_prob, 3)
self.dropblock3 = DropBlock2D(dropblock_prob, 3)
if radix >= 1:
self.conv2 = SplAtConv2d(
group_width, group_width, kernel_size=3,
stride=stride, padding=dilation,
dilation=dilation, groups=cardinality, bias=False,
radix=radix, rectify=rectified_conv,
rectify_avg=rectify_avg,
norm_layer=norm_layer,
dropblock_prob=dropblock_prob)
elif rectified_conv:
from rfconv import RFConv2d
self.conv2 = RFConv2d(
group_width, group_width, kernel_size=3, stride=stride,
padding=dilation, dilation=dilation,
groups=cardinality, bias=False,
average_mode=rectify_avg)
self.bn2 = norm_layer(group_width)
else:
self.conv2 = nn.Conv2d(
group_width, group_width, kernel_size=3, stride=stride,
padding=dilation, dilation=dilation,
groups=cardinality, bias=False)
self.bn2 = norm_layer(group_width)
self.conv3 = nn.Conv2d(
group_width, planes * 4, kernel_size=1, bias=False)
self.bn3 = norm_layer(planes*4)
if last_gamma:
from torch.nn.init import zeros_
zeros_(self.bn3.weight)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.dilation = dilation
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
if self.dropblock_prob > 0.0:
out = self.dropblock1(out)
out = self.relu(out)
if self.avd and self.avd_first:
out = self.avd_layer(out)
out = self.conv2(out)
if self.radix == 0:
out = self.bn2(out)
if self.dropblock_prob > 0.0:
out = self.dropblock2(out)
out = self.relu(out)
if self.avd and not self.avd_first:
out = self.avd_layer(out)
out = self.conv3(out)
out = self.bn3(out)
if self.dropblock_prob > 0.0:
out = self.dropblock3(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class ResNest(nn.Module):
"""ResNet Variants
Parameters
----------
block : Block
Class for the residual block. Options are BasicBlockV1, BottleneckV1.
layers : list of int
Numbers of layers in each block
classes : int, default 1000
Number of classification classes.
dilated : bool, default False
Applying dilation strategy to pretrained ResNet yielding a stride-8 model,
typically used in Semantic Segmentation.
    norm_layer : object
        Normalization layer used in the backbone network (default: :class:`torch.nn.BatchNorm2d`;
        a synchronized cross-GPU BatchNormalization can be substituted).
Reference:
- He, Kaiming, et al. "Deep residual learning for image recognition." Proceedings of the IEEE conference on computer vision and pattern recognition. 2016.
- Yu, Fisher, and Vladlen Koltun. "Multi-scale context aggregation by dilated convolutions."
"""
# pylint: disable=unused-variable
def __init__(self, block, layers, radix=1, groups=1, bottleneck_width=64,
num_classes=1000, dilated=False, dilation=1,
deep_stem=False, stem_width=64, avg_down=False,
rectified_conv=False, rectify_avg=False,
avd=False, avd_first=False,
final_drop=0.0, dropblock_prob=0,
last_gamma=False, norm_layer=nn.BatchNorm2d):
self.cardinality = groups
self.bottleneck_width = bottleneck_width
# ResNet-D params
self.inplanes = stem_width*2 if deep_stem else 64
self.avg_down = avg_down
self.last_gamma = last_gamma
# ResNeSt params
self.radix = radix
self.avd = avd
self.avd_first = avd_first
super(ResNest, self).__init__()
self.rectified_conv = rectified_conv
self.rectify_avg = rectify_avg
if rectified_conv:
from rfconv import RFConv2d
conv_layer = RFConv2d
else:
conv_layer = nn.Conv2d
conv_kwargs = {'average_mode': rectify_avg} if rectified_conv else {}
if deep_stem:
self.conv1 = nn.Sequential(
conv_layer(3, stem_width, kernel_size=3, stride=2,
padding=1, bias=False, **conv_kwargs),
norm_layer(stem_width),
nn.ReLU(inplace=True),
conv_layer(stem_width, stem_width, kernel_size=3,
stride=1, padding=1, bias=False, **conv_kwargs),
norm_layer(stem_width),
nn.ReLU(inplace=True),
conv_layer(stem_width, stem_width*2, kernel_size=3,
stride=1, padding=1, bias=False, **conv_kwargs),
)
else:
self.conv1 = conv_layer(3, 64, kernel_size=7, stride=2, padding=3,
bias=False, **conv_kwargs)
self.bn1 = norm_layer(self.inplanes)
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.layer1 = self._make_layer(
block, 64, layers[0], norm_layer=norm_layer, is_first=False)
self.layer2 = self._make_layer(
block, 128, layers[1], stride=2, norm_layer=norm_layer)
if dilated or dilation == 4:
self.layer3 = self._make_layer(block, 256, layers[2], stride=1,
dilation=2, norm_layer=norm_layer,
dropblock_prob=dropblock_prob)
self.layer4 = self._make_layer(block, 512, layers[3], stride=1,
dilation=4, norm_layer=norm_layer,
dropblock_prob=dropblock_prob)
elif dilation == 2:
self.layer3 = self._make_layer(block, 256, layers[2], stride=2,
dilation=1, norm_layer=norm_layer,
dropblock_prob=dropblock_prob)
self.layer4 = self._make_layer(block, 512, layers[3], stride=1,
dilation=2, norm_layer=norm_layer,
dropblock_prob=dropblock_prob)
else:
self.layer3 = self._make_layer(block, 256, layers[2], stride=2,
norm_layer=norm_layer,
dropblock_prob=dropblock_prob)
self.layer4 = self._make_layer(block, 512, layers[3], stride=2,
norm_layer=norm_layer,
dropblock_prob=dropblock_prob)
self.avgpool = GlobalAvgPool2d()
self.drop = nn.Dropout(final_drop) if final_drop > 0.0 else None
self.fc = nn.Linear(512 * block.expansion, num_classes)
for m in self.modules():
if isinstance(m, nn.Conv2d):
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
m.weight.data.normal_(0, math.sqrt(2. / n))
elif isinstance(m, norm_layer):
m.weight.data.fill_(1)
m.bias.data.zero_()
def _make_layer(self, block, planes, blocks, stride=1, dilation=1, norm_layer=None,
dropblock_prob=0.0, is_first=True):
downsample = None
if stride != 1 or self.inplanes != planes * block.expansion:
down_layers = []
if self.avg_down:
if dilation == 1:
down_layers.append(nn.AvgPool2d(kernel_size=stride, stride=stride,
ceil_mode=True, count_include_pad=False))
else:
down_layers.append(nn.AvgPool2d(kernel_size=1, stride=1,
ceil_mode=True, count_include_pad=False))
down_layers.append(nn.Conv2d(self.inplanes, planes * block.expansion,
kernel_size=1, stride=1, bias=False))
else:
down_layers.append(nn.Conv2d(self.inplanes, planes * block.expansion,
kernel_size=1, stride=stride, bias=False))
down_layers.append(norm_layer(planes * block.expansion))
downsample = nn.Sequential(*down_layers)
layers = []
if dilation == 1 or dilation == 2:
layers.append(block(self.inplanes, planes, stride, downsample=downsample,
radix=self.radix, cardinality=self.cardinality,
bottleneck_width=self.bottleneck_width,
avd=self.avd, avd_first=self.avd_first,
dilation=1, is_first=is_first, rectified_conv=self.rectified_conv,
rectify_avg=self.rectify_avg,
norm_layer=norm_layer, dropblock_prob=dropblock_prob,
last_gamma=self.last_gamma))
elif dilation == 4:
layers.append(block(self.inplanes, planes, stride, downsample=downsample,
radix=self.radix, cardinality=self.cardinality,
bottleneck_width=self.bottleneck_width,
avd=self.avd, avd_first=self.avd_first,
dilation=2, is_first=is_first, rectified_conv=self.rectified_conv,
rectify_avg=self.rectify_avg,
norm_layer=norm_layer, dropblock_prob=dropblock_prob,
last_gamma=self.last_gamma))
else:
raise RuntimeError("=> unknown dilation size: {}".format(dilation))
self.inplanes = planes * block.expansion
for i in range(1, blocks):
layers.append(block(self.inplanes, planes,
radix=self.radix, cardinality=self.cardinality,
bottleneck_width=self.bottleneck_width,
avd=self.avd, avd_first=self.avd_first,
dilation=dilation, rectified_conv=self.rectified_conv,
rectify_avg=self.rectify_avg,
norm_layer=norm_layer, dropblock_prob=dropblock_prob,
last_gamma=self.last_gamma))
return nn.Sequential(*layers)
def forward(self, x):
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
x = self.avgpool(x)
# x = x.view(x.size(0), -1)
x = torch.flatten(x, 1)
if self.drop:
x = self.drop(x)
x = self.fc(x)
multilabel_proba = torch.sigmoid(x)
multiclass_proba = torch.softmax(x, dim=1)
return {
"logits": x,
"multilabel_proba": multilabel_proba,
"multiclass_proba": multiclass_proba
}
# An ordinary implementation of Swish function
class Swish(nn.Module):
def forward(self, x):
return x * torch.sigmoid(x)
# A memory-efficient implementation of Swish function
class SwishImplementation(torch.autograd.Function):
@staticmethod
def forward(ctx, i):
result = i * torch.sigmoid(i)
ctx.save_for_backward(i)
return result
@staticmethod
def backward(ctx, grad_output):
i = ctx.saved_tensors[0]
sigmoid_i = torch.sigmoid(i)
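        # d/dx [x * sigmoid(x)] = sigmoid(x) * (1 + x * (1 - sigmoid(x)))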
return grad_output * (sigmoid_i * (1 + i * (1 - sigmoid_i)))
class MemoryEfficientSwish(nn.Module):
def forward(self, x):
return SwishImplementation.apply(x)
def round_filters(filters, global_params):
"""Calculate and round number of filters based on width multiplier.
Use width_coefficient, depth_divisor and min_depth of global_params.
Args:
filters (int): Filters number to be calculated.
global_params (namedtuple): Global params of the model.
Returns:
new_filters: New filters number after calculating.
"""
multiplier = global_params.width_coefficient
if not multiplier:
return filters
# TODO: modify the params names.
# maybe the names (width_divisor,min_width)
# are more suitable than (depth_divisor,min_depth).
divisor = global_params.depth_divisor
min_depth = global_params.min_depth
filters *= multiplier
min_depth = min_depth or divisor # pay attention to this line when using min_depth
# follow the formula transferred from official TensorFlow implementation
new_filters = max(min_depth, int(
filters + divisor / 2) // divisor * divisor)
if new_filters < 0.9 * filters: # prevent rounding by more than 10%
new_filters += divisor
return int(new_filters)
def round_repeats(repeats, global_params):
"""Calculate module's repeat number of a block based on depth multiplier.
Use depth_coefficient of global_params.
Args:
repeats (int): num_repeat to be calculated.
global_params (namedtuple): Global params of the model.
Returns:
new repeat: New repeat number after calculating.
"""
multiplier = global_params.depth_coefficient
if not multiplier:
return repeats
# follow the formula transferred from official TensorFlow implementation
return int(math.ceil(multiplier * repeats))
def drop_connect(inputs, p, training):
"""Drop connect.
Args:
input (tensor: BCWH): Input of this structure.
p (float: 0.0~1.0): Probability of drop connection.
training (bool): The running mode.
Returns:
output: Output after drop connection.
"""
assert p >= 0 and p <= 1, 'p must be in range of [0,1]'
if not training:
return inputs
batch_size = inputs.shape[0]
keep_prob = 1 - p
# generate binary_tensor mask according to probability (p for 0, 1-p for 1)
random_tensor = keep_prob
random_tensor += torch.rand([batch_size, 1, 1, 1],
dtype=inputs.dtype, device=inputs.device)
binary_tensor = torch.floor(random_tensor)
output = inputs / keep_prob * binary_tensor
return output
def get_width_and_height_from_size(x):
"""Obtain height and width from x.
Args:
x (int, tuple or list): Data size.
Returns:
size: A tuple or list (H,W).
"""
if isinstance(x, int):
return x, x
if isinstance(x, list) or isinstance(x, tuple):
return x
else:
raise TypeError()
def calculate_output_image_size(input_image_size, stride):
"""Calculates the output image size when using Conv2dSamePadding with a stride.
Necessary for static padding. Thanks to mannatsingh for pointing this out.
Args:
input_image_size (int, tuple or list): Size of input image.
stride (int, tuple or list): Conv2d operation's stride.
Returns:
output_image_size: A list [H,W].
"""
if input_image_size is None:
return None
image_height, image_width = get_width_and_height_from_size(
input_image_size)
stride = stride if isinstance(stride, int) else stride[0]
image_height = int(math.ceil(image_height / stride))
image_width = int(math.ceil(image_width / stride))
return [image_height, image_width]
# Note:
# The following 'SamePadding' functions make output size equal ceil(input size/stride).
# Only when stride equals 1, can the output size be the same as input size.
# Don't be confused by their function names!
def get_same_padding_conv2d(image_size=None):
"""Chooses static padding if you have specified an image size, and dynamic padding otherwise.
Static padding is necessary for ONNX exporting of models.
Args:
image_size (int or tuple): Size of the image.
Returns:
Conv2dDynamicSamePadding or Conv2dStaticSamePadding.
"""
if image_size is None:
return Conv2dDynamicSamePadding
else:
return partial(Conv2dStaticSamePadding, image_size=image_size)
class Conv2dDynamicSamePadding(nn.Conv2d):
"""2D Convolutions like TensorFlow, for a dynamic image size.
The padding is operated in forward function by calculating dynamically.
"""
# Tips for 'SAME' mode padding.
# Given the following:
# i: width or height
# s: stride
# k: kernel size
# d: dilation
# p: padding
# Output after Conv2d:
# o = floor((i+p-((k-1)*d+1))/s+1)
# If o equals i, i = floor((i+p-((k-1)*d+1))/s+1),
# => p = (i-1)*s+((k-1)*d+1)-i
def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, groups=1, bias=True):
super().__init__(in_channels, out_channels,
kernel_size, stride, 0, dilation, groups, bias)
self.stride = self.stride if len(self.stride) == 2 else [
self.stride[0]] * 2
def forward(self, x):
ih, iw = x.size()[-2:]
kh, kw = self.weight.size()[-2:]
sh, sw = self.stride
# change the output size according to stride ! ! !
oh, ow = math.ceil(ih / sh), math.ceil(iw / sw)
pad_h = max((oh - 1) * self.stride[0] +
(kh - 1) * self.dilation[0] + 1 - ih, 0)
pad_w = max((ow - 1) * self.stride[1] +
(kw - 1) * self.dilation[1] + 1 - iw, 0)
if pad_h > 0 or pad_w > 0:
x = F.pad(x, [pad_w // 2, pad_w - pad_w //
2, pad_h // 2, pad_h - pad_h // 2])
return F.conv2d(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups)
class Conv2dStaticSamePadding(nn.Conv2d):
"""2D Convolutions like TensorFlow's 'SAME' mode, with the given input image size.
The padding mudule is calculated in construction function, then used in forward.
"""
# With the same calculation as Conv2dDynamicSamePadding
def __init__(self, in_channels, out_channels, kernel_size, stride=1, image_size=None, **kwargs):
super().__init__(in_channels, out_channels, kernel_size, stride, **kwargs)
self.stride = self.stride if len(self.stride) == 2 else [
self.stride[0]] * 2
# Calculate padding based on image size and save it
assert image_size is not None
ih, iw = (image_size, image_size) if isinstance(
image_size, int) else image_size
kh, kw = self.weight.size()[-2:]
sh, sw = self.stride
oh, ow = math.ceil(ih / sh), math.ceil(iw / sw)
pad_h = max((oh - 1) * self.stride[0] +
(kh - 1) * self.dilation[0] + 1 - ih, 0)
pad_w = max((ow - 1) * self.stride[1] +
(kw - 1) * self.dilation[1] + 1 - iw, 0)
if pad_h > 0 or pad_w > 0:
self.static_padding = nn.ZeroPad2d(
(pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2))
else:
self.static_padding = Identity()
def forward(self, x):
x = self.static_padding(x)
x = F.conv2d(x, self.weight, self.bias, self.stride,
self.padding, self.dilation, self.groups)
return x
def get_same_padding_maxPool2d(image_size=None):
"""Chooses static padding if you have specified an image size, and dynamic padding otherwise.
Static padding is necessary for ONNX exporting of models.
Args:
image_size (int or tuple): Size of the image.
Returns:
MaxPool2dDynamicSamePadding or MaxPool2dStaticSamePadding.
"""
if image_size is None:
return MaxPool2dDynamicSamePadding
else:
return partial(MaxPool2dStaticSamePadding, image_size=image_size)
class MaxPool2dDynamicSamePadding(nn.MaxPool2d):
"""2D MaxPooling like TensorFlow's 'SAME' mode, with a dynamic image size.
The padding is operated in forward function by calculating dynamically.
"""
def __init__(self, kernel_size, stride, padding=0, dilation=1, return_indices=False, ceil_mode=False):
super().__init__(kernel_size, stride, padding, dilation, return_indices, ceil_mode)
        self.stride = [self.stride] * 2 if isinstance(self.stride, int) else self.stride
        self.kernel_size = [self.kernel_size] * 2 if isinstance(self.kernel_size, int) else self.kernel_size
        self.dilation = [self.dilation] * 2 if isinstance(self.dilation, int) else self.dilation
def forward(self, x):
ih, iw = x.size()[-2:]
kh, kw = self.kernel_size
sh, sw = self.stride
oh, ow = math.ceil(ih / sh), math.ceil(iw / sw)
pad_h = max((oh - 1) * self.stride[0] +
(kh - 1) * self.dilation[0] + 1 - ih, 0)
pad_w = max((ow - 1) * self.stride[1] +
(kw - 1) * self.dilation[1] + 1 - iw, 0)
if pad_h > 0 or pad_w > 0:
x = F.pad(x, [pad_w // 2, pad_w - pad_w //
2, pad_h // 2, pad_h - pad_h // 2])
return F.max_pool2d(x, self.kernel_size, self.stride, self.padding,
self.dilation, self.ceil_mode, self.return_indices)
class MaxPool2dStaticSamePadding(nn.MaxPool2d):
"""2D MaxPooling like TensorFlow's 'SAME' mode, with the given input image size.
The padding mudule is calculated in construction function, then used in forward.
"""
def __init__(self, kernel_size, stride, image_size=None, **kwargs):
super().__init__(kernel_size, stride, **kwargs)
        self.stride = [self.stride] * 2 if isinstance(self.stride, int) else self.stride
        self.kernel_size = [self.kernel_size] * 2 if isinstance(self.kernel_size, int) else self.kernel_size
        self.dilation = [self.dilation] * 2 if isinstance(self.dilation, int) else self.dilation
# Calculate padding based on image size and save it
assert image_size is not None
ih, iw = (image_size, image_size) if isinstance(
image_size, int) else image_size
kh, kw = self.kernel_size
sh, sw = self.stride
oh, ow = math.ceil(ih / sh), math.ceil(iw / sw)
pad_h = max((oh - 1) * self.stride[0] +
(kh - 1) * self.dilation[0] + 1 - ih, 0)
pad_w = max((ow - 1) * self.stride[1] +
(kw - 1) * self.dilation[1] + 1 - iw, 0)
if pad_h > 0 or pad_w > 0:
self.static_padding = nn.ZeroPad2d(
(pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2))
else:
self.static_padding = Identity()
def forward(self, x):
x = self.static_padding(x)
x = F.max_pool2d(x, self.kernel_size, self.stride, self.padding,
self.dilation, self.ceil_mode, self.return_indices)
return x
class Identity(nn.Module):
"""Identity mapping.
Send input to output directly.
"""
def __init__(self):
super(Identity, self).__init__()
def forward(self, input):
return input
def efficientnet_params(model_name):
"""Map EfficientNet model name to parameter coefficients.
Args:
model_name (str): Model name to be queried.
Returns:
params_dict[model_name]: A (width,depth,res,dropout) tuple.
"""
params_dict = {
# Coefficients: width,depth,res,dropout
'efficientnet-b0': (1.0, 1.0, 224, 0.2),
'efficientnet-b1': (1.0, 1.1, 240, 0.2),
'efficientnet-b2': (1.1, 1.2, 260, 0.3),
'efficientnet-b3': (1.2, 1.4, 300, 0.3),
'efficientnet-b4': (1.4, 1.8, 380, 0.4),
'efficientnet-b5': (1.6, 2.2, 456, 0.4),
'efficientnet-b6': (1.8, 2.6, 528, 0.5),
'efficientnet-b7': (2.0, 3.1, 600, 0.5),
'efficientnet-b8': (2.2, 3.6, 672, 0.5),
'efficientnet-l2': (4.3, 5.3, 800, 0.5),
}
return params_dict[model_name]
def get_model_params(model_name, override_params):
"""Get the block args and global params for a given model name.
Args:
model_name (str): Model's name.
override_params (dict): A dict to modify global_params.
Returns:
blocks_args, global_params
"""
if model_name.startswith('efficientnet'):
w, d, s, p = efficientnet_params(model_name)
# note: all models have drop connect rate = 0.2
blocks_args, global_params = efficientnet(
width_coefficient=w, depth_coefficient=d, dropout_rate=p, image_size=s)
else:
raise NotImplementedError(
'model name is not pre-defined: %s' % model_name)
if override_params:
# ValueError will be raised here if override_params has fields not included in global_params.
global_params = global_params._replace(**override_params)
return blocks_args, global_params
class BlockDecoder(object):
"""Block Decoder for readability,
straight from the official TensorFlow repository.
"""
@staticmethod
def _decode_block_string(block_string):
"""Get a block through a string notation of arguments.
Args:
block_string (str): A string notation of arguments.
Examples: 'r1_k3_s11_e1_i32_o16_se0.25_noskip'.
Returns:
BlockArgs: The namedtuple defined at the top of this file.
"""
assert isinstance(block_string, str)
ops = block_string.split('_')
options = {}
for op in ops:
splits = re.split(r'(\d.*)', op)
if len(splits) >= 2:
key, value = splits[:2]
options[key] = value
# Check stride
assert (('s' in options and len(options['s']) == 1) or
(len(options['s']) == 2 and options['s'][0] == options['s'][1]))
return BlockArgs(
num_repeat=int(options['r']),
kernel_size=int(options['k']),
stride=[int(options['s'][0])],
expand_ratio=int(options['e']),
input_filters=int(options['i']),
output_filters=int(options['o']),
se_ratio=float(options['se']) if 'se' in options else None,
id_skip=('noskip' not in block_string))
@staticmethod
def _encode_block_string(block):
"""Encode a block to a string.
Args:
block (namedtuple): A BlockArgs type argument.
Returns:
block_string: A String form of BlockArgs.
"""
args = [
'r%d' % block.num_repeat,
'k%d' % block.kernel_size,
's%d%d' % (block.strides[0], block.strides[1]),
'e%s' % block.expand_ratio,
'i%d' % block.input_filters,
'o%d' % block.output_filters
]
if 0 < block.se_ratio <= 1:
args.append('se%s' % block.se_ratio)
if block.id_skip is False:
args.append('noskip')
return '_'.join(args)
@staticmethod
def decode(string_list):
"""Decode a list of string notations to specify blocks inside the network.
Args:
string_list (list[str]): A list of strings, each string is a notation of block.
Returns:
blocks_args: A list of BlockArgs namedtuples of block args.
"""
assert isinstance(string_list, list)
blocks_args = []
for block_string in string_list:
blocks_args.append(BlockDecoder._decode_block_string(block_string))
return blocks_args
@staticmethod
def encode(blocks_args):
"""Encode a list of BlockArgs to a list of strings.
Args:
blocks_args (list[namedtuples]): A list of BlockArgs namedtuples of block args.
Returns:
block_strings: A list of strings, each string is a notation of block.
"""
block_strings = []
for block in blocks_args:
block_strings.append(BlockDecoder._encode_block_string(block))
return block_strings
def efficientnet(width_coefficient=None, depth_coefficient=None, image_size=None,
dropout_rate=0.2, drop_connect_rate=0.2, num_classes=1000):
"""Create BlockArgs and GlobalParams for efficientnet model.
Args:
width_coefficient (float)
depth_coefficient (float)
image_size (int)
dropout_rate (float)
drop_connect_rate (float)
num_classes (int)
Meaning as the name suggests.
Returns:
blocks_args, global_params.
"""
# Blocks args for the whole model(efficientnet-b0 by default)
# It will be modified in the construction of EfficientNet Class according to model
blocks_args = [
'r1_k3_s11_e1_i32_o16_se0.25',
'r2_k3_s22_e6_i16_o24_se0.25',
'r2_k5_s22_e6_i24_o40_se0.25',
'r3_k3_s22_e6_i40_o80_se0.25',
'r3_k5_s11_e6_i80_o112_se0.25',
'r4_k5_s22_e6_i112_o192_se0.25',
'r1_k3_s11_e6_i192_o320_se0.25',
]
blocks_args = BlockDecoder.decode(blocks_args)
global_params = GlobalParams(
width_coefficient=width_coefficient,
depth_coefficient=depth_coefficient,
image_size=image_size,
dropout_rate=dropout_rate,
num_classes=num_classes,
batch_norm_momentum=0.99,
batch_norm_epsilon=1e-3,
drop_connect_rate=drop_connect_rate,
depth_divisor=8,
min_depth=None,
)
return blocks_args, global_params
GlobalParams = collections.namedtuple('GlobalParams', [
'width_coefficient', 'depth_coefficient', 'image_size', 'dropout_rate',
'num_classes', 'batch_norm_momentum', 'batch_norm_epsilon',
'drop_connect_rate', 'depth_divisor', 'min_depth'])
# Parameters for an individual model block
BlockArgs = collections.namedtuple('BlockArgs', [
'num_repeat', 'kernel_size', 'stride', 'expand_ratio',
'input_filters', 'output_filters', 'se_ratio', 'id_skip'])
# Set GlobalParams and BlockArgs's defaults
GlobalParams.__new__.__defaults__ = (None,) * len(GlobalParams._fields)
BlockArgs.__new__.__defaults__ = (None,) * len(BlockArgs._fields)
VALID_MODELS = (
'efficientnet-b0', 'efficientnet-b1', 'efficientnet-b2', 'efficientnet-b3',
'efficientnet-b4', 'efficientnet-b5', 'efficientnet-b6', 'efficientnet-b7',
'efficientnet-b8',
# Support the construction of 'efficientnet-l2' without pretrained weights
'efficientnet-l2'
)
class MBConvBlock(nn.Module):
"""Mobile Inverted Residual Bottleneck Block.
Args:
block_args (namedtuple): BlockArgs, defined in utils.py.
global_params (namedtuple): GlobalParam, defined in utils.py.
image_size (tuple or list): [image_height, image_width].
References:
[1] https://arxiv.org/abs/1704.04861 (MobileNet v1)
[2] https://arxiv.org/abs/1801.04381 (MobileNet v2)
[3] https://arxiv.org/abs/1905.02244 (MobileNet v3)
"""
def __init__(self, block_args, global_params, image_size=None):
super().__init__()
self._block_args = block_args
# pytorch's difference from tensorflow
self._bn_mom = 1 - global_params.batch_norm_momentum
self._bn_eps = global_params.batch_norm_epsilon
self.has_se = (self._block_args.se_ratio is not None) and (
0 < self._block_args.se_ratio <= 1)
# whether to use skip connection and drop connect
self.id_skip = block_args.id_skip
# Expansion phase (Inverted Bottleneck)
inp = self._block_args.input_filters # number of input channels
oup = self._block_args.input_filters * \
self._block_args.expand_ratio # number of output channels
if self._block_args.expand_ratio != 1:
Conv2d = get_same_padding_conv2d(image_size=image_size)
self._expand_conv = Conv2d(
in_channels=inp, out_channels=oup, kernel_size=1, bias=False)
self._bn0 = nn.BatchNorm2d(
num_features=oup, momentum=self._bn_mom, eps=self._bn_eps)
# image_size = calculate_output_image_size(image_size, 1) <-- this wouldn't modify image_size
# Depthwise convolution phase
k = self._block_args.kernel_size
s = self._block_args.stride
Conv2d = get_same_padding_conv2d(image_size=image_size)
self._depthwise_conv = Conv2d(
in_channels=oup, out_channels=oup, groups=oup, # groups makes it depthwise
kernel_size=k, stride=s, bias=False)
self._bn1 = nn.BatchNorm2d(
num_features=oup, momentum=self._bn_mom, eps=self._bn_eps)
image_size = calculate_output_image_size(image_size, s)
# Squeeze and Excitation layer, if desired
if self.has_se:
Conv2d = get_same_padding_conv2d(image_size=(1, 1))
num_squeezed_channels = max(
1, int(self._block_args.input_filters * self._block_args.se_ratio))
self._se_reduce = Conv2d(
in_channels=oup, out_channels=num_squeezed_channels, kernel_size=1)
self._se_expand = Conv2d(
in_channels=num_squeezed_channels, out_channels=oup, kernel_size=1)
# Pointwise convolution phase
final_oup = self._block_args.output_filters
Conv2d = get_same_padding_conv2d(image_size=image_size)
self._project_conv = Conv2d(
in_channels=oup, out_channels=final_oup, kernel_size=1, bias=False)
self._bn2 = nn.BatchNorm2d(
num_features=final_oup, momentum=self._bn_mom, eps=self._bn_eps)
self._swish = MemoryEfficientSwish()
def forward(self, inputs, drop_connect_rate=None):
"""MBConvBlock's forward function.
Args:
inputs (tensor): Input tensor.
drop_connect_rate (bool): Drop connect rate (float, between 0 and 1).
Returns:
Output of this block after processing.
"""
# Expansion and Depthwise Convolution
x = inputs
if self._block_args.expand_ratio != 1:
x = self._expand_conv(inputs)
x = self._bn0(x)
x = self._swish(x)
x = self._depthwise_conv(x)
x = self._bn1(x)
x = self._swish(x)
# Squeeze and Excitation
if self.has_se:
x_squeezed = F.adaptive_avg_pool2d(x, 1)
x_squeezed = self._se_reduce(x_squeezed)
x_squeezed = self._swish(x_squeezed)
x_squeezed = self._se_expand(x_squeezed)
x = torch.sigmoid(x_squeezed) * x
# Pointwise Convolution
x = self._project_conv(x)
x = self._bn2(x)
# Skip connection and drop connect
input_filters, output_filters = self._block_args.input_filters, self._block_args.output_filters
if self.id_skip and self._block_args.stride == 1 and input_filters == output_filters:
# The combination of skip connection and drop connect brings about stochastic depth.
if drop_connect_rate:
x = drop_connect(x, p=drop_connect_rate,
training=self.training)
x = x + inputs # skip connection
return x
def set_swish(self, memory_efficient=True):
"""Sets swish function as memory efficient (for training) or standard (for export).
Args:
memory_efficient (bool): Whether to use memory-efficient version of swish.
"""
self._swish = MemoryEfficientSwish() if memory_efficient else Swish()
class EfficientNet(nn.Module):
"""EfficientNet model.
Most easily loaded with the .from_name or .from_pretrained methods.
Args:
blocks_args (list[namedtuple]): A list of BlockArgs to construct blocks.
global_params (namedtuple): A set of GlobalParams shared between blocks.
References:
[1] https://arxiv.org/abs/1905.11946 (EfficientNet)
Example:
>>> import torch
>>> from efficientnet.model import EfficientNet
>>> inputs = torch.rand(1, 3, 224, 224)
>>> model = EfficientNet.from_pretrained('efficientnet-b0')
>>> model.eval()
>>> outputs = model(inputs)
"""
def __init__(self, blocks_args=None, global_params=None):
super().__init__()
assert isinstance(blocks_args, list), 'blocks_args should be a list'
assert len(blocks_args) > 0, 'block args must be greater than 0'
self._global_params = global_params
self._blocks_args = blocks_args
# Batch norm parameters
bn_mom = 1 - self._global_params.batch_norm_momentum
bn_eps = self._global_params.batch_norm_epsilon
# Get stem static or dynamic convolution depending on image size
image_size = global_params.image_size
Conv2d = get_same_padding_conv2d(image_size=image_size)
# Stem
in_channels = 3 # rgb
# number of output channels
out_channels = round_filters(32, self._global_params)
self._conv_stem = Conv2d(
in_channels, out_channels, kernel_size=3, stride=2, bias=False)
self._bn0 = nn.BatchNorm2d(
num_features=out_channels, momentum=bn_mom, eps=bn_eps)
image_size = calculate_output_image_size(image_size, 2)
# Build blocks
self._blocks = nn.ModuleList([])
for block_args in self._blocks_args:
# Update block input and output filters based on depth multiplier.
block_args = block_args._replace(
input_filters=round_filters(
block_args.input_filters, self._global_params),
output_filters=round_filters(
block_args.output_filters, self._global_params),
num_repeat=round_repeats(
block_args.num_repeat, self._global_params)
)
# The first block needs to take care of stride and filter size increase.
self._blocks.append(MBConvBlock(
block_args, self._global_params, image_size=image_size))
image_size = calculate_output_image_size(
image_size, block_args.stride)
if block_args.num_repeat > 1: # modify block_args to keep same output size
block_args = block_args._replace(
input_filters=block_args.output_filters, stride=1)
for _ in range(block_args.num_repeat - 1):
self._blocks.append(MBConvBlock(
block_args, self._global_params, image_size=image_size))
# image_size = calculate_output_image_size(image_size, block_args.stride) # stride = 1
# Head
in_channels = block_args.output_filters # output of final block
out_channels = round_filters(1280, self._global_params)
Conv2d = get_same_padding_conv2d(image_size=image_size)
self._conv_head = Conv2d(
in_channels, out_channels, kernel_size=1, bias=False)
self._bn1 = nn.BatchNorm2d(
num_features=out_channels, momentum=bn_mom, eps=bn_eps)
# Final linear layer
self._avg_pooling = nn.AdaptiveAvgPool2d(1)
self._dropout = nn.Dropout(self._global_params.dropout_rate)
self._fc = nn.Linear(out_channels, self._global_params.num_classes)
self._swish = MemoryEfficientSwish()
def set_swish(self, memory_efficient=True):
"""Sets swish function as memory efficient (for training) or standard (for export).
Args:
memory_efficient (bool): Whether to use memory-efficient version of swish.
"""
self._swish = MemoryEfficientSwish() if memory_efficient else Swish()
for block in self._blocks:
block.set_swish(memory_efficient)
def extract_endpoints(self, inputs):
"""Use convolution layer to extract features
from reduction levels i in [1, 2, 3, 4, 5].
Args:
inputs (tensor): Input tensor.
Returns:
Dictionary of last intermediate features
with reduction levels i in [1, 2, 3, 4, 5].
Example:
>>> import torch
>>> from efficientnet.model import EfficientNet
>>> inputs = torch.rand(1, 3, 224, 224)
>>> model = EfficientNet.from_pretrained('efficientnet-b0')
>>> endpoints = model.extract_endpoints(inputs)
>>> print(endpoints['reduction_1'].shape) # torch.Size([1, 16, 112, 112])
>>> print(endpoints['reduction_2'].shape) # torch.Size([1, 24, 56, 56])
>>> print(endpoints['reduction_3'].shape) # torch.Size([1, 40, 28, 28])
>>> print(endpoints['reduction_4'].shape) # torch.Size([1, 112, 14, 14])
>>> print(endpoints['reduction_5'].shape) # torch.Size([1, 1280, 7, 7])
"""
endpoints = dict()
# Stem
x = self._swish(self._bn0(self._conv_stem(inputs)))
prev_x = x
# Blocks
for idx, block in enumerate(self._blocks):
drop_connect_rate = self._global_params.drop_connect_rate
if drop_connect_rate:
# scale drop connect_rate
drop_connect_rate *= float(idx) / len(self._blocks)
x = block(x, drop_connect_rate=drop_connect_rate)
if prev_x.size(2) > x.size(2):
endpoints[f'reduction_{len(endpoints)+1}'] = prev_x
prev_x = x
# Head
x = self._swish(self._bn1(self._conv_head(x)))
endpoints[f'reduction_{len(endpoints)+1}'] = x
return endpoints
def extract_features(self, inputs):
"""use convolution layer to extract feature .
Args:
inputs (tensor): Input tensor.
Returns:
Output of the final convolution
layer in the efficientnet model.
"""
# Stem
x = self._swish(self._bn0(self._conv_stem(inputs)))
# Blocks
for idx, block in enumerate(self._blocks):
drop_connect_rate = self._global_params.drop_connect_rate
if drop_connect_rate:
# scale drop connect_rate
drop_connect_rate *= float(idx) / len(self._blocks)
x = block(x, drop_connect_rate=drop_connect_rate)
# Head
x = self._swish(self._bn1(self._conv_head(x)))
return x
def forward(self, inputs):
"""EfficientNet's forward function.
Calls extract_features to extract features, applies final linear layer, and returns logits.
Args:
inputs (tensor): Input tensor.
Returns:
Output of this model after processing.
"""
# Convolution layers
x = self.extract_features(inputs)
# Pooling and final linear layer
x = self._avg_pooling(x)
x = x.flatten(start_dim=1)
x = self._dropout(x)
x = self._fc(x)
return x
@classmethod
def from_name(cls, model_name, in_channels=3, **override_params):
"""create an efficientnet model according to name.
Args:
model_name (str): Name for efficientnet.
in_channels (int): Input data's channel number.
override_params (other key word params):
Params to override model's global_params.
Optional key:
'width_coefficient', 'depth_coefficient',
'image_size', 'dropout_rate',
'num_classes', 'batch_norm_momentum',
'batch_norm_epsilon', 'drop_connect_rate',
'depth_divisor', 'min_depth'
Returns:
An efficientnet model.
"""
cls._check_model_name_is_valid(model_name)
blocks_args, global_params = get_model_params(
model_name, override_params)
model = cls(blocks_args, global_params)
model._change_in_channels(in_channels)
return model
@classmethod
def from_pretrained(cls, model_name, weights_path=None, advprop=False,
in_channels=3, num_classes=1000, **override_params):
"""create an efficientnet model according to name.
Args:
model_name (str): Name for efficientnet.
weights_path (None or str):
str: path to pretrained weights file on the local disk.
None: use pretrained weights downloaded from the Internet.
advprop (bool):
Whether to load pretrained weights
trained with advprop (valid when weights_path is None).
in_channels (int): Input data's channel number.
num_classes (int):
Number of categories for classification.
It controls the output size for final linear layer.
override_params (other key word params):
Params to override model's global_params.
Optional key:
'width_coefficient', 'depth_coefficient',
'image_size', 'dropout_rate',
'num_classes', 'batch_norm_momentum',
'batch_norm_epsilon', 'drop_connect_rate',
'depth_divisor', 'min_depth'
Returns:
A pretrained efficientnet model.
"""
model = cls.from_name(
model_name, num_classes=num_classes, **override_params)
load_pretrained_weights(model, model_name, weights_path=weights_path, load_fc=(
num_classes == 1000), advprop=advprop)
model._change_in_channels(in_channels)
return model
@classmethod
def get_image_size(cls, model_name):
"""Get the input image size for a given efficientnet model.
Args:
model_name (str): Name for efficientnet.
Returns:
Input image size (resolution).
"""
cls._check_model_name_is_valid(model_name)
_, _, res, _ = efficientnet_params(model_name)
return res
@classmethod
def _check_model_name_is_valid(cls, model_name):
"""Validates model name.
Args:
model_name (str): Name for efficientnet.
Returns:
bool: Is a valid name or not.
"""
if model_name not in VALID_MODELS:
raise ValueError('model_name should be one of: ' +
', '.join(VALID_MODELS))
def _change_in_channels(self, in_channels):
"""Adjust model's first convolution layer to in_channels, if in_channels not equals 3.
Args:
in_channels (int): Input data's channel number.
"""
if in_channels != 3:
Conv2d = get_same_padding_conv2d(
image_size=self._global_params.image_size)
out_channels = round_filters(32, self._global_params)
self._conv_stem = Conv2d(
in_channels, out_channels, kernel_size=3, stride=2, bias=False)
class ResNestSED(nn.Module):
def __init__(self, num_classes=264):
super().__init__()
self.interpolate_ratio = 30 # Downsampled ratio
base_model = ResNest(
Bottleneck, [3, 4, 6, 3],
radix=1, groups=1, bottleneck_width=64,
deep_stem=True, stem_width=32, avg_down=True,
avd=True, avd_first=True)
layers = list(base_model.children())[:-2]
self.encoder = nn.Sequential(*layers)
in_features = base_model.fc.in_features
self.fc1 = nn.Linear(in_features, in_features, bias=True)
self.att_block = AttBlock(
in_features, num_classes, activation="sigmoid")
self.init_weight()
def init_weight(self):
init_layer(self.fc1)
def forward(self, input):
frames_num = input.size(3)
# (batch_size, channels, freq, frames)
x = self.encoder(input)
# (batch_size, channels, frames)
x = torch.mean(x, dim=2)
# channel smoothing
x1 = F.max_pool1d(x, kernel_size=3, stride=1, padding=1)
x2 = F.avg_pool1d(x, kernel_size=3, stride=1, padding=1)
x = x1 + x2
x = F.dropout(x, p=0.5, training=self.training)
x = x.transpose(1, 2)
x = F.relu_(self.fc1(x))
x = x.transpose(1, 2)
x = F.dropout(x, p=0.5, training=self.training)
(clipwise_output, norm_att, segmentwise_output) = self.att_block(x)
logit = torch.sum(norm_att * self.att_block.cla(x), dim=2)
segmentwise_output = segmentwise_output.transpose(1, 2)
# Get framewise output
framewise_output = interpolate(segmentwise_output,
self.interpolate_ratio)
framewise_output = pad_framewise_output(framewise_output, frames_num)
output_dict = {
"framewise_output": framewise_output,
"logit": logit,
"clipwise_output": clipwise_output
}
return output_dict
class EfficientNetSED(nn.Module):
def __init__(self, base_model_name: str, pretrained=False,
num_classes=264):
super().__init__()
self.interpolate_ratio = 32 # Downsampled ratio
if pretrained:
self.base_model = EfficientNet.from_pretrained(base_model_name)
else:
self.base_model = EfficientNet.from_name(base_model_name)
in_features = self.base_model._fc.in_features
self.fc1 = nn.Linear(in_features, in_features, bias=True)
self.att_block = AttBlock(
in_features, num_classes, activation="sigmoid")
self.init_weight()
def init_weight(self):
init_layer(self.fc1)
def forward(self, input):
frames_num = input.size(3)
# (batch_size, channels, freq, frames)
x = self.base_model.extract_features(input)
# (batch_size, channels, frames)
x = torch.mean(x, dim=2)
# channel smoothing
x1 = F.max_pool1d(x, kernel_size=3, stride=1, padding=1)
x2 = F.avg_pool1d(x, kernel_size=3, stride=1, padding=1)
x = x1 + x2
x = F.dropout(x, p=0.5, training=self.training)
x = x.transpose(1, 2)
x = F.relu_(self.fc1(x))
x = x.transpose(1, 2)
x = F.dropout(x, p=0.5, training=self.training)
(clipwise_output, _, segmentwise_output) = self.att_block(x)
segmentwise_output = segmentwise_output.transpose(1, 2)
# Get framewise output
framewise_output = interpolate(segmentwise_output,
self.interpolate_ratio)
framewise_output = pad_framewise_output(framewise_output, frames_num)
output_dict = {
"framewise_output": framewise_output,
"segmentwise_output": segmentwise_output,
"clipwise_output": clipwise_output
}
return output_dict
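# Note: unlike ResNestSED, EfficientNetSED exposes "segmentwise_output"
# instead of "logit" in its output dict; train() below relies on this by
# building ImprovedPANNsLoss('segmentwise_output') for the eff_th04 model.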
resnest_model_config = {
"num_classes": 264
}
effnet_model_config = {
"num_classes": 264,
"base_model_name": "efficientnet-b0",
"pretrained": False
}
weights_path = {
"resnest": {
"ref2_th03": "train/weights_pretrained/birdcall-resnest-ema-all-ref2-th03/ema.pth",
"ref2_th04": "train/weights_pretrained/birdcall-resnest-ema-all-ref2-th04/ema.pth",
"ext": "train/weights_pretrained/birdcall-resnest-emta-all-ext-ref2-th04/ema.pth"
},
"effnet": {
"eff_th04": "train/weights_pretrained/birdcall-effnet-b0-ema-all-ref2/ema.pth"
}
}
def get_model(resnest_config: dict, effnet_config: dict, weights_path: dict):
    models = {}
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    for model_key in weights_path:
        path_dict = weights_path[model_key]
        for path_key in path_dict:
            if model_key == "resnest":
                model = ResNestSED(**resnest_config)
            else:
                model = EfficientNetSED(**effnet_config)
            # map_location keeps the load working on CPU-only machines
            checkpoint = torch.load(path_dict[path_key], map_location=device)
            model_state_dict = {}
            for key in checkpoint["model_state_dict"]:
                if key == "n_averaged":
                    continue
                # the checkpoints were saved from wrapped (e.g. EMA/DataParallel) models
                new_key = key.replace("module.", "")
                model_state_dict[new_key] = checkpoint["model_state_dict"][key]
            model.load_state_dict(model_state_dict)
            model.to(device)
            model.eval()
            models[path_key] = model
    return models
def get_optimizer(model: nn.Module, config: dict):
    optimizer_config = config["optimizer"]
    optimizer_name = optimizer_config.get("name")
    return getattr(optim, optimizer_name)(model.parameters(),
                                          **optimizer_config["params"])
def get_scheduler(optimizer, config: dict):
    scheduler_config = config["scheduler"]
    scheduler_name = scheduler_config.get("name")
    if scheduler_name is None:
        return None
    return getattr(optim.lr_scheduler, scheduler_name)(
        optimizer, **scheduler_config["params"])
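# --- Illustrative sketch (not part of the original file) ---
# The dict shapes get_optimizer() / get_scheduler() expect, with the same
# values train() uses below (any other torch.optim name would work too):
_example_training_config = {
    "optimizer": {"name": "Adam", "params": {"lr": 1e-3}},
    "scheduler": {"name": "CosineAnnealingLR", "params": {"T_max": 10}},
}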
models = get_model(resnest_config=resnest_model_config,
effnet_config=effnet_model_config,
weights_path=weights_path)
import librosa
import numpy as np
from pathlib import Path
import shutil
import cv2
import os
from src.utils import normalize_melspec
from fastprogress import progress_bar
# Parameters
TARGET_SR = 32000
melspectrogram_parameters = {
"n_mels": 128,
"fmin": 20,
"fmax": 16000
}
pcen_parameters = {
"gain": 0.98,
"bias": 2,
"power": 0.5,
"time_constant": 0.4,
"eps": 0.000001
}
PERIOD = 30
CHUNK = PERIOD * TARGET_SR
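# i.e. one model input covers 30 s x 32 000 Hz = 960 000 audio samples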
###
def transform_all_images(dirpath: str, sound_file: str, csv_file: str):
    """Convert every audio listed in the csv into a .png spectrogram image,
    split between the train/temp/train and train/temp/val folders."""
    # Reset the temp folders
    shutil.rmtree('train/temp/train', ignore_errors=True)
    os.makedirs('train/temp/train')
    shutil.rmtree('train/temp/val', ignore_errors=True)
    os.makedirs('train/temp/val')
    with open(dirpath + csv_file, "r", encoding='utf-8') as f:
        f.readline()  # skip the header line
        for i, audio_line in enumerate(progress_bar(f.readlines())):
            L = audio_line.split(",")
            id_audio = L[-2]
            id_species = L[1]
            # Create a folder for each species
            os.makedirs('train/temp/train/' + id_species, exist_ok=True)
            os.makedirs('train/temp/val/' + id_species, exist_ok=True)
            image = np.swapaxes(clip_to_image(
                dirpath + sound_file + id_audio, all_chunks=False), 0, 2)
            # Roughly a 70/30 split: 34 of every 50 audios (68%) go to the
            # training set, the remaining 16 (32%) to the validation set
            if i % 50 > 15:
                cv2.imwrite('train/temp/train/' + id_species +
                            '/' + id_audio + '.png', image)
            else:
                cv2.imwrite('train/temp/val/' + id_species +
                            '/' + id_audio + '.png', image)
def preproc(y):
"""return the preprocessing of a clip 'y' """
y_batch = y.astype(np.float32)
if len(y_batch) > 0: # Normalization
max_vol = np.abs(y_batch).max()
if max_vol > 0:
y_batch = np.asfortranarray(y_batch * 1 / max_vol)
# Zero-padding to get an input of constant size
y_pad = np.zeros(PERIOD * TARGET_SR, dtype=np.float32)
y_pad[:len(y_batch)] = y_batch
# spectrograms
melspec = librosa.feature.melspectrogram(y=y_pad,
sr=TARGET_SR,
**melspectrogram_parameters)
pcen = librosa.pcen(melspec, sr=TARGET_SR, **pcen_parameters)
clean_mel = librosa.power_to_db(melspec ** 1.5)
melspec = librosa.power_to_db(melspec).astype(np.float32)
# Normalization
norm_melspec = normalize_melspec(melspec)
norm_pcen = normalize_melspec(pcen)
norm_clean_mel = normalize_melspec(clean_mel)
# Stack the three normalized spectrograms into one 3-channel "color" image
image = np.stack([norm_melspec, norm_pcen, norm_clean_mel], axis=-1)
height, width, _ = image.shape
image = cv2.resize(image, (int(width * 224 / height), 224))
image = np.moveaxis(image, 2, 0)
image = (image).astype(np.float32)
return image
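# --- Illustrative sketch (not part of the original file) ---
# Shapes through preproc() for a full 30 s chunk at 32 kHz: raw audio
# (960000,) -> mel spectrogram (128, frames) -> three stacked spectrograms
# (128, frames, 3) -> resized to height 224 -> channels first, i.e. a
# float32 array of shape (3, 224, width).
def _demo_preproc_shapes():
    silent = np.zeros(CHUNK, dtype=np.float32)  # 30 s of silence
    image = preproc(silent)
    return image.shape  # (3, 224, width); width depends on the hop length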
def clip_to_image(clip_path: str, all_chunks=True):
    """Return the clip as image chunks, almost ready for the model.
    If all_chunks=False, only the first chunk is returned."""
    # Load the audio file (clip_path has no extension; try .mp3 then .wav)
    if Path(clip_path + ".mp3").exists():
        clip, _ = librosa.load(clip_path + ".mp3",
                               sr=TARGET_SR,
                               mono=True,
                               res_type="kaiser_fast")
    elif Path(clip_path + ".wav").exists():
        clip, _ = librosa.load(clip_path + ".wav",
                               sr=TARGET_SR,
                               mono=True,
                               res_type="kaiser_fast")
    else:
        raise FileNotFoundError(
            f"{clip_path}.mp3 or .wav doesn't exist; only .wav and .mp3 are "
            "allowed. It may also be an audio listed in the .csv that is "
            "missing from the audio folder. Easy fix: delete the "
            "corresponding line in the csv.")
    y = clip.astype(np.float32)
    if not all_chunks:
        return np.asarray(preproc(y[:CHUNK]))
    # Split the clip into 30 s chunks (the last one is zero-padded by preproc)
    nb_chunk = (len(y) - 1) // CHUNK + 1
    images = [preproc(y[k * CHUNK:(k + 1) * CHUNK]) for k in range(nb_chunk)]
    return np.asarray(images)
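# --- Illustrative usage sketch (not part of the original file) ---
# "input/audio_files/audio1" is a hypothetical path (no extension:
# clip_to_image() tries .mp3 then .wav itself). A 95 s recording yields
# ceil(95 / 30) = 4 chunks, i.e. an array of shape (4, 3, 224, width).
# images = clip_to_image("input/audio_files/audio1")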
import torch.nn as nn
import os
import numpy as np
class ImprovedPANNsLoss(nn.Module):
"""criterion used for the training"""
def __init__(self, output_key="logit", weights=[1, 0.5]):
super().__init__()
self.output_key = output_key
if output_key == "logit":
self.normal_loss = nn.BCEWithLogitsLoss()
else:
self.normal_loss = nn.BCELoss()
self.bce = nn.BCELoss()
self.weights = weights
def forward(self, input, target):
input_ = input[self.output_key]
target = target.float()
framewise_output = input["framewise_output"]
clipwise_output_with_max, _ = framewise_output.max(dim=1)
normal_loss = self.normal_loss(input_, target)
auxiliary_loss = self.bce(clipwise_output_with_max, target)
return self.weights[0] * normal_loss + self.weights[1] * auxiliary_loss
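# --- Illustrative sketch (not part of the original file) ---
# Toy shapes (batch=2, classes=4, frames=8) showing what the criterion
# consumes: a dict holding the chosen output_key plus "framewise_output",
# and a multi-hot target.
def _demo_improved_panns_loss():
    import torch
    criterion = ImprovedPANNsLoss(output_key="logit")
    outputs = {
        "logit": torch.randn(2, 4),               # raw scores, for BCEWithLogitsLoss
        "framewise_output": torch.rand(2, 8, 4),  # probabilities in [0, 1]
    }
    target = torch.randint(0, 2, (2, 4))
    return criterion(outputs, target)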
def find_classes(dir: str):
"""return inv_bird_code and bird_code"""
classes = os.listdir(dir)
classes.sort()
class_to_idx = {bird: i for i, bird in enumerate(classes)}
return classes, class_to_idx
def normalize_melspec(X: np.ndarray):
"""Normalize a spectrogram in a strange way"""
eps = 1e-6
mean = X.mean()
X = X - mean
std = X.std()
Xstd = X / (std + eps)
norm_min, norm_max = Xstd.min(), Xstd.max()
if (norm_max - norm_min) > eps:
V = Xstd
V[V < norm_min] = norm_min
V[V > norm_max] = norm_max
V = 255 * (V - norm_min) / (norm_max - norm_min)
V = V.astype(np.uint8)
else:
# Just zero
V = np.zeros_like(Xstd, dtype=np.uint8)
return V
from __future__ import print_function, division
import torch
import torch.backends.cudnn as cudnn
import numpy as np
from torchvision import datasets
import time
import os
import copy
import keyboard
from src.utils import ImprovedPANNsLoss, find_classes
from src.models import get_optimizer, get_scheduler, AttBlock, models
# Utils
def transform_PIL_Array(image_PIL):
image = np.array(image_PIL)
image = np.swapaxes(image, 0, 2)
image = torch.from_numpy(image/255.0)
image = image.float()
return image
def transform_labels(labels, num_classes):
siz = labels.size()
new_labels = torch.zeros((siz[0], num_classes))
for i, label in enumerate(labels):
new_labels[i, label] = 1
return new_labels
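# e.g. transform_labels(torch.tensor([1, 0]), num_classes=3)
#      -> tensor([[0., 1., 0.],
#                 [1., 0., 0.]])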
__keep_running__ = True
def stop_running():
    """Pressing '$' lets the current epoch finish, then ends the training."""
    global __keep_running__
    __keep_running__ = False
keyboard.add_hotkey('$', stop_running)
def train_model(model, device, criterion, optimizer, scheduler, model_name, dataloaders, dataset_sizes, class_names, num_epochs=25):
since = time.time()
best_model_wts = copy.deepcopy(model.state_dict())
best_acc = -np.inf
for epoch in range(num_epochs):
if not __keep_running__:
break  # '$' was pressed: stop training after the epoch that just finished
print(f'Epoch {epoch}/{num_epochs - 1}')
print('-' * 10)
# Each epoch has a training and validation phase
for phase in ['train', 'val']:
if phase == 'train':
model.train() # Set model to training mode
else:
model.eval() # Set model to evaluate mode
running_loss = 0.0
running_corrects = 0
# Iterate over data.
for inputs, labels in dataloaders[phase]:
labels = transform_labels(labels, len(class_names))
inputs = inputs.to(device)
labels = labels.to(device)
# zero the parameter gradients
optimizer.zero_grad()
# forward
# track history if only in train
with torch.set_grad_enabled(phase == 'train'):
outputs = model(inputs)
if model_name == "eff_th04":
outputs["segmentwise_output"], _ = outputs["segmentwise_output"].max(
dim=1)
loss = criterion(outputs, labels)
else:
loss = criterion(outputs, labels)
# backward + optimize only if in training phase
if phase == 'train':
loss.backward()
optimizer.step()
# statistics
running_loss += loss.item() * inputs.size(0)
###
result = outputs['framewise_output']
result, _ = torch.max(result, dim=1)
# negative sum of squared errors, used as the accuracy metric
running_corrects -= torch.sum((result-labels)**2)
###
del inputs, outputs, loss, result
if phase == 'train':
scheduler.step()
epoch_loss = running_loss / dataset_sizes[phase]
epoch_acc = running_corrects.double() / dataset_sizes[phase]
print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')
# deep copy the model
if phase == 'val' and epoch_acc > best_acc:
best_acc = epoch_acc
best_model_wts = copy.deepcopy(model.state_dict())
del epoch_acc, epoch_loss, running_corrects, running_loss
print()
time_elapsed = time.time() - since
print(
f'Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
print(f'Best val Acc: {best_acc:.4f}')
# load best model weights
model.load_state_dict(best_model_wts)
return model
def train(models, device, key_model=None, num_epochs=25, lr=0.001, batch_size=2):
image_datasets = {x: datasets.ImageFolder('train/temp/' + x, transform=transform_PIL_Array)
for x in ['train', 'val']}
dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=batch_size,
shuffle=True, num_workers=0)
for x in ['train', 'val']}
dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}
class_names = image_datasets['train'].classes
if not key_model:
key_model = models.keys()
for model in key_model:
print(model)
model_conv = models[model]
for param in model_conv.parameters():
param.requires_grad = False
# Parameters of newly constructed modules have requires_grad=True by default
num_ftrs = model_conv.fc1.in_features
model_conv.att_block = AttBlock(
num_ftrs, len(class_names), activation="sigmoid")
model_conv = model_conv.to(device)
print()
# Observe that only parameters of final layer are being optimized as
# opposed to before.
if model != "eff_th04":
criterion = ImprovedPANNsLoss()
else:
criterion = ImprovedPANNsLoss('segmentwise_output')
optimizer_conv = get_optimizer(
model_conv, {"optimizer": {'name': 'Adam', 'params': {'lr': lr}}})
exp_lr_scheduler = get_scheduler(optimizer_conv, {'scheduler': {
'name': 'CosineAnnealingLR', 'params': {'T_max': 10}}})
model_conv = train_model(model_conv, device, criterion, optimizer_conv,
exp_lr_scheduler, model, dataloaders, dataset_sizes, class_names, num_epochs=num_epochs)
os.makedirs('weights_trained', exist_ok=True)
torch.save(model_conv.state_dict(), 'weights_trained/' + model + '.pth')
torch.cuda.empty_cache()
del model_conv
torch.cuda.empty_cache()
return None
if __name__ == '__main__':
import argparse
parser = argparse.ArgumentParser()
parser.add_argument(
'-m', '--model', help="""Choose whether to train model "1", "2", "3", "4" or "all" """)
args = parser.parse_args()
keys = args.model
if keys == 'all':
keys = ["ref2_th03", "ref2_th04",
"eff_th04", "ext"]
elif keys in ['1', '2', '3', '4']:
keys = [["ref2_th03", "ref2_th04",
"eff_th04", "ext"][int(keys)-1]]
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
cudnn.benchmark = False
# batch size 20 (15 for eff_th04) fits in 8 GB of VRAM; if you don't have enough RAM, train the models one by one
train(models, device, keys, num_epochs=20, batch_size=15)
classes, _ = find_classes('train/temp/train/')
np.save('inv_bird_code.npy', classes)
{'ref2_th03': [2.0378282148759443, 1.350958687918527, 1.1509922572544644, 1.0251973749517085, 0.9530742771022923, 0.9385142064356543, 0.8788359924987122, 0.8812886332417583, 0.8496827345628005, 0.866959456559066, 0.8618548885806577, 0.8511872343964629, 0.8530584482046275, 0.8455350268018116, 0.8048266735705701, 0.8087591653341776, 0.7847911289760046, 0.7558997856391656, 0.7466989035134788, 0.7164284003959908, 0.7004573109385732, 0.6832559606531164, 0.6746111859332075, 0.6557628505832547, 0.6601907080346412],
'ext': [2.203362265785972, 1.4098027826665522, 1.159971886938745, 1.0570843036358173, 1.0038691929408483, 0.9624923035338685, 0.9488933688991673, 0.9413243388081646, 0.927853762448489, 0.9318580208243905, 0.9206070952363067, 0.923733637883113, 0.9244260683164492, 0.9089159284319197, 0.8776338179032882, 0.8544231875912175, 0.8257928366189475, 0.8005734957181491, 0.7814002403846154, 0.7663307609138909, 0.745571765270862, 0.7303421523544815, 0.7229315789191278, 0.7038964910821601, 0.7025682218782195],
'eff_th04': [1.1523478581355169, 1.009532215831044, 0.9885831560407367, 0.9802307296585251, 0.9729183322780736, 0.9660697350135217, 0.9601244245256697, 0.956034398340917, 0.953995589371566, 0.9535502339457418, 0.953701648083362, 0.953343737256396, 0.9514200399210165, 0.9463589008037862, 0.9367290957943425, 0.921370118528932, 0.8963166959993133, 0.8660067044771635, 0.8309774713201837, 0.7993172446450035, 0.7720077850006439, 0.7488513988452954, 0.7336551540500516, 0.7187049781883157, 0.7086745880462312],
'ref2_th04': [1.4462518377618476, 1.0416387201665522, 0.8972107604309754, 0.8218668843363668, 0.7769343617198232, 0.7448355706183466, 0.7304498022729224, 0.7148164183228881, 0.71854735992767, 0.713101565182864, 0.7222622043483861, 0.712948977292239, 0.702530242584564, 0.7084923209724846, 0.6837784064995064, 0.6682654999114656, 0.6659571888682607, 0.6426979316459908, 0.6317920475215703, 0.6097299345247038, 0.6050359704992274, 0.5991017268254207, 0.5904199369661101, 0.5811248611617875, 0.5721830011724116]}
\ No newline at end of file