Compare revisions: manon.arfib/birdcall-detection
Commits on Source (2), showing 2449 additions and 0 deletions.
input/*.csv
input/audio_files/**
output/**
**/__pycache__
train/temp/train/**
train/temp/val/**
train/data_training/audio_files/**
train/data_training/*.csv
prepare:
python creation_datasets/create_csv.py France
python creation_datasets/download_files.py France.csv
rm -rf train/data_training/birds.csv
mv creation_datasets/fichiers_csv/France.csv train/data_training/birds.csv
rm -rf train/data_training/audio_files/
mv creation_datasets/audio_files_France train/data_training/audio_files/
python preload_training_data.py
train:
python train.py -m all
To use the AI:
Put in the input folder:
an audio_files folder with the audio files, and a csv file listing the audios you want to test. The format is:
audio_id
audio1
audio2
...
No extension in the audio_id column.
Note: the csv can have multiple columns, but only audio_id is considered.
To run the training:
Download and preprocess the dataset: in a terminal, go to the "birdcall-detection" folder, then type "make prepare".
Wait for the download to finish. You can then launch the training with "make train".
If you do not have enough RAM, it is better to train the models one by one. To do so, run (still in the "birdcall-detection" folder):
python train.py -m 1
python train.py -m 2
python train.py -m 3
python train.py -m 4
Note: if you want to build your own dataset, put in the train/data_training folder an audio_files folder and a csv in the following format:
cnt,en,id,length
**,class1,audio1.mp3,**
**,class1,audio2.mp3,**
...
**,class2,audio4.wav,**
...
where audio*.* is just the name of the corresponding audio file and ** are placeholder values that must be present but whose content does not matter (a minimal sketch for generating such a csv is given below).
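
For reference, a minimal, hypothetical sketch (not part of the repository) that builds such a csv with pandas, assuming the audio files already sit in train/data_training/audio_files/ and that the class name is the prefix of each file name (adapt the "en" line to your own naming scheme):

import librosa
import pandas as pd
from pathlib import Path

rows = []
for path in sorted(Path("train/data_training/audio_files").iterdir()):
    rows.append({
        "cnt": "xx",                    # placeholder column, content irrelevant
        "en": path.stem.split("_")[0],  # assumption: class name = file-name prefix
        "id": path.name,                # audio file name, extension included
        "length": librosa.get_duration(path=str(path)),
    })
pd.DataFrame(rows, columns=["cnt", "en", "id", "length"]).to_csv(
    "train/data_training/birds.csv", index=False)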
import torch
import pandas as pd
from pathlib import Path
import numpy as np
from fastprogress import progress_bar
import warnings
from contextlib import contextmanager
import time
from src.models import models, AttBlock
from src.preproc import clip_to_image
PERIOD = 30
# Arbitrary
ratio = {
"ref2_th03": 0.25/0.77,
"ref2_th04": 0.14/0.77,
"eff_th04": 0.13/0.77,
"ext": 0.25/0.77
}
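# These weights sum to (0.25 + 0.14 + 0.13 + 0.25) / 0.77 = 1, so the weighted sum
# of the sigmoid outputs computed below stays in [0, 1].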
all_time_duration = 0
# Per-class thresholds could be determined, but this is not done here
thresholds = {}
inv_bird_call = np.load('inv_bird_code.npy', allow_pickle=True)
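# inv_bird_call maps a class index to its ebird code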
@contextmanager
def timer(name: str):
t0 = time.time()
msg = f"[{name}] start"
print(msg)
yield
global all_time_duration
all_time_duration += time.time() - t0
msg = f"[{name}] done in {time.time() - t0:.2f} s"
print(msg)
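# Usage: with timer("some step"): ...
# Prints start/end messages and accumulates the total in all_time_duration.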
def prediction_for_clip(test_df: pd.DataFrame,
clip: Path, models):
"""Given a clip, the function predict the bird singing"""
images = clip_to_image(clip)
array = np.asarray(images)
tensors = torch.from_numpy(array)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
estimated_event_list = []
global_time = 0.0
audio_id = test_df["audio_id"].values[0]
for image in progress_bar(tensors):
image = image[None, :]/255.0
image = image.to(device)
outputs = {}
with torch.no_grad():
for key in models:
prediction = models[key](image)
framewise_outputs = prediction["framewise_output"].detach(
).cpu().numpy()[0]
outputs[key] = framewise_outputs
key = list(outputs.keys())[0]
framewise_outputs = np.zeros_like(outputs[key], dtype=np.float32)
for key in outputs:
framewise_outputs += ratio[key] * outputs[key]
thresholded = np.zeros_like(framewise_outputs)
for i in range(len(inv_bird_call)):
thresholded[:, i] = framewise_outputs[:, i] >= 0.01
            # thresholded[:, i] = framewise_outputs[:, i] >= thresholds[inv_bird_call[i]]  # uncomment if per-class thresholds are defined
sec_per_frame = PERIOD / thresholded.shape[0]
for target_idx in range(thresholded.shape[1]):
if thresholded[:, target_idx].mean() == 0:
pass
else:
detected = np.argwhere(thresholded[:, target_idx]).reshape(-1)
head_idx = 0
tail_idx = 0
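                # walk through the detected frame indices and split them into
                # contiguous runs; each run becomes one estimated event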
while True:
if (tail_idx + 1 == len(detected)) or (
detected[tail_idx + 1] -
detected[tail_idx] != 1):
onset = sec_per_frame * detected[
head_idx] + global_time
offset = sec_per_frame * detected[
tail_idx] + global_time
onset_idx = detected[head_idx]
offset_idx = detected[tail_idx]
                        # offset_idx + 1 avoids an empty slice when the event lasts a single frame
                        max_confidence = framewise_outputs[
                            onset_idx:offset_idx + 1, target_idx].max()
                        mean_confidence = framewise_outputs[
                            onset_idx:offset_idx + 1, target_idx].mean()
estimated_event = {
"audio_id": audio_id,
"ebird_code": inv_bird_call[target_idx],
"onset": onset,
"offset": offset,
"max_confidence": max_confidence,
"mean_confidence": mean_confidence
}
estimated_event_list.append(estimated_event)
head_idx = tail_idx + 1
tail_idx = tail_idx + 1
if head_idx >= len(detected):
break
else:
tail_idx += 1
global_time += PERIOD
prediction_df = pd.DataFrame(estimated_event_list)
return prediction_df
def prediction(test_df: pd.DataFrame,
test_audio: Path,
models):
""""given the pass of a folder containing audios and a csv corresponding, it returns a prediction for each audio which need a postprocess"""
unique_audio_id = test_df.audio_id.unique()
warnings.filterwarnings("ignore")
prediction_dfs = []
for audio_id in unique_audio_id:
clip_path = test_audio + audio_id
test_df_for_audio_id = test_df.query(
f"audio_id == '{audio_id}'").reset_index(drop=True)
with timer(f"Prediction & load on {audio_id}"):
prediction_df = prediction_for_clip(test_df_for_audio_id,
clip=clip_path,
models=models)
prediction_dfs.append(prediction_df)
prediction_df = pd.concat(prediction_dfs, axis=0,
sort=False).reset_index(drop=True)
return prediction_df
def postproc(prediction_df, test):
"""Make the postprocessing"""
labels = {}
for audio_id, sub_df in prediction_df.groupby("audio_id"):
events = sub_df[["ebird_code", "mean_confidence"]].values
n_events = len(events)
bird_max_conf = np.max(events[:, 1])
for i in range(n_events):
if events[i][1] == bird_max_conf:
row_id = f"{audio_id}"
bird = events[i][0]
                labels[row_id] = {bird}
for key in labels:
labels[key] = " ".join(sorted(list(labels[key])))
row_ids = list(labels.keys())
birds = list(labels.values())
post_processed = pd.DataFrame({
"audio_id": row_ids,
"birds": birds})
all_row_id = test[["audio_id"]]
submission = all_row_id.merge(post_processed, on="audio_id", how="left")
submission = submission.fillna("nocall")
return submission
if __name__ == '__main__':
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
for model in models: # load the model
num_ftrs = models[model].fc1.in_features
models[model].att_block = AttBlock(
num_ftrs, len(inv_bird_call), activation="sigmoid")
        models[model].load_state_dict(torch.load(
            'weights_trained/' + model + '.pth', map_location=device))
models[model].to(device)
test = pd.read_csv("input/test.csv")
test_audio = "input/audio_files/"
test["audio_id"] = test["audio_id"].map(str)
prediction_df = prediction(
test_df=test, test_audio=test_audio, models=models)
    if not prediction_df.empty:
        submission = postproc(prediction_df, test)
    else:
        # nothing detected in any clip: label every audio "nocall"
        submission = test[["audio_id"]].copy()
        submission["birds"] = "nocall"
    submission.to_csv("output/submission.csv", index=False)
print(f"all done in {all_time_duration:.2f} s")
from src.preproc import transform_all_images
if __name__ == '__main__':
# Make the preprocessing and save .png in the train/temp/ folder
transform_all_images('train/data_training/', 'audio_files/', 'birds.csv')
anaconda-client==1.11.1
anaconda-project==0.11.1
appdirs==1.4.4
astroid==2.15.3
asttokens==2.2.1
attrs==22.1.0
audioread==3.0.0
backcall==0.2.0
backports.functools-lru-cache==1.6.4
backports.tempfile==1.0
backports.weakref==1.0.post1
beautifulsoup4==4.11.1
boltons==23.0.0
Bottleneck==1.3.7
brotlipy==0.7.0
certifi==2022.9.24
cffi==1.15.1
chardet==4.0.0
charset-normalizer==2.0.4
click==8.0.4
clyent==1.2.2
colorama==0.4.6
comm==0.1.3
conda-content-trust==0.1.3
conda-pack==0.6.0
conda-package-handling==2.0.2
conda_package_streaming==0.7.0
conda-repo-cli==1.0.41
conda-verify==3.4.2
contourpy==1.0.7
cryptography==39.0.1
cycler==0.11.0
debugpy==1.6.6
decorator==5.1.1
defusedxml==0.7.1
dill==0.3.6
executing==1.2.0
fastjsonschema==2.16.2
fastprogress==1.0.0
filelock==3.9.0
flit_core==3.8.0
fonttools==4.39.3
future==0.18.3
glob2==0.7
idna==3.4
importlib-metadata==6.1.0
ipykernel==6.15.0
ipython==8.11.0
isort==5.12.0
jedi==0.18.2
Jinja2==3.1.2
joblib==1.2.0
jsonschema==4.17.3
jupyter_client==8.1.0
jupyter_core==5.3.0
keyboard==0.13.5
kiwisolver==1.4.4
lazy_loader==0.2
lazy-object-proxy==1.9.0
libarchive-c==2.9
librosa==0.10.0.post2
llvmlite==0.39.1
MarkupSafe==2.1.1
matplotlib==3.7.1
matplotlib-inline==0.1.6
mccabe==0.7.0
menuinst==1.4.19
mkl-fft==1.3.1
mkl-random==1.2.2
mkl-service==2.4.0
mpmath==1.2.1
msgpack==1.0.5
nbformat==5.7.0
nest-asyncio==1.5.6
networkx==2.8.4
numba==0.56.4
numexpr==2.8.4
numpy==1.23.5
packaging==23.0
panda==0.3.1
pandas==1.5.3
parso==0.8.3
pathlib==1.0.1
pickleshare==0.7.5
Pillow==9.4.0
pip==22.3.1
pkginfo==1.9.6
platformdirs==3.2.0
pluggy==1.0.0
ply==3.11
pooch==1.6.0
prompt-toolkit==3.0.38
psutil==5.9.0
pure-eval==0.2.2
pycosat==0.6.4
pycparser==2.21
Pygments==2.14.0
PyJWT==2.4.0
pylint==2.17.2
pyOpenSSL==23.0.0
pyparsing==3.0.9
PyQt5==5.15.7
PyQt5-sip==12.11.0
pyrsistent==0.18.0
PySocks==1.7.1
PySoundFile==0.9.0.post1
python-dateutil==2.8.2
pytz==2022.7
pywin32==306
PyYAML==6.0
pyzmq==25.0.2
QtPy==2.2.0
requests==2.28.1
resampy==0.4.2
ruamel.yaml==0.17.21
ruamel.yaml.clib==0.2.6
ruamel-yaml-conda==0.17.21
scikit-learn==1.2.2
scipy==1.10.1
setuptools==65.6.3
sip==6.6.2
six==1.16.0
soundfile==0.12.1
soupsieve==2.3.2.post1
soxr==0.3.4
stack-data==0.6.2
sympy==1.11.1
threadpoolctl==3.1.0
toml==0.10.2
tomli==2.0.1
tomlkit==0.11.7
toolz==0.12.0
torch==2.0.0
torchaudio==2.0.0
torchsummary==1.5.1
torchvision==0.15.0
tornado==6.2
tqdm==4.65.0
traitlets==5.7.1
typing_extensions==4.4.0
ujson==5.4.0
urllib3==1.26.14
wcwidth==0.2.6
wheel==0.38.4
win-inet-pton==1.1.0
wincertstore==0.2
wrapt==1.15.0
zipp==3.15.0
zstandard==0.19.0
import math
import cv2
import audioread
import collections
import logging
import os
import random
import re
import time
import warnings
import librosa
import numpy as np
import pandas as pd
import soundfile as sf
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
from contextlib import contextmanager
from functools import partial
from pathlib import Path
from typing import Optional
from fastprogress import progress_bar
from sklearn.metrics import f1_score
from torch.nn import Conv2d, Module, Linear, BatchNorm2d, ReLU
from torch.nn.modules.utils import _pair
import torch.optim as optim
def init_layer(layer):
nn.init.xavier_uniform_(layer.weight)
if hasattr(layer, "bias"):
if layer.bias is not None:
layer.bias.data.fill_(0.)
def init_bn(bn):
bn.bias.data.fill_(0.)
bn.weight.data.fill_(1.0)
def interpolate(x: torch.Tensor, ratio: int):
"""Interpolate data in time domain. This is used to compensate the
resolution reduction in downsampling of a CNN.
Args:
x: (batch_size, time_steps, classes_num)
ratio: int, ratio to interpolate
Returns:
upsampled: (batch_size, time_steps * ratio, classes_num)
"""
(batch_size, time_steps, classes_num) = x.shape
upsampled = x[:, :, None, :].repeat(1, 1, ratio, 1)
upsampled = upsampled.reshape(batch_size, time_steps * ratio, classes_num)
return upsampled
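# e.g. interpolate(x, 3) turns a (1, 4, n) tensor into (1, 12, n): every frame is repeated 3 times.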
def pad_framewise_output(framewise_output: torch.Tensor, frames_num: int):
"""Pad framewise_output to the same length as input frames. The pad value
is the same as the value of the last frame.
Args:
framewise_output: (batch_size, frames_num, classes_num)
    frames_num: int, target number of frames after padding
Outputs:
output: (batch_size, frames_num, classes_num)
"""
    # padding tensor: the last frame repeated until frames_num is reached
    pad = framewise_output[:, -1:, :].repeat(
        1, frames_num - framewise_output.shape[1], 1)
    # (batch_size, frames_num, classes_num)
    output = torch.cat((framewise_output, pad), dim=1)
    return output
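# e.g. a (1, 10, n) framewise output padded to frames_num=12 gets its last frame repeated twice.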
class ConvBlock(nn.Module):
def __init__(self, in_channels: int, out_channels: int):
super().__init__()
self.conv1 = nn.Conv2d(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=(3, 3),
stride=(1, 1),
padding=(1, 1),
bias=False)
self.conv2 = nn.Conv2d(
in_channels=out_channels,
out_channels=out_channels,
kernel_size=(3, 3),
stride=(1, 1),
padding=(1, 1),
bias=False)
self.bn1 = nn.BatchNorm2d(out_channels)
self.bn2 = nn.BatchNorm2d(out_channels)
self.init_weight()
def init_weight(self):
init_layer(self.conv1)
init_layer(self.conv2)
init_bn(self.bn1)
init_bn(self.bn2)
def forward(self, input, pool_size=(2, 2), pool_type='avg'):
x = input
x = F.relu_(self.bn1(self.conv1(x)))
x = F.relu_(self.bn2(self.conv2(x)))
if pool_type == 'max':
x = F.max_pool2d(x, kernel_size=pool_size)
elif pool_type == 'avg':
x = F.avg_pool2d(x, kernel_size=pool_size)
elif pool_type == 'avg+max':
x1 = F.avg_pool2d(x, kernel_size=pool_size)
x2 = F.max_pool2d(x, kernel_size=pool_size)
x = x1 + x2
else:
raise Exception('Incorrect argument!')
return x
class AttBlock(nn.Module):
def __init__(self,
in_features: int,
out_features: int,
activation="linear",
temperature=1.0):
super().__init__()
self.activation = activation
self.temperature = temperature
self.att = nn.Conv1d(
in_channels=in_features,
out_channels=out_features,
kernel_size=1,
stride=1,
padding=0,
bias=True)
self.cla = nn.Conv1d(
in_channels=in_features,
out_channels=out_features,
kernel_size=1,
stride=1,
padding=0,
bias=True)
self.init_weights()
def init_weights(self):
init_layer(self.att)
def forward(self, x):
# x: (n_samples, n_in, n_time)
norm_att = torch.softmax(torch.tanh(self.att(x)), dim=-1)
cla = self.nonlinear_transform(self.cla(x))
x = torch.sum(norm_att * cla, dim=2)
return x, norm_att, cla
def nonlinear_transform(self, x):
if self.activation == 'linear':
return x
elif self.activation == 'sigmoid':
return torch.sigmoid(x)
class SplAtConv2d(Module):
"""Split-Attention Conv2d
"""
def __init__(self, in_channels, channels, kernel_size, stride=(1, 1), padding=(0, 0),
dilation=(1, 1), groups=1, bias=True,
radix=2, reduction_factor=4,
rectify=False, rectify_avg=False, norm_layer=None,
dropblock_prob=0.0, **kwargs):
super(SplAtConv2d, self).__init__()
padding = _pair(padding)
self.rectify = rectify and (padding[0] > 0 or padding[1] > 0)
self.rectify_avg = rectify_avg
inter_channels = max(in_channels*radix//reduction_factor, 32)
self.radix = radix
self.cardinality = groups
self.channels = channels
self.dropblock_prob = dropblock_prob
if self.rectify:
from rfconv import RFConv2d
self.conv = RFConv2d(in_channels, channels*radix, kernel_size, stride, padding, dilation,
groups=groups*radix, bias=bias, average_mode=rectify_avg, **kwargs)
else:
self.conv = Conv2d(in_channels, channels*radix, kernel_size, stride, padding, dilation,
groups=groups*radix, bias=bias, **kwargs)
self.use_bn = norm_layer is not None
if self.use_bn:
self.bn0 = norm_layer(channels*radix)
self.relu = ReLU(inplace=True)
self.fc1 = Conv2d(channels, inter_channels, 1, groups=self.cardinality)
if self.use_bn:
self.bn1 = norm_layer(inter_channels)
self.fc2 = Conv2d(inter_channels, channels*radix,
1, groups=self.cardinality)
if dropblock_prob > 0.0:
self.dropblock = DropBlock2D(dropblock_prob, 3)
self.rsoftmax = rSoftMax(radix, groups)
def forward(self, x):
x = self.conv(x)
if self.use_bn:
x = self.bn0(x)
if self.dropblock_prob > 0.0:
x = self.dropblock(x)
x = self.relu(x)
batch, rchannel = x.shape[:2]
if self.radix > 1:
if torch.__version__ < '1.5':
splited = torch.split(x, int(rchannel//self.radix), dim=1)
else:
splited = torch.split(x, rchannel//self.radix, dim=1)
gap = sum(splited)
else:
gap = x
gap = F.adaptive_avg_pool2d(gap, 1)
gap = self.fc1(gap)
if self.use_bn:
gap = self.bn1(gap)
gap = self.relu(gap)
atten = self.fc2(gap)
atten = self.rsoftmax(atten).view(batch, -1, 1, 1)
if self.radix > 1:
if torch.__version__ < '1.5':
attens = torch.split(atten, int(rchannel//self.radix), dim=1)
else:
attens = torch.split(atten, rchannel//self.radix, dim=1)
out = sum([att*split for (att, split) in zip(attens, splited)])
else:
out = atten * x
return out.contiguous()
class rSoftMax(nn.Module):
def __init__(self, radix, cardinality):
super().__init__()
self.radix = radix
self.cardinality = cardinality
def forward(self, x):
batch = x.size(0)
if self.radix > 1:
x = x.view(batch, self.cardinality, self.radix, -1).transpose(1, 2)
x = F.softmax(x, dim=1)
x = x.reshape(batch, -1)
else:
x = torch.sigmoid(x)
return x
class DropBlock2D(object):
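    # placeholder: instantiating it raises, so dropblock_prob must stay 0.0 in this code base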
def __init__(self, *args, **kwargs):
raise NotImplementedError
class GlobalAvgPool2d(nn.Module):
def __init__(self):
"""Global average pooling over the input's spatial dimensions"""
super(GlobalAvgPool2d, self).__init__()
def forward(self, inputs):
return nn.functional.adaptive_avg_pool2d(inputs, 1).view(inputs.size(0), -1)
class Bottleneck(nn.Module):
"""ResNet Bottleneck
"""
# pylint: disable=unused-argument
expansion = 4
def __init__(self, inplanes, planes, stride=1, downsample=None,
radix=1, cardinality=1, bottleneck_width=64,
avd=False, avd_first=False, dilation=1, is_first=False,
rectified_conv=False, rectify_avg=False,
norm_layer=None, dropblock_prob=0.0, last_gamma=False):
super(Bottleneck, self).__init__()
group_width = int(planes * (bottleneck_width / 64.)) * cardinality
self.conv1 = nn.Conv2d(inplanes, group_width,
kernel_size=1, bias=False)
self.bn1 = norm_layer(group_width)
self.dropblock_prob = dropblock_prob
self.radix = radix
self.avd = avd and (stride > 1 or is_first)
self.avd_first = avd_first
if self.avd:
self.avd_layer = nn.AvgPool2d(3, stride, padding=1)
stride = 1
if dropblock_prob > 0.0:
self.dropblock1 = DropBlock2D(dropblock_prob, 3)
if radix == 1:
self.dropblock2 = DropBlock2D(dropblock_prob, 3)
self.dropblock3 = DropBlock2D(dropblock_prob, 3)
if radix >= 1:
self.conv2 = SplAtConv2d(
group_width, group_width, kernel_size=3,
stride=stride, padding=dilation,
dilation=dilation, groups=cardinality, bias=False,
radix=radix, rectify=rectified_conv,
rectify_avg=rectify_avg,
norm_layer=norm_layer,
dropblock_prob=dropblock_prob)
elif rectified_conv:
from rfconv import RFConv2d
self.conv2 = RFConv2d(
group_width, group_width, kernel_size=3, stride=stride,
padding=dilation, dilation=dilation,
groups=cardinality, bias=False,
average_mode=rectify_avg)
self.bn2 = norm_layer(group_width)
else:
self.conv2 = nn.Conv2d(
group_width, group_width, kernel_size=3, stride=stride,
padding=dilation, dilation=dilation,
groups=cardinality, bias=False)
self.bn2 = norm_layer(group_width)
self.conv3 = nn.Conv2d(
group_width, planes * 4, kernel_size=1, bias=False)
self.bn3 = norm_layer(planes*4)
if last_gamma:
from torch.nn.init import zeros_
zeros_(self.bn3.weight)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.dilation = dilation
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
if self.dropblock_prob > 0.0:
out = self.dropblock1(out)
out = self.relu(out)
if self.avd and self.avd_first:
out = self.avd_layer(out)
out = self.conv2(out)
if self.radix == 0:
out = self.bn2(out)
if self.dropblock_prob > 0.0:
out = self.dropblock2(out)
out = self.relu(out)
if self.avd and not self.avd_first:
out = self.avd_layer(out)
out = self.conv3(out)
out = self.bn3(out)
if self.dropblock_prob > 0.0:
out = self.dropblock3(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class ResNest(nn.Module):
"""ResNet Variants
Parameters
----------
block : Block
Class for the residual block. Options are BasicBlockV1, BottleneckV1.
layers : list of int
Numbers of layers in each block
classes : int, default 1000
Number of classification classes.
dilated : bool, default False
Applying dilation strategy to pretrained ResNet yielding a stride-8 model,
typically used in Semantic Segmentation.
    norm_layer : object
        Normalization layer used in the backbone network (default: :class:`torch.nn.BatchNorm2d`;
        a synchronized cross-GPU BatchNormalization can be substituted).
Reference:
- He, Kaiming, et al. "Deep residual learning for image recognition." Proceedings of the IEEE conference on computer vision and pattern recognition. 2016.
- Yu, Fisher, and Vladlen Koltun. "Multi-scale context aggregation by dilated convolutions."
"""
# pylint: disable=unused-variable
def __init__(self, block, layers, radix=1, groups=1, bottleneck_width=64,
num_classes=1000, dilated=False, dilation=1,
deep_stem=False, stem_width=64, avg_down=False,
rectified_conv=False, rectify_avg=False,
avd=False, avd_first=False,
final_drop=0.0, dropblock_prob=0,
last_gamma=False, norm_layer=nn.BatchNorm2d):
self.cardinality = groups
self.bottleneck_width = bottleneck_width
# ResNet-D params
self.inplanes = stem_width*2 if deep_stem else 64
self.avg_down = avg_down
self.last_gamma = last_gamma
# ResNeSt params
self.radix = radix
self.avd = avd
self.avd_first = avd_first
super(ResNest, self).__init__()
self.rectified_conv = rectified_conv
self.rectify_avg = rectify_avg
if rectified_conv:
from rfconv import RFConv2d
conv_layer = RFConv2d
else:
conv_layer = nn.Conv2d
conv_kwargs = {'average_mode': rectify_avg} if rectified_conv else {}
if deep_stem:
self.conv1 = nn.Sequential(
conv_layer(3, stem_width, kernel_size=3, stride=2,
padding=1, bias=False, **conv_kwargs),
norm_layer(stem_width),
nn.ReLU(inplace=True),
conv_layer(stem_width, stem_width, kernel_size=3,
stride=1, padding=1, bias=False, **conv_kwargs),
norm_layer(stem_width),
nn.ReLU(inplace=True),
conv_layer(stem_width, stem_width*2, kernel_size=3,
stride=1, padding=1, bias=False, **conv_kwargs),
)
else:
self.conv1 = conv_layer(3, 64, kernel_size=7, stride=2, padding=3,
bias=False, **conv_kwargs)
self.bn1 = norm_layer(self.inplanes)
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.layer1 = self._make_layer(
block, 64, layers[0], norm_layer=norm_layer, is_first=False)
self.layer2 = self._make_layer(
block, 128, layers[1], stride=2, norm_layer=norm_layer)
if dilated or dilation == 4:
self.layer3 = self._make_layer(block, 256, layers[2], stride=1,
dilation=2, norm_layer=norm_layer,
dropblock_prob=dropblock_prob)
self.layer4 = self._make_layer(block, 512, layers[3], stride=1,
dilation=4, norm_layer=norm_layer,
dropblock_prob=dropblock_prob)
elif dilation == 2:
self.layer3 = self._make_layer(block, 256, layers[2], stride=2,
dilation=1, norm_layer=norm_layer,
dropblock_prob=dropblock_prob)
self.layer4 = self._make_layer(block, 512, layers[3], stride=1,
dilation=2, norm_layer=norm_layer,
dropblock_prob=dropblock_prob)
else:
self.layer3 = self._make_layer(block, 256, layers[2], stride=2,
norm_layer=norm_layer,
dropblock_prob=dropblock_prob)
self.layer4 = self._make_layer(block, 512, layers[3], stride=2,
norm_layer=norm_layer,
dropblock_prob=dropblock_prob)
self.avgpool = GlobalAvgPool2d()
self.drop = nn.Dropout(final_drop) if final_drop > 0.0 else None
self.fc = nn.Linear(512 * block.expansion, num_classes)
for m in self.modules():
if isinstance(m, nn.Conv2d):
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
m.weight.data.normal_(0, math.sqrt(2. / n))
elif isinstance(m, norm_layer):
m.weight.data.fill_(1)
m.bias.data.zero_()
def _make_layer(self, block, planes, blocks, stride=1, dilation=1, norm_layer=None,
dropblock_prob=0.0, is_first=True):
downsample = None
if stride != 1 or self.inplanes != planes * block.expansion:
down_layers = []
if self.avg_down:
if dilation == 1:
down_layers.append(nn.AvgPool2d(kernel_size=stride, stride=stride,
ceil_mode=True, count_include_pad=False))
else:
down_layers.append(nn.AvgPool2d(kernel_size=1, stride=1,
ceil_mode=True, count_include_pad=False))
down_layers.append(nn.Conv2d(self.inplanes, planes * block.expansion,
kernel_size=1, stride=1, bias=False))
else:
down_layers.append(nn.Conv2d(self.inplanes, planes * block.expansion,
kernel_size=1, stride=stride, bias=False))
down_layers.append(norm_layer(planes * block.expansion))
downsample = nn.Sequential(*down_layers)
layers = []
if dilation == 1 or dilation == 2:
layers.append(block(self.inplanes, planes, stride, downsample=downsample,
radix=self.radix, cardinality=self.cardinality,
bottleneck_width=self.bottleneck_width,
avd=self.avd, avd_first=self.avd_first,
dilation=1, is_first=is_first, rectified_conv=self.rectified_conv,
rectify_avg=self.rectify_avg,
norm_layer=norm_layer, dropblock_prob=dropblock_prob,
last_gamma=self.last_gamma))
elif dilation == 4:
layers.append(block(self.inplanes, planes, stride, downsample=downsample,
radix=self.radix, cardinality=self.cardinality,
bottleneck_width=self.bottleneck_width,
avd=self.avd, avd_first=self.avd_first,
dilation=2, is_first=is_first, rectified_conv=self.rectified_conv,
rectify_avg=self.rectify_avg,
norm_layer=norm_layer, dropblock_prob=dropblock_prob,
last_gamma=self.last_gamma))
else:
raise RuntimeError("=> unknown dilation size: {}".format(dilation))
self.inplanes = planes * block.expansion
for i in range(1, blocks):
layers.append(block(self.inplanes, planes,
radix=self.radix, cardinality=self.cardinality,
bottleneck_width=self.bottleneck_width,
avd=self.avd, avd_first=self.avd_first,
dilation=dilation, rectified_conv=self.rectified_conv,
rectify_avg=self.rectify_avg,
norm_layer=norm_layer, dropblock_prob=dropblock_prob,
last_gamma=self.last_gamma))
return nn.Sequential(*layers)
def forward(self, x):
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
x = self.avgpool(x)
# x = x.view(x.size(0), -1)
x = torch.flatten(x, 1)
if self.drop:
x = self.drop(x)
x = self.fc(x)
multilabel_proba = torch.sigmoid(x)
multiclass_proba = torch.softmax(x, dim=1)
return {
"logits": x,
"multilabel_proba": multilabel_proba,
"multiclass_proba": multiclass_proba
}
# An ordinary implementation of Swish function
class Swish(nn.Module):
def forward(self, x):
return x * torch.sigmoid(x)
# A memory-efficient implementation of Swish function
class SwishImplementation(torch.autograd.Function):
@staticmethod
def forward(ctx, i):
result = i * torch.sigmoid(i)
ctx.save_for_backward(i)
return result
@staticmethod
def backward(ctx, grad_output):
i = ctx.saved_tensors[0]
sigmoid_i = torch.sigmoid(i)
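        # d/dx [x * sigmoid(x)] = sigmoid(x) * (1 + x * (1 - sigmoid(x)))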
return grad_output * (sigmoid_i * (1 + i * (1 - sigmoid_i)))
class MemoryEfficientSwish(nn.Module):
def forward(self, x):
return SwishImplementation.apply(x)
def round_filters(filters, global_params):
"""Calculate and round number of filters based on width multiplier.
Use width_coefficient, depth_divisor and min_depth of global_params.
Args:
filters (int): Filters number to be calculated.
global_params (namedtuple): Global params of the model.
Returns:
new_filters: New filters number after calculating.
"""
multiplier = global_params.width_coefficient
if not multiplier:
return filters
# TODO: modify the params names.
# maybe the names (width_divisor,min_width)
# are more suitable than (depth_divisor,min_depth).
divisor = global_params.depth_divisor
min_depth = global_params.min_depth
filters *= multiplier
min_depth = min_depth or divisor # pay attention to this line when using min_depth
# follow the formula transferred from official TensorFlow implementation
new_filters = max(min_depth, int(
filters + divisor / 2) // divisor * divisor)
if new_filters < 0.9 * filters: # prevent rounding by more than 10%
new_filters += divisor
return int(new_filters)
def round_repeats(repeats, global_params):
"""Calculate module's repeat number of a block based on depth multiplier.
Use depth_coefficient of global_params.
Args:
repeats (int): num_repeat to be calculated.
global_params (namedtuple): Global params of the model.
Returns:
new repeat: New repeat number after calculating.
"""
multiplier = global_params.depth_coefficient
if not multiplier:
return repeats
# follow the formula transferred from official TensorFlow implementation
return int(math.ceil(multiplier * repeats))
def drop_connect(inputs, p, training):
"""Drop connect.
Args:
input (tensor: BCWH): Input of this structure.
p (float: 0.0~1.0): Probability of drop connection.
training (bool): The running mode.
Returns:
output: Output after drop connection.
"""
assert p >= 0 and p <= 1, 'p must be in range of [0,1]'
if not training:
return inputs
batch_size = inputs.shape[0]
keep_prob = 1 - p
# generate binary_tensor mask according to probability (p for 0, 1-p for 1)
random_tensor = keep_prob
random_tensor += torch.rand([batch_size, 1, 1, 1],
dtype=inputs.dtype, device=inputs.device)
binary_tensor = torch.floor(random_tensor)
output = inputs / keep_prob * binary_tensor
return output
def get_width_and_height_from_size(x):
"""Obtain height and width from x.
Args:
x (int, tuple or list): Data size.
Returns:
size: A tuple or list (H,W).
"""
if isinstance(x, int):
return x, x
if isinstance(x, list) or isinstance(x, tuple):
return x
else:
raise TypeError()
def calculate_output_image_size(input_image_size, stride):
"""Calculates the output image size when using Conv2dSamePadding with a stride.
Necessary for static padding. Thanks to mannatsingh for pointing this out.
Args:
input_image_size (int, tuple or list): Size of input image.
stride (int, tuple or list): Conv2d operation's stride.
Returns:
output_image_size: A list [H,W].
"""
if input_image_size is None:
return None
image_height, image_width = get_width_and_height_from_size(
input_image_size)
stride = stride if isinstance(stride, int) else stride[0]
image_height = int(math.ceil(image_height / stride))
image_width = int(math.ceil(image_width / stride))
return [image_height, image_width]
# Note:
# The following 'SamePadding' functions make output size equal ceil(input size/stride).
# Only when stride equals 1, can the output size be the same as input size.
# Don't be confused by their function names!
def get_same_padding_conv2d(image_size=None):
"""Chooses static padding if you have specified an image size, and dynamic padding otherwise.
Static padding is necessary for ONNX exporting of models.
Args:
image_size (int or tuple): Size of the image.
Returns:
Conv2dDynamicSamePadding or Conv2dStaticSamePadding.
"""
if image_size is None:
return Conv2dDynamicSamePadding
else:
return partial(Conv2dStaticSamePadding, image_size=image_size)
class Conv2dDynamicSamePadding(nn.Conv2d):
"""2D Convolutions like TensorFlow, for a dynamic image size.
The padding is operated in forward function by calculating dynamically.
"""
# Tips for 'SAME' mode padding.
# Given the following:
# i: width or height
# s: stride
# k: kernel size
# d: dilation
# p: padding
# Output after Conv2d:
# o = floor((i+p-((k-1)*d+1))/s+1)
# If o equals i, i = floor((i+p-((k-1)*d+1))/s+1),
# => p = (i-1)*s+((k-1)*d+1)-i
def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, groups=1, bias=True):
super().__init__(in_channels, out_channels,
kernel_size, stride, 0, dilation, groups, bias)
self.stride = self.stride if len(self.stride) == 2 else [
self.stride[0]] * 2
def forward(self, x):
ih, iw = x.size()[-2:]
kh, kw = self.weight.size()[-2:]
sh, sw = self.stride
# change the output size according to stride ! ! !
oh, ow = math.ceil(ih / sh), math.ceil(iw / sw)
pad_h = max((oh - 1) * self.stride[0] +
(kh - 1) * self.dilation[0] + 1 - ih, 0)
pad_w = max((ow - 1) * self.stride[1] +
(kw - 1) * self.dilation[1] + 1 - iw, 0)
if pad_h > 0 or pad_w > 0:
x = F.pad(x, [pad_w // 2, pad_w - pad_w //
2, pad_h // 2, pad_h - pad_h // 2])
return F.conv2d(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups)
class Conv2dStaticSamePadding(nn.Conv2d):
"""2D Convolutions like TensorFlow's 'SAME' mode, with the given input image size.
The padding mudule is calculated in construction function, then used in forward.
"""
# With the same calculation as Conv2dDynamicSamePadding
def __init__(self, in_channels, out_channels, kernel_size, stride=1, image_size=None, **kwargs):
super().__init__(in_channels, out_channels, kernel_size, stride, **kwargs)
self.stride = self.stride if len(self.stride) == 2 else [
self.stride[0]] * 2
# Calculate padding based on image size and save it
assert image_size is not None
ih, iw = (image_size, image_size) if isinstance(
image_size, int) else image_size
kh, kw = self.weight.size()[-2:]
sh, sw = self.stride
oh, ow = math.ceil(ih / sh), math.ceil(iw / sw)
pad_h = max((oh - 1) * self.stride[0] +
(kh - 1) * self.dilation[0] + 1 - ih, 0)
pad_w = max((ow - 1) * self.stride[1] +
(kw - 1) * self.dilation[1] + 1 - iw, 0)
if pad_h > 0 or pad_w > 0:
self.static_padding = nn.ZeroPad2d(
(pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2))
else:
self.static_padding = Identity()
def forward(self, x):
x = self.static_padding(x)
x = F.conv2d(x, self.weight, self.bias, self.stride,
self.padding, self.dilation, self.groups)
return x
def get_same_padding_maxPool2d(image_size=None):
"""Chooses static padding if you have specified an image size, and dynamic padding otherwise.
Static padding is necessary for ONNX exporting of models.
Args:
image_size (int or tuple): Size of the image.
Returns:
MaxPool2dDynamicSamePadding or MaxPool2dStaticSamePadding.
"""
if image_size is None:
return MaxPool2dDynamicSamePadding
else:
return partial(MaxPool2dStaticSamePadding, image_size=image_size)
class MaxPool2dDynamicSamePadding(nn.MaxPool2d):
"""2D MaxPooling like TensorFlow's 'SAME' mode, with a dynamic image size.
The padding is operated in forward function by calculating dynamically.
"""
def __init__(self, kernel_size, stride, padding=0, dilation=1, return_indices=False, ceil_mode=False):
super().__init__(kernel_size, stride, padding, dilation, return_indices, ceil_mode)
        self.stride = [self.stride] * 2 if isinstance(self.stride, int) else self.stride
        self.kernel_size = [self.kernel_size] * 2 if isinstance(self.kernel_size, int) else self.kernel_size
        self.dilation = [self.dilation] * 2 if isinstance(self.dilation, int) else self.dilation
def forward(self, x):
ih, iw = x.size()[-2:]
kh, kw = self.kernel_size
sh, sw = self.stride
oh, ow = math.ceil(ih / sh), math.ceil(iw / sw)
pad_h = max((oh - 1) * self.stride[0] +
(kh - 1) * self.dilation[0] + 1 - ih, 0)
pad_w = max((ow - 1) * self.stride[1] +
(kw - 1) * self.dilation[1] + 1 - iw, 0)
if pad_h > 0 or pad_w > 0:
x = F.pad(x, [pad_w // 2, pad_w - pad_w //
2, pad_h // 2, pad_h - pad_h // 2])
return F.max_pool2d(x, self.kernel_size, self.stride, self.padding,
self.dilation, self.ceil_mode, self.return_indices)
class MaxPool2dStaticSamePadding(nn.MaxPool2d):
"""2D MaxPooling like TensorFlow's 'SAME' mode, with the given input image size.
The padding mudule is calculated in construction function, then used in forward.
"""
def __init__(self, kernel_size, stride, image_size=None, **kwargs):
super().__init__(kernel_size, stride, **kwargs)
        self.stride = [self.stride] * 2 if isinstance(self.stride, int) else self.stride
        self.kernel_size = [self.kernel_size] * 2 if isinstance(self.kernel_size, int) else self.kernel_size
        self.dilation = [self.dilation] * 2 if isinstance(self.dilation, int) else self.dilation
# Calculate padding based on image size and save it
assert image_size is not None
ih, iw = (image_size, image_size) if isinstance(
image_size, int) else image_size
kh, kw = self.kernel_size
sh, sw = self.stride
oh, ow = math.ceil(ih / sh), math.ceil(iw / sw)
pad_h = max((oh - 1) * self.stride[0] +
(kh - 1) * self.dilation[0] + 1 - ih, 0)
pad_w = max((ow - 1) * self.stride[1] +
(kw - 1) * self.dilation[1] + 1 - iw, 0)
if pad_h > 0 or pad_w > 0:
self.static_padding = nn.ZeroPad2d(
(pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2))
else:
self.static_padding = Identity()
def forward(self, x):
x = self.static_padding(x)
x = F.max_pool2d(x, self.kernel_size, self.stride, self.padding,
self.dilation, self.ceil_mode, self.return_indices)
return x
class Identity(nn.Module):
"""Identity mapping.
Send input to output directly.
"""
def __init__(self):
super(Identity, self).__init__()
def forward(self, input):
return input
def efficientnet_params(model_name):
"""Map EfficientNet model name to parameter coefficients.
Args:
model_name (str): Model name to be queried.
Returns:
params_dict[model_name]: A (width,depth,res,dropout) tuple.
"""
params_dict = {
# Coefficients: width,depth,res,dropout
'efficientnet-b0': (1.0, 1.0, 224, 0.2),
'efficientnet-b1': (1.0, 1.1, 240, 0.2),
'efficientnet-b2': (1.1, 1.2, 260, 0.3),
'efficientnet-b3': (1.2, 1.4, 300, 0.3),
'efficientnet-b4': (1.4, 1.8, 380, 0.4),
'efficientnet-b5': (1.6, 2.2, 456, 0.4),
'efficientnet-b6': (1.8, 2.6, 528, 0.5),
'efficientnet-b7': (2.0, 3.1, 600, 0.5),
'efficientnet-b8': (2.2, 3.6, 672, 0.5),
'efficientnet-l2': (4.3, 5.3, 800, 0.5),
}
return params_dict[model_name]
def get_model_params(model_name, override_params):
"""Get the block args and global params for a given model name.
Args:
model_name (str): Model's name.
override_params (dict): A dict to modify global_params.
Returns:
blocks_args, global_params
"""
if model_name.startswith('efficientnet'):
w, d, s, p = efficientnet_params(model_name)
# note: all models have drop connect rate = 0.2
blocks_args, global_params = efficientnet(
width_coefficient=w, depth_coefficient=d, dropout_rate=p, image_size=s)
else:
raise NotImplementedError(
'model name is not pre-defined: %s' % model_name)
if override_params:
# ValueError will be raised here if override_params has fields not included in global_params.
global_params = global_params._replace(**override_params)
return blocks_args, global_params
class BlockDecoder(object):
"""Block Decoder for readability,
straight from the official TensorFlow repository.
"""
@staticmethod
def _decode_block_string(block_string):
"""Get a block through a string notation of arguments.
Args:
block_string (str): A string notation of arguments.
Examples: 'r1_k3_s11_e1_i32_o16_se0.25_noskip'.
Returns:
BlockArgs: The namedtuple defined at the top of this file.
"""
assert isinstance(block_string, str)
ops = block_string.split('_')
options = {}
for op in ops:
splits = re.split(r'(\d.*)', op)
if len(splits) >= 2:
key, value = splits[:2]
options[key] = value
# Check stride
assert (('s' in options and len(options['s']) == 1) or
(len(options['s']) == 2 and options['s'][0] == options['s'][1]))
return BlockArgs(
num_repeat=int(options['r']),
kernel_size=int(options['k']),
stride=[int(options['s'][0])],
expand_ratio=int(options['e']),
input_filters=int(options['i']),
output_filters=int(options['o']),
se_ratio=float(options['se']) if 'se' in options else None,
id_skip=('noskip' not in block_string))
@staticmethod
def _encode_block_string(block):
"""Encode a block to a string.
Args:
block (namedtuple): A BlockArgs type argument.
Returns:
block_string: A String form of BlockArgs.
"""
args = [
'r%d' % block.num_repeat,
'k%d' % block.kernel_size,
's%d%d' % (block.strides[0], block.strides[1]),
'e%s' % block.expand_ratio,
'i%d' % block.input_filters,
'o%d' % block.output_filters
]
if 0 < block.se_ratio <= 1:
args.append('se%s' % block.se_ratio)
if block.id_skip is False:
args.append('noskip')
return '_'.join(args)
@staticmethod
def decode(string_list):
"""Decode a list of string notations to specify blocks inside the network.
Args:
string_list (list[str]): A list of strings, each string is a notation of block.
Returns:
blocks_args: A list of BlockArgs namedtuples of block args.
"""
assert isinstance(string_list, list)
blocks_args = []
for block_string in string_list:
blocks_args.append(BlockDecoder._decode_block_string(block_string))
return blocks_args
@staticmethod
def encode(blocks_args):
"""Encode a list of BlockArgs to a list of strings.
Args:
blocks_args (list[namedtuples]): A list of BlockArgs namedtuples of block args.
Returns:
block_strings: A list of strings, each string is a notation of block.
"""
block_strings = []
for block in blocks_args:
block_strings.append(BlockDecoder._encode_block_string(block))
return block_strings
def efficientnet(width_coefficient=None, depth_coefficient=None, image_size=None,
dropout_rate=0.2, drop_connect_rate=0.2, num_classes=1000):
"""Create BlockArgs and GlobalParams for efficientnet model.
Args:
width_coefficient (float)
depth_coefficient (float)
image_size (int)
dropout_rate (float)
drop_connect_rate (float)
num_classes (int)
Meaning as the name suggests.
Returns:
blocks_args, global_params.
"""
# Blocks args for the whole model(efficientnet-b0 by default)
# It will be modified in the construction of EfficientNet Class according to model
blocks_args = [
'r1_k3_s11_e1_i32_o16_se0.25',
'r2_k3_s22_e6_i16_o24_se0.25',
'r2_k5_s22_e6_i24_o40_se0.25',
'r3_k3_s22_e6_i40_o80_se0.25',
'r3_k5_s11_e6_i80_o112_se0.25',
'r4_k5_s22_e6_i112_o192_se0.25',
'r1_k3_s11_e6_i192_o320_se0.25',
]
blocks_args = BlockDecoder.decode(blocks_args)
global_params = GlobalParams(
width_coefficient=width_coefficient,
depth_coefficient=depth_coefficient,
image_size=image_size,
dropout_rate=dropout_rate,
num_classes=num_classes,
batch_norm_momentum=0.99,
batch_norm_epsilon=1e-3,
drop_connect_rate=drop_connect_rate,
depth_divisor=8,
min_depth=None,
)
return blocks_args, global_params
GlobalParams = collections.namedtuple('GlobalParams', [
'width_coefficient', 'depth_coefficient', 'image_size', 'dropout_rate',
'num_classes', 'batch_norm_momentum', 'batch_norm_epsilon',
'drop_connect_rate', 'depth_divisor', 'min_depth'])
# Parameters for an individual model block
BlockArgs = collections.namedtuple('BlockArgs', [
'num_repeat', 'kernel_size', 'stride', 'expand_ratio',
'input_filters', 'output_filters', 'se_ratio', 'id_skip'])
# Set GlobalParams and BlockArgs's defaults
GlobalParams.__new__.__defaults__ = (None,) * len(GlobalParams._fields)
BlockArgs.__new__.__defaults__ = (None,) * len(BlockArgs._fields)
VALID_MODELS = (
'efficientnet-b0', 'efficientnet-b1', 'efficientnet-b2', 'efficientnet-b3',
'efficientnet-b4', 'efficientnet-b5', 'efficientnet-b6', 'efficientnet-b7',
'efficientnet-b8',
# Support the construction of 'efficientnet-l2' without pretrained weights
'efficientnet-l2'
)
class MBConvBlock(nn.Module):
"""Mobile Inverted Residual Bottleneck Block.
Args:
block_args (namedtuple): BlockArgs, defined in utils.py.
global_params (namedtuple): GlobalParam, defined in utils.py.
image_size (tuple or list): [image_height, image_width].
References:
[1] https://arxiv.org/abs/1704.04861 (MobileNet v1)
[2] https://arxiv.org/abs/1801.04381 (MobileNet v2)
[3] https://arxiv.org/abs/1905.02244 (MobileNet v3)
"""
def __init__(self, block_args, global_params, image_size=None):
super().__init__()
self._block_args = block_args
# pytorch's difference from tensorflow
self._bn_mom = 1 - global_params.batch_norm_momentum
self._bn_eps = global_params.batch_norm_epsilon
self.has_se = (self._block_args.se_ratio is not None) and (
0 < self._block_args.se_ratio <= 1)
# whether to use skip connection and drop connect
self.id_skip = block_args.id_skip
# Expansion phase (Inverted Bottleneck)
inp = self._block_args.input_filters # number of input channels
oup = self._block_args.input_filters * \
self._block_args.expand_ratio # number of output channels
if self._block_args.expand_ratio != 1:
Conv2d = get_same_padding_conv2d(image_size=image_size)
self._expand_conv = Conv2d(
in_channels=inp, out_channels=oup, kernel_size=1, bias=False)
self._bn0 = nn.BatchNorm2d(
num_features=oup, momentum=self._bn_mom, eps=self._bn_eps)
# image_size = calculate_output_image_size(image_size, 1) <-- this wouldn't modify image_size
# Depthwise convolution phase
k = self._block_args.kernel_size
s = self._block_args.stride
Conv2d = get_same_padding_conv2d(image_size=image_size)
self._depthwise_conv = Conv2d(
in_channels=oup, out_channels=oup, groups=oup, # groups makes it depthwise
kernel_size=k, stride=s, bias=False)
self._bn1 = nn.BatchNorm2d(
num_features=oup, momentum=self._bn_mom, eps=self._bn_eps)
image_size = calculate_output_image_size(image_size, s)
# Squeeze and Excitation layer, if desired
if self.has_se:
Conv2d = get_same_padding_conv2d(image_size=(1, 1))
num_squeezed_channels = max(
1, int(self._block_args.input_filters * self._block_args.se_ratio))
self._se_reduce = Conv2d(
in_channels=oup, out_channels=num_squeezed_channels, kernel_size=1)
self._se_expand = Conv2d(
in_channels=num_squeezed_channels, out_channels=oup, kernel_size=1)
# Pointwise convolution phase
final_oup = self._block_args.output_filters
Conv2d = get_same_padding_conv2d(image_size=image_size)
self._project_conv = Conv2d(
in_channels=oup, out_channels=final_oup, kernel_size=1, bias=False)
self._bn2 = nn.BatchNorm2d(
num_features=final_oup, momentum=self._bn_mom, eps=self._bn_eps)
self._swish = MemoryEfficientSwish()
def forward(self, inputs, drop_connect_rate=None):
"""MBConvBlock's forward function.
Args:
inputs (tensor): Input tensor.
drop_connect_rate (bool): Drop connect rate (float, between 0 and 1).
Returns:
Output of this block after processing.
"""
# Expansion and Depthwise Convolution
x = inputs
if self._block_args.expand_ratio != 1:
x = self._expand_conv(inputs)
x = self._bn0(x)
x = self._swish(x)
x = self._depthwise_conv(x)
x = self._bn1(x)
x = self._swish(x)
# Squeeze and Excitation
if self.has_se:
x_squeezed = F.adaptive_avg_pool2d(x, 1)
x_squeezed = self._se_reduce(x_squeezed)
x_squeezed = self._swish(x_squeezed)
x_squeezed = self._se_expand(x_squeezed)
x = torch.sigmoid(x_squeezed) * x
# Pointwise Convolution
x = self._project_conv(x)
x = self._bn2(x)
# Skip connection and drop connect
input_filters, output_filters = self._block_args.input_filters, self._block_args.output_filters
if self.id_skip and self._block_args.stride == 1 and input_filters == output_filters:
# The combination of skip connection and drop connect brings about stochastic depth.
if drop_connect_rate:
x = drop_connect(x, p=drop_connect_rate,
training=self.training)
x = x + inputs # skip connection
return x
def set_swish(self, memory_efficient=True):
"""Sets swish function as memory efficient (for training) or standard (for export).
Args:
memory_efficient (bool): Whether to use memory-efficient version of swish.
"""
self._swish = MemoryEfficientSwish() if memory_efficient else Swish()
class EfficientNet(nn.Module):
"""EfficientNet model.
Most easily loaded with the .from_name or .from_pretrained methods.
Args:
blocks_args (list[namedtuple]): A list of BlockArgs to construct blocks.
global_params (namedtuple): A set of GlobalParams shared between blocks.
References:
[1] https://arxiv.org/abs/1905.11946 (EfficientNet)
Example:
>>> import torch
>>> from efficientnet.model import EfficientNet
>>> inputs = torch.rand(1, 3, 224, 224)
>>> model = EfficientNet.from_pretrained('efficientnet-b0')
>>> model.eval()
>>> outputs = model(inputs)
"""
def __init__(self, blocks_args=None, global_params=None):
super().__init__()
assert isinstance(blocks_args, list), 'blocks_args should be a list'
assert len(blocks_args) > 0, 'block args must be greater than 0'
self._global_params = global_params
self._blocks_args = blocks_args
# Batch norm parameters
bn_mom = 1 - self._global_params.batch_norm_momentum
bn_eps = self._global_params.batch_norm_epsilon
# Get stem static or dynamic convolution depending on image size
image_size = global_params.image_size
Conv2d = get_same_padding_conv2d(image_size=image_size)
# Stem
in_channels = 3 # rgb
# number of output channels
out_channels = round_filters(32, self._global_params)
self._conv_stem = Conv2d(
in_channels, out_channels, kernel_size=3, stride=2, bias=False)
self._bn0 = nn.BatchNorm2d(
num_features=out_channels, momentum=bn_mom, eps=bn_eps)
image_size = calculate_output_image_size(image_size, 2)
# Build blocks
self._blocks = nn.ModuleList([])
for block_args in self._blocks_args:
# Update block input and output filters based on depth multiplier.
block_args = block_args._replace(
input_filters=round_filters(
block_args.input_filters, self._global_params),
output_filters=round_filters(
block_args.output_filters, self._global_params),
num_repeat=round_repeats(
block_args.num_repeat, self._global_params)
)
# The first block needs to take care of stride and filter size increase.
self._blocks.append(MBConvBlock(
block_args, self._global_params, image_size=image_size))
image_size = calculate_output_image_size(
image_size, block_args.stride)
if block_args.num_repeat > 1: # modify block_args to keep same output size
block_args = block_args._replace(
input_filters=block_args.output_filters, stride=1)
for _ in range(block_args.num_repeat - 1):
self._blocks.append(MBConvBlock(
block_args, self._global_params, image_size=image_size))
# image_size = calculate_output_image_size(image_size, block_args.stride) # stride = 1
# Head
in_channels = block_args.output_filters # output of final block
out_channels = round_filters(1280, self._global_params)
Conv2d = get_same_padding_conv2d(image_size=image_size)
self._conv_head = Conv2d(
in_channels, out_channels, kernel_size=1, bias=False)
self._bn1 = nn.BatchNorm2d(
num_features=out_channels, momentum=bn_mom, eps=bn_eps)
# Final linear layer
self._avg_pooling = nn.AdaptiveAvgPool2d(1)
self._dropout = nn.Dropout(self._global_params.dropout_rate)
self._fc = nn.Linear(out_channels, self._global_params.num_classes)
self._swish = MemoryEfficientSwish()
def set_swish(self, memory_efficient=True):
"""Sets swish function as memory efficient (for training) or standard (for export).
Args:
memory_efficient (bool): Whether to use memory-efficient version of swish.
"""
self._swish = MemoryEfficientSwish() if memory_efficient else Swish()
for block in self._blocks:
block.set_swish(memory_efficient)
def extract_endpoints(self, inputs):
"""Use convolution layer to extract features
from reduction levels i in [1, 2, 3, 4, 5].
Args:
inputs (tensor): Input tensor.
Returns:
Dictionary of last intermediate features
with reduction levels i in [1, 2, 3, 4, 5].
Example:
>>> import torch
>>> from efficientnet.model import EfficientNet
>>> inputs = torch.rand(1, 3, 224, 224)
>>> model = EfficientNet.from_pretrained('efficientnet-b0')
>>> endpoints = model.extract_endpoints(inputs)
>>> print(endpoints['reduction_1'].shape) # torch.Size([1, 16, 112, 112])
>>> print(endpoints['reduction_2'].shape) # torch.Size([1, 24, 56, 56])
>>> print(endpoints['reduction_3'].shape) # torch.Size([1, 40, 28, 28])
>>> print(endpoints['reduction_4'].shape) # torch.Size([1, 112, 14, 14])
>>> print(endpoints['reduction_5'].shape) # torch.Size([1, 1280, 7, 7])
"""
endpoints = dict()
# Stem
x = self._swish(self._bn0(self._conv_stem(inputs)))
prev_x = x
# Blocks
for idx, block in enumerate(self._blocks):
drop_connect_rate = self._global_params.drop_connect_rate
if drop_connect_rate:
# scale drop connect_rate
drop_connect_rate *= float(idx) / len(self._blocks)
x = block(x, drop_connect_rate=drop_connect_rate)
if prev_x.size(2) > x.size(2):
endpoints[f'reduction_{len(endpoints)+1}'] = prev_x
prev_x = x
# Head
x = self._swish(self._bn1(self._conv_head(x)))
endpoints[f'reduction_{len(endpoints)+1}'] = x
return endpoints
def extract_features(self, inputs):
"""use convolution layer to extract feature .
Args:
inputs (tensor): Input tensor.
Returns:
Output of the final convolution
layer in the efficientnet model.
"""
# Stem
x = self._swish(self._bn0(self._conv_stem(inputs)))
# Blocks
for idx, block in enumerate(self._blocks):
drop_connect_rate = self._global_params.drop_connect_rate
if drop_connect_rate:
# scale drop connect_rate
drop_connect_rate *= float(idx) / len(self._blocks)
x = block(x, drop_connect_rate=drop_connect_rate)
# Head
x = self._swish(self._bn1(self._conv_head(x)))
return x
def forward(self, inputs):
"""EfficientNet's forward function.
Calls extract_features to extract features, applies final linear layer, and returns logits.
Args:
inputs (tensor): Input tensor.
Returns:
Output of this model after processing.
"""
# Convolution layers
x = self.extract_features(inputs)
# Pooling and final linear layer
x = self._avg_pooling(x)
x = x.flatten(start_dim=1)
x = self._dropout(x)
x = self._fc(x)
return x
@classmethod
def from_name(cls, model_name, in_channels=3, **override_params):
"""create an efficientnet model according to name.
Args:
model_name (str): Name for efficientnet.
in_channels (int): Input data's channel number.
override_params (other key word params):
Params to override model's global_params.
Optional key:
'width_coefficient', 'depth_coefficient',
'image_size', 'dropout_rate',
'num_classes', 'batch_norm_momentum',
'batch_norm_epsilon', 'drop_connect_rate',
'depth_divisor', 'min_depth'
Returns:
An efficientnet model.
"""
cls._check_model_name_is_valid(model_name)
blocks_args, global_params = get_model_params(
model_name, override_params)
model = cls(blocks_args, global_params)
model._change_in_channels(in_channels)
return model
@classmethod
def from_pretrained(cls, model_name, weights_path=None, advprop=False,
in_channels=3, num_classes=1000, **override_params):
"""create an efficientnet model according to name.
Args:
model_name (str): Name for efficientnet.
weights_path (None or str):
str: path to pretrained weights file on the local disk.
None: use pretrained weights downloaded from the Internet.
advprop (bool):
Whether to load pretrained weights
trained with advprop (valid when weights_path is None).
in_channels (int): Input data's channel number.
num_classes (int):
Number of categories for classification.
It controls the output size for final linear layer.
override_params (other key word params):
Params to override model's global_params.
Optional key:
'width_coefficient', 'depth_coefficient',
'image_size', 'dropout_rate',
'num_classes', 'batch_norm_momentum',
'batch_norm_epsilon', 'drop_connect_rate',
'depth_divisor', 'min_depth'
Returns:
A pretrained efficientnet model.
"""
model = cls.from_name(
model_name, num_classes=num_classes, **override_params)
load_pretrained_weights(model, model_name, weights_path=weights_path, load_fc=(
num_classes == 1000), advprop=advprop)
model._change_in_channels(in_channels)
return model
@classmethod
def get_image_size(cls, model_name):
"""Get the input image size for a given efficientnet model.
Args:
model_name (str): Name for efficientnet.
Returns:
Input image size (resolution).
"""
cls._check_model_name_is_valid(model_name)
_, _, res, _ = efficientnet_params(model_name)
return res
@classmethod
def _check_model_name_is_valid(cls, model_name):
"""Validates model name.
Args:
model_name (str): Name for efficientnet.
Returns:
bool: Is a valid name or not.
"""
if model_name not in VALID_MODELS:
raise ValueError('model_name should be one of: ' +
', '.join(VALID_MODELS))
def _change_in_channels(self, in_channels):
"""Adjust model's first convolution layer to in_channels, if in_channels not equals 3.
Args:
in_channels (int): Input data's channel number.
"""
if in_channels != 3:
Conv2d = get_same_padding_conv2d(
image_size=self._global_params.image_size)
out_channels = round_filters(32, self._global_params)
self._conv_stem = Conv2d(
in_channels, out_channels, kernel_size=3, stride=2, bias=False)
class ResNestSED(nn.Module):
def __init__(self, num_classes=264):
super().__init__()
self.interpolate_ratio = 30 # Downsampled ratio
base_model = ResNest(
Bottleneck, [3, 4, 6, 3],
radix=1, groups=1, bottleneck_width=64,
deep_stem=True, stem_width=32, avg_down=True,
avd=True, avd_first=True)
layers = list(base_model.children())[:-2]
self.encoder = nn.Sequential(*layers)
in_features = base_model.fc.in_features
self.fc1 = nn.Linear(in_features, in_features, bias=True)
self.att_block = AttBlock(
in_features, num_classes, activation="sigmoid")
self.init_weight()
def init_weight(self):
init_layer(self.fc1)
def forward(self, input):
frames_num = input.size(3)
# (batch_size, channels, freq, frames)
x = self.encoder(input)
# (batch_size, channels, frames)
x = torch.mean(x, dim=2)
# channel smoothing
x1 = F.max_pool1d(x, kernel_size=3, stride=1, padding=1)
x2 = F.avg_pool1d(x, kernel_size=3, stride=1, padding=1)
x = x1 + x2
x = F.dropout(x, p=0.5, training=self.training)
x = x.transpose(1, 2)
x = F.relu_(self.fc1(x))
x = x.transpose(1, 2)
x = F.dropout(x, p=0.5, training=self.training)
(clipwise_output, norm_att, segmentwise_output) = self.att_block(x)
logit = torch.sum(norm_att * self.att_block.cla(x), dim=2)
segmentwise_output = segmentwise_output.transpose(1, 2)
# Get framewise output
framewise_output = interpolate(segmentwise_output,
self.interpolate_ratio)
framewise_output = pad_framewise_output(framewise_output, frames_num)
output_dict = {
"framewise_output": framewise_output,
"logit": logit,
"clipwise_output": clipwise_output
}
return output_dict
class EfficientNetSED(nn.Module):
def __init__(self, base_model_name: str, pretrained=False,
num_classes=264):
super().__init__()
self.interpolate_ratio = 32 # Downsampled ratio
if pretrained:
self.base_model = EfficientNet.from_pretrained(base_model_name)
else:
self.base_model = EfficientNet.from_name(base_model_name)
in_features = self.base_model._fc.in_features
self.fc1 = nn.Linear(in_features, in_features, bias=True)
self.att_block = AttBlock(
in_features, num_classes, activation="sigmoid")
self.init_weight()
def init_weight(self):
init_layer(self.fc1)
def forward(self, input):
frames_num = input.size(3)
# (batch_size, channels, freq, frames)
x = self.base_model.extract_features(input)
# (batch_size, channels, frames)
x = torch.mean(x, dim=2)
# channel smoothing
x1 = F.max_pool1d(x, kernel_size=3, stride=1, padding=1)
x2 = F.avg_pool1d(x, kernel_size=3, stride=1, padding=1)
x = x1 + x2
x = F.dropout(x, p=0.5, training=self.training)
x = x.transpose(1, 2)
x = F.relu_(self.fc1(x))
x = x.transpose(1, 2)
x = F.dropout(x, p=0.5, training=self.training)
(clipwise_output, _, segmentwise_output) = self.att_block(x)
segmentwise_output = segmentwise_output.transpose(1, 2)
# Get framewise output
framewise_output = interpolate(segmentwise_output,
self.interpolate_ratio)
framewise_output = pad_framewise_output(framewise_output, frames_num)
output_dict = {
"framewise_output": framewise_output,
"segmentwise_output": segmentwise_output,
"clipwise_output": clipwise_output
}
return output_dict
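# Note: unlike ResNestSED, EfficientNetSED exposes "segmentwise_output"
# instead of "logit" in its output dict; train() below relies on this by
# building ImprovedPANNsLoss('segmentwise_output') for the eff_th04 model.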
resnest_model_config = {
"num_classes": 264
}
effnet_model_config = {
"num_classes": 264,
"base_model_name": "efficientnet-b0",
"pretrained": False
}
weights_path = {
"resnest": {
"ref2_th03": "train/weights_pretrained/birdcall-resnest-ema-all-ref2-th03/ema.pth",
"ref2_th04": "train/weights_pretrained/birdcall-resnest-ema-all-ref2-th04/ema.pth",
"ext": "train/weights_pretrained/birdcall-resnest-emta-all-ext-ref2-th04/ema.pth"
},
"effnet": {
"eff_th04": "train/weights_pretrained/birdcall-effnet-b0-ema-all-ref2/ema.pth"
}
}
def get_model(resnest_config: dict, effnet_config: dict, weights_path: dict):
    models = {}
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    for model_key in weights_path:
        path_dict = weights_path[model_key]
        for path_key in path_dict:
            if model_key == "resnest":
                model = ResNestSED(**resnest_config)
            else:
                model = EfficientNetSED(**effnet_config)
            # map_location keeps the load working on CPU-only machines
            checkpoint = torch.load(path_dict[path_key], map_location=device)
            model_state_dict = {}
            for key in checkpoint["model_state_dict"]:
                if key == "n_averaged":
                    continue
                # the checkpoints were saved from wrapped (e.g. EMA/DataParallel) models
                new_key = key.replace("module.", "")
                model_state_dict[new_key] = checkpoint["model_state_dict"][key]
            model.load_state_dict(model_state_dict)
            model.to(device)
            model.eval()
            models[path_key] = model
    return models
def get_optimizer(model: nn.Module, config: dict):
    optimizer_config = config["optimizer"]
    optimizer_name = optimizer_config.get("name")
    return getattr(optim, optimizer_name)(model.parameters(),
                                          **optimizer_config["params"])
def get_scheduler(optimizer, config: dict):
    scheduler_config = config["scheduler"]
    scheduler_name = scheduler_config.get("name")
    if scheduler_name is None:
        return None
    return getattr(optim.lr_scheduler, scheduler_name)(
        optimizer, **scheduler_config["params"])
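# --- Illustrative sketch (not part of the original file) ---
# The dict shapes get_optimizer() / get_scheduler() expect, with the same
# values train() uses below (any other torch.optim name would work too):
_example_training_config = {
    "optimizer": {"name": "Adam", "params": {"lr": 1e-3}},
    "scheduler": {"name": "CosineAnnealingLR", "params": {"T_max": 10}},
}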
models = get_model(resnest_config=resnest_model_config,
effnet_config=effnet_model_config,
weights_path=weights_path)
import librosa
import numpy as np
from pathlib import Path
import shutil
import cv2
import os
from src.utils import normalize_melspec
from fastprogress import progress_bar
# Parameters
TARGET_SR = 32000
melspectrogram_parameters = {
"n_mels": 128,
"fmin": 20,
"fmax": 16000
}
pcen_parameters = {
"gain": 0.98,
"bias": 2,
"power": 0.5,
"time_constant": 0.4,
"eps": 0.000001
}
PERIOD = 30
CHUNK = PERIOD * TARGET_SR
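# i.e. one model input covers 30 s x 32 000 Hz = 960 000 audio samples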
###
def transform_all_images(dirpath: str, sound_file: str, csv_file: str):
    """Convert every audio listed in the csv into a .png spectrogram image,
    split between the train/temp/train and train/temp/val folders."""
    # Reset the temp folders
    shutil.rmtree('train/temp/train', ignore_errors=True)
    os.makedirs('train/temp/train')
    shutil.rmtree('train/temp/val', ignore_errors=True)
    os.makedirs('train/temp/val')
    with open(dirpath + csv_file, "r", encoding='utf-8') as f:
        f.readline()  # skip the header line
        for i, audio_line in enumerate(progress_bar(f.readlines())):
            L = audio_line.split(",")
            id_audio = L[-2]
            id_species = L[1]
            # Create a folder for each species
            os.makedirs('train/temp/train/' + id_species, exist_ok=True)
            os.makedirs('train/temp/val/' + id_species, exist_ok=True)
            image = np.swapaxes(clip_to_image(
                dirpath + sound_file + id_audio, all_chunks=False), 0, 2)
            # Roughly a 70/30 split: 34 of every 50 audios (68%) go to the
            # training set, the remaining 16 (32%) to the validation set
            if i % 50 > 15:
                cv2.imwrite('train/temp/train/' + id_species +
                            '/' + id_audio + '.png', image)
            else:
                cv2.imwrite('train/temp/val/' + id_species +
                            '/' + id_audio + '.png', image)
def preproc(y):
"""return the preprocessing of a clip 'y' """
y_batch = y.astype(np.float32)
if len(y_batch) > 0: # Normalization
max_vol = np.abs(y_batch).max()
if max_vol > 0:
y_batch = np.asfortranarray(y_batch * 1 / max_vol)
# Zero-padding to get an input of constant size
y_pad = np.zeros(PERIOD * TARGET_SR, dtype=np.float32)
y_pad[:len(y_batch)] = y_batch
# spectrograms
melspec = librosa.feature.melspectrogram(y=y_pad,
sr=TARGET_SR,
**melspectrogram_parameters)
pcen = librosa.pcen(melspec, sr=TARGET_SR, **pcen_parameters)
clean_mel = librosa.power_to_db(melspec ** 1.5)
melspec = librosa.power_to_db(melspec).astype(np.float32)
# Normalization
norm_melspec = normalize_melspec(melspec)
norm_pcen = normalize_melspec(pcen)
norm_clean_mel = normalize_melspec(clean_mel)
# Stack the three normalized spectrograms into one 3-channel "color" image
image = np.stack([norm_melspec, norm_pcen, norm_clean_mel], axis=-1)
height, width, _ = image.shape
image = cv2.resize(image, (int(width * 224 / height), 224))
image = np.moveaxis(image, 2, 0)
image = (image).astype(np.float32)
return image
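# --- Illustrative sketch (not part of the original file) ---
# Shapes through preproc() for a full 30 s chunk at 32 kHz: raw audio
# (960000,) -> mel spectrogram (128, frames) -> three stacked spectrograms
# (128, frames, 3) -> resized to height 224 -> channels first, i.e. a
# float32 array of shape (3, 224, width).
def _demo_preproc_shapes():
    silent = np.zeros(CHUNK, dtype=np.float32)  # 30 s of silence
    image = preproc(silent)
    return image.shape  # (3, 224, width); width depends on the hop length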
def clip_to_image(clip_path: str, all_chunks=True):
    """Return the clip as image chunks, almost ready for the model.
    If all_chunks=False, only the first chunk is returned."""
    # Load the audio file (clip_path has no extension; try .mp3 then .wav)
    if Path(clip_path + ".mp3").exists():
        clip, _ = librosa.load(clip_path + ".mp3",
                               sr=TARGET_SR,
                               mono=True,
                               res_type="kaiser_fast")
    elif Path(clip_path + ".wav").exists():
        clip, _ = librosa.load(clip_path + ".wav",
                               sr=TARGET_SR,
                               mono=True,
                               res_type="kaiser_fast")
    else:
        raise FileNotFoundError(
            f"{clip_path}.mp3 or .wav doesn't exist; only .wav and .mp3 are "
            "allowed. It may also be an audio listed in the .csv that is "
            "missing from the audio folder. Easy fix: delete the "
            "corresponding line in the csv.")
    y = clip.astype(np.float32)
    if not all_chunks:
        return np.asarray(preproc(y[:CHUNK]))
    # Split the clip into 30 s chunks (the last one is zero-padded by preproc)
    nb_chunk = (len(y) - 1) // CHUNK + 1
    images = [preproc(y[k * CHUNK:(k + 1) * CHUNK]) for k in range(nb_chunk)]
    return np.asarray(images)
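# --- Illustrative usage sketch (not part of the original file) ---
# "input/audio_files/audio1" is a hypothetical path (no extension:
# clip_to_image() tries .mp3 then .wav itself). A 95 s recording yields
# ceil(95 / 30) = 4 chunks, i.e. an array of shape (4, 3, 224, width).
# images = clip_to_image("input/audio_files/audio1")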
import torch.nn as nn
import os
import numpy as np
class ImprovedPANNsLoss(nn.Module):
"""criterion used for the training"""
def __init__(self, output_key="logit", weights=[1, 0.5]):
super().__init__()
self.output_key = output_key
if output_key == "logit":
self.normal_loss = nn.BCEWithLogitsLoss()
else:
self.normal_loss = nn.BCELoss()
self.bce = nn.BCELoss()
self.weights = weights
def forward(self, input, target):
input_ = input[self.output_key]
target = target.float()
framewise_output = input["framewise_output"]
clipwise_output_with_max, _ = framewise_output.max(dim=1)
normal_loss = self.normal_loss(input_, target)
auxiliary_loss = self.bce(clipwise_output_with_max, target)
return self.weights[0] * normal_loss + self.weights[1] * auxiliary_loss
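# --- Illustrative sketch (not part of the original file) ---
# Toy shapes (batch=2, classes=4, frames=8) showing what the criterion
# consumes: a dict holding the chosen output_key plus "framewise_output",
# and a multi-hot target.
def _demo_improved_panns_loss():
    import torch
    criterion = ImprovedPANNsLoss(output_key="logit")
    outputs = {
        "logit": torch.randn(2, 4),               # raw scores, for BCEWithLogitsLoss
        "framewise_output": torch.rand(2, 8, 4),  # probabilities in [0, 1]
    }
    target = torch.randint(0, 2, (2, 4))
    return criterion(outputs, target)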
def find_classes(dir: str):
"""return inv_bird_code and bird_code"""
classes = os.listdir(dir)
classes.sort()
class_to_idx = {bird: i for i, bird in enumerate(classes)}
return classes, class_to_idx
def normalize_melspec(X: np.ndarray):
"""Normalize a spectrogram in a strange way"""
eps = 1e-6
mean = X.mean()
X = X - mean
std = X.std()
Xstd = X / (std + eps)
norm_min, norm_max = Xstd.min(), Xstd.max()
if (norm_max - norm_min) > eps:
V = Xstd
V[V < norm_min] = norm_min
V[V > norm_max] = norm_max
V = 255 * (V - norm_min) / (norm_max - norm_min)
V = V.astype(np.uint8)
else:
# Just zero
V = np.zeros_like(Xstd, dtype=np.uint8)
return V
from __future__ import print_function, division
import torch
import torch.backends.cudnn as cudnn
import numpy as np
from torchvision import datasets
import time
import os
import copy
import keyboard
from src.utils import ImprovedPANNsLoss, find_classes
from src.models import get_optimizer, get_scheduler, AttBlock, models
# Utils
def transform_PIL_Array(image_PIL):
image = np.array(image_PIL)
image = np.swapaxes(image, 0, 2)
image = torch.from_numpy(image/255.0)
image = image.float()
return image
def transform_labels(labels, num_classes):
siz = labels.size()
new_labels = torch.zeros((siz[0], num_classes))
for i, label in enumerate(labels):
new_labels[i, label] = 1
return new_labels
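# e.g. transform_labels(torch.tensor([1, 0]), num_classes=3)
#      -> tensor([[0., 1., 0.],
#                 [1., 0., 0.]])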
__keep_running__ = True
def stop_running():
    """Pressing '$' lets the current epoch finish, then ends the training."""
    global __keep_running__
    __keep_running__ = False
keyboard.add_hotkey('$', stop_running)
def train_model(model, device, criterion, optimizer, scheduler, model_name, dataloaders, dataset_sizes, class_names, num_epochs=25):
since = time.time()
best_model_wts = copy.deepcopy(model.state_dict())
best_acc = -np.inf
for epoch in range(num_epochs):
if not __keep_running__:
break  # '$' was pressed: stop training after the epoch that just finished
print(f'Epoch {epoch}/{num_epochs - 1}')
print('-' * 10)
# Each epoch has a training and validation phase
for phase in ['train', 'val']:
if phase == 'train':
model.train() # Set model to training mode
else:
model.eval() # Set model to evaluate mode
running_loss = 0.0
running_corrects = 0
# Iterate over data.
for inputs, labels in dataloaders[phase]:
labels = transform_labels(labels, len(class_names))
inputs = inputs.to(device)
labels = labels.to(device)
# zero the parameter gradients
optimizer.zero_grad()
# forward
# track history if only in train
with torch.set_grad_enabled(phase == 'train'):
outputs = model(inputs)
if model_name == "eff_th04":
outputs["segmentwise_output"], _ = outputs["segmentwise_output"].max(
dim=1)
loss = criterion(outputs, labels)
else:
loss = criterion(outputs, labels)
# backward + optimize only if in training phase
if phase == 'train':
loss.backward()
optimizer.step()
# statistics
running_loss += loss.item() * inputs.size(0)
###
result = outputs['framewise_output']
result, _ = torch.max(result, dim=1)
# negative sum of squared errors, used as the accuracy metric
running_corrects -= torch.sum((result-labels)**2)
###
del inputs, outputs, loss, result
if phase == 'train':
scheduler.step()
epoch_loss = running_loss / dataset_sizes[phase]
epoch_acc = running_corrects.double() / dataset_sizes[phase]
print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')
# deep copy the model
if phase == 'val' and epoch_acc > best_acc:
best_acc = epoch_acc
best_model_wts = copy.deepcopy(model.state_dict())
del epoch_acc, epoch_loss, running_corrects, running_loss
print()
time_elapsed = time.time() - since
print(
f'Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
print(f'Best val Acc: {best_acc:.4f}')
# load best model weights
model.load_state_dict(best_model_wts)
return model
def train(models, device, key_model=None, num_epochs=25, lr=0.001, batch_size=2):
image_datasets = {x: datasets.ImageFolder('train/temp/' + x, transform=transform_PIL_Array)
for x in ['train', 'val']}
dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=batch_size,
shuffle=True, num_workers=0)
for x in ['train', 'val']}
dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}
class_names = image_datasets['train'].classes
if not key_model:
key_model = models.keys()
for model in key_model:
print(model)
model_conv = models[model]
for param in model_conv.parameters():
param.requires_grad = False
# Parameters of newly constructed modules have requires_grad=True by default
num_ftrs = model_conv.fc1.in_features
model_conv.att_block = AttBlock(
num_ftrs, len(class_names), activation="sigmoid")
model_conv = model_conv.to(device)
print()
# Observe that only parameters of final layer are being optimized as
# opposed to before.
if model != "eff_th04":
criterion = ImprovedPANNsLoss()
else:
criterion = ImprovedPANNsLoss('segmentwise_output')
optimizer_conv = get_optimizer(
model_conv, {"optimizer": {'name': 'Adam', 'params': {'lr': lr}}})
exp_lr_scheduler = get_scheduler(optimizer_conv, {'scheduler': {
'name': 'CosineAnnealingLR', 'params': {'T_max': 10}}})
model_conv = train_model(model_conv, device, criterion, optimizer_conv,
exp_lr_scheduler, model, dataloaders, dataset_sizes, class_names, num_epochs=num_epochs)
os.makedirs('weights_trained', exist_ok=True)
torch.save(model_conv.state_dict(), 'weights_trained/' + model + '.pth')
torch.cuda.empty_cache()
del model_conv
torch.cuda.empty_cache()
return None
if __name__ == '__main__':
import argparse
parser = argparse.ArgumentParser()
parser.add_argument(
'-m', '--model', help="""Choose whether to train model "1", "2", "3", "4" or "all" """)
args = parser.parse_args()
keys = args.model
if keys == 'all':
keys = ["ref2_th03", "ref2_th04",
"eff_th04", "ext"]
elif keys in ['1', '2', '3', '4']:
keys = [["ref2_th03", "ref2_th04",
"eff_th04", "ext"][int(keys)-1]]
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
cudnn.benchmark = False
# batch size 20 (15 for eff_th04) fits in 8 GB of VRAM; if you don't have enough RAM, train the models one by one
train(models, device, keys, num_epochs=20, batch_size=15)
classes, _ = find_classes('train/temp/train/')
np.save('inv_bird_code.npy', classes)
{'ref2_th03': [2.0378282148759443, 1.350958687918527, 1.1509922572544644, 1.0251973749517085, 0.9530742771022923, 0.9385142064356543, 0.8788359924987122, 0.8812886332417583, 0.8496827345628005, 0.866959456559066, 0.8618548885806577, 0.8511872343964629, 0.8530584482046275, 0.8455350268018116, 0.8048266735705701, 0.8087591653341776, 0.7847911289760046, 0.7558997856391656, 0.7466989035134788, 0.7164284003959908, 0.7004573109385732, 0.6832559606531164, 0.6746111859332075, 0.6557628505832547, 0.6601907080346412],
'ext': [2.203362265785972, 1.4098027826665522, 1.159971886938745, 1.0570843036358173, 1.0038691929408483, 0.9624923035338685, 0.9488933688991673, 0.9413243388081646, 0.927853762448489, 0.9318580208243905, 0.9206070952363067, 0.923733637883113, 0.9244260683164492, 0.9089159284319197, 0.8776338179032882, 0.8544231875912175, 0.8257928366189475, 0.8005734957181491, 0.7814002403846154, 0.7663307609138909, 0.745571765270862, 0.7303421523544815, 0.7229315789191278, 0.7038964910821601, 0.7025682218782195],
'eff_th04': [1.1523478581355169, 1.009532215831044, 0.9885831560407367, 0.9802307296585251, 0.9729183322780736, 0.9660697350135217, 0.9601244245256697, 0.956034398340917, 0.953995589371566, 0.9535502339457418, 0.953701648083362, 0.953343737256396, 0.9514200399210165, 0.9463589008037862, 0.9367290957943425, 0.921370118528932, 0.8963166959993133, 0.8660067044771635, 0.8309774713201837, 0.7993172446450035, 0.7720077850006439, 0.7488513988452954, 0.7336551540500516, 0.7187049781883157, 0.7086745880462312],
'ref2_th04': [1.4462518377618476, 1.0416387201665522, 0.8972107604309754, 0.8218668843363668, 0.7769343617198232, 0.7448355706183466, 0.7304498022729224, 0.7148164183228881, 0.71854735992767, 0.713101565182864, 0.7222622043483861, 0.712948977292239, 0.702530242584564, 0.7084923209724846, 0.6837784064995064, 0.6682654999114656, 0.6659571888682607, 0.6426979316459908, 0.6317920475215703, 0.6097299345247038, 0.6050359704992274, 0.5991017268254207, 0.5904199369661101, 0.5811248611617875, 0.5721830011724116]}
\ No newline at end of file