diff --git a/.idea/workspace.xml b/.idea/workspace.xml deleted file mode 100644 index 78a318f..0000000 --- a/.idea/workspace.xml +++ /dev/null @@ -1,295 +0,0 @@ [295 deleted lines of IDE workspace state omitted] \ No newline at end of file diff --git a/_parameters.ini b/_parameters.ini index 59b52ae..2ebd093 100644 --- a/_parameters.ini +++ b/_parameters.ini @@ -4,14 +4,13 @@ debug = False eval = True seed = 69 owner = si11ium -model_name = CNNBaseline +model_name = VisualTransformer data_name = PrimatesLibrosaDatamodule [data] num_worker = 10 data_root = data -reset = False -n_mels = 64 +n_mels = 128 sr = 16000 hop_length = 128 n_fft = 256 @@ -22,6 +21,21 @@ shift_ratio = 0.3 noise_ratio = 0.0 mask_ratio = 0.0 +[Tester] +weight_init = xavier_normal_ +activation = gelu +use_bias = True +use_norm = True +use_residual = True +dropout = 0.2 + +lat_dim = 32 +patch_size = 8 +attn_depth = 12 +heads = 4 +embedding_size = 128 + + [CNNBaseline] weight_init = xavier_normal_ activation = gelu @@ -40,6 +54,22 @@ use_norm = True use_residual = True dropout = 0.2 +lat_dim = 32 +mlp_dim = 32 +head_dim = 32 +patch_size = 8 +attn_depth = 12 +heads = 4 +embedding_size = 33 + +[VerticalVisualTransformer] +weight_init = xavier_normal_ +activation = gelu +use_bias = True +use_norm = True +use_residual = True +dropout = 0.2 + lat_dim = 32 patch_size = 8 attn_depth = 12 @@ -60,19 +90,34 @@ attn_depth = 12 heads = 6 embedding_size = 32 +[VisualPerformer] +weight_init = xavier_normal_ +activation = gelu +use_bias = True +use_norm = True +use_residual = True +dropout = 0.2 + +lat_dim = 32 +patch_size = 8 +attn_depth = 12 +heads = 4 +embedding_size = 30 + [train] outpath = output version = None sampler = EqualSampler -loss = focal_loss_rob +loss = ce_loss sto_weight_avg = False weight_decay = 0 opt_reset_interval = 0 -max_epochs = 200 +max_epochs = 150 batch_size = 30 lr = 0.001 +scheduler='LambdaLR' use_residual = True -lr_warm_restart_epochs = 0 +lr_scheduler_parameter = 0.97 num_sanity_val_steps = 2 check_val_every_n_epoch = 5 checkpoint_callback = True diff --git a/datasets/primates_librosa_datamodule.py b/datasets/primates_librosa_datamodule.py index ae2fba2..c35d7d5 100644 --- a/datasets/primates_librosa_datamodule.py +++ b/datasets/primates_librosa_datamodule.py @@ -4,7 +4,6 @@ from pathlib import Path from torch.utils.data import DataLoader, ConcatDataset, WeightedRandomSampler from torchvision.transforms import Compose, RandomApply -from tqdm import tqdm from ml_lib.audio_toolset.audio_io import NormalizeLocal from ml_lib.audio_toolset.audio_to_mel_dataset import LibrosaAudioToMelDataset @@ -69,13 +68,14 @@ class PrimatesLibrosaDatamodule(_BaseDataModule): # Validation Dataloader def val_dataloader(self): - return DataLoader(dataset=self.datasets[DATA_OPTION_devel], num_workers=self.num_worker, pin_memory=True, - sampler=self.samplers[DATA_OPTION_devel], batch_size=self.batch_size) + return DataLoader(dataset=self.datasets[DATA_OPTION_devel], shuffle=False, + batch_size=self.batch_size, pin_memory=False, +
num_workers=self.num_worker) # Test Dataloader def test_dataloader(self): return DataLoader(dataset=self.datasets[DATA_OPTION_test], shuffle=False, - batch_size=self.batch_size, pin_memory=True, + batch_size=self.batch_size, pin_memory=False, num_workers=self.num_worker) def _build_subdataset(self, row, build=False): @@ -134,7 +134,7 @@ class PrimatesLibrosaDatamodule(_BaseDataModule): datasets[data_option] = ConcatDataset(dataset) # Build Weighted Sampler for train and val - if data_option in [DATA_OPTION_train, DATA_OPTION_devel]: + if data_option in [DATA_OPTION_train]: if self.sampler == EqualSampler.__name__: class_idxs = [[idx for idx, (_, __, label) in enumerate(datasets[data_option]) if label == class_idx] for class_idx in range(len(self.class_names)) @@ -147,6 +147,7 @@ class PrimatesLibrosaDatamodule(_BaseDataModule): len_largest_class = max(class_counts.values()) weights[data_option] = [1 / class_counts[x] for x in range(len(class_counts))] + ############################################################################## weights[data_option] = [weights[data_option][datasets[data_option][i][-1]] for i in range(len(datasets[data_option]))] samplers[data_option] = WeightedRandomSampler(weights[data_option], diff --git a/main.py b/main.py index 2ab5bb6..560827a 100644 --- a/main.py +++ b/main.py @@ -2,12 +2,12 @@ from argparse import Namespace import warnings -from pytorch_lightning import Trainer +from pytorch_lightning import Trainer, Callback from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint +from ml_lib.utils.callbacks import BestScoresCallback from ml_lib.utils.config import parse_comandline_args_add_defaults from ml_lib.utils.loggers import Logger -from ml_lib.utils.tools import locate_and_import_class, auto_cast import variables as v @@ -15,22 +15,38 @@ warnings.filterwarnings('ignore', category=FutureWarning) warnings.filterwarnings('ignore', category=UserWarning) -def run_lightning_loop(h_params, data_class, model_class): +def run_lightning_loop(h_params, data_class, model_class, additional_callbacks=None): with Logger.from_argparse_args(h_params) as logger: # Callbacks # ============================================================================= # Checkpoint Saving ckpt_callback = ModelCheckpoint( - monitor='mean_loss', + monitor='PL_recall_score', dirpath=str(logger.log_dir), filename='ckpt_weights', + mode='max', verbose=False, save_top_k=3, + save_last=True ) # Learning Rate Logger lr_logger = LearningRateMonitor(logging_interval='epoch') + + + # Track best scores + score_callback = BestScoresCallback(['PL_recall_score']) + + callbacks = [ckpt_callback, lr_logger, score_callback] + + if additional_callbacks and isinstance(additional_callbacks, Callback): + callbacks.append(additional_callbacks) + elif additional_callbacks and isinstance(additional_callbacks, list): + callbacks.extend(additional_callbacks) + else: + pass + # START # ============================================================================= # Let Datamodule pull what it wants @@ -38,19 +54,27 @@ def run_lightning_loop(h_params, data_class, model_class): datamodule.setup() # Let Trainer pull what it wants and add callbacks - trainer = Trainer.from_argparse_args(h_params, logger=logger, callbacks=[ckpt_callback, lr_logger]) + trainer = Trainer.from_argparse_args(h_params, logger=logger, callbacks=callbacks) # Let Model pull what it wants model = model_class.from_argparse_args(h_params, in_shape=datamodule.shape, n_classes=v.N_CLASS_multi) model.init_weights() + # 
trainer.test(model=model, datamodule=datamodule) + trainer.fit(model, datamodule) + trainer.save_checkpoint(logger.save_dir / 'last_weights.ckpt') - # Log paramters - pytorch_total_params = sum(p.numel() for p in model.parameters()) - # logger.log_text('n_parameters', pytorch_total_params) - - trainer.save_checkpoint(logger.save_dir / 'weights.ckpt') + try: + trainer.test(model=model, datamodule=datamodule) + except: + print('Test did not Suceed!') + pass + try: + logger.log_metrics(score_callback.best_scores, step=trainer.global_step+1) + except: + print('debug max_score_logging') + return score_callback.best_scores['PL_recall_score'] if __name__ == '__main__': diff --git a/models/cnn_baseline.py b/models/cnn_baseline.py index bb33728..449d753 100644 --- a/models/cnn_baseline.py +++ b/models/cnn_baseline.py @@ -15,7 +15,9 @@ class CNNBaseline(CombinedModelMixins, ): def __init__(self, in_shape, n_classes, weight_init, activation, use_bias, use_norm, dropout, lat_dim, features, - filters, lr, weight_decay, sto_weight_avg, lr_warm_restart_epochs, opt_reset_interval, loss): + filters, + lr, weight_decay, sto_weight_avg, lr_warm_restart_epochs, opt_reset_interval, + loss, scheduler): # TODO: Move this to parent class, or make it much easieer to access.... a = dict(locals()) diff --git a/models/performer.py b/models/performer.py index e69de29..88f525b 100644 --- a/models/performer.py +++ b/models/performer.py @@ -0,0 +1,133 @@ +import inspect +from argparse import Namespace + +import warnings + +import torch +from performer_pytorch import Performer +from torch import nn + +from einops import rearrange, repeat + +from ml_lib.metrics.multi_class_classification import MultiClassScores +from ml_lib.modules.util import (LightningBaseModule, AutoPadToShape, F_x) +from util.module_mixins import CombinedModelMixins + +MIN_NUM_PATCHES = 16 + + +class VisualPerformer(CombinedModelMixins, + LightningBaseModule + ): + + def __init__(self, in_shape, n_classes, weight_init, activation, + embedding_size, heads, attn_depth, patch_size, use_residual, + use_bias, use_norm, dropout, lat_dim, loss, scheduler, + lr, weight_decay, sto_weight_avg, lr_warm_restart_epochs, opt_reset_interval): + + # TODO: Move this to parent class, or make it much easieer to access... But How... + a = dict(locals()) + params = {arg: a[arg] for arg in inspect.signature(self.__init__).parameters.keys() if arg != 'self'} + super(VisualPerformer, self).__init__(params) + + self.in_shape = in_shape + assert len(self.in_shape) == 3, 'There need to be three Dimensions' + channels, height, width = self.in_shape + + # Model Paramters + # ============================================================================= + # Additional parameters + self.embed_dim = self.params.embedding_size + + # Automatic Image Shaping + self.patch_size = self.params.patch_size + image_size = (max(height, width) // self.patch_size) * self.patch_size + self.image_size = image_size + self.patch_size if image_size < max(height, width) else image_size + + # This should be obsolete + assert self.image_size % self.patch_size == 0, 'image dimensions must be divisible by the patch size' + + num_patches = (self.image_size // self.patch_size) ** 2 + patch_dim = channels * self.patch_size ** 2 + assert num_patches >= MIN_NUM_PATCHES, f'your number of patches ({num_patches}) is way too small for ' + \ + f'attention. 
Try decreasing your patch size' + + # Correct the Embedding Dim + if not self.embed_dim % self.params.heads == 0: + self.embed_dim = (self.embed_dim // self.params.heads) * self.params.heads + message = ('Embedding Dimension was fixed to be devideable by the number' + + f' of attention heads, is now: {self.embed_dim}') + for func in print, warnings.warn: + func(message) + + # Utility Modules + self.autopad = AutoPadToShape((self.image_size, self.image_size)) + + # Modules with Parameters + self.performer = Performer( + dim=self.embed_dim, # dimension + depth=self.params.attn_depth, # layers + heads=self.params.heads, # heads + causal=True, # auto-regressive or not + nb_features=None, # 256, # number of random features, if not set, will default to + # (d * log(d)), where d is the dimension of each head + feature_redraw_interval=1000, # how frequently to redraw the projection matrix, + # the more frequent, the slower the training + generalized_attention=False, # defaults to softmax approximation, + # but can be set to True for generalized attention + kernel_fn=self.params.activation(), # the kernel function to be used, + # if generalized attention is turned on, defaults to Relu + reversible=True, # reversible layers, from Reformer paper + ff_chunks=10, # chunk feedforward layer, from Reformer paper + use_scalenorm=False, # use scale norm, from 'Transformers without Tears' paper + use_rezero=False, # use rezero, from 'Rezero is all you need' paper + ff_glu=True, # use GLU variant for feedforward + ff_dropout=self.params.dropout, # feedforward dropout + attn_dropout=self.params.dropout, # post-attn dropout + local_attn_heads=self.params.heads // 2, # 4 heads are local attention, 4 others are global performers + local_window_size=(patch_dim // self.params.heads) * 2 # window size of local attention + ) + + self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, self.embed_dim)) + self.patch_to_embedding = nn.Linear(patch_dim, self.embed_dim) if self.params.embedding_size \ + else F_x(self.embed_dim) + self.cls_token = nn.Parameter(torch.randn(1, 1, self.embed_dim)) + self.dropout = nn.Dropout(self.params.dropout) + + self.to_cls_token = nn.Identity() + + self.mlp_head = nn.Sequential( + nn.LayerNorm(self.embed_dim), + nn.Linear(self.embed_dim, self.params.lat_dim), + nn.GELU(), + nn.Dropout(self.params.dropout), + nn.Linear(self.params.lat_dim, n_classes), + nn.Softmax() + ) + + def forward(self, x): + """ + :param x: the sequence to the encoder (required). 
+ :return: + """ + tensor = self.autopad(x) + p = self.params.patch_size + tensor = rearrange(tensor, 'b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1=p, p2=p) + + tensor = self.patch_to_embedding(tensor) + b, n, _ = tensor.shape + + cls_tokens = repeat(self.cls_token, '() n d -> b n d', b=b) + + tensor = torch.cat((cls_tokens, tensor), dim=1) + tensor += self.pos_embedding[:, :(n + 1)] + tensor = self.dropout(tensor) + + tensor = self.performer(tensor) + + tensor = self.to_cls_token(tensor[:, 0]) + tensor = self.mlp_head(tensor) + return Namespace(main_out=tensor) + + def additional_scores(self, outputs): + return MultiClassScores(self)(outputs) diff --git a/models/testing.py b/models/testing.py new file mode 100644 index 0000000..ad5928a --- /dev/null +++ b/models/testing.py @@ -0,0 +1,120 @@ +import inspect +from argparse import Namespace + +import warnings + +import torch +from torch import nn + +from einops import rearrange, repeat + +from ml_lib.metrics.multi_class_classification import MultiClassScores +from ml_lib.modules.blocks import TransformerModule +from ml_lib.modules.util import (LightningBaseModule, AutoPadToShape, F_x) +from util.module_mixins import CombinedModelMixins + +MIN_NUM_PATCHES = 16 + + +class Tester(CombinedModelMixins, + LightningBaseModule + ): + + def __init__(self, in_shape, n_classes, weight_init, activation, + embedding_size, heads, attn_depth, patch_size, use_residual, + use_bias, use_norm, dropout, lat_dim, loss, scheduler, mlp_dim, + lr, weight_decay, sto_weight_avg, lr_scheduler_parameter, opt_reset_interval): + + # TODO: Move this to parent class, or make it much easieer to access... But How... + a = dict(locals()) + params = {arg: a[arg] for arg in inspect.signature(self.__init__).parameters.keys() if arg != 'self'} + super(Tester, self).__init__(params) + + self.in_shape = in_shape + assert len(self.in_shape) == 3, 'There need to be three Dimensions' + channels, height, width = self.in_shape + + # Model Paramters + # ============================================================================= + # Additional parameters + self.embed_dim = self.params.embedding_size + + # Automatic Image Shaping + self.patch_size = self.params.patch_size + image_size = (max(height, width) // self.patch_size) * self.patch_size + self.image_size = image_size + self.patch_size if image_size < max(height, width) else image_size + + # This should be obsolete + assert self.image_size % self.patch_size == 0, 'image dimensions must be divisible by the patch size' + + num_patches = (self.image_size // self.patch_size) ** 2 + patch_dim = channels * self.patch_size ** 2 + assert num_patches >= MIN_NUM_PATCHES, f'your number of patches ({num_patches}) is way too small for ' + \ + f'attention. 
Try decreasing your patch size' + + # Correct the Embedding Dim + if not self.embed_dim % self.params.heads == 0: + self.embed_dim = (self.embed_dim // self.params.heads) * self.params.heads + message = ('Embedding Dimension was fixed to be devideable by the number' + + f' of attention heads, is now: {self.embed_dim}') + for func in print, warnings.warn: + func(message) + + # Utility Modules + self.autopad = AutoPadToShape((self.image_size, self.image_size)) + + # Modules with Parameters + self.transformer = TransformerModule(in_shape=self.embed_dim, mlp_dim=self.params.mlp_dim, + heads=self.params.heads, depth=self.params.attn_depth, + dropout=self.params.dropout, use_norm=self.params.use_norm, + activation=self.params.activation, use_residual=self.params.use_residual + ) + + self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, self.embed_dim)) + self.patch_to_embedding = nn.Linear(patch_dim, self.embed_dim) if self.params.embedding_size \ + else F_x(self.embed_dim) + self.cls_token = nn.Parameter(torch.randn(1, 1, self.embed_dim)) + self.dropout = nn.Dropout(self.params.dropout) + + self.to_cls_token = nn.Identity() + + self.mlp_head = nn.Sequential( + nn.LayerNorm(self.embed_dim), + nn.Linear(self.embed_dim, self.params.lat_dim), + nn.GELU(), + nn.Dropout(self.params.dropout), + nn.Linear(self.params.lat_dim, n_classes), + nn.Softmax() + ) + + def forward(self, x, mask=None, return_attn_weights=False): + """ + :param x: the sequence to the encoder (required). + :param mask: the mask for the src sequence (optional). + :return: + """ + tensor = self.autopad(x) + p = self.params.patch_size + tensor = rearrange(tensor, 'b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1=p, p2=p) + + tensor = self.patch_to_embedding(tensor) + b, n, _ = tensor.shape + + cls_tokens = repeat(self.cls_token, '() n d -> b n d', b=b) + + tensor = torch.cat((cls_tokens, tensor), dim=1) + tensor += self.pos_embedding[:, :(n + 1)] + tensor = self.dropout(tensor) + + if return_attn_weights: + tensor, attn_weights = self.transformer(tensor, mask, return_attn_weights) + else: + attn_weights = None + tensor = self.transformer(tensor, mask) + + tensor = self.to_cls_token(tensor[:, 0]) + tensor = self.mlp_head(tensor) + return Namespace(main_out=tensor, attn_weights=attn_weights) + + def additional_scores(self, outputs): + return MultiClassScores(self)(outputs) diff --git a/models/transformer_model.py b/models/transformer_model.py index d2b1bee..3cb78e2 100644 --- a/models/transformer_model.py +++ b/models/transformer_model.py @@ -21,9 +21,9 @@ class VisualTransformer(CombinedModelMixins, ): def __init__(self, in_shape, n_classes, weight_init, activation, - embedding_size, heads, attn_depth, patch_size,use_residual, - use_bias, use_norm, dropout, lat_dim, loss, - lr, weight_decay, sto_weight_avg, lr_warm_restart_epochs, opt_reset_interval): + embedding_size, heads, attn_depth, patch_size, use_residual, + use_bias, use_norm, dropout, lat_dim, loss, scheduler, mlp_dim, head_dim, + lr, weight_decay, sto_weight_avg, lr_scheduler_parameter, opt_reset_interval): # TODO: Move this to parent class, or make it much easieer to access... But How... a = dict(locals()) @@ -53,26 +53,26 @@ class VisualTransformer(CombinedModelMixins, f'attention. 
Try decreasing your patch size' # Correct the Embedding Dim - if not self.embed_dim % self.params.heads == 0: - self.embed_dim = (self.embed_dim // self.params.heads) * self.params.heads - message = ('Embedding Dimension was fixed to be devideable by the number' + - f' of attention heads, is now: {self.embed_dim}') - for func in print, warnings.warn: - func(message) + #if not self.embed_dim % self.params.heads == 0: + # self.embed_dim = (self.embed_dim // self.params.heads) * self.params.heads + # message = ('Embedding Dimension was fixed to be devideable by the number' + + # f' of attention heads, is now: {self.embed_dim}') + # for func in print, warnings.warn: + # func(message) # Utility Modules self.autopad = AutoPadToShape((self.image_size, self.image_size)) # Modules with Parameters - self.transformer = TransformerModule(in_shape=self.embed_dim, mlp_dim=self.params.lat_dim, + self.transformer = TransformerModule(in_shape=self.embed_dim, mlp_dim=self.params.mlp_dim, + head_dim=self.params.head_dim, heads=self.params.heads, depth=self.params.attn_depth, dropout=self.params.dropout, use_norm=self.params.use_norm, activation=self.params.activation, use_residual=self.params.use_residual ) self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, self.embed_dim)) - self.patch_to_embedding = nn.Linear(patch_dim, self.embed_dim) if self.params.embedding_size \ - else F_x(self.embed_dim) + self.patch_to_embedding = nn.Linear(patch_dim, self.embed_dim) self.cls_token = nn.Parameter(torch.randn(1, 1, self.embed_dim)) self.dropout = nn.Dropout(self.params.dropout) @@ -117,4 +117,4 @@ class VisualTransformer(CombinedModelMixins, return Namespace(main_out=tensor, attn_weights=attn_weights) def additional_scores(self, outputs): - return MultiClassScores(self)(outputs) + return MultiClassScores(self)(outputs) diff --git a/models/transformer_model_horizontal.py b/models/transformer_model_horizontal.py index b62325c..3678e75 100644 --- a/models/transformer_model_horizontal.py +++ b/models/transformer_model_horizontal.py @@ -21,8 +21,8 @@ class HorizontalVisualTransformer(CombinedModelMixins, ): def __init__(self, in_shape, n_classes, weight_init, activation, - embedding_size, heads, attn_depth, patch_size,use_residual, - use_bias, use_norm, dropout, lat_dim, features, loss, + embedding_size, heads, attn_depth, patch_size, use_residual, + use_bias, use_norm, dropout, lat_dim, features, loss, scheduler, lr, weight_decay, sto_weight_avg, lr_warm_restart_epochs, opt_reset_interval): # TODO: Move this to parent class, or make it much easieer to access... But How... 
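For orientation: VisualPerformer, Tester, and VisualTransformer above all repeat the same patch arithmetic, rounding the spectrogram edge up to a multiple of patch_size, deriving the patch count and flattened patch dimension from it, and shrinking the embedding size to the nearest multiple of the head count. Below is a minimal sketch of that arithmetic only; the helper name and the 200-frame example width are illustrative assumptions, not repository code.

# Sketch of the automatic image shaping used by the square-patch vision models above.
# Assumes a (channels, height, width) mel spectrogram and square patches.
def patch_layout(height, width, channels, patch_size, embed_dim, heads, min_num_patches=16):
    image_size = (max(height, width) // patch_size) * patch_size
    if image_size < max(height, width):
        image_size += patch_size                   # pad the longer edge up to the next multiple of patch_size
    num_patches = (image_size // patch_size) ** 2  # patch count; the models prepend one extra cls token
    patch_dim = channels * patch_size ** 2         # flattened pixels per patch
    assert num_patches >= min_num_patches, 'too few patches, decrease the patch size'
    if embed_dim % heads != 0:                     # keep the embedding divisible by the head count
        embed_dim = (embed_dim // heads) * heads
    return image_size, num_patches, patch_dim, embed_dim

# Example: n_mels = 128 rows, a hypothetical 200-frame spectrogram, patch_size = 8, heads = 6:
print(patch_layout(height=128, width=200, channels=1, patch_size=8, embed_dim=32, heads=6))
# -> (200, 625, 64, 30)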
diff --git a/models/transformer_model_vertical.py b/models/transformer_model_vertical.py new file mode 100644 index 0000000..82606b7 --- /dev/null +++ b/models/transformer_model_vertical.py @@ -0,0 +1,116 @@ +import inspect +from argparse import Namespace + +import warnings + +import torch +from einops import repeat +from torch import nn + +from ml_lib.metrics.multi_class_classification import MultiClassScores +from ml_lib.modules.blocks import TransformerModule +from ml_lib.modules.util import (LightningBaseModule, AutoPadToShape, F_x, SlidingWindow) +from util.module_mixins import CombinedModelMixins + +MIN_NUM_PATCHES = 16 + + +class VerticalVisualTransformer(CombinedModelMixins, LightningBaseModule): + + def __init__(self, in_shape, n_classes, weight_init, activation, + embedding_size, heads, attn_depth, patch_size, use_residual, + use_bias, use_norm, dropout, lat_dim, features, loss, scheduler, + lr, weight_decay, sto_weight_avg, lr_warm_restart_epochs, opt_reset_interval): + + # TODO: Move this to parent class, or make it much easieer to access... But How... + a = dict(locals()) + params = {arg: a[arg] for arg in inspect.signature(self.__init__).parameters.keys() if arg != 'self'} + super(VerticalVisualTransformer, self).__init__(params) + + self.in_shape = in_shape + self.n_classes = n_classes + + assert len(self.in_shape) == 3, 'There need to be three Dimensions' + channels, height, width = self.in_shape + + # Model Paramters + # ============================================================================= + # Additional parameters + self.embed_dim = self.params.embedding_size + self.height = height + self.channels = channels + + self.new_width = ((width - self.params.patch_size)//1) + 1 + + num_patches = self.new_width - (self.params.patch_size // 2) + patch_dim = channels * self.params.patch_size * self.height + assert num_patches >= MIN_NUM_PATCHES, f'your number of patches ({num_patches}) is way too small for ' + \ + f'attention. 
Try decreasing your patch size' + + # Correct the Embedding Dim + if not self.embed_dim % self.params.heads == 0: + self.embed_dim = (self.embed_dim // self.params.heads) * self.params.heads + message = ('Embedding Dimension was fixed to be devideable by the number' + + f' of attention heads, is now: {self.embed_dim}') + for func in print, warnings.warn: + func(message) + + # Utility Modules + self.autopad = AutoPadToShape((self.height, self.new_width)) + self.dropout = nn.Dropout(self.params.dropout) + self.slider = SlidingWindow((channels, *self.autopad.target_shape), (self.height, self.params.patch_size), + keepdim=False) + + # Modules with Parameters + self.transformer = TransformerModule(in_shape=self.embed_dim, mlp_dim=self.params.lat_dim, + heads=self.params.heads, depth=self.params.attn_depth, + dropout=self.params.dropout, use_norm=self.params.use_norm, + activation=self.params.activation + ) + + self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, self.embed_dim)) + self.patch_to_embedding = nn.Linear(patch_dim, self.embed_dim) if self.params.embedding_size \ + else F_x(self.embed_dim) + self.cls_token = nn.Parameter(torch.randn(1, 1, self.embed_dim)) + self.to_cls_token = nn.Identity() + + self.mlp_head = nn.Sequential( + nn.LayerNorm(self.embed_dim), + nn.Linear(self.embed_dim, self.params.lat_dim), + nn.GELU(), + nn.Dropout(self.params.dropout), + nn.Linear(self.params.lat_dim, self.n_classes), + nn.Softmax() + ) + + def forward(self, x, mask=None, return_attn_weights=False): + """ + :param x: the sequence to the encoder (required). + :param mask: the mask for the src sequence (optional). + :param return_attn_weights: wether to return the attn weights (optional) + :return: + """ + tensor = self.autopad(x) + tensor = self.slider(tensor) + + tensor = self.patch_to_embedding(tensor) + b, n, _ = tensor.shape + + cls_tokens = repeat(self.cls_token, '() n d -> b n d', b=b) + + tensor = torch.cat((cls_tokens, tensor), dim=1) + tensor += self.pos_embedding[:, :(n + 1)] + tensor = self.dropout(tensor) + + if return_attn_weights: + tensor, attn_weights = self.transformer(tensor, mask, return_attn_weights) + else: + attn_weights = None + tensor = self.transformer(tensor, mask) + + tensor = self.to_cls_token(tensor[:, 0]) + tensor = self.mlp_head(tensor) + return Namespace(main_out=tensor, attn_weights=attn_weights) + + def additional_scores(self, outputs): + return MultiClassScores(self)(outputs) diff --git a/multi_run.py b/multi_run.py index 955a02a..ee6e30b 100644 --- a/multi_run.py +++ b/multi_run.py @@ -10,23 +10,30 @@ import itertools if __name__ == '__main__': # Set new values - hparams_dict = dict(model_name=['VisualTransformer'], - max_epochs=[150], + hparams_dict = dict(seed=range(10), + model_name=['VisualTransformer'], batch_size=[50], - random_apply_chance=[0.5], - loudness_ratio=[0], - shift_ratio=[0.3], - noise_ratio=[0.3], - mask_ratio=[0.3], - lr=[0.001], - dropout=[0.2], - lat_dim=[32, 64], - patch_size=[8, 12], - attn_depth=[12], - heads=[6], - embedding_size=[16, 32], + max_epochs=[250], + random_apply_chance=[0.3], # trial.suggest_float('random_apply_chance', 0.1, 0.5, step=0.1), + loudness_ratio=[0], # trial.suggest_float('loudness_ratio', 0.0, 0.5, step=0.1), + shift_ratio=[0.3], # trial.suggest_float('shift_ratio', 0.0, 0.5, step=0.1), + noise_ratio=[0.3], # trial.suggest_float('noise_ratio', 0.0, 0.5, step=0.1), + mask_ratio=[0.3], # trial.suggest_float('mask_ratio', 0.0, 0.5, step=0.1), + lr=[5e-3], # trial.suggest_uniform('lr', 1e-3, 3e-3), + 
dropout=[0.2], # trial.suggest_float('dropout', 0.0, 0.3, step=0.05), + lat_dim=[32], # 2 ** trial.suggest_int('lat_dim', 1, 5, step=1), + mlp_dim=[16], # 2 ** trial.suggest_int('mlp_dim', 1, 5, step=1), + head_dim=[6], # 2 ** trial.suggest_int('head_dim', 1, 5, step=1), + patch_size=[12], # trial.suggest_int('patch_size', 6, 12, step=3), + attn_depth=[10], # trial.suggest_int('attn_depth', 2, 14, step=4), + heads=[6], # trial.suggest_int('heads', 2, 16, step=2), + scheduler=['CosineAnnealingWarmRestarts'], # trial.suggest_categorical('scheduler', [None, 'LambdaLR']), + lr_scheduler_parameter=[25], # [0.98], + embedding_size=[30], # trial.suggest_int('embedding_size', 12, 64, step=12), loss=['ce_loss'], - sampler=['WeightedRandomSampler'] + sampler=['WeightedRandomSampler'], + # rial.suggest_categorical('sampler', [None, 'WeightedRandomSampler']), + weight_decay=[0], # trial.suggest_loguniform('weight_decay', 1e-20, 1e-1), ) keys, values = zip(*hparams_dict.items()) diff --git a/optuna_tune.py b/optuna_tune.py new file mode 100644 index 0000000..f8f02c0 --- /dev/null +++ b/optuna_tune.py @@ -0,0 +1,79 @@ +import pickle +from argparse import Namespace +from pathlib import Path + +import optuna as optuna +from optuna.integration import PyTorchLightningPruningCallback + +from main import run_lightning_loop +from ml_lib.utils.config import parse_comandline_args_add_defaults +import neptunecontrib.monitoring.optuna as opt_utils + + +def optimize(trial: optuna.Trial): + # Optuna configuration + folder = Path('study') + folder.mkdir(parents=False, exist_ok=True) + optuna_suggestions = dict( + model_name='VisualTransformer', + batch_size=trial.suggest_int('batch_size', 30, 100, step=32), + lr_scheduler_parameter=trial.suggest_float('lr_scheduler_parameter', 0.8, 1, step=0.01), + max_epochs=100, + random_apply_chance=0.1, # trial.suggest_float('random_apply_chance', 0.1, 0.5, step=0.1), + loudness_ratio=0.1, # trial.suggest_float('loudness_ratio', 0.0, 0.5, step=0.1), + shift_ratio=0.1, # trial.suggest_float('shift_ratio', 0.0, 0.5, step=0.1), + noise_ratio=0, # trial.suggest_float('noise_ratio', 0.0, 0.5, step=0.1), + mask_ratio=0.2, # trial.suggest_float('mask_ratio', 0.0, 0.5, step=0.1), + lr=trial.suggest_uniform('lr', 1e-3, 3e-3), + dropout=0.05, # trial.suggest_float('dropout', 0.0, 0.3, step=0.05), + lat_dim=32, # 2 ** trial.suggest_int('lat_dim', 1, 5, step=1), + mlp_dim=16, # 2 ** trial.suggest_int('mlp_dim', 1, 5, step=1), + head_dim=8, # 2 ** trial.suggest_int('head_dim', 1, 5, step=1), + patch_size=12, # trial.suggest_int('patch_size', 6, 12, step=3), + attn_depth=10, # trial.suggest_int('attn_depth', 2, 14, step=4), + heads=16, # trial.suggest_int('heads', 2, 16, step=2), + scheduler='LambdaLR', # trial.suggest_categorical('scheduler', [None, 'LambdaLR']), + embedding_size=48, # trial.suggest_int('embedding_size', 12, 64, step=12), + loss='ce_loss', + sampler='WeightedRandomSampler', # rial.suggest_categorical('sampler', [None, 'WeightedRandomSampler']), + weight_decay=trial.suggest_loguniform('weight_decay', 1e-20, 1e-1), + study_name=trial.study.study_name + ) + + pruning_callback = PyTorchLightningPruningCallback(trial, monitor="PL_recall_score") + + # Parse comandline args, read config and get model + cmd_args, found_data_class, found_model_class = parse_comandline_args_add_defaults('_parameters.ini') + + h_params = dict(**cmd_args) + h_params.update(optuna_suggestions) + h_params = Namespace(**h_params) + try: + best_score = run_lightning_loop(h_params, 
data_class=found_data_class, model_class=found_model_class, + additional_callbacks=pruning_callback) + except Exception as e: + print(e) + best_score = 0 + return best_score + + +if __name__ == '__main__': + study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=1337)) + # study.optimize(optimize, n_trials=50, callbacks=[opt_utils.NeptuneCallback(log_study=True, log_charts=True)]) + study.optimize(optimize, n_trials=50) + + print("Number of finished trials: {}".format(len(study.trials))) + + print("Best trial:") + trial = study.best_trial + + print(" Value: {}".format(trial.value)) + + print(" Params: ") + for key, value in trial.params.items(): + print(" {}: {}".format(key, value)) + + optuna_study_object = Path('study') / 'study.pkl' + optuna_study_object.parent.mkdir(exist_ok=True) + with optuna_study_object.open(mode='wb') as f: + pickle.dump(study, f) diff --git a/util/module_mixins.py b/util/module_mixins.py index dc7183a..4464d92 100644 --- a/util/module_mixins.py +++ b/util/module_mixins.py @@ -1,3 +1,6 @@ +from collections import defaultdict +from pathlib import Path + from abc import ABC import torch @@ -42,24 +45,67 @@ class ValMixin: model_out = self(batch_x) y = model_out.main_out + sorted_y = defaultdict(list) + sorted_batch_y = dict() + for idx, file_name in enumerate(batch_files): + sorted_y[file_name].append(y[idx]) + sorted_batch_y.update({file_name: batch_y[idx]}) + sorted_y = dict(sorted_y) + + for file_name in sorted_y: + sorted_y.update({file_name: torch.stack(sorted_y[file_name])}) + + y_max = torch.stack( + [torch.argmax(x.mean(dim=0)) if x.shape[0] > 1 else torch.argmax(x) for x in sorted_y.values()] + ).squeeze() + y_one_hot = torch.nn.functional.one_hot(y_max, num_classes=5).float() + self.metrics.update(y_one_hot, torch.stack(tuple(sorted_batch_y.values())).long()) + val_loss = self.ce_loss(y, batch_y.long()) - self.metrics.update(y, batch_y) # torch.argmax(y, -1), batch_y) - - return dict(val_loss=val_loss, batch_files=batch_files, + return dict(batch_files=batch_files, val_loss=val_loss, batch_idx=batch_idx, y=y, batch_y=batch_y) def validation_epoch_end(self, outputs, *_, **__): assert isinstance(self, LightningBaseModule) summary_dict = dict() - keys = list(outputs[0].keys()) + summary_dict.update({f'mean_{key}': torch.mean(torch.stack([output[key] + for output in outputs])) + for key in keys if 'loss' in key} + ) + + sorted_y = defaultdict(list) + sorted_batch_y = dict() + for output in outputs: + for idx, file_name in enumerate(output['batch_files']): + sorted_y[file_name].append(output['y'][idx]) + sorted_batch_y.update({file_name: output['batch_y'][idx]}) + sorted_y = dict(sorted_y) + sorted_batch_y = torch.stack(tuple(sorted_batch_y.values())).long() + + for file_name in sorted_y: + sorted_y.update({file_name: torch.stack(sorted_y[file_name])}) + + y_mean = torch.stack( + [torch.mean(x, dim=0, keepdim=True) if x.shape[0] > 1 else x for x in sorted_y.values()] + ).squeeze() + mean_vote_loss = self.ce_loss(y_mean, sorted_batch_y) + summary_dict.update(val_mean_vote_loss=mean_vote_loss) + + y_max = torch.stack( + [torch.argmax(x.mean(dim=0)) if x.shape[0] > 1 else torch.argmax(x) for x in sorted_y.values()] + ).squeeze() + y_one_hot = torch.nn.functional.one_hot(y_max, num_classes=5).float() + max_vote_loss = self.ce_loss(y_one_hot, sorted_batch_y) + summary_dict.update(val_max_vote_loss=max_vote_loss) + summary_dict.update({f'mean_{key}': torch.mean(torch.stack([output[key] for output in outputs])) for key in keys if 
'loss' in key} ) # Sklearn Scores - additional_scores = self.additional_scores(outputs) + additional_scores = self.additional_scores(dict(y=y_one_hot, batch_y=sorted_batch_y)) summary_dict.update(**additional_scores) pl_metrics, pl_images = self.metrics.compute_and_prepare() @@ -85,13 +131,40 @@ class TestMixin: def test_epoch_end(self, outputs, *_, **__): assert isinstance(self, LightningBaseModule) - - y_arg_max = torch.argmax(outputs[0]['y']) - - pd.DataFrame(data=dict(filenames=outputs[0]['batch_files'], predtiction=y_arg_max)) - # No logging, just inference. - # self.log_dict(summary_dict, on_epoch=True) + + sorted_y = defaultdict(list) + for output in outputs: + for idx, file_name in enumerate(output['batch_files']): + sorted_y[file_name].append(output['y'][idx].cpu()) + sorted_y = dict(sorted_y) + + for file_name in sorted_y: + sorted_y.update({file_name: torch.stack(sorted_y[file_name])}) + + y_max = torch.stack( + [torch.argmax(x.mean(dim=0)) if x.shape[0] > 1 else torch.argmax(x) for x in sorted_y.values()] + ).squeeze().cpu() + class_names = {val: key for val, key in enumerate(['background', 'chimpanze', 'geunon', 'mandrille', 'redcap'])} + + df = pd.DataFrame(data=dict(filenames=[Path(x).stem for x in sorted_y.keys()], + prediction=y_max.cpu().numpy(), + prediction_named=[class_names[x.item()] for x in y_max.cpu().numpy()])) + result_file = Path(self.logger.log_dir / 'predictions.csv') + if result_file.exists(): + try: + result_file.unlink() + except: + print('File allready existed') + pass + with result_file.open(mode='wb') as csv_file: + df.to_csv(index=False, path_or_buf=csv_file) + with result_file.open(mode='rb') as csv_file: + try: + self.logger.neptunelogger.log_artifact(csv_file) + except: + print('No possible to send to neptune') + pass class CombinedModelMixins(LossMixin, diff --git a/util/optimizer_mixin.py b/util/optimizer_mixin.py index 1dc8181..dfba45c 100644 --- a/util/optimizer_mixin.py +++ b/util/optimizer_mixin.py @@ -24,11 +24,16 @@ class OptimizerMixin: if self.params.sto_weight_avg: optimizer = SWA(optimizer, swa_start=10, swa_freq=5, swa_lr=0.05) optimizer_dict.update(optimizer=optimizer) - if self.params.lr_warm_restart_epochs: - scheduler = CosineAnnealingWarmRestarts(optimizer, self.params.lr_warm_restart_epochs) + + if self.params.scheduler == CosineAnnealingWarmRestarts.__name__: + scheduler = CosineAnnealingWarmRestarts(optimizer, self.params.lr_scheduler_parameter) + elif self.params.scheduler == LambdaLR.__name__: + lr_reduce_ratio = self.params.lr_scheduler_parameter + scheduler = LambdaLR(optimizer, lr_lambda=lambda epoch: lr_reduce_ratio ** epoch) else: - scheduler = LambdaLR(optimizer, lr_lambda=lambda epoch: 0.95 ** epoch) - optimizer_dict.update(lr_scheduler=scheduler) + scheduler = None + if scheduler: + optimizer_dict.update(lr_scheduler=scheduler) return optimizer_dict def on_train_end(self):
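For reference, the reworked util/optimizer_mixin.py above routes the single lr_scheduler_parameter value to whichever scheduler the config names: it is T_0 (epochs until the first warm restart) for CosineAnnealingWarmRestarts and a per-epoch decay ratio for LambdaLR. Below is a minimal self-contained sketch of that selection logic; the Linear placeholder model, Adam optimizer, and three-epoch loop are illustrative assumptions, not repository code.

from torch import nn
from torch.optim import Adam
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, LambdaLR

def build_scheduler(optimizer, scheduler_name, lr_scheduler_parameter):
    # Mirrors the branch added above: one config knob whose meaning depends on the chosen scheduler.
    if scheduler_name == CosineAnnealingWarmRestarts.__name__:
        return CosineAnnealingWarmRestarts(optimizer, T_0=int(lr_scheduler_parameter))  # T_0 must be a positive int
    if scheduler_name == LambdaLR.__name__:
        decay = lr_scheduler_parameter               # e.g. 0.97 from _parameters.ini
        return LambdaLR(optimizer, lr_lambda=lambda epoch: decay ** epoch)
    return None                                      # no scheduler configured

model = nn.Linear(4, 2)                              # placeholder model
optimizer = Adam(model.parameters(), lr=1e-3)
scheduler = build_scheduler(optimizer, 'LambdaLR', 0.97)
for epoch in range(3):                               # training-loop stub
    optimizer.step()
    if scheduler is not None:
        scheduler.step()                             # lr decays to lr * 0.97 ** epoch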