From 256e09a8b9d7a3654a9e2fc1303bc1d0fa9f3261 Mon Sep 17 00:00:00 2001 From: Si11ium Date: Sun, 14 Jun 2020 20:50:54 +0200 Subject: [PATCH] ensembles --- ensemble_methods/__init__.py | 0 ensemble_methods/ensemble_checkpoints.py | 117 +++++++++++++++++++ ensemble_methods/global_inference.py | 92 +++++++++++++++ ensemble_methods/model_ensemble.py | 141 +++++++++++++++++++++++ ensemble_methods/paramter_ensemble.py | 137 ++++++++++++++++++++++ main.py | 4 +- 6 files changed, 489 insertions(+), 2 deletions(-) create mode 100644 ensemble_methods/__init__.py create mode 100644 ensemble_methods/ensemble_checkpoints.py create mode 100644 ensemble_methods/global_inference.py create mode 100644 ensemble_methods/model_ensemble.py create mode 100644 ensemble_methods/paramter_ensemble.py diff --git a/ensemble_methods/__init__.py b/ensemble_methods/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ensemble_methods/ensemble_checkpoints.py b/ensemble_methods/ensemble_checkpoints.py new file mode 100644 index 0000000..0f2e2b6 --- /dev/null +++ b/ensemble_methods/ensemble_checkpoints.py @@ -0,0 +1,117 @@ +import csv +import pickle +from collections import defaultdict +from pathlib import Path + +from sklearn import metrics +from tqdm import tqdm + +import numpy as np + +from util.config import MConfig + + +def accumulate_predictions(config_filename, output_folders): + for output_folder in tqdm(output_folders, total=len(output_folders)): + # Gather Predictions and labels + inference_files = output_folder.glob('*.csv') + + config = MConfig() + config.read_file((output_folder.parent / config_filename).open('r')) + + result_dict = defaultdict(list) + for inf_idx, inference_file in enumerate(inference_files): + with inference_file.open('r') as f: + # Read Headers to skip the first line + _ = f.readline() + for row in f: + prediction, label = [float(x) for x in row.strip().split(',')] + result_dict[inference_file.name[:-4]].append(prediction) + if inf_idx == 0: + result_dict['labels'].append(label) + result_dict = dict(result_dict) + with (output_folder / Path(__file__).name[:-3]).open('wb') as f: + pickle.dump(result_dict, f, protocol=pickle.HIGHEST_PROTOCOL) + pass + + +def accumulate_uars(output_folders): + for model_type in output_folders.iterdir(): + for param_config in model_type.iterdir(): + per_seed_ensemble_files = param_config.rglob(Path(__file__).name[:-3]) + + for ensemble_file in per_seed_ensemble_files: + uar_dict = dict() + with ensemble_file.open('rb') as f: + loaded_ensemble_file = pickle.load(f) + labels = loaded_ensemble_file.pop('labels') + for decision_boundry in range(10, 91, 5): + decision_boundry = round(decision_boundry * 0.01, 2) + majority_votes = [] + mean_votes = [] + voters = len(loaded_ensemble_file.keys()) * 0.5 + for i in range(len(labels)): + majority_vote = [] + mean_vote = [] + for key in loaded_ensemble_file.keys(): + majority_vote.append(loaded_ensemble_file[key][i] > decision_boundry) + mean_vote.append(loaded_ensemble_file[key][i]) + mean_votes.append(int(sum(mean_vote) / len(loaded_ensemble_file.keys()) > decision_boundry)) + majority_votes.append(sum(majority_vote) > voters) + + for predictions, name in zip([mean_votes, majority_votes], ['mean', 'majority']): + + uar_score = metrics.recall_score(labels, predictions, labels=[0, 1], average='macro', + sample_weight=None, zero_division='warn') + uar_dict[f'{name}_decb_{decision_boundry}'] = uar_score + with (ensemble_file.parent / 'ensemble_uar_dict_decb').open('wb') as ef: + pickle.dump(uar_dict, ef, 
protocol=pickle.HIGHEST_PROTOCOL) + + +def gather_results(config_filename, outpath): + for model_type in outpath.iterdir(): + result_dict = defaultdict(list) + for param_config in model_type.iterdir(): + tmp_uar_dict = defaultdict(list) + config: MConfig + for idx, version_uar in enumerate(param_config.rglob('uar_dict_decb')): + if not idx: + config = MConfig() + config.read_file((version_uar.parent.parent / config_filename).open('r')) + for parameter, value in config.model_paramters.items(): + if parameter in ['exp_path', 'exp_fingerprint', 'loudness_ratio', 'mask_ratio', 'noise_ratio', + 'shift_ratio', 'speed_amount', 'speed_max', 'speed_min']: + result_dict[parameter].append(value) + + with version_uar.open('rb') as f: + loaded_uar_file = pickle.load(f) + + for key in loaded_uar_file.keys(): + tmp_uar_dict[key].append(loaded_uar_file[key]) + for key, value in tmp_uar_dict.items(): + result_dict[f'{key}_mean'].append(np.mean(value)) + result_dict[f'{key}_std'].append(np.std(value)) + with (model_type / 'checkpoint_ensemble_results.csv').open('w') as f: + headers = list(result_dict.keys()) + + writer = csv.DictWriter(f, delimiter=',', lineterminator='\n', fieldnames=headers) + writer.writeheader() # write a header + + for row_idx in range(len(result_dict['exp_path'])): + writer.writerow({key: result_dict[key][row_idx] for key in headers}) + + +if __name__ == '__main__': + outpath = Path().absolute().parent / 'output' + + config_filename = 'config.ini' + output_folders_path = list(outpath.rglob('outputs')) + + # Accumulate the Predictions + #accumulate_predictions(config_filename, output_folders_path) + + # Accumulate the UARS + accumulate_uars(outpath) + + # Gather Results to final CSV + #gather_results(config_filename, outpath) diff --git a/ensemble_methods/global_inference.py b/ensemble_methods/global_inference.py new file mode 100644 index 0000000..950b295 --- /dev/null +++ b/ensemble_methods/global_inference.py @@ -0,0 +1,92 @@ +from pathlib import Path +from pickle import UnpicklingError + +import torch +from tqdm import tqdm + +import variables as V +from torch.utils.data import DataLoader, Dataset +from torchvision.transforms import Compose, RandomApply + +from ml_lib.audio_toolset.audio_augmentation import Speed +from ml_lib.audio_toolset.audio_io import AudioToMel, NormalizeLocal, MelToImage + +# Dataset and Dataloaders +# ============================================================================= + +# Transforms +from ml_lib.audio_toolset.mel_augmentation import NoiseInjection, LoudnessManipulator, ShiftTime, MaskAug +from ml_lib.utils.logging import Logger +from ml_lib.utils.model_io import SavedLightningModels +from ml_lib.utils.transforms import ToTensor +from ml_lib.visualization.tools import Plotter +from util.config import MConfig + +# Datasets +from datasets.binar_masks import BinaryMasksDataset + + +def prepare_dataloader(config_obj): + mel_transforms = Compose([ + AudioToMel(sr=config_obj.data.sr, n_mels=config_obj.data.n_mels, n_fft=config_obj.data.n_fft, + hop_length=config_obj.data.hop_length), + MelToImage()]) + transforms = Compose([NormalizeLocal(), ToTensor()]) + """ + aug_transforms = Compose([ + NoiseInjection(0.4), + LoudnessManipulator(0.4), + ShiftTime(0.3), + MaskAug(0.2), + NormalizeLocal(), ToTensor() + ]) + """ + + dataset: Dataset = BinaryMasksDataset(config_obj.data.root, setting=V.DATA_OPTIONS.devel, + mel_transforms=mel_transforms, transforms=transforms + ) + # noinspection PyTypeChecker + return DataLoader(dataset, 
batch_size=config_obj.train.batch_size, + num_workers=config_obj.data.worker if False else 0, shuffle=False) + + +def restore_logger_and_model(log_dir, ckpt): + model = SavedLightningModels.load_checkpoint(models_root_path=log_dir, checkpoint=ckpt) + model = model.restore() + if torch.cuda.is_available(): + model.cuda() + else: + model.cpu() + return model + + +if __name__ == '__main__': + outpath = Path('output') + + config_filename = 'config.ini' + for checkpoint in outpath.rglob('*.ckpt'): + + inference_out = checkpoint.parent / 'outputs' / f'{checkpoint.name[:-5]}.csv' + if inference_out.exists(): + continue + inference_out.parent.mkdir(parents=True, exist_ok=True) + + config = MConfig() + config.read_file((checkpoint.parent / config_filename).open('r')) + + devel_dataloader = prepare_dataloader(config) + + try: + loaded_model = restore_logger_and_model(checkpoint.parent, ckpt=checkpoint) + loaded_model.eval() + except UnpicklingError: + continue + with inference_out.open(mode='w') as outfile: + outfile.write(f'file_name,prediction\n') + + for batch in tqdm(devel_dataloader, total=len(devel_dataloader)): + batch_x, batch_y = batch + y = loaded_model(batch_x.to(device='cuda' if torch.cuda.is_available() else 'cpu')).main_out + for prediction, label in zip(y, batch_y): + outfile.write(f'{prediction.item()},{label.item()}\n') + print('Done') diff --git a/ensemble_methods/model_ensemble.py b/ensemble_methods/model_ensemble.py new file mode 100644 index 0000000..64630a2 --- /dev/null +++ b/ensemble_methods/model_ensemble.py @@ -0,0 +1,141 @@ +import csv +import pickle +from collections import defaultdict +from pathlib import Path + +import numpy as np +from sklearn import metrics + + +decision_boundrys = [round(x * 0.01, 2) for x in range(10, 90, 5)] + + +def accumulate_uars(output_folders): + for model_type in output_folders.iterdir(): + for param_config in model_type.iterdir(): + per_seed_ensemble_files = param_config.rglob('ensemble_checkpoints') + for ensemble_file in per_seed_ensemble_files: + uar_dict = dict() + with ensemble_file.open('rb') as f: + loaded_ensemble_file = pickle.load(f) + labels = loaded_ensemble_file.pop('labels') + for decision_boundry in decision_boundrys: + decisions = [] + try: + for i in range(len(labels)): + decisions.append(loaded_ensemble_file['weights'][i] > decision_boundry) + + uar_score = metrics.recall_score(labels, decisions, labels=[0, 1], average='macro', + sample_weight=None, zero_division='warn') + uar_dict[f'weights_decb_{decision_boundry}'] = uar_score + except KeyError: + continue + + with (ensemble_file.parent / 'weights_uar_dict_decb').open('wb') as ef: + pickle.dump(uar_dict, ef, protocol=pickle.HIGHEST_PROTOCOL) + + +def accumulate_predictions_along_paramter_within_model(outpath): + version_dicts = defaultdict(lambda: defaultdict(dict)) + labels = [] + labels_loaded = False + for model_type in outpath.iterdir(): + if not model_type.is_dir(): + continue + for parameter_configuration in model_type.iterdir(): + if not parameter_configuration.is_dir(): + continue + for version in parameter_configuration.rglob('weights_uar_dict_decb'): + try: + with (version.parent / 'weights.csv').open('r') as f: + predictions = [] + # Read Headers to skip the first line + _ = f.readline() + for row in f: + prediction, label = [float(x) for x in row.strip().split(',')] + predictions.append(prediction) + if not labels_loaded: + labels.append(label) + if not labels_loaded: + labels_loaded = True + 
version_dicts[version.parent.parent.name][parameter_configuration.name.split('_')[1]][model_type.name] = dict( + path=version, + predictions=predictions) + except KeyError: + continue + except FileNotFoundError: + continue + + result_dict = defaultdict(list) + final_dict = dict() + uar_dict = dict() + + for decision_boundry in decision_boundrys: + for i in range(len(labels)): + for version_key, version_dict in version_dicts.items(): + for parameter_key, parameter_dict in version_dict.items(): + majority_votes = [] + mean_votes = [] + for model_type, model_dict in parameter_dict.items(): + majority_votes.append(model_dict['predictions'][i] > decision_boundry) + mean_votes.append(model_dict['predictions'][i]) + result_dict[f'{parameter_key}_{decision_boundry}_mean_vote_pred_{version_key}'].append( + int(sum(mean_votes) / len(mean_votes) > decision_boundry) + ) + result_dict[f'{parameter_key}_{decision_boundry}_majority_vote_pred_{version_key}'].append( + sum(majority_votes) > (len(majority_votes) // 2) + ) + parameter_configurations = list(set([x.split('_')[0] for x in result_dict.keys()])) + for vote in ['mean', 'majority']: + for version_key, version_dict in version_dicts.items(): + for parameter_key, parameter_dict in version_dict.items(): + # for model_key, model_dict in parameter_dict.items(): + predictions = result_dict[f'{parameter_key}_{decision_boundry}_{vote}_vote_pred_{version_key}'] + uar_score = metrics.recall_score(labels, predictions, labels=[0, 1], average='macro', + sample_weight=None, zero_division='warn') + uar_dict[f'{parameter_key}_{decision_boundry}_{vote}_vote_pred_{version_key}'] = uar_score + for parameter_key in parameter_configurations: + final_dict[f'{parameter_key}_{decision_boundry}_{vote}_vote_mean_uar'] = np.mean([ + uar_dict[f'{parameter_key}_{decision_boundry}_{vote}_vote_pred_{version_key}'] + for version_key in version_dicts.keys()] + ) + final_dict[f'{parameter_key}_{decision_boundry}_{vote}_vote_std_uar'] = np.std([ + uar_dict[f'{parameter_key}_{decision_boundry}_{vote}_vote_pred_{version_key}'] + for version_key in version_dicts.keys()] + ) + + with (outpath / 'model_ensemble_uar_dict').open('wb') as f: + pickle.dump(final_dict, f, pickle.HIGHEST_PROTOCOL) + with (outpath / 'model_ensemble_uar_dict.csv').open('w') as f: + headers = ['parameter_config', 'vote', 'decision_boundry', 'mean/std', 'value'] + + writer = csv.DictWriter(f, delimiter=',', lineterminator='\n', fieldnames=headers) + writer.writeheader() # write a header + + for result_key in final_dict.keys(): + splited_key = result_key.split('_') + writer.writerow({key: value for key, value in + zip(headers, + [splited_key[0], splited_key[2], splited_key[1], splited_key[4], + final_dict[result_key]]) + } + ) + + +if __name__ == '__main__': + outpath = Path().absolute().parent / 'output' + + config_filename = 'config.ini' + output_folders_path = list(outpath.rglob('outputs')) + + # Accumulate the Predictions + # accumulate_predictions(config_filename, output_folders_path) + + # Accumulate the UARS + # accumulate_uars(outpath) + + # Find the Best UARS per paramter and Model and combine predictions + accumulate_predictions_along_paramter_within_model(outpath) + + # Gather Results to final CSV + # gather_results(config_filename, outpath) diff --git a/ensemble_methods/paramter_ensemble.py b/ensemble_methods/paramter_ensemble.py new file mode 100644 index 0000000..1919f25 --- /dev/null +++ b/ensemble_methods/paramter_ensemble.py @@ -0,0 +1,137 @@ +import csv +import pickle +from collections import 
defaultdict +from pathlib import Path + +from sklearn import metrics + +import numpy as np + + +decision_boundrys = [round(x * 0.01, 2) for x in range(10, 90, 5)] + + +def accumulate_uars(output_folders): + for model_type in output_folders.iterdir(): + for param_config in model_type.iterdir(): + per_seed_ensemble_files = param_config.rglob('ensemble_checkpoints') + for ensemble_file in per_seed_ensemble_files: + uar_dict = dict() + with ensemble_file.open('rb') as f: + loaded_ensemble_file = pickle.load(f) + labels = loaded_ensemble_file.pop('labels') + for decision_boundry in decision_boundrys: + decisions = [] + try: + for i in range(len(labels)): + decisions.append(loaded_ensemble_file['weights'][i] > decision_boundry) + + uar_score = metrics.recall_score(labels, decisions, labels=[0, 1], average='macro', + sample_weight=None, zero_division='warn') + uar_dict[f'weights_decb_{decision_boundry}'] = uar_score + except KeyError: + continue + + with (ensemble_file.parent / 'weights_uar_dict_decb').open('wb') as ef: + pickle.dump(uar_dict, ef, protocol=pickle.HIGHEST_PROTOCOL) + + +def accumulate_predictions_along_paramter_within_model(outpath): + + for model_type in outpath.iterdir(): + version_dicts = defaultdict(dict) + labels = [] + labels_loaded = False + for parameter_configuration in model_type.iterdir(): + for version in parameter_configuration.rglob('weights_uar_dict_decb'): + try: + with (version.parent / 'weights.csv').open('r') as f: + predictions = [] + # Read Headers to skip the first line + _ = f.readline() + for row in f: + prediction, label = [float(x) for x in row.strip().split(',')] + predictions.append(prediction) + if not labels_loaded: + labels.append(label) + if not labels_loaded: + labels_loaded = True + version_dicts[version.parent.parent.name][parameter_configuration.name] = dict( + path=version, + predictions=predictions) + except KeyError: + continue + except FileNotFoundError: + continue + + result_dict = defaultdict(list) + final_dict = dict() + uar_dict = dict() + + for decision_boundry in decision_boundrys: + for i in range(len(labels)): + for version_key, version_dict in version_dicts.items(): + majority_votes = [] + mean_votes = [] + for parameter_key, parameter_dict in version_dict.items(): + majority_votes.append(parameter_dict['predictions'][i] > decision_boundry) + mean_votes.append(parameter_dict['predictions'][i]) + result_dict[f'{decision_boundry}_mean_vote_pred_{version_key}'].append( + int(sum(mean_votes) / len(mean_votes) > decision_boundry) + ) + result_dict[f'{decision_boundry}_majority_vote_pred_{version_key}'].append( + sum(majority_votes) > (len(majority_votes) // 2) + ) + for vote in ['mean', 'majority']: + for version_key, version_dict in version_dicts.items(): + + predictions = result_dict[f'{decision_boundry}_{vote}_vote_pred_{version_key}'] + uar_score = metrics.recall_score(labels, predictions, labels=[0, 1], average='macro', + sample_weight=None, zero_division='warn') + uar_dict[f'weights_decb_{decision_boundry}_{vote}_{version_key}_uar'] = uar_score + + final_dict[f'weights_decb_{decision_boundry}_{vote}_vote_mean_uar'] = np.mean([ + uar_dict[f'weights_decb_{decision_boundry}_{vote}_{version_key}_uar'] + for version_key in version_dicts.keys()] + ) + final_dict[f'weights_decb_{decision_boundry}_{vote}_vote_std_uar'] = np.std([ + uar_dict[f'weights_decb_{decision_boundry}_{vote}_{version_key}_uar'] + for version_key in version_dicts.keys()] + ) + + with (model_type / 'parameter_ensemble_uar_dict').open('wb') as f: + 
pickle.dump(final_dict, f, pickle.HIGHEST_PROTOCOL) + with (model_type / 'parameter_ensemble_uar_dict.csv').open('w') as f: + headers = ['model_type', 'vote', 'decision_boundry', 'mean/std', 'value'] + + writer = csv.DictWriter(f, delimiter=',', lineterminator='\n', fieldnames=headers) + writer.writeheader() # write a header + + for result_key in final_dict.keys(): + splited_key = result_key.split('_') + writer.writerow({key: value for key, value in + zip(headers, + [model_type.name, + splited_key[3], splited_key[2], splited_key[5], + final_dict[result_key]]) + } + ) + + +if __name__ == '__main__': + outpath = Path().absolute().parent / 'output' + + config_filename = 'config.ini' + output_folders_path = list(outpath.rglob('outputs')) + + # Accumulate the Predictions + # accumulate_predictions(config_filename, output_folders_path) + + # Accumulate the UARS + # accumulate_uars(outpath) + + # Find the Best UARS per paramter and Model and combine predictions + accumulate_predictions_along_paramter_within_model(outpath) + + # Gather Results to final CSV + # gather_results(config_filename, outpath) diff --git a/main.py b/main.py index 8ad872c..22245a9 100644 --- a/main.py +++ b/main.py @@ -118,11 +118,11 @@ def run_lightning_loop(config_obj): from tqdm import tqdm for batch in tqdm(test_dataloader, total=len(test_dataloader)): - batch_x, file_name = batch + batch_x, file_names = batch batch_x = batch_x.to(device='cuda' if model.on_gpu else 'cpu') y = model(batch_x).main_out predictions = (y >= 0.5).int() - for prediction in predictions: + for prediction, file_name in zip(predictions, file_names): prediction_text = 'clear' if prediction == V.CLEAR else 'mask' outfile.write(f'{file_name},{prediction_text}\n') return model
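
Note on the voting logic: ensemble_checkpoints.py, model_ensemble.py and paramter_ensemble.py each re-implement the same mean-vote and majority-vote UAR computation with an inner loop over samples. Below is a minimal vectorised sketch of a shared helper; the module name (ensemble_methods/voting.py) and the function signature are hypothetical and not part of this patch, and it assumes per-voter predictions are probabilities in [0, 1] and labels are in {0, 1}, as written out by global_inference.py.

# ensemble_methods/voting.py -- hypothetical shared helper, not added by this patch
from typing import Dict, Sequence

import numpy as np
from sklearn import metrics


def vote_uar(predictions: Dict[str, Sequence[float]], labels: Sequence[int],
             decision_boundary: float) -> Dict[str, float]:
    """UAR (macro-averaged recall) for mean-vote and majority-vote ensembles.

    `predictions` maps a voter name (checkpoint, model type or parameter
    configuration) to its per-sample probabilities; `decision_boundary` is the
    threshold swept from 0.10 upwards in the scripts above.
    """
    # Shape (voters, samples); all voters are assumed to cover the same samples in order.
    stacked = np.stack([np.asarray(p, dtype=float) for p in predictions.values()])
    # Mean vote: average the probabilities, then threshold once.
    mean_votes = (stacked.mean(axis=0) > decision_boundary).astype(int)
    # Majority vote: threshold each voter, then require more than half to agree.
    majority_votes = ((stacked > decision_boundary).sum(axis=0) > stacked.shape[0] / 2).astype(int)
    return {'mean': metrics.recall_score(labels, mean_votes, labels=[0, 1], average='macro'),
            'majority': metrics.recall_score(labels, majority_votes, labels=[0, 1], average='macro')}

With the dictionaries produced by accumulate_predictions (per-checkpoint prediction lists plus a popped 'labels' key), each decision-boundary iteration then reduces to a single call, e.g. vote_uar(loaded_ensemble_file, labels, 0.5).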