import csv
import pickle
from collections import defaultdict
from pathlib import Path

import numpy as np
from sklearn import metrics

# Decision boundaries from 0.10 to 0.85 in steps of 0.05.
decision_boundaries = [round(x * 0.01, 2) for x in range(10, 90, 5)]


def accumulate_uars(output_folders):
    """Compute the UAR of each seed's ensemble weights for every decision boundary."""
    for model_type in output_folders.iterdir():
        if not model_type.is_dir():
            continue
        for param_config in model_type.iterdir():
            if not param_config.is_dir():
                continue
            for ensemble_file in param_config.rglob('ensemble_checkpoints'):
                uar_dict = dict()
                with ensemble_file.open('rb') as f:
                    loaded_ensemble_file = pickle.load(f)
                labels = loaded_ensemble_file.pop('labels')
                for decision_boundary in decision_boundaries:
                    try:
                        # Binarise the ensemble weights at the current boundary.
                        decisions = [loaded_ensemble_file['weights'][i] > decision_boundary
                                     for i in range(len(labels))]
                        uar_score = metrics.recall_score(
                            labels, decisions, labels=[0, 1], average='macro',
                            sample_weight=None, zero_division='warn')
                        uar_dict[f'weights_decb_{decision_boundary}'] = uar_score
                    except KeyError:
                        # Checkpoint has no 'weights' entry; nothing to score.
                        continue
                with (ensemble_file.parent / 'weights_uar_dict_decb').open('wb') as ef:
                    pickle.dump(uar_dict, ef, protocol=pickle.HIGHEST_PROTOCOL)


def accumulate_predictions_along_parameter_within_model(outpath):
    """Ensemble predictions across model types per parameter configuration and
    report the mean/std UAR over versions for each decision boundary."""
    version_dicts = defaultdict(lambda: defaultdict(dict))
    labels = []
    labels_loaded = False
    for model_type in outpath.iterdir():
        if not model_type.is_dir():
            continue
        for parameter_configuration in model_type.iterdir():
            if not parameter_configuration.is_dir():
                continue
            for version in parameter_configuration.rglob('weights_uar_dict_decb'):
                try:
                    with (version.parent / 'weights.csv').open('r') as f:
                        predictions = []
                        # Read the header to skip the first line.
                        _ = f.readline()
                        for row in f:
                            prediction, label = [float(x) for x in row.strip().split(',')]
                            predictions.append(prediction)
                            if not labels_loaded:
                                # Labels are identical across files; load them once.
                                labels.append(int(label))
                        if not labels_loaded:
                            labels_loaded = True
                    version_dicts[version.parent.parent.name][
                        parameter_configuration.name.split('_')[1]][model_type.name] = dict(
                            path=version, predictions=predictions)
                except FileNotFoundError:
                    # No weights.csv next to this UAR dict; skip the version.
                    continue

    result_dict = defaultdict(list)
    final_dict = dict()
    uar_dict = dict()
    for decision_boundary in decision_boundaries:
        for i in range(len(labels)):
            for version_key, version_dict in version_dicts.items():
                for parameter_key, parameter_dict in version_dict.items():
                    majority_votes = []
                    mean_votes = []
                    for model_type, model_dict in parameter_dict.items():
                        majority_votes.append(model_dict['predictions'][i] > decision_boundary)
                        mean_votes.append(model_dict['predictions'][i])
                    # Mean vote: threshold the averaged prediction.
                    result_dict[f'{parameter_key}_{decision_boundary}_mean_vote_pred_{version_key}'].append(
                        int(sum(mean_votes) / len(mean_votes) > decision_boundary))
                    # Majority vote: positive iff more than half of the models say so.
                    result_dict[f'{parameter_key}_{decision_boundary}_majority_vote_pred_{version_key}'].append(
                        int(sum(majority_votes) > len(majority_votes) // 2))

    parameter_configurations = list({key.split('_')[0] for key in result_dict})
    for decision_boundary in decision_boundaries:
        for vote in ['mean', 'majority']:
            for version_key, version_dict in version_dicts.items():
                for parameter_key in version_dict:
                    predictions = result_dict[
                        f'{parameter_key}_{decision_boundary}_{vote}_vote_pred_{version_key}']
                    uar_score = metrics.recall_score(
                        labels, predictions, labels=[0, 1], average='macro',
                        sample_weight=None, zero_division='warn')
                    uar_dict[f'{parameter_key}_{decision_boundary}_{vote}_vote_pred_{version_key}'] = uar_score
            for parameter_key in parameter_configurations:
                # Aggregate the per-version UARs into a mean and std per configuration.
                version_uars = [
                    uar_dict[f'{parameter_key}_{decision_boundary}_{vote}_vote_pred_{version_key}']
                    for version_key in version_dicts]
                final_dict[f'{parameter_key}_{decision_boundary}_{vote}_vote_mean_uar'] = np.mean(version_uars)
                final_dict[f'{parameter_key}_{decision_boundary}_{vote}_vote_std_uar'] = np.std(version_uars)

    with (outpath / 'model_ensemble_uar_dict').open('wb') as f:
        pickle.dump(final_dict, f, pickle.HIGHEST_PROTOCOL)

    with (outpath / 'model_ensemble_uar_dict.csv').open('w') as f:
        headers = ['parameter_config', 'vote', 'decision_boundary', 'mean/std', 'value']
        writer = csv.DictWriter(f, delimiter=',', lineterminator='\n', fieldnames=headers)
        writer.writeheader()
        for result_key in final_dict:
            # Keys look like '<param>_<boundary>_<vote>_vote_<mean|std>_uar'.
            split_key = result_key.split('_')
            writer.writerow(dict(zip(headers, [split_key[0], split_key[2], split_key[1],
                                               split_key[4], final_dict[result_key]])))


if __name__ == '__main__':
    outpath = Path().absolute().parent / 'output'
    config_filename = 'config.ini'
    output_folders_path = list(outpath.rglob('outputs'))

    # Accumulate the predictions
    # accumulate_predictions(config_filename, output_folders_path)
    # Accumulate the UARs
    # accumulate_uars(outpath)
    # Find the best UARs per parameter and model and combine predictions
    accumulate_predictions_along_parameter_within_model(outpath)
    # Gather results to final CSV
    # gather_results(config_filename, outpath)
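
# A minimal sketch of the on-disk layout this script appears to assume,
# reconstructed from the globs and path lookups above; the concrete names
# (e.g. 'cnn', 'config_lr0.001', 'version_0') are hypothetical placeholders:
#
#   output/
#     <model_type>/                     e.g. cnn/
#       <prefix>_<parameter>/           name.split('_')[1] is the parameter key
#         <version>/                    version.parent.parent.name is the version key
#           <run>/
#             ensemble_checkpoints      pickle: {'labels': [...], 'weights': [...]}
#             weights.csv               header row, then "prediction,label" lines
#             weights_uar_dict_decb     written by accumulate_uars()
#
# Example weights.csv contents (hypothetical values):
#   prediction,label
#   0.83,1.0
#   0.12,0.0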