diff --git a/cfg.py b/cfg.py
index 1f63a05..991779f 100644
--- a/cfg.py
+++ b/cfg.py
@@ -2,11 +2,11 @@ from pathlib import Path
 import torch
 
 BATCH_SIZE = 96
-NUM_EPOCHS = 100
+NUM_EPOCHS = 75
 NUM_WORKERS = 4
 NUM_SEGMENTS = 80
 NUM_SEGMENT_HOPS = 15
-SEEDS = [42, 1337, 666, 2012, 1e42]
+SEEDS = [42, 1337, 666]
 ALL_DATASET_PATHS = list((Path(__file__).parent.absolute() / 'data' / 'mimii').glob('*/'))
 
 DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
diff --git a/clean.py b/clean.py
new file mode 100644
index 0000000..d563cda
--- /dev/null
+++ b/clean.py
@@ -0,0 +1,19 @@
+# scores on band level per file
+# merge scores using GMM/IsoForest
+# combine using mean
+# compute roc auc
+# (a runnable sketch of this merge pipeline is appended at the end of this patch)
+
+# 1. inputs: scorer function 80x80 -> 1x7
+# 2. GMM: 1x7 -> 1x1
+# 3. mean over all snippets
+# single score
+
+class Scorer:
+    def __init__(self):
+        pass
+
+class Evaluator:
+    def __init__(self):
+        pass
+
diff --git a/main.py b/main.py
index 783102a..6f0f171 100644
--- a/main.py
+++ b/main.py
@@ -9,7 +9,8 @@ if __name__ == '__main__':
     import torch.optim as optim
     from models.layers import Subspectrogram
 
-    def train(dataset_path, machine_id, band, norm, seed):
+
+    def train(dataset_path, machine_id, band, norm='batch', loss_fn='mse', seed=42):
         torch.manual_seed(seed)
         torch.cuda.manual_seed(seed)
         np.random.seed(seed)
@@ -32,12 +33,12 @@ if __name__ == '__main__':
             transform=tfms
         )
 
-        model = SubSpecCAE(norm=norm, band=band).to(DEVICE)
+        model = SubSpecCAE(norm=norm, loss_fn=loss_fn, band=band).to(DEVICE)
         model.init_weights()
 
         # print(model(torch.randn(128, 1, 20, 80).to(DEVICE)).shape)
 
-        optimizer = optim.Adam(model.parameters(), lr=0.001)
+        optimizer = optim.Adam(model.parameters(), lr=0.0005)
 
 
         for epoch in range(NUM_EPOCHS):
@@ -60,17 +61,18 @@ if __name__ == '__main__':
         print(f'AUC: {auc}, Machine: {machine_id}, Band: {band}, Norm: {norm}, Seed: {seed}')
         return auc
 
-
+    loss_fn = 'mse'
     results = []
-    for norm in ('instance', 'batch'):
+    for norm in ['batch']:
         for seed in SEEDS:
             for dataset_path in ALL_DATASET_PATHS:
-                for machine_id in [0, 2, 4, 6]:
-                    for band in range(7):
-                        auc = train(dataset_path, machine_id, band, norm, seed)
-                        results.append([dataset_path.name, machine_id, seed, band, norm, auc])
-                        with open(f'results_{norm}.pkl', 'wb') as f:
-                            pickle.dump(results, f)
+                if '-6_dB' in dataset_path.name:
+                    for machine_id in [4]:
+                        for band in range(7):
+                            auc = train(dataset_path, machine_id, band, norm, loss_fn, seed)
+                            results.append([dataset_path.name, machine_id, seed, band, norm, auc])
+                            with open(f'results2_hard_{norm}_{loss_fn}.pkl', 'wb') as f:
+                                pickle.dump(results, f)
diff --git a/main2.py b/main2.py
new file mode 100644
index 0000000..4c48cff
--- /dev/null
+++ b/main2.py
@@ -0,0 +1,79 @@
+if __name__ == '__main__':
+    import numpy as np
+    import random
+    from tqdm import tqdm
+    from cfg import *
+    from mimii import MIMII
+    from models.ae import GlobalCAE
+    import pickle
+    import torch.optim as optim
+    from models.layers import Subspectrogram
+
+
+    def train(dataset_path, machine_id, norm='batch', loss_fn='mse', seed=42):
+        torch.manual_seed(seed)
+        torch.cuda.manual_seed(seed)
+        np.random.seed(seed)
+        torch.cuda.manual_seed_all(seed)
+        torch.backends.cudnn.deterministic = True
+        random.seed(seed)
+
+        print(f'Training on {dataset_path.name}')
+        mimii = MIMII(dataset_path=dataset_path, machine_id=machine_id)
+        mimii.to(DEVICE)
+        #mimii.preprocess(n_fft=1024, hop_length=256, n_mels=80, center=False, power=2.0)  # 80 x 80
+
+        dl = mimii.train_dataloader(
+            segment_len=NUM_SEGMENTS,
+            hop_len=NUM_SEGMENT_HOPS,
+            batch_size=BATCH_SIZE,
+            num_workers=NUM_WORKERS,
+            shuffle=True
+        )
+
+        model = GlobalCAE(norm=norm).to(DEVICE)
+        model.init_weights()
+
+        # print(model(torch.randn(128, 1, 20, 80).to(DEVICE)).shape)
+
+        optimizer = optim.Adam(model.parameters(), lr=0.0005)
+
+
+        for epoch in range(NUM_EPOCHS):
+            print(f'EPOCH #{epoch+1}')
+            losses = []
+            for batch in tqdm(dl):
+                data, labels = batch
+                data = data.to(DEVICE)  # [batch, 80, 80]: batch x n_mels x frames
+
+                loss = model.train_loss(data)
+
+                optimizer.zero_grad()
+                loss.backward()
+                optimizer.step()
+
+                losses.append(loss.item())
+            print(f'Loss: {np.mean(losses)}')
+
+        auc = mimii.evaluate_model(model, NUM_SEGMENTS, NUM_SEGMENTS)
+        print(f'AUC: {auc}, Machine: {machine_id}, Norm: {norm}, Seed: {seed}')
+        return auc
+
+    loss_fn = 'mse'
+    results = []
+    for norm in ['batch']:
+        for seed in SEEDS:
+            for dataset_path in ALL_DATASET_PATHS:
+                if '-6_dB' in dataset_path.name:
+                    for machine_id in [4]:
+                        auc = train(dataset_path, machine_id, norm, loss_fn, seed)
+                        results.append([dataset_path.name, machine_id, seed, norm, auc])
+                        with open(f'TODO_hard_{norm}_{loss_fn}.pkl', 'wb') as f:
+                            pickle.dump(results, f)
+
+
+
+
+
+
diff --git a/main3.py b/main3.py
new file mode 100644
index 0000000..f3cc8b0
--- /dev/null
+++ b/main3.py
@@ -0,0 +1,80 @@
+if __name__ == '__main__':
+    import numpy as np
+    import random
+    from tqdm import tqdm
+    from cfg import *
+    from mimii import MIMII
+    from models.ae import FullSubSpecCAE, SubSpecCAE
+    import pickle
+    import torch.optim as optim
+    from models.layers import Subspectrogram
+
+
+    def train(dataset_path, machine_id, norm='batch', seed=42):
+        torch.manual_seed(seed)
+        torch.cuda.manual_seed(seed)
+        np.random.seed(seed)
+        torch.cuda.manual_seed_all(seed)
+        torch.backends.cudnn.deterministic = True
+        random.seed(seed)
+
+        print(f'Training on {dataset_path.name}')
+        mimii = MIMII(dataset_path=dataset_path, machine_id=machine_id)
+        mimii.to(DEVICE)
+        #mimii.preprocess(n_fft=1024, hop_length=256, n_mels=80, center=False, power=2.0)  # 80 x 80
+        tfms = Subspectrogram(SUB_SPEC_HEIGT, SUB_SPEC_HOP)
+
+        dl = mimii.train_dataloader(
+            segment_len=NUM_SEGMENTS,
+            hop_len=NUM_SEGMENT_HOPS,
+            batch_size=BATCH_SIZE,
+            num_workers=NUM_WORKERS,
+            shuffle=True,
+            transform=tfms
+        )
+
+        # FullSubSpecCAE initializes its sub-AE weights in its constructor
+        model = FullSubSpecCAE().to(DEVICE)
+
+        # print(model(torch.randn(128, 1, 20, 80).to(DEVICE)).shape)
+
+        optimizer = optim.Adam(model.parameters(), lr=0.0005)
+
+
+        for epoch in range(NUM_EPOCHS):
+            print(f'EPOCH #{epoch+1}')
+            losses = []
+            for batch in tqdm(dl):
+                data, labels = batch
+                data = data.to(DEVICE)  # [batch, 7, 20, 80]: batch x sub_specs x height x width
+
+                loss = model.train_loss(data)
+
+                optimizer.zero_grad()
+                loss.backward()
+                optimizer.step()
+
+                losses.append(loss.item())
+            print(f'Loss: {np.mean(losses)}')
+
+        auc = mimii.evaluate_model(model, NUM_SEGMENTS, NUM_SEGMENTS, transform=tfms)
+        print(f'AUC: {auc}, Machine: {machine_id}, Norm: {norm}, Seed: {seed}')
+        return auc
+
+    loss_fn = 'mse'
+    results = []
+    for norm in ['batch']:
+        for seed in SEEDS:
+            for dataset_path in ALL_DATASET_PATHS:
+                if '-6_dB' in dataset_path.name:
+                    for machine_id in [4]:
+                        auc = train(dataset_path, machine_id, norm, seed)
+                        results.append([dataset_path.name, machine_id, seed, -1, norm, auc])
+                        with open(f'full_hard_{norm}_{loss_fn}.pkl', 'wb') as f:
+                            pickle.dump(results, f)
+
+
+
+
+
+
diff --git a/mimii.py b/mimii.py
index 8bb4d5f..e9ed080 100644
--- a/mimii.py
+++ b/mimii.py
@@ -71,11 +71,7 @@ class MIMII(object):
             )
         return DataLoader(ConcatDataset(ds), **kwargs)
 
-    def test_dataloader(self, *args, **kwargs):
-        raise NotImplementedError('test_dataloader is not supported')
-
-    def evaluate_model(self, f, segment_len=20, hop_len=5, transform=None):
-        f.eval()
+    def test_datasets(self, segment_len=20, hop_len=5, transform=None):
         datasets = []
         for p, l in zip(self.test_paths, self.test_labels):
             datasets.append(
@@ -83,6 +79,11 @@ class MIMII(object):
                     segment_len=segment_len, hop=hop_len, transform=transform)
             )
+        return datasets
+
+    def evaluate_model(self, f, segment_len=20, hop_len=5, transform=None):
+        f.eval()
+        datasets = self.test_datasets(segment_len, hop_len, transform)
         y_true, y_score = [], []
         with torch.no_grad():
             for dataset in tqdm(datasets):
diff --git a/models/ae.py b/models/ae.py
index 4271500..78eea82 100644
--- a/models/ae.py
+++ b/models/ae.py
@@ -1,3 +1,4 @@
+import numpy as np
 import torch
 import torch.nn as nn
 import torch.functional as F
@@ -67,42 +68,176 @@ class AE(nn.Module):
 
 class SubSpecCAE(nn.Module):
-    def __init__(self, F=20, T=80, norm='batch', activation='relu', dropout_prob=0.25, band=0):
+    def __init__(self, F=20, T=80, norm='batch',
+                 activation='relu', dropout_prob=0.25):
         super(SubSpecCAE, self).__init__()
         self.T = T
         self.F = F
         self.activation = activation
-        self.band = band
+        self.loss_fn = nn.MSELoss()
         Norm = nn.BatchNorm2d if norm == 'batch' else nn.InstanceNorm2d
         Activation = nn.ReLU if activation == 'relu' else nn.LeakyReLU
+        a, b = 20, 40
         self.encoder = nn.Sequential(
-            nn.Conv2d(in_channels=1, out_channels=32, kernel_size=7, stride=1, padding=3),  # 32 x 20 x 80
-            Norm(32),
+            nn.Conv2d(in_channels=1, out_channels=a, kernel_size=7, stride=1, padding=3),  # a x 20 x 80
+            Norm(a),
             Activation(),
             nn.MaxPool2d((F//10, 5)),
             nn.Dropout(dropout_prob),
-            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=7, stride=1, padding=3),  # 64 x 10 x 16
-            Norm(64),
+            nn.Conv2d(in_channels=a, out_channels=b, kernel_size=7, stride=1, padding=3),  # b x 10 x 16
+            Norm(b),
             Activation(),
             nn.MaxPool2d(4, T),
             nn.Dropout(dropout_prob),
             Flatten(),
-            nn.Linear(64, 16)
+            nn.Linear(b, 16)
         )
+
         self.decoder = nn.Sequential(
-            nn.Linear(16, 64),
-            Reshape(64, 1, 1),
+            nn.Linear(16, b),
+            Reshape(b, 1, 1),
             nn.Upsample(size=(10, 16), mode='bilinear', align_corners=False),
-            nn.ConvTranspose2d(in_channels=64, out_channels=32, kernel_size=7, stride=1, padding=3),
-            Norm(32),
+            nn.ConvTranspose2d(in_channels=b, out_channels=a, kernel_size=7, stride=1, padding=3),
+            Norm(a),
             Activation(),
             nn.Upsample(size=(20, 80), mode='bilinear', align_corners=False),
             nn.Dropout(dropout_prob),
-            nn.ConvTranspose2d(in_channels=32, out_channels=1, kernel_size=7, stride=1, padding=3)
+            nn.ConvTranspose2d(in_channels=a, out_channels=1, kernel_size=7, stride=1, padding=3),
+            nn.Sigmoid()
+        )
+
+    def forward(self, sub_x):
+        #x = x[:, self.band, :, :].unsqueeze(1)  # select a single subspec
+        encoded = self.encoder(sub_x)
+        decoded = self.decoder(encoded)
+        return decoded, sub_x
+
+    def init_weights(self):
+        def weight_init(m):
+            if isinstance(m, nn.Conv2d) or isinstance(m, torch.nn.Linear):
+                torch.nn.init.kaiming_uniform_(m.weight)
+                if m.bias is not None:
+                    m.bias.data.fill_(0.2)
+        self.apply(weight_init)
+
+
+class FullSubSpecCAE(nn.Module):
+    def __init__(self, F=20, T=80,
+                 norm='batch', activation='relu',
+                 dropout_prob=0.25, weight_sharing=False,
+                 sub_bands=[0, 1, 2, 3, 4, 5, 6]):
+        super(FullSubSpecCAE, self).__init__()
+        self.bands = sub_bands
+        self.weight_sharing = weight_sharing
+        self.aes = nn.ModuleList([
+            SubSpecCAE(F, T, norm, activation, dropout_prob) for _ in range(
+                1 if weight_sharing else len(sub_bands)
+            )
+        ])
+        for ae in self.aes: ae.init_weights()
+        self.loss_fn = nn.MSELoss()
+
+    def select_sub_ae(self, band):
+        if self.weight_sharing:
+            return self.aes[0]
+        else:
+            return self.aes[band]
+
+    def select_sub_band(self, x, band):
+        return x[:, band, :, :].unsqueeze(1)
+
+    def forward(self, x):
+        y_hat, y = [], []
+        for band in self.bands:
+            sub_ae = self.select_sub_ae(band)
+            sub_x = self.select_sub_band(x, band)
+            decoded, target = sub_ae(sub_x)
+            y.append(target)
+            y_hat.append(decoded)
+        decoded = torch.cat(y_hat, dim=1)
+        y = torch.cat(y, dim=1)
+        return decoded, y
+
+    def train_loss(self, data):
+        y_hat, y = self.forward(data)  # torch.Size([96, 7, 20, 80])
+        loss = self.complete_mse(y_hat, y)
+        return loss
+
+    def test_loss(self, data):
+        y_hat, y = self.forward(data)
+        preds = torch.sum((y_hat - y) ** 2, dim=tuple(range(1, y_hat.dim())))
+        return preds
+
+    def sub_band_mse(self, y_hat, y):
+        losses = []
+        for band in self.bands:
+            sub_y = self.select_sub_band(y, band)
+            sub_y_hat = self.select_sub_band(y_hat, band)
+            sub_loss = torch.mean((sub_y_hat - sub_y) ** 2, dim=tuple(range(1, sub_y.dim())))  # torch.Size([96])
+            losses.append(sub_loss)
+        losses = torch.stack(losses, dim=1)  # torch.Size([96, 7])
+        return losses
+
+    def complete_mse(self, y_hat, y):
+        return self.sub_band_mse(y_hat, y).mean(dim=0).sum()
+
+    def gather_predictions(self, dataloader):
+        device = next(self.parameters()).device
+        predictions = []
+        labels = []
+        self.eval()
+        with torch.no_grad():
+            for batch in dataloader:
+                data, l = batch
+                data = data.to(device)
+                y_hat, y = self.forward(data)
+                mse = self.sub_band_mse(y_hat, y)  # 96 x 7
+                predictions.append(mse)
+                labels += l.tolist()
+        predictions = torch.cat(predictions).cpu().numpy()
+        self.train()
+        return predictions, labels
+
+
+class GlobalCAE(nn.Module):
+    def __init__(self, F=20, T=80, norm='batch', activation='relu', dropout_prob=0.25):
+        super(GlobalCAE, self).__init__()
+        self.activation = activation
+        Norm = nn.BatchNorm2d if norm == 'batch' else nn.InstanceNorm2d
+        Activation = nn.ReLU if activation == 'relu' else nn.LeakyReLU
+        self.encoder = nn.Sequential(
+            nn.Conv2d(in_channels=1, out_channels=20, kernel_size=8, stride=2),  # 20 x 37 x 37
+            Norm(20),
+            Activation(),
+            nn.Dropout(dropout_prob),
+            nn.Conv2d(in_channels=20, out_channels=40, kernel_size=5, stride=2),  # 40 x 17 x 17
+            Norm(40),
+            Activation(),
+            nn.Dropout(dropout_prob),
+            nn.Conv2d(in_channels=40, out_channels=60, kernel_size=3, stride=2),  # 60 x 8 x 8
+            Norm(60),
+            Flatten(),
+            nn.Linear(60*8*8, 64)
+        )
+        self.decoder = nn.Sequential(
+            nn.Linear(64, 60*8*8),
+            Reshape(60, 8, 8),
+            nn.ConvTranspose2d(in_channels=60, out_channels=40, kernel_size=3, stride=2),
+            Norm(40),
+            Activation(),
+            nn.Dropout(dropout_prob),
+            nn.ConvTranspose2d(in_channels=40, out_channels=20, kernel_size=5, stride=2),
+            Norm(20),
+            Activation(),
+            nn.Dropout(dropout_prob),
+            nn.ConvTranspose2d(in_channels=20, out_channels=1, kernel_size=8, stride=2),
+            nn.Sigmoid()
         )
 
     def forward(self, x):
-        x = x[:,self.band,:,].unsqueeze(1) # select a single supspec
+        x = x.unsqueeze(1)
         encoded = self.encoder(x)
         decoded = self.decoder(encoded)
         return decoded, x
@@ -123,5 +258,5 @@ class SubSpecCAE(nn.Module):
             if isinstance(m, nn.Conv2d) or isinstance(m, torch.nn.Linear):
                 torch.nn.init.kaiming_uniform_(m.weight)
                 if m.bias is not None:
-                    m.bias.data.fill_(0.02)
+                    m.bias.data.fill_(0.01)
         self.apply(weight_init)
\ No newline at end of file
diff --git a/models/utils.py b/models/utils.py
new file mode 100644
index 0000000..aef4fdf
--- /dev/null
+++ b/models/utils.py
@@ -0,0 +1,2 @@
+def count_parameters(model):
+    return sum(p.numel() for p in model.parameters() if p.requires_grad)
\ No newline at end of file
diff --git a/new_main.py b/new_main.py
new file mode 100644
index 0000000..85c6db6
--- /dev/null
+++ b/new_main.py
@@ -0,0 +1,101 @@
+if __name__ == '__main__':
+    import numpy as np
+    import random
+    from tqdm import tqdm
+    from cfg import *
+    from mimii import MIMII
+    from models.ae import FullSubSpecCAE, SubSpecCAE
+    import pickle
+    import torch.optim as optim
+    from models.layers import Subspectrogram
+    from sklearn.preprocessing import StandardScaler, MinMaxScaler
+    from sklearn.metrics import roc_auc_score
+    from torch.utils.data import DataLoader
+    from sklearn.ensemble import IsolationForest
+    from sklearn.mixture import GaussianMixture
+
+
+    def train(dataset_path, machine_id, norm='batch', seed=42):
+        torch.manual_seed(seed)
+        torch.cuda.manual_seed(seed)
+        np.random.seed(seed)
+        torch.cuda.manual_seed_all(seed)
+        torch.backends.cudnn.deterministic = True
+        random.seed(seed)
+
+        print(f'Training on {dataset_path.name}')
+        mimii = MIMII(dataset_path=dataset_path, machine_id=machine_id)
+        mimii.to(DEVICE)
+        #mimii.preprocess(n_fft=1024, hop_length=256, n_mels=80, center=False, power=2.0)  # 80 x 80
+        tfms = Subspectrogram(SUB_SPEC_HEIGT, SUB_SPEC_HOP)
+
+        dl = mimii.train_dataloader(
+            segment_len=NUM_SEGMENTS,
+            hop_len=NUM_SEGMENT_HOPS,
+            batch_size=BATCH_SIZE,
+            num_workers=NUM_WORKERS,
+            shuffle=True,
+            transform=tfms
+        )
+
+        model = FullSubSpecCAE(weight_sharing=False).to(DEVICE)
+
+        # print(model(torch.randn(128, 1, 20, 80).to(DEVICE)).shape)
+
+        optimizer = optim.Adam(model.parameters(), lr=0.0005)
+
+
+        for epoch in range(NUM_EPOCHS):
+            print(f'EPOCH #{epoch+1}')
+            losses = []
+            for batch in tqdm(dl):
+                data, labels = batch
+                data = data.to(DEVICE)  # [batch, 7, 20, 80]: batch x sub_specs x height x width
+
+                loss = model.train_loss(data)
+
+                optimizer.zero_grad()
+                loss.backward()
+                optimizer.step()
+
+                losses.append(loss.item())
+            print(f'Loss: {np.mean(losses)}')
+
+        preds, _ = model.gather_predictions(dl)
+        scaler = MinMaxScaler()
+        scaler.fit(preds)
+        forest = IsolationForest()
+        forest.fit(scaler.transform(preds))  # fit on the same scaled features used at test time
+
+        datasets = mimii.test_datasets(NUM_SEGMENTS, NUM_SEGMENTS, transform=tfms)
+        y_true, y_score = [], []
+        for dataset in tqdm(datasets):
+            loader = DataLoader(dataset, batch_size=len(dataset), shuffle=False, num_workers=2)
+            preds, labels = model.gather_predictions(loader)
+            preds = scaler.transform(preds)
+            y_true.append(labels[0])  # always the same
+            score = forest.decision_function(preds)  # 7x7 -> 1: one score for the whole dataset
+            y_score.append(score.mean())
+        auc = roc_auc_score(y_true, y_score)
+        print(f'AUC: {auc}, Dataset: {dataset_path.name}, Machine: {machine_id}, Norm: {norm}, Seed: {seed}')
+        return auc
+
+
+    loss_fn = 'mse'
+    results = []
+    for norm in ['batch']:
+        for seed in SEEDS:
+            for dataset_path in ALL_DATASET_PATHS:
+                if '-6_dB' in dataset_path.name:
+                    for machine_id in [4]:
+                        auc = train(dataset_path, machine_id, norm, seed)
+                        results.append([dataset_path.name, machine_id, seed, -1, norm, auc])
+                        with open(f'full234_hard_{norm}_{loss_fn}.pkl', 'wb') as f:
+                            pickle.dump(results, f)
+
+
+
+
+
+
diff --git a/plots/__init__.py b/plots/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/plots/normal_vs_abnormal.py b/plots/normal_vs_abnormal.py
new file mode 100644
index 0000000..0e537f2
--- /dev/null
+++ b/plots/normal_vs_abnormal.py
@@ -0,0 +1,60 @@
+import numpy as np
+import matplotlib as mpl
+import matplotlib.pyplot as plt
+import librosa
+import librosa.display
+# set LaTeX font
+# ==============
+nice_fonts = {
+    # Use LaTeX to write all text
+    "text.usetex": True,
+    "font.family": "serif",
+    # Use 10pt font in plots, to match 10pt font in document
+    "axes.labelsize": 16,   # default 10
+    "font.size": 16,        # default 10
+    # Make the legend/label fonts a little smaller
+    "legend.fontsize": 14,  # default 8
+    "xtick.labelsize": 14,  # default 8
+    "ytick.labelsize": 14,  # default 8
+}
+mpl.rcParams.update(nice_fonts)
+mpl.rcParams['axes.unicode_minus'] = False
+
+# TODO: LaTeX
+fan_normal = '../data/mimii/-6_dB_fan/id_04/normal/00000042.wav'
+fan_anomaly = '../data/mimii/-6_dB_fan/id_04/abnormal/00000048.wav'
+
+pump_normal = '../data/mimii/-6_dB_pump/id_04/normal/00000042.wav'
+pump_anomaly = '../data/mimii/-6_dB_pump/id_04/abnormal/00000042.wav'
+
+slider_normal = '../data/mimii/-6_dB_slider/id_04/normal/00000042.wav'
+slider_anomaly = '../data/mimii/-6_dB_slider/id_04/abnormal/00000042.wav'
+
+valve_normal = '../data/mimii/-6_dB_valve/id_04/normal/00000042.wav'
+valve_anomaly = '../data/mimii/-6_dB_valve/id_04/abnormal/00000042.wav'
+
+fig = plt.figure(figsize=(17, 5))
+for i, (p, title) in enumerate(zip([fan_normal, pump_normal, slider_normal, valve_normal,
+                                    fan_anomaly, pump_anomaly, slider_anomaly, valve_anomaly],
+                                   ['Fan', 'Pump', 'Slider', 'Valve', 'Fan', 'Pump', 'Slider', 'Valve'])):
+    plt.subplot(2, 4, i+1)
+    audio, sr = librosa.load(p, sr=None, mono=True)
+    # n_fft=1024, hop_length=256, n_mels=80, center=False, power=2.0
+    S = librosa.feature.melspectrogram(y=audio, sr=sr, n_fft=1024, hop_length=256, n_mels=80, center=False, power=2.0)
+    S_dB = librosa.power_to_db(S, ref=np.max)
+    librosa.display.specshow(S_dB, x_axis='s' if i == 4 else 'off', hop_length=256,
+                             y_axis='mel' if i == 4 else 'off', sr=16000, cmap='viridis')
+    if i < 4:
+        plt.title(title)
+    else:
+        plt.title(title + ' malfunction')
+
+cbar_ax = fig.add_axes([0.94, 0.18, 0.015, 0.7])
+cmap = mpl.cm.viridis
+norm = mpl.colors.Normalize(vmin=-80, vmax=0)
+cb1 = mpl.colorbar.ColorbarBase(cbar_ax, cmap=cmap,
+                                norm=norm,
+                                orientation='vertical',
+                                format='%+2.0f dB')
+plt.tight_layout()
+fig.subplots_adjust(right=0.93)
+plt.savefig('normal_vs_abnormal.png')
\ No newline at end of file
diff --git a/plots/playground.py b/plots/playground.py
new file mode 100644
index 0000000..f1c583e
--- /dev/null
+++ b/plots/playground.py
@@ -0,0 +1,64 @@
+import librosa
+from matplotlib import pyplot as plt
+import numpy as np
+from models.utils import count_parameters
+from models.ae import SubSpecCAE, AE, GlobalCAE
+from cfg import ALL_DATASET_PATHS
+import librosa.display
+import librosa.feature
+import seaborn as sns
+
+purple = (126/255, 87/255, 194/255)
+params_subspecae = count_parameters(SubSpecCAE())
+params_ae = count_parameters(AE())
+params_globalcae = count_parameters(GlobalCAE())
+print(f'#Parameters SubSpecCAE: {params_subspecae}')
+print(f'#Parameters AE: {params_ae}')
+print(f'#SubSpecAe/#AE: {(params_subspecae/params_ae):.2f}')
+print(f'#GlobalCAE: {params_globalcae}')
+print(f'#GlobalCAE/#SubSpecCAE: {params_globalcae/params_subspecae}')
+
+
+fig = plt.figure(figsize=(10, 5))
+slider_normal = '../data/mimii/-6_dB_slider/id_04/normal/00000042.wav'
+audio, sr = librosa.load(slider_normal)
+waveplot = librosa.display.waveplot(audio, sr=sr)
+plt.tight_layout()
+plt.savefig('wavplot.png')
+
+ids = range(0)
+specs = []
+i = 0
+audios = list((ALL_DATASET_PATHS[1] / 'id_00' / 'normal').glob('*.wav'))
+print(str(audios[80]))
+audio, sr = librosa.load(str(audios[80]), sr=None)
+mel_spec = librosa.feature.melspectrogram(audio, sr=sr, n_fft=1024, hop_length=256, n_mels=80, center=False, power=2.0)
+mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
+librosa.display.specshow(mel_spec_db, hop_length=256, x_axis='s', y_axis='mel', cmap='viridis', sr=sr)
+plt.savefig('notadjusted.png')
+plt.clf()
+
+centroids = []
+for p in audios:
+    audio, sr = librosa.load(str(p), sr=None)
+    spectral_centroids = librosa.feature.spectral_centroid(audio, sr=sr)[0]
+    centroids += spectral_centroids.tolist()
+
+sns.distplot(centroids, hist=True, kde=True,
+             color=purple,
+             hist_kws={'edgecolor': 'black'},
+             kde_kws={'linewidth': 2})
+plt.xlabel('Spectral centroid (Hz)')
+plt.ylabel('Density')
+plt.title('Spectral centroid distribution')
+plt.tight_layout()
+plt.savefig('hist.png')
+
+def get_bands(centroids, n_mels=80):
+    std = np.std(centroids)
+    mu = np.mean(centroids)
+    return int((mu-3*std)/n_mels), int((mu+3*std)/n_mels)
+
+print(get_bands(centroids))
+
+
diff --git a/transfer_learning/__init__.py b/transfer_learning/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/transfer_learning/extractors.py b/transfer_learning/extractors.py
new file mode 100644
index 0000000..d7af894
--- /dev/null
+++ b/transfer_learning/extractors.py
@@ -0,0 +1,149 @@
+import torch
+import torchvision
+import torchvision.transforms as transforms
+from torch.utils.data import Dataset
+import torch.nn as nn
+from PIL import Image
+from tqdm import tqdm
+import pandas as pd
+
+
+class FeatureExtractor:
+    supported_extractors = ['resnet18', 'resnet34', 'resnet50',
+                            'alexnet_fc6', 'alexnet_fc7', 'vgg16',
+                            'densenet121', 'inception_v3', 'squeezenet']
+
+    def __init__(self, version='resnet18', device='cpu'):
+        assert version.lower() in self.supported_extractors
+        self.device = device
+        self.version = version
+        self.F = self.__choose_feature_extractor(version)
+        for param in self.F.parameters():
+            param.requires_grad = False
+        self.F.eval()
+        self.input_size = (299, 299) if version.lower() == 'inception_v3' else (224, 224)
+        self.transforms = transforms.Compose([
+            transforms.Resize(self.input_size),
+            transforms.ToTensor(),
+            transforms.Normalize(mean=[0.485, 0.456, 0.406],
+                                 std=[0.229, 0.224, 0.225])
+        ])
+
+    def to(self, device):
+        self.device = device
+        self.F = self.F.to(self.device)
+        return self
+
+    def __choose_feature_extractor(self, version):
+        if 'resnet' in version.lower():
+            v = int(version[-2:])
+            if v == 18:
+                resnet = torchvision.models.resnet18(pretrained=True)
+            elif v == 34:
+                resnet = torchvision.models.resnet34(pretrained=True)
+            elif v == 50:
+                resnet = torchvision.models.resnet50(pretrained=True)
+            return nn.Sequential(*list(resnet.children())[:-1])
+        elif 'alexnet' in version.lower():
+            v = int(version[-1])
+            alexnet = torchvision.models.alexnet(pretrained=True)
+            if v == 7:
+                f = nn.Sequential(*list(alexnet.classifier.children())[:-2])
+            elif v == 6:
+                f = nn.Sequential(*list(alexnet.classifier.children())[:-5])
+            alexnet.classifier = f
+            return alexnet
+        elif version.lower() == 'vgg16':
+            vgg = torchvision.models.vgg16_bn(pretrained=True)
+            classifier = list(vgg.classifier.children())[:4]
+            vgg.classifier = nn.Sequential(*classifier)
+            return vgg
+        elif version.lower() == 'densenet121':
+            densenet = torchvision.models.densenet121(pretrained=True)
+            avg_pool = nn.AvgPool2d(kernel_size=7)
+            densenet = nn.Sequential(*list(densenet.children())[:-1])
+            densenet.add_module('avg_pool', avg_pool)
+            return densenet
+        elif version.lower() == 'inception_v3':
+            inception = torchvision.models.inception_v3(pretrained=True)
+            f = nn.Sequential(*list(inception.children())[:-1])
+            f._modules.pop('13')
+            f.add_module('global average', nn.AvgPool2d(26))
+            return f
+        elif version.lower() == 'squeezenet':
+            squeezenet = torchvision.models.squeezenet1_1(pretrained=True)
+            f = torch.nn.Sequential(
+                squeezenet.features,
+                torch.nn.AdaptiveAvgPool2d(output_size=(2, 2))
+            )
+            return f
+        else:
+            raise NotImplementedError('The feature extractor you requested is not yet supported')
+
+    @property
+    def feature_size(self):
+        x = torch.randn(size=(1, 3, *self.input_size)).to(self.device)
+        return self.F(x).squeeze().shape[0]
+
+    def __call__(self, batch):
+        batch = self.transforms(batch)
+        if len(batch.shape) <= 3:
+            batch = batch.unsqueeze(0)
+        return self.F(batch).view(batch.shape[0], -1).squeeze()
+
+    def from_image_folder(self, folder_path, extension='jpg'):
+        sorted_files = sorted(list(folder_path.glob(f'*.{extension}')))
+        split_names = [x.stem.split('_') for x in sorted_files]
+        names = [x[0] for x in split_names]
+        seq_ids = [x[1] for x in split_names]
+        X = []
+        for i, p_img in enumerate(tqdm(sorted_files)):
+            x = Image.open(p_img)
+            features = self(x)
+            X.append([names[i], seq_ids[i]] + features.tolist())
+        return pd.DataFrame(X, columns=['name', 'seq_id', *(f'feature_{i}' for i in range(self.feature_size))])
+
+
+class AudioTransferLearningImageDataset(Dataset):
+    def __init__(self, root_or_files, extension='jpg', input_size=224):
+        self.root_or_files = root_or_files
+        if isinstance(root_or_files, list):
+            self.files = root_or_files
+        else:
+            self.files = list(self.root_or_files.glob(f'*.{extension}'))
+        self.transforms = transforms.Compose([
+            transforms.Resize(input_size),
+            transforms.ToTensor(),
+            transforms.Normalize(mean=[0.485, 0.456, 0.406],
+                                 std=[0.229, 0.224, 0.225])
+        ])
+
+    def process_name(self, name):
+        split_name = name.stem.split('_')
+        return split_name[0], split_name[1]  # name, seq_id
+
+    def __getitem__(self, item):
+        p_img = self.files[item]
+        x = Image.open(p_img)
+        x = self.transforms(x)
+        name, seq_id = self.process_name(p_img)
+        return x, name, seq_id
+
+    def __len__(self):
+        return len(self.files)
+
+if __name__ == '__main__':
+    from pathlib import Path
+    version = 'resnet18'
+    extractor = FeatureExtractor(version=version)
+    models = ['slider', 'pump', 'fan']
+    model_ids = [0, 2, 4, 6]
+    for model in models:
+        for model_id in model_ids:
+            df = extractor.from_image_folder(Path(f'/home/robert/coding/audio_anomaly_detection/data/mimii/-6_dB_{model}/id_0{model_id}/normal/melspec_images/'))
+            df.to_csv(Path(f'/home/robert/coding/audio_anomaly_detection/data/mimii/-6_dB_{model}/id_0{model_id}/normal/{version}_features.csv'), index=False)
+            del df
+            df = extractor.from_image_folder(Path(f'/home/robert/coding/audio_anomaly_detection/data/mimii/-6_dB_{model}/id_0{model_id}/abnormal/melspec_images/'))
+            df.to_csv(Path(f'/home/robert/coding/audio_anomaly_detection/data/mimii/-6_dB_{model}/id_0{model_id}/abnormal/{version}_features.csv'), index=False)
diff --git a/transfer_learning/main.py b/transfer_learning/main.py
new file mode 100644
index 0000000..6b2890d
--- /dev/null
+++ b/transfer_learning/main.py
@@ -0,0 +1,134 @@
+from cfg import ALL_DATASET_PATHS
+from pathlib import Path
+import pandas as pd
+import numpy as np
+from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
+from sklearn.ensemble import IsolationForest
+from sklearn.svm import OneClassSVM
+from sklearn.neighbors import KernelDensity
+from sklearn.metrics import roc_auc_score
+from sklearn.preprocessing import StandardScaler, MinMaxScaler
+from sklearn.decomposition import PCA
+from sklearn.pipeline import Pipeline
+from transfer_learning.extractors import AudioTransferLearningImageDataset, FeatureExtractor
+from torch.utils.data import DataLoader
+import torch.nn as nn
+from torch.optim import Adam
+from tqdm import tqdm
+from transfer_learning.my_model import MyModel
+import random
+np.random.seed(1337)
+random.seed(1337)
+
+# Switch
+OURS = False
+# Parameters
+leave_n_out = 150
+model = 'fan'
+model_id = 0
+
+# get all wav files
+wavs_normal = Path(f'/home/robert/coding/audio_anomaly_detection/data/mimii/-6_dB_{model}/id_0{model_id}/normal').glob('*.wav')
+wavs_abnormal = Path(f'/home/robert/coding/audio_anomaly_detection/data/mimii/-6_dB_{model}/id_0{model_id}/abnormal').glob('*.wav')
+
+# as list + shuffle
+normal_files = list(wavs_normal)
+abnormal_files = list(wavs_abnormal)
+random.shuffle(normal_files)
+random.shuffle(abnormal_files)
+
+normal_train_files = normal_files[leave_n_out:]
+normal_test_files = normal_files[:leave_n_out]
+
+abnormal_test_files = abnormal_files[:leave_n_out]
+
+print(len(normal_train_files), len(normal_test_files), len(normal_files))
+print(len(abnormal_test_files))
+
+if not OURS:
+    normal_df = pd.read_csv(Path(f'/home/robert/coding/audio_anomaly_detection/data/mimii/-6_dB_{model}/id_0{model_id}/normal/resnet18_features.csv'))
+    abnormal_df = pd.read_csv(Path(f'/home/robert/coding/audio_anomaly_detection/data/mimii/-6_dB_{model}/id_0{model_id}/abnormal/resnet18_features.csv'))
+
+    print(normal_df.shape, abnormal_df.shape)
+
+    normal_trainset = normal_df[normal_df.name.isin([int(x.stem.split('_')[0]) for x in normal_train_files])]
+    normal_testset = normal_df[normal_df.name.isin([int(x.stem.split('_')[0]) for x in normal_test_files])]
+    abnormal_testset = abnormal_df[abnormal_df.name.isin([int(x.stem.split('_')[0]) for x in abnormal_test_files])]
+
+    print(f'normal train: {normal_trainset.shape}')
+    print(f'normal test: {normal_testset.shape}')
+    print(f'abnormal test: {abnormal_testset.shape}')
+
+    only_features = lambda x: x[:, 2:]
+
+    print(f'#Normal files: {len(normal_files)}\t#Abnormal files: {len(abnormal_files)}')
+    print(f'#Normal test snippets normal: {len(normal_testset)}, #Abnormal test snippets: {len(abnormal_testset)}')
+
+    #model = BayesianGaussianMixture(n_components=64, max_iter=150, covariance_type='diag')
+    #model = GaussianMixture(n_components=64, covariance_type='diag')
+    #model = OneClassSVM(nu=1e-2)
+    #model = IsolationForest(n_estimators=128, contamination='auto')
+    model = KernelDensity(kernel='gaussian', bandwidth=0.1)
+    scaler = Pipeline(
+        [('Scaler', StandardScaler())]
+    )
+    X = only_features(normal_trainset.values)
+
+    scaler.fit(X)
+    model.fit(scaler.transform(X))
+
+    scores_normal_test = model.score_samples(
+        scaler.transform(only_features(normal_testset.values))
+    )
+
+    scores_abnormal_test = model.score_samples(
+        scaler.transform(only_features(abnormal_testset.values))
+    )
+
+    normal_with_scores = normal_testset[['name', 'seq_id']].copy()
+    normal_with_scores['score'] = -scores_normal_test
+    normal_with_scores['label'] = 0
+    normal_grouped = normal_with_scores.groupby(by=['name']).mean()
+
+    abnormal_with_scores = abnormal_testset[['name', 'seq_id']].copy()
+    abnormal_with_scores['score'] = -scores_abnormal_test
+    abnormal_with_scores['label'] = 1
+    abnormal_grouped = abnormal_with_scores.groupby(by=['name']).mean()
+
+    scores_normal_grouped = normal_grouped.score.values.tolist()
+    scores_abnormal_grouped = abnormal_grouped.score.values.tolist()
+
+    labels_normal = [0] * len(scores_normal_grouped)
+    labels_abnormal = [1] * len(scores_abnormal_grouped)
+    labels = labels_normal + labels_abnormal
+
+    scores = scores_normal_grouped + scores_abnormal_grouped
+
+    auc = roc_auc_score(y_score=scores, y_true=labels)
+    print(auc)
+
+else:
+    dataset = AudioTransferLearningImageDataset(root_or_files=normal_train_files)
+    dataloader = DataLoader(dataset, batch_size=80, shuffle=True)
+    F = FeatureExtractor(version='resnet18').to('cuda')
+    feature_size = F.feature_size
+    criterion = nn.MSELoss()
+    my_model = MyModel(F).to('cuda')
+
+    for e in range(0):
+        losses = []
+        for batch in tqdm(dataloader):
+            imgs, names, seq_ids = batch
+            imgs = imgs.to('cuda')
+            target, prediction = my_model(imgs)
+            loss = criterion(prediction, target)
+            my_model.optimizer.zero_grad()
+            loss.backward()
+            my_model.optimizer.step()
+            losses.append(loss.item())
+        print(sum(losses)/len(losses))
+
+    # test
+    dataset = AudioTransferLearningImageDataset(root_or_files=normal_test_files)
+    dataloader = DataLoader(dataset, batch_size=80, shuffle=False)
+    my_model.scores_from_dataloader(dataloader)
\ No newline at end of file
diff --git a/transfer_learning/my_model.py b/transfer_learning/my_model.py
new file mode 100644
index 0000000..0ff1e07
--- /dev/null
+++ b/transfer_learning/my_model.py
@@ -0,0 +1,87 @@
+import torch
+import torch.nn as nn
+from torch.optim import Adam
+
+
+class MyModel(nn.Module):
+    def __init__(self, feature_extractor):
+        super(MyModel, self).__init__()
+        self.feature_extractor = feature_extractor
+        feature_size = feature_extractor.feature_size
+        self.noise = nn.Sequential(
+            nn.Linear(feature_size, feature_size // 2),
+            nn.ELU(),
+            nn.Linear(feature_size // 2, feature_size // 4)
+        )
+        for p in self.noise.parameters():
+            p.requires_grad = False
+        self.noise.eval()
+
+        self.student = nn.Sequential(
+            nn.Linear(feature_size, feature_size // 2),
+            nn.ELU(),
+            nn.Linear(feature_size // 2, feature_size // 4),
+            nn.ELU(),
+            nn.Linear(feature_size // 4, feature_size // 4)
+        )
+        self.optimizer = Adam(self.student.parameters(), lr=0.0001, weight_decay=1e-7, amsgrad=True)
+
+    def forward(self, imgs):
+        features = self.feature_extractor.F(imgs).squeeze()
+        target = self.noise(features)
+        prediction = self.student(features)
+        return target, prediction
+
+    def scores_from_dataloader(self, dataloader):
+        scores = []
+        with torch.no_grad():
+            for batch in dataloader:
+                imgs, names, seq_ids = batch
+                imgs = imgs.to('cuda')
+                target, prediction = self.forward(imgs)
+                preds = torch.sum((prediction - target) ** 2, dim=tuple(range(1, target.dim())))
+                scores.append(preds.cpu())
+        return torch.cat(scores)
+
+
+class HyperFraud(nn.Module):
+    def __init__(self, hidden_dim=256):
+        super(HyperFraud, self).__init__()
+        self.hidden_dim = hidden_dim
+        self.mean = torch.randn(size=(1, 512))
+        self.std = torch.randn(size=(1, 512))
+
+        self.W_forget = nn.Sequential(
+            nn.Linear(512, hidden_dim)
+        )
+        self.U_forget = nn.Sequential(
+            nn.Linear(hidden_dim, hidden_dim)
+        )
+        self.W_hidden = nn.Sequential(
+            nn.Linear(512, hidden_dim)
+        )
+        self.U_hidden = nn.Sequential(
+            nn.Linear(hidden_dim, hidden_dim)
+        )
+        self.b_forget = nn.Parameter(torch.randn(1, hidden_dim))
+        self.b_hidden = nn.Parameter(torch.randn(1, hidden_dim))
+
+    def forward(self, data, max_seq_len=10):
+        # data: batch x seqs x dim
+        # random seq sampling
+        h_prev = [torch.zeros(size=(data.shape[0], self.hidden_dim))]
+        for i in range(0, max_seq_len):
+            x_t = data[:, i]
+            h_t_prev = h_prev[-1]
+            W_x_t = self.W_forget(x_t)
+            U_h_prev = self.U_forget(h_t_prev)
+            forget_t = torch.sigmoid(W_x_t + U_h_prev + self.b_forget)
+            h_t = forget_t * h_t_prev + (1.0 - forget_t) * torch.tanh(self.W_hidden(x_t) + self.U_hidden(forget_t * h_t_prev) + self.b_hidden)
+            h_prev.append(h_t)
+        return torch.stack(h_prev[1:], dim=1)
+
+
+#hf = HyperFraud()
+#rand_input = torch.randn(size=(42, 10, 512))
+
+#print(hf(rand_input).shape)
diff --git a/transfer_learning/preprocessor.py b/transfer_learning/preprocessor.py
new file mode 100644
index 0000000..5ef0b29
--- /dev/null
+++ b/transfer_learning/preprocessor.py
@@ -0,0 +1,82 @@
+import numpy as np
+from tqdm import tqdm
+import librosa
+import librosa.display
+from matplotlib import pyplot as plt
+from pathlib import Path
+
+
+class Preprocessor:
+    def __init__(self, sr=16000, n_mels=64, n_fft=1024, hop_length=256, chunk_size=64, chunk_hop=32, cmap='viridis'):
+        self.sr = sr
+        self.n_fft = n_fft
+        self.n_mels = n_mels
+        self.hop_length = hop_length
+        self.chunk_size = chunk_size
+        self.chunk_hop = chunk_hop
+        self.cmap = cmap
+
+    def process_audio(self, path, out_folder=None):
+        mel_spec = self.to_mel_spec(path)
+        for count, i in enumerate(range(0, mel_spec.shape[1], self.chunk_hop)):
+            chunk = mel_spec[:, i:i+self.chunk_size]
+            if chunk.shape[1] < self.chunk_size:
+                break  # slicing clamps instead of raising IndexError; skip short trailing chunks
+            out_path = out_folder / f'{path.stem}_{count}.jpg'
+            self.mel_spec_to_img(chunk, out_path)  # todo must adjust outpath name
+
+    def to_mel_spec(self, path):
+        audio, sr = librosa.load(str(path), sr=self.sr, mono=True)
+        spectrogram = librosa.stft(audio,
+                                   n_fft=self.n_fft,
+                                   hop_length=self.n_fft // 2,
+                                   center=False)
+        spectrogram = librosa.feature.melspectrogram(S=np.abs(spectrogram) ** 2,
+                                                     sr=sr,
+                                                     n_mels=self.n_mels,
+                                                     hop_length=self.hop_length)
+        # prepare plot
+        spectrogram = librosa.power_to_db(spectrogram, ref=np.max, top_db=None)
+        return spectrogram
+
+    def mel_spec_to_img(self, spectrogram, out_path, size=227):
+        # prepare plotting
+        fig = plt.figure(frameon=False, tight_layout=False)
+        fig.set_size_inches(1, 1)
+        ax = plt.Axes(fig, [0., 0., 1., 1.])
+        ax.set_axis_off()
+        fig.add_axes(ax)
+
+        spectrogram_axes = librosa.display.specshow(spectrogram,
+                                                    hop_length=self.n_fft // 2,
+                                                    fmax=self.sr/2,
+                                                    sr=self.sr,
+                                                    cmap=self.cmap,
+                                                    y_axis='mel',
+                                                    x_axis='time')
+
+        fig.add_axes(spectrogram_axes, id='spectrogram')
+        fig.savefig(out_path, format='jpg', dpi=size)
+        plt.clf()
+        plt.close()
+
+    def process_folder(self, folder_in, folder_out):
+        wavs = folder_in.glob('*.wav')
+        folder_out.mkdir(parents=True, exist_ok=True)
+        for wav in tqdm(list(wavs)):
+            self.process_audio(wav, folder_out)
+
+if __name__ == '__main__':
+    models = ['slider', 'pump', 'fan']
+    model_ids = [0, 2, 4, 6]
+    preprocessor = Preprocessor()
+    for model in models:
+        for model_id in model_ids:
+            preprocessor.process_folder(Path(f'/home/robert/coding/audio_anomaly_detection/data/mimii/-6_dB_{model}/id_0{model_id}/normal'),
+                                        Path(f'/home/robert/coding/audio_anomaly_detection/data/mimii/-6_dB_{model}/id_0{model_id}/normal/melspec_images/')
+                                        )
+            preprocessor.process_folder(Path(f'/home/robert/coding/audio_anomaly_detection/data/mimii/-6_dB_{model}/id_0{model_id}/abnormal'),
+                                        Path(f'/home/robert/coding/audio_anomaly_detection/data/mimii/-6_dB_{model}/id_0{model_id}/abnormal/melspec_images/')
+                                        )
\ No newline at end of file
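
The comments at the top of clean.py above outline the planned scoring pipeline (per-band reconstruction errors per snippet, merged by a GMM or IsoForest, averaged over snippets, then scored with ROC AUC) but leave Scorer and Evaluator as empty stubs. A minimal, self-contained sketch of that flow might look as follows; it assumes per-snippet band-error matrices shaped like the [n_snippets, 7] arrays that FullSubSpecCAE.gather_predictions returns, and the file name, function names, and GaussianMixture settings are illustrative assumptions, not part of the patch:

# sketch_clean_pipeline.py (hypothetical, illustrates the plan in clean.py)
import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.metrics import roc_auc_score

def fit_merger(train_band_errors, n_components=2, seed=42):
    # train_band_errors: (n_snippets, 7) band-wise MSEs on normal training data
    gmm = GaussianMixture(n_components=n_components, covariance_type='diag', random_state=seed)
    gmm.fit(train_band_errors)
    return gmm

def file_score(gmm, band_errors):
    # band_errors: (n_snippets, 7) for a single wav file
    # step 2: 1x7 -> 1x1 via negative log-likelihood; step 3: mean over snippets
    return -gmm.score_samples(band_errors).mean()

def evaluate(gmm, files_band_errors, file_labels):
    # file_labels: 0 = normal, 1 = abnormal, one per file; higher score = more anomalous
    scores = [file_score(gmm, f) for f in files_band_errors]
    return roc_auc_score(file_labels, scores)

Swapping GaussianMixture for IsolationForest.decision_function (negated) gives the IsoForest variant of the same merge step, which is what new_main.py implements inline.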