# audio_anomaly_detection/transfer_learning/preprocessor.py

import numpy as np
from tqdm import tqdm
import librosa
import librosa.display
from matplotlib import pyplot as plt
from pathlib import Path


class Preprocessor:
    """Turn audio files into fixed-width mel-spectrogram image chunks.

    Each audio file is converted to a dB-scaled mel spectrogram, cut into
    overlapping windows of ``chunk_size`` frames (advancing ``chunk_hop``
    frames per step) and every window is rendered to a square JPEG.

    Args:
        sr: Sample rate the audio is resampled to on load.
        n_mels: Number of mel bands.
        n_fft: FFT window length in samples.
        hop_length: Stored and forwarded to ``melspectrogram``; see the
            note in :meth:`to_mel_spec` about its actual effect.
        chunk_size: Width of one image chunk, in spectrogram frames.
        chunk_hop: Step between consecutive chunk starts, in frames.
        cmap: Matplotlib colormap name used for rendering.
    """

    def __init__(self, sr=16000, n_mels=64, n_fft=1024, hop_length=256, chunk_size=64, chunk_hop=32, cmap='viridis'):
        self.sr = sr
        self.n_fft = n_fft
        self.n_mels = n_mels
        self.hop_length = hop_length
        self.chunk_size = chunk_size
        self.chunk_hop = chunk_hop
        self.cmap = cmap

    def _chunk_starts(self, n_frames):
        """Return the start frames of all full-width chunks in ``n_frames``."""
        # The last valid start is the one whose window still ends inside the
        # spectrogram; an input shorter than chunk_size yields an empty range.
        return range(0, n_frames - self.chunk_size + 1, self.chunk_hop)

    def process_audio(self, path, out_folder=None):
        """Render one audio file as a series of mel-spectrogram images.

        Images are written as ``<stem>_<index>.jpg``. Only chunks that are
        exactly ``chunk_size`` frames wide are written: array slicing never
        raises ``IndexError``, so truncated trailing chunks must be excluded
        explicitly, otherwise they would be stretched to the full image size.

        Args:
            path: Audio file to process (``str`` or ``Path``).
            out_folder: Target directory (``str`` or ``Path``). Defaults to
                the directory containing ``path``. It is not created here.
        """
        path = Path(path)
        out_folder = path.parent if out_folder is None else Path(out_folder)
        mel_spec = self.to_mel_spec(path)
        for count, start in enumerate(self._chunk_starts(mel_spec.shape[1])):
            chunk = mel_spec[:, start:start + self.chunk_size]
            # TODO: revisit the output file naming (note carried over from the original code)
            out_path = out_folder / f'{path.stem}_{count}.jpg'
            self.mel_spec_to_img(chunk, out_path)

    def to_mel_spec(self, path):
        """Load an audio file and return its mel spectrogram in dB.

        Args:
            path: Audio file to load; it is resampled to ``self.sr`` and
                mixed down to mono.

        Returns:
            2-D array with ``n_mels`` rows, in dB relative to the maximum
            power found in this file (no lower clipping, ``top_db=None``).
        """
        audio, sr = librosa.load(str(path), sr=self.sr, mono=True)
        # NOTE(review): the STFT hop is n_fft // 2, not self.hop_length, and
        # mel_spec_to_img() plots with the same n_fft // 2. The hop_length
        # passed to melspectrogram below presumably has no effect because a
        # precomputed S is supplied - confirm against the librosa docs before
        # changing either value, as it would alter every generated image.
        spectrogram = librosa.stft(audio,
                                   n_fft=self.n_fft,
                                   hop_length=self.n_fft // 2,
                                   center=False)
        spectrogram = librosa.feature.melspectrogram(S=np.abs(spectrogram) ** 2,
                                                     sr=sr,
                                                     n_mels=self.n_mels,
                                                     hop_length=self.hop_length)
        # Convert power to dB, normalised to the loudest bin of the file.
        spectrogram = librosa.power_to_db(spectrogram, ref=np.max, top_db=None)
        return spectrogram

    def mel_spec_to_img(self, spectrogram, out_path, size=227):
        """Save a spectrogram (chunk) as a borderless square JPEG.

        Args:
            spectrogram: 2-D dB-scaled mel spectrogram to draw.
            out_path: Destination file path of the image.
            size: Edge length of the image in pixels (the figure is 1x1 inch,
                so this is passed as the DPI).
        """
        fig = plt.figure(frameon=False, tight_layout=False)
        try:
            fig.set_size_inches(1, 1)
            # A single axes object covering the whole canvas, without ticks,
            # labels or frame, so the image holds only spectrogram pixels.
            ax = plt.Axes(fig, [0., 0., 1., 1.])
            ax.set_axis_off()
            fig.add_axes(ax)

            # Draw explicitly into our axes instead of relying on the global
            # "current axes" and re-adding specshow's return value to the
            # figure (that value is not an Axes in newer librosa releases).
            librosa.display.specshow(spectrogram,
                                     hop_length=self.n_fft // 2,
                                     fmax=self.sr/2,
                                     sr=self.sr,
                                     cmap=self.cmap,
                                     y_axis='mel',
                                     x_axis='time',
                                     ax=ax)

            fig.savefig(out_path, format='jpg', dpi=size)
        finally:
            # Always release the figure, even when drawing or saving fails;
            # otherwise figures pile up over a long batch run.
            plt.close(fig)

    def process_folder(self, folder_in, folder_out):
        """Process every ``*.wav`` file directly inside ``folder_in``.

        Args:
            folder_in: Directory to search (non-recursive; ``str`` or ``Path``).
            folder_out: Directory for the images; created if missing.
        """
        folder_in = Path(folder_in)
        folder_out = Path(folder_out)
        # Materialise (and sort) the matches so tqdm knows the total and the
        # processing order is deterministic.
        wavs = sorted(folder_in.glob('*.wav'))
        folder_out.mkdir(parents=True, exist_ok=True)
        for wav in tqdm(wavs):
            self.process_audio(wav, folder_out)

if __name__ == '__main__':
    # Batch-convert the MIMII -6 dB recordings: for every machine type and
    # machine id, render the 'normal' clips first, then the 'abnormal' ones,
    # into a 'melspec_images' sub-folder next to the source wav files.
    machine_types = ['slider', 'pump', 'fan']
    machine_ids = [0, 2, 4, 6]
    converter = Preprocessor()
    for machine in machine_types:
        for machine_id in machine_ids:
            for condition in ('normal', 'abnormal'):
                source_dir = Path(f'/home/robert/coding/audio_anomaly_detection/data/mimii/-6_dB_{machine}/id_0{machine_id}/{condition}')
                target_dir = Path(f'/home/robert/coding/audio_anomaly_detection/data/mimii/-6_dB_{machine}/id_0{machine_id}/{condition}/melspec_images/')
                converter.process_folder(source_dir, target_dir)