From 62d9eb6e8f77631aa97d629491e72325b683602c Mon Sep 17 00:00:00 2001
From: Si11ium
Date: Thu, 17 Dec 2020 08:02:28 +0100
Subject: [PATCH] torchaudio testing

---
 audio_toolset/audio_io.py             |  45 ++++---
 audio_toolset/audio_to_mel_dataset.py | 163 ++++++++++++++++++++++++++
 audio_toolset/mel_augmentation.py     |  57 +++++----
 audio_toolset/mel_dataset.py          |  38 +++---
 modules/util.py                       |   3 +
 utils/transforms.py                   |  13 ++
 6 files changed, 264 insertions(+), 55 deletions(-)
 create mode 100644 audio_toolset/audio_to_mel_dataset.py

diff --git a/audio_toolset/audio_io.py b/audio_toolset/audio_io.py
index 12acefb..aaede83 100644
--- a/audio_toolset/audio_io.py
+++ b/audio_toolset/audio_io.py
@@ -1,3 +1,7 @@
+from typing import Union
+
+import torch
+
 try:
     import librosa
 except ImportError:  # pragma: no-cover
@@ -10,9 +14,6 @@ except ImportError:  # pragma: no-cover
                       ' install it with `pip install scikit-learn`.')
 
 
-import numpy as np
-
-
 def scale_minmax(x, min_val=0.0, max_val=1.0):
     x_std = (x - x.min()) / (x.max() - x.min())
     x_scaled = x_std * (max_val - min_val) + min_val
@@ -47,13 +48,12 @@ class MFCC(object):
 
 class NormalizeLocal(object):
     def __init__(self):
-        self.cache: np.ndarray
         pass
 
     def __repr__(self):
         return f'{self.__class__.__name__}({self.__dict__})'
 
-    def __call__(self, x: np.ndarray):
+    def __call__(self, x: torch.Tensor):
         mean = x.mean()
         std = x.std() + 0.0001
@@ -61,37 +61,47 @@ class NormalizeLocal(object):
         # tensor = tensor.__sub__(mean).__div__(std)
         # Numpy Version
         x = (x - mean) / std
-        x[np.isnan(x)] = 0
-        x[np.isinf(x)] = 0
+
+        x[torch.isnan(x)] = 0
+        x[torch.isinf(x)] = 0
+
         return x
 
 
 class NormalizeMelband(object):
     def __init__(self):
-        self.cache: np.ndarray
+        pass
 
     def __repr__(self):
         return f'{self.__class__.__name__}({self.__dict__})'
 
-    def __call__(self, x: np.ndarray):
+    def __call__(self, x: torch.Tensor):
         mean = x.mean(-1).unsqueeze(-1)
         std = x.std(-1).unsqueeze(-1)
 
-        x = x.__sub__(mean).__div__(std)
-        x[np.isnan(x)] = 0
-        x[np.isinf(x)] = 0
+        x = (x - mean) / std
+        x[torch.isnan(x)] = 0
+        x[torch.isinf(x)] = 0
         return x
 
 
-class AudioToMel(object):
+class LibrosaAudioToMel(object):
     def __init__(self, amplitude_to_db=False, power_to_db=False, **mel_kwargs):
         assert not all([amplitude_to_db, power_to_db]), "Choose amplitude_to_db or power_to_db, not both!"
+        # Expected mel kwargs:
+        #   sr
+        #   n_mels
+        #   n_fft
+        #   hop_length
         self.mel_kwargs = mel_kwargs
         self.amplitude_to_db = amplitude_to_db
         self.power_to_db = power_to_db
 
     def __call__(self, y):
+        import numpy as np  # local import; the module-level numpy import was removed
+
         mel = librosa.feature.melspectrogram(y, **self.mel_kwargs)
         if self.amplitude_to_db:
             mel = librosa.amplitude_to_db(mel, ref=np.max)
@@ -111,6 +121,7 @@ class PowerToDB(object):
         return f'{self.__class__.__name__}({self.__dict__})'
 
     def __call__(self, x):
+        import numpy as np  # local import; the module-level numpy import was removed
         if self.running_max is not None:
             self.running_max = max(np.max(x), self.running_max)
         return librosa.power_to_db(x, ref=self.running_max)
@@ -137,11 +148,11 @@ class MelToImage(object):
 
     def __call__(self, x):
         # Source to Solution: https://stackoverflow.com/a/57204349
-        mels = np.log(x + 1e-9)  # add small number to avoid log(0)
+        x = torch.as_tensor(x)  # also accept numpy input from the librosa pipeline
+        mels = torch.log(x + 1e-9)  # add small number to avoid log(0)
 
         # min-max scale to fit inside 8-bit range
-        img = scale_minmax(mels, 0, 255).astype(np.uint8)
-        img = np.flip(img, axis=0)  # put low frequencies at the bottom in image
-        img = 255 - img  # invert. make black==more energy
-        img = img.astype(np.float32)
+        img = scale_minmax(mels, 0, 255).int()
+        img = torch.flip(img, dims=(-2,))  # flip the mel axis: low frequencies at the bottom
+        img = 255 - img  # invert. make black==more energy
+        img = img.float()
         return img
diff --git a/audio_toolset/audio_to_mel_dataset.py b/audio_toolset/audio_to_mel_dataset.py
new file mode 100644
index 0000000..eb2d7f4
--- /dev/null
+++ b/audio_toolset/audio_to_mel_dataset.py
@@ -0,0 +1,163 @@
+import sys
+import pickle
+from abc import ABC
+from pathlib import Path
+
+from torch.utils.data import Dataset
+from torchvision.transforms import Compose
+
+from ml_lib.audio_toolset.audio_io import LibrosaAudioToMel, MelToImage
+from ml_lib.audio_toolset.mel_dataset import TorchMelDataset
+
+
+class _AudioToMelDataset(Dataset, ABC):
+
+    @property
+    def audio_file_duration(self):
+        raise NotImplementedError
+
+    @property
+    def sampling_rate(self):
+        raise NotImplementedError
+
+    def __init__(self, audio_file_path, label, sample_segment_len=1, sample_hop_len=1, reset=False,
+                 audio_augmentations=None, mel_augmentations=None, mel_kwargs=None, **kwargs):
+        self.ignored_kwargs = kwargs
+        self.mel_kwargs = mel_kwargs
+        self.reset = reset
+        self.audio_path = Path(audio_file_path)
+
+        mel_folder_suffix = self.audio_path.parent.parent.name
+        # Note: the cached file is pickled, despite the .npy suffix.
+        self.mel_file_path = Path(str(self.audio_path)
+                                  .replace(mel_folder_suffix, f'{mel_folder_suffix}_mel_folder')
+                                  .replace(self.audio_path.suffix, '.npy'))
+
+        self.audio_augmentations = audio_augmentations
+
+        # torchaudio names the key 'sample_rate', librosa names it 'sr'; accept either.
+        sample_rate = mel_kwargs.get('sample_rate', mel_kwargs.get('sr'))
+        self.dataset = TorchMelDataset(self.mel_file_path, sample_segment_len, sample_hop_len, label,
+                                       self.audio_file_duration, sample_rate, mel_kwargs['hop_length'],
+                                       mel_kwargs['n_mels'], transform=mel_augmentations)
+
+    def _build_mel(self):
+        raise NotImplementedError
+
+    def __getitem__(self, item):
+        try:
+            return self.dataset[item]
+        except FileNotFoundError:
+            # The mel file is not on disk yet; build it once, then retry.
+            assert self._build_mel()
+            return self.dataset[item]
+
+    def __len__(self):
+        return len(self.dataset)
+
+
+import librosa
+
+
+class LibrosaAudioToMelDataset(_AudioToMelDataset):
+
+    @property
+    def audio_file_duration(self):
+        return librosa.get_duration(sr=self.mel_kwargs.get('sr', None), filename=str(self.audio_path))
+
+    @property
+    def sampling_rate(self):
+        return self.mel_kwargs.get('sr', None)
+
+    def __init__(self, audio_file_path, *args, **kwargs):
+        audio_file_path = Path(audio_file_path)
+        # self.audio_path is only set by the base class, so use the local variable here.
+        mel_kwargs = kwargs.get('mel_kwargs', dict())
+        mel_kwargs.update(sr=mel_kwargs.get('sr', None) or librosa.get_samplerate(audio_file_path))
+        kwargs.update(mel_kwargs=mel_kwargs)
+
+        super(LibrosaAudioToMelDataset, self).__init__(audio_file_path, *args, **kwargs)
+
+        self._mel_transform = Compose([LibrosaAudioToMel(**mel_kwargs),
+                                       MelToImage()
+                                       ])
+
+    def _build_mel(self):
+        if self.reset:
+            self.mel_file_path.unlink(missing_ok=True)
+        if not self.mel_file_path.exists():
+            self.mel_file_path.parent.mkdir(parents=True, exist_ok=True)
+            raw_sample, _ = librosa.core.load(self.audio_path, sr=self.sampling_rate)
+            mel_sample = self._mel_transform(raw_sample)
+            if mel_sample.ndim == 2:
+                # Add a channel dimension to match the (channel, n_mels, time)
+                # layout produced by the torchaudio pipeline below.
+                mel_sample = mel_sample.unsqueeze(0)
+            with self.mel_file_path.open('wb') as mel_file:
+                pickle.dump(mel_sample, mel_file, protocol=pickle.HIGHEST_PROTOCOL)
+
+        return self.mel_file_path.exists()
+
+
+import torchaudio
+if sys.platform == 'win32':
+    torchaudio.set_audio_backend('soundfile')
+else:
+    torchaudio.set_audio_backend('sox_io')
+
+
+class PyTorchAudioToMelDataset(_AudioToMelDataset):
+
+    @property
+    def audio_file_duration(self):
+        info_obj = torchaudio.info(self.audio_path)
+        return info_obj.num_frames / info_obj.sample_rate
+
+    @property
+    def sampling_rate(self):
+        return self.mel_kwargs['sample_rate']
+
+    def __init__(self, audio_file_path, *args, **kwargs):
+        super(PyTorchAudioToMelDataset, self).__init__(audio_file_path, *args, **kwargs)
+
+        from torchaudio.transforms import MelSpectrogram
+        self._mel_transform = Compose([MelSpectrogram(**self.mel_kwargs),
+                                       MelToImage()
+                                       ])
+
+    def _build_mel(self):
+        if self.reset:
+            self.mel_file_path.unlink(missing_ok=True)
+        if not self.mel_file_path.exists():
+            self.mel_file_path.parent.mkdir(parents=True, exist_ok=True)
+            # A lock file keeps concurrent TorchMelDataset readers waiting until the dump is done.
+            lock_file = Path(str(self.mel_file_path).replace(self.mel_file_path.suffix, '.lock'))
+            lock_file.touch(exist_ok=False)
+
+            try:
+                audio_sample, sample_rate = torchaudio.load(self.audio_path)
+            except RuntimeError:
+                # The active backend cannot read some files; rewrite them with
+                # soundfile, then retry once.
+                import soundfile
+
+                data, samplerate = soundfile.read(self.audio_path)
+                soundfile.write(self.audio_path, data, samplerate, subtype='PCM_32')
+                audio_sample, sample_rate = torchaudio.load(self.audio_path)
+            if sample_rate != self.sampling_rate:
+                resample = torchaudio.transforms.Resample(orig_freq=int(sample_rate),
+                                                          new_freq=int(self.sampling_rate))
+                audio_sample = resample(audio_sample)
+            if audio_sample.shape[0] > 1:
+                # Mix stereo down to mono.
+                audio_sample = audio_sample.mean(dim=0, keepdim=True)
+            mel_sample = self._mel_transform(audio_sample)
+            with self.mel_file_path.open('wb') as mel_file:
+                pickle.dump(mel_sample, mel_file, protocol=pickle.HIGHEST_PROTOCOL)
+            lock_file.unlink()
+
+        return self.mel_file_path.exists()
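
Usage sketch (not part of the patch): driving the datasets defined above. The path, label and
mel parameters below are hypothetical. The torchaudio-backed variant is constructed the same
way, but its mel_kwargs must use torchaudio's 'sample_rate' key, while the librosa variant
uses 'sr' and infers it from the file when omitted.

    from torch.utils.data import DataLoader
    from ml_lib.audio_toolset.audio_to_mel_dataset import LibrosaAudioToMelDataset

    dataset = LibrosaAudioToMelDataset('data/train/class_a/recording_0.wav', label=0,
                                       sample_segment_len=50, sample_hop_len=25,
                                       mel_kwargs=dict(n_mels=64, n_fft=512, hop_length=256))

    snippet, lbl = dataset[0]  # builds and caches the mel file on first access
    loader = DataLoader(dataset, batch_size=8, shuffle=True)  # every sub-segment carries the file label
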
diff --git a/audio_toolset/mel_augmentation.py b/audio_toolset/mel_augmentation.py
index acb356b..ccb16cf 100644
--- a/audio_toolset/mel_augmentation.py
+++ b/audio_toolset/mel_augmentation.py
@@ -1,63 +1,66 @@
+import torch
 import numpy as np
 
+from ml_lib.utils.transforms import _BaseTransformation
 
-class NoiseInjection(object):
 
-    def __init__(self, noise_factor: float, sigma=0.5, mu=0.5):
-        assert noise_factor >= 0, f'max_shift_ratio has to be greater then 0, but was: {noise_factor}.'
+class NoiseInjection(_BaseTransformation):
+
+    def __init__(self, noise_factor: float, sigma=1, mu=0):
+        super(NoiseInjection, self).__init__()
+        assert noise_factor >= 0, f'noise_factor has to be greater than or equal to 0, but was: {noise_factor}.'
         self.mu = mu
         self.sigma = sigma
         self.noise_factor = noise_factor
 
-    def __call__(self, x: np.ndarray):
+    def __call__(self, x):
         if self.noise_factor:
-            noise = np.random.uniform(0, self.noise_factor, size=x.shape)
+            noise = torch.normal(self.mu, self.sigma, size=x.shape, device=x.device) * self.noise_factor
             augmented_data = x + x * noise
-            # Cast back to same data type
-            augmented_data = augmented_data.astype(x.dtype)
             return augmented_data
         else:
             return x
 
 
-class LoudnessManipulator(object):
+class LoudnessManipulator(_BaseTransformation):
 
     def __init__(self, max_factor: float):
-        assert 1 > max_factor >= 0, f'max_shift_ratio has to be between [0,1], but was: {max_factor}.'
+        super(LoudnessManipulator, self).__init__()
+        assert 1 > max_factor >= 0, f'max_factor has to be within [0, 1), but was: {max_factor}.'
 
         self.max_factor = max_factor
 
-    def __call__(self, x: np.ndarray):
+    def __call__(self, x):
         if self.max_factor:
-            augmented_data = x + x * (np.random.random() * self.max_factor)
-            # Cast back to same data type
-            augmented_data = augmented_data.astype(x.dtype)
+            augmented_data = x + x * (torch.rand(1, device=x.device) * self.max_factor)
             return augmented_data
         else:
             return x
 
 
-class ShiftTime(object):
+class ShiftTime(_BaseTransformation):
 
     valid_shifts = ['right', 'left', 'any']
 
     def __init__(self, max_shift_ratio: float, shift_direction: str = 'any'):
+        super(ShiftTime, self).__init__()
         assert 1 > max_shift_ratio >= 0, f'max_shift_ratio has to be between [0,1], but was: {max_shift_ratio}.'
         assert shift_direction.lower() in self.valid_shifts, f'shift_direction has to be one of: {self.valid_shifts}'
         self.max_shift_ratio = max_shift_ratio
         self.shift_direction = shift_direction.lower()
 
-    def __call__(self, x: np.ndarray):
+    def __call__(self, x):
         if self.max_shift_ratio:
-            shift = np.random.randint(max(int(self.max_shift_ratio * x.shape[-1]), 1))
+            shift = torch.randint(max(int(self.max_shift_ratio * x.shape[-1]), 1), (1,)).item()
             if self.shift_direction == 'right':
                 shift = -1 * shift
             elif self.shift_direction == 'any':
-                direction = np.random.choice([1, -1], 1)
+                # A plain numpy draw reads better than the torch.multinomial alternative.
+                direction = int(np.random.choice([1, -1]))
                 shift = direction * shift
-            augmented_data = np.roll(x, shift)
-            # Set to silence for heading/ tailing
-            shift = int(shift)
+            augmented_data = torch.roll(x, shift, dims=-1)
+            # Set the wrapped-around heading/tailing samples to silence;
+            # the roll happens along the time axis, so zero along that axis too.
             if shift > 0:
-                augmented_data[:shift, :] = 0
+                augmented_data[..., :shift] = 0
             else:
-                augmented_data[shift:, :] = 0
+                augmented_data[..., shift:] = 0
@@ -67,12 +70,13 @@ class ShiftTime(object):
             return x
 
 
-class MaskAug(object):
+class MaskAug(_BaseTransformation):
 
     w_idx = -1
     h_idx = -2
 
     def __init__(self, duration_ratio_max=0.3, mask_with_noise=True):
+        super(MaskAug, self).__init__()
         assertion = f'"duration_ratio" has to be within [0..1], but was: {duration_ratio_max}'
         if isinstance(duration_ratio_max, (tuple, list)):
             assert all([0 < max_val < 1 for max_val in duration_ratio_max]), assertion
@@ -85,15 +89,20 @@ class MaskAug(object):
                 else (duration_ratio_max, duration_ratio_max)
 
     def __call__(self, x):
+        assert x.ndim == 3, 'This transform expects three-dimensional input (channel, height, width).'
         for dim in (self.w_idx, self.h_idx):
             if self.duration_ratio_max[dim]:
-                start = int(np.random.choice(x.shape[dim], 1))
-                v_max = x.shape[dim] * self.duration_ratio_max[dim]
-                size = int(np.random.randint(0, v_max, 1))
+                start = int(np.random.choice(x.shape[dim]))
+                v_max = int(x.shape[dim] * self.duration_ratio_max[dim])
+                size = torch.randint(0, max(v_max, 1), (1,)).item()
                 end = int(min(start + size, x.shape[dim]))
                 size = end - start
                 if dim == self.w_idx:
-                    x[:, start:end] = np.random.random((x.shape[self.h_idx], size)) if self.mask_with_noise else 0
+                    mask = torch.randn((x.shape[self.h_idx], size), device=x.device) if self.mask_with_noise else 0
+                    x[:, :, start:end] = mask
                 else:
-                    x[start:end, :] = np.random.random((size, x.shape[self.w_idx])) if self.mask_with_noise else 0
+                    mask = torch.randn((size, x.shape[self.w_idx]), device=x.device) if self.mask_with_noise else 0
+                    x[:, start:end, :] = mask
         return x
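
A sketch of chaining the rewritten augmentations (not part of the patch; shapes and factors
are made up). Since MaskAug now asserts a three-dimensional input, the whole chain should
operate on (channel, n_mels, time) tensors:

    import torch
    from torchvision.transforms import Compose
    from ml_lib.audio_toolset.mel_augmentation import NoiseInjection, LoudnessManipulator, ShiftTime, MaskAug

    augment = Compose([NoiseInjection(0.4),
                       LoudnessManipulator(0.4),
                       ShiftTime(0.3),
                       MaskAug(0.2)])

    mel = torch.rand(1, 64, 200)  # (channel, n_mels, time)
    augmented = augment(mel)      # same shape; MaskAug mutates its input in place
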
diff --git a/audio_toolset/mel_dataset.py b/audio_toolset/mel_dataset.py
index d2b4536..b2a55b7 100644
--- a/audio_toolset/mel_dataset.py
+++ b/audio_toolset/mel_dataset.py
@@ -1,30 +1,40 @@
+import time
 from pathlib import Path
 
-import numpy as np
+import pickle
 
 from torch.utils.data import Dataset
 
+from ml_lib.modules.util import AutoPadToShape
+
 
 class TorchMelDataset(Dataset):
-    def __init__(self, identifier, mel_path, segment_len, hop_len, label, padding=0, transform=None):
-        self.padding = padding
-        self.path = next(iter(Path(mel_path).glob(f'{identifier}_*')))
-        self.segment_len = segment_len
-        self.m, self.n = str(self.path).split('_')[-2:]  # get spectrogram dimensions
-        self.n = int(self.n.split('.', 1)[0])  # remove .npy
-        self.m, self.n = (int(i) for i in (self.m, self.n))
-        self.offsets = list(range(0, self.n - segment_len, hop_len))
+    def __init__(self, mel_path, sub_segment_len, sub_segment_hop_len, label, audio_file_len,
+                 sampling_rate, mel_hop_len, n_mels, transform=None, auto_pad_to_shape=True):
+        super(TorchMelDataset, self).__init__()
+        self.sampling_rate = sampling_rate
+        self.audio_file_len = audio_file_len
+        self.padding = AutoPadToShape((1, n_mels, sub_segment_len)) if auto_pad_to_shape else None
+        self.path = Path(mel_path)
+        self.sub_segment_len = sub_segment_len
+        self.mel_hop_len = mel_hop_len
+        self.sub_segment_hop_len = sub_segment_hop_len
+        # Number of mel frames, derived from the audio duration: one frame per hop, plus one.
+        self.n = int((self.sampling_rate / self.mel_hop_len) * self.audio_file_len + 1)
+        self.offsets = list(range(0, self.n - self.sub_segment_len, self.sub_segment_hop_len))
         self.label = label
         self.transform = transform
 
     def __getitem__(self, item):
+        # Wait while a concurrent _build_mel() still holds the lock file.
+        while Path(str(self.path).replace(self.path.suffix, '.lock')).exists():
+            time.sleep(0.01)
+        with self.path.open('rb') as mel_file:
+            mel_spec = pickle.load(mel_file, fix_imports=True)
         start = self.offsets[item]
-        mel_spec = np.load(str(self.path), allow_pickle=True)
-        if self.padding > 0:
-            mel_spec = np.pad(mel_spec, pad_width=[(0, 0), (self.padding // 2, self.padding // 2)], mode='mean')
-        snippet = mel_spec[:, start: start + self.segment_len]
+        snippet = mel_spec[:, :, start: start + self.sub_segment_len]
         if self.transform:
             snippet = self.transform(snippet)
+        if self.padding:
+            snippet = self.padding(snippet)
         return snippet, self.label
 
     def __len__(self):
-        return len(self.offsets)
\ No newline at end of file
+        return len(self.offsets)
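
A worked example of the segmenting arithmetic above (values hypothetical): the number of mel
frames n is now derived from the audio duration instead of being parsed from the cached file
name.

    sampling_rate, mel_hop_len, audio_file_len = 16000, 256, 10.0  # 10 s file, hypothetical values
    n = int((sampling_rate / mel_hop_len) * audio_file_len + 1)    # 626 mel frames
    offsets = list(range(0, n - 50, 25))                           # sub_segment_len=50, sub_segment_hop_len=25
    assert n == 626 and len(offsets) == 24                         # 24 sub-segments from one file
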
diff --git a/modules/util.py b/modules/util.py
index 4ab4300..680a088 100644
--- a/modules/util.py
+++ b/modules/util.py
@@ -44,6 +44,9 @@ try:
         def size(self):
             return self.shape
 
+        def additional_scores(self, outputs):
+            raise NotImplementedError
+
         @property
         def dataset_class(self):
             try:
diff --git a/utils/transforms.py b/utils/transforms.py
index 4c4d955..09bdae0 100644
--- a/utils/transforms.py
+++ b/utils/transforms.py
@@ -1,6 +1,19 @@
+from abc import ABC
+
 from torchvision.transforms import ToTensor as TorchVisionToTensor
 
 
+class _BaseTransformation(ABC):
+
+    def __init__(self, *args):
+        pass
+
+    def __repr__(self):
+        return f'{self.__class__.__name__}({self.__dict__})'
+
+    def __call__(self, *args, **kwargs):
+        raise NotImplementedError
+
+
 class ToTensor(TorchVisionToTensor):
 
     def __call__(self, pic):
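
New transforms are expected to follow the _BaseTransformation contract introduced above; a
minimal sketch of a custom transform (the class and its behaviour are illustrative only):

    import torch
    from ml_lib.utils.transforms import _BaseTransformation

    class Clamp(_BaseTransformation):
        """Illustrative example: clamp inputs into [min_val, max_val]."""

        def __init__(self, min_val=0., max_val=1.):
            super(Clamp, self).__init__()
            self.min_val = min_val
            self.max_val = max_val

        def __call__(self, x):
            return torch.clamp(x, self.min_val, self.max_val)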