How to Synthesize a Noisy Dataset for Training a Noise-Robust ASR Model
Data augmentation is a useful technique for improving model performance across many domains. In particular, it can significantly improve a model's robustness to noisy acoustic environments. This tutorial walks you through creating a noisy dataset from sample files.
"""
You can run either this tutorial locally (if you have all the dependencies and a GPU) or on Google Colab.
Instructions for setting up Colab are as follows:
1. Open a new Python 3 notebook.
2. Import this notebook from GitHub (File > Upload Notebook > "GITHUB" tab > copy/paste GitHub URL).
3. Connect to an instance with a GPU (Runtime > Change runtime type > select "GPU" for hardware accelerator.)
4. Run this cell to set up dependencies.
5. Restart the runtime (Runtime > Restart Runtime) for any upgraded packages to take effect.
"""
# If you're using Google Colab and not running locally, run this cell.
## Install dependencies
!pip install wget
!apt-get install sox
"""
Remember to restart the runtime for the kernel to pick up any upgraded packages (for example, Matplotlib)!
Alternatively, you can uncomment the `exit()` below to crash and restart the kernel, in the case
that you want to use the "Run All Cells" (or similar) option.
"""
# exit()
import os
# This is where the noise samples will be placed.
noise_samples = 'noise_samples'
if not os.path.exists(noise_samples):
    os.makedirs(noise_samples)
For background noise, we will use the background-noise samples from the Room Impulse Response and Noise Database hosted on OpenSLR. For each 30-second isotropic noise sample in the dataset, we use the first 15 seconds for training and the last 15 seconds for evaluation.
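The rest of this tutorial works with the raw noise samples directly, but if you also want to reproduce the 15-second training/evaluation split described above, a minimal sketch could look like the following. The helper name split_noise_sample is hypothetical, and librosa and soundfile (imported below) are assumed to be available:
import librosa
import soundfile as sf

# Hypothetical helper: split one 30-second isotropic noise file into
# a 15-second training half and a 15-second evaluation half.
def split_noise_sample(wav_path, train_path, eval_path, split_sec=15.0):
    audio, sr = librosa.load(wav_path, sr=None, mono=True)
    split_idx = int(split_sec * sr)
    sf.write(train_path, audio[:split_idx], sr)  # first 15 seconds -> training
    sf.write(eval_path, audio[split_idx:], sr)   # last 15 seconds -> evaluation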
Let's download the dataset.
# Download noise samples
import subprocess
import wget
import glob
import tarfile
import librosa
import soundfile as sf
# Download the noise data zip file.
if not os.path.exists(noise_samples + '/rirs_noises.zip'):
    rirs_noises_url = 'https://www.openslr.org/resources/28/rirs_noises.zip'
    rirs_noises_path = wget.download(rirs_noises_url, noise_samples)
    print(f"Dataset downloaded at: {rirs_noises_path}")
else:
    print("Zipfile already exists.")
    rirs_noises_path = noise_samples + '/rirs_noises.zip'
Now we will unzip the .zip file, which gives us the dataset's audio as 8-channel .wav files sampled at 16 kHz. The format and sample rate suit our purposes, but we need to convert these files to mono to match the files in the AN4 dataset. Fortunately, the SoX library provides a tool for this as well.

Note: the conversion will take a few minutes.
from zipfile import ZipFile
if not os.path.exists(noise_samples + '/RIRS_NOISES'):
    try:
        with ZipFile(rirs_noises_path, "r") as zipObj:
            zipObj.extractall(noise_samples)
            print("Extracting noise data complete")
        # Convert 8-channel audio files to mono-channel
        wav_list = glob.glob(noise_samples + '/RIRS_NOISES/**/*.wav', recursive=True)
        for wav_path in wav_list:
            mono_wav_path = wav_path[:-4] + '_mono.wav'
            cmd = f"sox {wav_path} {mono_wav_path} remix 1"
            subprocess.call(cmd, shell=True)
        print("Finished converting the 8-channel noise data .wav files to mono-channel")
    except Exception:
        print("Not extracting. Extracted noise data might already exist.")
else:
    print("Extracted noise data already exists. Proceed to the next step.")
# Let's create the following list of noise samples to better showcase the effect of SNR in synthesizing noisy audio files.
noise_sample_list = [
'noise_samples/RIRS_NOISES/pointsource_noises/noise-free-sound-0057_mono.wav',
'noise_samples/RIRS_NOISES/pointsource_noises/noise-free-sound-0113_mono.wav',
'noise_samples/RIRS_NOISES/pointsource_noises/noise-free-sound-0232_mono.wav',
'noise_samples/RIRS_NOISES/pointsource_noises/noise-free-sound-0532_mono.wav',
'noise_samples/RIRS_NOISES/pointsource_noises/noise-free-sound-0533_mono.wav',
'noise_samples/RIRS_NOISES/pointsource_noises/noise-free-sound-0603_mono.wav',
'noise_samples/RIRS_NOISES/pointsource_noises/noise-free-sound-0605_mono.wav',
]
# This is where the clean audio files will be placed.
clean_audio = 'clean_audio'
if not os.path.exists(clean_audio):
    os.makedirs(clean_audio)
Downloading and processing the AN4 dataset

AN4 is a small dataset recorded and distributed by Carnegie Mellon University (CMU). It consists of recordings of people spelling out addresses, names, and so on. Information about this dataset can be found on the official CMU website.

Let's download the AN4 dataset tar file.
# Download and untar the clean audio file.
if not os.path.exists(clean_audio + '/an4_sphere.tar.gz'):
    an4_url = 'https://dldata-public.s3.us-east-2.amazonaws.com/an4_sphere.tar.gz'  # for the original source, please visit http://www.speech.cs.cmu.edu/databases/an4/an4_sphere.tar.gz
    an4_path = wget.download(an4_url, clean_audio)
    print(f"Dataset downloaded at: {an4_path}")
else:
    print("Tarfile already exists.")
    an4_path = clean_audio + '/an4_sphere.tar.gz'
# Untar and convert `.sph` to `.wav` (using SoX).
tar = tarfile.open(an4_path)
tar.extractall(path=clean_audio)
print("Converting .sph to .wav...")
sph_list = glob.glob(clean_audio + '/an4/**/*.sph', recursive=True)
for sph_path in sph_list:
    wav_path = sph_path[:-4] + '.wav'
    cmd = ["sox", sph_path, wav_path]
    subprocess.run(cmd)
print("Finished conversion.\n******")
clean_audio_list = [os.path.join(dp, f) for dp, dn, filenames in os.walk(clean_audio) for f in filenames if os.path.splitext(f)[1] == '.wav']
# Create a directory where we put the synthesized noisy files.
noisy_files = "noisy_files"
if not os.path.exists(noisy_files):
    os.makedirs(noisy_files)
# Function to read audio, optionally normalizing it to -25 dBFS.
def audioread(path, norm=True, start=0, stop=None):
    path = os.path.abspath(path)
    if not os.path.exists(path):
        raise ValueError("[{}] does not exist!".format(path))
    try:
        x, sr = sf.read(path, start=start, stop=stop)
    except RuntimeError:  # fix for sph pcm-embedded shortened v2
        print('WARNING: Audio type not supported')
        raise
    if len(x.shape) == 1:  # mono
        if norm:
            rms = (x ** 2).mean() ** 0.5
            if rms == 0:
                rms = 1
            scalar = 10 ** (-25 / 20) / rms
            x = x * scalar
        return x, sr
    else:  # multi-channel: average the channels down to a single channel
        x = x.T
        x = x.sum(axis=0) / x.shape[0]
        if norm:
            rms = (x ** 2).mean() ** 0.5
            if rms == 0:
                rms = 1
            scalar = 10 ** (-25 / 20) / rms
            x = x * scalar
        return x, sr
import numpy as np

eps = np.finfo(float).eps  # small constant to avoid division by zero

# Function to write audio, optionally normalizing it to -25 dBFS first.
def audiowrite(data, fs, destpath, norm=False):
    if norm:
        rms = (data ** 2).mean() ** 0.5
        scalar = 10 ** (-25 / 20) / (rms + eps)
        data = data * scalar
        if max(abs(data)) >= 1:
            data = data / (max(abs(data)) + eps)
    destpath = os.path.abspath(destpath)
    destdir = os.path.dirname(destpath)
    if not os.path.exists(destdir):
        os.makedirs(destdir)
    sf.write(destpath, data, fs)
    return
# Function to mix a clean speech clip with a noise sample at a specified SNR level.
def snr_mixer(clean, noise, snr):
    # Normalizing to -25 dB FS
    rmsclean = (clean ** 2).mean() ** 0.5
    if rmsclean == 0:
        rmsclean = 1
    scalarclean = 10 ** (-25 / 20) / rmsclean
    clean = clean * scalarclean
    rmsclean = (clean ** 2).mean() ** 0.5

    rmsnoise = (noise ** 2).mean() ** 0.5
    if rmsnoise == 0:
        rmsnoise = 1
    scalarnoise = 10 ** (-25 / 20) / rmsnoise
    noise = noise * scalarnoise
    rmsnoise = (noise ** 2).mean() ** 0.5
    if rmsnoise == 0:
        rmsnoise = 1

    # Set the noise level for the given SNR
    noisescalar = np.sqrt(rmsclean / (10 ** (snr / 20)) / rmsnoise)
    noisenewlevel = noise * noisescalar
    noisyspeech = clean + noisenewlevel
    return clean, noisenewlevel, noisyspeech
# Pad a noise sample (by repeatedly appending 0.5 s of silence followed by the noise again)
# until it is at least as long as the clean audio, then truncate it to that length.
def concatenate_noise_sample(noise, fs, len_clean):
    silence_length = 0.5
    while len(noise) <= len_clean:
        noiseconcat = np.append(noise, np.zeros(int(fs * silence_length)))
        noise = np.append(noiseconcat, noise)
    if noise.size > len_clean:
        noise = noise[0:len_clean]
    return noise
# Let's randomly select one clean audio and one noise sample.
import random
import numpy as np
c_size = len(clean_audio_list)-1
n_size = len(noise_sample_list)-1
idx_c = random.randint(0, c_size)
idx_n = random.randint(0, n_size)
# Now, let's mix the selected clean audio and noise sample at 0 dB SNR.
SNR = 0
clean_f_name = clean_audio_list[idx_c]
noise_sample_f_name = noise_sample_list[idx_n]
clean, fs = audioread(clean_f_name)
noise, n_fs = audioread(noise_sample_f_name)
if len(noise) > len(clean):
    noise = noise[0:len(clean)]
elif len(noise) < len(clean):
    noise = concatenate_noise_sample(noise, n_fs, clean.size)
file_name = os.path.basename(clean_f_name)
noisy_f_name = noisy_files + "/" + file_name[:-4] + "_0dB_snr.wav"
clean_snr, noise_snr, noisy_snr = snr_mixer(clean=clean, noise=noise, snr=SNR)
audiowrite(noisy_snr, fs, noisy_f_name, norm=False)
print("Finished creating noisy file.\n******")
# Now, let's play back the clean audio.
from IPython.display import Audio, display
display(Audio(clean_f_name, autoplay=True))
# Play the corresponding noisy file.
noisy_file = "noisy_files/" + os.path.basename(noisy_f_name)
display(Audio(noisy_f_name, autoplay=True))
# Let's mix the files at 15 dB SNR.
SNR = 15
noisy_f_name = noisy_files + "/" + file_name[:-4] + "_15dB_snr.wav"
clean_snr, noise_snr, noisy_snr = snr_mixer(clean=clean, noise=noise, snr=SNR)
audiowrite(noisy_snr, fs, noisy_f_name, norm=False)
print("Finished creating noisy file.\n******")
# Play the corresponding noisy file
noisy_file = "noisy_files/" + os.path.basename(noisy_f_name)
display(Audio(noisy_f_name, autoplay=True))
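If you want to inspect the mixture visually rather than just listen to it, here is an optional sketch that overlays the clean signal and the 15 dB mixture returned by snr_mixer. It assumes Matplotlib is installed in the environment (the setup notes above already mention it):
import matplotlib.pyplot as plt

# Overlay the clean signal and the 15 dB SNR mixture.
plt.figure(figsize=(12, 4))
plt.plot(noisy_snr, label='noisy (15 dB SNR)', alpha=0.6)
plt.plot(clean_snr, label='clean', alpha=0.6)
plt.xlabel('Sample index')
plt.ylabel('Amplitude')
plt.legend()
plt.title('Clean vs. noisy waveform')
plt.show()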
In general, you can create as many noisy files as you like with the following procedure (a sketch of the loop is shown below). For each clean audio clip in the clean dataset:
1. Randomly select a noise sample from the set of noise samples.
2. Randomly select an SNR from [5, 10, 15].
3. Mix the two clips at the chosen SNR.
4. Save the result.
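Below is a minimal sketch of that loop, reusing the helpers and file lists defined above. The output directory name and the choice to write exactly one noisy copy per clean clip are illustrative assumptions, not part of the original recipe:
# Synthesize one noisy copy of every clean clip at a randomly chosen SNR.
snr_choices = [5, 10, 15]
output_dir = "noisy_dataset"  # illustrative output directory
os.makedirs(output_dir, exist_ok=True)
for clean_path in clean_audio_list:
    clean, fs = audioread(clean_path)
    noise_path = random.choice(noise_sample_list)
    noise, n_fs = audioread(noise_path)
    # Match the noise length to the clean clip.
    if len(noise) > len(clean):
        noise = noise[0:len(clean)]
    elif len(noise) < len(clean):
        noise = concatenate_noise_sample(noise, n_fs, clean.size)
    # Mix at the chosen SNR and save the result.
    snr = random.choice(snr_choices)
    _, _, noisy = snr_mixer(clean=clean, noise=noise, snr=snr)
    out_name = os.path.basename(clean_path)[:-4] + f"_{snr}dB_snr.wav"
    audiowrite(noisy, fs, os.path.join(output_dir, out_name), norm=False)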