赞
踩
婴儿啼哭声是婴儿沟通需求的重要信号,对于父母和护理者而言至关重要。本项目基于PaddleSpeech框架,致力于构建婴儿啼哭识别系统,通过深度学习将啼哭声翻译成成人语言,帮助理解婴儿的需求和状态。

婴儿啼哭声是一种生物报警器,传递婴儿的生理和心理需求。有效地识别啼哭声有助于提高婴儿护理的效率和质量。
项目使用六类人工添加噪声的婴儿哭声作为训练数据集,每一类对应一种婴儿需求:苏醒、换尿布、要抱抱、饥饿、困乏、不舒服。噪声数据来自 NOISEX-92 标准噪声数据库。
安装PaddleSpeech和PaddleAudio,确保环境准备就绪。
# Upgrade pip itself (user-level install), then install or upgrade
# PaddleSpeech and PaddleAudio; -q keeps the installer output quiet.
!python -m pip install -q -U pip --user
!pip install paddlespeech paddleaudio -U -q
解压缩训练数据集,获取音频文件。
# Extract the training archive (-q quiet, -o overwrite existing files
# without prompting).
# NOTE(review): -a asks unzip to convert line endings of files it detects as
# text; it is not needed for audio data — confirm it is intentional.
!unzip -qoa data/data41960/dddd.zip
通过可视化展示音频波形,了解样本数据的特征。
# Load one training clip as a mono float32 waveform, print its shape and
# sample rate, and plot the raw samples.
from paddleaudio import load
data, sr = load(file='train/awake/awake_0.wav', mono=True, dtype='float32')
print('wav shape: {}'.format(data.shape))
print('sample rate: {}'.format(sr))
# NOTE(review): `plt` is not imported in the visible code — presumably
# `matplotlib.pyplot`; confirm it is imported in an earlier cell.
plt.figure()
plt.plot(data)
plt.show()
统一音频文件长度,确保训练数据格式一致。下面先查看单个音频文件的声道数、时长和采样率。
# Inspect basic properties of one audio file: channel count, duration in
# seconds, and sample rate.
import soundfile as sf
import numpy as np
import librosa

data, samplerate = sf.read('hungry_0.wav')
# sf.read returns a 1-D array for mono audio and a 2-D (frames, channels)
# array otherwise, so the channel count is the size of the second axis, not
# the number of array dimensions.
channels = 1 if data.ndim == 1 else data.shape[1]
# len(data) is the number of frames regardless of the channel count.
length_s = len(data) / float(samplerate)
# Presumably the target sample rate for later resampling; it is not used in
# this snippet.
format_rate = 16000
print(f"channels: {channels}")
print(f"length_s: {length_s}")
print(f"samplerate: {samplerate}")
创建自定义数据集类,包含六类婴儿需求的音频文件。
class CustomDataset(AudioClassificationDataset):
    """Infant-cry dataset backed by one sub-directory of WAV files per class.

    The integer label of each clip is the index of its class name in
    ``label_list``. Clips are passed to the base class with
    ``feat_type='raw'``.
    """

    # All class labels; a label's position in this list is its integer id.
    label_list = [
        'awake',
        'diaper',
        'hug',
        'hungry',
        'sleepy',
        'uncomfortable',
    ]
    # Root directory that is expected to hold one sub-directory per label.
    train_data_dir = './train/'

    def __init__(self, **kwargs):
        """Collect the files and labels, then initialise the base dataset.

        Args:
            **kwargs: Forwarded unchanged to ``AudioClassificationDataset``.
        """
        files, labels = self._get_data()
        super().__init__(
            files=files, labels=labels, feat_type='raw', **kwargs)

    def _get_data(self):
        """Return the audio file paths and their integer labels.

        Returns:
            A ``(files, labels)`` tuple of equal-length lists: ``files[i]``
            is the path of a ``.wav`` file and ``labels[i]`` is the index of
            its class in ``label_list``.

        Raises:
            OSError: If a class sub-directory cannot be listed.
        """
        files = []
        labels = []
        for label_id, label_name in enumerate(self.label_list):
            class_dir = os.path.join(self.train_data_dir, label_name)
            # Sort for a reproducible order (os.listdir order is arbitrary),
            # and match the extension instead of any name that merely
            # contains the substring 'wav'.
            for file_name in sorted(os.listdir(class_dir)):
                if file_name.endswith('.wav'):
                    files.append(os.path.join(class_dir, file_name))
                    labels.append(label_id)
        return files, labels
选取预训练模型作为特征提取器,构建分类模型进行模型训练。
# Use CNN14 as the backbone that extracts audio embeddings.
from paddlespeech.cls.models import cnn14

backbone = cnn14(pretrained=True, extract_embedding=True)


# Classification model built on top of the backbone.
class SoundClassifier(nn.Layer):
    """Backbone followed by dropout and a linear layer that emits class logits."""

    def __init__(self, backbone, num_class, dropout=0.1):
        """Store the backbone and create the dropout and linear layers.

        Args:
            backbone: Feature extractor; must expose an ``emb_size`` attribute.
            num_class: Number of output classes.
            dropout: Dropout probability applied to the backbone output.
        """
        super().__init__()
        self.backbone = backbone
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(self.backbone.emb_size, num_class)

    def forward(self, x):
        """Return the class logits for the input features ``x``."""
        # The backbone is fed the input with a singleton axis inserted at
        # position 1.
        embeddings = self.backbone(x.unsqueeze(1))
        return self.fc(self.dropout(embeddings))


model = SoundClassifier(backbone, num_class=len(train_ds.label_list))
定义优化器和损失函数,进行模型训练。
# Optimizer and loss function.
optimizer = paddle.optimizer.Adam(learning_rate=1e-4, parameters=model.parameters())
criterion = paddle.nn.loss.CrossEntropyLoss()

# Model training: running loss and accuracy are logged every `log_freq` steps
# and then reset.
epochs = 20
steps_per_epoch = len(train_loader)
log_freq = 10
eval_freq = 10  # not used in this snippet
for epoch in range(1, epochs + 1):
    model.train()

    avg_loss = 0
    num_corrects = 0
    num_samples = 0
    for batch_idx, batch in enumerate(train_loader):
        waveforms, labels = batch
        feats = feature_extractor(waveforms)
        # Swap the last two axes of the extracted features before the model.
        feats = paddle.transpose(feats, [0, 2, 1])
        logits = model(feats)

        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        # Advance the learning-rate schedule only when one is attached.
        if isinstance(optimizer._learning_rate, paddle.optimizer.lr.LRScheduler):
            optimizer._learning_rate.step()
        optimizer.clear_grad()

        # Accumulate the loss. item() works whether the loss tensor is 0-D or
        # has shape [1]; indexing loss.numpy()[0] fails on a 0-D tensor.
        avg_loss += loss.item()

        # Accumulate the accuracy counters.
        preds = paddle.argmax(logits, axis=1)
        num_corrects += (preds == labels).numpy().sum()
        num_samples += feats.shape[0]

        if (batch_idx + 1) % log_freq == 0:
            lr = optimizer.get_lr()
            avg_loss /= log_freq
            avg_acc = num_corrects / num_samples

            print_msg = 'Epoch={}/{}, Step={}/{}'.format(
                epoch, epochs, batch_idx + 1, steps_per_epoch)
            print_msg += ' loss={:.4f}'.format(avg_loss)
            print_msg += ' acc={:.4f}'.format(avg_acc)
            print_msg += ' lr={:.6f}'.format(lr)
            logger.train(print_msg)

            # Start a fresh logging window.
            avg_loss = 0
            num_corrects = 0
            num_samples = 0
通过模型对测试音频进行推理,输出概率最高的前 top_k(此处为 3)类婴儿需求及其概率。
# Model test: classify one clip and print the top_k most probable classes.
top_k = 3
wav_file = 'test/test_0.wav'
n_fft = 1024
win_length = 1024
hop_length = 320
f_min = 50.0
# NOTE(review): f_max normally must not exceed sr / 2 — confirm against the
# actual sample rate.
f_max = 16000.0

# Reuses the sample rate `sr` returned by the earlier load() call.
waveform, sr = load(wav_file, sr=sr)
feature_extractor = LogMelSpectrogram(
    sr=sr,
    n_fft=n_fft,
    hop_length=hop_length,
    win_length=win_length,
    window='hann',
    f_min=f_min,
    f_max=f_max,
    n_mels=64)

# Evaluation mode disables dropout so the prediction is deterministic, and
# no_grad() skips gradient bookkeeping that inference does not need.
model.eval()
with paddle.no_grad():
    # unsqueeze(0) adds a leading axis of size 1 for the single clip.
    feats = feature_extractor(paddle.to_tensor(waveform).unsqueeze(0))
    feats = paddle.transpose(feats, [0, 2, 1])
    logits = model(feats)
    probs = nn.functional.softmax(logits, axis=1).numpy()

# argsort is ascending, so the slice below walks the last top_k entries in
# reverse, i.e. from the most probable class downwards.
sorted_indices = probs[0].argsort()

msg = f'[{wav_file}]\n'
for idx in sorted_indices[-1:-top_k-1:-1]:
    msg += f'{train_ds.label_list[idx]}: {probs[0][idx]:.5f}\n'
print(msg)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。