关注 码龄 粉丝数 原力等级 -- 被采纳 被点赞 采纳率 f5520916 2024-06-04 12:43
采纳率: 40%
浏览 10 首页/
编程语言
/ python自动语音识别 python tensorflow keras 基于深度学习方法构建和训练一个基本的自动语音识别 (ASR) 模型来识别八个不同的单词。数据集将使用 Speech Commands 数据集(Warden,2018 年)的一部分,其中包含命令的短(一秒或更短)音频片段,例如“down”、“go”、“left”、“no”、“right”、“stop”、“up”和“yes”。要求实现:1. 绘制波形图和频谱图:从数据集中任选 9 条不同的语音文件,分别绘制其波形图和频谱图。2. 特征提取:提取 MFCC 特征。3. 基于深度学习的语音识别:创建神经网络模型、训练模型并绘制训练和验证损失曲线、评估模型性能。
代码用的是 https://tensorflow.google.cn/tutorials/audio/simple_audio?hl=zh-cn#%E5%B0%86%E6%B3%A2%E5%BD%A2%E8%BD%AC%E6%8D%A2%E4%B8%BA%E9%A2%91%E8%B0%B1%E5%9B%BE 文章里面的,按照自己的要求改了一些。在添加 MFCC 特征提取的时候出现错误:ValueError: The padded shape () is not compatible with the shape (None,) of the corresponding input component. (SOS)请问是哪个地方需要修改?一直改不过来。
import os
import pathlib
import random
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import models
from IPython import display
# Load the training data.
DATASET_PATH = 'D:/noice/Text/mini_speech_commands'
words = ["down", "go", "left", "no", "right", "stop", "up", "yes"]
data_dir = pathlib.Path(DATASET_PATH)

# Every directory entry counts as a command, except the README.md and
# .DS_Store entries, which are dropped.
commands = np.array(tf.io.gfile.listdir(str(data_dir)))
ignored_entries = (commands == 'README.md') | (commands == '.DS_Store')
commands = commands[~ignored_entries]
print('Commands:', commands)

# subset='both' makes the loader return the training and validation
# datasets together; 20% of the files are held out for validation.
train_ds, val_ds = tf.keras.utils.audio_dataset_from_directory(
    directory=data_dir,
    batch_size=64,
    output_sequence_length=16000,
    validation_split=0.2,
    subset='both',
    seed=0,
)

label_names = np.array(train_ds.class_names)
print()
print("label names:", label_names)

# Bare expression: the element spec is evaluated but not used
# (presumably a leftover from a notebook cell).
train_ds.element_spec
def squeeze(audio, labels):
    """Drop the last axis of `audio` and pass `labels` through unchanged.

    Assumes the last axis of `audio` has size 1 (presumably a mono
    channel axis); `tf.squeeze` rejects any other size.
    """
    return tf.squeeze(audio, axis=-1), labels
# Remove the trailing axis from the audio tensors of both splits.
train_ds = train_ds.map(squeeze, num_parallel_calls=tf.data.AUTOTUNE)
val_ds = val_ds.map(squeeze, num_parallel_calls=tf.data.AUTOTUNE)

# Split the held-out data into two shards: shard 0 becomes the test set,
# shard 1 stays as the validation set.
held_out_ds = val_ds
test_ds = held_out_ds.shard(num_shards=2, index=0)
val_ds = held_out_ds.shard(num_shards=2, index=1)

# Look at one training element to check the tensor shapes.
for example_audio, example_labels in train_ds.take(1):
    print(example_audio.shape)
    print(example_labels.shape)

# Bare expression: looks up a few label names without using the result
# (presumably a leftover from a notebook cell).
label_names[[1, 1, 3, 0]]
# Helpers for plotting waveforms and spectrograms.
def get_spectrogram(waveform):
    """Return the STFT magnitude of `waveform` with a trailing axis added.

    The short-time Fourier transform uses a frame length of 255 and a
    frame step of 128. The complex result is reduced to its magnitude,
    and a new last axis of size 1 is appended.
    """
    stft = tf.signal.stft(waveform, frame_length=255, frame_step=128)
    magnitude = tf.abs(stft)
    return tf.expand_dims(magnitude, axis=-1)
def plot_spectrogram(spectrogram, ax):
    """Draw the log of a spectrogram on `ax` with `pcolormesh`.

    Args:
        spectrogram: Array of rank 2, or rank 3 whose last axis has
            size 1 (that axis is squeezed away). It is transposed before
            plotting, so its first axis runs along the x axis.
        ax: Object providing ``pcolormesh(X, Y, C)``, e.g. a matplotlib
            Axes.

    Raises:
        ValueError: If `spectrogram` is not rank 2 or 3, or if a rank-3
            input has a last axis whose size is not 1 (from `np.squeeze`).
    """
    rank = len(spectrogram.shape)
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if rank not in (2, 3):
        raise ValueError(
            f"spectrogram must have rank 2 or 3, got shape {tuple(spectrogram.shape)}"
        )
    if rank == 3:
        spectrogram = np.squeeze(spectrogram, axis=-1)

    # Add machine epsilon so that zero magnitudes do not produce log(0).
    log_spec = np.log(spectrogram.T + np.finfo(float).eps)
    height, width = log_spec.shape

    # The x coordinates span 0 .. total element count of the spectrogram.
    # NOTE(review): presumably meant to approximate the sample index for
    # the STFT settings used in this file -- confirm.
    X = np.linspace(0, np.size(spectrogram), num=width, dtype=int)
    Y = range(height)
    ax.pcolormesh(X, Y, log_spec)
# Randomly pick 9 audio clips: unbatch to single examples, shuffle, take 9.
random_examples = list(
    train_ds.unbatch().shuffle(buffer_size=10000).take(9)
)

# Plot the waveforms on a 3x3 grid.
rows, cols = 3, 3
n = rows * cols
plt.figure(figsize=(16, 10))
for i in range(n):
    plt.subplot(rows, cols, i + 1)
    audio_signal, label = random_examples[i]
    plt.plot(audio_signal)
    plt.title(label_names[label])
    # Same y range and ticks on every panel.
    plt.yticks(np.arange(-1.2, 1.2, 0.2))
    plt.ylim([-1.1, 1.1])
plt.show()

# Plot the spectrograms of the same clips on a 3x3 grid.
fig, axes = plt.subplots(3, 3, figsize=(16, 10))
for i in range(9):
    r, c = divmod(i, cols)
    ax = axes[r][c]
    waveform, label = random_examples[i]
    spectrogram = get_spectrogram(waveform)
    plot_spectrogram(spectrogram.numpy(), ax)
    ax.set_title(label_names[label])
plt.show()
# MFCC feature extraction.
def get_mfcc(waveform, sample_rate=16000, n_mfcc=13):
    """Return the result of `librosa.feature.mfcc` for `waveform`.

    Args:
        waveform: Audio samples, passed to librosa as ``y``.
        sample_rate: Sampling rate, passed to librosa as ``sr``.
        n_mfcc: Number of MFCC coefficients to compute.
    """
    return librosa.feature.mfcc(y=waveform, sr=sample_rate, n_mfcc=n_mfcc)
# Plot the MFCC features of the selected clips on a 3x3 grid.
fig, axes = plt.subplots(3, 3, figsize=(16, 10))
for i in range(9):
    r, c = divmod(i, cols)
    ax = axes[r][c]
    waveform, label = random_examples[i]
    # librosa works on NumPy arrays, so convert the tensor first.
    mfcc = get_mfcc(waveform.numpy())
    librosa.display.specshow(mfcc, sr=16000, ax=ax, x_axis='time')
    ax.set_title(label_names[label])
plt.show()
# Model-training pipeline: convert waveform datasets into MFCC datasets.
def make_spec_ds(ds, n_mfcc=13):
    """Map a dataset of ``(waveform, label)`` to ``(mfcc, label)`` examples.

    The returned dataset always holds single examples: each MFCC has
    static shape ``(n_mfcc, None, 1)`` and each label keeps its
    per-example shape. Batch it afterwards (e.g. with `padded_batch`).

    Args:
        ds: Dataset whose elements are ``(waveform, label)`` tuples. It
            may hold single waveforms (rank 1) or batches of waveforms
            (rank 2 or more); batches are split into single examples.
        n_mfcc: Number of MFCC coefficients per frame.

    Returns:
        Dataset of ``(mfcc, label)`` pairs, one pair per audio clip.
    """
    # A batched input would hand whole batches to the MFCC function and
    # leave labels with shape (None,). That is what made a later
    # `padded_batch(..., padded_shapes=(..., []))` fail with
    # "The padded shape () is not compatible with the shape (None,)".
    waveform_rank = ds.element_spec[0].shape.rank
    if waveform_rank is not None and waveform_rank > 1:
        ds = ds.unbatch()

    def mfcc_as_float32(waveform):
        # tf.numpy_function requires the result dtype to match the
        # declared output type exactly, so force float32.
        return get_mfcc(waveform, n_mfcc=n_mfcc).astype(np.float32)

    def get_mfcc_wrapper(waveform, label):
        mfcc = tf.numpy_function(mfcc_as_float32, [waveform], tf.float32)
        # numpy_function loses static shape information; restore it and
        # leave the time axis variable.
        mfcc.set_shape([n_mfcc, None])
        # Add a trailing axis of size 1 (used as the Conv2D channel axis).
        mfcc = tf.expand_dims(mfcc, -1)
        return mfcc, label

    return ds.map(get_mfcc_wrapper, num_parallel_calls=tf.data.AUTOTUNE)
# Build the MFCC version of the training, validation and test datasets.
train_mfcc_ds, val_mfcc_ds, test_mfcc_ds = (
    make_spec_ds(split) for split in (train_ds, val_ds, test_ds)
)
# Batch the MFCC features, padding the variable-length time axis.
def preprocess_dataset(ds, batch_size=64):
    """Return `ds` grouped into padded batches of `batch_size` elements.

    Features are padded to shape ``[13, None, 1]`` with 0.0; the ``None``
    axis is left for `padded_batch` to size per batch. Labels are declared
    with shape ``[]`` (scalars) and a padding value of -1.
    """
    feature_shape, label_shape = [13, None, 1], []
    feature_pad, label_pad = 0.0, -1
    return ds.padded_batch(
        batch_size,
        padded_shapes=(feature_shape, label_shape),
        padding_values=(feature_pad, label_pad),
    )
# Batch all three MFCC datasets with the same padding settings.
train_mfcc_ds, val_mfcc_ds, test_mfcc_ds = (
    preprocess_dataset(split)
    for split in (train_mfcc_ds, val_mfcc_ds, test_mfcc_ds)
)

# Check the shape of one batch of MFCC features; the batch is kept in
# `example_mfccs` for later use.
for example_mfccs, example_mfcc_labels in train_mfcc_ds.take(1):
    print(example_mfccs.shape)
# Model definition.
# The per-example input shape is the example batch shape without its
# leading (batch) axis.
input_shape = example_mfccs.shape[1:]
print('Input shape:', input_shape)
num_labels = len(label_names)

# Fit the normalization layer on the training features only (labels dropped).
norm_layer = layers.Normalization()
norm_layer.adapt(data=train_mfcc_ds.map(lambda spec, _label: spec))

model = tf.keras.Sequential([
    layers.Input(shape=input_shape),
    # Normalize.
    norm_layer,
    # Convolutional feature extractor.
    layers.Conv2D(filters=32, kernel_size=3, activation='relu'),
    layers.Conv2D(filters=64, kernel_size=3, activation='relu'),
    layers.MaxPooling2D(),
    layers.Dropout(rate=0.25),
    # Classifier head; the softmax yields one probability per label.
    layers.Flatten(),
    layers.Dense(units=128, activation='relu'),
    layers.Dropout(rate=0.5),
    layers.Dense(units=num_labels, activation='softmax'),
])
model.summary()
# Compile the model with the Adam optimizer.
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    # The model's final Dense layer already applies softmax, so the loss
    # receives probabilities. from_logits=True would make the loss apply
    # softmax a second time and distort the loss and gradients.
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
    metrics=['accuracy'],
)
# Train for at most 10 epochs, with early stopping (patience=2).
EPOCHS = 10
early_stopping = tf.keras.callbacks.EarlyStopping(verbose=1, patience=2)
history = model.fit(
    train_mfcc_ds,
    validation_data=val_mfcc_ds,
    epochs=EPOCHS,
    callbacks=early_stopping,
)
# Plot the training and validation curves.
metrics = history.history
plt.figure(figsize=(16, 6))

# Left panel: loss curves.
plt.subplot(1, 2, 1)
for key in ('loss', 'val_loss'):
    plt.plot(history.epoch, metrics[key], label=key)
plt.legend()
# Scale the y axis from 0 up to the largest loss seen in either curve.
loss_ceiling = max(max(metrics['loss']), max(metrics['val_loss']))
plt.ylim([0, loss_ceiling])
plt.xlabel('Epoch')
plt.ylabel('Loss [CrossEntropy]')

# Right panel: accuracy curves, shown as percentages.
plt.subplot(1, 2, 2)
for key in ('accuracy', 'val_accuracy'):
    plt.plot(history.epoch, 100 * np.array(metrics[key]), label=key)
plt.legend()
plt.ylim([0, 100])
plt.xlabel('Epoch')
plt.ylabel('Accuracy [%]')
plt.show()

# Evaluate the model on the test set.
model.evaluate(test_mfcc_ds, return_dict=True)
展开全部
收起
写回答
好问题
0 提建议
追加酬金
关注问题
微信扫一扫 点击复制链接 分享 邀请回答
编辑 收藏 删除 结题 收藏 举报 追加酬金 (90%的用户在追加酬金后获得了解决方案) 当前问题酬金 ¥ 0
(可追加 ¥500)
¥ 15¥ 20¥ 50¥ 100¥ 200 支付方式 扫码支付
二维码出错
点击刷新
支付金额
15 元
提供问题酬金的用户不参与问题酬金结算和分配
支付即为同意
《付费问题酬金结算规则》 结题 再想想 删除 再等等