目录
本项目实现了一个实时流式语音识别系统,利用百度语音识别服务和WebSocket协议,实现从麦克风捕获音频数据并实时进行语音识别。该系统适用于需要将实时语音转换为文本的应用场景。

首先,确保安装必要的库:
```bash
pip install websocket-client pyaudio
```
运行程序后,会先列出可用的音频输入设备,按提示输入设备编号,即可通过麦克风捕获音频:
```bash
python realtime_asr.py
```
在const.py文件中,需要配置以下参数:
```python
URI = "your_baidu_asr_service_uri"
APPID = "your_appid"
APPKEY = "your_appkey"
DEV_PID = 1537 # 选择合适的识别模型
```
完整的 Python 代码(realtime_asr.py)如下:
import json
import logging
import threading
import time
import uuid

import pyaudio
import websocket

import const

# Root logger; its format and level are set in the __main__ section.
logger = logging.getLogger()
# Audio input configuration.
CHUNK = 1024  # passed to p.open() as frames_per_buffer
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000  # send_start_params() declares the same value as "sample"

p = pyaudio.PyAudio()

# List every device of host API 0 that reports at least one input channel.
info = p.get_host_api_info_by_index(0)
numdevices = info.get('deviceCount')
for i in range(0, numdevices):
    # Look the device up once and reuse the result for the check and the print.
    device_info = p.get_device_info_by_host_api_device_index(0, i)
    if device_info.get('maxInputChannels') > 0:
        print("Input Device id ", i, " - ", device_info.get('name'))

# Let the user pick the capture device from the ids printed above.
device_index = int(input("Select device index: "))
def send_start_params(ws):
    """Send the START frame as a JSON text message.

    The frame's "data" object carries the credentials and model id read
    from ``const`` plus the audio description (sample 16000, format "pcm").

    Args:
        ws: connection object; only ``ws.send(payload, opcode)`` is used.
    """
    start_frame = {"type": "START"}
    start_frame["data"] = {
        "appid": const.APPID,
        "appkey": const.APPKEY,
        "dev_pid": const.DEV_PID,
        "cuid": "yourself_defined_user_id",
        "sample": 16000,
        "format": "pcm"
    }
    body = json.dumps(start_frame)
    ws.send(body, websocket.ABNF.OPCODE_TEXT)
    # NOTE: body includes the appkey, so the credential ends up in the log.
    logger.info("send START frame with params:" + body)
def send_audio(ws):
    """Capture audio from the selected input device and send it over ``ws``.

    Opens a callback-mode input stream on the module-level PyAudio instance
    ``p`` using ``device_index``, forwards every captured buffer as a binary
    frame, and blocks until the stream is no longer active. The stream is
    always stopped and closed before returning, even if waiting fails.

    Args:
        ws: connection object; ``ws.send(payload, opcode)`` is called from
            the audio callback for each captured buffer.
    """
    def callback(in_data, frame_count, time_info, status):
        try:
            ws.send(in_data, websocket.ABNF.OPCODE_BINARY)
        except (websocket.WebSocketException, OSError) as exc:
            # The connection can no longer carry audio: finish the stream
            # so the wait loop below ends instead of raising out of the
            # audio callback.
            logger.error("stop capturing, failed to send audio: " + str(exc))
            return (None, pyaudio.paComplete)
        return (in_data, pyaudio.paContinue)

    stream = p.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=RATE,
                    input=True,
                    input_device_index=device_index,
                    frames_per_buffer=CHUNK,
                    stream_callback=callback)
    try:
        stream.start_stream()
        # The callback does the work; poll until the stream goes inactive.
        while stream.is_active():
            time.sleep(0.1)
    finally:
        # Release the audio device on every exit path.
        stream.stop_stream()
        stream.close()
def send_finish(ws):
    """Send a JSON text frame of type "FINISH".

    Called by ``on_open``'s worker after ``send_audio`` has returned.

    Args:
        ws: connection object; only ``ws.send(payload, opcode)`` is used.
    """
    body = json.dumps({"type": "FINISH"})
    ws.send(body, websocket.ABNF.OPCODE_TEXT)
    logger.info("send FINISH frame")
def send_cancel(ws):
    """Send a JSON text frame of type "CANCEL".

    Args:
        ws: connection object; only ``ws.send(payload, opcode)`` is used.
    """
    body = json.dumps({"type": "CANCEL"})
    ws.send(body, websocket.ABNF.OPCODE_TEXT)
    logger.info("send Cancel frame")
def on_open(ws):
    """Kick off the recognition session when the connection opens.

    The START -> audio -> FINISH sequence runs on its own thread, so this
    callback returns as soon as that thread has been started.

    Args:
        ws: the connection object handed to the callback.
    """
    def run(*args):
        # Order matters: announce parameters, stream audio, then finish.
        send_start_params(ws)
        send_audio(ws)
        send_finish(ws)
        logger.debug("thread terminating")

    worker = threading.Thread(target=run)
    worker.start()
def on_message(ws, message):
    """Log a message received on the connection.

    Args:
        ws: the connection object (unused).
        message: received payload; concatenated to a str prefix, so it is
            expected to be a str.
    """
    line = "Response: " + message
    logger.info(line)
def on_error(ws, error):
    """Log an error reported for the connection.

    Args:
        ws: the connection object (unused).
        error: the reported error; converted with ``str`` before logging.
    """
    description = str(error)
    logger.error("error: " + description)
def on_close(ws, close_status_code=None, close_msg=None):
    """Log that the connection has closed.

    websocket-client 1.x invokes this callback with three arguments
    (ws, close_status_code, close_msg); the two extra parameters default to
    None so a single-argument call keeps working as well.

    Args:
        ws: the connection object (unused).
        close_status_code: close status supplied by the client library, if any.
        close_msg: close message supplied by the client library, if any.
    """
    logger.info("ws close ...")
if __name__ == "__main__":
    # Set up log output first so every later step is recorded.
    log_format = '[%(asctime)-15s] [%(funcName)s()][%(levelname)s] %(message)s'
    logging.basicConfig(format=log_format)
    logger.setLevel(logging.DEBUG)
    logger.info("begin")

    # Each run gets its own "sn" query parameter generated with uuid1.
    sn = str(uuid.uuid1())
    uri = const.URI + "?sn=" + sn
    logger.info("uri is "+ uri)

    # Wire the module-level callbacks into the client and run until it exits.
    callbacks = {
        "on_open": on_open,
        "on_message": on_message,
        "on_error": on_error,
        "on_close": on_close,
    }
    ws_app = websocket.WebSocketApp(uri, **callbacks)
    ws_app.run_forever()
本系统实现了从麦克风实时捕获音频并通过WebSocket与百度语音识别服务进行通信,实现实时语音识别的功能。该系统可应用于各种需要实时语音转文字的场景,如实时字幕、语音助手等。