我长期以来一直在努力为呼叫中心用例同时转录实时麦克风输入和扬声器输出音频。这个项目的背景信息和我的挣扎都记录在上一个问题中。
代码:
import asyncio
import os
import threading
import time

import azure.cognitiveservices.speech as speechsdk
from dotenv import load_dotenv
# Load credentials from the .env file and build the shared Speech SDK config.
load_dotenv()
audio_key = os.getenv("audio_key")
audio_region = os.getenv("audio_region")
# Fail fast with a clear message instead of a cryptic SDK error later on.
if not audio_key or not audio_region:
    raise RuntimeError("audio_key and audio_region must be set in the environment or .env file")
speech_config = speechsdk.SpeechConfig(subscription=audio_key, region=audio_region)
speech_config.speech_recognition_language = "en-US"
# Windows endpoint GUIDs for the two capture devices: the physical microphone
# and the "Stereo Mix" loopback device that carries the speaker output.
mic = "{0.0.1.00000000}.{6dd64d0d-e876-4f3f-b1fe-464843289599}"
stereo_mix = "{0.0.1.00000000}.{c4c4d95c-5bd1-4f09-a07e-ad3a96c381f0}"
# One AudioConfig per capture device, as shown in the Azure documentation.
microphone_audio_config = speechsdk.audio.AudioConfig(device_name=mic)
speaker_audio_config = speechsdk.audio.AudioConfig(device_name=stereo_mix)
# Azure Speech-to-Text Conversation Transcriber
def transcribing(evt, name):
    """Print an intermediate (partial) transcription event for the device *name*."""
    message = f"{name} transcribing! {evt}"
    print(message)
def transcribed(evt, name):
    """Print a finalized transcription event for the device *name*."""
    message = f"{name} transcribed! {evt}"
    print(message)
# Function to start Azure speech recognition
def start_recognition(audio_config, speech_config, name):
    """Start a ConversationTranscriber on *audio_config* and block this thread.

    The SDK delivers transcribing/transcribed events on its own callback
    threads, so this thread only needs to stay alive -- it must not consume
    CPU while doing so.
    """
    transcriber = speechsdk.transcription.ConversationTranscriber(
        speech_config=speech_config, audio_config=audio_config)
    transcriber.transcribed.connect(lambda evt: transcribed(evt, name))
    transcriber.transcribing.connect(lambda evt: transcribing(evt, name))
    # start_transcribing_async() returns a ResultFuture; .get() waits for the
    # start to actually complete so any startup error surfaces here.
    transcriber.start_transcribing_async().get()
    print(f"{name} started!")
    # Block indefinitely without burning CPU. The original `while True: pass`
    # busy-wait pegged a core per thread and starved the SDK's own threads,
    # which is a likely cause of the transcription lag.
    threading.Event().wait()
# Individual threads for each transcriber
# Launch one transcription thread per capture device.
for device_config, label in (
    (microphone_audio_config, "Microphone"),
    (speaker_audio_config, "Speaker"),
):
    threading.Thread(
        target=start_recognition,
        args=(device_config, speech_config, label),
    ).start()
这是我的代码的修改版本,但主要思想和我的问题仍然存在。
在正常的对话速度下,转录器会奇怪地远远落后,使这个项目变得毫无用处。即使语速慢得令人痛苦,转录也会失败。
我怀疑这是由于线程造成的,但我无法将问题与线程或 Azure 转录器本身或立体声混音隔离。
如果大家有任何问题请告诉我,我一定会解答。
不要使用可能导致高 CPU 使用率的 `while True` 忙等循环;而是确保转录功能在异步上下文中高效运行。
重构版本:
# Azure Speech-to-Text Conversation Transcriber
def transcribing(evt, name):
    """Print a partial (in-progress) recognition result for the device *name*."""
    partial_text = evt.result.text
    print(f"{name} transcribing: {partial_text}")
def transcribed(evt, name):
    """Print a finalized recognition result for the device *name*."""
    final_text = evt.result.text
    print(f"{name} transcribed: {final_text}")
async def start_recognition(audio_config, speech_config, name, stop_event):
    """Run a ConversationTranscriber on *audio_config* until *stop_event* is set.

    Parameters:
        audio_config: speechsdk.audio.AudioConfig for one capture device.
        speech_config: shared speechsdk.SpeechConfig.
        name: label used in the printed output.
        stop_event: threading.Event shared with the main thread; setting it
            triggers a clean stop.
    """
    transcriber = speechsdk.transcription.ConversationTranscriber(
        speech_config=speech_config, audio_config=audio_config)
    transcriber.transcribed.connect(lambda evt: transcribed(evt, name))
    transcriber.transcribing.connect(lambda evt: transcribing(evt, name))
    # The Speech SDK's *_async() methods return a ResultFuture exposing a
    # blocking .get() -- they are NOT asyncio awaitables, so awaiting them
    # directly raises TypeError. Run the blocking .get() in the default
    # executor to keep the event loop responsive.
    loop = asyncio.get_event_loop()
    await loop.run_in_executor(None, transcriber.start_transcribing_async().get)
    print(f"{name} started!")
    try:
        # Poll the shared stop flag without blocking the loop or burning CPU.
        while not stop_event.is_set():
            await asyncio.sleep(0.1)
    finally:
        # Stop even if the polling loop exits via an exception/cancellation.
        await loop.run_in_executor(None, transcriber.stop_transcribing_async().get)
        print(f"{name} stopped!")
def run_recognition_thread(audio_config, speech_config, name, stop_event):
    """Thread entry point: run start_recognition() on a private asyncio loop.

    Each worker thread owns its own event loop because asyncio loops are not
    shareable across threads.
    """
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        loop.run_until_complete(
            start_recognition(audio_config, speech_config, name, stop_event))
    finally:
        # Close the loop even when transcription fails, so its resources
        # (selector, self-pipe) are released.
        loop.close()
# Event used to signal both transcriber coroutines to shut down.
stop_event = threading.Event()
# One worker thread (each running its own asyncio loop) per capture device.
microphone_thread = threading.Thread(target=run_recognition_thread, args=(microphone_audio_config, speech_config, "Microphone", stop_event))
speaker_thread = threading.Thread(target=run_recognition_thread, args=(speaker_audio_config, speech_config, "Speaker", stop_event))
# Start threads
microphone_thread.start()
speaker_thread.start()
try:
    # Poll once per second; exit early if either worker dies on its own.
    # NOTE: this must be time.sleep(), not asyncio.sleep() -- calling a
    # coroutine function without awaiting it returns an un-awaited coroutine
    # and does not sleep at all, turning this into a CPU-burning busy loop.
    while microphone_thread.is_alive() and speaker_thread.is_alive():
        time.sleep(1)
except KeyboardInterrupt:
    pass
finally:
    # Always signal shutdown (not only on Ctrl-C) so a surviving thread can
    # leave its polling loop; otherwise join() below would hang forever.
    stop_event.set()
    # Join threads to ensure clean exit
    microphone_thread.join()
    speaker_thread.join()
主线程以非阻塞方式等待,并检查是否有任一工作线程已经停止。或者,也可以尝试使用 OtterPilot 之类的工具同时录制两个设备的音频。