我一直在使用这里提到的 Realtime API 和 Twilio for python 示例:https://www.twilio.com/en-us/blog/voice-ai-assistant-openai-realtime-api-python
我基本上想存储录音的 mp3,所以我将我的 twiml 创建扩展到:
response = VoiceResponse()
response.say("Please wait while we connect your call to Pony")
response.pause(length=5)
response.say("O.K. you can start talking!")
host = request.url.hostname
start = Start()
start.stream(url=f'wss://{host}/store-stream', track='both_tracks')
response.append(start)
connect = Connect()
connect.stream(url=f'wss://{host}/media-stream')
response.append(connect)
return HTMLResponse(content=str(response), media_type="application/xml")
通过第二个流,我将能够接收两个曲目(用户和实时 API)并创建 .wav 或 .mp3 文件。
我有以下代码,但 mp3 的质量非常低并且是慢动作。
@app.websocket("/store-stream")
async def handle_store_stream(ws: WebSocket):
await ws.accept()
pcm_data = bytearray()
try:
async for message in ws.iter_text():
data = json.loads(message)
event = data['event']
if event == "media":
# Decode the Base64 payload to raw G.711 µ-law bytes
payload = data["media"]["payload"]
ulaw_chunk = base64.b64decode(payload)
# Convert µ-law to linear PCM (16-bit)
pcm_chunk = audioop.ulaw2lin(ulaw_chunk, 2)
pcm_data.extend(pcm_chunk)
if event == "stop":
audio_segment = AudioSegment(
data=bytes(pcm_data),
sample_width=2, # 16-bit audio
frame_rate=8000, # G.711 µ-law is usually sampled at 8 kHz
channels=1 # mono audio
)
audio_segment.export('audio.mp3', format="mp3")
break
我尝试了很多不同的设置,但似乎没有什么可以改善输出 mp3 的结果。
此存储流分别接收入站和出站轨道,因此您应该单独存储它们,然后如果您想要“完整”录音,则将它们覆盖。
类似这样的:
async def handle_messages(self):
"""Handle messages from Twilio WebSocket."""
try:
async for message in self.twilio_ws.iter_text():
event = json.loads(message)
event_type = event["event"]
if event_type == "start":
self.stream_id = event["start"]["streamSid"]
self.call_id = event["start"]["callSid"]
self.account_id = event["start"]["accountSid"]
if event_type == "media":
# Decode the Base64 payload to raw G.711 µ-law bytes
payload = event["media"]["payload"]
ulaw_chunk = base64.b64decode(payload)
# Convert µ-law to linear PCM (16-bit)
pcm_chunk = audioop.ulaw2lin(ulaw_chunk, 2) # 2 means 16-bit PCM
self.pcm_data[event["media"]["track"]].extend(pcm_chunk)
if event_type == "stop":
self.export_audio()
break
def export_audio(self):
"""Export the recorded audio to an MP3 file."""
try:
audio_segments = {}
for track in self.pcm_data:
audio_segment = AudioSegment(
data=bytes(self.pcm_data[track]),
sample_width=2, # 16-bit audio
frame_rate=8000, # G.711 µ-law is usually sampled at 8 kHz
channels=1, # mono audio
)
audio_segments[track] = audio_segment
combined_audio = audio_segments["inbound"].overlay(
audio_segments["outbound"]
)
combined_audio.export(
os.path.join(self.export_path, f"{self.call_id}_twilio.mp3"),
format="mp3",
)