I'm building a simple GPT assistant over a Twilio phone call, using Google Cloud Speech to transcribe the caller's input. I now have a sample audio file, tts_output4.wav, that I want to play back once transcription completes. I've looked through various sources but couldn't find anything that actually plays audio into the call.

Basically, once my call is accepted and the AI response comes back, I have an audio file named tts_output4.wav that should be played via the send_static_audio function.

Below is my server.py code:
import base64
import json
import os
import threading
import time
from io import BytesIO

import openai  # GPT-4 integration
from flask import Flask, render_template
from flask_sockets import Sockets
from google.cloud.speech import RecognitionConfig, StreamingRecognitionConfig
from gtts import gTTS
from pydub import AudioSegment

from SpeechClientBridge import SpeechClientBridge

# State for stitching interim transcriptions into complete sentences
current_sentence = ""
last_timestamp = 0
sentence_timeout = 1.5

# GPT-4 API setup
openai.api_key = os.environ.get("OpenAI_API_key")
HTTP_SERVER_PORT = 8080

config = RecognitionConfig(
    encoding=RecognitionConfig.AudioEncoding.MULAW,
    sample_rate_hertz=8000,
    language_code="en-US",
)
streaming_config = StreamingRecognitionConfig(config=config, interim_results=True)

app = Flask(__name__)
sockets = Sockets(app)
@app.route("/twiml", methods=["POST"])
def return_twiml():
    print("POST TwiML")
    return render_template("streams.xml")
def on_transcription_response(response, ws):
    global current_sentence, last_timestamp
    if not response.results:
        return

    result = response.results[0]
    if not result.alternatives:
        return

    transcription = result.alternatives[0].transcript
    stability = result.stability
    is_final = result.is_final
    current_time = time.time()

    if is_final or (current_time - last_timestamp > sentence_timeout and stability > 0.8):
        # Append only the new part of the transcription
        if len(transcription) > len(current_sentence):
            current_sentence += transcription[len(current_sentence):]
        print("Complete Sentence:", current_sentence)
        gpt_response = get_gpt_response(current_sentence)
        print("GPT Response:", gpt_response)
        send_static_audio(ws)
        # send_gpt_response_as_audio(gpt_response, ws)
        current_sentence = ""
        last_timestamp = current_time
    else:
        if len(transcription) > len(current_sentence):
            current_sentence = transcription
            last_timestamp = current_time
def get_gpt_response(prompt):
    """Get response from GPT-4 using the v1/chat/completions endpoint."""
    response = openai.ChatCompletion.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
        max_tokens=150,
    )
    return response["choices"][0]["message"]["content"].strip()
def send_gpt_response_as_audio(text, ws):
    try:
        # Convert the GPT response to speech (TTS)
        tts = gTTS(text, lang='en')
        audio_fp = BytesIO()
        tts.write_to_fp(audio_fp)
        audio_fp.seek(0)

        # Convert the TTS audio to the required format for Twilio
        audio_segment = AudioSegment.from_mp3(audio_fp)
        audio_segment = audio_segment.set_frame_rate(8000).set_channels(1).set_sample_width(1)
        audio_data = BytesIO()
        audio_segment.export(audio_data, format="wav", codec="pcm_mulaw")
        audio_data = audio_data.getvalue()

        # Split and send the audio data in chunks
        CHUNK_SIZE = 1024  # Adjust chunk size if necessary
        for i in range(0, len(audio_data), CHUNK_SIZE):
            chunk = audio_data[i:i + CHUNK_SIZE]
            audio_b64 = base64.b64encode(chunk).decode('utf-8')
            message = json.dumps({
                "event": "media",
                "media": {
                    "payload": audio_b64
                }
            })
            ws.send(message)
            # time.sleep(0.1)  # Small delay to prevent overwhelming the WebSocket connection
        print("Audio sent successfully")
    except Exception as e:
        print(f"Error sending audio: {e}")
def send_static_audio(ws):
    try:
        # Load a static audio file (in PCM mu-law format).
        # Note: Twilio expects the payload to be raw mu-law samples,
        # so the file should not carry a WAV container header.
        with open("tts_output4.wav", "rb") as f:
            audio_data = f.read()
        audio_b64 = base64.b64encode(audio_data).decode('utf-8')
        message = json.dumps({
            "event": "media",
            "media": {
                "payload": audio_b64
            }
        })
        ws.send(message)
        # time.sleep(0.1)  # Small delay between chunks
        print("Static audio sent successfully")
    except Exception as e:
        print(f"Error sending static audio: {e}")
def convert_text_to_speech(text, save_locally=True):
    print(f"Converting text to speech: {text}")
    try:
        tts = gTTS(text, lang='en')
        audio_fp = BytesIO()
        tts.write_to_fp(audio_fp)
        audio_fp.seek(0)
        audio_data = audio_fp.read()
        print(f"TTS conversion complete, audio data length: {len(audio_data)}")
        if save_locally:
            with open("tts_output.mp3", "wb") as f:
                f.write(audio_data)
            print("TTS output saved locally as tts_output.mp3")
        return audio_data
    except Exception as e:
        print(f"Error in TTS conversion: {e}")
        return None
def convert_audio_for_twilio(audio_data):
    """Convert audio to 8-bit mu-law audio at 8kHz."""
    audio = AudioSegment.from_mp3(BytesIO(audio_data))
    audio = audio.set_frame_rate(8000).set_channels(1)
    buffer = BytesIO()
    audio.export(buffer, format="wav", codec="pcm_mulaw")
    return buffer.getvalue()
def send_audio_to_twilio(audio_data, ws):
    """Send the audio data to Twilio via WebSocket."""
    if ws.closed:
        print("WebSocket is closed. Cannot send audio.")
        return
    audio_b64 = base64.b64encode(audio_data).decode('utf-8')
    try:
        message = json.dumps({
            "event": "media",
            "media": {
                "payload": audio_b64
            }
        })
        print(f"Sending audio message of length: {len(message)}")
        ws.send(message)
        print("Audio sent successfully")
    except Exception as e:
        print(f"Error sending audio: {e}")
@sockets.route("/")
def transcript(ws):
    print("WS connection opened")
    bridge = SpeechClientBridge(streaming_config, lambda response: on_transcription_response(response, ws))
    t = threading.Thread(target=bridge.start)
    t.start()

    while not ws.closed:
        message = ws.receive()
        if message is None:
            bridge.add_request(None)
            bridge.terminate()
            break

        data = json.loads(message)
        if data["event"] in ("connected", "start"):
            print(f"Media WS: Received event '{data['event']}': {message}")
            continue
        if data["event"] == "media":
            media = data["media"]
            chunk = base64.b64decode(media["payload"])
            bridge.add_request(chunk)
        if data["event"] == "stop":
            print(f"Media WS: Received event 'stop': {message}")
            print("Stopping...")
            break

    bridge.terminate()
    print("WS connection closed")
if __name__ == "__main__":
    from gevent import pywsgi
    from geventwebsocket.handler import WebSocketHandler

    server = pywsgi.WSGIServer(
        ("", HTTP_SERVER_PORT), app, handler_class=WebSocketHandler
    )
    print("Server listening on: http://localhost:" + str(HTTP_SERVER_PORT))
    server.serve_forever()
For reference, here is my streams.xml file:
<?xml version="1.0" encoding="UTF-8"?>
<Response>
    <Connect>
        <Stream url="wss://my-grok-app-link.ngrok-free.app/"></Stream>
    </Connect>
    <Pause length="3600"/>
</Response>
Any help with the code is appreciated :)
It turned out I was not including streamSid as one of the parameters in the message payload; adding it solved my problem.
message = json.dumps({
    "event": "media",
    "streamSid": streamSid,
    "media": {
        "payload": audio_b64
    }
})
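For completeness, here is a minimal sketch of how I wire the streamSid through: it is captured from Twilio's "start" event in the WebSocket loop (the data["start"]["streamSid"] field comes from Twilio's Media Streams message schema), and the extra stream_sid parameter on send_static_audio is my own addition:

import base64
import json

def send_static_audio(ws, stream_sid):
    # Send the pre-rendered mu-law audio, tagged with the SID of the
    # stream it should be played into
    with open("tts_output4.wav", "rb") as f:
        audio_b64 = base64.b64encode(f.read()).decode("utf-8")
    ws.send(json.dumps({
        "event": "media",
        "streamSid": stream_sid,
        "media": {"payload": audio_b64},
    }))

# In the WebSocket handler, capture the SID when the stream starts:
#     stream_sid = None
#     while not ws.closed:
#         data = json.loads(ws.receive())
#         if data["event"] == "start":
#             stream_sid = data["start"]["streamSid"]
#         ...
#         send_static_audio(ws, stream_sid)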