I'm building a simple GPT assistant over a Twilio phone call, using Google Cloud Speech to transcribe the caller's input. I now have a sample audio file, tts_output4.wav, that I want to play back once transcription completes. I've looked through various sources but couldn't find anything that actually plays audio into the call.

Basically, once my call is accepted and the AI response comes back, I have an audio file named tts_output4.wav that should be played via the send_static_audio function.

Below is my server.py code:
import base64
import json
import os
import threading
import time
from io import BytesIO

import openai  # GPT-4 integration
from flask import Flask, render_template
from flask_sockets import Sockets
from google.cloud.speech import RecognitionConfig, StreamingRecognitionConfig
from gtts import gTTS
from pydub import AudioSegment

from SpeechClientBridge import SpeechClientBridge

# State for stitching interim transcriptions into complete sentences
current_sentence = ""
last_timestamp = 0
sentence_timeout = 1.5

# GPT-4 API setup
openai.api_key = os.environ.get("OpenAI_API_key")
HTTP_SERVER_PORT = 8080

config = RecognitionConfig(
    encoding=RecognitionConfig.AudioEncoding.MULAW,
    sample_rate_hertz=8000,
    language_code="en-US",
)
streaming_config = StreamingRecognitionConfig(config=config, interim_results=True)

app = Flask(__name__)
sockets = Sockets(app)
@app.route("/twiml", methods=["POST"])
def return_twiml():
    print("POST TwiML")
    return render_template("streams.xml")
def on_transcription_response(response, ws):
    global current_sentence, last_timestamp
    if not response.results:
        return

    result = response.results[0]
    if not result.alternatives:
        return

    transcription = result.alternatives[0].transcript
    stability = result.stability
    is_final = result.is_final
    current_time = time.time()

    if is_final or (current_time - last_timestamp > sentence_timeout and stability > 0.8):
        # Append only the new part of the transcription
        if len(transcription) > len(current_sentence):
            current_sentence += transcription[len(current_sentence):]
        print("Complete Sentence:", current_sentence)
        gpt_response = get_gpt_response(current_sentence)
        print("GPT Response:", gpt_response)
        send_static_audio(ws)
        # send_gpt_response_as_audio(gpt_response, ws)
        current_sentence = ""
        last_timestamp = current_time
    else:
        if len(transcription) > len(current_sentence):
            current_sentence = transcription
            last_timestamp = current_time
def get_gpt_response(prompt):
    """Get response from GPT-4 using the v1/chat/completions endpoint."""
    response = openai.ChatCompletion.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
        max_tokens=150,
    )
    return response["choices"][0]["message"]["content"].strip()
def send_gpt_response_as_audio(text, ws):
    try:
        # Convert the GPT response to speech (TTS)
        tts = gTTS(text, lang='en')
        audio_fp = BytesIO()
        tts.write_to_fp(audio_fp)
        audio_fp.seek(0)

        # Convert the TTS audio to the required format for Twilio
        audio_segment = AudioSegment.from_mp3(audio_fp)
        audio_segment = audio_segment.set_frame_rate(8000).set_channels(1).set_sample_width(1)
        audio_data = BytesIO()
        audio_segment.export(audio_data, format="wav", codec="pcm_mulaw")
        audio_data = audio_data.getvalue()

        # Split and send the audio data in chunks
        CHUNK_SIZE = 1024  # Adjust chunk size if necessary
        for i in range(0, len(audio_data), CHUNK_SIZE):
            chunk = audio_data[i:i + CHUNK_SIZE]
            audio_b64 = base64.b64encode(chunk).decode('utf-8')
            message = json.dumps({
                "event": "media",
                "media": {
                    "payload": audio_b64
                }
            })
            ws.send(message)
            # time.sleep(0.1)  # Small delay to prevent overwhelming the WebSocket connection
        print("Audio sent successfully")
    except Exception as e:
        print(f"Error sending audio: {e}")
def send_static_audio(ws):
    try:
        # Load a static audio file (in PCM mu-law format).
        # Note: Twilio expects the payload to be raw mu-law samples,
        # so the file should not carry a WAV container header.
        with open("tts_output4.wav", "rb") as f:
            audio_data = f.read()
        audio_b64 = base64.b64encode(audio_data).decode('utf-8')
        message = json.dumps({
            "event": "media",
            "media": {
                "payload": audio_b64
            }
        })
        ws.send(message)
        # time.sleep(0.1)  # Small delay between chunks
        print("Static audio sent successfully")
    except Exception as e:
        print(f"Error sending static audio: {e}")
def convert_text_to_speech(text, save_locally=True):
    print(f"Converting text to speech: {text}")
    try:
        tts = gTTS(text, lang='en')
        audio_fp = BytesIO()
        tts.write_to_fp(audio_fp)
        audio_fp.seek(0)
        audio_data = audio_fp.read()
        print(f"TTS conversion complete, audio data length: {len(audio_data)}")
        if save_locally:
            with open("tts_output.mp3", "wb") as f:
                f.write(audio_data)
            print("TTS output saved locally as tts_output.mp3")
        return audio_data
    except Exception as e:
        print(f"Error in TTS conversion: {e}")
        return None
def convert_audio_for_twilio(audio_data):
    """Convert audio to 8-bit mu-law audio at 8kHz."""
    audio = AudioSegment.from_mp3(BytesIO(audio_data))
    audio = audio.set_frame_rate(8000).set_channels(1)
    buffer = BytesIO()
    audio.export(buffer, format="wav", codec="pcm_mulaw")
    return buffer.getvalue()
def send_audio_to_twilio(audio_data, ws):
    """Send the audio data to Twilio via WebSocket."""
    if ws.closed:
        print("WebSocket is closed. Cannot send audio.")
        return
    audio_b64 = base64.b64encode(audio_data).decode('utf-8')
    try:
        message = json.dumps({
            "event": "media",
            "media": {
                "payload": audio_b64
            }
        })
        print(f"Sending audio message of length: {len(message)}")
        ws.send(message)
        print("Audio sent successfully")
    except Exception as e:
        print(f"Error sending audio: {e}")
@sockets.route("/")
def transcript(ws):
    print("WS connection opened")
    bridge = SpeechClientBridge(streaming_config, lambda response: on_transcription_response(response, ws))
    t = threading.Thread(target=bridge.start)
    t.start()

    while not ws.closed:
        message = ws.receive()
        if message is None:
            bridge.add_request(None)
            bridge.terminate()
            break

        data = json.loads(message)
        if data["event"] in ("connected", "start"):
            print(f"Media WS: Received event '{data['event']}': {message}")
            continue
        if data["event"] == "media":
            media = data["media"]
            chunk = base64.b64decode(media["payload"])
            bridge.add_request(chunk)
        if data["event"] == "stop":
            print(f"Media WS: Received event 'stop': {message}")
            print("Stopping...")
            break

    bridge.terminate()
    print("WS connection closed")
if __name__ == "__main__":
    from gevent import pywsgi
    from geventwebsocket.handler import WebSocketHandler

    server = pywsgi.WSGIServer(
        ("", HTTP_SERVER_PORT), app, handler_class=WebSocketHandler
    )
    print("Server listening on: http://localhost:" + str(HTTP_SERVER_PORT))
    server.serve_forever()
For reference, here is my streams.xml file:
<?xml version="1.0" encoding="UTF-8"?>
<Response>
    <Connect>
        <Stream url="wss://my-grok-app-link.ngrok-free.app/"></Stream>
    </Connect>
    <Pause length="3600"/>
</Response>
Any help with the code is appreciated :)
It turned out I was not including streamSid as one of the parameters in the message payload; adding it solved my problem.
message = json.dumps({
    "event": "media",
    "streamSid": streamSid,
    "media": {
        "payload": audio_b64
    }
})
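For completeness, here is a minimal sketch of how I wire the streamSid through: it is captured from Twilio's "start" event in the WebSocket loop (the data["start"]["streamSid"] field comes from Twilio's Media Streams message schema), and the extra stream_sid parameter on send_static_audio is my own addition:

import base64
import json

def send_static_audio(ws, stream_sid):
    # Send the pre-rendered mu-law audio, tagged with the SID of the
    # stream it should be played into
    with open("tts_output4.wav", "rb") as f:
        audio_b64 = base64.b64encode(f.read()).decode("utf-8")
    ws.send(json.dumps({
        "event": "media",
        "streamSid": stream_sid,
        "media": {"payload": audio_b64},
    }))

# In the WebSocket handler, capture the SID when the stream starts:
#     stream_sid = None
#     while not ws.closed:
#         data = json.loads(ws.receive())
#         if data["event"] == "start":
#             stream_sid = data["start"]["streamSid"]
#         ...
#         send_static_audio(ws, stream_sid)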