在 Twilio 双向媒体流期间拨出

Question

概括地说，我想在 Twilio 双向媒体流中使用 Twilio 的 Dial 功能，但 Twilio 的 Media Streams 似乎只支持把原始音频发送回呼叫者。

这里有更多细节：

我目前有一个应用程序，它通过双向媒体流和 OpenAI Realtime API 在电话中回答问题和疑虑（我的代码参考了这里的示例）。

这是接收音频并发送回音频的主要双向媒体流代码：

@app.websocket("/media-stream")
async def handle_media_stream(websocket: WebSocket):
    """Handle WebSocket connections between Twilio and OpenAI."""
    print("Client connected")
    await websocket.accept()

    async with websockets.connect(
        "wss://api.openai.com/v1/realtime?model=gpt-4o-realtime-preview-2024-10-01",
        extra_headers={
            "Authorization": f"Bearer {CONFIG.api_key}",
            "OpenAI-Beta": "realtime=v1",
        },
    ) as openai_ws:
        await initialize_session(openai_ws)

        # Connection specific state
        stream_sid = None
        latest_media_timestamp = 0
        last_assistant_item = None
        mark_queue = []
        response_start_timestamp_twilio = None

        async def receive_from_twilio():
            """Receive audio data from Twilio and send it to the OpenAI Realtime API."""
            nonlocal stream_sid, latest_media_timestamp
            try:
                async for message in websocket.iter_text():
                    data = json.loads(message)
                    if data["event"] == "media" and openai_ws.open:
                        latest_media_timestamp = int(data["media"]["timestamp"])
                        audio_append = {
                            "type": "input_audio_buffer.append",
                            "audio": data["media"]["payload"],
                        }
                        await openai_ws.send(json.dumps(audio_append))
                    elif data["event"] == "start":
                        stream_sid = data["start"]["streamSid"]
                        print(f"Incoming stream has started {stream_sid}")
                        response_start_timestamp_twilio = None  # noqa: F841
                        latest_media_timestamp = 0
                        last_assistant_item = None  # noqa: F841
                    elif data["event"] == "mark":
                        if mark_queue:
                            mark_queue.pop(0)
            except WebSocketDisconnect:
                print("Client disconnected.")
                if openai_ws.open:
                    await openai_ws.close()

        async def send_to_twilio():
            """Receive events from the OpenAI Realtime API, send audio back to Twilio."""
            nonlocal stream_sid, last_assistant_item, response_start_timestamp_twilio
            try:
                async for openai_message in openai_ws:
                    response = json.loads(openai_message)
                    response_type = response.get("type")
                    if response_type in CONFIG.log_event_types:
                        # print(f"Received event: {response['type']}", response)
                        logging.info(f"Received event: {response['type']}")

                    match response_type:
                        case "response.audio.delta":
                            if "delta" not in response:
                                continue

                            audio_payload = base64.b64encode(
                                base64.b64decode(response["delta"])
                            ).decode("utf-8")
                            audio_delta = {
                                "event": "media",
                                "streamSid": stream_sid,
                                "media": {"payload": audio_payload},
                            }
                            await websocket.send_json(audio_delta)

                            if response_start_timestamp_twilio is None:
                                response_start_timestamp_twilio = latest_media_timestamp
                                if CONFIG.show_timing_math:
                                    print(
                                        f"Setting start timestamp for new response: {response_start_timestamp_twilio}ms"
                                    )

                            # Update last_assistant_item safely
                            if response.get("item_id"):
                                last_assistant_item = response["item_id"]

                            await send_mark(websocket, stream_sid)

                        # Trigger an interruption. Your use case might work better using `input_audio_buffer.speech_stopped`, or combining the two.
                        case "input_audio_buffer.speech_started":
                            print("Speech started detected.")
                            if last_assistant_item:
                                print(
                                    f"Interrupting response with id: {last_assistant_item}"
                                )
                                await handle_speech_started_event()

                        case "response.function_call_arguments.done":
                            # https://platform.openai.com/docs/api-reference/realtime-server-events/response/function_call_arguments/done
                            # TODO: eventually migrate domain model to voice/
                            event = FunctionCallArgumentsEvent(**response)
                            logging.info(
                                f"Calling {event.name=} with {event.arguments=}"
                            )
                            await call_tool(
                                event.call_id,
                                event.name,
                                json.loads(event.arguments),
                                openai_ws,
                            )

            except Exception as e:
                traceback.print_exc()
                print(f"Error in send_to_twilio: {e}")

问题是，在某些时候，如果呼叫者感到沮丧，我希望能够拨出电话，把通话转接给人工客服。所以基本上类似于

from twilio.twiml.voice_response import Dial, VoiceResponse, Say

# Build a TwiML document whose only verb is a <Dial> to the human agent.
response = VoiceResponse()
response.dial(number="111-111-1111")  # dial out to human

但后者（即 TwiML）仅可在用于传入呼叫的 Webhook 中使用，而不能在用于双向媒体流的 Webhook 中使用。有谁知道解决方法，或者说有没有办法在双向媒体流中以编程方式拨出？

感谢您花时间阅读本文。

Answer 1

如果我理解正确的话，您希望在移交给人工代理时完全退出媒体流，对吗？

在这种情况下，您可以用新的 TwiML 覆盖该呼叫（即发起 WebSocket 流的那通呼叫）的原始 TwiML。为此，您需要知道当前活动呼叫的 Call SID。

// Download the helper library from https://www.twilio.com/docs/node/install
const twilio = require("twilio"); // Or, for ESM: import twilio from "twilio";

// Find your Account SID and Auth Token at twilio.com/console
// and set the environment variables. See http://twil.io/secure
// Credentials are read from the environment rather than hard-coded here.
const accountSid = process.env.TWILIO_ACCOUNT_SID;
const authToken = process.env.TWILIO_AUTH_TOKEN;
// REST client used by updateCall() below to modify the live call.
const client = twilio(accountSid, authToken);

/**
 * Replace the TwiML of the call identified by the hard-coded Call SID with a
 * document containing a single <Dial>, then log the SID of the updated call.
 */
async function updateCall() {
  // New instructions for the call: dial the human agent's number.
  const dialTwiml = `<?xml version="1.0" encoding="UTF-8"?>
<Response>
    <Dial>111-111-1111</Dial>
</Response>`;

  const activeCall = client.calls("CAaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa");
  const updatedCall = await activeCall.update({ twiml: dialTwiml });

  console.log(updatedCall.sid);
}

updateCall();

在 Twilio 双向媒体流期间拨出

问题描述投票：0回答：1

1个回答

最新问题

在 Twilio 双向媒体流期间拨出

问题描述 投票：0回答：1

1个回答

最新问题

问题描述投票：0回答：1