实时音频流无法通过 WebSocket 和音频 API 在 iOS 上播放

Question

我正在使用 WebSocket 和音频 API 构建一个实时音频流应用程序，但我特别在 iOS 上遇到了问题。该应用程序流式传输通过 WebSocket 从 ElevenLabs API 接收的音频块并按顺序播放它们。它在大多数平台上运行良好，但在 iOS 上，音频要么无法播放，要么有延迟。

我尝试过的：

使用 Web Audio API：我尝试直接使用 AudioContext 播放音频块，但 iOS 与实时流媒体存在兼容性问题。
转换为 Blob：我将接收到的音频块转换为 Blob，并使用一个 <audio> 元素来播放它们。这种方式可行，但会在连续的音频块之间引入显著的延迟，这对于实时流来说是不可接受的。

// DOM element references. All of these are (re)assigned in startup(),
// which runs on the window "load" event.
let connectButton = null;
let disconnectButton = null;
let sendButton = null;
let receiveBox = null;
let inputText = null;
// Looked up once here at script-evaluation time and again in startup().
let audioPlayer = document.getElementById("audioPlayer");

// The WebSocket created by initializeWebSocket(); null until the first connect.
let ws = null;
// Decoded audio chunks (ArrayBuffers) waiting to be played, in arrival order.
let audioQueue = []; 
// Set to true by playNextChunk() while a chunk is being played on audioPlayer.
let isPlaying = false; 

// Streaming text-to-speech endpoint. model_id, inactivity_timeout and language
// are passed as query parameters; the path segment before /stream-input is
// presumably the voice ID (confirm against the ElevenLabs API reference).
const ELEVENLABS_WS_URL = "wss://api.elevenlabs.io/v1/text-to-speech/XrExE9yKIg1WjnnlVkGX/stream-input?model_id=eleven_multilingual_v1&inactivity_timeout=30&language=English_American";

/**
 * Caches references to the page's DOM elements in the module-level variables
 * and attaches the click handlers for the connect, disconnect and send buttons.
 */
function startup() {
    const byId = (id) => document.getElementById(id);

    connectButton = byId("connectButton");
    disconnectButton = byId("disconnectButton");
    sendButton = byId("sendButton");
    receiveBox = byId("receive-box");
    inputText = byId("inputText");
    audioPlayer = byId("audioPlayer");

    // Each button paired with the function that handles its click.
    const clickBindings = [
        [connectButton, initializeWebSocket],
        [disconnectButton, closeWebSocket],
        [sendButton, sendMessage],
    ];
    for (const [button, onClick] of clickBindings) {
        button.addEventListener("click", onClick, false);
    }
}

/**
 * Opens a WebSocket to ELEVENLABS_WS_URL and installs its open, message,
 * error and close handlers. Does nothing when a socket already exists and
 * is not in the CLOSED state. The connect/disconnect buttons are toggled
 * when the socket opens and closes.
 */
function initializeWebSocket() {
    if (ws && ws.readyState !== WebSocket.CLOSED) {
        return;
    }

    // Keeps the two buttons mutually exclusive.
    const reflectConnectionState = (connected) => {
        connectButton.disabled = connected;
        disconnectButton.disabled = !connected;
    };

    const socket = new WebSocket(ELEVENLABS_WS_URL);
    ws = socket;

    socket.onopen = () => {
        console.log("Connected to ElevenLabs WebSocket.");
        reflectConnectionState(true);
    };
    socket.onmessage = handleWebSocketMessage;
    socket.onerror = (error) => console.error("WebSocket error:", error);
    socket.onclose = () => {
        console.log("Disconnected from ElevenLabs WebSocket.");
        reflectConnectionState(false);
    };
}

/**
 * Calls close() on the current WebSocket and logs that it did so.
 * Does nothing when no socket has been created yet.
 */
function closeWebSocket() {
    if (!ws) {
        return;
    }

    ws.close();
    console.log("WebSocket connection closed.");
}

/**
 * Sends the trimmed contents of the text input to the WebSocket as a JSON
 * text-to-speech request. Does nothing when the input is empty or
 * whitespace-only, and logs an error when the socket is missing or not OPEN.
 */
function sendMessage() {
    const text = inputText.value.trim();
    if (!text) {
        return;
    }

    const isConnected = Boolean(ws) && ws.readyState === WebSocket.OPEN;
    if (!isConnected) {
        console.error("WebSocket is not connected.");
        return;
    }

    const voiceSettings = {
        stability: 1,
        similarity_boost: true,
        optimize_streaming_latency: 0,
    };
    // NOTE(review): xi_api_key is blank in this listing; presumably the real
    // key must be supplied for the request to be accepted. Confirm.
    const payload = {
        text,
        voice_settings: voiceSettings,
        flush: true,
        xi_api_key: "",
        streaming: true,
    };

    ws.send(JSON.stringify(payload));
}

/**
 * Handles one message received from the text-to-speech WebSocket.
 *
 * The message body is parsed as JSON. A message carrying a base64 `audio`
 * field is queued for playback. A message flagged `isFinal` without audio
 * is treated as the normal end of the stream rather than an error. Anything
 * else is logged as an error.
 *
 * @param {MessageEvent} event - WebSocket message event; `event.data` is
 *     expected to be a JSON string.
 */
function handleWebSocketMessage(event) {
    let audioData;
    try {
        audioData = JSON.parse(event.data);
    } catch (error) {
        // A malformed or non-JSON frame must not throw out of the socket's
        // message handler; report it and keep the stream going.
        console.error("Failed to parse WebSocket message:", error);
        return;
    }
    console.log('audioData: ', typeof audioData);

    if (audioData && audioData.audio) {
        addAudioChunkToQueue(audioData.audio);
    } else if (audioData && audioData.isFinal) {
        // End-of-generation marker: it carries no audio by design.
        console.log("Audio stream finished.");
    } else {
        console.error("No audio data received.");
    }
}

/**
 * Decodes a base64 audio chunk, appends the result to the playback queue,
 * and starts playback if no chunk is currently playing.
 *
 * @param {string} base64Audio - Base64-encoded audio data.
 */
function addAudioChunkToQueue(base64Audio) {
    audioQueue.push(base64ToArrayBuffer(base64Audio));

    if (isPlaying) {
        // The running playback chain will pick this chunk up when it gets there.
        return;
    }
    playNextChunk();
}

/**
 * Plays the next queued audio chunk on `audioPlayer`.
 *
 * Takes the oldest ArrayBuffer off `audioQueue`, wraps it in an
 * 'audio/mpeg' Blob, and plays it through an object URL. When the chunk
 * ends, the object URL is released and the next chunk (if any) is started.
 * `isPlaying` is true while a chunk is in flight and is reset to false when
 * the queue is empty or playback fails.
 */
function playNextChunk() {
    if (audioQueue.length === 0) {
        isPlaying = false;
        return;
    }

    isPlaying = true;
    const audioBuffer = audioQueue.shift();
    const audioBlob = new Blob([audioBuffer], { type: 'audio/mpeg' });
    const audioUrl = URL.createObjectURL(audioBlob);

    // Install the handler before starting playback so that a very short
    // chunk cannot finish before anything is listening for "ended".
    audioPlayer.onended = () => {
        // Each chunk gets its own object URL; release it once it has been
        // played, otherwise every chunk's Blob stays alive for the page's lifetime.
        URL.revokeObjectURL(audioUrl);
        isPlaying = false;
        playNextChunk();
    };

    audioPlayer.src = audioUrl;
    audioPlayer.play()
        .then(() => {
            console.log("Playing audio chunk...");
        })
        .catch((error) => {
            console.error("Error playing audio:", error);
            // This chunk will never reach "ended", so release its URL here.
            URL.revokeObjectURL(audioUrl);
            isPlaying = false;
        });
}

/**
 * Decodes a base64 string into an ArrayBuffer holding the raw bytes.
 *
 * @param {string} base64 - Base64-encoded data.
 * @returns {ArrayBuffer} One byte per character of the decoded binary string.
 */
function base64ToArrayBuffer(base64) {
    console.log('Converting to arraybuffer');
    const decoded = window.atob(base64);
    // atob yields a binary string whose char codes are the byte values.
    const bytes = Uint8Array.from(decoded, (ch) => ch.charCodeAt(0));
    return bytes.buffer;
}
window.addEventListener("load", startup);

Answer 1

理想的解决方案（使用 HTTP GET，而不是 Web Socket）

在理想情况下，您不应使用 Web Socket，而应使用普通的 HTTP GET 请求。您的服务提供商不提供这种方式吗？如果提供，您可以直接使用普通的音频元素：

// Point an audio element at the HTTP stream URL and start playback.
const audio = new Audio('https://speech-api.example.com/some-stream');
audio.play();

如果您发现播放逐渐落后于实时流，可以略微提高播放速率，让它慢慢追上来：

// A rate slightly above 1.0 plays a little faster than normal speed.
audio.playbackRate = 1.05

非理想解决方案（受 Web 套接字困扰）

如果您只能使用 Web Socket，那么您还必须自己将音频解码为 PCM。换句话说，您需要自己实现一个 MP3 客户端。

好消息是 MP3 专利已过期，您可以使用 Emscripten 为 Web Assembly 构建现有编解码器（例如 MPG123）。

您需要将 MPEG 帧流 解码为 32 位浮点 PCM。这与使用

audioContext.decodeAudioData()

不同，因为 decodeAudioData() 是为解码完整的独立文件而设计的。MP3 片段本身无法被干净地拼接：被当作独立文件单独解码的每个片段都会带有填充。

接下来，您需要播放这些 PCM 块。您不能仅仅通过将它们加载到 AudioContext 的 AudioBuffer 实例中然后一个接一个地播放它们来做到这一点。想要把握正确的时机几乎是不可能的。相反，您需要做的是拥有能够生成音频流的东西。您可以使用 ScriptProcessorNode 或其等效的 AudioWorklet。然后，在该过程的每个循环中，切掉之前缓冲的 PCM 块并将其作为新缓冲区发送回 AudioContext。然后，AudioContext 将处理将其缓冲到底层音频系统，确保样本精确的无缝播放。

将来，WebCodecs API将使其中一些事情变得更容易，但它还不兼容，核心原则仍然存在。

实时音频流无法通过 WebSocket 和音频 API 在 iOS 上播放

问题描述投票：0回答：1

1个回答

理想的解决方案（使用 HTTP GET，而不是 Web Socket）

非理想解决方案（受 Web 套接字困扰）

最新问题

实时音频流无法通过 WebSocket 和音频 API 在 iOS 上播放

问题描述 投票：0回答：1

1个回答

理想的解决方案（使用 HTTP GET，而不是 Web Socket）

非理想解决方案（受 Web 套接字困扰）

最新问题

问题描述投票：0回答：1