我正在 Node.js 中使用 Azure 的 WebSocket API 实现实时文本到语音流传输。 WebSocket 连接成功建立,但在发送初始 SSML 消息时立即关闭并出现错误。
WebSocket connection established
--------------open done------------------
Received message: X-RequestId:13604750-2bxd-42a2-b523-cdesadas3d47
Content-Type:application/json; charset=utf-8
Path:turn.start
{
"context": {
"serviceTag": "9a23ce3549axdf7ddfg1d42317bcx5210"
}
}
WebSocket connection closed with code 1007. Reason: Input stream cannot be null
这是代码的相关部分(完整源代码见此处):
import WebSocket from 'ws';
import axios from 'axios';
import crypto from 'crypto';
/**
 * Opens an Azure TTS v2 websocket and starts a synthesis turn.
 *
 * Speech websocket framing: every client text frame is
 * "<headers>\r\n\r\n<body>". Every frame belonging to a turn must carry the
 * SAME `X-RequestId` plus an `X-Timestamp` header — the original code
 * omitted both on the speech.config frame and minted no shared id, which
 * is the usual cause of the service closing with 1007
 * "Input stream cannot be null".
 *
 * @param {AsyncIterable<string>} textStream - presumably the text chunks to
 *   stream after the initial turn — TODO confirm against the omitted
 *   message-processing code.
 */
async function streamTextToSpeech(textStream) {
  const accessToken = await getAccessToken();
  const ws = new WebSocket(
    `wss://${region}.tts.speech.microsoft.com/cognitiveservices/websocket/v2?Authorization=Bearer ${accessToken}`
  );

  // One id identifies the whole turn; reuse it on every outgoing frame.
  const requestId = crypto.randomUUID();
  const timestamp = () => new Date().toISOString();

  ws.on('open', () => {
    // Frame 1 — speech.config: connection-wide synthesis settings.
    const configMessage = {
      context: {
        synthesis: {
          audio: {
            metadataOptions: {
              sentenceBoundaryEnabled: 'false',
              wordBoundaryEnabled: 'false'
            },
            outputFormat: 'audio-24khz-48kbitrate-mono-mp3'
          },
          language: {
            autoDetection: false
          },
          voice: {
            languageId: 'en-US',
            name: 'en-US-JennyNeural'
          }
        }
      }
    };
    const configFrame =
      `X-RequestId:${requestId}\r\n` +
      `X-Timestamp:${timestamp()}\r\n` +
      `Content-Type:application/json; charset=utf-8\r\n` +
      `Path:speech.config\r\n\r\n` +
      JSON.stringify(configMessage);
    ws.send(configFrame);

    // Frame 2 — ssml: begins the turn; must reuse the turn's request id
    // (the original generated a fresh UUID here, orphaning the config).
    const initialSsml = `<speak version='1.0' xml:lang='en-US'><voice name='en-US-JennyNeural'>Initializing speech synthesis.</voice></speak>`;
    const ssmlFrame =
      `X-RequestId:${requestId}\r\n` +
      `X-Timestamp:${timestamp()}\r\n` +
      `Content-Type:application/ssml+xml\r\n` +
      `Path:ssml\r\n\r\n` +
      initialSsml;
    ws.send(ssmlFrame);
  });
  // Error handling and message processing code...
}
请帮忙调试此 WebSocket 实现,或提供 Azure TTS WebSocket API 的正确消息格式示例。
我尝试使用以下代码通过 WebSocket 将文本转换为语音,并成功将流式音频输出保存为 .wav 文件。
代码:
服务器.js:
const express = require('express');
const axios = require('axios');
const bodyParser = require('body-parser');
const fs = require('fs');
const path = require('path');
const WebSocket = require('ws');
// --- Express app wiring. Registration order matters: body parsing and
// static-file serving must be installed before the routes that use them. ---
const app = express();
const PORT = process.env.PORT || 3000;
// NOTE(review): hard-coded placeholders — load real credentials from
// environment variables; never commit a live key.
const AZURE_SPEECH_KEY = '<speech_key>';
const AZURE_SERVICE_REGION = '<speech_region>';
app.use(bodyParser.urlencoded({ extended: true }));
app.use(express.static('public'));
// Serve the single-page client.
app.get('/', (req, res) => {
res.sendFile(path.join(__dirname, 'public', 'index.html'));
});
// Start the HTTP server, then attach the websocket server to the same
// listener so both share one port.
const server = app.listen(PORT, () => {
console.log(`Server is running on http://localhost:${PORT}`);
});
const wss = new WebSocket.Server({ server });
// Per-client websocket lifecycle: each text message is a JSON request
// {text, voice_name}; the reply is either the audio buffer or an error string.
wss.on('connection', (ws) => {
  console.log('New client connected');
  ws.on('message', async (message) => {
    console.log(`Received message from client: ${message}`);
    try {
      // Parse INSIDE the try: the original parsed before the try block, so a
      // malformed payload threw in the async callback as an unhandled
      // rejection instead of producing a reply to the client.
      const { text, voice_name } = JSON.parse(message);
      const audioBuffer = await textToSpeech(text, voice_name);
      // Side effect kept from the original: persist the latest synthesis.
      const filePath = path.join(__dirname, 'kam.wav');
      fs.writeFileSync(filePath, audioBuffer);
      console.log(`Audio saved to ${filePath}`);
      ws.send(audioBuffer);
    } catch (error) {
      console.error(error);
      ws.send("Failed to generate audio.");
    }
  });
  ws.on('close', () => {
    console.log('Client disconnected');
  });
});
// Legacy REST entry point — synthesis now happens over the websocket, so
// direct POSTs are rejected with 405 Method Not Allowed.
app.post('/synthesize', (req, res) =>
  res.status(405).send("Use WebSocket to synthesize text to speech.")
);
/**
 * Calls the Azure TTS REST endpoint and returns the synthesized audio.
 *
 * @param {string} text - user-supplied text; XML-escaped before being embedded
 *   in the SSML body (it arrives from websocket clients, so raw interpolation
 *   would allow SSML/XML injection into the request).
 * @param {string} [voiceName='en-US-JennyNeural'] - Azure neural voice name;
 *   escaped as well since it is also client-controlled.
 * @returns {Promise<Buffer>} audio bytes in riff-24khz-16bit-mono-pcm (WAV).
 */
async function textToSpeech(text, voiceName = 'en-US-JennyNeural') {
  // Minimal XML escaping for values placed inside SSML content/attributes.
  const escapeXml = (s) =>
    String(s)
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;')
      .replace(/'/g, '&apos;');
  const url = `https://${AZURE_SERVICE_REGION}.tts.speech.microsoft.com/cognitiveservices/v1`;
  const headers = {
    'Ocp-Apim-Subscription-Key': AZURE_SPEECH_KEY,
    'Content-Type': 'application/ssml+xml',
    'X-Microsoft-OutputFormat': 'riff-24khz-16bit-mono-pcm',
  };
  const ssml =
    `<speak version='1.0' xml:lang='en-US'>
    <voice xml:lang='en-US' name='${escapeXml(voiceName)}'>${escapeXml(text)}</voice>
    </speak>`;
  const response = await axios.post(url, ssml, { headers, responseType: 'arraybuffer' });
  return response.data;
}
public/index.html:
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
<title>Text to Speech</title>
</head>
<body>
<h1>Text to Speech</h1>
<form id="ttsForm">
<label for="text">Enter Text:</label><br>
<textarea id="text" name="text" rows="4" cols="50" required></textarea><br><br>
<label for="voice_name">Voice Name:</label><br>
<input type="text" id="voice_name" name="voice_name" value="en-US-JennyNeural" required><br><br>
<input type="submit" value="Convert to Speech">
</form>
<script>
let socket;
// Connects to the local relay server and wires up all socket callbacks.
function initWebSocket() {
  socket = new WebSocket('ws://localhost:3000');
  socket.onopen = () => {
    console.log('WebSocket connection established');
  };
  socket.onmessage = (event) => {
    console.log('Message from server: ', event.data);
    if (event.data instanceof Blob) {
      // Play the audio and release the object URL when playback ends —
      // the original leaked one blob URL per synthesis.
      const audioUrl = URL.createObjectURL(event.data);
      const audio = new Audio(audioUrl);
      audio.addEventListener('ended', () => URL.revokeObjectURL(audioUrl));
      audio.play();
    } else {
      // Server sends plain text for failures; see server-side catch branch.
      console.error('Received non-audio message:', event.data);
    }
  };
  socket.onerror = (err) => {
    // Surface transport failures instead of silently dropping them.
    console.error('WebSocket error:', err);
  };
  socket.onclose = () => {
    console.log('WebSocket connection closed');
  };
}
// Boot sequence: open the socket first, then intercept form submits so the
// page never navigates away.
document.addEventListener('DOMContentLoaded', () => {
  initWebSocket();
  const form = document.getElementById('ttsForm');
  form.addEventListener('submit', (event) => {
    event.preventDefault();
    sendMessage();
  });
});
// Serializes the form fields and ships them to the server as one JSON message.
function sendMessage() {
  // Guard: calling send() on a CONNECTING or CLOSED socket throws in
  // browsers — the original crashed if the user submitted too early or
  // after a disconnect.
  if (!socket || socket.readyState !== WebSocket.OPEN) {
    console.error('WebSocket is not open; cannot send message.');
    return;
  }
  const text = document.getElementById('text').value;
  const voiceName = document.getElementById('voice_name').value;
  const message = JSON.stringify({ text, voice_name: voiceName });
  socket.send(message);
}
</script>
</body>
</html>
浏览器输出:
我在 Web 端成功将文本转换为语音,并将流式音频输出保存到 .wav 文件中。
输出:
流式传输后,文本转语音转换的输出已保存到以下位置。