I'm trying to get the generated speech from the SpeechSynthesizer and play the audio through a Unity AudioSource. I need to play it through a Unity AudioSource because that is the AudioSource my lip-sync package uses.
I can play the neural voices, and I can even save an audio clip locally. But I can't load the generated audio clip from local storage at runtime and play it through the AudioSource.
Does this work on both desktop and Oculus? I'm using the Cognitive Services Speech SDK for Unity.
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
// <code>
using System;
using System.Threading;
using UnityEngine;
using UnityEngine.UI;
using Microsoft.CognitiveServices.Speech;

public class HelloWorld : MonoBehaviour
{
    // Hook up the three properties below with a Text, InputField and Button object in your UI.
    public Text outputText;
    public InputField inputField;
    public Button speakButton;
    public AudioSource audioSource;

    // Replace with your own subscription key and service region (e.g., "westus").
    private const string SubscriptionKey = "YourSubscriptionKey";
    private const string Region = "YourServiceRegion";

    private const int SampleRate = 24000;

    private object threadLocker = new object();
    private bool waitingForSpeak;
    private bool audioSourceNeedStop;
    private string message;

    private SpeechConfig speechConfig;
    private SpeechSynthesizer synthesizer;

    public void ButtonClick()
    {
        lock (threadLocker)
        {
            waitingForSpeak = true;
        }

        string newMessage = null;
        var startTime = DateTime.Now;

        // Starts speech synthesis, and returns once the synthesis is started.
        using (var result = synthesizer.StartSpeakingTextAsync(inputField.text).Result)
        {
            // Native playback is not supported on Unity yet (currently only supported on Windows/Linux Desktop).
            // Use the Unity API to play audio here as a short term solution.
            // Native playback support will be added in the future release.
            var audioDataStream = AudioDataStream.FromResult(result);
            var isFirstAudioChunk = true;
            var audioClip = AudioClip.Create(
                "Speech",
                SampleRate * 600, // Can speak 10mins audio as maximum
                1,
                SampleRate,
                true,
                (float[] audioChunk) =>
                {
                    var chunkSize = audioChunk.Length;
                    var audioChunkBytes = new byte[chunkSize * 2];
                    var readBytes = audioDataStream.ReadData(audioChunkBytes);
                    if (isFirstAudioChunk && readBytes > 0)
                    {
                        var endTime = DateTime.Now;
                        var latency = endTime.Subtract(startTime).TotalMilliseconds;
                        newMessage = $"Speech synthesis succeeded!\nLatency: {latency} ms.";
                        isFirstAudioChunk = false;
                    }

                    for (int i = 0; i < chunkSize; ++i)
                    {
                        if (i < readBytes / 2)
                        {
                            audioChunk[i] = (short)(audioChunkBytes[i * 2 + 1] << 8 | audioChunkBytes[i * 2]) / 32768.0F;
                        }
                        else
                        {
                            audioChunk[i] = 0.0f;
                        }
                    }

                    if (readBytes == 0)
                    {
                        Thread.Sleep(200); // Leave some time for the audioSource to finish playback
                        audioSourceNeedStop = true;
                    }
                });

            audioSource.clip = audioClip;
            audioSource.Play();
        }

        lock (threadLocker)
        {
            if (newMessage != null)
            {
                message = newMessage;
            }

            waitingForSpeak = false;
        }
    }

    void Start()
    {
        if (outputText == null)
        {
            UnityEngine.Debug.LogError("outputText property is null! Assign a UI Text element to it.");
        }
        else if (inputField == null)
        {
            message = "inputField property is null! Assign a UI InputField element to it.";
            UnityEngine.Debug.LogError(message);
        }
        else if (speakButton == null)
        {
            message = "speakButton property is null! Assign a UI Button to it.";
            UnityEngine.Debug.LogError(message);
        }
        else
        {
            // Continue with normal initialization, Text, InputField and Button objects are present.
            inputField.text = "Enter text you wish spoken here.";
            message = "Click button to synthesize speech";
            speakButton.onClick.AddListener(ButtonClick);

            // Creates an instance of a speech config with specified subscription key and service region.
            speechConfig = SpeechConfig.FromSubscription(SubscriptionKey, Region);

            // The default format is RIFF, which has a riff header.
            // We are playing the audio in memory as audio clip, which doesn't require riff header.
            // So we need to set the format to raw (24KHz for better quality).
            speechConfig.SetSpeechSynthesisOutputFormat(SpeechSynthesisOutputFormat.Raw24Khz16BitMonoPcm);

            // Creates a speech synthesizer.
            // Make sure to dispose the synthesizer after use!
            synthesizer = new SpeechSynthesizer(speechConfig, null);

            synthesizer.SynthesisCanceled += (s, e) =>
            {
                var cancellation = SpeechSynthesisCancellationDetails.FromResult(e.Result);
                message = $"CANCELED:\nReason=[{cancellation.Reason}]\nErrorDetails=[{cancellation.ErrorDetails}]\nDid you update the subscription info?";
            };
        }
    }

    void Update()
    {
        lock (threadLocker)
        {
            if (speakButton != null)
            {
                speakButton.interactable = !waitingForSpeak;
            }

            if (outputText != null)
            {
                outputText.text = message;
            }

            if (audioSourceNeedStop)
            {
                audioSource.Stop();
                audioSourceNeedStop = false;
            }
        }
    }

    void OnDestroy()
    {
        if (synthesizer != null)
        {
            synthesizer.Dispose();
        }
    }
}
// </code>
To integrate Microsoft Cognitive Services neural voices into Unity and play the audio through a Unity AudioSource, you can use code like the following example:
var speechConfig = SpeechConfig.FromSubscription(speechKey, region);
speechConfig.SpeechSynthesisVoiceName = "en-US-AriaNeural";
var audioConfig = AudioConfig.FromDefaultSpeakerOutput();
synthesizer = new SpeechSynthesizer(speechConfig, audioConfig);
With SpeechSynthesisVoiceName you can choose from the various neural voices offered by the Speech service. I referred to this link for synthesizing speech with the Speech service; see this GitHub for the complete code.
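Note that AudioConfig.FromDefaultSpeakerOutput() routes playback to the system's default speaker rather than to a Unity AudioSource. If you want to feed a completed synthesis result into your AudioSource (so the lip-sync package can read it), a minimal sketch could look like the code below. It assumes the synthesizer was created with a null AudioConfig and the Raw24Khz16BitMonoPcm output format, as in the script above, and reuses the inputField and audioSource fields from that script:

// Sketch only: feed a completed synthesis result into a Unity AudioSource.
// Assumes the Raw24Khz16BitMonoPcm output format and the "synthesizer",
// "inputField" and "audioSource" fields from the HelloWorld script above.
using (var result = synthesizer.SpeakTextAsync(inputField.text).Result)
{
    if (result.Reason == ResultReason.SynthesizingAudioCompleted)
    {
        var pcmBytes = result.AudioData;   // 16-bit little-endian mono PCM
        var sampleCount = pcmBytes.Length / 2;
        var samples = new float[sampleCount];
        for (int i = 0; i < sampleCount; i++)
        {
            // Convert each 16-bit sample to a float in [-1, 1].
            samples[i] = (short)(pcmBytes[i * 2 + 1] << 8 | pcmBytes[i * 2]) / 32768.0f;
        }

        var clip = AudioClip.Create("SynthesizedSpeech", sampleCount, 1, 24000, false);
        clip.SetData(samples, 0);
        audioSource.clip = clip;
        audioSource.Play();
    }
}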
You can also use the code below to save the output as a .wav file, which you can inspect once synthesis has finished. In addition, you can check other states, such as whether the audio has started, is being synthesized, or has completed, by checking ResultReason:
if (result.Reason == ResultReason.SynthesizingAudioCompleted)
{
    using var stream = AudioDataStream.FromResult(result);
    await stream.SaveToWaveFileAsync("output.wav");
}
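If you do save the audio to a .wav file and want to load it back into the AudioSource at runtime (the part you mentioned having trouble with), one option is UnityWebRequestMultimedia. The sketch below is illustrative only: the class name is hypothetical, it assumes the file was written under Application.persistentDataPath, and the request.result check requires a recent Unity version (older versions use isNetworkError/isHttpError instead):

using System.Collections;
using System.IO;
using UnityEngine;
using UnityEngine.Networking;

// Hypothetical helper: loads a previously saved .wav from local storage at
// runtime and plays it through the AudioSource used by the lip-sync package.
public class SavedWavPlayer : MonoBehaviour
{
    public AudioSource audioSource;

    public void PlaySavedWav(string fileName)
    {
        // Assumes the file was written under Application.persistentDataPath,
        // which is writable on desktop and on Oculus/Android builds.
        StartCoroutine(LoadAndPlay(Path.Combine(Application.persistentDataPath, fileName)));
    }

    private IEnumerator LoadAndPlay(string path)
    {
        using (var request = UnityWebRequestMultimedia.GetAudioClip("file://" + path, AudioType.WAV))
        {
            yield return request.SendWebRequest();
            if (request.result != UnityWebRequest.Result.Success)
            {
                Debug.LogError(request.error);
                yield break;
            }

            audioSource.clip = DownloadHandlerAudioClip.GetContent(request);
            audioSource.Play();
        }
    }
}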