How to play neural voice speech through a Unity AudioSource - Microsoft Cognitive Services

Question

I'm struggling to take the speech generated by the SpeechSynthesizer and play the audio through a Unity AudioSource. I need to play it through a Unity AudioSource because a lip-sync package I'm using works off that AudioSource.

I'm able to play the neural voice and can even create an audio clip locally. But at runtime I can't retrieve the generated audio clip from local storage and play it on the AudioSource.

Is this possible on both desktop and Oculus? I'm using the Cognitive Services Speech SDK for Unity.

//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
// <code>
using System;
using System.Threading;
using UnityEngine;
using UnityEngine.UI;
using Microsoft.CognitiveServices.Speech;

public class HelloWorld : MonoBehaviour
{
    // Hook up the three properties below with a Text, InputField and Button object in your UI.
    public Text outputText;
    public InputField inputField;
    public Button speakButton;
    public AudioSource audioSource;

    // Replace with your own subscription key and service region (e.g., "westus").
    private const string SubscriptionKey = "YourSubscriptionKey";
    private const string Region = "YourServiceRegion";

    private const int SampleRate = 24000;

    private object threadLocker = new object();
    private bool waitingForSpeak;
    private bool audioSourceNeedStop;
    private string message;

    private SpeechConfig speechConfig;
    private SpeechSynthesizer synthesizer;

    public void ButtonClick()
    {
        lock (threadLocker)
        {
            waitingForSpeak = true;
        }

        string newMessage = null;
        var startTime = DateTime.Now;

        // Starts speech synthesis, and returns once the synthesis is started.
        using (var result = synthesizer.StartSpeakingTextAsync(inputField.text).Result)
        {
            // Native playback is not supported on Unity yet (currently only supported on Windows/Linux Desktop).
            // Use the Unity API to play audio here as a short term solution.
            // Native playback support will be added in the future release.
            var audioDataStream = AudioDataStream.FromResult(result);
            var isFirstAudioChunk = true;
            var audioClip = AudioClip.Create(
                "Speech",
                SampleRate * 600, // Buffer for up to 10 minutes of audio
                1,
                SampleRate,
                true,
                (float[] audioChunk) =>
                {
                    var chunkSize = audioChunk.Length;
                    var audioChunkBytes = new byte[chunkSize * 2];
                    var readBytes = audioDataStream.ReadData(audioChunkBytes);
                    if (isFirstAudioChunk && readBytes > 0)
                    {
                        var endTime = DateTime.Now;
                        var latency = endTime.Subtract(startTime).TotalMilliseconds;
                        newMessage = $"Speech synthesis succeeded!\nLatency: {latency} ms.";
                        isFirstAudioChunk = false;
                    }

                    for (int i = 0; i < chunkSize; ++i)
                    {
                        if (i < readBytes / 2)
                        {
                            audioChunk[i] = (short)(audioChunkBytes[i * 2 + 1] << 8 | audioChunkBytes[i * 2]) / 32768.0F;
                        }
                        else
                        {
                            audioChunk[i] = 0.0f;
                        }
                    }

                    if (readBytes == 0)
                    {
                        Thread.Sleep(200); // Leave some time for the audioSource to finish playback
                        audioSourceNeedStop = true;
                    }
                });

            audioSource.clip = audioClip;
            audioSource.Play();
        }

        lock (threadLocker)
        {
            if (newMessage != null)
            {
                message = newMessage;
            }

            waitingForSpeak = false;
        }
    }

    void Start()
    {
        if (outputText == null)
        {
            UnityEngine.Debug.LogError("outputText property is null! Assign a UI Text element to it.");
        }
        else if (inputField == null)
        {
            message = "inputField property is null! Assign a UI InputField element to it.";
            UnityEngine.Debug.LogError(message);
        }
        else if (speakButton == null)
        {
            message = "speakButton property is null! Assign a UI Button to it.";
            UnityEngine.Debug.LogError(message);
        }
        else
        {
            // Continue with normal initialization, Text, InputField and Button objects are present.
            inputField.text = "Enter text you wish spoken here.";
            message = "Click button to synthesize speech";
            speakButton.onClick.AddListener(ButtonClick);

            // Creates an instance of a speech config with specified subscription key and service region.
            speechConfig = SpeechConfig.FromSubscription(SubscriptionKey, Region);

            // The default format is RIFF, which has a riff header.
            // We are playing the audio in memory as audio clip, which doesn't require riff header.
            // So we need to set the format to raw (24KHz for better quality).
            speechConfig.SetSpeechSynthesisOutputFormat(SpeechSynthesisOutputFormat.Raw24Khz16BitMonoPcm);

            // Creates a speech synthesizer.
            // Make sure to dispose the synthesizer after use!
            synthesizer = new SpeechSynthesizer(speechConfig, null);

            synthesizer.SynthesisCanceled += (s, e) =>
            {
                var cancellation = SpeechSynthesisCancellationDetails.FromResult(e.Result);
                message = $"CANCELED:\nReason=[{cancellation.Reason}]\nErrorDetails=[{cancellation.ErrorDetails}]\nDid you update the subscription info?";
            };
        }
    }

    void Update()
    {
        lock (threadLocker)
        {
            if (speakButton != null)
            {
                speakButton.interactable = !waitingForSpeak;
            }

            if (outputText != null)
            {
                outputText.text = message;
            }

            if (audioSourceNeedStop)
            {
                audioSource.Stop();
                audioSourceNeedStop = false;
            }
        }
    }

    void OnDestroy()
    {
        if (synthesizer != null)
        {
            synthesizer.Dispose();
        }
    }
}
// </code>
c# android unity-game-engine audio azure-cognitive-services
1 Answer

To integrate Microsoft Cognitive Services neural voices into Unity and play the audio through a Unity AudioSource, you can use code like the following example:

  // Requires: using Microsoft.CognitiveServices.Speech;
  //           using Microsoft.CognitiveServices.Speech.Audio;
  var speechConfig = SpeechConfig.FromSubscription(speechKey, region);
  speechConfig.SpeechSynthesisVoiceName = "en-US-AriaNeural";
  var audioConfig = AudioConfig.FromDefaultSpeakerOutput();
  synthesizer = new SpeechSynthesizer(speechConfig, audioConfig);
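To get the synthesized audio into a Unity AudioSource instead of the default speaker (which is what the question asks for), a minimal non-streaming sketch is shown below. It assumes the synthesizer is created with a null AudioConfig (as in the question's script) and a raw PCM output format such as Raw24Khz16BitMonoPcm; SampleRate and audioSource are the fields from that script.

// Non-streaming sketch: synthesize the whole utterance, then copy it into an AudioClip.
// Assumes: synthesizer built with a null AudioConfig and Raw24Khz16BitMonoPcm output.
using (var result = synthesizer.SpeakTextAsync(inputField.text).Result) // .Result blocks; fine for a sketch
{
    if (result.Reason == ResultReason.SynthesizingAudioCompleted)
    {
        // result.AudioData holds the raw 16-bit mono PCM bytes; convert them to floats for Unity.
        var pcm = result.AudioData;
        var samples = new float[pcm.Length / 2];
        for (int i = 0; i < samples.Length; i++)
        {
            samples[i] = (short)(pcm[i * 2 + 1] << 8 | pcm[i * 2]) / 32768.0f;
        }

        var clip = AudioClip.Create("SynthesizedSpeech", samples.Length, 1, SampleRate, false);
        clip.SetData(samples, 0);
        audioSource.clip = clip;
        audioSource.Play();
    }
}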

With SpeechSynthesisVoiceName, you can choose from the many neural voices offered by the Speech service. I referred to this link to synthesize speech with the Speech service; see this GitHub for the complete code.
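If you prefer to pick a voice per request rather than on the config, you can also pass SSML. A small sketch, assuming the same synthesizer as above (the voice name en-US-JennyNeural is only an illustration):

string ssml =
    "<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='en-US'>" +
    "  <voice name='en-US-JennyNeural'>Hello from a different neural voice.</voice>" +
    "</speak>";

// SpeakSsmlAsync synthesizes the SSML document with the voice named inside it.
using (var result = synthesizer.SpeakSsmlAsync(ssml).Result)
{
    Debug.Log(result.Reason);
}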

You can use the code below to save the output as a .wav file; you can check the file once synthesis has finished.

You can also check other states, such as whether the audio has started, is still being generated, or has completed, by inspecting ResultReason:
 if (result.Reason == ResultReason.SynthesizingAudioCompleted)
 {
     using var stream = AudioDataStream.FromResult(result);
     await stream.SaveToWaveFileAsync("output.wav");
 }
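To then pull that saved .wav back from local storage at runtime and play it on an AudioSource (the part the question was stuck on), one option is a coroutine built on UnityWebRequestMultimedia. A sketch under those assumptions; the class name, file path, and AudioSource field are placeholders, and UnityWebRequest.Result needs Unity 2020.1 or newer:

using System.Collections;
using UnityEngine;
using UnityEngine.Networking;

public class WavClipLoader : MonoBehaviour
{
    public AudioSource audioSource;

    // Call e.g. StartCoroutine(LoadAndPlay(Application.persistentDataPath + "/output.wav"));
    public IEnumerator LoadAndPlay(string path)
    {
        // The "file://" prefix tells UnityWebRequest to read from local storage.
        using (var request = UnityWebRequestMultimedia.GetAudioClip("file://" + path, AudioType.WAV))
        {
            yield return request.SendWebRequest();

            if (request.result == UnityWebRequest.Result.Success)
            {
                AudioClip clip = DownloadHandlerAudioClip.GetContent(request);
                audioSource.clip = clip;
                audioSource.Play();
            }
            else
            {
                Debug.LogError(request.error);
            }
        }
    }
}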

See this link for a guide to setting up Oculus in Unity, and this link for setting it up on desktop.

