如何在 iOS 上使用语音 SDK 以流式方式播放文本转语音(TTS)的音频?

我正在尝试使用 SPXPushAudioOutputStream 流式传输从语音 SDK 获得的音频。我毫无问题地获取了所有数据,并且可以将其写入 wav 或 mp3,然后使用下面的代码进行播放。

/// Demo screen that synthesizes `inputText` with the Speech SDK into
/// `Documents/pushStream.mp3` via a push audio output stream and then plays
/// the finished file back with `AVAudioPlayer`.
struct ContentView: View {
    @State private var inputText = ""
    @State private var resultText = ""
    @State private var isPlaying = false
    @State private var audioPlayer: AVAudioPlayer?
    @State private var synthesisCompleted = false

    let speechKey = "censored"
    let serviceRegion = "switzerlandnorth"

    /// Location of the synthesized audio file; shared by synthesis and playback.
    private var outputFileURL: URL {
        FileManager.default
            .urls(for: .documentDirectory, in: .userDomainMask)[0]
            .appendingPathComponent("pushStream.mp3")
    }

    var body: some View {
        VStack {
            TextField("Enter text to synthesize", text: $inputText)
            Button(action: synthesisToPushAudioOutputStream) {
                Text("Synthesize Speech")
            }
            Button(action: playAudio) {
                Text(isPlaying ? "Stop" : "Play")
            }
        }
        .onChange(of: resultText) { newValue in
            debug("Result text changed to: \(newValue)", function: "body.onChange")
            synthesisCompleted = newValue.contains("Speech synthesis completed")
            debug("Synthesis completed: \(synthesisCompleted)", function: "body.onChange")
        }
    }

    /// Synthesizes `inputText` and writes every audio chunk the SDK pushes into
    /// the output file, reporting progress and errors through `updateResultText`.
    ///
    /// NOTE(review): `speakText` is called synchronously from the button action,
    /// so the calling thread waits for the synthesis result.
    private func synthesisToPushAudioOutputStream() {
        let startTime = Date()
        debug("Starting speech synthesis...", function: #function)

        let filePath = outputFileURL
        debug("File path: \(filePath.path)", function: #function)

        if FileManager.default.fileExists(atPath: filePath.path) {
            debug("File already exists. Will overwrite.", function: #function)
        } else {
            debug("File doesn't exist. Creating new file...", function: #function)
        }

        // Always (re)create the file: creating with nil contents leaves an empty
        // file, so a shorter synthesis never keeps stale bytes from a longer one.
        guard FileManager.default.createFile(atPath: filePath.path, contents: nil, attributes: nil),
              let fileHandle = try? FileHandle(forWritingTo: filePath) else {
            debug("Failed to open file handle", function: #function)
            updateResultText("Failed to open file at \(filePath.path)")
            return
        }
        debug("File handle opened successfully", function: #function)

        var totalBytesWritten: UInt = 0
        // Declared as optional so the guard below is valid whether or not the SDK
        // declares this initializer as failable.
        let pushStream: SPXPushAudioOutputStream? = SPXPushAudioOutputStream(writeHandler: { data -> UInt in
            do {
                try fileHandle.write(contentsOf: data)
            } catch {
                debug("Failed to write \(data.count) bytes: \(error)", function: "SPXPushAudioOutputStream.writeHandler")
                return 0
            }
            totalBytesWritten += UInt(data.count)
            debug("Wrote \(data.count) bytes. Total: \(totalBytesWritten) bytes", function: "SPXPushAudioOutputStream.writeHandler")
            return UInt(data.count)
        }, closeHandler: {
            try? fileHandle.close()
            debug("File closed. Total bytes written: \(totalBytesWritten)", function: "SPXPushAudioOutputStream.closeHandler")
        })
        guard let stream = pushStream else {
            try? fileHandle.close()
            debug("Failed to create push audio output stream", function: #function)
            updateResultText("Speech Config Error")
            return
        }

        debug("Configuring audio and speech...", function: #function)
        guard let audio = try? SPXAudioConfiguration(streamOutput: stream),
              let config = try? SPXSpeechConfiguration(subscription: speechKey, region: serviceRegion) else {
            debug("Failed to create speech or audio configuration", function: #function)
            updateResultText("Speech Config Error")
            return
        }

        // The file is named .mp3 and opened with AVAudioPlayer, so request MP3
        // output explicitly instead of relying on the SDK default.
        config.setSpeechSynthesisOutputFormat(.audio16Khz32KBitRateMonoMp3)
        debug("Set output format to MP3", function: #function)

        debug("Creating speech synthesizer...", function: #function)
        guard let synth = try? SPXSpeechSynthesizer(speechConfiguration: config, audioConfiguration: audio) else {
            debug("Failed to create speech synthesizer", function: #function)
            updateResultText("Speech Synthesis Error")
            return
        }

        debug("Starting text-to-speech...", function: #function)
        guard let result = try? synth.speakText(inputText) else {
            debug("Speech synthesis failed (no result)", function: #function)
            updateResultText("Speech synthesis error.")
            return
        }

        switch result.reason {
        case .canceled:
            // Reading the details can itself fail; fall back to a generic message
            // instead of crashing.
            let details = try? SPXSpeechSynthesisCancellationDetails(fromCanceledSynthesisResult: result)
            let reason = details?.errorDetails ?? "Unknown error"
            debug("Speech synthesis canceled: \(reason)", function: #function)
            updateResultText("Canceled: \(reason)")

        case .synthesizingAudioCompleted:
            let synthesisTime = Date().timeIntervalSince(startTime)
            let elapsed = String(format: "%.2f", synthesisTime)
            debug("Speech synthesis completed successfully in \(elapsed) seconds", function: #function)
            updateResultText("Speech synthesis completed in \(elapsed) seconds.")

            // Add a small delay to ensure file writing is complete
            DispatchQueue.main.asyncAfter(deadline: .now() + 0.5) {
                // Get file size
                do {
                    let attributes = try FileManager.default.attributesOfItem(atPath: filePath.path)
                    let fileSize = (attributes[.size] as? NSNumber)?.int64Value ?? 0
                    debug("File size: \(fileSize) bytes", function: "DispatchQueue.asyncAfter")
                } catch {
                    debug("Error getting file size: \(error)", function: "DispatchQueue.asyncAfter")
                }

                // Get audio duration
                let asset = AVAsset(url: filePath)
                let durationSeconds = CMTimeGetSeconds(asset.duration)
                debug("Audio duration: \(durationSeconds) seconds", function: "DispatchQueue.asyncAfter")
                self.updateResultText("Speech synthesis completed in \(elapsed) seconds. Audio Duration: \(String(format: "%.2f", durationSeconds)) seconds, Size: \(FileManager.default.sizeFormatted(ofPath: filePath.path) ?? "Unknown")")
            }

        default:
            debug("Speech synthesis failed with reason: \(result.reason)", function: #function)
            updateResultText("Speech synthesis error.")
        }
    }

    /// Publishes `text` as the current status on the main queue and derives
    /// `synthesisCompleted` from it.
    private func updateResultText(_ text: String) {
        DispatchQueue.main.async {
            self.resultText = text
            debug("Updated result text: \(text)", function: #function)
            self.synthesisCompleted = text.contains("Speech synthesis completed")
            debug("Synthesis completed: \(self.synthesisCompleted)", function: #function)
        }
    }

    /// Toggles playback of the synthesized file.
    ///
    /// NOTE(review): nothing resets `isPlaying` when the player reaches the end
    /// of the file on its own; that would need an `AVAudioPlayerDelegate`.
    private func playAudio() {
        if isPlaying {
            audioPlayer?.stop()
            isPlaying = false
            debug("Audio playback stopped", function: #function)
            return
        }

        let filePath = outputFileURL
        debug("Attempting to play audio from: \(filePath.path)", function: #function)
        do {
            let player = try AVAudioPlayer(contentsOf: filePath)
            // Keep a reference in state so the player outlives this call.
            audioPlayer = player
            guard player.play() else {
                debug("Audio playback could not be started", function: #function)
                return
            }
            isPlaying = true
            debug("Audio playback started", function: #function)
            debug("Audio duration: \(player.duration) seconds", function: #function)
        } catch {
            updateResultText("Error playing audio: \(error.localizedDescription)")
            debug("Detailed error playing audio: \(error)", function: #function)
        }
    }

    /// Prints `message` prefixed with the current time and the reporting function.
    private func debug(_ message: String, function: String) {
        let timestamp = DateFormatter.localizedString(from: Date(), dateStyle: .none, timeStyle: .medium)
        print("[\(timestamp)] [\(function)] \(message)")
    }
}

// Add this extension for formatting file size
extension FileManager {
    /// Returns the size of the item at `path` as a human-readable string
    /// (file-style units), or `nil` when the item's attributes cannot be read.
    ///
    /// - Parameter path: File-system path of the item to inspect.
    /// - Returns: The formatted size; a missing size attribute is treated as 0 bytes.
    func sizeFormatted(ofPath path: String) -> String? {
        guard let attributes = try? attributesOfItem(atPath: path) else { return nil }
        // Read the size through NSNumber so the lookup does not depend on
        // implicit bridging to Int64.
        let size = (attributes[.size] as? NSNumber)?.int64Value ?? 0
        return ByteCountFormatter.string(fromByteCount: size, countStyle: .file)
    }
}

但是我怎么也弄不明白如何实现流式播放(边合成边播放)。我对 AVPlayer 了解很少,这显然也帮不上忙;我已经尝试了在网上能找到的所有方法……如果有人能指出可行的解决思路,我将不胜感激!

swift audio-streaming azure-cognitive-services


要流式播放语音 SDK 生成的音频,您可以修改现有代码,在收到音频数据的同时进行播放。



/// Synthesizes `inputText` and plays the audio while it is still being
/// produced: every chunk the SDK pushes is converted to an `AVAudioPCMBuffer`
/// and scheduled on `audioPlayerNode`.
///
/// The SDK is asked for raw 16 kHz, 16-bit, mono PCM, and the player node is
/// connected with a matching 16 kHz mono Float32 format so the scheduled
/// buffers describe the samples correctly.
private func synthesisToPushAudioOutputStream() {
    let startTime = Date()
    debug("Starting speech synthesis...", function: #function)
    guard let audioEngine = audioEngine else {
        debug("Audio engine is not initialized", function: #function)
        updateResultText("Audio Engine Error")
        return
    }

    // Prepare audio engine and player node.
    // The format must describe the synthesized audio (16 kHz mono), not the
    // mixer's output format.
    guard let format = AVAudioFormat(commonFormat: .pcmFormatFloat32,
                                     sampleRate: 16_000,
                                     channels: 1,
                                     interleaved: false) else {
        debug("Failed to create playback audio format", function: #function)
        updateResultText("Audio Engine Error")
        return
    }
    if audioPlayerNode.engine == nil {
        audioEngine.attach(audioPlayerNode)
    }
    audioEngine.connect(audioPlayerNode, to: audioEngine.mainMixerNode, format: format)
    do {
        if !audioEngine.isRunning {
            try audioEngine.start()
        }
    } catch {
        debug("Failed to start audio engine: \(error)", function: #function)
        updateResultText("Audio Engine Error")
        return
    }
    // Start the node up front so buffers play as soon as they are scheduled.
    audioPlayerNode.play()

    // Declared as optional so the guard below is valid whether or not the SDK
    // declares this initializer as failable.
    let pushStream: SPXPushAudioOutputStream? = SPXPushAudioOutputStream(writeHandler: { data -> UInt in
        if let pcmBuffer = self.convertDataToPCMBuffer(data: data, format: format) {
            self.audioPlayerNode.scheduleBuffer(pcmBuffer, completionHandler: nil)
        }
        return UInt(data.count)
    }, closeHandler: {
        // The engine is deliberately left running here: buffers that are
        // already scheduled may still be playing when the stream closes.
        debug("Synthesis stream closed", function: "SPXPushAudioOutputStream.closeHandler")
    })
    guard let stream = pushStream else {
        debug("Failed to create push audio output stream", function: #function)
        updateResultText("Speech Config Error")
        return
    }

    guard let audio = try? SPXAudioConfiguration(streamOutput: stream),
          let config = try? SPXSpeechConfiguration(subscription: speechKey, region: serviceRegion) else {
        debug("Failed to create speech or audio configuration", function: #function)
        updateResultText("Speech Config Error")
        return
    }

    // Headerless PCM: every byte handed to the write handler is sample data.
    config.setSpeechSynthesisOutputFormat(.raw16Khz16BitMonoPcm)
    debug("Set output format to PCM", function: #function)

    guard let synth = try? SPXSpeechSynthesizer(speechConfiguration: config, audioConfiguration: audio) else {
        debug("Failed to create speech synthesizer", function: #function)
        updateResultText("Speech Synthesis Error")
        return
    }

    debug("Starting text-to-speech...", function: #function)
    guard let result = try? synth.speakText(inputText) else {
        debug("Speech synthesis failed (no result)", function: #function)
        updateResultText("Speech synthesis error.")
        return
    }

    switch result.reason {
    case .canceled:
        // Reading the details can itself fail; fall back to a generic message
        // instead of crashing.
        let details = try? SPXSpeechSynthesisCancellationDetails(fromCanceledSynthesisResult: result)
        let reason = details?.errorDetails ?? "Unknown error"
        debug("Speech synthesis canceled: \(reason)", function: #function)
        updateResultText("Canceled: \(reason)")
    case .synthesizingAudioCompleted:
        let synthesisTime = Date().timeIntervalSince(startTime)
        debug("Speech synthesis completed successfully in \(String(format: "%.2f", synthesisTime)) seconds", function: #function)
        updateResultText("Speech synthesis completed in \(String(format: "%.2f", synthesisTime)) seconds.")
        synthesisCompleted = true
    default:
        debug("Speech synthesis failed with reason: \(result.reason)", function: #function)
        updateResultText("Speech synthesis error.")
    }
}

/// Converts a chunk of 16-bit signed little-endian mono PCM into a Float32
/// `AVAudioPCMBuffer` in `format`.
///
/// - Parameters:
///   - data: Raw PCM bytes, two bytes per sample. A trailing odd byte is ignored.
///   - format: Float32 format of the buffer to create; with more than one
///     channel the mono sample is copied to every channel.
/// - Returns: A buffer holding `data.count / 2` frames, or `nil` when `data`
///   contains no complete sample or `format` cannot back a Float32 buffer.
private func convertDataToPCMBuffer(data: Data, format: AVAudioFormat) -> AVAudioPCMBuffer? {
    let bytesPerSample = MemoryLayout<Int16>.size
    let frameCount = data.count / bytesPerSample
    guard frameCount > 0,
          let audioBuffer = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: AVAudioFrameCount(frameCount)),
          let channels = audioBuffer.floatChannelData else {
        return nil
    }
    audioBuffer.frameLength = AVAudioFrameCount(frameCount)

    let channelCount = Int(format.channelCount)
    let stride = audioBuffer.stride
    data.withUnsafeBytes { (raw: UnsafeRawBufferPointer) in
        for frame in 0..<frameCount {
            // Assemble the sample byte-wise so no alignment is assumed.
            let low = UInt16(raw[frame * bytesPerSample])
            let high = UInt16(raw[frame * bytesPerSample + 1])
            let sample = Int16(bitPattern: low | (high << 8))
            // Scale the Int16 range to the [-1, 1) range used by Float32 audio.
            let value = Float(sample) / 32_768
            for channel in 0..<channelCount {
                channels[channel][frame * stride] = value
            }
        }
    }
    return audioBuffer
}

import UIKit
import MicrosoftCognitiveServicesSpeech

// Name of the bundle resource that is looked up as the embedded voices folder.
let EmbeddedSpeechSynthesisVoicesFolderName = "TTS"
// Voice name and key handed to the embedded speech configuration.
// NOTE(review): both values look like placeholders — replace them with real ones.
let EmbeddedSpeechSynthesisVoiceName = "YourEmbeddedSpeechSynthesisVoiceName"
let EmbeddedSpeechSynthesisVoiceKey = "YourEmbeddedSpeechSynthesisVoiceKey"

/// Screen with a text field and a button that synthesizes the entered text
/// with an embedded (on-device) voice and saves the audio to
/// `Documents/output.wav`.
class ViewController: UIViewController, UITextFieldDelegate {
    var textField: UITextField!
    var synthButton: UIButton!
    var inputText: String!
    var embeddedSpeechConfig: SPXEmbeddedSpeechConfiguration?

    /// Loads the embedded speech configuration from the bundle and builds the UI.
    override func viewDidLoad() {
        super.viewDidLoad()

        let bundle = Bundle(for: type(of: self))
        if let absoluteModelPath = bundle.path(forResource: EmbeddedSpeechSynthesisVoicesFolderName, ofType: nil) {
            do {
                embeddedSpeechConfig = try SPXEmbeddedSpeechConfiguration(fromPath: absoluteModelPath)
                embeddedSpeechConfig?.setSpeechSynthesisVoice(EmbeddedSpeechSynthesisVoiceName, key: EmbeddedSpeechSynthesisVoiceKey)
            } catch {
                print("Error: \(error) in initializing embedded speech configuration.")
                embeddedSpeechConfig = nil
            }
        } else {
            print("Error: Unable to locate the specified embedded speech synthesis voice.")
        }

        setupUI()
    }

    /// Creates the text field and the synthesize button and adds them to the view.
    func setupUI() {
        textField = UITextField(frame: CGRect(x: 100, y: 250, width: 200, height: 50))
        textField.textColor = UIColor.black
        textField.borderStyle = UITextField.BorderStyle.roundedRect
        textField.placeholder = "Type something to synthesize."
        textField.delegate = self
        view.addSubview(textField)

        inputText = ""

        synthButton = UIButton(frame: CGRect(x: 100, y: 400, width: 200, height: 50))
        synthButton.setTitle("Synthesize", for: .normal)
        synthButton.addTarget(self, action: #selector(synthesisButtonClicked), for: .touchUpInside)
        synthButton.setTitleColor(UIColor.black, for: .normal)
        view.addSubview(synthButton)
    }

    /// Mirrors every edit of the text field into `inputText`.
    func textField(_ textField: UITextField, shouldChangeCharactersIn range: NSRange, replacementString string: String) -> Bool {
        // The range must be resolved against the field's current text.
        if let demotext = textField.text, let textRange = Range(range, in: demotext) {
            self.inputText = demotext.replacingCharacters(in: textRange, with: string)
        }
        return true
    }

    /// Runs the synthesis off the main thread.
    @objc func synthesisButtonClicked() {
        DispatchQueue.global(qos: .userInitiated).async { [weak self] in
            self?.synthesisToWAV()
        }
    }

    /// Synthesizes `inputText` with the embedded voice and writes the resulting
    /// audio data to `output.wav` in the documents directory.
    ///
    /// Does nothing for empty input; every failure is reported via `print`.
    ///
    /// NOTE(review): the synthesizer is created without an audio configuration,
    /// so it presumably also renders to the default output device, and the
    /// container of `audioData` depends on the configured output format —
    /// confirm both against the SDK documentation.
    func synthesisToWAV() {
        guard let text = inputText, !text.isEmpty else {
            return
        }
        guard let config = embeddedSpeechConfig else {
            print("Error: Embedded speech configuration is not available.")
            return
        }

        do {
            let synthesizer = try SPXSpeechSynthesizer(embeddedSpeechConfiguration: config)
            let result = try synthesizer.speakText(text)

            if result.reason == SPXResultReason.canceled {
                let details = try? SPXSpeechSynthesisCancellationDetails(fromCanceledSynthesisResult: result)
                print("Error synthesizing speech: \(details?.errorDetails ?? "Unknown error")")
                return
            }
            guard let audioData = result.audioData else {
                print("Error: Audio data is nil.")
                return
            }

            let fileURL = FileManager.default
                .urls(for: .documentDirectory, in: .userDomainMask)[0]
                .appendingPathComponent("output.wav")
            do {
                try audioData.write(to: fileURL)
                print("Speech synthesized and saved to: \(fileURL)")
            } catch {
                print("Error writing file: \(error)")
            }
        } catch {
            print("Error synthesizing speech: \(error)")
        }
    }
}

我参考了这个 Git 仓库中的示例,用 Swift 语言实现了上述文本转语音的代码。

