How can I stream text-to-speech audio on iOS using the Speech SDK?


I am trying to stream the audio I get from the Speech SDK using SPXPushAudioOutputStream. I receive all the data without any problems and can write it to a WAV or MP3 file, then play it back with the code below.

import SwiftUI
import AVFoundation
import MicrosoftCognitiveServicesSpeech

struct ContentView: View {
    @State private var inputText = """
    Die Gesundheitspolitik bleibt ein hartes Pflaster für Reformen. Bundesrätin Elisabeth Baume-Schneider forderte alle Akteure am Sonntag «nachdrücklich» auf, ihren Teil der Verantwortung zu übernehmen und «konkrete, mehrheitsfähige Sparvorschläge» vorzulegen. Mit Blick auf die vergangenen Jahrzehnte kann man darüber nur schmunzeln.
    Solange besagte Akteure ihren Besitzstand eisern verteidigen und solange die politischen Kräfte aus allen Lagern ihrem Lobbydruck nachgeben, wird sich nichts ändern. Auch in den Kantonen überwiegen die Hemmungen, Spitäler zu schliessen und über die Grenzen hinweg die Zusammenarbeit zu verstärken. Ausnahmen bestätigen die Regel.
    Das sagen die Ökonomen
    Deshalb stellt sich die Frage, ob man nicht das zunehmend absurde Kopfprämiensystem abschaffen und auf ein durch Steuergelder finanziertes Gesundheitswesen umstellen sollte, wie in anderen Ländern. watson hat diese Frage den Gesundheitsökonomen Heinz Locher und Willy Oggier gestellt – und interessante Antworten erhalten.
    """
    @State private var resultText = ""
    @State private var isPlaying = false
    @State private var audioPlayer: AVAudioPlayer?
    @State private var synthesisCompleted = false
    
    let speechKey = "censored"
    let serviceRegion = "switzerlandnorth"
    
    var body: some View {
        VStack {
            TextField("Enter text to synthesize", text: $inputText)
                .textFieldStyle(RoundedBorderTextFieldStyle())
                .padding()
            
            Button(action: synthesisToPushAudioOutputStream) {
                Text("Synthesize Speech")
            }
            .padding()
            
            Button(action: playAudio) {
                Text(isPlaying ? "Stop" : "Play")
            }
            .padding()
            .disabled(!synthesisCompleted)
            
            Text(resultText)
                .padding()
        }
        .onChange(of: resultText) { newValue in
            debug("Result text changed to: \(newValue)", function: "body.onChange")
            synthesisCompleted = newValue.contains("Speech synthesis completed")
            debug("Synthesis completed: \(synthesisCompleted)", function: "body.onChange")
        }
    }
    
    private func synthesisToPushAudioOutputStream() {
        let startTime = Date()
        debug("Starting speech synthesis...", function: #function)
        let filePath = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)[0].appendingPathComponent("pushStream.mp3")
        debug("File path: \(filePath.path)", function: #function)
        
        if !FileManager.default.fileExists(atPath: filePath.path) {
            debug("File doesn't exist. Creating new file...", function: #function)
            FileManager.default.createFile(atPath: filePath.path, contents: nil, attributes: nil)
        } else {
            debug("File already exists. Will overwrite.", function: #function)
        }
        
        guard let fileHandle = try? FileHandle(forWritingTo: filePath) else {
            debug("Failed to open file handle", function: #function)
            updateResultText("Failed to open file at \(filePath.path)")
            return
        }
        // Truncate any previous contents, so stale bytes from a longer old file
        // can't remain after the newly written audio.
        try? fileHandle.truncate(atOffset: 0)
        debug("File handle opened successfully", function: #function)
        
        var totalBytesWritten: UInt = 0
        let stream = SPXPushAudioOutputStream(writeHandler: { data -> UInt in
            fileHandle.write(data)
            totalBytesWritten += UInt(data.count)
            debug("Wrote \(data.count) bytes. Total: \(totalBytesWritten) bytes", function: "SPXPushAudioOutputStream.writeHandler")
            return UInt(data.count)
        }, closeHandler: {
            fileHandle.closeFile()
            debug("File closed. Total bytes written: \(totalBytesWritten)", function: "SPXPushAudioOutputStream.closeHandler")
        })!
        
        debug("Configuring audio and speech...", function: #function)
        let audioConfig = try? SPXAudioConfiguration(streamOutput: stream)
        let speechConfig = try? SPXSpeechConfiguration(subscription: speechKey, region: serviceRegion)
        
        guard let config = speechConfig, let audio = audioConfig else {
            debug("Failed to create speech or audio configuration", function: #function)
            updateResultText("Speech Config Error")
            return
        }
        
        config.setSpeechSynthesisOutputFormat(.audio24Khz160KBitRateMonoMp3)
        debug("Set output format to MP3", function: #function)
        
        updateResultText("Synthesizing...")
        
        debug("Creating speech synthesizer...", function: #function)
        let synthesizer = try? SPXSpeechSynthesizer(speechConfiguration: config, audioConfiguration: audio)
        guard let synth = synthesizer else {
            debug("Failed to create speech synthesizer", function: #function)
            updateResultText("Speech Synthesis Error")
            return
        }
        
        debug("Starting text-to-speech...", function: #function)
        let speechResult = try? synth.speakText(inputText)
        if let result = speechResult {
            if result.reason == SPXResultReason.canceled {
                let details = try! SPXSpeechSynthesisCancellationDetails(fromCanceledSynthesisResult: result)
                debug("Speech synthesis canceled: \(details.errorDetails ?? "Unknown error")", function: #function)
                updateResultText("Canceled: \(details.errorDetails ?? "Unknown error")")
            } else if result.reason == SPXResultReason.synthesizingAudioCompleted {
                let synthesisTime = Date().timeIntervalSince(startTime)
                debug("Speech synthesis completed successfully in \(String(format: "%.2f", synthesisTime)) seconds", function: #function)
                updateResultText("Speech synthesis completed in \(String(format: "%.2f", synthesisTime)) seconds.")
                
                // Add a small delay to ensure file writing is complete
                DispatchQueue.main.asyncAfter(deadline: .now() + 0.5) {
                    // Get file size
                    do {
                        let attributes = try FileManager.default.attributesOfItem(atPath: filePath.path)
                        let fileSize = attributes[.size] as? Int64 ?? 0
                        debug("File size: \(fileSize) bytes", function: "DispatchQueue.asyncAfter")
                    } catch {
                        debug("Error getting file size: \(error)", function: "DispatchQueue.asyncAfter")
                    }
                    
                    // Get audio duration
                    let asset = AVAsset(url: filePath)
                    let duration = asset.duration
                    let durationSeconds = CMTimeGetSeconds(duration)
                    debug("Audio duration: \(durationSeconds) seconds", function: "DispatchQueue.asyncAfter")
                    self.updateResultText("Speech synthesis completed in \(String(format: "%.2f", synthesisTime)) seconds. Audio Duration: \(String(format: "%.2f", durationSeconds)) seconds, Size: \(FileManager.default.sizeFormatted(ofPath: filePath.path) ?? "Unknown")")
                }
            } else {
                debug("Speech synthesis failed with reason: \(result.reason)", function: #function)
                updateResultText("Speech synthesis error.")
            }
        } else {
            debug("Speech synthesis failed (no result)", function: #function)
            updateResultText("Speech synthesis error.")
        }
    }
    
    private func updateResultText(_ text: String) {
        DispatchQueue.main.async {
            self.resultText = text
            debug("Updated result text: \(text)", function: #function)
            self.synthesisCompleted = text.contains("Speech synthesis completed")
            debug("Synthesis completed: \(self.synthesisCompleted)", function: #function)
        }
    }
    
    private func playAudio() {
        if isPlaying {
            audioPlayer?.stop()
            isPlaying = false
            debug("Audio playback stopped", function: #function)
        } else {
            let filePath = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)[0].appendingPathComponent("pushStream.mp3")
            debug("Attempting to play audio from: \(filePath.path)", function: #function)
            
            do {
                audioPlayer = try AVAudioPlayer(contentsOf: filePath)
                audioPlayer?.play()
                isPlaying = true
                debug("Audio playback started", function: #function)
                if let duration = audioPlayer?.duration {
                    debug("Audio duration: \(duration) seconds", function: #function)
                }
            } catch {
                updateResultText("Error playing audio: \(error.localizedDescription)")
                debug("Detailed error playing audio: \(error)", function: #function)
            }
        }
    }
    
    private func debug(_ message: String, function: String) {
        let timestamp = DateFormatter.localizedString(from: Date(), dateStyle: .none, timeStyle: .medium)
        print("[\(timestamp)] [\(function)] \(message)")
    }
}

// Add this extension for formatting file size
extension FileManager {
    func sizeFormatted(ofPath path: String) -> String? {
        guard let attributes = try? attributesOfItem(atPath: path) else { return nil }
        let size = attributes[.size] as? Int64 ?? 0
        return ByteCountFormatter.string(fromByteCount: size, countStyle: .file)
    }
}

But I can't for the life of me figure out how to stream the playback. I know very little about AVPlayer, which obviously doesn't help, but I have tried every approach I could find while searching the web... Any pointers toward a potential solution would be greatly appreciated!

swift audio-streaming azure-cognitive-services
1 Answer

To stream the audio generated by the Speech SDK through SPXPushAudioOutputStream, you can modify your existing code to play the audio while it is still being streamed. Below, SPXPushAudioOutputStream is configured to feed its data into an AVAudioEngine for real-time playback.

private func synthesisToPushAudioOutputStream() {
    let startTime = Date()
    debug("Starting speech synthesis...", function: #function)
    
    guard let audioEngine = audioEngine else {
        debug("Audio engine is not initialized", function: #function)
        updateResultText("Audio Engine Error")
        return
    }
    
    // Prepare the audio engine and player node. The SDK will deliver raw 16 kHz
    // 16-bit mono PCM (see the output format below), so connect the player node
    // with a matching mono Float32 format; the engine resamples to the hardware
    // rate on its own.
    audioEngine.attach(audioPlayerNode)
    let format = AVAudioFormat(commonFormat: .pcmFormatFloat32, sampleRate: 16000, channels: 1, interleaved: false)!
    audioEngine.connect(audioPlayerNode, to: audioEngine.mainMixerNode, format: format)
    
    // The engine must be running and the node playing before buffers are scheduled.
    do {
        try audioEngine.start()
    } catch {
        debug("Failed to start audio engine: \(error)", function: #function)
        updateResultText("Audio Engine Error")
        return
    }
    audioPlayerNode.play()
    
    let stream = SPXPushAudioOutputStream(writeHandler: { data -> UInt in
        if let pcmBuffer = self.convertDataToPCMBuffer(data: data, format: format) {
            self.audioPlayerNode.scheduleBuffer(pcmBuffer, completionHandler: nil)
        }
        return UInt(data.count)
    }, closeHandler: {
        // Don't stop the engine here: buffers scheduled above may still be playing.
        debug("Push stream closed", function: "SPXPushAudioOutputStream.closeHandler")
    })!
    
    let audioConfig = try? SPXAudioConfiguration(streamOutput: stream)
    let speechConfig = try? SPXSpeechConfiguration(subscription: speechKey, region: serviceRegion)
    
    guard let config = speechConfig, let audio = audioConfig else {
        debug("Failed to create speech or audio configuration", function: #function)
        updateResultText("Speech Config Error")
        return
    }
    
    // Use a raw PCM output format so the bytes can be fed to AVAudioEngine directly
    // (compressed MP3 frames cannot be scheduled on an AVAudioPlayerNode as-is).
    config.setSpeechSynthesisOutputFormat(.raw16Khz16BitMonoPcm)
    debug("Set output format to raw 16 kHz 16-bit mono PCM", function: #function)
    
    updateResultText("Synthesizing...")
    
    let synthesizer = try? SPXSpeechSynthesizer(speechConfiguration: config, audioConfiguration: audio)
    guard let synth = synthesizer else {
        debug("Failed to create speech synthesizer", function: #function)
        updateResultText("Speech Synthesis Error")
        return
    }
    
    debug("Starting text-to-speech...", function: #function)
    let speechResult = try? synth.speakText(inputText)
    if let result = speechResult {
        if result.reason == SPXResultReason.canceled {
            let details = try! SPXSpeechSynthesisCancellationDetails(fromCanceledSynthesisResult: result)
            debug("Speech synthesis canceled: \(details.errorDetails ?? "Unknown error")", function: #function)
            updateResultText("Canceled: \(details.errorDetails ?? "Unknown error")")
        } else if result.reason == SPXResultReason.synthesizingAudioCompleted {
            let synthesisTime = Date().timeIntervalSince(startTime)
            debug("Speech synthesis completed successfully in \(String(format: "%.2f", synthesisTime)) seconds", function: #function)
            updateResultText("Speech synthesis completed in \(String(format: "%.2f", synthesisTime)) seconds.")
            synthesisCompleted = true
        } else {
            debug("Speech synthesis failed with reason: \(result.reason)", function: #function)
            updateResultText("Speech synthesis error.")
        }
    } else {
        debug("Speech synthesis failed (no result)", function: #function)
        updateResultText("Speech synthesis error.")
    }
}

private func convertDataToPCMBuffer(data: Data, format: AVAudioFormat) -> AVAudioPCMBuffer? {
    // Convert the SDK's 16-bit integer PCM to the engine's Float32 format, scaling to [-1, 1].
    let frameCount = AVAudioFrameCount(data.count / MemoryLayout<Int16>.size)
    guard let audioBuffer = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: frameCount),
          let channelData = audioBuffer.floatChannelData?[0] else { return nil }
    audioBuffer.frameLength = frameCount
    data.withUnsafeBytes { rawBuffer in
        let samples = rawBuffer.bindMemory(to: Int16.self)
        for i in 0..<Int(frameCount) {
            channelData[i] = Float(samples[i]) / Float(Int16.max)
        }
    }
    return audioBuffer
}
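
The snippet above references an audioEngine and an audioPlayerNode that the question's ContentView never declares. A minimal sketch of the assumed declarations (the storage choice here is my assumption, not part of the original answer):

// Assumed property declarations for ContentView (not shown in the answer).
// Both are reference types; @State keeps the same instances alive across
// SwiftUI view updates.
@State private var audioEngine: AVAudioEngine? = AVAudioEngine()
@State private var audioPlayerNode = AVAudioPlayerNode()

Alternatively, here is an embedded (on-device) speech synthesis sample in UIKit, which synthesizes to the speaker and saves the audio to a WAV file: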

import UIKit
import MicrosoftCognitiveServicesSpeech

let EmbeddedSpeechSynthesisVoicesFolderName = "TTS"
let EmbeddedSpeechSynthesisVoiceName = "YourEmbeddedSpeechSynthesisVoiceName"
let EmbeddedSpeechSynthesisVoiceKey = "YourEmbeddedSpeechSynthesisVoiceKey"

class ViewController: UIViewController, UITextFieldDelegate {
    
    var textField: UITextField!
    var synthButton: UIButton!
    
    var inputText: String!
    var embeddedSpeechConfig: SPXEmbeddedSpeechConfiguration?
    
    override func viewDidLoad() {
        super.viewDidLoad()
        
        let bundle = Bundle(for: type(of: self))
        if let absoluteModelPath = bundle.path(forResource: EmbeddedSpeechSynthesisVoicesFolderName, ofType: nil) {
            do {
                embeddedSpeechConfig = try SPXEmbeddedSpeechConfiguration(fromPath: absoluteModelPath)
                embeddedSpeechConfig?.setSpeechSynthesisVoice(EmbeddedSpeechSynthesisVoiceName, key: EmbeddedSpeechSynthesisVoiceKey)
            } catch {
                print("Error: \(error) in initializing embedded speech configuration.")
                embeddedSpeechConfig = nil
            }
        } else {
            print("Error: Unable to locate the specified embedded speech synthesis voice.")
        }
        
        setupUI()
    }
    
    func setupUI() {
        textField = UITextField(frame: CGRect(x: 100, y: 250, width: 200, height: 50))
        textField.textColor = UIColor.black
        textField.borderStyle = UITextField.BorderStyle.roundedRect
        textField.placeholder = "Type something to synthesize."
        textField.delegate = self
        
        inputText = ""
        
        synthButton = UIButton(frame: CGRect(x: 100, y: 400, width: 200, height: 50))
        synthButton.setTitle("Synthesize", for: .normal)
        synthButton.addTarget(self, action: #selector(synthesisButtonClicked), for: .touchUpInside)
        synthButton.setTitleColor(UIColor.black, for: .normal)
        
        self.view.addSubview(textField)
        self.view.addSubview(synthButton)
    }
    
    func textField(_ textField: UITextField, shouldChangeCharactersIn range: NSRange, replacementString string: String) -> Bool {
        if let demotext = textField.text, let textRange = Range(range, in: demotext) {
            self.inputText = demotext.replacingCharacters(in: textRange, with: string)
        }
        return true
    }
    
    @objc func synthesisButtonClicked() {
        DispatchQueue.global(qos: .userInitiated).async {
            self.synthesisToWAV()
        }
    }
    
    func synthesisToWAV() {
        let synthesizer = try! SPXSpeechSynthesizer(embeddedSpeechConfiguration: embeddedSpeechConfig!)
        if inputText.isEmpty {
            return
        }
        
        do {
            // speakText plays the audio through the default speaker (the synthesizer
            // was created without an explicit audio configuration) and also returns
            // the synthesized data on the result.
            let result = try synthesizer.speakText(inputText)
            
            guard let audioData = result.audioData else {
                print("Error: Audio data is nil.")
                return
            }
            
            let documentsPath = NSSearchPathForDirectoriesInDomains(.documentDirectory, .userDomainMask, true)[0] as NSString
            let filePath = documentsPath.appendingPathComponent("output.wav")
            let fileURL = URL(fileURLWithPath: filePath)
            
            do {
                try audioData.write(to: fileURL)
                print("Speech synthesized and saved to: \(fileURL)")
            } catch {
                print("Error writing file: \(error)")
            }
            
        } catch {
            print("Error synthesizing speech: \(error)")
        }
    }
}    


The synthesisToWAV() method uses SPXSpeechSynthesizer to synthesize the text to speech and saves the synthesized audio as a .wav file.
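
If you also want to audition the saved file in the app, a minimal playback sketch could be added to the ViewController above (the playSavedWAV name and the savedPlayer property are illustrative additions, not part of the SDK sample; AVFoundation must be imported at the top of the file):

// Sketch: play back the output.wav written by synthesisToWAV().
// AVAudioPlayer must be retained (here, in a property), or playback
// stops as soon as the local reference is released.
var savedPlayer: AVAudioPlayer?

func playSavedWAV() {
    let documentsURL = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)[0]
    let fileURL = documentsURL.appendingPathComponent("output.wav")
    do {
        savedPlayer = try AVAudioPlayer(contentsOf: fileURL)
        savedPlayer?.play()
    } catch {
        print("Error playing audio: \(error)")
    }
}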

(Screenshots omitted: the app running, and the synthesized output file samples_swift_ios.wav.)

I used this Git repository as the basis for the Swift text-to-speech code.
