根据这篇文档,可以在 Azure STT 服务的 JSON 输出中获取每个单词的置信度。问题是我找不到如何使用 Node.js 库来做到这一点(也不确定这是否可行)。
现有代码
目前我的 STT 类中有以下配置:
// Excerpt from the STT class setup: builds a push audio stream, the speech
// configuration and the recognizer, storing each on `this`.
// `currentConfig` and `speechSdk` are defined outside this excerpt.

// Create the push audio input stream with the correct audio format
// (PCM at the configured sample rate, 16 bits per sample, one channel).
const audioFormat = speechSdk.AudioStreamFormat.getWaveFormatPCM(
currentConfig.sampleRateHertz,
16, // Bits per sample
1 // Mono channel
);
this.audioStream = speechSdk.AudioInputStream.createPushStream(audioFormat);
// Configure the speech settings from the subscription key and region held on the instance
this.speechConfig = speechSdk.SpeechConfig.fromSubscription(this.subscriptionKey, this.region);
this.speechConfig.speechRecognitionLanguage = currentConfig.language;
// MORE CONFIG
// (marker: the additional speechConfig properties discussed below are inserted here)
this.speechConfig.outputFormat = speechSdk.OutputFormat.Detailed;
// Create the audio config from the push stream
this.audioConfig = speechSdk.AudioConfig.fromStreamInput(this.audioStream);
// Create the speech recognizer from the speech and audio configurations
this.recognizer = new speechSdk.SpeechRecognizer(this.speechConfig, this.audioConfig);
然后我可以通过以下方式实时识别音频:
// Handler for the recognizer's `recognizing` event: records the current
// hypothesis in the bounded `this.results` history.
//   s - event sender (unused)
//   e - event args; the hypothesis is exposed as `e.result`
this.recognizer.recognizing = (s, e) => {
    // The event args carry a single `result`; `e.results` does not exist and
    // always logged `undefined`.
    console.debug("AzureProvider: JSON Result: ", e.result);
    if (e.result.reason === speechSdk.ResultReason.NoMatch) {
        console.warn("AzureProvider: No match found.");
        return;
    }
    const result = {
        text: e.result.text,
        // Falls back to 100 whenever the result carries no `confidence` value.
        // NOTE(review): the SDK result object does not appear to expose
        // `confidence`, so this is presumably always 100 here — confirm.
        score: e.result.confidence ? Math.round(e.result.confidence * 100) : 100,
    };
    // Drop the oldest entry first so the history never exceeds `maxResults`.
    if (this.results.length === this.maxResults) {
        this.results.shift();
    }
    this.results.push(result);
};
// Handler for the recognizer's `recognized` event: post-processes the final
// transcript, emits it as a 'result' event and stores it in the bounded
// `this.results` history.
//   s - event sender (unused)
//   e - event args; the final result is exposed as `e.result`
this.recognizer.recognized = (s, e) => {
    // The event args carry a single `result`; `e.results` does not exist and
    // always logged `undefined`.
    console.debug("AzureProvider: JSON Result: ", e.result);
    // Test if no match was found
    if (e.result.reason === speechSdk.ResultReason.NoMatch) {
        console.warn("AzureProvider: No final match found.");
        return;
    }
    // Prefer a `confidence` field if one is present; otherwise read the score
    // of the top alternative from the detailed JSON payload, where the
    // Detailed output format reports it as NBest[0].Confidence.
    let confidence = e.result.confidence;
    if (typeof confidence !== "number") {
        try {
            const detailed = JSON.parse(e.result.json);
            const topAlternative = detailed && Array.isArray(detailed.NBest) ? detailed.NBest[0] : undefined;
            if (topAlternative && typeof topAlternative.Confidence === "number") {
                confidence = topAlternative.Confidence;
            }
        } catch (error) {
            // Best effort only: keep the default score when the payload is unusable.
            console.warn("AzureProvider: Could not read confidence from result JSON: ", error);
        }
    }
    const result = {
        text: this.postProcessTranscript(e.result.text),
        // Percentage score; defaults to 100 when no confidence is available.
        score: typeof confidence === "number" ? Math.round(confidence * 100) : 100,
    };
    // Print to the console
    console.debug("AzureProvider: Final recognized result: ", result);
    this.emit('result', result);
    // Drop the oldest entry first so the history never exceeds `maxResults`.
    if (this.results.length === this.maxResults) {
        this.results.shift();
    }
    this.results.push(result);
};
尝试实现词级置信度
为了启用单词级别的置信度,我尝试在上面的代码中(即 `// MORE CONFIG` 注释所在的位置)添加以下配置:
// Attempt 1: set the word-level confidence, word-level timestamp and N-best
// options on the speech configuration, addressing each property by its string name.
this.speechConfig.setProperty("SpeechServiceResponse_RequestWordLevelConfidence", "true");
this.speechConfig.setProperty("SpeechServiceResponse_RequestWordLevelTimestamps", "true");
this.speechConfig.setProperty("SpeechServiceResponse_NBestList", "5");
我还尝试添加以下内容:
// Attempt 2: pass "wordLevelConfidence" together with a URI-query-parameter channel.
// NOTE(review): a channel argument is normally accepted by setServiceProperty,
// not setProperty — confirm against the SDK reference.
this.speechConfig.setProperty("wordLevelConfidence", "true", speechSdk.ServicePropertyChannel.UriQueryParameter)
这些方法都不起作用,我很茫然……
通过对您的代码进行一些更改,我已成功获取了单词级置信度。我按如下方式配置了 setProperty:
// Switch on both word-level options (confidence and timestamps) on the
// speech configuration; each one is enabled with the string value "true".
for (const propertyId of [
    speechSdk.PropertyId.SpeechServiceResponse_RequestWordLevelConfidence,
    speechSdk.PropertyId.SpeechServiceResponse_RequestWordLevelTimestamps,
]) {
    speechConfig.setProperty(propertyId, "true");
}
我使用 fs.readFileSync 加载 .wav 文件,并通过 AudioConfig.fromWavFileInput 将其传递给识别器。
const audioConfig = speechSdk.AudioConfig.fromWavFileInput(fs.readFileSync(audioFile));
app.js:
const speechSdk = require("microsoft-cognitiveservices-speech-sdk");
const fs = require("fs");
// Placeholder credentials and the WAV file to transcribe.
const subscriptionKey = '<speechKey>';
const region = '<speeckRegion>';
const audioFile = 'audio/SPkam.wav';

// Speech configuration: en-US, detailed output, with word-level confidence
// and word-level timestamps switched on.
const speechConfig = speechSdk.SpeechConfig.fromSubscription(subscriptionKey, region);
speechConfig.speechRecognitionLanguage = "en-US";
speechConfig.outputFormat = speechSdk.OutputFormat.Detailed;
for (const wordLevelOption of [
    speechSdk.PropertyId.SpeechServiceResponse_RequestWordLevelConfidence,
    speechSdk.PropertyId.SpeechServiceResponse_RequestWordLevelTimestamps,
]) {
    speechConfig.setProperty(wordLevelOption, "true");
}

// Bail out early when the input file is missing.
const audioFileExists = fs.existsSync(audioFile);
if (audioFileExists === false) {
    console.error("Audio file not found:", audioFile);
    process.exit(1);
}

// Read the whole file into memory and hand it to the recognizer as WAV input.
const audioConfig = speechSdk.AudioConfig.fromWavFileInput(fs.readFileSync(audioFile));
const recognizer = new speechSdk.SpeechRecognizer(speechConfig, audioConfig);
console.log("Starting speech recognition...");

// The `recognizing` event is deliberately left without any handling.
recognizer.recognizing = (_sender, _event) => {};
// Handler for the `recognized` event: prints the transcription and, for each
// N-best alternative in the detailed JSON payload, every word with its own
// confidence followed by the alternative's overall confidence.
//   s - event sender (unused)
//   e - event args; the final result is exposed as `e.result`
recognizer.recognized = (s, e) => {
    if (e.result.reason === speechSdk.ResultReason.RecognizedSpeech) {
        console.log("Recognized:", e.result.text);
        try {
            // Read the payload through the public `json` property rather than
            // the private `privJson` field.
            const parsedJson = JSON.parse(e.result.json);
            console.log("Corrected JSON:", parsedJson);
            console.log("Transcription:", parsedJson.DisplayText);
            // A payload without NBest is not a parse failure; treat it as empty.
            const alternatives = Array.isArray(parsedJson.NBest) ? parsedJson.NBest : [];
            alternatives.forEach((bestResult) => {
                const words = bestResult.Words || [];
                console.log("Word-level confidence:");
                if (words.length > 0) {
                    words.forEach((word, index) => {
                        console.log(` Word ${index + 1}: "${word.Word}"`);
                        // Per-word score: must come from the word entry, not
                        // from the enclosing alternative.
                        console.log(` Confidence: ${word.Confidence}`);
                    });
                } else {
                    console.log(" No words found in this result.");
                }
                console.log(` Overall confidence for this transcription: ${bestResult.Confidence}`);
            });
        } catch (error) {
            console.error("Failed to parse JSON:", error);
        }
    } else if (e.result.reason === speechSdk.ResultReason.NoMatch) {
        console.warn("No match found.");
    }
};
// Handler for the `canceled` event: closes the recognizer and ends the
// process. Only a cancellation whose reason is `Error` is reported as a
// failure (exit code 1); any other reason, such as the audio input running
// out, ends the process with exit code 0.
//   s - event sender (unused)
//   e - event args carrying `reason` and, for errors, `errorDetails`
recognizer.canceled = (s, e) => {
    const isFailure = e.reason === speechSdk.CancellationReason.Error;
    if (isFailure) {
        console.error(`Recognition canceled: ${e.errorDetails}`);
    } else {
        // Show the symbolic reason name when the enum provides one.
        const reasonName = speechSdk.CancellationReason[e.reason] || e.reason;
        console.log(`Recognition canceled: ${reasonName}`);
    }
    recognizer.close();
    process.exit(isFailure ? 1 : 0);
};
// When the session stops: log it, release the recognizer and exit with
// status code 0.
recognizer.sessionStopped = (_sender, _event) => {
    const successExitCode = 0;
    console.log("Session stopped.");
    recognizer.close();
    process.exit(successExitCode);
};
// Start continuous recognition. No success callback is needed; the error
// callback makes a failed start visible instead of leaving it silent.
recognizer.startContinuousRecognitionAsync(
    undefined,
    (startError) => {
        console.error(`Failed to start recognition: ${startError}`);
        recognizer.close();
        process.exit(1);
    }
);
输出:
我成功获得了以下输出中的单词级别置信度。