Azure speech recognition in .NET produces duplicate results

Problem description (votes: 0, answers: 1)

I am building real-time microphone speech-to-text transcription with .NET on Azure. However, a single spoken sentence produces many duplicated sentences in the output, and I need help finding what causes the duplication. If I say into the microphone:

"Hello, my name is John",

the code produces the following output:

"Hello, my name is John Hello my name is John Hello my name is John Hello my name is John Hello my name is John ..." (the same sentence repeated more than twenty times)

What I want is a real-time transcription in which each sentence appears exactly once, with no repetition, the way it works in Azure Speech Studio's real-time speech to text.

Here is my code:

Program.cs:

using Microsoft.AspNetCore.Hosting;
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using System;
using System.Linq;
using [ ].Hubs;
using [ ].Services;
using [ ].Data;

public class Program
{
    public static void Main(string[] args)
    {
        CreateHostBuilder(args).Build().Run();
    }

    public static IHostBuilder CreateHostBuilder(string[] args) =>
        Host.CreateDefaultBuilder(args)
            .ConfigureWebHostDefaults(webBuilder =>
            {
                webBuilder.UseUrls("http://localhost:5001");
                webBuilder.ConfigureServices((context, services) =>
                {
                    services.AddControllers();
                    services.AddSignalR();

                    // Configure CORS
                    services.AddCors(options =>
                    {
                        options.AddDefaultPolicy(builder =>
                        {
                            builder.WithOrigins("http://localhost:3000")
                                   .AllowAnyHeader()
                                   .AllowAnyMethod()
                                   .AllowCredentials();
                        });
                    });

                    services.AddSingleton<SpeechService>();
                    services.AddSingleton<TranscriptionService>();

                    // Add DbContext and hosted service
                    services.AddDbContext<CosmosDbContext>();
                    services.AddHostedService<CosmosDbTestService>();
                })
                .Configure((context, app) =>
                {
                    if (context.HostingEnvironment.IsDevelopment())
                    {
                        app.UseDeveloperExceptionPage();
                    }

                    // Enable CORS
                    app.UseCors();

                    app.UseRouting();

                    app.UseEndpoints(endpoints =>
                    {
                        endpoints.MapControllers();
                        endpoints.MapHub<TranscriptionHub>("/transcriptionHub");
                    });
                });
            })
            .ConfigureLogging(logging =>
            {
                logging.ClearProviders();
                logging.AddConsole();
            });
}

SpeechService.cs (/Services):

using Microsoft.CognitiveServices.Speech;
using Microsoft.CognitiveServices.Speech.Audio;

namespace [ ].Services
{
    public class SpeechService
    {
        private readonly SpeechRecognizer _speechRecognizer;
        private bool isRecognizing = false;
        public event Action<string>? OnRecognizing;
        public event Action<string>? OnRecognized;

        public SpeechService()
        {
            var subscriptionKey = Environment.GetEnvironmentVariable("AZURE_SPEECH_KEY");
            var region = Environment.GetEnvironmentVariable("AZURE_SPEECH_REGION");

            if (string.IsNullOrEmpty(subscriptionKey) || string.IsNullOrEmpty(region))
            {
                throw new InvalidOperationException("Azure Speech Service key and region must be provided via environment variables.");
            }

            var speechConfig = SpeechConfig.FromSubscription(subscriptionKey, region);
            speechConfig.SpeechRecognitionLanguage = "de-DE";
            speechConfig.EnableDictation(); // Enable dictation mode for explicit punctuation

            var audioConfig = AudioConfig.FromDefaultMicrophoneInput();
            _speechRecognizer = new SpeechRecognizer(speechConfig, audioConfig);

            _speechRecognizer.Recognizing += (s, e) =>
            {
                if (e.Result.Reason == ResultReason.RecognizingSpeech)
                {
                    OnRecognizing?.Invoke(e.Result.Text);
                }
            };

            _speechRecognizer.Recognized += (s, e) =>
            {
                if (e.Result.Reason == ResultReason.RecognizedSpeech)
                {
                    OnRecognized?.Invoke(e.Result.Text);
                }
            };

            _speechRecognizer.Canceled += (s, e) =>
            {
                isRecognizing = false;
                Console.WriteLine($"Recognition canceled: {e.Reason}, {e.ErrorDetails}");
            };

            _speechRecognizer.SessionStopped += (s, e) =>
            {
                isRecognizing = false;
                Console.WriteLine($"Session stopped: {e.SessionId}");
            };
        }

        public async Task StartRecognitionAsync()
        {
            if (!isRecognizing)
            {
                isRecognizing = true;
                await _speechRecognizer.StartContinuousRecognitionAsync().ConfigureAwait(false);
            }
        }

        public async Task StopRecognitionAsync()
        {
            if (isRecognizing)
            {
                await _speechRecognizer.StopContinuousRecognitionAsync().ConfigureAwait(false);
                isRecognizing = false;
            }
        }
    }
}

TranscriptionService.cs (/Services):

using Microsoft.AspNetCore.SignalR;
using System.Collections.Concurrent;
using [ ].Hubs;

namespace [ ].Services
{
    public class TranscriptionService
    {
        private readonly IHubContext<TranscriptionHub> _hubContext;
        private readonly ConcurrentDictionary<string, string> _connections = new ConcurrentDictionary<string, string>();

        public TranscriptionService(IHubContext<TranscriptionHub> hubContext)
        {
            _hubContext = hubContext;
        }

        public void AddConnection(string connectionId)
        {
            _connections[connectionId] = connectionId;
        }

        public void RemoveConnection(string connectionId)
        {
            _connections.TryRemove(connectionId, out _);
        }

        public async Task BroadcastRecognizing(string text)
        {
            foreach (var connectionId in _connections.Keys)
            {
                await _hubContext.Clients.Client(connectionId).SendAsync("ReceiveRecognizing", text);
            }
        }

        public async Task BroadcastRecognized(string text)
        {
            foreach (var connectionId in _connections.Keys)
            {
                await _hubContext.Clients.Client(connectionId).SendAsync("ReceiveRecognized", text);
            }
        }
    }
}

TranscriptionHub.cs (/Hubs):

using Microsoft.AspNetCore.SignalR;
using [ ].Services;

namespace [ ].Hubs
{
    public class TranscriptionHub : Hub
    {
        private readonly SpeechService _speechService;
        private readonly TranscriptionService _transcriptionService;

        public TranscriptionHub(SpeechService speechService, TranscriptionService transcriptionService)
        {
            _speechService = speechService;
            _transcriptionService = transcriptionService;
        }

        public override async Task OnConnectedAsync()
        {
            _transcriptionService.AddConnection(Context.ConnectionId);
            _speechService.OnRecognizing += HandleRecognizing;
            _speechService.OnRecognized += HandleRecognized;
            await base.OnConnectedAsync();
        }

        public override async Task OnDisconnectedAsync(Exception? exception)
        {
            _transcriptionService.RemoveConnection(Context.ConnectionId);
            _speechService.OnRecognizing -= HandleRecognizing;
            _speechService.OnRecognized -= HandleRecognized;
            await base.OnDisconnectedAsync(exception);
        }

        private async void HandleRecognizing(string text)
        {
            await _transcriptionService.BroadcastRecognizing(text);
        }

        private async void HandleRecognized(string text)
        {
            await _transcriptionService.BroadcastRecognized(text);
        }

        public async Task StartTranscription()
        {
            await _speechService.StartRecognitionAsync();
        }

        public async Task StopTranscription()
        {
            await _speechService.StopRecognitionAsync();
        }
    }
}
c# asp.net azure speech-recognition speech-to-text
1 Answer

0 votes

The problem comes from the way continuous recognition is handled with the Azure Cognitive Services Speech SDK. Continuous recognition is designed to deliver both interim results (partially recognized phrases) and final results (fully recognized phrases); if these are not handled properly, the output can appear duplicated. You are currently handling interim and final results in the same way, which produces the duplicated output. To fix this, handle the interim results (the Recognizing event) separately from the final results (the Recognized event).
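In essence, the fix is to route the two events differently. Here is a minimal sketch of the pattern (recognizer stands for your SpeechRecognizer instance; UpdatePartialTranscript and AppendFinalTranscript are hypothetical placeholders for whatever your application does with the text):

// Interim hypotheses: each Recognizing event REPLACES the previous partial text.
recognizer.Recognizing += (s, e) =>
{
    if (e.Result.Reason == ResultReason.RecognizingSpeech)
        UpdatePartialTranscript(e.Result.Text); // overwrite, never append
};

// Final phrase: each Recognized event is APPENDED to the transcript exactly once.
recognizer.Recognized += (s, e) =>
{
    if (e.Result.Reason == ResultReason.RecognizedSpeech)
        AppendFinalTranscript(e.Result.Text); // append once
};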

Here is how you can modify the code to handle the two events correctly:

For reference, I used the Azure documentation on speech to text and on real-time speech to text.


using Microsoft.CognitiveServices.Speech;
using Microsoft.CognitiveServices.Speech.Audio;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;

namespace YourNamespace.Services
{
    public class SpeechService
    {
        private readonly SpeechRecognizer _speechRecognizer;
        private bool isRecognizing = false;
        private string lastRecognizedText = string.Empty;
        public event Action<string>? OnRecognizing;
        public event Action<string>? OnRecognized;

        public SpeechService()
        {
            var subscriptionKey = Environment.GetEnvironmentVariable("AZURE_SPEECH_KEY");
            var region = Environment.GetEnvironmentVariable("AZURE_SPEECH_REGION");

            if (string.IsNullOrEmpty(subscriptionKey) || string.IsNullOrEmpty(region))
            {
                throw new InvalidOperationException("Azure Speech Service key and region must be provided via environment variables.");
            }

            var speechConfig = SpeechConfig.FromSubscription(subscriptionKey, region);
            speechConfig.SpeechRecognitionLanguage = "de-DE"; // or "en-US"
            speechConfig.EnableDictation();

            // Request word-level timestamps before creating the recognizer;
            // the recognizer snapshots its config, so setting this afterwards has no effect.
            speechConfig.RequestWordLevelTimestamps();

            var audioConfig = AudioConfig.FromDefaultMicrophoneInput();
            _speechRecognizer = new SpeechRecognizer(speechConfig, audioConfig);

            _speechRecognizer.Recognizing += (s, e) =>
            {
                if (e.Result.Reason == ResultReason.RecognizingSpeech)
                {
                    OnRecognizing?.Invoke(e.Result.Text);
                    Console.WriteLine($"RECOGNIZING: {e.Result.Text}");
                    // Console.WriteLine($"Offset in Ticks: {e.Result.OffsetInTicks}");
                    // Console.WriteLine($"Duration in Ticks: {e.Result.Duration.Ticks}");
                }
            };

            _speechRecognizer.Recognized += (s, e) =>
            {
                if (e.Result.Reason == ResultReason.RecognizedSpeech && !string.Equals(e.Result.Text, lastRecognizedText, StringComparison.OrdinalIgnoreCase))
                {
                    lastRecognizedText = e.Result.Text;
                    OnRecognized?.Invoke(e.Result.Text);
                    Console.WriteLine($"RECOGNIZED: Text={e.Result.Text}");
                    // Console.WriteLine($"Offset in Ticks: {e.Result.OffsetInTicks}");
                    // Console.WriteLine($"Duration in Ticks: {e.Result.Duration.Ticks}");

                    var detailedResults = e.Result.Best();
                    if (detailedResults != null && detailedResults.Any())
                    {
                        var bestResults = detailedResults.First();
                        Console.WriteLine($"\tConfidence: {bestResults.Confidence}\n\tText: {bestResults.Text}\n\tLexicalForm: {bestResults.LexicalForm}\n\tNormalizedForm: {bestResults.NormalizedForm}\n\tMaskedNormalizedForm: {bestResults.MaskedNormalizedForm}");
                        Console.WriteLine($"\tWord-level timing:");
                        Console.WriteLine($"\t\tWord | Offset | Duration");
                        Console.WriteLine($"\t\t----- | ----- | ----- ");

                        foreach (var word in bestResults.Words)
                        {
                            Console.WriteLine($"\t\t{word.Word} | {word.Offset} | {word.Duration}");
                        }
                    }
                }
            };

            _speechRecognizer.Canceled += (s, e) =>
            {
                isRecognizing = false;
                Console.WriteLine($"Recognition canceled: {e.Reason}, {e.ErrorDetails}");
            };

            _speechRecognizer.SessionStopped += (s, e) =>
            {
                isRecognizing = false;
                Console.WriteLine($"Session stopped: {e.SessionId}");
            };
        }

        public async Task StartRecognitionAsync()
        {
            if (!isRecognizing)
            {
                isRecognizing = true;
                await _speechRecognizer.StartContinuousRecognitionAsync().ConfigureAwait(false);
            }
        }

        public async Task StopRecognitionAsync()
        {
            if (isRecognizing)
            {
                await _speechRecognizer.StopContinuousRecognitionAsync().ConfigureAwait(false);
                isRecognizing = false;
            }
        }
    }
}

Output: (screenshot of the console output)
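The same separation has to be respected by whatever consumes the SignalR hub, otherwise the stream of partial hypotheses will still pile up on screen. Below is a minimal sketch of a .NET console client (it assumes the Microsoft.AspNetCore.SignalR.Client package and .NET 6+ top-level statements; the URL and method names match the hub above, but treat it as an illustration rather than a drop-in client):

using Microsoft.AspNetCore.SignalR.Client;

var connection = new HubConnectionBuilder()
    .WithUrl("http://localhost:5001/transcriptionHub")
    .WithAutomaticReconnect()
    .Build();

// Interim result: redraw the in-progress line in place (a real UI would clear it first).
connection.On<string>("ReceiveRecognizing", text =>
{
    Console.Write($"\rRECOGNIZING: {text}");
});

// Final result: commit the finished sentence exactly once, on its own line.
connection.On<string>("ReceiveRecognized", text =>
{
    Console.WriteLine($"\rRECOGNIZED:  {text}");
});

await connection.StartAsync();
await connection.InvokeAsync("StartTranscription");

Console.ReadLine(); // speak into the microphone, press Enter to stop
await connection.InvokeAsync("StopTranscription");
await connection.DisposeAsync();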
