
c# - How to use the Azure Speech to Text API to get an accurate transcript for Arabic audio


I used the Azure Speech to Text API to get text from audio. The audio is in Arabic, and the text returned by the API does not match the speech in the audio files. How can I improve the result so it matches the speech in the audio?

I faced another problem when trying to separate the two speakers, Guest1 and Guest2: the speaker attribution in the API result does not match who is actually speaking in the audio file. How can I handle this?

I am using the Azure Speech to Text API in an ASP.NET Core MVC app to transcribe call-center recordings, and the result does not match the speech in the audio.

This is my UploadAudio action, which takes the uploaded audio and sends it for conversion to text by the Speech API in Azure:

using Microsoft.AspNetCore.Mvc;
using Microsoft.EntityFrameworkCore;
using SpeechAnalytics.Backend.Services;
using SpeechAnalytics.Backend.Services.backgroundTask;
using SpeechAnalytics.Backend.ViewModel;
using SpeechAnalytics.Core.Entities;

namespace SpeechAnalytics.Backend.Controllers
{
    public class AudioController : BaseController
    {
        private readonly IWebHostEnvironment _webHostEnvironment;
        private readonly SpeechWordsCalculateService _speechWordsService;
        // private TranscriptionConvertService _transcription;
        private AudioTranscriptionTask _audioTranscriptionTask;

        public AudioController(AudioTranscriptionTask audioTranscriptionTask, IWebHostEnvironment webHostEnvironment, SpeechWordsCalculateService speechWordsService)
        {
            // _transcription = transcription;
            _audioTranscriptionTask = audioTranscriptionTask;
            _webHostEnvironment = webHostEnvironment;
            _speechWordsService = speechWordsService;
        }

        public IActionResult UploadAudio()
        {
            AudioVM model = new() { Date = DateTime.Now };
            return View(model);
        }

        // POST: save the audio record and enqueue the transcription task
        [HttpPost]
        public async Task<IActionResult> UploadAudio(AudioVM audio)
        {
            #region PreviousNotEnqueueCode
            // Save the audio information to the database
            //var model = new Audio
            //{
            //    FilePath = audio.FilePath,
            //    AudioName = audio.AudioName,
            //    audioStatus = AudioStatus.WaitTranscribing,
            //    UserId = CurrentUserData.UserId,
            //    Date = audio.Date
            //};
            //await _context.Audios.AddAsync(model);
            //await _context.SaveChangesAsync();

            //string attachmentFolderPath = Path.Combine(_webHostEnvironment.WebRootPath, "Attachments");
            //string audioFilePath = Path.Combine(attachmentFolderPath, audio.FilePath);
            //var res = await _transcription.ConversationTranscriber(audioFilePath);
            //if (res.Success == true)
            //{
            //        model.audioStatus = AudioStatus.Transcribed;
            //        _context.Audios.Update(model);

            //    var TranscriptionModel = new AudioTranscription()
            //    {
            //        AudioId = model.Id,
            //        Transcription = JsonConvert.SerializeObject(res.Transcriptions),
            //        IsDeleted = false
            //    };

            //    _context.AudioTranscriptions.Add(TranscriptionModel);
            //}
            //else
            //{
            //    model.audioStatus = AudioStatus.NotTranscribed;
            //    _context.Audios.Update(model);
            //}
            //_context.SaveChanges();

            #endregion

            var standardQuota = TimeSpan.FromSeconds(_context.Settings.FirstOrDefault().QuotaSystemSeconds);
            var audioDurationSum = (int) _context.Audios.Where(a => a.UserId == CurrentUserData.UserId && a.audioStatus == AudioStatus.Transcribed && a.IsDeleted != true).Sum(a => a.audioDuration);
            var remainingQuota = (int) Math.Round(standardQuota.Subtract(TimeSpan.FromSeconds(audioDurationSum)).TotalSeconds);
            var audioDuration = (int) GetAudioDuration(Path.Combine(_webHostEnvironment.WebRootPath, "Attachments", audio.FilePath));

            // Save the audio information to the database
            var model = new Audio
            {
                FilePath = audio.FilePath,
                AudioName = audio.AudioName,
                audioDuration = GetAudioDuration(Path.Combine(_webHostEnvironment.WebRootPath, "Attachments", audio.FilePath)),
                audioStatus = AudioStatus.WaitTranscribing,
                UserId = CurrentUserData.UserId,
                Date = audio.Date
            };

            if (audioDuration > remainingQuota)
            {
                return Json(-1);
            }

            await _context.Audios.AddAsync(model);
            await _context.SaveChangesAsync();

            // Enqueue the audio transcription task as a background task
            //Task.Run(() => _audioTranscriptionTask.ExecuteAsync(Path.Combine(_webHostEnvironment.WebRootPath, "Attachments", audio.FilePath), model.Id , model.UserId , audioDuration));

            Task.Run(async () =>
            {
                await _audioTranscriptionTask.ExecuteAsync(Path.Combine(_webHostEnvironment.WebRootPath, "Attachments", audio.FilePath), model.Id, model.UserId, audioDuration);
                await _speechWordsService.CalculateSpeechWordsInAllTranscription(model.UserId);
            });

            return Json(1);
        }
    }
}
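As an aside, Task.Run here is fire-and-forget: if the app pool recycles mid-transcription the work is silently lost, and any exception thrown inside the lambda is swallowed. A more robust pattern in ASP.NET Core is a Channel-backed queue drained by a hosted BackgroundService. Below is a minimal sketch; ITranscriptionQueue, TranscriptionJob and TranscriptionWorker are illustrative names, not part of the project above:

using System.Threading.Channels;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Hosting;

public record TranscriptionJob(string FilePath, int AudioId, int UserId, int Duration);

public interface ITranscriptionQueue
{
    ValueTask EnqueueAsync(TranscriptionJob job, CancellationToken ct = default);
    ValueTask<TranscriptionJob> DequeueAsync(CancellationToken ct);
}

public class TranscriptionQueue : ITranscriptionQueue
{
    private readonly Channel<TranscriptionJob> _channel = Channel.CreateUnbounded<TranscriptionJob>();

    public ValueTask EnqueueAsync(TranscriptionJob job, CancellationToken ct = default)
        => _channel.Writer.WriteAsync(job, ct);

    public ValueTask<TranscriptionJob> DequeueAsync(CancellationToken ct)
        => _channel.Reader.ReadAsync(ct);
}

public class TranscriptionWorker : BackgroundService
{
    private readonly ITranscriptionQueue _queue;
    private readonly IServiceScopeFactory _scopeFactory;

    public TranscriptionWorker(ITranscriptionQueue queue, IServiceScopeFactory scopeFactory)
    {
        _queue = queue;
        _scopeFactory = scopeFactory;
    }

    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        while (!stoppingToken.IsCancellationRequested)
        {
            var job = await _queue.DequeueAsync(stoppingToken);

            // Resolve scoped services (DbContext etc.) per job so they are disposed correctly
            using var scope = _scopeFactory.CreateScope();
            var task = scope.ServiceProvider.GetRequiredService<AudioTranscriptionTask>();
            await task.ExecuteAsync(job.FilePath, job.AudioId, job.UserId, job.Duration);
        }
    }
}

The controller would then await _queue.EnqueueAsync(...) instead of calling Task.Run, with AddSingleton<ITranscriptionQueue, TranscriptionQueue>() and AddHostedService<TranscriptionWorker>() in the service registration.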

The ExecuteAsync method queued by the UploadAudio action is what actually calls the Azure Speech to Text API. This is its code:

public async Task ExecuteAsync(string audioFilePath, int audioId, int userId, int audioDuration)
{
      Audio audio = new();
      var result = await _transcriptionService.ConversationTranscriber(audioFilePath);

      using (var scope = new TransactionScope(TransactionScopeAsyncFlowOption.Enabled))
      {
          var dbContextOptions = new DbContextOptionsBuilder<SpeechAnalyticsDbContext>().UseSqlServer("Server=10.1.1.210;Database=SpeechAnalyticsDB;User Id=sa;Password=sa_2014;TrustServerCertificate=true;").Options;

          using (var context = new SpeechAnalyticsDbContext(dbContextOptions))
          {
              audio = await context.Audios.FindAsync(audioId);

              #region Calculate Quota
                  var standardQuota = TimeSpan.FromSeconds(context.Settings.FirstOrDefault().QuotaSystemSeconds);
                  var audioDurationSum = (int)context.Audios.Where(a => a.UserId == userId && a.audioStatus == AudioStatus.Transcribed && a.IsDeleted != true).Sum(a => a.audioDuration);
                  var remainingQuota = (int)Math.Round(standardQuota.Subtract(TimeSpan.FromSeconds(audioDurationSum)).TotalSeconds);
              #endregion

              //var User = context.Users.Find(UserId);
              if (result.Success)
              { 
                  if (audioDuration <= remainingQuota)
                  {
                      audio.audioStatus = AudioStatus.Transcribed;

                      var transcriptionModel = new AudioTranscription()
                      {
                          AudioId = audio.Id,
                          Transcription = JsonConvert.SerializeObject(result.Transcriptions),
                          IsDeleted = false
                      };

                      context.AudioTranscriptions.Add(transcriptionModel);
                  }
                  else
                      audio.audioStatus = AudioStatus.OutOfQuota;
              }
              else
                  audio.audioStatus = AudioStatus.NotTranscribed;
              

              context.Audios.Update(audio);
              await context.SaveChangesAsync();

              scope.Complete();
          }

          // Notify the user that the audio status has been updated
          await _hubContext.Clients.All.SendAsync("AudioListUpdated", audioId, audio.audioStatus);
      }
}
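Side note: the hard-coded connection string (including sa credentials) really belongs in configuration. A minimal sketch using EF Core's IDbContextFactory, assuming the connection string is stored in appsettings.json under the hypothetical name "SpeechAnalyticsDB":

// Program.cs: register a context factory backed by configuration
builder.Services.AddDbContextFactory<SpeechAnalyticsDbContext>(options =>
    options.UseSqlServer(builder.Configuration.GetConnectionString("SpeechAnalyticsDB")));

// AudioTranscriptionTask: inject the factory and create a context per job
public class AudioTranscriptionTask
{
    private readonly IDbContextFactory<SpeechAnalyticsDbContext> _contextFactory;

    public AudioTranscriptionTask(IDbContextFactory<SpeechAnalyticsDbContext> contextFactory)
        => _contextFactory = contextFactory;

    public async Task ExecuteAsync(string audioFilePath, int audioId, int userId, int audioDuration)
    {
        await using var context = await _contextFactory.CreateDbContextAsync();
        // ...same quota / status logic as above, without the inline connection string
    }
}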

The ConversationTranscriber method connects directly to the Azure Speech API. This is its code:

public async Task<dynamic> ConversationTranscriber(string path)
{
        try
        {
            // Subscription key removed here; it should come from configuration, not source code
            var speechConfig = SpeechConfig.FromSubscription("", "eastus");
            var autoDetectSourceLanguageConfig = AutoDetectSourceLanguageConfig.FromLanguages(new string[] { "ar-EG", "en-US" });

            speechConfig.OutputFormat = OutputFormat.Detailed;
            speechConfig.EnableDictation();
           //speechConfig.SetProperty("Profanity", "masked"); // Handle profanity masking if required
           // speechConfig.SetProperty("NoiseSuppression", "Auto"); // Enable automatic noise suppression

            var stopRecognition = new TaskCompletionSource<int>(TaskCreationOptions.RunContinuationsAsynchronously);

            // Collect transcription results from the WAV file input
            TranscriptionVM transcription = new TranscriptionVM();
            // Begin transcribing the audio
            using (var audioConfig = AudioConfig.FromWavFileInput(path))
            {
                using (var conversationTranscriber = new ConversationTranscriber(speechConfig, autoDetectSourceLanguageConfig, audioConfig))
                {
                    var transcriptions = new List<TranscriptionVM>();

                    conversationTranscriber.Transcribed += (s, e) =>
                    {
                        if (e.Result.Reason == ResultReason.RecognizedSpeech)
                        {
                            var detailedResults = e.Result.Best();

                            if (detailedResults != null && detailedResults.Any())
                            {
                                var bestResults = detailedResults.ToList()[0];

                                transcription = new TranscriptionVM 
                                {
                                    SpeakerId = e.Result.SpeakerId,
                                    Text = e.Result.Text,
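                                    // OffsetInTicks and Duration.Ticks are 100-ns ticks;
                                    // dividing by 10,000,000 * 60 converts them to minutes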
                                    StartTime = e.Result.OffsetInTicks / (10_000_000d * 60),
                                    Duration = e.Result.Duration.Ticks / (10_000_000d * 60),
                                    Words = bestResults.Words?.Select(a => new WordTimestampVM()
                                    {
                                        Word = a?.Word

                                    }).ToList(),
                                };
                                // Add only when a detailed result was produced; otherwise the
                                // previously captured transcription would be added a second time
                                transcriptions.Add(transcription);
                            }
                        }
                    };

                    conversationTranscriber.Canceled += (s, e) =>
                    {
                        // Fires on end-of-stream as well as on errors; complete the waiter
                        // so the method cannot hang if SessionStopped never arrives
                        stopRecognition.TrySetResult(0);
                    };

                    conversationTranscriber.SessionStopped += (s, e) =>
                    {
                        // Handle session stopped scenario if needed
                        stopRecognition.TrySetResult(0);
                    };

                    await conversationTranscriber.StartTranscribingAsync();

                    // Wait for completion
                    await stopRecognition.Task;

                    await conversationTranscriber.StopTranscribingAsync();
                    // Check if transcriptions were generated
                    if (transcriptions.Count > 0)
                    {
                        var response = new
                        {
                            Success = true,
                            Transcriptions = transcriptions
                        };

                        return response;
                    }
                    else
                    {
                        var response = new
                        {
                            Success = false,
                            Message = "Transcription failed. No transcriptions were generated."
                        };

                        return response;
                    }
                }
            }
        }
        catch (Exception ex)
        {
            // Handle any exceptions that occur during transcription
            var response = new
            {
                Success = false,
                Message = "Transcription failed: " + ex.Message
            };

            return response;

        }
}
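On the accuracy question itself, a few configuration changes are worth trying: pin the recognition language when the calls are known to be Arabic instead of auto-detecting between ar-EG and en-US, drop EnableDictation() (it targets dictation scenarios with spoken punctuation, not conversations), and bias recognition toward domain vocabulary with a phrase list. A minimal sketch, assuming Egyptian Arabic audio; the key, region, and phrases are placeholders:

var speechConfig = SpeechConfig.FromSubscription("<key>", "eastus");

// Pin the language when it is known in advance; language auto-detection
// can hurt accuracy when one language dominates the call
speechConfig.SpeechRecognitionLanguage = "ar-EG";
speechConfig.OutputFormat = OutputFormat.Detailed;

using var audioConfig = AudioConfig.FromWavFileInput(path);
using var transcriber = new ConversationTranscriber(speechConfig, audioConfig);

// Bias recognition toward call-center vocabulary; these phrases are placeholders
var phraseList = PhraseListGrammar.FromRecognizer(transcriber);
phraseList.AddPhrase("خدمة العملاء");
phraseList.AddPhrase("رقم الحساب");

Speaker separation is also very sensitive to the input format: conversation transcription expects 16-bit mono PCM WAV (16 kHz, or 8 kHz for telephony), so compressed or stereo call recordings should be converted first. If the call center records each participant on a separate stereo channel, splitting the channels and transcribing them independently sidesteps diarization entirely. For substantially better Arabic accuracy, training a custom speech model on your own call data is the usual next step.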