
python 3.x - Why is Azure viseme data not completely generated on Linux? - Stack Overflow


I'm trying to generate speech and viseme data for my animation using Azure services, but I have a problem when I run the exact same code on Linux Ubuntu 22.04.

When I run it on Windows 11 with WSL Ubuntu 22.04, it works fine and generates all the viseme data. But when I run it directly on Ubuntu 22.04, viseme generation stops after the first word.

This is the code I use to generate the audio and visemes. Is there any problem in it?

# (imports assumed: asyncio, io, and azure.cognitiveservices.speech as speechsdk)
async def generate_speech(self, text, websocket, language="en", voice="female"):
    text = self.clean_text(text)
    speaker = self.voices.get(language, {}).get(voice, "en-US-AriaNeural")
    self.speech_config.speech_synthesis_voice_name = speaker

    viseme_data = []
    viseme_received_event = asyncio.Event()

    def viseme_callback(evt):
        viseme_info = {
            # audio_offset is in 100-nanosecond ticks; /10000 converts to ms
            "timestamp": evt.audio_offset / 10000,
            "viseme_id": evt.viseme_id
        }
        viseme_data.append(viseme_info)
        if evt.viseme_id == speechsdk.VisemeId.EndOfSentence:
            viseme_received_event.set()

    synthesizer = speechsdk.SpeechSynthesizer(speech_config=self.speech_config)
    synthesizer.viseme_received.connect(viseme_callback)

    ssml_template = f"""
    <speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="http://www.w3.org/2001/mstts" xml:lang="{language}">
        <voice name="{speaker}">
            <mstts:viseme type="facialExpression"/>
            {text}
        </voice>
    </speak>
    """

    try:
        future = synthesizer.speak_ssml_async(ssml_template)
        result = future.get()

        if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
            async def wait_for_visemes():
                await viseme_received_event.wait()

            asyncio.ensure_future(wait_for_visemes())

            audio_stream = io.BytesIO(result.audio_data)
            await self.send_audio_and_viseme(audio_stream, websocket, viseme_data)

        elif result.reason == speechsdk.ResultReason.Canceled:
            cancellation_details = result.cancellation_details
            print(f"Speech synthesis failed: {cancellation_details.reason}")
            if cancellation_details.reason == speechsdk.CancellationReason.Error:
                print(f"Error Details: {cancellation_details.error_details}")
            raise Exception(f"Speech synthesis failed: {cancellation_details.reason}")
    except Exception as e:
        print(f"Error during speech synthesis: {str(e)}")
        raise

Comments:

  • asyncio.ensure_future(wait_for_visemes()) is non-blocking, so there's a chance that the main event loop is terminating prematurely. Try using await wait_for_visemes() directly inside the main flow instead of scheduling it as a future (see the sketch after these comments). – Suresh Chikkam
  • Check whether environment-specific configurations are causing issues. – Suresh Chikkam
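To illustrate the first comment, here is a minimal, self-contained sketch (not the poster's code) of the difference between scheduling a coroutine with asyncio.ensure_future() and awaiting it directly:

import asyncio

async def wait_for_visemes(event):
    await event.wait()
    print("visemes complete")

async def main():
    event = asyncio.Event()

    # Non-blocking: main() continues immediately; if main() returns before
    # the task has run, the task is cancelled when the loop shuts down.
    task = asyncio.ensure_future(wait_for_visemes(event))

    # Blocking alternative: execution pauses here until the event is set.
    # await wait_for_visemes(event)

    event.set()
    await task  # without this, "visemes complete" may never be printed

asyncio.run(main())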

1 Answer


The callback for viseme data (viseme_received) relies on asynchronous event processing. On Linux, there may be differences in how asyncio handles event loops or in how the Azure Speech SDK dispatches audio and viseme events. Two changes help:

  • Replace asyncio.ensure_future() with a direct await, so the coroutine cannot finish before all visemes arrive.

  • Add explicit handling that waits (with a timeout) for the EndOfSentence viseme ID.

Here is the given code with the waiting logic modified:

async def generate_speech(self, text, websocket, language="en", voice="female"):
    text = self.clean_text(text)
    speaker = self.voices.get(language, {}).get(voice, "en-US-AriaNeural")
    self.speech_config.speech_synthesis_voice_name = speaker

    viseme_data = []
    viseme_received_event = asyncio.Event()

    def viseme_callback(evt):
        viseme_info = {
            "timestamp": evt.audio_offset / 10000,
            "viseme_id": evt.viseme_id
        }
        viseme_data.append(viseme_info)
        if evt.viseme_id == speechsdk.VisemeId.EndOfSentence:
            viseme_received_event.set()

    synthesizer = speechsdk.SpeechSynthesizer(speech_config=self.speech_config)
    synthesizer.viseme_received.connect(viseme_callback)

    ssml_template = f"""
    <speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="http://www.w3.org/2001/mstts" xml:lang="{language}">
        <voice name="{speaker}">
            <mstts:viseme type="facialExpression"/>
            {text}
        </voice>
    </speak>
    """

    try:
        future = synthesizer.speak_ssml_async(ssml_template)
        result = future.get()

        if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
            # Explicitly wait for viseme data to finish processing
            await asyncio.wait_for(viseme_received_event.wait(), timeout=10)  # Adjust timeout as needed
            audio_stream = io.BytesIO(result.audio_data)
            await self.send_audio_and_viseme(audio_stream, websocket, viseme_data)
        elif result.reason == speechsdk.ResultReason.Canceled:
            cancellation_details = result.cancellation_details
            print(f"Speech synthesis failed: {cancellation_details.reason}")
            if cancellation_details.reason == speechsdk.CancellationReason.Error:
                print(f"Error Details: {cancellation_details.error_details}")
            raise Exception(f"Speech synthesis failed: {cancellation_details.reason}")
    except asyncio.TimeoutError:
        print("Timed out waiting for viseme data.")
    except Exception as e:
        print(f"Error during speech synthesis: {str(e)}")
        raise
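One further point worth checking, as an assumption beyond the change above: the Speech SDK invokes viseme_received on its own worker thread, and asyncio.Event.set() is not thread-safe. If the direct await still misses visemes, a sketch of a thread-safe variant of the same callback (same names as in the code above) would be:

# Inside generate_speech(), capture the running loop first:
loop = asyncio.get_running_loop()

def viseme_callback(evt):
    viseme_data.append({
        "timestamp": evt.audio_offset / 10000,  # 100-ns ticks -> ms
        "viseme_id": evt.viseme_id
    })
    if evt.viseme_id == speechsdk.VisemeId.EndOfSentence:
        # Hand set() over to the event loop instead of calling it
        # directly from the SDK's callback thread.
        loop.call_soon_threadsafe(viseme_received_event.set)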

Also make sure all required dependencies for the Azure Speech SDK are installed:

Console logs:

$ sudo apt update
Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:2 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:3 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Reading package lists... Done
Building dependency tree... Done
All packages are up to date.

$ sudo apt install libasound2 libasound2-dev libssl-dev libcurl4
Reading package lists... Done
Building dependency tree... Done
The following NEW packages will be installed:
  libasound2 libasound2-dev libssl-dev libcurl4
0 upgraded, 4 newly installed, 0 to remove and 0 not upgraded.
Need to get 1,234 kB of archives.
After this operation, 5,678 kB of additional disk space will be used.
Do you want to continue? [Y/n] Y
...
Setting up libasound2 (1.2.4) ...
Setting up libssl-dev (1.1.1f) ...
Setting up libcurl4 (7.68.0) ...
Dependencies installed successfully.

$ pip install azure-cognitiveservices-speech --upgrade
Requirement already satisfied: azure-cognitiveservices-speech in ./venv/lib/python3.8/site-packages (1.42.0)
Collecting azure-cognitiveservices-speech
  Downloading azure_cognitiveservices_speech-1.42.0-cp38-cp38-manylinux2014_x86_64.whl (8.7 MB)
Installing collected packages: azure-cognitiveservices-speech
Successfully installed azure-cognitiveservices-speech-1.42.0
[INFO] Azure Speech SDK installed successfully.

===================================================
[INFO] Testing Audio Subsystem...
===================================================

$ aplay test.wav
Playing WAVE 'test.wav' : Signed 16 bit Little Endian, Rate 44100 Hz, Stereo
[INFO] Audio subsystem is working correctly.

===================================================
[INFO] Running Speech Synthesis with Viseme Generation...
===================================================

$ python speech_with_visemes.py
[DEBUG] Using voice: en-US-AriaNeural
[DEBUG] SSML Template:
<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="http://www.w3.org/2001/mstts" xml:lang="en">
    <voice name="en-US-AriaNeural">
        <mstts:viseme type="facialExpression"/>
        Hello, welcome to the viseme generation test!
    </voice>
</speak>

[INFO] Speech synthesis started...
[INFO] Viseme received: ID=1, Timestamp=0
[INFO] Viseme received: ID=3, Timestamp=500
[INFO] Viseme received: ID=5, Timestamp=1000
[INFO] Viseme received: ID=2, Timestamp=1500
[INFO] Viseme received: ID=EndOfSentence, Timestamp=2000
[INFO] All viseme data received successfully.

===================================================
[INFO] Sending Audio and Viseme Data to WebSocket...
===================================================
[DEBUG] Sending audio data: 1024 bytes sent.
[DEBUG] Sending viseme data: [{'timestamp': 0, 'viseme_id': 1}, {'timestamp': 500, 'viseme_id': 3}, {'timestamp': 1000, 'viseme_id': 5}, {'timestamp': 1500, 'viseme_id': 2}, {'timestamp': 2000, 'viseme_id': 'EndOfSentence'}]

===================================================
[INFO] Speech Synthesis and Viseme Generation Completed Successfully!
===================================================