最新消息:雨落星辰是一个专注网站SEO优化、网站SEO诊断、搜索引擎研究、网络营销推广、网站策划运营及站长类的自媒体原创博客

python - How can I ensure that Azure Text-to-Speech properly pronounces word-for-word translations? - Stack Overflow

programmeradmin3浏览0评论

I'm working on an app using Azure, Gemini, Python, and Dart, and I want to make sure the pronunciation between languages is spot on. For example, I want to translate between German and Spanish: the goal is for 'hallo' -> 'hola' to be pronounced correctly in both languages. The same goes for English and Spanish 'hello' -> 'hola'. Azure does well with sentences, but struggles with word-for-word translations.

Here's my code:

  • translation_service.py

class TranslationService:
    """Translate text with Gemini and voice the result via Azure TTS.

    The Gemini chat session is primed with a few-shot example so replies
    follow a fixed layout: a native sentence plus word-by-word
    "source (target)" pairs for German-Spanish and English-Spanish.
    """

    def __init__(self):
        load_dotenv()
        api_key = os.getenv("GEMINI_API_KEY")
        if not api_key:
            raise ValueError("GEMINI_API_KEY not found in environment variables")

        genai.configure(api_key=api_key)

        self.generation_config = {
            "temperature": 1,
            "top_p": 0.95,
            "top_k": 40,
            "max_output_tokens": 8192,
            "response_mime_type": "text/plain",
        }

        self.model = GenerativeModel(
            model_name="gemini-2.0-flash-exp",
            generation_config=self.generation_config
        )

        self.tts_service = EnhancedTTSService()

        # NOTE(review): _auto_fix_spelling reads self.spell, but no spell
        # checker is created here -- calling that method raises
        # AttributeError. TODO: initialize one (e.g. pyspellchecker) or
        # drop the method.

        # Seed the chat with the expected output template (few-shot prompt).
        self.chat_session = self.model.start_chat(
            history=[
                {
                    "role": "user",
                    "parts": [
                        """
                       
                        Text
"
(Could be any phrase or word)
"

German Translation:
Conversational-native:
"Ich suche einen Job, damit ich finanziell unabhängig sein kann."
word by word Conversational-native German-Spanish:
"Ich (Yo) suche (busco) einen (un) Job (trabajo), damit (para que) ich (yo) finanziell (económicamente) unabhängig (independiente) sein (ser) kann (pueda)."


English Translation:

Conversational-native:
"I'm looking for a job so I can be financially independent."
word by word Conversational-native English-Spanish:
"I'm (Yo estoy) looking for (buscando) a job (un trabajo) so (para que) I (yo) can be (pueda ser) financially (económicamente) independent (independiente)."



                        """
                    ]
                }
            ]
        )

    def _restore_accents(self, text: str) -> str:
        """Restore accented characters from ASCII approximations.

        Rewrites "a´" -> "á" (all vowels, both cases) and "n~" -> "ñ" /
        "N~" -> "Ñ" so downstream TTS pronounces Spanish correctly.
        """
        accent_map = {
            "a": "á", "e": "é", "i": "í", "o": "ó", "u": "ú", "n": "ñ",
            "A": "Á", "E": "É", "I": "Í", "O": "Ó", "U": "Ú", "N": "Ñ",
        }

        # re.sub accepts either a string or a callable replacement, so a
        # single loop body covers both cases (the original branched into
        # two identical re.sub calls).
        patterns = {
            r"([aeiou])´": lambda m: accent_map[m.group(1)],
            r"([AEIOU])´": lambda m: accent_map[m.group(1)],
            r"n~": "ñ",
            r"N~": "Ñ",
        }
        for pattern, replacement in patterns.items():
            text = re.sub(pattern, replacement, text)

        return text

    async def process_prompt(self, text: str, source_lang: str, target_lang: str) -> Translation:
        """Translate *text* via Gemini, synthesize audio, and build a Translation.

        Raises:
            Exception: wraps any failure, chaining the original cause.
        """
        try:
            response = self.chat_session.send_message(text)
            generated_text = response.text

            print(f"Generated text from Gemini: {generated_text[:100]}...")

            audio_filename = await self.tts_service.text_to_speech(
                text=generated_text
            )

            if audio_filename:
                print(f"Successfully generated audio: {audio_filename}")
            else:
                print("Audio generation failed")

            # NOTE(review): _generate_grammar_explanations is not defined in
            # this class as shown -- confirm it exists elsewhere.
            return Translation(
                original_text=text,
                translated_text=generated_text,
                source_language=source_lang,
                target_language=target_lang,
                audio_path=audio_filename,
                translations={"main": generated_text},
                word_by_word=self._generate_word_by_word(text, generated_text),
                grammar_explanations=self._generate_grammar_explanations(generated_text)
            )

        except Exception as e:
            print(f"Error in process_prompt: {str(e)}")
            # Chain the cause so the original traceback is preserved.
            raise Exception(f"Translation processing failed: {str(e)}") from e

    def _generate_word_by_word(self, original: str, translated: str) -> dict[str, dict[str, str]]:
        """Map each source word to the translated word at the same position.

        Purely positional alignment: trailing words of the longer text are
        ignored, and a repeated source word keeps only its last pairing.
        """
        result: dict[str, dict[str, str]] = {}
        # zip truncates to the shorter sequence, matching the original
        # index-bounds guard.
        for src, tgt in zip(original.split(), translated.split()):
            result[src] = {
                "translation": tgt,
                "pos": "unknown",
            }
        return result

    def _auto_fix_spelling(self, text: str) -> str:
        """Spell-correct each word, preserving ALL-CAPS / Capitalized casing.

        NOTE(review): relies on self.spell, which __init__ never creates --
        this method currently raises AttributeError when called.
        """
        # Tokenize into words and standalone punctuation marks.
        words = re.findall(r"\b\w+\b|[^\w\s]", text)
        corrected_words = []

        for word in words:
            # Punctuation tokens pass through untouched.
            if not re.match(r"\w+", word):
                corrected_words.append(word)
                continue

            if self.spell.unknown([word]):
                correction = self.spell.correction(word)
                if correction:
                    # Re-apply the original word's casing to the correction.
                    if word.isupper():
                        correction = correction.upper()
                    elif word[0].isupper():
                        correction = correction.capitalize()
                    word = correction

            corrected_words.append(word)

        return " ".join(corrected_words)
  • tts_service.py

from azure.cognitiveservices.speech.audio import AudioOutputConfig
import os
from typing import Optional
from datetime import datetime
import asyncio
import re

class EnhancedTTSService:
    """Synthesize MP3 speech files with Azure Cognitive Services.

    The voice is chosen from a lightweight character-based language guess
    (German / Spanish / English); multilingual neural voices can switch
    languages within a single utterance.
    """

    def __init__(self):
        # Azure credentials come from the environment.
        self.subscription_key = os.getenv("AZURE_SPEECH_KEY")
        self.region = os.getenv("AZURE_SPEECH_REGION")

        if not self.subscription_key or not self.region:
            raise ValueError("Azure Speech credentials not found in environment variables")

        # Create speech config emitting 16 kHz / 32 kbit/s mono MP3.
        self.speech_config = SpeechConfig(
            subscription=self.subscription_key, 
            region=self.region
        )
        self.speech_config.set_speech_synthesis_output_format(
            SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3
        )

        # Voice mapping with specific styles and roles.
        self.voice_mapping = {
             'en': 'en-US-JennyMultilingualNeural',
             'es': 'es-ES-ArabellaMultilingualNeural',
             'de': 'de-DE-SeraphinaMultilingualNeural'
        }

    def _get_temp_directory(self) -> str:
        """Create (if needed) and return the audio scratch directory."""
        if os.name == 'nt':  # Windows
            temp_dir = os.path.join(os.environ.get('TEMP', ''), 'tts_audio')
        else:  # Unix/Linux
            temp_dir = '/tmp/tts_audio'
        os.makedirs(temp_dir, exist_ok=True)
        return temp_dir

    def _detect_language(self, text: str) -> str:
        """Guess 'de' / 'es' / 'en' from language-specific characters.

        Heuristic only: text containing neither German umlauts nor Spanish
        diacritics is reported as English.
        """
        if re.search(r'[äöüßÄÖÜ]', text):
            return 'de'
        elif re.search(r'[áéíóúñ¿¡]', text):
            return 'es'
        return 'en'

    def _generate_ssml(self, text: str) -> str:
        """Wrap *text* in a valid SSML document for the detected language."""
        # Escape XML metacharacters ('&' first so entities stay intact).
        text = text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')

        primary_lang = self._detect_language(text)
        voice_name = self.voice_mapping.get(primary_lang, self.voice_mapping['en'])

        # Bug fix: <speak> must declare the SSML namespace; it was empty
        # (xmlns=''), which Azure rejects as invalid SSML.
        ssml = f"""<?xml version='1.0'?>
<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='{primary_lang}'>
    <voice name='{voice_name}'>
        <prosody rate="0.95" pitch="0%">
            {text}
        </prosody>
    </voice>
</speak>"""
        return ssml

    async def text_to_speech(self, text: str, output_path: Optional[str] = None) -> Optional[str]:
        """Synthesize *text* to an MP3 file.

        Returns the basename of the generated file, or None on any failure.
        """
        synthesizer = None
        try:
            print(f"\nStarting TTS process for text: {text[:100]}...")  # First 100 chars

            # Default the output path to a timestamped file in the temp dir.
            if not output_path:
                temp_dir = self._get_temp_directory()
                timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
                output_path = os.path.join(temp_dir, f"speech_{timestamp}.mp3")

            # Configure audio output.
            audio_config = AudioOutputConfig(filename=output_path)

            # One synthesizer per request; stopped in `finally`.
            synthesizer = SpeechSynthesizer(
                speech_config=self.speech_config,
                audio_config=audio_config
            )

            ssml = self._generate_ssml(text)
            print(f"Generated SSML length: {len(ssml)} characters")

            # The SDK call blocks, so run it in the default executor to keep
            # the event loop responsive. get_running_loop() replaces the
            # deprecated get_event_loop() inside coroutines.
            print("Starting speech synthesis...")
            result = await asyncio.get_running_loop().run_in_executor(
                None,
                lambda: synthesizer.speak_ssml_async(ssml).get()
            )

            # Handle result.
            if result.reason == ResultReason.SynthesizingAudioCompleted:
                print("Speech synthesis completed successfully")
                return os.path.basename(output_path)

            elif result.reason == ResultReason.Canceled:
                print(f"Speech synthesis canceled: {result.cancellation_details.reason}")
                print(f"Error details: {result.cancellation_details.error_details}")
                return None

            return None

        except Exception as e:
            print(f"Exception in text_to_speech: {str(e)}")
            return None

        finally:
            # Best-effort cleanup; `except Exception` (not bare except) so
            # KeyboardInterrupt/SystemExit are not swallowed.
            if synthesizer:
                try:
                    synthesizer.stop_speaking_async()
                except Exception:
                    pass

This is an example of how the correct pronunciation should sound:

German-Spanish (hello example) (this is the desired output with the correct word-for-word pronunciation)

English-Spanish (hello example) (this is the desired output with the correct word-for-word pronunciation)

Now let’s do a word-for-word translation, where we’ll focus on pronouncing the Spanish "ñ," "h," and "ll" properly.

Here’s the Spanish sentence:

"Jugo de piña para la niña y jugo de mora para la señora porque están en el hospital y afuera está lloviendo."

Translation:

"I got pineapple juice for the girl and blackberry juice for the lady because they’re in the hospital and it’s raining outside."

German-Spanish (this is the desired output with the correct word-for-word pronunciation)

English-Spanish (this is the desired output with the correct word-for-word pronunciation)


Currently I have this pronunciation with the same examples

German-Spanish and English-Spanish (hello example) (which is incorrect because the word-for-word pronunciation is not accurate)

Let’s go back to the word-for-word breakdown, again emphasizing Spanish pronunciation for the tricky letters:

"ñ" (sounds like “ny” in canyon, e.g., piña, niña) "h" (silent in Spanish, e.g., hospital) "ll" (varies regionally but often sounds like “y” in yes, e.g., lloviendo). So here’s the sentence again:

"Jugo de piña para la niña y jugo de mora para la señora porque están en el hospital y afuera está lloviendo."

Translation:

"I got pineapple juice for the girl and blackberry juice for the lady because they’re in the hospital and it’s raining outside."

German-Spanish and English-Spanish (which is incorrect because the word-for-word pronunciation is not accurate)


This is the service I use with Azure:

I’ve tried the 'langid' library, but it seems like it doesn’t work for me. My goal is to be able to hear the correct pronunciation of the English-Spanish and German-Spanish word pairs during word-for-word translation.

Thank you.

I'm working on an app using Azure, Gemini, Python, and Dart, and I want to make sure the pronunciation between languages is spot on. For example, I want to translate between German and Spanish: the goal is for 'hallo' -> 'hola' to be pronounced correctly in both languages. The same goes for English and Spanish 'hello' -> 'hola'. Azure does well with sentences, but struggles with word-for-word translations.

Here's my code:

  • translation_service.py

class TranslationService:
    """Translate text with Gemini and voice the result via Azure TTS.

    The Gemini chat session is primed with a few-shot example so replies
    follow a fixed layout: a native sentence plus word-by-word
    "source (target)" pairs for German-Spanish and English-Spanish.
    """

    def __init__(self):
        load_dotenv()
        api_key = os.getenv("GEMINI_API_KEY")
        if not api_key:
            raise ValueError("GEMINI_API_KEY not found in environment variables")

        genai.configure(api_key=api_key)

        self.generation_config = {
            "temperature": 1,
            "top_p": 0.95,
            "top_k": 40,
            "max_output_tokens": 8192,
            "response_mime_type": "text/plain",
        }

        self.model = GenerativeModel(
            model_name="gemini-2.0-flash-exp",
            generation_config=self.generation_config
        )

        self.tts_service = EnhancedTTSService()

        # NOTE(review): _auto_fix_spelling reads self.spell, but no spell
        # checker is created here -- calling that method raises
        # AttributeError. TODO: initialize one (e.g. pyspellchecker) or
        # drop the method.

        # Seed the chat with the expected output template (few-shot prompt).
        self.chat_session = self.model.start_chat(
            history=[
                {
                    "role": "user",
                    "parts": [
                        """
                       
                        Text
"
(Could be any phrase or word)
"

German Translation:
Conversational-native:
"Ich suche einen Job, damit ich finanziell unabhängig sein kann."
word by word Conversational-native German-Spanish:
"Ich (Yo) suche (busco) einen (un) Job (trabajo), damit (para que) ich (yo) finanziell (económicamente) unabhängig (independiente) sein (ser) kann (pueda)."


English Translation:

Conversational-native:
"I'm looking for a job so I can be financially independent."
word by word Conversational-native English-Spanish:
"I'm (Yo estoy) looking for (buscando) a job (un trabajo) so (para que) I (yo) can be (pueda ser) financially (económicamente) independent (independiente)."



                        """
                    ]
                }
            ]
        )

    def _restore_accents(self, text: str) -> str:
        """Restore accented characters from ASCII approximations.

        Rewrites "a´" -> "á" (all vowels, both cases) and "n~" -> "ñ" /
        "N~" -> "Ñ" so downstream TTS pronounces Spanish correctly.
        """
        accent_map = {
            "a": "á", "e": "é", "i": "í", "o": "ó", "u": "ú", "n": "ñ",
            "A": "Á", "E": "É", "I": "Í", "O": "Ó", "U": "Ú", "N": "Ñ",
        }

        # re.sub accepts either a string or a callable replacement, so a
        # single loop body covers both cases (the original branched into
        # two identical re.sub calls).
        patterns = {
            r"([aeiou])´": lambda m: accent_map[m.group(1)],
            r"([AEIOU])´": lambda m: accent_map[m.group(1)],
            r"n~": "ñ",
            r"N~": "Ñ",
        }
        for pattern, replacement in patterns.items():
            text = re.sub(pattern, replacement, text)

        return text

    async def process_prompt(self, text: str, source_lang: str, target_lang: str) -> Translation:
        """Translate *text* via Gemini, synthesize audio, and build a Translation.

        Raises:
            Exception: wraps any failure, chaining the original cause.
        """
        try:
            response = self.chat_session.send_message(text)
            generated_text = response.text

            print(f"Generated text from Gemini: {generated_text[:100]}...")

            audio_filename = await self.tts_service.text_to_speech(
                text=generated_text
            )

            if audio_filename:
                print(f"Successfully generated audio: {audio_filename}")
            else:
                print("Audio generation failed")

            # NOTE(review): _generate_grammar_explanations is not defined in
            # this class as shown -- confirm it exists elsewhere.
            return Translation(
                original_text=text,
                translated_text=generated_text,
                source_language=source_lang,
                target_language=target_lang,
                audio_path=audio_filename,
                translations={"main": generated_text},
                word_by_word=self._generate_word_by_word(text, generated_text),
                grammar_explanations=self._generate_grammar_explanations(generated_text)
            )

        except Exception as e:
            print(f"Error in process_prompt: {str(e)}")
            # Chain the cause so the original traceback is preserved.
            raise Exception(f"Translation processing failed: {str(e)}") from e

    def _generate_word_by_word(self, original: str, translated: str) -> dict[str, dict[str, str]]:
        """Map each source word to the translated word at the same position.

        Purely positional alignment: trailing words of the longer text are
        ignored, and a repeated source word keeps only its last pairing.
        """
        result: dict[str, dict[str, str]] = {}
        # zip truncates to the shorter sequence, matching the original
        # index-bounds guard.
        for src, tgt in zip(original.split(), translated.split()):
            result[src] = {
                "translation": tgt,
                "pos": "unknown",
            }
        return result

    def _auto_fix_spelling(self, text: str) -> str:
        """Spell-correct each word, preserving ALL-CAPS / Capitalized casing.

        NOTE(review): relies on self.spell, which __init__ never creates --
        this method currently raises AttributeError when called.
        """
        # Tokenize into words and standalone punctuation marks.
        words = re.findall(r"\b\w+\b|[^\w\s]", text)
        corrected_words = []

        for word in words:
            # Punctuation tokens pass through untouched.
            if not re.match(r"\w+", word):
                corrected_words.append(word)
                continue

            if self.spell.unknown([word]):
                correction = self.spell.correction(word)
                if correction:
                    # Re-apply the original word's casing to the correction.
                    if word.isupper():
                        correction = correction.upper()
                    elif word[0].isupper():
                        correction = correction.capitalize()
                    word = correction

            corrected_words.append(word)

        return " ".join(corrected_words)
  • tts_service.py

from azure.cognitiveservices.speech.audio import AudioOutputConfig
import os
from typing import Optional
from datetime import datetime
import asyncio
import re

class EnhancedTTSService:
    """Synthesize MP3 speech files with Azure Cognitive Services.

    The voice is chosen from a lightweight character-based language guess
    (German / Spanish / English); multilingual neural voices can switch
    languages within a single utterance.
    """

    def __init__(self):
        # Azure credentials come from the environment.
        self.subscription_key = os.getenv("AZURE_SPEECH_KEY")
        self.region = os.getenv("AZURE_SPEECH_REGION")

        if not self.subscription_key or not self.region:
            raise ValueError("Azure Speech credentials not found in environment variables")

        # Create speech config emitting 16 kHz / 32 kbit/s mono MP3.
        self.speech_config = SpeechConfig(
            subscription=self.subscription_key, 
            region=self.region
        )
        self.speech_config.set_speech_synthesis_output_format(
            SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3
        )

        # Voice mapping with specific styles and roles.
        self.voice_mapping = {
             'en': 'en-US-JennyMultilingualNeural',
             'es': 'es-ES-ArabellaMultilingualNeural',
             'de': 'de-DE-SeraphinaMultilingualNeural'
        }

    def _get_temp_directory(self) -> str:
        """Create (if needed) and return the audio scratch directory."""
        if os.name == 'nt':  # Windows
            temp_dir = os.path.join(os.environ.get('TEMP', ''), 'tts_audio')
        else:  # Unix/Linux
            temp_dir = '/tmp/tts_audio'
        os.makedirs(temp_dir, exist_ok=True)
        return temp_dir

    def _detect_language(self, text: str) -> str:
        """Guess 'de' / 'es' / 'en' from language-specific characters.

        Heuristic only: text containing neither German umlauts nor Spanish
        diacritics is reported as English.
        """
        if re.search(r'[äöüßÄÖÜ]', text):
            return 'de'
        elif re.search(r'[áéíóúñ¿¡]', text):
            return 'es'
        return 'en'

    def _generate_ssml(self, text: str) -> str:
        """Wrap *text* in a valid SSML document for the detected language."""
        # Escape XML metacharacters ('&' first so entities stay intact).
        text = text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')

        primary_lang = self._detect_language(text)
        voice_name = self.voice_mapping.get(primary_lang, self.voice_mapping['en'])

        # The SSML namespace on <speak> is mandatory for Azure TTS.
        ssml = f"""<?xml version='1.0'?>
<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='{primary_lang}'>
    <voice name='{voice_name}'>
        <prosody rate="0.95" pitch="0%">
            {text}
        </prosody>
    </voice>
</speak>"""
        return ssml

    async def text_to_speech(self, text: str, output_path: Optional[str] = None) -> Optional[str]:
        """Synthesize *text* to an MP3 file.

        Returns the basename of the generated file, or None on any failure.
        """
        synthesizer = None
        try:
            print(f"\nStarting TTS process for text: {text[:100]}...")  # First 100 chars

            # Default the output path to a timestamped file in the temp dir.
            if not output_path:
                temp_dir = self._get_temp_directory()
                timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
                output_path = os.path.join(temp_dir, f"speech_{timestamp}.mp3")

            # Configure audio output.
            audio_config = AudioOutputConfig(filename=output_path)

            # One synthesizer per request; stopped in `finally`.
            synthesizer = SpeechSynthesizer(
                speech_config=self.speech_config,
                audio_config=audio_config
            )

            ssml = self._generate_ssml(text)
            print(f"Generated SSML length: {len(ssml)} characters")

            # The SDK call blocks, so run it in the default executor to keep
            # the event loop responsive. get_running_loop() replaces the
            # deprecated get_event_loop() inside coroutines.
            print("Starting speech synthesis...")
            result = await asyncio.get_running_loop().run_in_executor(
                None,
                lambda: synthesizer.speak_ssml_async(ssml).get()
            )

            # Handle result.
            if result.reason == ResultReason.SynthesizingAudioCompleted:
                print("Speech synthesis completed successfully")
                return os.path.basename(output_path)

            elif result.reason == ResultReason.Canceled:
                print(f"Speech synthesis canceled: {result.cancellation_details.reason}")
                print(f"Error details: {result.cancellation_details.error_details}")
                return None

            return None

        except Exception as e:
            print(f"Exception in text_to_speech: {str(e)}")
            return None

        finally:
            # Best-effort cleanup; `except Exception` (not bare except) so
            # KeyboardInterrupt/SystemExit are not swallowed.
            if synthesizer:
                try:
                    synthesizer.stop_speaking_async()
                except Exception:
                    pass

This is an example of how the correct pronunciation should sound:

German-Spanish (hello example) (this is the desired output with the correct word-for-word pronunciation)

https://jmp.sh/s/8sftiJ01aUreR3LDYRWn

English-Spanish (hello example) (this is the desired output with the correct word-for-word pronunciation)

https://jmp.sh/s/9MM1LqTqGH1CvddGhA1l

Now let’s do a word-for-word translation, where we’ll focus on pronouncing the Spanish "ñ," "h," and "ll" properly.

Here’s the Spanish sentence:

"Jugo de piña para la niña y jugo de mora para la señora porque están en el hospital y afuera está lloviendo."

Translation:

"I got pineapple juice for the girl and blackberry juice for the lady because they’re in the hospital and it’s raining outside."

German-Spanish (this is the desired output with the correct word-for-word pronunciation)

https://jmp.sh/s/aRFlpZc99Dw18Uexi8uS

English-Spanish (this is the desired output with the correct word-for-word pronunciation)

https://jmp.sh/eY9ZhlTi


Currently I have this pronunciation with the same examples

German-Spanish and English-Spanish (hello example) (which is incorrect because the word-for-word pronunciation is not accurate)

https://jmp.sh/iExSVBGk

Let’s go back to the word-for-word breakdown, again emphasizing Spanish pronunciation for the tricky letters:

"ñ" (sounds like “ny” in canyon, e.g., piña, niña) "h" (silent in Spanish, e.g., hospital) "ll" (varies regionally but often sounds like “y” in yes, e.g., lloviendo). So here’s the sentence again:

"Jugo de piña para la niña y jugo de mora para la señora porque están en el hospital y afuera está lloviendo."

Translation:

"I got pineapple juice for the girl and blackberry juice for the lady because they’re in the hospital and it’s raining outside."

German-Spanish and English-Spanish (which is incorrect because the word-for-word pronunciation is not accurate)

https://jmp.sh/PxKHNWjx


This is the service I use with Azure:

I’ve tried the 'langid' library, but it seems like it doesn’t work for me. My goal is to be able to hear the correct pronunciation of the English-Spanish and German-Spanish word pairs during word-for-word translation.

Thank you.

Share Improve this question edited Jan 18 at 18:59 pomoworko.com asked Jan 18 at 18:37 pomoworko.compomoworko.com 1,1182 gold badges15 silver badges43 bronze badges
Add a comment  | 

1 Answer 1

Reset to default 0

The key was explicit language tagging in SSML combined with strategic pauses. Azure TTS needs clear language context for each word/phrase, especially when mixing languages. Here's the implementation:


    def generate_german_spanish_wordforword_ssml(
        self,
        word_pairs: list[tuple[str, str]],
    ) -> str:
        """Generate SSML specifically for German-Spanish word-by-word translations.

        Each (german, spanish) pair is spoken with an explicit <lang> tag so
        the multilingual voice pronounces both words natively, separated by
        short pauses.

        Bug fix: '<' and '>' are now escaped alongside '&' so word text
        cannot break or inject SSML markup.
        """
        ssml = """
        <voice name="en-US-JennyMultilingualNeural">
            <prosody rate="0.8">"""
        
        for source_word, target_word in word_pairs:
            # XML-escape ('&' first so existing entities are not doubled).
            source_word = (source_word.strip().replace("&", "&amp;")
                           .replace("<", "&lt;").replace(">", "&gt;"))
            target_word = (target_word.strip().replace("&", "&amp;")
                           .replace("<", "&lt;").replace(">", "&gt;"))
            
            ssml += f"""
                <lang xml:lang="de-DE">{source_word}</lang>
                <break time="300ms"/>
                <lang xml:lang="es-ES">{target_word}</lang>
                <break time="500ms"/>"""
        
        ssml += """
                <break time="1000ms"/>
            </prosody>
        </voice>"""
        
        return ssml


    def generate_english_spanish_wordforword_ssml(
        self,
        word_pairs: list[tuple[str, str]],
    ) -> str:
        """Generate SSML specifically for English-Spanish word-by-word translations.

        Each (english, spanish) pair is spoken with an explicit <lang> tag so
        the multilingual voice pronounces both words natively, separated by
        short pauses.

        Bug fix: '<' and '>' are now escaped alongside '&' so word text
        cannot break or inject SSML markup.
        """
        ssml = """
        <voice name="en-US-JennyMultilingualNeural">
            <prosody rate="0.8">"""
        
        for source_word, target_word in word_pairs:
            # XML-escape ('&' first so existing entities are not doubled).
            source_word = (source_word.strip().replace("&", "&amp;")
                           .replace("<", "&lt;").replace(">", "&gt;"))
            target_word = (target_word.strip().replace("&", "&amp;")
                           .replace("<", "&lt;").replace(">", "&gt;"))
            
            ssml += f"""
                <lang xml:lang="en-US">{source_word}</lang>
                <break time="300ms"/>
                <lang xml:lang="es-ES">{target_word}</lang>
                <break time="500ms"/>"""
        
        ssml += """
                <break time="1000ms"/>
            </prosody>
        </voice>"""
        
        return ssml

Then, I modified the translation service to parse Gemini's output into clean word pairs:

    def _extract_word_pairs(self, text: str) -> list[tuple[str, str]]:
        word_pairs = []
        word_by_word_pattern = r'\* word by word.*?\n"([^"]+)"'
        word_by_word_match = re.search(word_by_word_pattern, text, re.DOTALL)
        
        if word_by_word_match:
            word_by_word_text = word_by_word_match.group(1)
            # Improved regex to capture multi-word phrases including those with apostrophes
            parts = re.findall(r'([^()]+?)\s*\(([^)]+)\)', word_by_word_text)
            for source, target in parts:
                # Clean and normalize both phrases
                source = re.sub(r'\s+', ' ', source.strip().replace("'", ""))
                target = target.strip()
                if source and target:
                    word_pairs.append((source, target))
        return word_pairs

And then I've updated and added new code on tts_service.py


    def _is_german_word(self, word: str) -> bool:
        # List of common German words that might appear in the English section
        german_words = {"dir", "ich", "du", "sie", "er", "es", "wir", "ihr", "ist", "sind", "haben", 
                    "sein", "werden", "kann", "könnte", "möchte", "muss", "darf", "soll"}
        return word.lower() in german_words

    def _is_english_word(self, word: str) -> bool:
        # List of common English words to verify
        english_words = {"the", "a", "an", "in", "on", "at", "to", "for", "with", "by"}
        return word.lower() in english_words


    def generate_enhanced_ssml(
        self,
        text: Optional[str] = None,
        word_pairs: Optional[list[tuple[str, str, bool]]] = None,
        source_lang: str = "de",
        target_lang: str = "es",
    ) -> str:
        """Generate SSML with proper phrase handling for both German and English.

        Args:
            text: up to 8 newline-separated sentence variants, in the fixed
                order unpacked below (4 German variants, then 4 English).
            word_pairs: (source, target, is_german) triples; the boolean
                routes each pair to the German or English sections.
            source_lang / target_lang: NOTE(review) currently unused -- the
                voices and languages below are hard-coded. Confirm intent.

        Returns:
            A complete <speak> SSML document.
        """
        ssml = """<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="en-US">"""

        if text:
            # Split text into lines and pad to 8 elements
            sentences = (text.split("\n") + [""] * 8)[:8]
            # XML-escape each sentence before embedding it in markup.
            sentences = [t.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;") 
                        for t in sentences]

            # Destructure sentences
            (german_native, german_colloquial, german_informal, german_formal,
            english_native, english_colloquial, english_informal, english_formal) = sentences

            # NOTE(review): when word_pairs is empty or None, none of the
            # sentence sections are emitted either (the whole body below is
            # gated on word_pairs) -- confirm this is intended.
            if word_pairs:
                # Separate pairs with language flag
                german_pairs = [(src, tgt) for src, tgt, is_german in word_pairs if is_german]
                english_pairs = [(src, tgt) for src, tgt, is_german in word_pairs if not is_german]

                # German Sections
                if german_native:
                    ssml += self._generate_language_section(
                        german_native, german_pairs,
                        voice="de-DE-SeraphinaMultilingualNeural",
                        lang="de-DE"
                    )

                if german_colloquial:
                    ssml += self._generate_language_section(
                        german_colloquial, german_pairs,
                        voice="de-DE-SeraphinaMultilingualNeural",
                        lang="de-DE"
                    )

                if german_informal:
                    ssml += self._generate_language_section(
                        german_informal, german_pairs,
                        voice="de-DE-KatjaNeural",
                        lang="de-DE"
                    )

                if german_formal:
                    ssml += self._generate_language_section(
                        german_formal, german_pairs,
                        voice="de-DE-SeraphinaMultilingualNeural",
                        lang="de-DE"
                    )

                # English Sections
                if english_native:
                    ssml += self._generate_language_section(
                        english_native, english_pairs,
                        voice="en-US-JennyMultilingualNeural",
                        lang="en-US"
                    )

                if english_colloquial:
                    ssml += self._generate_language_section(
                        english_colloquial, english_pairs,
                        voice="en-US-JennyMultilingualNeural",
                        lang="en-US"
                    )

                if english_informal:
                    ssml += self._generate_language_section(
                        english_informal, english_pairs,
                        voice="en-US-JennyNeural",
                        lang="en-US"
                    )

                if english_formal:
                    ssml += self._generate_language_section(
                        english_formal, english_pairs,
                        voice="en-US-JennyMultilingualNeural",
                        lang="en-US"
                    )

        # Final cleanup of SSML: collapse consecutive 500ms breaks into one.
        ssml = re.sub(r'(<break time="500ms"\s*/>\s*)+', '<break time="500ms"/>', ssml)
        ssml += "</speak>"
        return ssml

    def _generate_language_section(
        self,
        sentence: str,
        word_pairs: list[tuple[str, str]],
        voice: str,
        lang: str
    ) -> str:
        """Generate complete language section with phrase handling"""
        section = f"""
        <voice name="{voice}">
            <prosody rate="1.0">
                <lang xml:lang="{lang}">{sentence}</lang>
                <break time="1000ms"/>
            </prosody>
        </voice>"""

        if word_pairs:
            section += """
        <voice name="en-US-JennyMultilingualNeural">
            <prosody rate="0.8">"""
            
            # Create phrase map and sort by phrase length
            phrase_map = {src.lower(): (src, tgt) for src, tgt in word_pairs}
            phrases = sorted(phrase_map.keys(), key=lambda x: len(x.split()), reverse=True)
            words = sentence.split()
            index = 0
            
            while index < len(words):
                matched = False
                
                # Try to match multi-word phrases first
                for phrase_key in phrases:
                    phrase_words = phrase_key.split()
                    if index + len(phrase_words) > len(words):
                        continue
                    
                    candidate = ' '.join(words[index:index+len(phrase_words)]).lower()
                    if candidate == phrase_key:
                        original_phrase, translation = phrase_map[phrase_key]
                        section += f"""
            <lang xml:lang="{lang}">{original_phrase}</lang>
            <break time="300ms"/>
            <lang xml:lang="es-ES">{translation}</lang>
            <break time="500ms"/>"""
                        index += len(phrase_words)
                        matched = True
                        break
                        
                # Single word fallback
                if not matched:
                    word = words[index].strip(".,!?")
                    translation = next((tgt for src, tgt in word_pairs if src.lower() == word.lower()), None)
                    section += f"""
            <lang xml:lang="{lang}">{word}</lang>
            <break time="300ms"/>"""
                    if translation:
                        section += f"""
            <lang xml:lang="es-ES">{translation}</lang>
            <break time="500ms"/>"""
                    else:
                        section += """<break time="500ms"/>"""
                    index += 1

            section += """
            <break time="1000ms"/>
            </prosody>
        </voice>"""
        
        return section

    def _generate_sentence_section(
        self,
        sentence: str,
        word_pairs: list[tuple[str, str]],
        voice: str,
        lang: str,
    ) -> str:
        if not sentence:
            return ""
        
        # Generate the main sentence SSML
        ssml = f"""
            <voice name="{voice}">
                <prosody rate="1.0">
                    <lang xml:lang="{lang}">{sentence}</lang>
                    <break time="1000ms"/>
                </prosody>
            </voice>"""
        
        if word_pairs:
            ssml += """
                <voice name="en-US-JennyMultilingualNeural">
                    <prosody rate="0.8">"""
            
            # Create phrase map and sort by phrase length (longest first)
            phrase_map = {src.lower(): (src, tgt) for src, tgt in word_pairs}
            phrases = sorted(phrase_map.keys(), key=lambda x: len(x.split()), reverse=True)
            words = sentence.split()
            index = 0
            
            while index < len(words):
                matched = False
                
                # Try to match multi-word phrases first
                for phrase_key in phrases:
                    phrase_words = phrase_key.split()
                    phrase_len = len(phrase_words)
                    
                    if index + phrase_len <= len(words):
                        current_phrase = ' '.join(words[index:index+phrase_len]).lower()
                        if current_phrase == phrase_key:
                            original_phrase, translation = phrase_map[phrase_key]
                            ssml += f"""
                                <lang xml:lang="{lang}">{original_phrase}</lang>
                                <break time="300ms"/>
                                <lang xml:lang="es-ES">{translation}</lang>
                                <break time="500ms"/>"""
                            index += phrase_len
                            matched = True
                            break
                            
                # Fallback to single-word matching
                if not matched:
                    current_word = words[index].strip(".,!?").lower()
                    original_word = words[index]
                    translation = next((tgt for src, tgt in word_pairs if src.lower() == current_word), None)
                    
                    ssml += f"""
                        <lang xml:lang="{lang}">{original_word}</lang>
                        <break time="300ms"/>"""
                    if translation:
                        ssml += f"""
                            <lang xml:lang="es-ES">{translation}</lang>
                            <break time="500ms"/>"""
                    else:
                        ssml += """<break time="500ms"/>"""
                    
                    index += 1
            
            ssml += """
                        <break time="1000ms"/>
                    </prosody>
                </voice>"""
        
        return ssml


In translation_service.py, I updated the code as follows while trying to find a solution:



    def _format_for_tts(self, word_pairs: list[tuple[str, str]], source_lang: str, target_lang: str) -> str:
        lang_map = {
            'en': 'en-US',
            'de': 'de-DE',
            'es': 'es-ES'
        }

        # Make sure to use the correct source language code for each word
        ssml = """<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="en-US">
        <voice name="en-US-JennyMultilingualNeural">"""

        for source_word, target_word in word_pairs:
            source_word = source_word.strip()
            target_word = target_word.strip()
            
            # Use the correct source language code based on the source_lang parameter
            source_lang_code = lang_map.get(source_lang, 'en-US')
            target_lang_code = lang_map.get(target_lang, 'es-ES')

            ssml += f"""
            <lang xml:lang="{source_lang_code}">{source_word}</lang>
            <break time="500ms"/>
            <lang xml:lang="{target_lang_code}">{target_word}</lang>
            <break time="500ms"/>"""

        ssml += """
        </voice>
    </speak>"""
        return ssml

    async def process_prompt(self, text: str, source_lang: str, target_lang: str) -> Translation:
        """Translate ``text`` via the Gemini chat session and synthesize audio.

        Sends the prompt to the configured chat session, extracts translations
        and word pairs from the generated text, then asks the TTS service for
        audio (word-pair mode when pairs were found, plain SSML otherwise).

        Args:
            text: The user's input phrase or sentence.
            source_lang: Short source-language code (e.g. ``'de'``).
            target_lang: Short target-language code (e.g. ``'es'``).

        Returns:
            A ``Translation`` with the generated text, optional audio path,
            word-by-word breakdown, and grammar explanations.

        Raises:
            Exception: Wraps any failure during generation, extraction, or
                audio synthesis (original exception attached as ``__cause__``).
        """
        try:
            response = self.chat_session.send_message(text)
            generated_text = response.text

            print(f"Generated text from Gemini: {generated_text[:100]}...")

            translations, word_pairs = self._extract_text_and_pairs(generated_text)

            audio_filename = None

            if translations and word_pairs:
                # Preferred path: word-pair-aware synthesis for correct
                # per-word pronunciation across languages.
                audio_filename = await self.tts_service.text_to_speech_word_pairs(
                    word_pairs=word_pairs,
                    source_lang=source_lang,
                    target_lang=target_lang,
                    complete_text="\n".join(translations)
                )
            elif translations:
                # Fallback: synthesize the full translated sentences only.
                formatted_ssml = self.tts_service.generate_enhanced_ssml(
                    text="\n".join(translations),
                    source_lang=source_lang,
                    target_lang=target_lang
                )
                audio_filename = await self.tts_service.text_to_speech(formatted_ssml)

            if audio_filename:
                print(f"Successfully generated audio: {audio_filename}")
            else:
                print("Audio generation failed")

            return Translation(
                original_text=text,
                translated_text=generated_text,
                source_language=source_lang,
                target_language=target_lang,
                audio_path=audio_filename if audio_filename else None,
                translations={"main": translations[0] if translations else generated_text},
                word_by_word=self._generate_word_by_word(text, generated_text),
                grammar_explanations=self._generate_grammar_explanations(generated_text)
            )

        except Exception as e:
            print(f"Error in process_prompt: {str(e)}")
            # BUG FIX: chain the original exception so the full traceback
            # is preserved for debugging (previously it was discarded).
            raise Exception(f"Translation processing failed: {str(e)}") from e


    def _extract_text_and_pairs(self, generated_text: str) -> tuple[list[str], list[tuple[str, str, bool]]]:
        """
        Extract both native, colloquial, informal, and formal texts and word pairs from generated text.
        Returns: tuple of ([texts], [(source_word, target_word, is_german)])
        """
        translations = []
        word_pairs = []
        
        # Patterns for German translations
        german_patterns = [
            {
                'text_pattern': r'German Translation:.*?\* Conversational-native:\s*"([^"]+)"',
                'pairs_pattern': r'\* word by word Conversational-native German-Spanish:\s*"([^"]+)"',
                'is_german': True
            },
            {
                'text_pattern': r'\* Conversational-colloquial:\s*"([^"]+)"',
                'pairs_pattern': r'\* word by word Conversational-colloquial German-Spanish:\s*"([^"]+)"',
                'is_german': True
            },
            {
                'text_pattern': r'\* Conversational-informal:\s*"([^"]+)"',
                'pairs_pattern': r'\* word by word Conversational-informal German-Spanish:\s*"([^"]+)"',
                'is_german': True
            },
            {
                'text_pattern': r'\* Conversational-formal:\s*"([^"]+)"',
                'pairs_pattern': r'\* word by word Conversational-formal German-Spanish:\s*"([^"]+)"',
                'is_german': True
            }
        ]
        
        # Patterns for English translations
        english_patterns = [
            {
                'text_pattern': r'English Translation:.*?\* Conversational-native:\s*"([^"]+)"',
                'pairs_pattern': r'\* word by word Conversational-native English-Spanish:\s*"([^"]+)"',
                'is_german': False
            },
            {
                'text_pattern': r'English Translation:.*?\* Conversational-colloquial:\s*"([^"]+)"',
                'pairs_pattern': r'\* word by word Conversational-colloquial English-Spanish:\s*"([^"]+)"',
                'is_german': False
            },
            {
                'text_pattern': r'English Translation:.*?\* Conversational-informal:\s*"([^"]+)"',
                'pairs_pattern': r'\* word by word Conversational-informal English-Spanish:\s*"([^"]+)"',
                'is_german': False
            },
            {
                'text_pattern': r'English Translation:.*?\* Conversational-formal:\s*"([^"]+)"',
                'pairs_pattern': r'\* word by word Conversational-formal English-Spanish:\s*"([^"]+)"',
                'is_german': False
            }
        ]
        
        # Combine patterns
        all_patterns = german_patterns + english_patterns
        
        # Extract translations and word pairs
        for pattern_set in all_patterns:
            # Extract text
            text_match = re.search(pattern_set['text_pattern'], generated_text, re.DOTALL | re.IGNORECASE)
            if text_match:
                translations.append(text_match.group(1).strip())
            
            # Extract word pairs
            pairs_match = re.search(pattern_set['pairs_pattern'], generated_text, re.IGNORECASE)
            if pairs_match:
                pairs_text = pairs_match.group(1)
                # More robust word pair extraction
                pair_matches = re.findall(r'(\S+)\s*\(([^)]+)\)', pairs_text)
                for source, target in pair_matches:
                    source = source.strip()
                    target = target.strip()
                    if source and target:
                        word_pairs.append((source, target, pattern_set['is_german']))
        
        # Remove duplicates while preserving order
        seen_pairs = set()
        unique_pairs = []
        for pair in word_pairs:
            pair_tuple = (pair[0], pair[1], pair[2])
            if pair_tuple not in seen_pairs:
                seen_pairs.add(pair_tuple)
                unique_pairs.append(pair)
        
        return translations, unique_pairs

发布评论

评论列表(0)

  1. 暂无评论