I'm working on an app using Azure, Gemini, Python, and Dart, and I want to make sure the pronunciation between languages is spot on. For example, I want to translate between German and Spanish: the goal is for 'hallo' -> 'hola' to be pronounced correctly in both languages. The same goes for English and Spanish 'hello' -> 'hola'. Azure does well with sentences, but struggles with word-for-word translations.
Here's my code:
- translation_service.py
import os
import re

from dotenv import load_dotenv
import google.generativeai as genai
from google.generativeai import GenerativeModel

# Translation, EnhancedTTSService, and the spell checker come from the app's
# own modules (omitted here).

class TranslationService:
def __init__(self):
load_dotenv()
api_key = os.getenv("GEMINI_API_KEY")
if not api_key:
raise ValueError("GEMINI_API_KEY not found in environment variables")
genai.configure(api_key=api_key)
self.generation_config = {
"temperature": 1,
"top_p": 0.95,
"top_k": 40,
"max_output_tokens": 8192,
"response_mime_type": "text/plain",
}
self.model = GenerativeModel(
model_name="gemini-2.0-flash-exp",
generation_config=self.generation_config
)
self.tts_service = EnhancedTTSService()
# Initialize chat session with translation instructions
self.chat_session = self.model.start_chat(
history=[
{
"role": "user",
"parts": [
"""
Text
"
(Could be any phrase or word)
"
German Translation:
Conversational-native:
"Ich suche einen Job, damit ich finanziell unabhängig sein kann."
word by word Conversational-native German-Spanish:
"Ich (Yo) suche (busco) einen (un) Job (trabajo), damit (para que) ich (yo) finanziell (económicamente) unabhängig (independiente) sein (ser) kann (pueda)."
English Translation:
Conversational-native:
"I'm looking for a job so I can be financially independent."
word by word Conversational-native English-Spanish:
"I'm (Yo estoy) looking for (buscando) a job (un trabajo) so (para que) I (yo) can be (pueda ser) financially (económicamente) independent (independiente)."
"""
]
}
]
)
def _restore_accents(self, text: str) -> str:
"""Restore proper accents and special characters."""
accent_map = {
"a": "á", "e": "é", "i": "í", "o": "ó", "u": "ú", "n": "ñ",
"A": "Á", "E": "É", "I": "Í", "O": "Ó", "U": "Ú", "N": "Ñ",
}
patterns = {
r"([aeiou])´": lambda m: accent_map[m.group(1)],
r"([AEIOU])´": lambda m: accent_map[m.group(1)],
r"n~": "ñ",
r"N~": "Ñ",
}
for pattern, replacement in patterns.items():
    # re.sub accepts either a string or a callable as the replacement
    text = re.sub(pattern, replacement, text)
return text
async def process_prompt(self, text: str, source_lang: str, target_lang: str) -> Translation:
try:
response = self.chat_session.send_message(text)
generated_text = response.text
print(f"Generated text from Gemini: {generated_text[:100]}...")
audio_filename = await self.tts_service.text_to_speech(
text=generated_text
)
if audio_filename:
print(f"Successfully generated audio: {audio_filename}")
else:
print("Audio generation failed")
return Translation(
original_text=text,
translated_text=generated_text,
source_language=source_lang,
target_language=target_lang,
audio_path=audio_filename,
translations={"main": generated_text},
word_by_word=self._generate_word_by_word(text, generated_text),
grammar_explanations=self._generate_grammar_explanations(generated_text)
)
except Exception as e:
print(f"Error in process_prompt: {str(e)}")
raise Exception(f"Translation processing failed: {str(e)}")
def _generate_word_by_word(self, original: str, translated: str) -> dict[str, dict[str, str]]:
"""Generate word-by-word translation mapping."""
result = {}
original_words = original.split()
translated_words = translated.split()
for i, word in enumerate(original_words):
if i < len(translated_words):
result[word] = {
"translation": translated_words[i],
"pos": "unknown",
}
return result
def _auto_fix_spelling(self, text: str) -> str:
"""Fix spelling in the given text."""
words = re.findall(r"\b\w+\b|[^\w\s]", text)
corrected_words = []
for word in words:
if not re.match(r"\w+", word):
corrected_words.append(word)
continue
if self.spell.unknown([word]):
correction = self.spell.correction(word)
if correction:
if word.isupper():
correction = correction.upper()
elif word[0].isupper():
correction = correction.capitalize()
word = correction
corrected_words.append(word)
return " ".join(corrected_words)
- tts_service.py
from azure.cognitiveservices.speech import (
    SpeechConfig,
    SpeechSynthesizer,
    SpeechSynthesisOutputFormat,
    ResultReason,
)
from azure.cognitiveservices.speech.audio import AudioOutputConfig
import os
from typing import Optional
from datetime import datetime
import asyncio
import re
class EnhancedTTSService:
def __init__(self):
# Initialize Speech Config
self.subscription_key = os.getenv("AZURE_SPEECH_KEY")
self.region = os.getenv("AZURE_SPEECH_REGION")
if not self.subscription_key or not self.region:
raise ValueError("Azure Speech credentials not found in environment variables")
# Create speech config
self.speech_config = SpeechConfig(
subscription=self.subscription_key,
region=self.region
)
self.speech_config.set_speech_synthesis_output_format(
SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3
)
# Voice mapping with specific styles and roles
self.voice_mapping = {
'en': 'en-US-JennyMultilingualNeural',
'es': 'es-ES-ArabellaMultilingualNeural',
'de': 'de-DE-SeraphinaMultilingualNeural'
}
def _get_temp_directory(self) -> str:
"""Create and return the temporary directory path"""
if os.name == 'nt': # Windows
temp_dir = os.path.join(os.environ.get('TEMP', ''), 'tts_audio')
else: # Unix/Linux
temp_dir = '/tmp/tts_audio'
os.makedirs(temp_dir, exist_ok=True)
return temp_dir
def _detect_language(self, text: str) -> str:
"""Detect the primary language of the text"""
# Simple language detection based on character patterns
if re.search(r'[äöüßÄÖÜ]', text):
return 'de'
elif re.search(r'[áéíóúñ¿¡]', text):
return 'es'
return 'en'
def _generate_ssml(self, text: str) -> str:
"""Generate valid SSML with proper escaping and language tags"""
# Clean the text
text = text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
# Detect primary language
primary_lang = self._detect_language(text)
voice_name = self.voice_mapping.get(primary_lang, self.voice_mapping['en'])
ssml = f"""<?xml version='1.0'?>
<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='{primary_lang}'>
<voice name='{voice_name}'>
<prosody rate="0.95" pitch="0%">
{text}
</prosody>
</voice>
</speak>"""
return ssml
async def text_to_speech(self, text: str, output_path: Optional[str] = None) -> Optional[str]:
"""Convert text to speech with robust error handling"""
synthesizer = None
try:
print(f"\nStarting TTS process for text: {text[:100]}...") # First 100 chars
# Generate output path if not provided
if not output_path:
temp_dir = self._get_temp_directory()
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
output_path = os.path.join(temp_dir, f"speech_{timestamp}.mp3")
# Configure audio output
audio_config = AudioOutputConfig(filename=output_path)
# Create synthesizer for this request
synthesizer = SpeechSynthesizer(
speech_config=self.speech_config,
audio_config=audio_config
)
# Generate and validate SSML
ssml = self._generate_ssml(text)
print(f"Generated SSML length: {len(ssml)} characters")
# Perform synthesis
print("Starting speech synthesis...")
result = await asyncio.get_event_loop().run_in_executor(
None,
lambda: synthesizer.speak_ssml_async(ssml).get()
)
# Handle result
if result.reason == ResultReason.SynthesizingAudioCompleted:
print("Speech synthesis completed successfully")
return os.path.basename(output_path)
elif result.reason == ResultReason.Canceled:
print(f"Speech synthesis canceled: {result.cancellation_details.reason}")
print(f"Error details: {result.cancellation_details.error_details}")
return None
return None
except Exception as e:
print(f"Exception in text_to_speech: {str(e)}")
return None
finally:
# Proper cleanup
if synthesizer:
try:
synthesizer.stop_speaking_async()
except:
pass
This is an example of how the correct pronunciation should sound:
German-Spanish (hello example) (this is the desired output with the correct word-for-word pronunciation)
https://jmp.sh/s/8sftiJ01aUreR3LDYRWn
English-Spanish (hello example) (this is the desired output with the correct word-for-word pronunciation)
https://jmp.sh/s/9MM1LqTqGH1CvddGhA1l
Now let’s do a word-for-word translation, where we’ll focus on pronouncing the Spanish "ñ," "h," and "ll" properly.
Here’s the Spanish sentence:
"Jugo de piña para la niña y jugo de mora para la señora porque están en el hospital y afuera está lloviendo."
Translation:
"I got pineapple juice for the girl and blackberry juice for the lady because they’re in the hospital and it’s raining outside."
German-Spanish (this is the desired output with the correct word-for-word pronunciation)
https://jmp.sh/s/aRFlpZc99Dw18Uexi8uS
English-Spanish (this is the desired output with the correct word-for-word pronunciation)
https://jmp.sh/eY9ZhlTi
Currently I get this pronunciation with the same examples:
German-Spanish and English-Spanish (hello example) (which is incorrect because the word-for-word pronunciation is not accurate)
https://jmp.sh/iExSVBGk
Let’s go back to the word-for-word breakdown, again emphasizing Spanish pronunciation for the tricky letters:
"ñ" (sounds like “ny” in canyon, e.g., piña, niña) "h" (silent in Spanish, e.g., hospital) "ll" (varies regionally but often sounds like “y” in yes, e.g., lloviendo). So here’s the sentence again:
"Jugo de piña para la niña y jugo de mora para la señora porque están en el hospital y afuera está lloviendo."
Translation:
"I got pineapple juice for the girl and blackberry juice for the lady because they’re in the hospital and it’s raining outside."
German-Spanish and English-Spanish (which is incorrect because the word-for-word pronunciation is not accurate)
https://jmp.sh/PxKHNWjx
The tts_service.py code above is the service I use with Azure.
I’ve tried the 'langid' library, but it seems like it doesn’t work for me. My goal is to be able to hear the correct pronunciation of the English-Spanish and German-Spanish word pairs during word-for-word translation.
Thank you.
1 Answer
The key was explicit language tagging in SSML combined with strategic pauses. Azure TTS needs clear language context for each word or phrase, especially when mixing languages. Here's the implementation:
def generate_german_spanish_wordforword_ssml(
self,
word_pairs: list[tuple[str, str]],
) -> str:
"""Generate SSML specifically for German-Spanish word-by-word translations"""
ssml = """
<voice name="en-US-JennyMultilingualNeural">
<prosody rate="0.8">"""
for source_word, target_word in word_pairs:
source_word = source_word.strip().replace("&", "&amp;")
target_word = target_word.strip().replace("&", "&amp;")
ssml += f"""
<lang xml:lang="de-DE">{source_word}</lang>
<break time="300ms"/>
<lang xml:lang="es-ES">{target_word}</lang>
<break time="500ms"/>"""
ssml += """
<break time="1000ms"/>
</prosody>
</voice>"""
return ssml
def generate_english_spanish_wordforword_ssml(
self,
word_pairs: list[tuple[str, str]],
) -> str:
"""Generate SSML specifically for English-Spanish word-by-word translations"""
ssml = """
<voice name="en-US-JennyMultilingualNeural">
<prosody rate="0.8">"""
for source_word, target_word in word_pairs:
source_word = source_word.strip().replace("&", "&amp;")
target_word = target_word.strip().replace("&", "&amp;")
ssml += f"""
<lang xml:lang="en-US">{source_word}</lang>
<break time="300ms"/>
<lang xml:lang="es-ES">{target_word}</lang>
<break time="500ms"/>"""
ssml += """
<break time="1000ms"/>
</prosody>
</voice>"""
return ssml
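For reference, here is a rough usage sketch for the 'hallo' -> 'hola' example. It assumes these generators live on EnhancedTTSService (the class isn't pinned down above) and that Azure credentials are set in the environment; the outer <speak> wrapper is added here because the fragments only cover the <voice> element:

# Hypothetical usage sketch; names outside the functions above are illustrative.
tts = EnhancedTTSService()
fragment = tts.generate_german_spanish_wordforword_ssml([("hallo", "hola")])
ssml = (
    '<speak version="1.0" '
    'xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="en-US">'
    + fragment
    + "</speak>"
)
# The result alternates <lang xml:lang="de-DE"> and <lang xml:lang="es-ES">
# spans separated by <break> pauses, which is what gives each word its
# native pronunciation.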
Then, I modified the translation service to parse Gemini's output into clean word pairs:
def _extract_word_pairs(self, text: str) -> list[tuple[str, str]]:
word_pairs = []
word_by_word_pattern = r'\* word by word.*?\n"([^"]+)"'
word_by_word_match = re.search(word_by_word_pattern, text, re.DOTALL)
if word_by_word_match:
word_by_word_text = word_by_word_match.group(1)
# Improved regex to capture multi-word phrases including those with apostrophes
parts = re.findall(r'([^()]+?)\s*\(([^)]+)\)', word_by_word_text)
for source, target in parts:
# Clean and normalize both phrases
source = re.sub(r'\s+', ' ', source.strip().replace("'", ""))
target = target.strip()
if source and target:
word_pairs.append((source, target))
return word_pairs
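To illustrate what the parser produces, here is a hypothetical check; the sample string just mimics the "word by word" format from the Gemini prompt (it is not real model output), and `svc` is assumed to be a TranslationService instance with the method above:

# Hypothetical input mimicking the prompt's "word by word" line format.
sample = '''* word by word Conversational-native German-Spanish:
"Ich (Yo) suche (busco) einen (un) Job (trabajo)"'''
pairs = svc._extract_word_pairs(sample)
# pairs -> [('Ich', 'Yo'), ('suche', 'busco'), ('einen', 'un'), ('Job', 'trabajo')]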
Then I updated tts_service.py and added new code:
def _is_german_word(self, word: str) -> bool:
# List of common German words that might appear in the English section
german_words = {"dir", "ich", "du", "sie", "er", "es", "wir", "ihr", "ist", "sind", "haben",
"sein", "werden", "kann", "könnte", "möchte", "muss", "darf", "soll"}
return word.lower() in german_words
def _is_english_word(self, word: str) -> bool:
# List of common English words to verify
english_words = {"the", "a", "an", "in", "on", "at", "to", "for", "with", "by"}
return word.lower() in english_words
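# Optional variant (an assumption, not part of the approach above): the langid
# library mentioned in the question can be restricted to the app's three
# languages and used instead of hard-coded word lists. Single isolated words
# give langid little signal, so the word lists above remain a useful fallback.
def _detect_word_language(self, word: str) -> str:
    import langid  # assumes `pip install langid`
    langid.set_languages(['de', 'en', 'es'])  # restrict the candidate set
    lang, _score = langid.classify(word)      # returns (language_code, score)
    return lang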
def generate_enhanced_ssml(
self,
text: Optional[str] = None,
word_pairs: Optional[list[tuple[str, str, bool]]] = None,
source_lang: str = "de",
target_lang: str = "es",
) -> str:
"""Generate SSML with proper phrase handling for both German and English"""
ssml = """<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="en-US">"""
if text:
# Split text into lines and pad to 8 elements
sentences = (text.split("\n") + [""] * 8)[:8]
sentences = [t.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
for t in sentences]
# Destructure sentences
(german_native, german_colloquial, german_informal, german_formal,
english_native, english_colloquial, english_informal, english_formal) = sentences
if word_pairs:
# Separate pairs with language flag
german_pairs = [(src, tgt) for src, tgt, is_german in word_pairs if is_german]
english_pairs = [(src, tgt) for src, tgt, is_german in word_pairs if not is_german]
# German Sections
if german_native:
ssml += self._generate_language_section(
german_native, german_pairs,
voice="de-DE-SeraphinaMultilingualNeural",
lang="de-DE"
)
if german_colloquial:
ssml += self._generate_language_section(
german_colloquial, german_pairs,
voice="de-DE-SeraphinaMultilingualNeural",
lang="de-DE"
)
if german_informal:
ssml += self._generate_language_section(
german_informal, german_pairs,
voice="de-DE-KatjaNeural",
lang="de-DE"
)
if german_formal:
ssml += self._generate_language_section(
german_formal, german_pairs,
voice="de-DE-SeraphinaMultilingualNeural",
lang="de-DE"
)
# English Sections
if english_native:
ssml += self._generate_language_section(
english_native, english_pairs,
voice="en-US-JennyMultilingualNeural",
lang="en-US"
)
if english_colloquial:
ssml += self._generate_language_section(
english_colloquial, english_pairs,
voice="en-US-JennyMultilingualNeural",
lang="en-US"
)
if english_informal:
ssml += self._generate_language_section(
english_informal, english_pairs,
voice="en-US-JennyNeural",
lang="en-US"
)
if english_formal:
ssml += self._generate_language_section(
english_formal, english_pairs,
voice="en-US-JennyMultilingualNeural",
lang="en-US"
)
# Final cleanup of SSML
ssml = re.sub(r'(<break time="500ms"\s*/>\s*)+', '<break time="500ms"/>', ssml)
ssml += "</speak>"
return ssml
def _generate_language_section(
self,
sentence: str,
word_pairs: list[tuple[str, str]],
voice: str,
lang: str
) -> str:
"""Generate complete language section with phrase handling"""
section = f"""
<voice name="{voice}">
<prosody rate="1.0">
<lang xml:lang="{lang}">{sentence}</lang>
<break time="1000ms"/>
</prosody>
</voice>"""
if word_pairs:
section += """
<voice name="en-US-JennyMultilingualNeural">
<prosody rate="0.8">"""
# Create phrase map and sort by phrase length
phrase_map = {src.lower(): (src, tgt) for src, tgt in word_pairs}
phrases = sorted(phrase_map.keys(), key=lambda x: len(x.split()), reverse=True)
words = sentence.split()
index = 0
while index < len(words):
matched = False
# Try to match multi-word phrases first
for phrase_key in phrases:
phrase_words = phrase_key.split()
if index + len(phrase_words) > len(words):
continue
candidate = ' '.join(words[index:index+len(phrase_words)]).lower()
if candidate == phrase_key:
original_phrase, translation = phrase_map[phrase_key]
section += f"""
<lang xml:lang="{lang}">{original_phrase}</lang>
<break time="300ms"/>
<lang xml:lang="es-ES">{translation}</lang>
<break time="500ms"/>"""
index += len(phrase_words)
matched = True
break
# Single word fallback
if not matched:
word = words[index].strip(".,!?")
translation = next((tgt for src, tgt in word_pairs if src.lower() == word.lower()), None)
section += f"""
<lang xml:lang="{lang}">{word}</lang>
<break time="300ms"/>"""
if translation:
section += f"""
<lang xml:lang="es-ES">{translation}</lang>
<break time="500ms"/>"""
else:
section += """<break time="500ms"/>"""
index += 1
section += """
<break time="1000ms"/>
</prosody>
</voice>"""
return section
def _generate_sentence_section(
self,
sentence: str,
word_pairs: list[tuple[str, str]],
voice: str,
lang: str,
) -> str:
if not sentence:
return ""
# Generate the main sentence SSML
ssml = f"""
<voice name="{voice}">
<prosody rate="1.0">
<lang xml:lang="{lang}">{sentence}</lang>
<break time="1000ms"/>
</prosody>
</voice>"""
if word_pairs:
ssml += """
<voice name="en-US-JennyMultilingualNeural">
<prosody rate="0.8">"""
# Create phrase map and sort by phrase length (longest first)
phrase_map = {src.lower(): (src, tgt) for src, tgt in word_pairs}
phrases = sorted(phrase_map.keys(), key=lambda x: len(x.split()), reverse=True)
words = sentence.split()
index = 0
while index < len(words):
matched = False
# Try to match multi-word phrases first
for phrase_key in phrases:
phrase_words = phrase_key.split()
phrase_len = len(phrase_words)
if index + phrase_len <= len(words):
current_phrase = ' '.join(words[index:index+phrase_len]).lower()
if current_phrase == phrase_key:
original_phrase, translation = phrase_map[phrase_key]
ssml += f"""
<lang xml:lang="{lang}">{original_phrase}</lang>
<break time="300ms"/>
<lang xml:lang="es-ES">{translation}</lang>
<break time="500ms"/>"""
index += phrase_len
matched = True
break
# Fallback to single-word matching
if not matched:
current_word = words[index].strip(".,!?").lower()
original_word = words[index]
translation = next((tgt for src, tgt in word_pairs if src.lower() == current_word), None)
ssml += f"""
<lang xml:lang="{lang}">{original_word}</lang>
<break time="300ms"/>"""
if translation:
ssml += f"""
<lang xml:lang="es-ES">{translation}</lang>
<break time="500ms"/>"""
else:
ssml += """<break time="500ms"/>"""
index += 1
ssml += """
<break time="1000ms"/>
</prosody>
</voice>"""
return ssml
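To make the expected inputs of generate_enhanced_ssml concrete, here is a minimal, hypothetical call. The eight-line text layout and the is_german flag follow the code above; the concrete sentences and the `tts` instance are just placeholders:

# Hypothetical call; assumes Azure credentials are set in the environment.
tts = EnhancedTTSService()
# Up to eight newline-separated variants: German native/colloquial/informal/formal,
# then English native/colloquial/informal/formal (missing ones stay empty).
text = "\n".join([
    "Hallo.", "", "", "",
    "Hello.", "", "", "",
])
word_pairs = [
    ("Hallo", "hola", True),   # is_german=True  -> used in the German sections
    ("Hello", "hola", False),  # is_german=False -> used in the English sections
]
ssml = tts.generate_enhanced_ssml(text=text, word_pairs=word_pairs)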
Finally, I updated this code in translation_service.py to complete the solution:
def _format_for_tts(self, word_pairs: list[tuple[str, str]], source_lang: str, target_lang: str) -> str:
lang_map = {
'en': 'en-US',
'de': 'de-DE',
'es': 'es-ES'
}
# Make sure to use the correct source language code for each word
ssml = """<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="en-US">
<voice name="en-US-JennyMultilingualNeural">"""
for source_word, target_word in word_pairs:
source_word = source_word.strip()
target_word = target_word.strip()
# Use the correct source language code based on the source_lang parameter
source_lang_code = lang_map.get(source_lang, 'en-US')
target_lang_code = lang_map.get(target_lang, 'es-ES')
ssml += f"""
<lang xml:lang="{source_lang_code}">{source_word}</lang>
<break time="500ms"/>
<lang xml:lang="{target_lang_code}">{target_word}</lang>
<break time="500ms"/>"""
ssml += """
</voice>
</speak>"""
return ssml
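# As a sanity check (hypothetical call), _format_for_tts([("hallo", "hola")],
# source_lang="de", target_lang="es") produces markup along these lines:
#
#   <speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="en-US">
#     <voice name="en-US-JennyMultilingualNeural">
#       <lang xml:lang="de-DE">hallo</lang>
#       <break time="500ms"/>
#       <lang xml:lang="es-ES">hola</lang>
#       <break time="500ms"/>
#     </voice>
#   </speak>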
async def process_prompt(self, text: str, source_lang: str, target_lang: str) -> Translation:
try:
response = self.chat_session.send_message(text)
generated_text = response.text
print(f"Generated text from Gemini: {generated_text[:100]}...")
translations, word_pairs = self._extract_text_and_pairs(generated_text)
audio_filename = None
if translations and word_pairs:
audio_filename = await self.tts_service.text_to_speech_word_pairs(
word_pairs=word_pairs,
source_lang=source_lang,
target_lang=target_lang,
complete_text="\n".join(translations)
)
elif translations:
formatted_ssml = self.tts_service.generate_enhanced_ssml(
text="\n".join(translations),
source_lang=source_lang,
target_lang=target_lang
)
audio_filename = await self.tts_service.text_to_speech(formatted_ssml)
if audio_filename:
print(f"Successfully generated audio: {audio_filename}")
else:
print("Audio generation failed")
return Translation(
original_text=text,
translated_text=generated_text,
source_language=source_lang,
target_language=target_lang,
audio_path=audio_filename if audio_filename else None,
translations={"main": translations[0] if translations else generated_text},
word_by_word=self._generate_word_by_word(text, generated_text),
grammar_explanations=self._generate_grammar_explanations(generated_text)
)
except Exception as e:
print(f"Error in process_prompt: {str(e)}")
raise Exception(f"Translation processing failed: {str(e)}")
def _extract_text_and_pairs(self, generated_text: str) -> tuple[list[str], list[tuple[str, str, bool]]]:
"""
Extract the native, colloquial, informal, and formal texts and their word pairs from the generated text.
Returns: tuple of ([texts], [(source_word, target_word, is_german)])
"""
translations = []
word_pairs = []
# Patterns for German translations
german_patterns = [
{
'text_pattern': r'German Translation:.*?\* Conversational-native:\s*"([^"]+)"',
'pairs_pattern': r'\* word by word Conversational-native German-Spanish:\s*"([^"]+)"',
'is_german': True
},
{
'text_pattern': r'\* Conversational-colloquial:\s*"([^"]+)"',
'pairs_pattern': r'\* word by word Conversational-colloquial German-Spanish:\s*"([^"]+)"',
'is_german': True
},
{
'text_pattern': r'\* Conversational-informal:\s*"([^"]+)"',
'pairs_pattern': r'\* word by word Conversational-informal German-Spanish:\s*"([^"]+)"',
'is_german': True
},
{
'text_pattern': r'\* Conversational-formal:\s*"([^"]+)"',
'pairs_pattern': r'\* word by word Conversational-formal German-Spanish:\s*"([^"]+)"',
'is_german': True
}
]
# Patterns for English translations
english_patterns = [
{
'text_pattern': r'English Translation:.*?\* Conversational-native:\s*"([^"]+)"',
'pairs_pattern': r'\* word by word Conversational-native English-Spanish:\s*"([^"]+)"',
'is_german': False
},
{
'text_pattern': r'English Translation:.*?\* Conversational-colloquial:\s*"([^"]+)"',
'pairs_pattern': r'\* word by word Conversational-colloquial English-Spanish:\s*"([^"]+)"',
'is_german': False
},
{
'text_pattern': r'English Translation:.*?\* Conversational-informal:\s*"([^"]+)"',
'pairs_pattern': r'\* word by word Conversational-informal English-Spanish:\s*"([^"]+)"',
'is_german': False
},
{
'text_pattern': r'English Translation:.*?\* Conversational-formal:\s*"([^"]+)"',
'pairs_pattern': r'\* word by word Conversational-formal English-Spanish:\s*"([^"]+)"',
'is_german': False
}
]
# Combine patterns
all_patterns = german_patterns + english_patterns
# Extract translations and word pairs
for pattern_set in all_patterns:
# Extract text
text_match = re.search(pattern_set['text_pattern'], generated_text, re.DOTALL | re.IGNORECASE)
if text_match:
translations.append(text_match.group(1).strip())
# Extract word pairs
pairs_match = re.search(pattern_set['pairs_pattern'], generated_text, re.IGNORECASE)
if pairs_match:
pairs_text = pairs_match.group(1)
# More robust word pair extraction
pair_matches = re.findall(r'(\S+)\s*\(([^)]+)\)', pairs_text)
for source, target in pair_matches:
source = source.strip()
target = target.strip()
if source and target:
word_pairs.append((source, target, pattern_set['is_german']))
# Remove duplicates while preserving order
seen_pairs = set()
unique_pairs = []
for pair in word_pairs:
pair_tuple = (pair[0], pair[1], pair[2])
if pair_tuple not in seen_pairs:
seen_pairs.add(pair_tuple)
unique_pairs.append(pair)
return translations, unique_pairs
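For completeness, the regexes in _extract_text_and_pairs assume Gemini replies in a layout like the hypothetical fragment below (the exact wording is driven by the chat prompt, so adjust the patterns if your prompt differs); `svc` is again assumed to be a TranslationService instance:

# Hypothetical Gemini reply that the patterns above can parse.
sample = '''German Translation:
* Conversational-native: "Hallo."
* word by word Conversational-native German-Spanish: "Hallo (Hola)"
English Translation:
* Conversational-native: "Hello."
* word by word Conversational-native English-Spanish: "Hello (Hola)"'''
translations, pairs = svc._extract_text_and_pairs(sample)
# translations -> ['Hallo.', 'Hello.']
# pairs        -> [('Hallo', 'Hola', True), ('Hello', 'Hola', False)]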