I know I could use a custom-trained Tacotron model and a better vocoder, but are there other ways to make the voice clearer and higher quality?
Here’s the code I’m currently working with:
import torch
import torchaudio
import IPython
import matplotlib.pyplot as plt
from IPython.display import display
import soundfile as sf
import os
class TextToSpeech:
    """Text-to-speech wrapper around torchaudio's Tacotron2 + WaveRNN pipeline.

    Loads the phoneme-based LJSpeech bundle once at construction and exposes
    helpers for synthesis, pitch adjustment, notebook plotting/playback, and
    saving the result to a WAV file.
    """

    def __init__(self):
        # Fixed seed so repeated syntheses of the same text are reproducible.
        torch.random.manual_seed(0)
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_PHONE_LJSPEECH
        self.processor = self.bundle.get_text_processor()
        self.tacotron2 = self.bundle.get_tacotron2().to(self.device)
        self.vocoder = self.bundle.get_vocoder().to(self.device)

    def text_to_speech(self, text, pitch=1.0):
        """Synthesize *text* into an audio waveform.

        Args:
            text: Input string to synthesize.
            pitch: Pitch factor passed to ``adjust_pitch``; 1.0 leaves the
                audio untouched.

        Returns:
            Tuple of (waveforms, spectrogram, sample_rate). Note the returned
            sample rate is the vocoder's native rate even when ``pitch`` has
            resampled the waveform — playing the resampled audio at the
            native rate is exactly what produces the pitch shift.
        """
        with torch.inference_mode():
            processed, lengths = self.processor(text)
            processed = processed.to(self.device)
            lengths = lengths.to(self.device)
            spec, spec_lengths, _ = self.tacotron2.infer(processed, lengths)
            waveforms, lengths = self.vocoder(spec, spec_lengths)
            waveforms = self.adjust_pitch(waveforms, pitch)
            return waveforms, spec, self.vocoder.sample_rate

    def adjust_pitch(self, waveforms, pitch):
        """Shift pitch by resampling while keeping the playback rate.

        Resampling to ``sample_rate * pitch`` but playing back at the
        original rate changes both speed and pitch: pitch < 1.0 yields fewer
        samples, so playback sounds faster and higher-pitched (this is why a
        *lower* value raises the pitch). Returns *waveforms* unchanged when
        ``pitch == 1.0``.
        """
        if pitch != 1.0:
            waveforms = torchaudio.transforms.Resample(
                orig_freq=self.vocoder.sample_rate,
                new_freq=int(self.vocoder.sample_rate * pitch),
            )(waveforms)
        return waveforms

    def plot_waveform_and_spectrogram(self, waveforms, spec, sample_rate):
        """Plot the waveform and spectrogram; return an IPython Audio widget."""
        waveforms = waveforms.cpu().detach()
        fig, [ax1, ax2] = plt.subplots(2, 1)
        ax1.plot(waveforms[0])
        ax1.set_xlim(0, waveforms.size(-1))
        ax1.grid(True)
        ax2.imshow(spec[0].cpu().detach(), origin="lower", aspect="auto")
        return IPython.display.Audio(waveforms[0:1], rate=sample_rate)

    def save_waveform_to_file(self, waveforms, sample_rate, filename="output.wav"):
        """Write *waveforms* to *filename* and open it in the default player.

        The auto-open uses the Windows ``start`` command and is a no-op /
        error on other platforms.
        """
        waveforms = waveforms.cpu().detach().numpy()
        sf.write(filename, waveforms.T, sample_rate)
        # BUG FIX: previously interpolated a literal placeholder instead of
        # the actual filename. The empty "" is start's window-title argument,
        # so filenames containing spaces can be safely quoted.
        os.system(f'start "" "{filename}"')
# Example usage
if __name__ == "__main__":
    tts = TextToSpeech()
    sample_text = "Hello world"
    # Pitch factor: values below 1.0 raise the perceived pitch (the waveform
    # is resampled shorter but played back at the original rate); 0.85 sounds good.
    pitch_factor = 0.85
    waveforms, spec, sample_rate = tts.text_to_speech(sample_text, pitch_factor)
    audio_widget = tts.plot_waveform_and_spectrogram(waveforms, spec, sample_rate)
    display(audio_widget)
    tts.save_waveform_to_file(waveforms, sample_rate)
I've tried changing the architecture of the code, and this version seems to run best.