I know I could use a custom-trained Tacotron model and a better vocoder, but are there other ways to make the voice clearer and higher quality?
Here’s the code I’m currently working with:
import torch
import torchaudio
import IPython
import matplotlib.pyplot as plt
from IPython.display import display
import soundfile as sf
import os
class TextToSpeech:
    """Text-to-speech wrapper around torchaudio's Tacotron2 + WaveRNN pipeline.

    Loads the phoneme-based LJSpeech bundle once at construction and exposes
    helpers for synthesis, pitch adjustment, notebook plotting/playback, and
    saving the result to a WAV file.
    """

    def __init__(self):
        # Fixed seed so repeated syntheses of the same text are reproducible.
        torch.random.manual_seed(0)
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_PHONE_LJSPEECH
        self.processor = self.bundle.get_text_processor()
        self.tacotron2 = self.bundle.get_tacotron2().to(self.device)
        self.vocoder = self.bundle.get_vocoder().to(self.device)

    def text_to_speech(self, text, pitch=1.0):
        """Synthesize *text* into an audio waveform.

        Args:
            text: Input string to synthesize.
            pitch: Pitch factor passed to ``adjust_pitch``; 1.0 leaves the
                audio untouched.

        Returns:
            Tuple of (waveforms, spectrogram, sample_rate). Note the returned
            sample rate is the vocoder's native rate even when ``pitch`` has
            resampled the waveform — playing the resampled audio at the
            native rate is exactly what produces the pitch shift.
        """
        with torch.inference_mode():
            processed, lengths = self.processor(text)
            processed = processed.to(self.device)
            lengths = lengths.to(self.device)
            spec, spec_lengths, _ = self.tacotron2.infer(processed, lengths)
            waveforms, lengths = self.vocoder(spec, spec_lengths)
            waveforms = self.adjust_pitch(waveforms, pitch)
            return waveforms, spec, self.vocoder.sample_rate

    def adjust_pitch(self, waveforms, pitch):
        """Shift pitch by resampling while keeping the playback rate.

        Resampling to ``sample_rate * pitch`` but playing back at the
        original rate changes both speed and pitch: pitch < 1.0 yields fewer
        samples, so playback sounds faster and higher-pitched (this is why a
        *lower* value raises the pitch). Returns *waveforms* unchanged when
        ``pitch == 1.0``.
        """
        if pitch != 1.0:
            waveforms = torchaudio.transforms.Resample(
                orig_freq=self.vocoder.sample_rate,
                new_freq=int(self.vocoder.sample_rate * pitch),
            )(waveforms)
        return waveforms

    def plot_waveform_and_spectrogram(self, waveforms, spec, sample_rate):
        """Plot the waveform and spectrogram; return an IPython Audio widget."""
        waveforms = waveforms.cpu().detach()
        fig, [ax1, ax2] = plt.subplots(2, 1)
        ax1.plot(waveforms[0])
        ax1.set_xlim(0, waveforms.size(-1))
        ax1.grid(True)
        ax2.imshow(spec[0].cpu().detach(), origin="lower", aspect="auto")
        return IPython.display.Audio(waveforms[0:1], rate=sample_rate)

    def save_waveform_to_file(self, waveforms, sample_rate, filename="output.wav"):
        """Write *waveforms* to *filename* and open it in the default player.

        The auto-open uses the Windows ``start`` command and is a no-op /
        error on other platforms.
        """
        waveforms = waveforms.cpu().detach().numpy()
        sf.write(filename, waveforms.T, sample_rate)
        # BUG FIX: previously interpolated a literal placeholder instead of
        # the actual filename. The empty "" is start's window-title argument,
        # so filenames containing spaces can be safely quoted.
        os.system(f'start "" "{filename}"')
# Example usage
if __name__ == "__main__":
    tts = TextToSpeech()
    sample_text = "Hello world"
    # Pitch factor: values below 1.0 raise the perceived pitch (the waveform
    # is resampled shorter but played back at the original rate); 0.85 sounds good.
    pitch_factor = 0.85
    waveforms, spec, sample_rate = tts.text_to_speech(sample_text, pitch_factor)
    audio_widget = tts.plot_waveform_and_spectrogram(waveforms, spec, sample_rate)
    display(audio_widget)
    tts.save_waveform_to_file(waveforms, sample_rate)
I've tried changing the architecture of the code, and this version seems to run best.