|
import gradio as gr |
|
import torch |
|
import os |
|
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan |
|
from datasets import load_dataset, Audio |
|
import numpy as np |
|
from speechbrain.inference import EncoderClassifier |
|
|
|
|
|
# Text-to-speech stack: the stock SpeechT5 processor and HiFi-GAN vocoder,
# paired with the Hindi fine-tuned acoustic model checkpoint.
processor = SpeechT5Processor.from_pretrained(
    "microsoft/speecht5_tts"
)
model = SpeechT5ForTextToSpeech.from_pretrained(
    "Tejasva-Maurya/Hindi_SpeechT5_finetuned"
)
vocoder = SpeechT5HifiGan.from_pretrained(
    "microsoft/speecht5_hifigan"
)
|
|
|
|
|
# Run the speaker encoder on the GPU when one is visible, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# SpeechBrain speaker encoder used to turn reference audio into an embedding.
# Its files are saved under /tmp/speechbrain/spkrec-xvect-voxceleb.
speaker_model = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-xvect-voxceleb",
    savedir=os.path.join("/tmp", "speechbrain/spkrec-xvect-voxceleb"),
    run_opts={"device": device},
)
|
|
|
def create_speaker_embedding(waveform):
    """Return a normalized speaker embedding for ``waveform`` as a NumPy array.

    The waveform is converted to a tensor, encoded with the module-level
    ``speaker_model``, L2-normalized along dimension 2, stripped of singleton
    dimensions and moved to the CPU.

    Args:
        waveform: Audio samples accepted by ``torch.tensor``
            (``prepare_dataset`` passes ``audio["array"]``).

    Returns:
        The embedding as a NumPy array.
    """
    # Inference only: no autograd graph is needed for the encoder pass.
    with torch.no_grad():
        encoded = speaker_model.encode_batch(torch.tensor(waveform))
        unit_norm = torch.nn.functional.normalize(encoded, dim=2)
        embedding = unit_norm.squeeze().cpu().numpy()
    return embedding
|
def prepare_dataset(example):
    """Add a ``"speaker_embeddings"`` entry to a dataset example.

    Args:
        example: Mapping with an ``"audio"`` entry whose ``"array"`` holds the
            waveform samples.

    Returns:
        The same ``example`` object, mutated in place, with the embedding from
        ``create_speaker_embedding`` stored under ``"speaker_embeddings"``.
    """
    waveform = example["audio"]["array"]
    example["speaker_embeddings"] = create_speaker_embedding(waveform)
    return example
|
|
|
|
|
# Build the speaker embedding that text_to_speech conditions on.
#
# Preferred source: a clip from the Hindi split of Common Voice 17.0, encoded
# with the module-level `speaker_model`. If anything in that pipeline fails
# (download, decoding, too few rows for index 5, ...), fall back to a random
# vector so the app can still start.
try:
    dataset = load_dataset(
        "mozilla-foundation/common_voice_17_0",
        "hi",
        split="validated",
        trust_remote_code=True,
    )
    # Resample every clip to 16 kHz on access.
    dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

    # The speaker encoder (`speaker_model`) is already loaded at module level
    # with the same source/device/savedir, so it is not instantiated again.

    # Keep only the first 1/800th of the split; embeddings are computed for
    # that small slice only.
    part = len(dataset) // 800
    dataset = dataset.select(range(part))
    dataset = dataset.map(prepare_dataset, remove_columns=dataset.column_names)

    # Row 5 of the slice serves as the reference voice. An IndexError here
    # (slice shorter than 6 rows) is handled by the fallback below.
    example = dataset[5]

    # Bind to `speaker_embedding` (singular): that is the name text_to_speech
    # reads. The previous plural name left the success path unused.
    speaker_embedding = torch.tensor(example["speaker_embeddings"]).unsqueeze(0)
except Exception as e:
    # Deliberate best-effort boundary: report the problem and keep going.
    print(f"Error loading dataset: {e}")
    # Random stand-in voice; 512 presumably matches the encoder's embedding
    # size -- confirm against the speaker model if it is ever swapped.
    speaker_embedding = torch.randn(1, 512)
|
|
|
# Devanagari -> Latin substitutions, applied strictly in this order by
# _romanize_hindi. Built once at import time instead of on every request.
#
# NOTE(review): because the single-letter entries run first, the conjunct
# entries ("क्ष", "त्र", "ज्ञ", "श्र") can never match -- their base letters have
# already been replaced by the time they are tried. The same holds for nukta
# consonants when they arrive as base letter + "़". The order is kept as-is on
# purpose: reordering would change the romanization (e.g. "ज्ञ" -> "gya"
# instead of "jny"), and the fine-tuned checkpoint was presumably trained on
# text cleaned in this exact order -- confirm before changing.
_HINDI_REPLACEMENTS = (
    # Independent vowels
    ("अ", "a"),
    ("आ", "aa"),
    ("इ", "i"),
    ("ई", "ee"),
    ("उ", "u"),
    ("ऊ", "oo"),
    ("ऋ", "ri"),
    ("ए", "e"),
    ("ऐ", "ai"),
    ("ऑ", "o"),
    ("ओ", "o"),
    ("औ", "au"),
    # Consonants
    ("क", "k"),
    ("ख", "kh"),
    ("ग", "g"),
    ("घ", "gh"),
    ("ङ", "ng"),
    ("च", "ch"),
    ("छ", "chh"),
    ("ज", "j"),
    ("झ", "jh"),
    ("ञ", "ny"),
    ("ट", "t"),
    ("ठ", "th"),
    ("ड", "d"),
    ("ढ", "dh"),
    ("ण", "n"),
    ("त", "t"),
    ("थ", "th"),
    ("द", "d"),
    ("ध", "dh"),
    ("न", "n"),
    ("प", "p"),
    ("फ", "ph"),
    ("ब", "b"),
    ("भ", "bh"),
    ("म", "m"),
    ("य", "y"),
    ("र", "r"),
    ("ल", "l"),
    ("व", "v"),
    ("श", "sh"),
    ("ष", "sh"),
    ("स", "s"),
    ("ह", "h"),
    # Conjuncts (currently unreachable -- see NOTE above)
    ("क्ष", "ksh"),
    ("त्र", "tr"),
    ("ज्ञ", "gya"),
    ("श्र", "shra"),
    # Nukta, virama, avagraha, anusvara, visarga, chandrabindu
    ("़", ""),
    ("्", ""),
    ("ऽ", ""),
    ("ं", "n"),
    ("ः", "h"),
    ("ँ", "n"),
    # Dependent vowel signs (matras)
    ("ा", "a"),
    ("ि", "i"),
    ("ी", "ee"),
    ("ु", "u"),
    ("ू", "oo"),
    ("े", "e"),
    ("ै", "ai"),
    ("ो", "o"),
    ("ौ", "au"),
    ("ृ", "ri"),
    # Candra vowel signs
    ("ॅ", "e"),
    ("ॉ", "o"),
    # Nukta consonants
    ("क़", "q"),
    ("ख़", "kh"),
    ("ग़", "gh"),
    ("ज़", "z"),
    ("ड़", "r"),
    ("ढ़", "rh"),
    ("फ़", "f"),
    # Danda (sentence terminator)
    ("।", "."),
)

# Sample rate reported to Gradio alongside the generated waveform.
_SAMPLE_RATE = 16000


def _romanize_hindi(text):
    """Collapse whitespace in ``text`` and apply ``_HINDI_REPLACEMENTS`` in order."""
    text = " ".join(text.split())
    for src, dst in _HINDI_REPLACEMENTS:
        text = text.replace(src, dst)
    return text


def text_to_speech(text):
    """Synthesize speech for Hindi ``text`` with the fine-tuned SpeechT5 model.

    The text is whitespace-normalized and romanized, tokenized with the
    module-level ``processor``, and passed to ``model.generate_speech``
    together with the module-level ``speaker_embedding`` and ``vocoder``.

    Args:
        text: Input text from the Gradio textbox.

    Returns:
        A ``(sample_rate, samples)`` tuple: ``16000`` and the generated
        waveform as a NumPy array, as expected by a Gradio audio output.

    Raises:
        gr.Error: If the input is empty or contains nothing to synthesize
            once cleaned, so the user gets a clear message instead of the
            model being run on an empty sequence.
    """
    romanized = _romanize_hindi(text or "")
    if not romanized:
        raise gr.Error("Please enter some Hindi text to synthesize.")

    inputs = processor(text=romanized, return_tensors="pt")
    speech = model.generate_speech(
        inputs["input_ids"], speaker_embedding, vocoder=vocoder
    )
    return (_SAMPLE_RATE, speech.numpy())
|
|
|
# Gradio front end: one text box in, one audio player out, both wired to
# text_to_speech.
iface = gr.Interface(
    text_to_speech,
    "text",
    "audio",
    title="SpeechT5 finetuned Hindi Text-to-Speech",
    description="Enter Hindi text to convert it into an Audio",
)

# Start the web server; share=True additionally requests a public share link.
iface.launch(share=True)