import gradio as gr import torch import os from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan from datasets import load_dataset, Audio import numpy as np from speechbrain.inference import EncoderClassifier # Load models and processor processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts") model = SpeechT5ForTextToSpeech.from_pretrained("Solo448/SpeechT5-fine-tune-en") vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan") # Load speaker encoder device = "cuda" if torch.cuda.is_available() else "cpu" speaker_model = EncoderClassifier.from_hparams( source="speechbrain/spkrec-xvect-voxceleb", run_opts={"device": device}, savedir=os.path.join("/tmp", "speechbrain/spkrec-xvect-voxceleb") ) # Load a sample from the dataset for speaker embedding try: dataset = load_dataset("Yassmen/TTS_English_Technical_data", split="train") dataset = dataset.cast_column("audio", Audio(sampling_rate=16000)) sample = dataset[0] speaker_embedding = create_speaker_embedding(sample['audio']['array']) except Exception as e: print(f"Error loading dataset: {e}") # Use a random speaker embedding as fallback speaker_embedding = torch.randn(1, 512) def create_speaker_embedding(waveform): with torch.no_grad(): speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform)) speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2) speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy() return speaker_embeddings def text_to_speech(text): # Clean up text replacements = [ ('0', 'zero'), ('1', 'one'), ('2', 'two'), ('3', 'three'), ('4', 'four'), ('5', 'five'), ('6', 'six'), ('7', 'seven'), ('8', 'eight'), ('9', 'nine') ] for src, dst in replacements: text = text.replace(src, dst) inputs = processor(text=text, return_tensors="pt") speech = model.generate_speech(inputs["input_ids"], speaker_embedding, vocoder=vocoder) return (16000, speech.numpy()) iface = gr.Interface( fn=text_to_speech, inputs="text", outputs="audio", title="Technical Text-to-Speech", description="Enter technical text to convert to speech. The model has been fine-tuned on technical data." ) iface.launch()