Solo448 committed on
Commit
ec45def
1 Parent(s): dc99051

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +58 -2
app.py CHANGED
@@ -1,5 +1,61 @@
1
  import gradio as gr
2
- import spaces
 
 
 
 
 
3
 
 
 
 
 
4
 
5
- gr.Interface.load("models/microsoft/speecht5_tts").launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
+ import torch
3
+ import os
4
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
5
+ from datasets import load_dataset, Audio
6
+ import numpy as np
7
+ from speechbrain.inference import EncoderClassifier
8
 
9
# Pretrained pieces of the TTS pipeline: the text processor, the fine-tuned
# acoustic model, and the vocoder handed to generate_speech further down.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("Solo448/SpeechT5-fine-tune-en")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Speaker encoder used to derive a speaker embedding from reference audio.
# It runs on the GPU when one is visible to torch, otherwise on the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

speaker_model = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-xvect-voxceleb",
    savedir=os.path.join("/tmp", "speechbrain/spkrec-xvect-voxceleb"),
    run_opts={"device": device},
)
21
+
22
def create_speaker_embedding(waveform):
    """Encode a raw waveform into an L2-normalised speaker embedding.

    Args:
        waveform: Audio samples accepted by ``torch.tensor`` (the caller below
            passes the ``array`` field of a dataset audio sample).

    Returns:
        A NumPy array holding the squeezed, normalised output of
        ``speaker_model.encode_batch``.
    """
    with torch.no_grad():
        speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform))
        speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
        speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy()
    return speaker_embeddings


# Load a sample from the dataset for speaker embedding.
# NOTE: this must run *after* create_speaker_embedding is defined; calling it
# earlier raises NameError, which the broad handler below would swallow and
# silently replace with the random fallback on every start-up.
try:
    dataset = load_dataset("Yassmen/TTS_English_Technical_data", split="train")
    dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
    sample = dataset[0]
    # The helper returns a squeezed NumPy vector; restore the leading batch
    # axis and convert to a tensor so both branches produce the same kind of
    # object (the fallback is a (1, 512) torch tensor).
    # Assumes the encoder yields a 512-dim vector — TODO confirm.
    speaker_embedding = torch.tensor(
        create_speaker_embedding(sample["audio"]["array"])
    ).unsqueeze(0)
except Exception as e:
    # Deliberate best-effort: the app should still start without the dataset.
    print(f"Error loading dataset: {e}")
    # Use a random speaker embedding as fallback
    speaker_embedding = torch.randn(1, 512)
39
+
40
# Spoken form of each ASCII digit. The surrounding spaces keep adjacent digits
# and neighbouring letters as separate words ("10" -> "one zero", not "onezero").
_DIGIT_WORDS = str.maketrans({
    "0": " zero ",
    "1": " one ",
    "2": " two ",
    "3": " three ",
    "4": " four ",
    "5": " five ",
    "6": " six ",
    "7": " seven ",
    "8": " eight ",
    "9": " nine ",
})


def _spell_out_digits(text):
    """Replace every ASCII digit in *text* with its English word."""
    # split/join collapses the padding added by the translation table (and any
    # other whitespace runs) down to single spaces.
    return " ".join(text.translate(_DIGIT_WORDS).split())


def text_to_speech(text):
    """Synthesise speech for *text* with the module-level model and vocoder.

    Digits are spelled out digit by digit before the text is tokenised.

    Args:
        text: The input string to speak.

    Returns:
        A ``(sample_rate, samples)`` tuple: the integer 16000 and the generated
        waveform as a NumPy array.
    """
    inputs = processor(text=_spell_out_digits(text), return_tensors="pt")
    speech = model.generate_speech(
        inputs["input_ids"], speaker_embedding, vocoder=vocoder
    )
    # 16000 is the rate reported alongside the samples; presumably it matches
    # the vocoder's output rate — confirm against the vocoder config.
    return (16000, speech.numpy())
52
+
53
# Gradio front end: one text box in, one audio player out, backed by
# text_to_speech.
iface = gr.Interface(
    title="Technical Text-to-Speech",
    description="Enter technical text to convert to speech. The model has been fine-tuned on technical data.",
    fn=text_to_speech,
    inputs="text",
    outputs="audio",
)

# Start serving the interface.
iface.launch()