aswathyraj committed on
Commit
edfdd67
·
verified ·
1 Parent(s): 89693d5

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +62 -0
  2. requirements.txt +64 -0
app.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+
3
+ import sys
4
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
5
+ from datasets import load_dataset
6
+ import torch
7
+ import soundfile as sf
8
+ import gradio as gr
9
+ import os
10
+
11
# Row indices into the "Matthijs/cmu-arctic-xvectors" validation split used
# for each selectable voice.
_SPEAKER_INDICES = {"male": 5004, "female": 7306}

# Sample rate (Hz) the generated waveform is written with.
_SAMPLE_RATE = 16000

# Lazily populated cache so the pretrained components and the x-vector
# dataset are loaded once per process instead of once per request.
_TTS_CACHE = {}


def _load_tts_components():
    """Return the cached SpeechT5 processor, model, vocoder and x-vector dataset."""
    if not _TTS_CACHE:
        # Build everything first and publish in one step, so a failed load
        # never leaves a half-filled cache behind.
        components = {
            "processor": SpeechT5Processor.from_pretrained("microsoft/speecht5_tts"),
            "model": SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts"),
            "vocoder": SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan"),
            "embeddings": load_dataset("Matthijs/cmu-arctic-xvectors", split="validation"),
        }
        _TTS_CACHE.update(components)
    return _TTS_CACHE


def generate_speech(text, person):
    """Synthesize ``text`` with SpeechT5 and return the path of the WAV file.

    Args:
        text: The text to convert to speech.
        person: Which speaker embedding to use, either ``"male"`` or
            ``"female"``.

    Returns:
        Path (``str``) of a 16 kHz, 16-bit PCM WAV file holding the speech.
        Each call writes to its own temporary file, so overlapping requests
        cannot overwrite one another's output.

    Raises:
        ValueError: If ``person`` is neither ``"male"`` nor ``"female"``.
    """
    # Local import: only this function needs it, and it keeps the file's
    # import section untouched.
    import tempfile

    # Validate before any model or dataset work, so bad input fails fast.
    try:
        speaker_index = _SPEAKER_INDICES[person]
    except (KeyError, TypeError):
        # TypeError covers unhashable values such as lists.
        raise ValueError("Invalid value for 'person'. Use 'male' or 'female'.") from None

    components = _load_tts_components()

    # Tokenize the text into model inputs.
    inputs = components["processor"](text=text, return_tensors="pt")

    # Look up the x-vector for the chosen speaker and add a leading batch
    # dimension.
    speaker_embeddings = torch.tensor(
        components["embeddings"][speaker_index]["xvector"]
    ).unsqueeze(0)

    # Generate the waveform; the vocoder is passed to the model call.
    speech = components["model"].generate_speech(
        inputs["input_ids"], speaker_embeddings, vocoder=components["vocoder"]
    )

    # Reserve a unique file name, close the handle, then let soundfile write
    # to that path.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        output_file = tmp.name
    sf.write(output_file, speech.numpy(), samplerate=_SAMPLE_RATE, format='wav', subtype='PCM_16')

    return output_file
47
+
48
+
49
# Initial contents of the text box (empty, so the placeholder is shown).
default_text = ""

# Gradio UI: a text box and a speaker selector feed generate_speech, whose
# returned WAV file path is rendered by the audio output component.
demo = gr.Interface(
    fn=generate_speech,
    inputs=[
        gr.Textbox(value=default_text, label="Input text", placeholder="Type something here.."),
        gr.Radio(choices=['male', 'female'], label="Target Speaker", value="female"),
    ],
    outputs=gr.Audio(label=""),
    title="Text to speech",
)

if __name__ == "__main__":
    # share=True asks Gradio to create a public share link in addition to
    # serving the app locally.
    demo.launch(share=True)
requirements.txt ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiohttp==3.9.2
2
+ aiosignal==1.3.1
3
+ async-timeout==4.0.3
4
+ attrs==23.2.0
5
+ blinker==1.7.0
6
+ certifi==2023.11.17
7
+ cffi==1.16.0
8
+ charset-normalizer==3.3.2
9
+ click==8.1.7
10
+ datasets==2.16.1
11
+ dill==0.3.7
12
+ filelock==3.13.1
13
+ frozenlist==1.4.1
14
+ fsspec==2023.10.0
15
+ huggingface-hub==0.20.3
16
+ idna==3.6
17
+ importlib-metadata==7.0.1
18
+ itsdangerous==2.1.2
19
+ Jinja2==3.1.3
20
+ MarkupSafe==2.1.4
21
+ mpmath==1.3.0
22
+ multidict==6.0.4
23
+ multiprocess==0.70.15
24
+ networkx==3.2.1
25
+ numpy==1.26.3
26
+ nvidia-cublas-cu12==12.1.3.1
27
+ nvidia-cuda-cupti-cu12==12.1.105
28
+ nvidia-cuda-nvrtc-cu12==12.1.105
29
+ nvidia-cuda-runtime-cu12==12.1.105
30
+ nvidia-cudnn-cu12==8.9.2.26
31
+ nvidia-cufft-cu12==11.0.2.54
32
+ nvidia-curand-cu12==10.3.2.106
33
+ nvidia-cusolver-cu12==11.4.5.107
34
+ nvidia-cusparse-cu12==12.1.0.106
35
+ nvidia-nccl-cu12==2.18.1
36
+ nvidia-nvjitlink-cu12==12.3.101
37
+ nvidia-nvtx-cu12==12.1.105
38
+ packaging==23.2
39
+ pandas==2.2.0
40
+ pyarrow==15.0.0
41
+ pyarrow-hotfix==0.6
42
+ pycparser==2.21
43
+ python-dateutil==2.8.2
44
+ pytz==2023.4
45
+ PyYAML==6.0.1
46
+ regex==2023.12.25
47
+ requests==2.31.0
48
+ safetensors==0.4.2
49
+ sentencepiece==0.1.99
50
+ six==1.16.0
51
+ soundfile==0.12.1
52
+ sympy==1.12
53
+ tokenizers==0.15.1
54
+ torch==2.1.2
55
+ tqdm==4.66.1
56
+ transformers==4.37.2
57
+ triton==2.1.0
58
+ typing_extensions==4.9.0
59
+ tzdata==2023.4
60
+ urllib3==2.1.0
61
+ Werkzeug==3.0.1
62
+ xxhash==3.4.1
63
+ yarl==1.9.4
64
+ zipp==3.17.0