TheComputerMan committed on
Commit
bb2a4dd
·
1 Parent(s): ebf6967

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +164 -0
app.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import gradio as gr
4
+ import numpy as np
5
+ import torch
6
+
7
+ from InferenceInterfaces.Meta_FastSpeech2 import Meta_FastSpeech2
8
+
9
# Swap whatever gradio release is installed for the pinned one by shelling out to pip.
# The exit status of each command is ignored, exactly as with bare os.system calls.
# NOTE(review): `gradio` is already imported further up in this file, before these
# commands run — confirm the pinned version is really the one the process ends up using.
for _pip_command in (
    "pip uninstall -y gradio",
    "pip install gradio==2.7.5.2",
):
    os.system(_pip_command)
11
+
12
+
13
def float2pcm(sig, dtype='int16'):
    """
    Convert a floating point signal to integer PCM samples.

    Samples are scaled by ``2 ** (bits - 1)`` of the target type, shifted by
    an offset (zero for signed targets, mid-range for unsigned ones), clipped
    to the representable range of the target type and cast to it.

    Adapted from
    https://gist.github.com/HudsonHuang/fbdf8e9af7993fe2a91620d3fb86a182

    Args:
        sig: array-like of floating point samples.
        dtype: desired integer dtype (anything ``np.dtype`` accepts).

    Returns:
        Array of the same shape as ``sig`` with the requested integer dtype.

    Raises:
        TypeError: if ``sig`` is not a float array or ``dtype`` is not an
            integer type.
    """
    sig = np.asarray(sig)
    if sig.dtype.kind != 'f':
        raise TypeError("'sig' must be a float array")
    dtype = np.dtype(dtype)
    if dtype.kind not in 'iu':
        raise TypeError("'dtype' must be an integer type")

    info = np.iinfo(dtype)
    abs_max = 2 ** (info.bits - 1)
    offset = info.min + abs_max

    # Work in at least double precision: in float32 the upper clip bound of a
    # 32-bit target rounds up to 2**31 and the integer cast then overflows.
    work = sig.astype(np.result_type(sig.dtype, np.float64), copy=False)

    # For 64-bit targets even a double cannot hold ``info.max`` exactly; fall
    # back to the largest double that does not exceed it so the cast stays
    # inside the integer range.
    upper = float(info.max)
    if int(upper) > info.max:
        upper = float(np.nextafter(upper, -np.inf))

    scaled = work * float(abs_max) + float(offset)
    return scaled.clip(float(info.min), upper).astype(dtype)
27
+
28
+
29
class TTS_Interface:
    """Stateful front end around a ``Meta_FastSpeech2`` model.

    The instance remembers the text language, accent and speaker that were
    configured last, so ``read`` only calls the model's setters when one of
    the selections differs from the previous request.
    """

    # Phonemized prompts longer than this are swapped for a canned notice.
    _MAX_PHONES = 1800
    # First element of the tuple returned by ``read``.
    # NOTE(review): presumably the sample rate of the synthesized audio — confirm.
    _OUTPUT_RATE = 48000

    # Replacement prompt used when the input is too long, keyed by text language.
    _TOO_LONG_NOTICES = {
        "English": "Your input was too long. Please try either a shorter text or split it into several parts.",
        "German": "Deine Eingabe war zu lang. Bitte versuche es entweder mit einem kürzeren Text oder teile ihn in mehrere Teile auf.",
        "Greek": "Η εισήγησή σας ήταν πολύ μεγάλη. Παρακαλώ δοκιμάστε είτε ένα μικρότερο κείμενο είτε χωρίστε το σε διάφορα μέρη.",
        "Spanish": "Su entrada es demasiado larga. Por favor, intente un texto más corto o divídalo en varias partes.",
        "Finnish": "Vastauksesi oli liian pitkä. Kokeile joko lyhyempää tekstiä tai jaa se useampaan osaan.",
        "Russian": "Ваш текст слишком длинный. Пожалуйста, попробуйте либо сократить текст, либо разделить его на несколько частей.",
        "Hungarian": "Túl hosszú volt a bevitele. Kérjük, próbáljon meg rövidebb szöveget írni, vagy ossza több részre.",
        "Dutch": "Uw input was te lang. Probeer een kortere tekst of splits het in verschillende delen.",
        "French": "Votre saisie était trop longue. Veuillez essayer un texte plus court ou le diviser en plusieurs parties.",
        "Polish": "Twój wpis był zbyt długi. Spróbuj skrócić tekst lub podzielić go na kilka części.",
        "Portuguese": "O seu contributo foi demasiado longo. Por favor, tente um texto mais curto ou divida-o em várias partes.",
        "Italian": "Il tuo input era troppo lungo. Per favore, prova un testo più corto o dividilo in più parti.",
    }

    def __init__(self):
        """Create the model on GPU when available and load the default speaker."""
        # Language name -> language code handed to the model's setters.
        self.language_id_lookup = {
            "English": "en",
            "German": "de",
            "Greek": "el",
            "Spanish": "es",
            "Finnish": "fi",
            "Russian": "ru",
            "Hungarian": "hu",
            "Dutch": "nl",
            "French": "fr",
            "Polish": "pl",
            "Portuguese": "pt",
            "Italian": "it",
        }
        # Speaker label -> reference audio used for the utterance embedding.
        self.speaker_path_lookup = {
            "English Speaker's Voice": "reference_audios/english.wav",
            "German Speaker's Voice": "reference_audios/german.wav",
            "Greek Speaker's Voice": "reference_audios/greek.wav",
            "Spanish Speaker's Voice": "reference_audios/spanish.wav",
            "Finnish Speaker's Voice": "reference_audios/finnish.wav",
            "Russian Speaker's Voice": "reference_audios/russian.wav",
            "Hungarian Speaker's Voice": "reference_audios/hungarian.wav",
            "Dutch Speaker's Voice": "reference_audios/dutch.wav",
            "French Speaker's Voice": "reference_audios/french.wav",
            "Polish Speaker's Voice": "reference_audios/polish.flac",
            "Portuguese Speaker's Voice": "reference_audios/portuguese.wav",
            "Italian Speaker's Voice": "reference_audios/italian.flac",
        }

        # Selections the model is considered to be configured for right now.
        self.current_speaker = "English Speaker's Voice"
        self.current_language = "English"
        self.current_accent = "English"

        if torch.cuda.is_available():
            self.device = "cuda"
        else:
            self.device = "cpu"
        self.model = Meta_FastSpeech2(device=self.device)

        default_reference = self.speaker_path_lookup[self.current_speaker]
        self.model.set_utterance_embedding(default_reference)

    def _apply_selection(self, language, accent, speaker):
        """Push changed language / accent / speaker choices to the model."""
        if language != self.current_language:
            self.model.set_phonemizer_language(self.language_id_lookup[language])
            self.current_language = language
        if accent != self.current_accent:
            self.model.set_accent_language(self.language_id_lookup[accent])
            self.current_accent = accent
        if speaker != self.current_speaker:
            self.model.set_utterance_embedding(self.speaker_path_lookup[speaker])
            self.current_speaker = speaker

    def read(self, prompt, language, accent, speaker):
        """Synthesize ``prompt`` and return it as integer PCM audio.

        Args:
            prompt: text to be spoken.
            language: label whose first whitespace-separated word is a key of
                ``language_id_lookup`` (the rest of the label is ignored).
            accent: label treated the same way as ``language``.
            speaker: key of ``speaker_path_lookup``.

        Returns:
            Tuple ``(48000, samples)`` where ``samples`` is the model output
            converted by ``float2pcm``.
        """
        # Only the leading word of each label names the language.
        language = language.split()[0]
        accent = accent.split()[0]
        self._apply_selection(language, accent, speaker)

        phones = self.model.text2phone.get_phone_string(prompt)
        if len(phones) > self._MAX_PHONES:
            # Too long: speak a notice in the text language instead. A language
            # without a notice keeps the original prompt.
            prompt = self._TOO_LONG_NOTICES.get(language, prompt)
            phones = self.model.text2phone.get_phone_string(prompt)

        wav = self.model(phones)
        samples = float2pcm(wav.cpu().numpy())
        return self._OUTPUT_RATE, samples
111
+
112
+
113
# Language names in the order every dropdown lists them.
_LANGUAGE_NAMES = (
    "English",
    "German",
    "Greek",
    "Spanish",
    "Finnish",
    "Russian",
    "Hungarian",
    "Dutch",
    "French",
    "Polish",
    "Portuguese",
    "Italian",
)


def _language_dropdown(suffix, label):
    """Build a dropdown with one '<language> <suffix>' entry per language.

    The first entry (the English one) is preselected.
    """
    options = [name + " " + suffix for name in _LANGUAGE_NAMES]
    return gr.inputs.Dropdown(options, type="value", default=options[0], label=label)


meta_model = TTS_Interface()

# HTML shown with the demo; adjacent literals are joined into one string.
article = (
    "<p style='text-align: left'>"
    "This is still a work in progress, models will be exchanged for better ones as soon as they are done. "
    "All of those languages are spoken by a single model. "
    "Speakers can be transferred across languages. "
    "More languages will be added soon. "
    "If you just want to listen to some pregenerated audios "
    "<a href='https://multilingualtoucan.github.io/' target='_blank'>click here.</a></p>"
    "<p style='text-align: center'>"
    "<a href='https://github.com/DigitalPhonetics/IMS-Toucan' target='_blank'>"
    "Click here to learn more about the IMS Toucan Speech Synthesis Toolkit</a></p>"
)

_text_input = gr.inputs.Textbox(
    lines=2,
    placeholder="write what you want the synthesis to read here... \n(to prevent out of memory errors, too long inputs get replaced with a placeholder)",
    label="Text input",
)

# Input order matches the positional parameters of TTS_Interface.read:
# prompt, language, accent, speaker.
iface = gr.Interface(
    fn=meta_model.read,
    inputs=[
        _text_input,
        _language_dropdown("Text", "Select the Language of the Text"),
        _language_dropdown("Accent", "Select the Accent of the Speaker"),
        _language_dropdown("Speaker's Voice", "Select the Voice of the Speaker"),
    ],
    outputs=gr.outputs.Audio(type="numpy", label=None),
    layout="vertical",
    title="IMS Toucan - Multilingual Multispeaker",
    thumbnail="Utility/toucan.png",
    theme="default",
    allow_flagging="never",
    allow_screenshot=False,
    article=article,
)
iface.launch(enable_queue=True)