Spaces:
Running
on
T4
Running
on
T4
Merge remote-tracking branch 'origin/main'
Browse files- InferenceInterfaces/ControllableInterface.py +3 -1
- Models/Embedding/embedding_gan.pt +3 -0
- Models/Embedding/init +0 -0
- Models/ToucanTTS_Meta/best.pt +3 -0
- Models/ToucanTTS_Meta/init +0 -0
- Models/Vocoder/best.pt +3 -0
- Models/Vocoder/init +0 -0
- Models/__init__.py +0 -0
- Models/audioseal/generator.pth +3 -0
- Models/audioseal/init +0 -0
- app.py +11 -5
InferenceInterfaces/ControllableInterface.py
CHANGED
@@ -92,8 +92,10 @@ class ControllableInterface:
|
|
92 |
if self.current_accent != "eng":
|
93 |
self.model.set_accent_language("eng")
|
94 |
self.current_accent = "eng"
|
95 |
-
|
96 |
print(prompt)
|
|
|
|
|
97 |
wav, sr, fig = self.model(prompt,
|
98 |
input_is_phones=False,
|
99 |
duration_scaling_factor=duration_scaling_factor,
|
|
|
92 |
if self.current_accent != "eng":
|
93 |
self.model.set_accent_language("eng")
|
94 |
self.current_accent = "eng"
|
95 |
+
print("\n\n")
|
96 |
print(prompt)
|
97 |
+
print(language)
|
98 |
+
print("\n\n")
|
99 |
wav, sr, fig = self.model(prompt,
|
100 |
input_is_phones=False,
|
101 |
duration_scaling_factor=duration_scaling_factor,
|
Models/Embedding/embedding_gan.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d8e9cdad432cc4ee0301754cce730d000c37b26a4cf3baab21a5c3447b725fc9
|
3 |
+
size 1261865
|
Models/Embedding/init
ADDED
File without changes
|
Models/ToucanTTS_Meta/best.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a875446cd80a7b27f8a7a65b5fd1df6bd2f11c89409ab2e4342d6b7e6dff1755
|
3 |
+
size 366674077
|
Models/ToucanTTS_Meta/init
ADDED
File without changes
|
Models/Vocoder/best.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1e3e4b03bd4ec1665706bcb78c21f47386439774cc53010e1f7c8d3b82d6006d
|
3 |
+
size 56113099
|
Models/Vocoder/init
ADDED
File without changes
|
Models/__init__.py
ADDED
File without changes
|
Models/audioseal/generator.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7a845b5fbe9364a63a3909d8ab3fe064d13a76ae4c2e983573e08c69b7b51748
|
3 |
+
size 58805980
|
Models/audioseal/init
ADDED
File without changes
|
app.py
CHANGED
@@ -2,8 +2,8 @@ import os
|
|
2 |
|
3 |
from run_model_downloader import download_models
|
4 |
|
5 |
-
if not os.path.exists("Models/ToucanTTS_Meta/best.pt"):
|
6 |
-
download_models()
|
7 |
import gradio as gr
|
8 |
from Preprocessing.multilinguality.SimilaritySolver import load_json_from_path
|
9 |
from Utility.utils import float2pcm
|
@@ -46,7 +46,7 @@ class ControllableInterface(torch.nn.Module):
|
|
46 |
loudness_in_db
|
47 |
):
|
48 |
if self.current_language != language:
|
49 |
-
self.model
|
50 |
self.current_language = language
|
51 |
|
52 |
self.wgan.set_latent(voice_seed)
|
@@ -59,11 +59,17 @@ class ControllableInterface(torch.nn.Module):
|
|
59 |
embedding = self.wgan.modify_embed(controllability_vector)
|
60 |
self.model.set_utterance_embedding(embedding=embedding)
|
61 |
|
|
|
|
|
62 |
phones = self.model.text2phone.get_phone_string(prompt)
|
63 |
if len(phones) > 1800:
|
64 |
-
|
65 |
|
|
|
66 |
print(prompt)
|
|
|
|
|
|
|
67 |
wav, sr, fig = self.model(prompt,
|
68 |
input_is_phones=False,
|
69 |
duration_scaling_factor=duration_scaling_factor,
|
@@ -114,7 +120,7 @@ def read(prompt,
|
|
114 |
iface = gr.Interface(fn=read,
|
115 |
inputs=[gr.Textbox(lines=2,
|
116 |
placeholder="write what you want the synthesis to read here...",
|
117 |
-
value="
|
118 |
label="Text input"),
|
119 |
gr.Dropdown(text_selection,
|
120 |
type="value",
|
|
|
2 |
|
3 |
from run_model_downloader import download_models
|
4 |
|
5 |
+
#if not os.path.exists("Models/ToucanTTS_Meta/best.pt"):
|
6 |
+
# download_models()
|
7 |
import gradio as gr
|
8 |
from Preprocessing.multilinguality.SimilaritySolver import load_json_from_path
|
9 |
from Utility.utils import float2pcm
|
|
|
46 |
loudness_in_db
|
47 |
):
|
48 |
if self.current_language != language:
|
49 |
+
self.model.set_language(language)
|
50 |
self.current_language = language
|
51 |
|
52 |
self.wgan.set_latent(voice_seed)
|
|
|
59 |
embedding = self.wgan.modify_embed(controllability_vector)
|
60 |
self.model.set_utterance_embedding(embedding=embedding)
|
61 |
|
62 |
+
if len(prompt) > 1800:
|
63 |
+
raise AssertionError("The input is too long!")
|
64 |
phones = self.model.text2phone.get_phone_string(prompt)
|
65 |
if len(phones) > 1800:
|
66 |
+
raise AssertionError("The input is too long!")
|
67 |
|
68 |
+
print("\n\n")
|
69 |
print(prompt)
|
70 |
+
print(language)
|
71 |
+
print("\n\n")
|
72 |
+
|
73 |
wav, sr, fig = self.model(prompt,
|
74 |
input_is_phones=False,
|
75 |
duration_scaling_factor=duration_scaling_factor,
|
|
|
120 |
iface = gr.Interface(fn=read,
|
121 |
inputs=[gr.Textbox(lines=2,
|
122 |
placeholder="write what you want the synthesis to read here...",
|
123 |
+
value="What I cannot create, I do not understand.",
|
124 |
label="Text input"),
|
125 |
gr.Dropdown(text_selection,
|
126 |
type="value",
|