JackismyShephard
committed on
Commit
•
5ed78b8
1
Parent(s):
1d1e03e
remove speech enhancement
Browse files
- app.py +3 -21
- requirements.txt +1 -2
app.py
CHANGED
@@ -4,12 +4,10 @@ import torch
|
|
4 |
|
5 |
from transformers import pipeline
|
6 |
|
7 |
-
from resemble_enhance.enhancer.inference import denoise, enhance
|
8 |
-
|
9 |
checkpoint_finetuned = "JackismyShephard/speecht5_tts-finetuned-nst-da"
|
10 |
|
11 |
revision = "5af228df418092b681cf31c31e413bdd2b5f9c8c"
|
12 |
-
device =
|
13 |
|
14 |
pipe = pipeline(
|
15 |
"text-to-speech",
|
@@ -35,7 +33,7 @@ target_dtype = np.int16
|
|
35 |
max_range = np.iinfo(target_dtype).max
|
36 |
|
37 |
|
38 |
-
def predict(text, speaker, post_process):
|
39 |
if len(text.strip()) == 0:
|
40 |
return (16000, np.zeros(0))
|
41 |
|
@@ -52,10 +50,7 @@ def predict(text, speaker, post_process):
|
|
52 |
forward_params = {"speaker_embeddings": speaker_embedding}
|
53 |
speech = pipe(text, forward_params=forward_params)
|
54 |
|
55 |
-
if post_process:
|
56 |
-
sr, audio = enhance_audio(speech["audio"], speech["sampling_rate"], device)
|
57 |
-
else:
|
58 |
-
sr, audio = speech["sampling_rate"], speech["audio"]
|
59 |
|
60 |
audio = (audio * max_range).astype(np.int16)
|
61 |
|
@@ -90,17 +85,6 @@ replacements = [
|
|
90 |
("ü", "y"),
|
91 |
]
|
92 |
|
93 |
-
|
94 |
-
def enhance_audio(waveform, sr, device="cuda"):
|
95 |
-
tensor = torch.tensor(waveform).float()
|
96 |
-
denoised, new_sr = denoise(tensor, sr, device)
|
97 |
-
enhanced, new_sr = enhance(
|
98 |
-
denoised, new_sr, device, nfe=64, solver="midpoint", lambd=0.1, tau=0.5
|
99 |
-
)
|
100 |
-
enhanced_cpu = enhanced.cpu().numpy()
|
101 |
-
return new_sr, enhanced_cpu
|
102 |
-
|
103 |
-
|
104 |
title = "Danish Speech Synthesis"
|
105 |
|
106 |
description = (
|
@@ -113,7 +97,6 @@ examples = [
|
|
113 |
[
|
114 |
"I sin oprindelige før-kristne form blev alferne sandsynligvis opfattet som en personificering af det land og den natur, der omgav menneskene, dvs. den opdyrkede jord, gården og de naturressourcer, som hørte dertil. De var guddommelige eller delvis guddommelige væsener, der besad magiske kræfter, som de brugte både til fordel og ulempe for menneskene.",
|
115 |
"F23 (Female, 23, Vestjylland)",
|
116 |
-
True,
|
117 |
],
|
118 |
]
|
119 |
|
@@ -133,7 +116,6 @@ demo = gr.Interface(
|
|
133 |
],
|
134 |
value="F23 (Female, 23, Vestjylland)",
|
135 |
),
|
136 |
-
gr.Checkbox(label="Enhance audio (takes substantially longer)"),
|
137 |
],
|
138 |
outputs=[
|
139 |
gr.Audio(label="Generated Speech", type="numpy"),
|
|
|
4 |
|
5 |
from transformers import pipeline
|
6 |
|
|
|
|
|
7 |
checkpoint_finetuned = "JackismyShephard/speecht5_tts-finetuned-nst-da"
|
8 |
|
9 |
revision = "5af228df418092b681cf31c31e413bdd2b5f9c8c"
|
10 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
11 |
|
12 |
pipe = pipeline(
|
13 |
"text-to-speech",
|
|
|
33 |
max_range = np.iinfo(target_dtype).max
|
34 |
|
35 |
|
36 |
+
def predict(text, speaker):
|
37 |
if len(text.strip()) == 0:
|
38 |
return (16000, np.zeros(0))
|
39 |
|
|
|
50 |
forward_params = {"speaker_embeddings": speaker_embedding}
|
51 |
speech = pipe(text, forward_params=forward_params)
|
52 |
|
53 |
+
sr, audio = speech["sampling_rate"], speech["audio"]
|
|
|
|
|
|
|
54 |
|
55 |
audio = (audio * max_range).astype(np.int16)
|
56 |
|
|
|
85 |
("ü", "y"),
|
86 |
]
|
87 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
88 |
title = "Danish Speech Synthesis"
|
89 |
|
90 |
description = (
|
|
|
97 |
[
|
98 |
"I sin oprindelige før-kristne form blev alferne sandsynligvis opfattet som en personificering af det land og den natur, der omgav menneskene, dvs. den opdyrkede jord, gården og de naturressourcer, som hørte dertil. De var guddommelige eller delvis guddommelige væsener, der besad magiske kræfter, som de brugte både til fordel og ulempe for menneskene.",
|
99 |
"F23 (Female, 23, Vestjylland)",
|
|
|
100 |
],
|
101 |
]
|
102 |
|
|
|
116 |
],
|
117 |
value="F23 (Female, 23, Vestjylland)",
|
118 |
),
|
|
|
119 |
],
|
120 |
outputs=[
|
121 |
gr.Audio(label="Generated Speech", type="numpy"),
|
requirements.txt
CHANGED
@@ -1,4 +1,3 @@
|
|
1 |
torch
|
2 |
transformers
|
3 |
-
sentencepiece
|
4 |
-
resemble-enhance
|
|
|
1 |
torch
|
2 |
transformers
|
3 |
+
sentencepiece
|
|