JackismyShephard committed on
Commit
5ed78b8
1 Parent(s): 1d1e03e

remove speech enhancement

Browse files
Files changed (2) hide show
  1. app.py +3 -21
  2. requirements.txt +1 -2
app.py CHANGED
@@ -4,12 +4,10 @@ import torch
4
 
5
  from transformers import pipeline
6
 
7
- from resemble_enhance.enhancer.inference import denoise, enhance
8
-
9
  checkpoint_finetuned = "JackismyShephard/speecht5_tts-finetuned-nst-da"
10
 
11
  revision = "5af228df418092b681cf31c31e413bdd2b5f9c8c"
12
- device = 0 if torch.cuda.is_available() else "cpu"
13
 
14
  pipe = pipeline(
15
  "text-to-speech",
@@ -35,7 +33,7 @@ target_dtype = np.int16
35
  max_range = np.iinfo(target_dtype).max
36
 
37
 
38
- def predict(text, speaker, post_process):
39
  if len(text.strip()) == 0:
40
  return (16000, np.zeros(0))
41
 
@@ -52,10 +50,7 @@ def predict(text, speaker, post_process):
52
  forward_params = {"speaker_embeddings": speaker_embedding}
53
  speech = pipe(text, forward_params=forward_params)
54
 
55
- if post_process:
56
- sr, audio = enhance_audio(speech["audio"], speech["sampling_rate"], device)
57
- else:
58
- sr, audio = speech["sampling_rate"], speech["audio"]
59
 
60
  audio = (audio * max_range).astype(np.int16)
61
 
@@ -90,17 +85,6 @@ replacements = [
90
  ("ü", "y"),
91
  ]
92
 
93
-
94
- def enhance_audio(waveform, sr, device="cuda"):
95
- tensor = torch.tensor(waveform).float()
96
- denoised, new_sr = denoise(tensor, sr, device)
97
- enhanced, new_sr = enhance(
98
- denoised, new_sr, device, nfe=64, solver="midpoint", lambd=0.1, tau=0.5
99
- )
100
- enhanced_cpu = enhanced.cpu().numpy()
101
- return new_sr, enhanced_cpu
102
-
103
-
104
  title = "Danish Speech Synthesis"
105
 
106
  description = (
@@ -113,7 +97,6 @@ examples = [
113
  [
114
  "I sin oprindelige før-kristne form blev alferne sandsynligvis opfattet som en personificering af det land og den natur, der omgav menneskene, dvs. den opdyrkede jord, gården og de naturressourcer, som hørte dertil. De var guddommelige eller delvis guddommelige væsener, der besad magiske kræfter, som de brugte både til fordel og ulempe for menneskene.",
115
  "F23 (Female, 23, Vestjylland)",
116
- True,
117
  ],
118
  ]
119
 
@@ -133,7 +116,6 @@ demo = gr.Interface(
133
  ],
134
  value="F23 (Female, 23, Vestjylland)",
135
  ),
136
- gr.Checkbox(label="Enhance audio (takes substantially longer)"),
137
  ],
138
  outputs=[
139
  gr.Audio(label="Generated Speech", type="numpy"),
 
4
 
5
  from transformers import pipeline
6
 
 
 
7
  checkpoint_finetuned = "JackismyShephard/speecht5_tts-finetuned-nst-da"
8
 
9
  revision = "5af228df418092b681cf31c31e413bdd2b5f9c8c"
10
+ device = "cuda" if torch.cuda.is_available() else "cpu"
11
 
12
  pipe = pipeline(
13
  "text-to-speech",
 
33
  max_range = np.iinfo(target_dtype).max
34
 
35
 
36
+ def predict(text, speaker):
37
  if len(text.strip()) == 0:
38
  return (16000, np.zeros(0))
39
 
 
50
  forward_params = {"speaker_embeddings": speaker_embedding}
51
  speech = pipe(text, forward_params=forward_params)
52
 
53
+ sr, audio = speech["sampling_rate"], speech["audio"]
 
 
 
54
 
55
  audio = (audio * max_range).astype(np.int16)
56
 
 
85
  ("ü", "y"),
86
  ]
87
 
 
 
 
 
 
 
 
 
 
 
 
88
  title = "Danish Speech Synthesis"
89
 
90
  description = (
 
97
  [
98
  "I sin oprindelige før-kristne form blev alferne sandsynligvis opfattet som en personificering af det land og den natur, der omgav menneskene, dvs. den opdyrkede jord, gården og de naturressourcer, som hørte dertil. De var guddommelige eller delvis guddommelige væsener, der besad magiske kræfter, som de brugte både til fordel og ulempe for menneskene.",
99
  "F23 (Female, 23, Vestjylland)",
 
100
  ],
101
  ]
102
 
 
116
  ],
117
  value="F23 (Female, 23, Vestjylland)",
118
  ),
 
119
  ],
120
  outputs=[
121
  gr.Audio(label="Generated Speech", type="numpy"),
requirements.txt CHANGED
@@ -1,4 +1,3 @@
1
  torch
2
  transformers
3
- sentencepiece
4
- resemble-enhance
 
1
  torch
2
  transformers
3
+ sentencepiece