csukuangfj committed
Commit 48092b3
Parent: cbd589e

small fixes

Files changed (1)
  1. app.py +23 -70
app.py CHANGED
@@ -21,6 +21,7 @@
 
 import logging
 import os
+from pydub import AudioSegment
 import tempfile
 import time
 from datetime import datetime
@@ -90,7 +91,7 @@ def process_microphone(in_filename: str):
 def process(in_filename: str):
     logging.info(f"in_filename: {in_filename}")
 
-    waveform = load_audio(waveform)
+    waveform = load_audio(in_filename)
     duration = waveform.shape[0] / 44100  # in seconds
 
     vocals = load_model("vocals.pt")
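The fix above passes the input file path to `load_audio` instead of the previously undefined `waveform` variable. The diff does not show `load_audio` itself; as a hypothetical sketch of the contract the call sites imply (a float tensor of shape `(num_samples, num_channels)` at 44.1 kHz, so that `duration == waveform.shape[0] / 44100`), it could be built on torchaudio:

```python
import torch
import torchaudio


def load_audio(filename: str) -> torch.Tensor:
    """Hypothetical helper, not part of this diff: decode an audio file,
    resample it to 44.1 kHz, and return a (num_samples, num_channels)
    float tensor so that duration == waveform.shape[0] / 44100."""
    wave, sample_rate = torchaudio.load(filename)  # (channels, samples)
    if sample_rate != 44100:
        wave = torchaudio.functional.resample(wave, sample_rate, 44100)
    return wave.t()  # (samples, channels)
```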
@@ -107,49 +108,40 @@ def process(in_filename: str):
     date_time = now.strftime("%Y-%m-%d %H:%M:%S.%f")
     end = time.time()
 
-    metadata = torchaudio.info(filename)
-    duration = metadata.num_frames / sample_rate
+    vocals_wave = (vocals_wave.t() * 32768).to(torch.int16)
+    accompaniment_wave = (accompaniment_wave.t() * 32768).to(torch.int16)
+
+    vocals_sound = AudioSegment(
+        data=vocals_wave.numpy().tobytes(), sample_width=2, frame_rate=44100, channels=2
+    )
+    vocals_filename = in_filename + "-vocals.mp3"
+    vocals_sound.export(vocals_filename, format="mp3", bitrate="128k")
+
+    accompaniment_sound = AudioSegment(
+        data=accompaniment_wave.numpy().tobytes(),
+        sample_width=2,
+        frame_rate=44100,
+        channels=2,
+    )
+    accompaniment_filename = in_filename + "-accompaniment.mp3"
+    accompaniment_sound.export(accompaniment_filename, format="mp3", bitrate="128k")
+
     rtf = (end - start) / duration
 
     logging.info(f"Finished at {date_time} s. Elapsed: {end - start: .3f} s")
 
     info = f"""
-    Wave duration : {duration: .3f} s <br/>
+    Input duration : {duration: .3f} s <br/>
     Processing time: {end - start: .3f} s <br/>
     RTF: {end - start: .3f}/{duration: .3f} = {rtf:.3f} <br/>
     """
-    if rtf > 1:
-        info += (
-            "<br/>We are loading the model for the first run. "
-            "Please run again to measure the real RTF.<br/>"
-        )
-
     logging.info(info)
     logging.info(f"\nrepo_id: {repo_id}\nhyp: {text}")
 
-    return text, build_html_output(info)
+    return vocals_filename, accompaniment_filename, build_html_output(info)
 
 
-title = "# Automatic Speech Recognition with Next-gen Kaldi"
-description = """
-This space shows how to do automatic speech recognition with Next-gen Kaldi.
-
-Please visit
-<https://huggingface.co/spaces/k2-fsa/streaming-automatic-speech-recognition>
-for streaming speech recognition with **Next-gen Kaldi**.
-
-It is running on CPU within a docker container provided by Hugging Face.
-
-See more information by visiting the following links:
-
-- <https://github.com/k2-fsa/icefall>
-- <https://github.com/k2-fsa/sherpa>
-- <https://github.com/k2-fsa/k2>
-- <https://github.com/lhotse-speech/lhotse>
-
-If you want to deploy it locally, please see
-<https://k2-fsa.github.io/sherpa/>
-"""
+title = "# Music source separation with Spleeter in PyTorch"
 
 # css style is copied from
 # https://huggingface.co/spaces/alphacep/asr/blob/main/app.py#L113
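The added block converts each separated stem from a float tensor to interleaved 16-bit PCM and writes it out as a 128 kbit/s MP3 with pydub. A minimal standalone sketch of that conversion, assuming `wave` is a stereo float32 tensor of shape `(2, num_samples)` in `[-1, 1]` at 44.1 kHz (the helper name and the clamp are illustrative additions, not code from this commit):

```python
import torch
from pydub import AudioSegment


def export_mp3(wave: torch.Tensor, out_path: str) -> None:
    """Illustrative helper (not part of the diff): write a stereo float
    tensor of shape (2, num_samples) in [-1, 1] at 44.1 kHz as MP3."""
    # Transpose to (num_samples, 2) so the two channels are interleaved in
    # memory, scale to the int16 range, and clamp to avoid overflow at +1.0.
    pcm = (wave.t() * 32768).clamp(-32768, 32767).to(torch.int16)
    sound = AudioSegment(
        data=pcm.numpy().tobytes(),
        sample_width=2,  # 2 bytes per sample -> 16-bit PCM
        frame_rate=44100,
        channels=2,
    )
    sound.export(out_path, format="mp3", bitrate="128k")
```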
@@ -161,50 +153,11 @@ css = """
 """
 
 
-def update_model_dropdown(language: str):
-    if language in language_to_models:
-        choices = language_to_models[language]
-        return gr.Dropdown.update(choices=choices, value=choices[0])
-
-    raise ValueError(f"Unsupported language: {language}")
-
-
 demo = gr.Blocks(css=css)
 
 
 with demo:
     gr.Markdown(title)
-    language_choices = list(language_to_models.keys())
-
-    language_radio = gr.Radio(
-        label="Language",
-        choices=language_choices,
-        value=language_choices[0],
-    )
-    model_dropdown = gr.Dropdown(
-        choices=language_to_models[language_choices[0]],
-        label="Select a model",
-        value=language_to_models[language_choices[0]][0],
-    )
-
-    language_radio.change(
-        update_model_dropdown,
-        inputs=language_radio,
-        outputs=model_dropdown,
-    )
-
-    decoding_method_radio = gr.Radio(
-        label="Decoding method",
-        choices=["greedy_search", "modified_beam_search"],
-        value="greedy_search",
-    )
-
-    num_active_paths_slider = gr.Slider(
-        minimum=1,
-        value=4,
-        step=1,
-        label="Number of active paths for modified_beam_search",
-    )
 
     with gr.Tabs():
         with gr.TabItem("Upload from disk"):
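With the language, model, and decoding-method controls removed, the remaining UI is just the tabbed layout. This diff does not show how the "Upload from disk" tab connects the uploaded file to `process`; one possible wiring, where every component and button name is an assumption rather than code from this commit, might look like:

```python
import gradio as gr


def process(in_filename: str):
    # Stand-in for the process() updated in this commit; the real function
    # returns (vocals_filename, accompaniment_filename, html_info).
    return in_filename, in_filename, "<div>done</div>"


with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.TabItem("Upload from disk"):
            in_audio = gr.Audio(type="filepath", label="Input audio")
            submit = gr.Button("Separate")
            vocals_out = gr.Audio(label="Vocals")
            accompaniment_out = gr.Audio(label="Accompaniment")
            info_out = gr.HTML()
            submit.click(
                process,
                inputs=in_audio,
                outputs=[vocals_out, accompaniment_out, info_out],
            )

demo.launch()
```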
 