cstr committed on
Commit
64259e4
·
verified ·
1 Parent(s): abafc9b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +44 -21
app.py CHANGED
@@ -6,6 +6,7 @@ import subprocess
6
  import tempfile
7
  import requests
8
  from urllib.parse import urlparse
 
9
 
10
  # Clone and install faster-whisper from GitHub
11
  try:
@@ -90,7 +91,7 @@ def youtube_dl_alternative_method(url):
90
  ydl_opts = {
91
  'format': 'bestaudio/best',
92
  'postprocessors': [{
93
- 'key': 'FFmpegExtractAudio',
94
  'preferredcodec': 'mp3',
95
  'preferredquality': '192',
96
  }],
@@ -101,7 +102,7 @@ def youtube_dl_alternative_method(url):
101
  'prefer_insecure': True,
102
  }
103
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
104
- info = ydl.extract_info(url, download=True)
105
  return f"{info['id']}.mp3"
106
 
107
  def ffmpeg_method(url):
@@ -110,7 +111,7 @@ def ffmpeg_method(url):
110
  subprocess.run(command, check=True, capture_output=True)
111
  return output_file
112
 
113
- def aria2_method(url):
114
  output_file = tempfile.mktemp(suffix='.mp3')
115
  command = ['aria2c', '--split=4', '--max-connection-per-server=4', '--out', output_file, url]
116
  subprocess.run(command, check=True, capture_output=True)
@@ -130,14 +131,21 @@ def download_direct_audio(url, method_choice):
130
  raise Exception(f"Failed to download audio from {url}")
131
  except Exception as e:
132
  return f"Error downloading direct audio: {str(e)}"
133
-
134
  def wget_method(url):
135
- output_file = tempfile.mktemp(suffix='.mp3')
136
  command = ['wget', '-O', output_file, url]
137
  subprocess.run(command, check=True, capture_output=True)
138
  return output_file
139
 
140
- def transcribe_audio(input_source, batch_size, download_method):
 
 
 
 
 
 
 
141
  try:
142
  # Initialize the model
143
  model = WhisperModel("cstr/whisper-large-v3-turbo-int8_float32", device="auto", compute_type="int8")
@@ -154,25 +162,31 @@ def transcribe_audio(input_source, batch_size, download_method):
154
  # It's a local file path
155
  audio_path = input_source
156
 
 
 
 
 
 
157
  # Benchmark transcription time
158
- start_time = time.time()
159
- segments, info = batched_model.transcribe(audio_path, batch_size=batch_size)
160
- end_time = time.time()
161
 
162
  # Show initial metrics as soon as possible
163
- transcription_time = end_time - start_time
164
  real_time_factor = info.duration / transcription_time
165
  audio_file_size = os.path.getsize(audio_path) / (1024 * 1024) # Size in MB
166
-
167
  metrics_output = (
168
  f"Language: {info.language}, Probability: {info.language_probability:.2f}\n"
169
- f"Duration: {info.duration:.2f}s, Duration after VAD: {info.duration_after_vad:.2f}s\n"
170
  f"Transcription time: {transcription_time:.2f} seconds\n"
171
  f"Real-time factor: {real_time_factor:.2f}x\n"
172
  f"Audio file size: {audio_file_size:.2f} MB\n"
173
  )
174
 
175
- yield metrics_output, "", None
 
176
 
177
  transcription = ""
178
 
@@ -181,7 +195,8 @@ def transcribe_audio(input_source, batch_size, download_method):
181
  transcription_segment = f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}\n"
182
  transcription += transcription_segment
183
 
184
- yield metrics_output, transcription, None
 
185
 
186
  # Final output with download option
187
  transcription_file = save_transcription(transcription)
@@ -191,12 +206,17 @@ def transcribe_audio(input_source, batch_size, download_method):
191
  yield f"An error occurred: {str(e)}", "", None
192
 
193
  finally:
194
- # Clean up downloaded file if it was a URL
195
  if isinstance(input_source, str) and (input_source.startswith('http://') or input_source.startswith('https://')):
196
  try:
197
  os.remove(audio_path)
198
  except:
199
  pass
 
 
 
 
 
200
 
201
  def save_transcription(transcription):
202
  file_path = tempfile.mktemp(suffix='.txt')
@@ -210,21 +230,24 @@ iface = gr.Interface(
210
  inputs=[
211
  gr.Textbox(label="Audio Source (Upload, MP3 URL, or YouTube URL)"),
212
  gr.Slider(minimum=1, maximum=32, step=1, value=16, label="Batch Size"),
213
- gr.Dropdown(choices=["yt-dlp", "pytube", "youtube-dl", "yt-dlp-alt", "ffmpeg", "aria2", "wget"], label="Download Method", value="yt-dlp")
 
 
 
214
  ],
215
  outputs=[
216
  gr.Textbox(label="Transcription Metrics and Verbose Messages", live=True),
217
  gr.Textbox(label="Transcription", live=True),
218
- gr.File(label="Download Transcription")
219
  ],
220
  title="Faster Whisper Multi-Input Transcription",
221
  description="Enter an audio file path, MP3 URL, or YouTube URL to transcribe using Faster Whisper (GitHub version). Adjust the batch size and choose a download method.",
222
  examples=[
223
- ["https://www.youtube.com/watch?v=daQ_hqA6HDo", 16, "yt-dlp"],
224
- ["https://mcdn.podbean.com/mf/web/dir5wty678b6g4vg/HoP_453_-_The_Price_is_Right_-_Law_and_Economics_in_the_Second_Scholastic5yxzh.mp3", 16, "ffmpeg"],
225
- ["path/to/local/audio.mp3", 16, "yt-dlp"]
226
  ],
227
  cache_examples=False # Prevents automatic processing of examples
228
  )
229
 
230
- iface.launch()
 
6
  import tempfile
7
  import requests
8
  from urllib.parse import urlparse
9
+ from pydub import AudioSegment
10
 
11
  # Clone and install faster-whisper from GitHub
12
  try:
 
91
  ydl_opts = {
92
  'format': 'bestaudio/best',
93
  'postprocessors': [{
94
+ 'key': 'FFmpegExtractAudio',
95
  'preferredcodec': 'mp3',
96
  'preferredquality': '192',
97
  }],
 
102
  'prefer_insecure': True,
103
  }
104
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
105
+ info = ydl.extract_info(url, download=True)
106
  return f"{info['id']}.mp3"
107
 
108
  def ffmpeg_method(url):
 
111
  subprocess.run(command, check=True, capture_output=True)
112
  return output_file
113
 
114
+ def aria2_method(url):
115
  output_file = tempfile.mktemp(suffix='.mp3')
116
  command = ['aria2c', '--split=4', '--max-connection-per-server=4', '--out', output_file, url]
117
  subprocess.run(command, check=True, capture_output=True)
 
131
  raise Exception(f"Failed to download audio from {url}")
132
  except Exception as e:
133
  return f"Error downloading direct audio: {str(e)}"
134
+
135
def wget_method(url):
    """Download ``url`` into a temporary ``.mp3`` file using the ``wget`` CLI.

    Args:
        url: Address of the file to fetch.

    Returns:
        Path of the downloaded file. The caller is responsible for deleting it.

    Raises:
        subprocess.CalledProcessError: If ``wget`` exits with a non-zero status.
    """
    # mkstemp creates the file atomically, which avoids the name race of the
    # deprecated tempfile.mktemp; wget only needs the path, so close the handle.
    fd, output_file = tempfile.mkstemp(suffix='.mp3')
    os.close(fd)
    command = ['wget', '-O', output_file, url]
    try:
        subprocess.run(command, check=True, capture_output=True)
    except Exception:
        # Do not leave an empty or partial download behind when wget fails.
        try:
            os.remove(output_file)
        except OSError:
            pass
        raise
    return output_file
140
 
141
def trim_audio(audio_path, start_time, end_time):
    """Cut a time window out of an audio file and save it as a temporary MP3.

    Args:
        audio_path: Path of the audio file to trim.
        start_time: Window start, in seconds.
        end_time: Window end, in seconds. A falsy value (``None`` or ``0``)
            means "up to the end of the audio".

    Returns:
        Path of the newly written MP3 file. The caller is responsible for
        deleting it.
    """
    # from_file does not force MP3 decoding, so non-MP3 inputs (e.g. local
    # WAV/M4A files) can be trimmed as well.
    audio = AudioSegment.from_file(audio_path)

    # pydub slices are expressed in milliseconds.
    start_ms = start_time * 1000
    if end_time:
        trimmed_audio = audio[start_ms:end_time * 1000]
    else:
        trimmed_audio = audio[start_ms:]

    # mkstemp creates the file atomically, which avoids the name race of the
    # deprecated tempfile.mktemp; export() reopens the path itself.
    fd, trimmed_audio_path = tempfile.mkstemp(suffix='.mp3')
    os.close(fd)
    trimmed_audio.export(trimmed_audio_path, format="mp3")
    return trimmed_audio_path
147
+
148
+ def transcribe_audio(input_source, batch_size, download_method, start_time=None, end_time=None, verbose=False):
149
  try:
150
  # Initialize the model
151
  model = WhisperModel("cstr/whisper-large-v3-turbo-int8_float32", device="auto", compute_type="int8")
 
162
  # It's a local file path
163
  audio_path = input_source
164
 
165
+ # Trim the audio if start_time or end_time is specified
166
+ if start_time is not None or end_time is not None:
167
+ trimmed_audio_path = trim_audio(audio_path, start_time or 0, end_time)
168
+ audio_path = trimmed_audio_path
169
+
170
  # Benchmark transcription time
171
+ start_time_perf = time.time()
172
+ segments, info = batched_model.transcribe(audio_path, batch_size=batch_size, initial_prompt=None)
173
+ end_time_perf = time.time()
174
 
175
  # Show initial metrics as soon as possible
176
+ transcription_time = end_time_perf - start_time_perf
177
  real_time_factor = info.duration / transcription_time
178
  audio_file_size = os.path.getsize(audio_path) / (1024 * 1024) # Size in MB
179
+
180
  metrics_output = (
181
  f"Language: {info.language}, Probability: {info.language_probability:.2f}\n"
182
+ f"Duration: {info.duration:.2f}s, Duration after VAD: {info.duration_after_vad:.2f}s\n"
183
  f"Transcription time: {transcription_time:.2f} seconds\n"
184
  f"Real-time factor: {real_time_factor:.2f}x\n"
185
  f"Audio file size: {audio_file_size:.2f} MB\n"
186
  )
187
 
188
+ if verbose:
189
+ yield metrics_output, "", None
190
 
191
  transcription = ""
192
 
 
195
  transcription_segment = f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}\n"
196
  transcription += transcription_segment
197
 
198
+ if verbose:
199
+ yield metrics_output, transcription, None
200
 
201
  # Final output with download option
202
  transcription_file = save_transcription(transcription)
 
206
  yield f"An error occurred: {str(e)}", "", None
207
 
208
  finally:
209
+ # Clean up downloaded and trimmed files
210
  if isinstance(input_source, str) and (input_source.startswith('http://') or input_source.startswith('https://')):
211
  try:
212
  os.remove(audio_path)
213
  except:
214
  pass
215
+ if start_time is not None or end_time is not None:
216
+ try:
217
+ os.remove(trimmed_audio_path)
218
+ except:
219
+ pass
220
 
221
  def save_transcription(transcription):
222
  file_path = tempfile.mktemp(suffix='.txt')
 
230
  inputs=[
231
  gr.Textbox(label="Audio Source (Upload, MP3 URL, or YouTube URL)"),
232
  gr.Slider(minimum=1, maximum=32, step=1, value=16, label="Batch Size"),
233
+ gr.Dropdown(choices=["yt-dlp", "pytube", "youtube-dl", "yt-dlp-alt", "ffmpeg", "aria2", "wget"], label="Download Method", value="yt-dlp"),
234
+ gr.Number(label="Start Time (seconds)", value=0, optional=True),
235
+ gr.Number(label="End Time (seconds)", optional=True),
236
+ gr.Checkbox(label="Verbose Output", value=False)
237
  ],
238
  outputs=[
239
  gr.Textbox(label="Transcription Metrics and Verbose Messages", live=True),
240
  gr.Textbox(label="Transcription", live=True),
241
+ gr.File(label="Download Transcription")
242
  ],
243
  title="Faster Whisper Multi-Input Transcription",
244
  description="Enter an audio file path, MP3 URL, or YouTube URL to transcribe using Faster Whisper (GitHub version). Adjust the batch size and choose a download method.",
245
  examples=[
246
+ ["https://www.youtube.com/watch?v=daQ_hqA6HDo", 16, "yt-dlp", 0, None, False],
247
+ ["https://mcdn.podbean.com/mf/web/dir5wty678b6g4vg/HoP_453_-_The_Price_is_Right_-_Law_and_Economics_in_the_Second_Scholastic5yxzh.mp3", 16, "ffmpeg", 0, 300, True],
248
+ ["path/to/local/audio.mp3", 16, "yt-dlp", 60, 180, False]
249
  ],
250
  cache_examples=False # Prevents automatic processing of examples
251
  )
252
 
253
+ iface.launch()