transcribe_audio

Running

App Files Community

cstr committed on Oct 2, 2024

Commit

64259e4

verified ·

1 Parent(s): abafc9b

Update app.py

Browse files

Files changed (1) hide show

app.py +44 -21

app.py CHANGED Viewed

@@ -6,6 +6,7 @@ import subprocess
 import tempfile
 import requests
 from urllib.parse import urlparse
 # Clone and install faster-whisper from GitHub
 try:
@@ -90,7 +91,7 @@ def youtube_dl_alternative_method(url):
     ydl_opts = {
         'format': 'bestaudio/best',
         'postprocessors': [{
-            'key': 'FFmpegExtractAudio',
             'preferredcodec': 'mp3',
             'preferredquality': '192',
         }],
@@ -101,7 +102,7 @@ def youtube_dl_alternative_method(url):
         'prefer_insecure': True,
     }
     with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-        info = ydl.extract_info(url, download=True)
         return f"{info['id']}.mp3"
 def ffmpeg_method(url):
@@ -110,7 +111,7 @@ def ffmpeg_method(url):
     subprocess.run(command, check=True, capture_output=True)
     return output_file
-def aria2_method(url):
     output_file = tempfile.mktemp(suffix='.mp3')
     command = ['aria2c', '--split=4', '--max-connection-per-server=4', '--out', output_file, url]
     subprocess.run(command, check=True, capture_output=True)
@@ -130,14 +131,21 @@ def download_direct_audio(url, method_choice):
                 raise Exception(f"Failed to download audio from {url}")
         except Exception as e:
             return f"Error downloading direct audio: {str(e)}"
 def wget_method(url):
-    output_file = tempfile.mktemp(suffix='.mp3')
     command = ['wget', '-O', output_file, url]
     subprocess.run(command, check=True, capture_output=True)
     return output_file
-def transcribe_audio(input_source, batch_size, download_method):
     try:
         # Initialize the model
         model = WhisperModel("cstr/whisper-large-v3-turbo-int8_float32", device="auto", compute_type="int8")
@@ -154,25 +162,31 @@ def transcribe_audio(input_source, batch_size, download_method):
             # It's a local file path
             audio_path = input_source
         # Benchmark transcription time
-        start_time = time.time()
-        segments, info = batched_model.transcribe(audio_path, batch_size=batch_size)
-        end_time = time.time()
         # Show initial metrics as soon as possible
-        transcription_time = end_time - start_time
         real_time_factor = info.duration / transcription_time
         audio_file_size = os.path.getsize(audio_path) / (1024 * 1024)  # Size in MB
         metrics_output = (
             f"Language: {info.language}, Probability: {info.language_probability:.2f}\n"
-            f"Duration: {info.duration:.2f}s, Duration after VAD: {info.duration_after_vad:.2f}s\n"
             f"Transcription time: {transcription_time:.2f} seconds\n"
             f"Real-time factor: {real_time_factor:.2f}x\n"
             f"Audio file size: {audio_file_size:.2f} MB\n"
         )
-        yield metrics_output, "", None
         transcription = ""
@@ -181,7 +195,8 @@ def transcribe_audio(input_source, batch_size, download_method):
             transcription_segment = f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}\n"
             transcription += transcription_segment
-            yield metrics_output, transcription, None
         # Final output with download option
         transcription_file = save_transcription(transcription)
@@ -191,12 +206,17 @@ def transcribe_audio(input_source, batch_size, download_method):
         yield f"An error occurred: {str(e)}", "", None
     finally:
-        # Clean up downloaded file if it was a URL
         if isinstance(input_source, str) and (input_source.startswith('http://') or input_source.startswith('https://')):
             try:
                 os.remove(audio_path)
             except:
                 pass
 def save_transcription(transcription):
     file_path = tempfile.mktemp(suffix='.txt')
@@ -210,21 +230,24 @@ iface = gr.Interface(
     inputs=[
         gr.Textbox(label="Audio Source (Upload, MP3 URL, or YouTube URL)"),
         gr.Slider(minimum=1, maximum=32, step=1, value=16, label="Batch Size"),
-        gr.Dropdown(choices=["yt-dlp", "pytube", "youtube-dl", "yt-dlp-alt", "ffmpeg", "aria2", "wget"], label="Download Method", value="yt-dlp")
     ],
     outputs=[
         gr.Textbox(label="Transcription Metrics and Verbose Messages", live=True),
         gr.Textbox(label="Transcription", live=True),
-        gr.File(label="Download Transcription")
     ],
     title="Faster Whisper Multi-Input Transcription",
     description="Enter an audio file path, MP3 URL, or YouTube URL to transcribe using Faster Whisper (GitHub version). Adjust the batch size and choose a download method.",
     examples=[
-        ["https://www.youtube.com/watch?v=daQ_hqA6HDo", 16, "yt-dlp"],
-        ["https://mcdn.podbean.com/mf/web/dir5wty678b6g4vg/HoP_453_-_The_Price_is_Right_-_Law_and_Economics_in_the_Second_Scholastic5yxzh.mp3", 16, "ffmpeg"],
-        ["path/to/local/audio.mp3", 16, "yt-dlp"]
     ],
     cache_examples=False  # Prevents automatic processing of examples
 )
-iface.launch()

 import tempfile
 import requests
 from urllib.parse import urlparse
+from pydub import AudioSegment
 # Clone and install faster-whisper from GitHub
 try:
     ydl_opts = {
         'format': 'bestaudio/best',
         'postprocessors': [{
+            'key': 'FFmpegExtractAudio',
             'preferredcodec': 'mp3',
             'preferredquality': '192',
         }],
         'prefer_insecure': True,
     }
     with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+        info = ydl.extract_info(url, download=True)
         return f"{info['id']}.mp3"
 def ffmpeg_method(url):
     subprocess.run(command, check=True, capture_output=True)
     return output_file
+def aria2_method(url):
     output_file = tempfile.mktemp(suffix='.mp3')
     command = ['aria2c', '--split=4', '--max-connection-per-server=4', '--out', output_file, url]
     subprocess.run(command, check=True, capture_output=True)
                 raise Exception(f"Failed to download audio from {url}")
         except Exception as e:
             return f"Error downloading direct audio: {str(e)}"
 def wget_method(url):
+    output_file = tempfile.mktemp(suffix='.mp3')
     command = ['wget', '-O', output_file, url]
     subprocess.run(command, check=True, capture_output=True)
     return output_file
+def trim_audio(audio_path, start_time, end_time):
+    audio = AudioSegment.from_mp3(audio_path)
+    trimmed_audio = audio[start_time*1000:end_time*1000] if end_time else audio[start_time*1000:]
+    trimmed_audio_path = tempfile.mktemp(suffix='.mp3')
+    trimmed_audio.export(trimmed_audio_path, format="mp3")
+    return trimmed_audio_path
+def transcribe_audio(input_source, batch_size, download_method, start_time=None, end_time=None, verbose=False):
     try:
         # Initialize the model
         model = WhisperModel("cstr/whisper-large-v3-turbo-int8_float32", device="auto", compute_type="int8")
             # It's a local file path
             audio_path = input_source
+        # Trim the audio if start_time or end_time is specified
+        if start_time is not None or end_time is not None:
+            trimmed_audio_path = trim_audio(audio_path, start_time or 0, end_time)
+            audio_path = trimmed_audio_path
         # Benchmark transcription time
+        start_time_perf = time.time()
+        segments, info = batched_model.transcribe(audio_path, batch_size=batch_size, initial_prompt=None)
+        end_time_perf = time.time()
         # Show initial metrics as soon as possible
+        transcription_time = end_time_perf - start_time_perf
         real_time_factor = info.duration / transcription_time
         audio_file_size = os.path.getsize(audio_path) / (1024 * 1024)  # Size in MB
         metrics_output = (
             f"Language: {info.language}, Probability: {info.language_probability:.2f}\n"
+            f"Duration: {info.duration:.2f}s, Duration after VAD: {info.duration_after_vad:.2f}s\n"
             f"Transcription time: {transcription_time:.2f} seconds\n"
             f"Real-time factor: {real_time_factor:.2f}x\n"
             f"Audio file size: {audio_file_size:.2f} MB\n"
         )
+        if verbose:
+            yield metrics_output, "", None
         transcription = ""
             transcription_segment = f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}\n"
             transcription += transcription_segment
+            if verbose:
+                yield metrics_output, transcription, None
         # Final output with download option
         transcription_file = save_transcription(transcription)
         yield f"An error occurred: {str(e)}", "", None
     finally:
+        # Clean up downloaded and trimmed files
         if isinstance(input_source, str) and (input_source.startswith('http://') or input_source.startswith('https://')):
             try:
                 os.remove(audio_path)
             except:
                 pass
+        if start_time is not None or end_time is not None:
+            try:
+                os.remove(trimmed_audio_path)
+            except:
+                pass
 def save_transcription(transcription):
     file_path = tempfile.mktemp(suffix='.txt')
     inputs=[
         gr.Textbox(label="Audio Source (Upload, MP3 URL, or YouTube URL)"),
         gr.Slider(minimum=1, maximum=32, step=1, value=16, label="Batch Size"),
+        gr.Dropdown(choices=["yt-dlp", "pytube", "youtube-dl", "yt-dlp-alt", "ffmpeg", "aria2", "wget"], label="Download Method", value="yt-dlp"),
+        gr.Number(label="Start Time (seconds)", value=0, optional=True),
+        gr.Number(label="End Time (seconds)", optional=True),
+        gr.Checkbox(label="Verbose Output", value=False)
     ],
     outputs=[
         gr.Textbox(label="Transcription Metrics and Verbose Messages", live=True),
         gr.Textbox(label="Transcription", live=True),
+        gr.File(label="Download Transcription")
     ],
     title="Faster Whisper Multi-Input Transcription",
     description="Enter an audio file path, MP3 URL, or YouTube URL to transcribe using Faster Whisper (GitHub version). Adjust the batch size and choose a download method.",
     examples=[
+        ["https://www.youtube.com/watch?v=daQ_hqA6HDo", 16, "yt-dlp", 0, None, False],
+        ["https://mcdn.podbean.com/mf/web/dir5wty678b6g4vg/HoP_453_-_The_Price_is_Right_-_Law_and_Economics_in_the_Second_Scholastic5yxzh.mp3", 16, "ffmpeg", 0, 300, True],
+        ["path/to/local/audio.mp3", 16, "yt-dlp", 60, 180, False]
     ],
     cache_examples=False  # Prevents automatic processing of examples
 )
+iface.launch()