Spaces:

sikhuni
/

yt_to_text_model

Build error

App Files Community

amritsar committed on Dec 13, 2024

Commit

b32079a

verified ·

1 Parent(s): 5d70d44

Update app.py

Browse files

Files changed (1) hide show

app.py +23 -21

app.py CHANGED Viewed

@@ -1,12 +1,13 @@
 import gradio as gr
-from transformers import AutoProcessor, SeamlessM4Tv2Model
 from pytube import YouTube
 import os
-# Load the SeamlessM4T model and processor for Punjabi transcription
-model_id = "facebook/seamless-m4t-v2-large"
-processor = AutoProcessor.from_pretrained(model_id)
-model = SeamlessM4Tv2Model.from_pretrained(model_id)
 def transcribe_youtube_video(youtube_url):
     try:
@@ -14,31 +15,32 @@ def transcribe_youtube_video(youtube_url):
         yt = YouTube(youtube_url)
         audio_stream = yt.streams.filter(only_audio=True).first()
         audio_file_path = audio_stream.download(filename="audio.mp4")
-        # Process the downloaded audio
-        audio_inputs = processor(audios=audio_file_path, return_tensors="pt")
-        # Generate transcription
-        output = model.generate(**audio_inputs)
-        # Decode the output to get transcription text
-        transcription = processor.decode(output[0], skip_special_tokens=True)
-        # Clean up by removing the downloaded audio file
         os.remove(audio_file_path)
-        return transcription
     except Exception as e:
         return f"Error: {str(e)}"
-# Gradio interface for inputting a YouTube URL
 iface = gr.Interface(
     fn=transcribe_youtube_video,
     inputs=gr.Textbox(label="YouTube Video URL"),
-    outputs=gr.Textbox(label="Punjabi Transcription"),
-    title="Punjabi YouTube Video Transcription",
-    description="Enter a YouTube video URL to download and transcribe its Punjabi audio."
 )
 iface.launch()

 import gradio as gr
+from transformers import WhisperProcessor, WhisperForConditionalGeneration
 from pytube import YouTube
+import torch
 import os
+# Load Whisper model
+model_id = "openai/whisper-small"
+processor = WhisperProcessor.from_pretrained(model_id)
+model = WhisperForConditionalGeneration.from_pretrained(model_id)
 def transcribe_youtube_video(youtube_url):
     try:
         yt = YouTube(youtube_url)
         audio_stream = yt.streams.filter(only_audio=True).first()
         audio_file_path = audio_stream.download(filename="audio.mp4")
+        # Load and preprocess the audio
+        import librosa
+        audio, _ = librosa.load(audio_file_path, sr=16000)
+        input_features = processor(audio, sampling_rate=16000, return_tensors="pt").input_features
+        # Generate token ids
+        predicted_ids = model.generate(input_features)
+        # Decode token ids to text
+        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
+        # Clean up
         os.remove(audio_file_path)
+        return transcription[0]
     except Exception as e:
         return f"Error: {str(e)}"
+# Gradio interface
 iface = gr.Interface(
     fn=transcribe_youtube_video,
     inputs=gr.Textbox(label="YouTube Video URL"),
+    outputs=gr.Textbox(label="Transcription"),
+    title="YouTube Video Transcription",
+    description="Enter a YouTube video URL to transcribe its audio."
 )
 iface.launch()