Spaces:

danilotpnta
/

Youtube-Whisper

Runtime error

App Files Files Community

danilotpnta committed on Sep 14, 2024

Commit

afbb88c

1 Parent(s): c646c7f

fix: handle captcha detection

Browse files

Files changed (3) hide show

app.py +17 -30
download_video.py +79 -0
environment.yml +4 -2

app.py CHANGED Viewed

@@ -1,35 +1,17 @@
 import whisper
 import gradio as gr
 import os
-import subprocess
-# Try to install yt-dlp if not available
-try:
-    subprocess.check_call(["pip", "install", "yt-dlp"])
-except subprocess.CalledProcessError as e:
-    print(f"Error installing yt-dlp: {e}")
-import yt_dlp
-# Function to download the audio and extract metadata from YouTube
-def download_video_info(url):
-    ydl_opts = {
-        'format': 'bestaudio/best',
-        'outtmpl': 'audio.%(ext)s',
-        'postprocessors': [{
-            'key': 'FFmpegExtractAudio',
-            'preferredcodec': 'mp3',
-            'preferredquality': '192',
-        }],
-    }
     try:
-        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-            info = ydl.extract_info(url, download=False)  # Extract video info
-            title = info.get('title', 'Unknown Title')
-            thumbnail_url = info.get('thumbnail', '')
-            ydl.download([url])  # Download the audio
-        audio_file = "audio.mp3"
         return audio_file, title, thumbnail_url
     except Exception as e:
         return None, None, str(e)
@@ -41,9 +23,9 @@ def transcribe_audio(audio_path, model_size="base"):
     return result['text']
 # Split logic: First fetch title and thumbnail, then transcribe
-def get_video_info_and_transcribe(youtube_url, model_size="base"):
     # Fetch title and thumbnail first
-    audio_path, title, thumbnail_url = download_video_info(youtube_url)
     # If fetching video info fails
     if not audio_path or not os.path.exists(audio_path):
@@ -51,8 +33,13 @@ def get_video_info_and_transcribe(youtube_url, model_size="base"):
     # Show title and thumbnail to the user while the transcription is happening
     title_output = gr.update(value=title)
-    thumbnail_output = gr.update(value=thumbnail_url)
     # Start transcription
     transcription = transcribe_audio(audio_path, model_size)

 import whisper
 import gradio as gr
 import os
+import asyncio
+# Import the async download function from your download script
+from download_video import download_mp3_playwright
+# Function to download the audio, title, and thumbnail from YouTube
+async def download_video_info(url):
+    """Download the audio for *url* and return ``(audio_file, title, thumbnail_url)``.
+
+    Awaits ``download_mp3_playwright(url)`` for the title and thumbnail URL and
+    pairs them with the fixed path ``"downloaded_video.mp4"``. The path is
+    returned without checking that the file was actually written, so callers
+    must verify that it exists before using it.
+
+    On any exception the function does not raise; it returns
+    ``(None, None, str(e))``, i.e. the error text travels in the third slot.
+    """
     try:
+        # Call the async function to download video and get title and thumbnail
+        title, thumbnail_url = await download_mp3_playwright(url)
+        audio_file = "downloaded_video.mp4"  # Path to the downloaded audio
         return audio_file, title, thumbnail_url
     except Exception as e:
+        # Errors are reported through the return value, not re-raised
         return None, None, str(e)
     return result['text']
 # Split logic: First fetch title and thumbnail, then transcribe
+async def get_video_info_and_transcribe(youtube_url, model_size="base"):
     # Fetch title and thumbnail first
+    audio_path, title, thumbnail_url = await download_video_info(youtube_url)
     # If fetching video info fails
     if not audio_path or not os.path.exists(audio_path):
     # Show title and thumbnail to the user while the transcription is happening
     title_output = gr.update(value=title)
+    # Show the thumbnail if available
+    if thumbnail_url:
+        thumbnail_output = gr.update(value=thumbnail_url)
+    else:
+        thumbnail_output = gr.update(visible=False)  # Hide if no thumbnail
     # Start transcription
     transcription = transcribe_audio(audio_path, model_size)

download_video.py ADDED Viewed

	@@ -0,0 +1,79 @@

+import asyncio
+from playwright.async_api import async_playwright
+import requests
+async def download_mp3_playwright(youtube_url):
+    async with async_playwright() as p:
+        # Launch browser in headless mode
+        browser = await p.chromium.launch(headless=True)
+        page = await browser.new_page()
+        # Open the YouTube video page
+        await page.goto(youtube_url)
+        # Scrape the title
+        title = await page.title()  # This gives you the video title
+        # Scrape the thumbnail (YouTube page has a meta tag for the thumbnail)
+        thumbnail_url = await page.get_attribute('meta[property="og:image"]', 'content')
+        # Open the YouTube downloader site
+        await page.goto("https://yt1d.com/en/")
+        # Input the YouTube URL into the downloader
+        await page.fill("input#txt-url", youtube_url)
+        await page.press("input#txt-url", "Enter")  # Simulate pressing enter
+        # Wait for the MP3 download button to appear
+        await page.wait_for_selector("button[data-ftype='mp3']")
+        # Extract the download URL for the MP3
+        download_button = await page.query_selector("button[data-ftype='mp3']")
+        onclick_attr = await download_button.get_attribute("onclick")
+        # Extract parameters from the JavaScript function call
+        params = onclick_attr.split("'")
+        if len(params) >= 7:
+            mp3_download_url = params[1]  # Extracted base download URL
+            # Wait for the JavaScript to modify the link
+            await page.wait_for_function(
+                """() => document.querySelector('a[href*="googlevideo.com/videoplayback"]')"""
+            )
+            # Get the final download URL after JavaScript modifications
+            final_link = await page.query_selector("a[href*='googlevideo.com/videoplayback']")
+            mp3_download_url = await final_link.get_attribute("href")
+            print(f"Final MP3 Download URL: {mp3_download_url}")
+            response = requests.get(mp3_download_url, stream=True)
+            # Check if the request was successful
+            if response.status_code == 200:
+                # Write the video content to a file
+                output_file = "downloaded_video.mp4"
+                with open(output_file, "wb") as f:
+                    for chunk in response.iter_content(chunk_size=1024):
+                        if chunk:
+                            f.write(chunk)
+                print(f"Video downloaded successfully as {output_file}")
+            else:
+                print(f"Failed to download video. HTTP Status Code: {response.status_code}")
+        else:
+            print("Failed to extract MP3 download link from the page.")
+        # Close the browser
+        await browser.close()
+        # Return the title and thumbnail for display
+        return title, thumbnail_url
+# Call the async function
+async def main():
+    youtube_url = "https://youtu.be/MAZyQ-38b8M?si=q0dai-wF6FQz6MGN"
+    await download_mp3_playwright(youtube_url)
+# Run the asyncio loop
+if __name__ == "__main__":
+    asyncio.run(main())

environment.yml CHANGED Viewed

@@ -11,5 +11,7 @@ dependencies:
       - pytube==15.0.0
       - git+https://github.com/openai/whisper.git
       - torch==2.0.1
-      - yt-dlp
-      - pydantic==1.10  # Use Pydantic v1 to avoid the incompatibility

       - pytube==15.0.0
       - git+https://github.com/openai/whisper.git
       - torch==2.0.1
+      - pydantic==1.10  # Use Pydantic v1 to avoid incompatibility issues
+      - BeautifulSoup4
+      - playwright  # Required for Playwright-based scraping and downloads
+      - requests  # Used for handling HTTP requests during downloads