Spaces:

danilotpnta
/

Youtube-Whisper

Runtime error

App Files Files Community

danilotpnta committed on Sep 16, 2024

Commit

f4549cb

1 Parent(s): 035bc5f

update to selenium

Browse files

Files changed (4) hide show

app.py +10 -35
download_video.py +77 -65
environment.yml +5 -11
requirements.txt +13 -7

app.py CHANGED Viewed

@@ -1,46 +1,21 @@
 import whisper
 import gradio as gr
 import os
-import asyncio
-import subprocess
-# Ensure Playwright is installed and browsers are downloaded
-def install_playwright():
-    try:
-        # Install Playwright via pip if not already installed
-        subprocess.run(["pip", "install", "playwright"], check=True)
-        # Install the Playwright browsers
-        subprocess.run(["playwright", "install"], check=True)
-        print("Playwright and browsers installed.")
-    except subprocess.CalledProcessError as e:
-        print(f"Error during Playwright setup: {e}")
-        exit(1)
-# Call the function to install Playwright
-install_playwright()
-from download_video import download_mp3_playwright
-# Function to convert MP4 to MP3 using FFmpeg
-def convert_to_mp3(input_file, output_file):
-    command = ["ffmpeg", "-i", input_file, "-q:a", "0", "-map", "a", output_file]
-    try:
-        subprocess.run(command, check=True)
-    except subprocess.CalledProcessError as e:
-        print(f"Error converting {input_file} to {output_file}: {e}")
 # Function to download the audio, title, and thumbnail from YouTube
-async def download_video_info(url):
     try:
-        # Call the async function to download video and get title and thumbnail
-        title, thumbnail_url = await download_mp3_playwright(url)
-        audio_file = "downloaded_video.mp4"  # Path to the downloaded audio
-        # Convert MP4 to MP3 before passing to Whisper
-        convert_to_mp3(audio_file, "downloaded_audio.mp3")
-        return "downloaded_audio.mp3", title, thumbnail_url
     except Exception as e:
         return None, None, str(e)
@@ -51,9 +26,9 @@ def transcribe_audio(audio_path, model_size="base"):
     return result['text']
 # Split logic: First fetch title and thumbnail, then transcribe
-async def get_video_info_and_transcribe(youtube_url, model_size="base"):
     # Fetch title and thumbnail first
-    audio_path, title, thumbnail_url = await download_video_info(youtube_url)
     # If fetching video info fails
     if not audio_path or not os.path.exists(audio_path):

 import whisper
 import gradio as gr
 import os
+import warnings
+warnings.filterwarnings("ignore", category=FutureWarning, module="torch")
+from download_video import download_mp3_selenium  # Selenium-based downloader defined in download_video.py
 # Function to download the audio, title, and thumbnail from YouTube
+def download_video_info(url):
     try:
+        # Call the function to download video and get title and thumbnail
+        title, thumbnail_url = download_mp3_selenium(url)
+        audio_file = "downloaded_video.mp4"  # Path to the downloaded audio (MP4)
+        return audio_file, title, thumbnail_url
     except Exception as e:
         return None, None, str(e)
     return result['text']
 # Split logic: First fetch title and thumbnail, then transcribe
+def get_video_info_and_transcribe(youtube_url, model_size="base"):
     # Fetch title and thumbnail first
+    audio_path, title, thumbnail_url = download_video_info(youtube_url)
     # If fetching video info fails
     if not audio_path or not os.path.exists(audio_path):

download_video.py CHANGED Viewed

@@ -1,68 +1,80 @@
-from playwright.async_api import async_playwright
-async def download_mp3_playwright(youtube_url):
-    async with async_playwright() as p:
-        # Launch browser in headless mode
-        browser = await p.chromium.launch(headless=True, args=['--no-sandbox'])
-        page = await browser.new_page()
-        # Open the YouTube video page
-        await page.goto(youtube_url)
-        # Scrape the title
-        title = await page.title()  # This gives you the video title
-        # Scrape the thumbnail (YouTube page has a meta tag for the thumbnail)
-        thumbnail_url = await page.get_attribute('meta[property="og:image"]', 'content')
-        # Open the YouTube downloader site
-        await page.goto("https://yt1d.com/en/")
-        # Input the YouTube URL into the downloader
-        await page.fill("input#txt-url", youtube_url)
-        await page.press("input#txt-url", "Enter")  # Simulate pressing enter
-        # Wait for the MP3 download button to appear
-        await page.wait_for_selector("button[data-ftype='mp3']")
-        # Extract the download URL for the MP3
-        download_button = await page.query_selector("button[data-ftype='mp3']")
-        onclick_attr = await download_button.get_attribute("onclick")
-        # Extract parameters from the JavaScript function call
-        params = onclick_attr.split("'")
-        if len(params) >= 7:
-            mp3_download_url = params[1]  # Extracted base download URL
-            # Wait for the JavaScript to modify the link
-            await page.wait_for_function(
-                """() => document.querySelector('a[href*="googlevideo.com/videoplayback"]')"""
-            )
-            # Get the final download URL after JavaScript modifications
-            final_link = await page.query_selector("a[href*='googlevideo.com/videoplayback']")
-            mp3_download_url = await final_link.get_attribute("href")
-            print(f"Final MP3 Download URL: {mp3_download_url}")
-            response = requests.get(mp3_download_url, stream=True)
-            # Check if the request was successful
-            if response.status_code == 200:
-                # Write the video content to a file
-                output_file = "downloaded_video.mp4"
-                with open(output_file, "wb") as f:
-                    for chunk in response.iter_content(chunk_size=1024):
-                        if chunk:
-                            f.write(chunk)
-                print(f"Video downloaded successfully as {output_file}")
-            else:
-                print(f"Failed to download video. HTTP Status Code: {response.status_code}")
         else:
-            print("Failed to extract MP3 download link from the page.")
-        # Close the browser
-        await browser.close()
-        # Return the title and thumbnail for display
-        return title, thumbnail_url

+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.common.keys import Keys
+import requests
+import time
+def download_mp3_selenium(youtube_url):
+    # Set up the Selenium WebDriver
+    options = webdriver.ChromeOptions()
+    options.add_argument("--headless")
+    options.add_argument("--no-sandbox")
+    options.add_argument('--disable-dev-shm-usage')
+    driver = webdriver.Chrome(options=options)
+    # Open the YouTube video page
+    driver.get(youtube_url)
+    time.sleep(2)  # Wait for the page to load
+    # Scrape the title
+    title = driver.title  # This gives you the video title
+    # Scrape the thumbnail (YouTube page has a meta tag for the thumbnail)
+    thumbnail_meta = driver.find_element(By.XPATH, "//meta[@property='og:image']")
+    thumbnail_url = thumbnail_meta.get_attribute('content')
+    # Open the YouTube downloader site
+    driver.get("https://yt1d.com/en/")
+    time.sleep(2)  # Wait for the page to load
+    # Input the YouTube URL into the downloader
+    input_box = driver.find_element(By.ID, "txt-url")
+    input_box.send_keys(youtube_url)
+    input_box.send_keys(Keys.RETURN)
+    time.sleep(2)  # Wait for the download options to load
+    # Locate the MP3 download button (no explicit wait here; relies on the fixed sleep above)
+    mp3_download_button = driver.find_element(By.CSS_SELECTOR, "button[data-ftype='mp3']")
+    onclick_attr = mp3_download_button.get_attribute("onclick")
+    # Extract parameters from the JavaScript function call
+    params = onclick_attr.split("'")
+    if len(params) >= 7:
+        mp3_download_url = params[1]  # Extracted base download URL
+        # Wait for the JavaScript to modify the link
+        time.sleep(2)  # Allow time for the page to modify the link
+        # Get the final download URL after JavaScript modifications
+        final_link = driver.find_element(By.CSS_SELECTOR, "a[href*='googlevideo.com/videoplayback']")
+        mp3_download_url = final_link.get_attribute("href")
+        print(f"Final MP3 Download URL: {mp3_download_url}")
+        response = requests.get(mp3_download_url, stream=True)
+        # Check if the request was successful
+        if response.status_code == 200:
+            # Write the video content to a file
+            output_file = "downloaded_video.mp4"
+            with open(output_file, "wb") as f:
+                for chunk in response.iter_content(chunk_size=1024):
+                    if chunk:
+                        f.write(chunk)
+            print(f"Video downloaded successfully as {output_file}")
         else:
+            print(f"Failed to download video. HTTP Status Code: {response.status_code}")
+    else:
+        print("Failed to extract MP3 download link from the page.")
+    # Close the browser
+    driver.quit()
+    # Return the title and thumbnail for display
+    return title, thumbnail_url
+# Example usage:
+# youtube_url = "https://youtu.be/MAZyQ-38b8M?si=q0dai-wF6FQz6MGN"
+# title, thumbnail_url = download_mp3_selenium(youtube_url)
+# print(f"Title: {title}")
+# print(f"Thumbnail: {thumbnail_url}")

environment.yml CHANGED Viewed

@@ -1,18 +1,12 @@
-name: yt-whisper
 channels:
   - defaults
   - conda-forge
 dependencies:
   - python=3.9
   - pip
-  - numpy<2  # Pinning NumPy to a version below 2.0 to avoid compatibility issues
   - pip:
-      - gradio==3.39.0  # Downgrade Gradio to work with Pydantic v1
-      - pytube==15.0.0
-      - git+https://github.com/openai/whisper.git
-      - torch==2.0.1
-      # - yt-dlp
-      - pydantic==1.10  # Use Pydantic v1 to avoid the incompatibility
-      - BeautifulSoup4
-      - git+https://github.com/microsoft/playwright-python.git
-      - requests  # Used for handling HTTP requests during downloads

+name: yt-whisper-2
 channels:
   - defaults
   - conda-forge
 dependencies:
   - python=3.9
   - pip
   - pip:
+      - selenium
+      - requests
+      - gradio
+      - openai-whisper @ git+https://github.com/openai/whisper.git

requirements.txt CHANGED Viewed

@@ -1,7 +1,13 @@
-numpy<2  # Pinning NumPy to a version below 2.0
-gradio==3.39.0  # Downgrade Gradio to work with Pydantic v1
-pytube==15.0.0
-git+https://github.com/openai/whisper.git  # Install Whisper from GitHub
-torch==2.0.1
-yt-dlp
-pydantic==1.10  # Use Pydantic v1 to avoid the incompatibility

+# numpy<2  # Pinning NumPy to a version below 2.0
+# gradio==3.39.0  # Downgrade Gradio to work with Pydantic v1
+# pytube==15.0.0
+# git+https://github.com/openai/whisper.git  # Install Whisper from GitHub
+# torch==2.0.1
+# pydantic==1.10  # Use Pydantic v1 to avoid the incompatibility
+# git+https://github.com/microsoft/playwright-python.git
+# BeautifulSoup4
+# requests
+selenium
+requests
+gradio
+openai-whisper @ git+https://github.com/openai/whisper.git