danilotpnta committed on
Commit
f4549cb
·
1 Parent(s): 035bc5f

update to selenium

Browse files
Files changed (4) hide show
  1. app.py +10 -35
  2. download_video.py +77 -65
  3. environment.yml +5 -11
  4. requirements.txt +13 -7
app.py CHANGED
@@ -1,46 +1,21 @@
1
  import whisper
2
  import gradio as gr
3
  import os
4
- import asyncio
5
- import subprocess
6
 
7
- # Ensure Playwright is installed and browsers are downloaded
8
- def install_playwright():
9
- try:
10
- # Install Playwright via pip if not already installed
11
- subprocess.run(["pip", "install", "playwright"], check=True)
12
-
13
- # Install the Playwright browsers
14
- subprocess.run(["playwright", "install"], check=True)
15
-
16
- print("Playwright and browsers installed.")
17
- except subprocess.CalledProcessError as e:
18
- print(f"Error during Playwright setup: {e}")
19
- exit(1)
20
 
21
- # Call the function to install Playwright
22
- install_playwright()
23
 
24
- from download_video import download_mp3_playwright
25
-
26
- # Function to convert MP4 to MP3 using FFmpeg
27
- def convert_to_mp3(input_file, output_file):
28
- command = ["ffmpeg", "-i", input_file, "-q:a", "0", "-map", "a", output_file]
29
- try:
30
- subprocess.run(command, check=True)
31
- except subprocess.CalledProcessError as e:
32
- print(f"Error converting {input_file} to {output_file}: {e}")
33
 
34
  # Function to download the audio, title, and thumbnail from YouTube
35
- async def download_video_info(url):
36
  try:
37
- # Call the async function to download video and get title and thumbnail
38
- title, thumbnail_url = await download_mp3_playwright(url)
39
- audio_file = "downloaded_video.mp4" # Path to the downloaded audio
40
 
41
- # Convert MP4 to MP3 before passing to Whisper
42
- convert_to_mp3(audio_file, "downloaded_audio.mp3")
43
- return "downloaded_audio.mp3", title, thumbnail_url
44
  except Exception as e:
45
  return None, None, str(e)
46
 
@@ -51,9 +26,9 @@ def transcribe_audio(audio_path, model_size="base"):
51
  return result['text']
52
 
53
  # Split logic: First fetch title and thumbnail, then transcribe
54
- async def get_video_info_and_transcribe(youtube_url, model_size="base"):
55
  # Fetch title and thumbnail first
56
- audio_path, title, thumbnail_url = await download_video_info(youtube_url)
57
 
58
  # If fetching video info fails
59
  if not audio_path or not os.path.exists(audio_path):
 
1
  import whisper
2
  import gradio as gr
3
  import os
 
 
4
 
5
+ import warnings
6
+ warnings.filterwarnings("ignore", category=FutureWarning, module="torch")
 
 
 
 
 
 
 
 
 
 
 
7
 
 
 
8
 
9
+ from download_video import download_mp3_selenium # Selenium-based downloader defined in download_video.py
 
 
 
 
 
 
 
 
10
 
11
  # Function to download the audio, title, and thumbnail from YouTube
12
def download_video_info(url):
    """Download the media for *url* and report where it was saved.

    Delegates to ``download_mp3_selenium`` for the scraping and download.

    Returns:
        A 3-tuple ``(audio_path, title, thumbnail_url)``. ``audio_path`` is
        always the fixed name ``"downloaded_video.mp4"`` when the helper
        returns normally. If the helper raises, the tuple is
        ``(None, None, <error message>)`` -- note the message occupies the
        thumbnail slot.
    """
    try:
        video_title, thumb = download_mp3_selenium(url)
    except Exception as err:
        # Callers expect a tuple rather than an exception on failure.
        return None, None, str(err)

    # The helper's return value does not include a path; this fixed name is
    # the location this function reports for the downloaded MP4.
    return "downloaded_video.mp4", video_title, thumb
21
 
 
26
  return result['text']
27
 
28
  # Split logic: First fetch title and thumbnail, then transcribe
29
+ def get_video_info_and_transcribe(youtube_url, model_size="base"):
30
  # Fetch title and thumbnail first
31
+ audio_path, title, thumbnail_url = download_video_info(youtube_url)
32
 
33
  # If fetching video info fails
34
  if not audio_path or not os.path.exists(audio_path):
download_video.py CHANGED
@@ -1,68 +1,80 @@
1
- from playwright.async_api import async_playwright
2
-
3
- async def download_mp3_playwright(youtube_url):
4
- async with async_playwright() as p:
5
- # Launch browser in headless mode
6
- browser = await p.chromium.launch(headless=True, args=['--no-sandbox'])
7
- page = await browser.new_page()
8
-
9
- # Open the YouTube video page
10
- await page.goto(youtube_url)
11
-
12
- # Scrape the title
13
- title = await page.title() # This gives you the video title
14
-
15
- # Scrape the thumbnail (YouTube page has a meta tag for the thumbnail)
16
- thumbnail_url = await page.get_attribute('meta[property="og:image"]', 'content')
17
-
18
- # Open the YouTube downloader site
19
- await page.goto("https://yt1d.com/en/")
20
-
21
- # Input the YouTube URL into the downloader
22
- await page.fill("input#txt-url", youtube_url)
23
- await page.press("input#txt-url", "Enter") # Simulate pressing enter
24
-
25
- # Wait for the MP3 download button to appear
26
- await page.wait_for_selector("button[data-ftype='mp3']")
27
-
28
- # Extract the download URL for the MP3
29
- download_button = await page.query_selector("button[data-ftype='mp3']")
30
- onclick_attr = await download_button.get_attribute("onclick")
31
-
32
- # Extract parameters from the JavaScript function call
33
- params = onclick_attr.split("'")
34
- if len(params) >= 7:
35
- mp3_download_url = params[1] # Extracted base download URL
36
-
37
- # Wait for the JavaScript to modify the link
38
- await page.wait_for_function(
39
- """() => document.querySelector('a[href*="googlevideo.com/videoplayback"]')"""
40
- )
41
-
42
- # Get the final download URL after JavaScript modifications
43
- final_link = await page.query_selector("a[href*='googlevideo.com/videoplayback']")
44
- mp3_download_url = await final_link.get_attribute("href")
45
- print(f"Final MP3 Download URL: {mp3_download_url}")
46
-
47
- response = requests.get(mp3_download_url, stream=True)
48
-
49
- # Check if the request was successful
50
- if response.status_code == 200:
51
- # Write the video content to a file
52
- output_file = "downloaded_video.mp4"
53
- with open(output_file, "wb") as f:
54
- for chunk in response.iter_content(chunk_size=1024):
55
- if chunk:
56
- f.write(chunk)
57
- print(f"Video downloaded successfully as {output_file}")
58
- else:
59
- print(f"Failed to download video. HTTP Status Code: {response.status_code}")
60
-
 
 
 
61
  else:
62
- print("Failed to extract MP3 download link from the page.")
 
 
 
 
 
 
63
 
64
- # Close the browser
65
- await browser.close()
66
 
67
- # Return the title and thumbnail for display
68
- return title, thumbnail_url
 
 
 
 
1
+ from selenium import webdriver
2
+ from selenium.webdriver.common.by import By
3
+ from selenium.webdriver.common.keys import Keys
4
+ import requests
5
+ import time
6
+
7
def download_mp3_selenium(youtube_url, *, timeout=20):
    """Scrape a YouTube page's title/thumbnail and download its media via yt1d.com.

    A headless Chrome session opens ``youtube_url`` to read the page title and
    the ``og:image`` thumbnail, then submits the same URL to
    ``https://yt1d.com/en/`` and looks for the resulting
    ``googlevideo.com/videoplayback`` link. If a link is found, its content is
    streamed to ``downloaded_video.mp4`` in the current working directory.

    Args:
        youtube_url: URL of the YouTube video page.
        timeout: Seconds to wait for each page element, and the
            connect/read timeout passed to ``requests`` for the download.

    Returns:
        ``(title, thumbnail_url)`` as read from the YouTube page.

    Raises:
        selenium.common.exceptions.TimeoutException: an expected element did
            not appear within ``timeout`` seconds.
        requests.RequestException: the download request failed at the
            transport level.

    A missing download link or a non-200 HTTP status is only printed, not
    raised. In that case ``downloaded_video.mp4`` will not exist afterwards.
    """
    # Imported locally so this function does not depend on changes to the
    # module's top-level import section.
    import os
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    output_file = "downloaded_video.mp4"

    # Remove any file left over from a previous run so that a failed download
    # cannot be mistaken for a successful one by callers that only check for
    # the file's existence.
    if os.path.exists(output_file):
        os.remove(output_file)

    # Set up the Selenium WebDriver
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(options=options)

    mp3_download_url = None
    try:
        wait = WebDriverWait(driver, timeout)

        # Open the YouTube video page and scrape title + thumbnail meta tag
        driver.get(youtube_url)
        thumbnail_meta = wait.until(
            EC.presence_of_element_located(
                (By.XPATH, "//meta[@property='og:image']")
            )
        )
        title = driver.title
        thumbnail_url = thumbnail_meta.get_attribute("content")

        # Open the downloader site and submit the YouTube URL
        driver.get("https://yt1d.com/en/")
        input_box = wait.until(EC.element_to_be_clickable((By.ID, "txt-url")))
        input_box.send_keys(youtube_url)
        input_box.send_keys(Keys.RETURN)

        # Wait for the MP3 download button to appear
        mp3_download_button = wait.until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, "button[data-ftype='mp3']")
            )
        )
        onclick_attr = mp3_download_button.get_attribute("onclick")

        # The onclick handler is expected to carry several quoted parameters;
        # a missing attribute is treated the same as a malformed one.
        params = (onclick_attr or "").split("'")
        if len(params) >= 7:
            # Wait for the page's JavaScript to insert the final link
            final_link = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, "a[href*='googlevideo.com/videoplayback']")
                )
            )
            mp3_download_url = final_link.get_attribute("href")
    finally:
        # Always close the browser, even if a lookup above raised.
        driver.quit()

    if mp3_download_url:
        print(f"Final MP3 Download URL: {mp3_download_url}")
        _stream_to_file(mp3_download_url, output_file, timeout)
    else:
        print("Failed to extract MP3 download link from the page.")

    # Return the title and thumbnail for display
    return title, thumbnail_url


def _stream_to_file(url, output_file, timeout):
    """Stream *url* into *output_file*; print (do not raise) on a non-200 status."""
    with requests.get(url, stream=True, timeout=timeout) as response:
        if response.status_code != 200:
            print(f"Failed to download video. HTTP Status Code: {response.status_code}")
            return
        with open(output_file, "wb") as f:
            for chunk in response.iter_content(chunk_size=64 * 1024):
                if chunk:
                    f.write(chunk)
    print(f"Video downloaded successfully as {output_file}")
75
 
76
+ # Example usage:
77
+ # youtube_url = "https://youtu.be/MAZyQ-38b8M?si=q0dai-wF6FQz6MGN"
78
+ # title, thumbnail_url = download_mp3_selenium(youtube_url)
79
+ # print(f"Title: {title}")
80
+ # print(f"Thumbnail: {thumbnail_url}")
environment.yml CHANGED
@@ -1,18 +1,12 @@
1
- name: yt-whisper
2
  channels:
3
  - defaults
4
  - conda-forge
5
  dependencies:
6
  - python=3.9
7
  - pip
8
- - numpy<2 # Pinning NumPy to a version below 2.0 to avoid compatibility issues
9
  - pip:
10
- - gradio==3.39.0 # Downgrade Gradio to work with Pydantic v1
11
- - pytube==15.0.0
12
- - git+https://github.com/openai/whisper.git
13
- - torch==2.0.1
14
- # - yt-dlp
15
- - pydantic==1.10 # Use Pydantic v1 to avoid the incompatibility
16
- - BeautifulSoup4
17
- - git+https://github.com/microsoft/playwright-python.git
18
- - requests # Used for handling HTTP requests during downloads
 
1
+ name: yt-whisper-2
2
  channels:
3
  - defaults
4
  - conda-forge
5
  dependencies:
6
  - python=3.9
7
  - pip
 
8
  - pip:
9
+ - selenium
10
+ - requests
11
+ - gradio
12
+ - openai-whisper @ git+https://github.com/openai/whisper.git
 
 
 
 
 
requirements.txt CHANGED
@@ -1,7 +1,13 @@
1
- numpy<2 # Pinning NumPy to a version below 2.0
2
- gradio==3.39.0 # Downgrade Gradio to work with Pydantic v1
3
- pytube==15.0.0
4
- git+https://github.com/openai/whisper.git # Install Whisper from GitHub
5
- torch==2.0.1
6
- yt-dlp
7
- pydantic==1.10 # Use Pydantic v1 to avoid the incompatibility
 
 
 
 
 
 
 
1
+ # numpy<2 # Pinning NumPy to a version below 2.0
2
+ # gradio==3.39.0 # Downgrade Gradio to work with Pydantic v1
3
+ # pytube==15.0.0
4
+ # git+https://github.com/openai/whisper.git # Install Whisper from GitHub
5
+ # torch==2.0.1
6
+ # pydantic==1.10 # Use Pydantic v1 to avoid the incompatibility
7
+ # git+https://github.com/microsoft/playwright-python.git
8
+ # BeautifulSoup4
9
+ # requests
10
+ selenium
11
+ requests
12
+ gradio
13
+ openai-whisper @ git+https://github.com/openai/whisper.git