danilotpnta committed on
Commit
afbb88c
·
1 Parent(s): c646c7f

fix: handle captcha detection

Browse files
Files changed (3) hide show
  1. app.py +17 -30
  2. download_video.py +79 -0
  3. environment.yml +4 -2
app.py CHANGED
@@ -1,35 +1,17 @@
1
  import whisper
2
  import gradio as gr
3
  import os
 
4
 
5
- import subprocess
6
- # Try to install yt-dlp if not available
7
- try:
8
- subprocess.check_call(["pip", "install", "yt-dlp"])
9
- except subprocess.CalledProcessError as e:
10
- print(f"Error installing yt-dlp: {e}")
11
-
12
- import yt_dlp
13
-
14
- # Function to download the audio and extract metadata from YouTube
15
- def download_video_info(url):
16
- ydl_opts = {
17
- 'format': 'bestaudio/best',
18
- 'outtmpl': 'audio.%(ext)s',
19
- 'postprocessors': [{
20
- 'key': 'FFmpegExtractAudio',
21
- 'preferredcodec': 'mp3',
22
- 'preferredquality': '192',
23
- }],
24
- }
25
 
 
 
26
  try:
27
- with yt_dlp.YoutubeDL(ydl_opts) as ydl:
28
- info = ydl.extract_info(url, download=False) # Extract video info
29
- title = info.get('title', 'Unknown Title')
30
- thumbnail_url = info.get('thumbnail', '')
31
- ydl.download([url]) # Download the audio
32
- audio_file = "audio.mp3"
33
  return audio_file, title, thumbnail_url
34
  except Exception as e:
35
  return None, None, str(e)
@@ -41,9 +23,9 @@ def transcribe_audio(audio_path, model_size="base"):
41
  return result['text']
42
 
43
  # Split logic: First fetch title and thumbnail, then transcribe
44
- def get_video_info_and_transcribe(youtube_url, model_size="base"):
45
  # Fetch title and thumbnail first
46
- audio_path, title, thumbnail_url = download_video_info(youtube_url)
47
 
48
  # If fetching video info fails
49
  if not audio_path or not os.path.exists(audio_path):
@@ -51,8 +33,13 @@ def get_video_info_and_transcribe(youtube_url, model_size="base"):
51
 
52
  # Show title and thumbnail to the user while the transcription is happening
53
  title_output = gr.update(value=title)
54
- thumbnail_output = gr.update(value=thumbnail_url)
55
-
 
 
 
 
 
56
  # Start transcription
57
  transcription = transcribe_audio(audio_path, model_size)
58
 
 
1
  import whisper
2
  import gradio as gr
3
  import os
4
+ import asyncio
5
 
6
+ # Import the async download function from the download_video module
7
+ from download_video import download_mp3_playwright
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
+ # Function to download the audio, title, and thumbnail from YouTube
10
async def download_video_info(url):
    """Download the media for *url* and report where it was saved.

    Returns ``(audio_file, title, thumbnail_url)`` on success. If the
    download raises, returns ``(None, None, <error text>)``; note that the
    error message then occupies the slot normally used for the thumbnail.
    """
    try:
        video_title, thumb = await download_mp3_playwright(url)
    except Exception as err:
        return None, None, str(err)

    # download_mp3_playwright always writes to this fixed file name.
    return "downloaded_video.mp4", video_title, thumb
 
23
  return result['text']
24
 
25
  # Split logic: First fetch title and thumbnail, then transcribe
26
+ async def get_video_info_and_transcribe(youtube_url, model_size="base"):
27
  # Fetch title and thumbnail first
28
+ audio_path, title, thumbnail_url = await download_video_info(youtube_url)
29
 
30
  # If fetching video info fails
31
  if not audio_path or not os.path.exists(audio_path):
 
33
 
34
  # Show title and thumbnail to the user while the transcription is happening
35
  title_output = gr.update(value=title)
36
+
37
+ # Show the thumbnail if available
38
+ if thumbnail_url:
39
+ thumbnail_output = gr.update(value=thumbnail_url)
40
+ else:
41
+ thumbnail_output = gr.update(visible=False) # Hide if no thumbnail
42
+
43
  # Start transcription
44
  transcription = transcribe_audio(audio_path, model_size)
45
 
download_video.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ from playwright.async_api import async_playwright
3
+ import requests
4
+
5
def _stream_to_file(url, output_file, timeout=30):
    """Stream *url* into *output_file* and return the HTTP status code.

    The file is only created when the server answers 200. ``timeout`` is
    passed to ``requests.get`` so a stalled server cannot hang the caller.
    """
    # The ``with`` block releases the connection even if writing fails.
    with requests.get(url, stream=True, timeout=timeout) as response:
        if response.status_code == 200:
            with open(output_file, "wb") as f:
                # 64 KiB chunks: far fewer write calls than 1 KiB for media files.
                for chunk in response.iter_content(chunk_size=64 * 1024):
                    if chunk:
                        f.write(chunk)
        return response.status_code


async def download_mp3_playwright(youtube_url):
    """Scrape title/thumbnail for *youtube_url* and download its media.

    A headless Chromium instance first opens the YouTube page to read the
    page title and the ``og:image`` meta tag, then drives the yt1d.com
    downloader to obtain a ``googlevideo.com/videoplayback`` link, which is
    saved as ``downloaded_video.mp4`` in the current directory.

    Download failures (no link found, non-200 response) are only printed;
    the function still returns normally. Callers should check that
    ``downloaded_video.mp4`` exists to know whether the download worked.
    Playwright or network exceptions propagate to the caller.

    Returns:
        ``(title, thumbnail_url)`` as scraped from the YouTube page.
    """
    import os

    output_file = "downloaded_video.mp4"

    # Remove any file left by a previous run so that a failed download
    # cannot be mistaken for a successful one by an existence check.
    try:
        os.remove(output_file)
    except FileNotFoundError:
        pass

    async with async_playwright() as p:
        # Launch browser in headless mode
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()

            # Open the YouTube video page
            await page.goto(youtube_url)

            # This is the browser page title; presumably it carries a
            # " - YouTube" suffix -- TODO confirm before displaying it raw.
            title = await page.title()

            # The thumbnail is exposed through the Open Graph image meta tag
            thumbnail_url = await page.get_attribute(
                'meta[property="og:image"]', 'content'
            )

            # Open the YouTube downloader site and submit the URL
            await page.goto("https://yt1d.com/en/")
            await page.fill("input#txt-url", youtube_url)
            await page.press("input#txt-url", "Enter")

            # Wait for the MP3 download button to appear
            await page.wait_for_selector("button[data-ftype='mp3']")
            download_button = await page.query_selector("button[data-ftype='mp3']")
            onclick_attr = await download_button.get_attribute("onclick")

            # The onclick handler is a JS call with quoted arguments; a
            # missing attribute is treated the same as a malformed one.
            params = onclick_attr.split("'") if onclick_attr else []
            if len(params) >= 7:
                # Wait for the page's JavaScript to inject the final link
                await page.wait_for_function(
                    """() => document.querySelector('a[href*="googlevideo.com/videoplayback"]')"""
                )
                final_link = await page.query_selector(
                    "a[href*='googlevideo.com/videoplayback']"
                )
                mp3_download_url = await final_link.get_attribute("href")
                print(f"Final MP3 Download URL: {mp3_download_url}")

                # requests is blocking: run it in a worker thread so the
                # event loop (and anything else scheduled on it) keeps running.
                loop = asyncio.get_running_loop()
                status_code = await loop.run_in_executor(
                    None, _stream_to_file, mp3_download_url, output_file
                )

                if status_code == 200:
                    print(f"Video downloaded successfully as {output_file}")
                else:
                    print(f"Failed to download video. HTTP Status Code: {status_code}")
            else:
                print("Failed to extract MP3 download link from the page.")
        finally:
            # Close the browser even when a step above raised
            await browser.close()

    # Return the title and thumbnail for display
    return title, thumbnail_url
71
+
72
+ # Call the async function
73
async def main():
    """Run a single sample download when the module is executed directly."""
    await download_mp3_playwright(
        "https://youtu.be/MAZyQ-38b8M?si=q0dai-wF6FQz6MGN"
    )


# Script entry point: drive the coroutine on a fresh event loop.
if __name__ == "__main__":
    asyncio.run(main())
environment.yml CHANGED
@@ -11,5 +11,7 @@ dependencies:
11
  - pytube==15.0.0
12
  - git+https://github.com/openai/whisper.git
13
  - torch==2.0.1
14
- - yt-dlp
15
- - pydantic==1.10 # Use Pydantic v1 to avoid the incompatibility
 
 
 
11
  - pytube==15.0.0
12
  - git+https://github.com/openai/whisper.git
13
  - torch==2.0.1
14
+ - pydantic==1.10 # Use Pydantic v1 to avoid incompatibility issues
15
+ - BeautifulSoup4
16
+ - playwright # Required for Playwright-based scraping and downloads
17
+ - requests # Used for handling HTTP requests during downloads