Spaces:
Runtime error
Runtime error
danilotpnta
committed on
Commit
·
afbb88c
1
Parent(s):
c646c7f
fix: handle captcha detection
Browse files- app.py +17 -30
- download_video.py +79 -0
- environment.yml +4 -2
app.py
CHANGED
@@ -1,35 +1,17 @@
|
|
1 |
import whisper
|
2 |
import gradio as gr
|
3 |
import os
|
|
|
4 |
|
5 |
-
|
6 |
-
|
7 |
-
try:
|
8 |
-
subprocess.check_call(["pip", "install", "yt-dlp"])
|
9 |
-
except subprocess.CalledProcessError as e:
|
10 |
-
print(f"Error installing yt-dlp: {e}")
|
11 |
-
|
12 |
-
import yt_dlp
|
13 |
-
|
14 |
-
# Function to download the audio and extract metadata from YouTube
|
15 |
-
def download_video_info(url):
|
16 |
-
ydl_opts = {
|
17 |
-
'format': 'bestaudio/best',
|
18 |
-
'outtmpl': 'audio.%(ext)s',
|
19 |
-
'postprocessors': [{
|
20 |
-
'key': 'FFmpegExtractAudio',
|
21 |
-
'preferredcodec': 'mp3',
|
22 |
-
'preferredquality': '192',
|
23 |
-
}],
|
24 |
-
}
|
25 |
|
|
|
|
|
26 |
try:
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
thumbnail_url = info.get('thumbnail', '')
|
31 |
-
ydl.download([url]) # Download the audio
|
32 |
-
audio_file = "audio.mp3"
|
33 |
return audio_file, title, thumbnail_url
|
34 |
except Exception as e:
|
35 |
return None, None, str(e)
|
@@ -41,9 +23,9 @@ def transcribe_audio(audio_path, model_size="base"):
|
|
41 |
return result['text']
|
42 |
|
43 |
# Split logic: First fetch title and thumbnail, then transcribe
|
44 |
-
def get_video_info_and_transcribe(youtube_url, model_size="base"):
|
45 |
# Fetch title and thumbnail first
|
46 |
-
audio_path, title, thumbnail_url = download_video_info(youtube_url)
|
47 |
|
48 |
# If fetching video info fails
|
49 |
if not audio_path or not os.path.exists(audio_path):
|
@@ -51,8 +33,13 @@ def get_video_info_and_transcribe(youtube_url, model_size="base"):
|
|
51 |
|
52 |
# Show title and thumbnail to the user while the transcription is happening
|
53 |
title_output = gr.update(value=title)
|
54 |
-
|
55 |
-
|
|
|
|
|
|
|
|
|
|
|
56 |
# Start transcription
|
57 |
transcription = transcribe_audio(audio_path, model_size)
|
58 |
|
|
|
1 |
import whisper
|
2 |
import gradio as gr
|
3 |
import os
|
4 |
+
import asyncio
|
5 |
|
6 |
+
# Import the async download function from your download script
|
7 |
+
from download_video import download_mp3_playwright
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
|
9 |
+
# Function to download the audio, title, and thumbnail from YouTube
|
10 |
+
async def download_video_info(url):
    """Download the media for ``url`` and report where it was saved.

    Delegates the actual work to ``download_mp3_playwright`` and pairs the
    title and thumbnail it returns with the fixed output path.

    Returns:
        ``(audio_file, title, thumbnail_url)`` on success. If anything
        raises, ``(None, None, message)`` where ``message`` is the exception
        text (note that it occupies the thumbnail slot).
    """
    try:
        # Unpacking stays inside the try so a malformed return value is
        # reported the same way as a download failure.
        video_title, thumb = await download_mp3_playwright(url)
    except Exception as err:
        return None, None, str(err)
    return "downloaded_video.mp4", video_title, thumb
|
|
|
23 |
return result['text']
|
24 |
|
25 |
# Split logic: First fetch title and thumbnail, then transcribe
|
26 |
+
async def get_video_info_and_transcribe(youtube_url, model_size="base"):
|
27 |
# Fetch title and thumbnail first
|
28 |
+
audio_path, title, thumbnail_url = await download_video_info(youtube_url)
|
29 |
|
30 |
# If fetching video info fails
|
31 |
if not audio_path or not os.path.exists(audio_path):
|
|
|
33 |
|
34 |
# Show title and thumbnail to the user while the transcription is happening
|
35 |
title_output = gr.update(value=title)
|
36 |
+
|
37 |
+
# Show the thumbnail if available
|
38 |
+
if thumbnail_url:
|
39 |
+
thumbnail_output = gr.update(value=thumbnail_url)
|
40 |
+
else:
|
41 |
+
thumbnail_output = gr.update(visible=False) # Hide if no thumbnail
|
42 |
+
|
43 |
# Start transcription
|
44 |
transcription = transcribe_audio(audio_path, model_size)
|
45 |
|
download_video.py
ADDED
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import asyncio
|
2 |
+
from playwright.async_api import async_playwright
|
3 |
+
import requests
|
4 |
+
|
5 |
+
def _save_stream(download_url, output_file):
    """Stream ``download_url`` into ``output_file`` and return the HTTP status.

    Blocking helper intended to run in a worker thread. The payload is written
    to a ``.part`` file first and only renamed onto ``output_file`` once it is
    complete, so an interrupted transfer never leaves a truncated result.
    Nothing is written unless the status code is 200.
    """
    import os

    # The timeout keeps a stalled server from hanging the caller forever; the
    # context manager releases the connection even if writing fails.
    with requests.get(download_url, stream=True, timeout=30) as response:
        if response.status_code != 200:
            return response.status_code

        partial_file = f"{output_file}.part"
        try:
            with open(partial_file, "wb") as f:
                for chunk in response.iter_content(chunk_size=64 * 1024):
                    if chunk:
                        f.write(chunk)
            os.replace(partial_file, output_file)
        finally:
            # Only still present when the transfer did not finish.
            if os.path.exists(partial_file):
                os.remove(partial_file)
        return response.status_code


async def download_mp3_playwright(youtube_url):
    """Scrape a video's title/thumbnail and download its media via yt1d.com.

    Opens ``youtube_url`` in headless Chromium to read the page title and the
    ``og:image`` meta tag, then submits the URL to the yt1d.com downloader,
    waits for the generated ``googlevideo.com/videoplayback`` link and streams
    it to ``downloaded_video.mp4`` in the current working directory.

    Download problems (no link found, non-200 response) are reported with
    ``print`` and do not raise; in that case no ``downloaded_video.mp4`` is
    left on disk, so callers can use the file's existence as the success
    signal. Playwright and network errors still propagate.

    Args:
        youtube_url: URL of the YouTube video to fetch.

    Returns:
        ``(title, thumbnail_url)`` as scraped from the video page;
        ``thumbnail_url`` is ``None`` when the meta tag is absent.
    """
    import os

    output_file = "downloaded_video.mp4"

    # Remove any result from a previous call so a failed download cannot be
    # mistaken for a fresh one by callers that only check the file exists.
    if os.path.exists(output_file):
        os.remove(output_file)

    async with async_playwright() as p:
        # Launch browser in headless mode
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()

            # Open the YouTube video page and scrape title + thumbnail.
            await page.goto(youtube_url)
            title = await page.title()
            thumbnail_url = await page.get_attribute(
                'meta[property="og:image"]', 'content'
            )

            # Open the downloader site and submit the video URL.
            await page.goto("https://yt1d.com/en/")
            await page.fill("input#txt-url", youtube_url)
            await page.press("input#txt-url", "Enter")

            # wait_for_selector resolves to the matched element, so no second
            # lookup (which could race with page updates) is needed.
            download_button = await page.wait_for_selector(
                "button[data-ftype='mp3']"
            )
            onclick_attr = None
            if download_button is not None:
                onclick_attr = await download_button.get_attribute("onclick")

            # The onclick handler is a JS call with quoted arguments; at least
            # seven quote-separated pieces are expected for a usable button.
            params = onclick_attr.split("'") if onclick_attr else []

            mp3_download_url = None
            if len(params) >= 7:
                # Wait for the page's JavaScript to inject the final link.
                await page.wait_for_function(
                    """() => document.querySelector('a[href*="googlevideo.com/videoplayback"]')"""
                )
                final_link = await page.query_selector(
                    "a[href*='googlevideo.com/videoplayback']"
                )
                if final_link is not None:
                    mp3_download_url = await final_link.get_attribute("href")

            if not mp3_download_url:
                print("Failed to extract MP3 download link from the page.")
            else:
                print(f"Final MP3 Download URL: {mp3_download_url}")
                # requests is blocking; run it in a worker thread so the event
                # loop (and the UI driving it) stays responsive.
                loop = asyncio.get_running_loop()
                status_code = await loop.run_in_executor(
                    None, _save_stream, mp3_download_url, output_file
                )
                if status_code == 200:
                    print(f"Video downloaded successfully as {output_file}")
                else:
                    print(f"Failed to download video. HTTP Status Code: {status_code}")
        finally:
            # Close the browser even when a step above raised.
            await browser.close()

    # Return the title and thumbnail for display
    return title, thumbnail_url
|
71 |
+
|
72 |
+
# Call the async function
|
73 |
+
async def main():
    """Run a one-off download of a hard-coded sample video."""
    await download_mp3_playwright(
        "https://youtu.be/MAZyQ-38b8M?si=q0dai-wF6FQz6MGN"
    )


# Script entry point: drive main() to completion on a fresh event loop.
if __name__ == "__main__":
    asyncio.run(main())
|
environment.yml
CHANGED
@@ -11,5 +11,7 @@ dependencies:
|
|
11 |
- pytube==15.0.0
|
12 |
- git+https://github.com/openai/whisper.git
|
13 |
- torch==2.0.1
|
14 |
-
-
|
15 |
-
-
|
|
|
|
|
|
11 |
- pytube==15.0.0
|
12 |
- git+https://github.com/openai/whisper.git
|
13 |
- torch==2.0.1
|
14 |
+
- pydantic==1.10 # Use Pydantic v1 to avoid incompatibility issues
|
15 |
+
- BeautifulSoup4
|
16 |
+
- playwright # Required for Playwright-based scraping and downloads
|
17 |
+
- requests # Used for handling HTTP requests during downloads
|