Spaces:
Runtime error
Runtime error
Delete youtube_shorts_downloader
Browse files
- youtube_shorts_downloader/__init__.py +0 -4
- youtube_shorts_downloader/__pycache__/__init__.cpython-310.pyc +0 -0
- youtube_shorts_downloader/__pycache__/input_output.cpython-310.pyc +0 -0
- youtube_shorts_downloader/__pycache__/transcripts.cpython-310.pyc +0 -0
- youtube_shorts_downloader/app.py +0 -65
- youtube_shorts_downloader/input_output.py +0 -14
- youtube_shorts_downloader/transcripts.py +0 -53
youtube_shorts_downloader/__init__.py
DELETED
@@ -1,4 +0,0 @@
|
|
1 |
-
import os
|
2 |
-
|
3 |
-
# Absolute path of the directory that contains this file (the package directory).
base_dir = os.path.split(os.path.abspath(__file__))[0]
# Parent directory of the package directory.
main_dir = os.path.split(base_dir)[0]
|
|
|
|
|
|
|
|
|
|
youtube_shorts_downloader/__pycache__/__init__.cpython-310.pyc
DELETED
Binary file (310 Bytes)
|
|
youtube_shorts_downloader/__pycache__/input_output.cpython-310.pyc
DELETED
Binary file (746 Bytes)
|
|
youtube_shorts_downloader/__pycache__/transcripts.cpython-310.pyc
DELETED
Binary file (1.82 kB)
|
|
youtube_shorts_downloader/app.py
DELETED
@@ -1,65 +0,0 @@
|
|
1 |
-
import streamlit as st
|
2 |
-
import io
|
3 |
-
import pandas as pd
|
4 |
-
from youtube_shorts_downloader import main_dir
|
5 |
-
from youtube_shorts_downloader.transcripts import get_batch_transcripts
|
6 |
-
from youtube_shorts_downloader.input_output import parse_input_file, save_output
|
7 |
-
|
8 |
-
|
9 |
-
st.title("YT Shorts Transcript Downloader")
|
10 |
-
st.markdown(
|
11 |
-
"instructions: upload a text file with valid youtube urls, one per line, to fetch transcripts"
|
12 |
-
)
|
13 |
-
|
14 |
-
|
15 |
-
base = st.container(border=True)
|
16 |
-
with base:
|
17 |
-
x, col1, col2 = st.columns([3, 20, 5])
|
18 |
-
with col1:
|
19 |
-
uploaded_file = st.file_uploader("Choose a File", type=["txt"])
|
20 |
-
col2, col3, col4 = st.columns([3, 2, 3])
|
21 |
-
with col2:
|
22 |
-
trans_button_val = st.button(label="fetch transcripts", type="primary")
|
23 |
-
with col3:
|
24 |
-
empty_container = st.container()
|
25 |
-
with col4:
|
26 |
-
placeholder = st.empty()
|
27 |
-
|
28 |
-
|
29 |
-
@st.cache_data
def convert_df(df: pd.DataFrame) -> bytes:
    """Serialize *df* to CSV and return it as UTF-8 encoded bytes.

    The DataFrame index is written as the first column (``to_csv`` default).
    """
    # IMPORTANT: Cache the conversion to prevent computation on every rerun
    return df.to_csv().encode("utf-8")
|
33 |
-
|
34 |
-
|
35 |
-
def button_logic(youtube_short_urls: list) -> None:
    """Fetch transcripts for the given URLs and offer them as a CSV download.

    Does nothing unless the module-level ``trans_button_val`` is truthy,
    i.e. the "fetch transcripts" button was clicked on this run. The
    download button is rendered inside the module-level ``col4`` column.

    Args:
        youtube_short_urls: candidate URLs; validation happens inside
            ``get_batch_transcripts``.
    """
    if not trans_button_val:
        return

    batch_transcripts = get_batch_transcripts(youtube_short_urls)
    if not batch_transcripts:
        # get_batch_transcripts returns [] when no URL was valid or the fetch
        # failed; tell the user instead of offering an empty CSV.
        st.warning(
            "no transcripts could be fetched - upload a file with valid youtube shorts urls"
        )
        return

    csv_bytes = convert_df(pd.DataFrame(batch_transcripts))

    with col4:
        st.download_button(
            label="Download transcripts",
            data=csv_bytes,
            file_name="output.csv",
            mime="text/csv",
            disabled=False,
            type="primary",
        )
|
50 |
-
|
51 |
-
|
52 |
-
# default_file_path = main_dir + "/data/input/test_input.txt"
youtube_short_urls = []
if uploaded_file is not None and uploaded_file.type == "text/plain":
    # getvalue() returns the full upload regardless of the current stream
    # position (read() yields nothing once the buffer has been consumed), and
    # "utf-8-sig" drops a leading byte-order mark that would otherwise be glued
    # to the first URL and make it fail validation.
    uploaded_text = uploaded_file.getvalue().decode("utf-8-sig")
    for raw_line in uploaded_text.splitlines():
        url = raw_line.strip()
        if url:  # blank lines are not URLs
            youtube_short_urls.append(url)
# else:
#    youtube_short_urls = parse_input_file(default_file_path)

# Show a spinner while the (possibly slow) transcript fetch runs.
with st.spinner(text="transcript pull in progress..."):
    button_logic(youtube_short_urls)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
youtube_shorts_downloader/input_output.py
DELETED
@@ -1,14 +0,0 @@
|
|
1 |
-
import pandas as pd
|
2 |
-
|
3 |
-
|
4 |
-
def parse_input_file(input_file_path: str) -> list:
    """Read candidate URLs from a text file, one per line.

    Each line is stripped of surrounding whitespace; lines that are empty
    after stripping are skipped. The file is decoded as UTF-8 and a leading
    byte-order mark, if present, is discarded.

    Args:
        input_file_path: path of the text file to read.

    Returns:
        The non-empty stripped lines, in file order.

    Raises:
        OSError: if the file cannot be opened.
    """
    # Explicit encoding: the platform default is not UTF-8 everywhere, and
    # "utf-8-sig" keeps a BOM from being prepended to the first URL.
    with open(input_file_path, "r", encoding="utf-8-sig") as file:
        stripped_lines = (line.strip() for line in file)
        return [url for url in stripped_lines if url]
|
10 |
-
|
11 |
-
|
12 |
-
def save_output(data: list, output_file_path: str) -> None:
    """Write *data* to *output_file_path* as CSV, without the index column.

    *data* is passed to ``pandas.DataFrame`` unchanged.
    """
    pd.DataFrame(data).to_csv(output_file_path, index=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
youtube_shorts_downloader/transcripts.py
DELETED
@@ -1,53 +0,0 @@
|
|
1 |
-
import re
|
2 |
-
from typing import List, Dict
|
3 |
-
from youtube_transcript_api import YouTubeTranscriptApi
|
4 |
-
|
5 |
-
|
6 |
-
def is_valid_youtube_shorts_url(potential_url: str) -> bool:
    """Return True if *potential_url* is exactly a YouTube Shorts URL.

    The accepted form is ``https://www.youtube.com/shorts/<id>`` where
    ``<id>`` is 11 characters from ``[A-Za-z0-9_-]``. Nothing may precede or
    follow it: no query string, no surrounding whitespace, no newline.
    """
    # youtube video ids are always 11 chars long
    pattern = r"https://www\.youtube\.com/shorts/[A-Za-z0-9_-]{11}"
    # fullmatch rather than match + "$": "$" also matches just before a
    # trailing newline, which would let "...<id>\n" through and put the
    # newline into the video id extracted by callers.
    return re.fullmatch(pattern, potential_url) is not None
|
9 |
-
|
10 |
-
|
11 |
-
def get_single_transcript(youtube_url: str) -> dict:
    """Fetch the transcript for a single YouTube Shorts URL.

    Returns a dict with the keys ``youtube_url``, ``video_id`` and
    ``transcript`` on success. Returns an empty dict, after printing a
    FAILURE message, when the URL does not pass
    ``is_valid_youtube_shorts_url`` or when any exception is raised.
    """
    try:
        if not is_valid_youtube_shorts_url(youtube_url):
            print(f"FAILURE: youtube_url is not valid - {youtube_url}")
            return {}

        # The video id is the last path segment of the URL.
        video_id = youtube_url.rsplit("/", 1)[-1]
        return {
            "youtube_url": youtube_url,
            "video_id": video_id,
            "transcript": YouTubeTranscriptApi.get_transcript(video_id),
        }
    except Exception as e:
        # Best-effort: report the failure and hand back an empty result.
        print(
            f"FAILURE: transcript pull for youtube_url - {youtube_url} - failed with exception {e}"
        )
        return {}
|
29 |
-
|
30 |
-
|
31 |
-
def get_batch_transcripts(youtube_urls: List[str]) -> List[Dict]:
    """Fetch English transcripts for every valid Shorts URL in *youtube_urls*.

    URLs rejected by ``is_valid_youtube_shorts_url`` are ignored. Videos
    whose transcript cannot be retrieved are reported and skipped, so one
    bad video no longer discards the whole batch.

    Args:
        youtube_urls: candidate URLs.

    Returns:
        One dict per retrieved transcript, in input order, with the keys
        ``youtube_url``, ``video_id`` and ``transcript``. An empty list when
        no URL is valid or the batch request itself raises.
    """
    # (url, video_id) pairs; the id is the last path segment of the URL.
    valid_pairs = [
        (url, url.split("/")[-1])
        for url in youtube_urls
        if is_valid_youtube_shorts_url(url)
    ]
    if not valid_pairs:
        # Nothing to fetch: skip the API call entirely.
        return []

    try:
        # get_transcripts returns (transcripts keyed by video id,
        # list of unretrievable video ids). continue_after_error=True makes
        # it collect failures in that list instead of raising on the first.
        video_transcripts, unretrievable = YouTubeTranscriptApi.get_transcripts(
            [video_id for _, video_id in valid_pairs],
            languages=["en"],
            continue_after_error=True,
        )
    except Exception as e:
        print(f"FAILURE: batch transcription fetch failed with exception {e}")
        return []

    for video_id in unretrievable:
        print(f"FAILURE: transcript pull for video_id - {video_id} - failed")

    return [
        {
            "youtube_url": url,
            "video_id": video_id,
            "transcript": video_transcripts[video_id],
        }
        for url, video_id in valid_pairs
        if video_id in video_transcripts
    ]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|