neonwatty committed on
Commit
5700536
1 Parent(s): fcfeb31

Upload 7 files

Browse files
youtube_shorts_transcript_downloader/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ import os
2
+
3
# Absolute path of the directory that contains this file.
base_dir = os.path.split(os.path.abspath(__file__))[0]
# Parent directory of ``base_dir``.
main_dir = os.path.split(base_dir)[0]
youtube_shorts_transcript_downloader/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (310 Bytes). View file
 
youtube_shorts_transcript_downloader/__pycache__/input_output.cpython-310.pyc ADDED
Binary file (746 Bytes). View file
 
youtube_shorts_transcript_downloader/__pycache__/transcripts.cpython-310.pyc ADDED
Binary file (1.82 kB). View file
 
youtube_shorts_transcript_downloader/app.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ from youtube_shorts_transcript_downloader.transcripts import get_batch_transcripts
4
+
5
+
6
# Page header and usage hint.
st.title("YT Shorts Transcript Downloader")
st.markdown("instructions: upload a text file with valid youtube urls, one per line, to fetch transcripts")


# Bordered panel that holds the uploader row and the action row.
base = st.container(border=True)
with base:
    # Row 1: the uploader sits in the wide middle column; the outer two
    # columns are never written to.
    _pad_left, col1, _pad_right = st.columns([3, 20, 5])
    with col1:
        uploaded_file = st.file_uploader("Choose a File", type=["txt"])

    # Row 2: fetch button, a container, and an empty slot. ``col4`` is reused
    # later by ``button_logic`` to place the download button.
    col2, col3, col4 = st.columns([3, 2, 3])
    with col2:
        trans_button_val = st.button(label="fetch transcripts", type="primary")
    with col3:
        empty_container = st.container()
    with col4:
        placeholder = st.empty()
24
+
25
+
26
@st.cache_data
def convert_df(df: pd.DataFrame) -> bytes:
    """Serialize a DataFrame to UTF-8 encoded CSV bytes.

    Args:
        df: The table to serialize.

    Returns:
        The CSV text produced by ``df.to_csv()`` (index column included),
        encoded as UTF-8 bytes.
    """
    # IMPORTANT: Cache the conversion to prevent computation on every rerun
    return df.to_csv().encode("utf-8")
30
+
31
+
32
def button_logic(youtube_short_urls: list) -> None:
    """Fetch transcripts and show a CSV download button when fetch was clicked.

    Does nothing unless the module-level ``trans_button_val`` (the value of
    the "fetch transcripts" button) is truthy.

    Args:
        youtube_short_urls: URLs passed on to ``get_batch_transcripts``.
    """
    if not trans_button_val:
        return

    transcript_rows = get_batch_transcripts(youtube_short_urls)
    csv_payload = convert_df(pd.DataFrame(transcript_rows))

    # Render the download button inside the third column of the action row.
    with col4:
        st.download_button(
            label="Download transcripts",
            data=csv_payload,
            file_name="output.csv",
            mime="text/csv",
            disabled=False,
            type="primary",
        )
47
+
48
+
49
# default_file_path = main_dir + "/data/input/test_input.txt"
youtube_short_urls = []
if uploaded_file is not None:
    if uploaded_file.type == "text/plain":
        # "utf-8-sig" reads plain UTF-8 and also drops a leading byte-order
        # mark; with plain "utf-8" the BOM stays glued to the first URL, which
        # then fails the "^https://" validation and is silently skipped.
        uploaded_text = uploaded_file.read().decode("utf-8-sig")
        # One candidate URL per line, surrounding whitespace removed.
        youtube_short_urls = [line.strip() for line in uploaded_text.splitlines()]
# else:
#     youtube_short_urls = parse_input_file(default_file_path)

# Show a spinner while the (possibly slow) transcript fetch runs.
with st.spinner(text="transcript pull in progress..."):
    button_logic(youtube_short_urls)
youtube_shorts_transcript_downloader/input_output.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
+
4
def parse_input_file(input_file_path: str) -> list:
    """Read a text file and return its lines with surrounding whitespace removed.

    Args:
        input_file_path: Path of the text file to read, one URL per line.

    Returns:
        One string per line of the file, in file order. Blank lines are kept
        as empty strings.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Read as UTF-8 explicitly instead of the platform default encoding;
    # "utf-8-sig" additionally drops a leading byte-order mark so it does not
    # end up glued to the first URL.
    with open(input_file_path, "r", encoding="utf-8-sig") as file:
        return [line.strip() for line in file]
10
+
11
+
12
def save_output(data: list, output_file_path: str) -> None:
    """Write ``data`` to ``output_file_path`` as CSV without the index column.

    Args:
        data: Records used to build a ``pandas.DataFrame``.
        output_file_path: Destination path of the CSV file.
    """
    pd.DataFrame(data).to_csv(output_file_path, index=False)
youtube_shorts_transcript_downloader/transcripts.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from typing import List, Dict
3
+ from youtube_transcript_api import YouTubeTranscriptApi
4
+
5
+
6
def is_valid_youtube_shorts_url(potential_url: str) -> bool:
    """Return True if ``potential_url`` is exactly a YouTube Shorts URL.

    The accepted form is ``https://www.youtube.com/shorts/<id>`` where
    ``<id>`` is 11 characters drawn from letters, digits, ``_`` and ``-``.

    Args:
        potential_url: Candidate URL string.

    Returns:
        True when the whole string matches the Shorts URL form, otherwise
        False. Non-string input is reported as invalid rather than raising.
    """
    if not isinstance(potential_url, str):
        return False
    # youtube video ids are always 11 chars long
    pattern = r"https://www\.youtube\.com/shorts/[A-Za-z0-9_-]{11}"
    # fullmatch instead of match + "$": "$" also matches just before a
    # trailing newline, which would let "...<id>\n" through and leak the
    # newline into the extracted video id.
    return re.fullmatch(pattern, potential_url) is not None
9
+
10
+
11
def get_single_transcript(youtube_url: str) -> dict:
    """Fetch the transcript for one YouTube Shorts URL.

    Args:
        youtube_url: URL to validate and fetch a transcript for.

    Returns:
        A dict with keys ``youtube_url``, ``video_id`` and ``transcript`` on
        success. An empty dict when the URL is not a valid Shorts URL or when
        any exception is raised; the failure is printed in both cases.
    """
    try:
        # Validation stays inside the try so that an error raised while
        # validating is reported the same way as a fetch failure.
        if not is_valid_youtube_shorts_url(youtube_url):
            print(f"FAILURE: youtube_url is not valid - {youtube_url}")
            return {}

        # The video id is the last path segment of the URL.
        video_id = youtube_url.rsplit("/", 1)[-1]
        return {
            "youtube_url": youtube_url,
            "video_id": video_id,
            "transcript": YouTubeTranscriptApi.get_transcript(video_id),
        }
    except Exception as exc:
        print(
            f"FAILURE: transcript pull for youtube_url - {youtube_url} - failed with exception {exc}"
        )
        return {}
29
+
30
+
31
def get_batch_transcripts(youtube_urls: List[str]) -> List[Dict]:
    """Fetch English transcripts for every valid Shorts URL in ``youtube_urls``.

    URLs that fail ``is_valid_youtube_shorts_url`` are ignored. Videos whose
    transcript cannot be retrieved are reported and skipped, so one bad video
    no longer discards the results of the whole batch.

    Args:
        youtube_urls: Candidate URLs, one string per entry.

    Returns:
        One dict per retrieved transcript, in input order, with keys
        ``youtube_url``, ``video_id`` and ``transcript``. An empty list when
        no URL is valid or the batch request itself raises.
    """
    valid_urls = [url for url in youtube_urls if is_valid_youtube_shorts_url(url)]
    if not valid_urls:
        # Nothing to fetch; skip the API call entirely.
        return []

    # The video id is the last path segment of a validated Shorts URL.
    valid_vids = [url.split("/")[-1] for url in valid_urls]

    try:
        # With continue_after_error=True the library collects the ids it could
        # not fetch instead of raising on the first one. The call returns
        # (transcripts keyed by video id, list of unretrievable ids).
        video_transcripts, unretrievable = YouTubeTranscriptApi.get_transcripts(
            valid_vids, languages=["en"], continue_after_error=True
        )
    except Exception as e:
        print(f"FAILURE: batch transcription fetch failed with exception {e}")
        return []

    for vid in unretrievable:
        print(f"FAILURE: transcript pull for video_id - {vid} - failed")

    return [
        {
            "youtube_url": url,
            "video_id": vid,
            "transcript": video_transcripts[vid],
        }
        for url, vid in zip(valid_urls, valid_vids)
        if vid in video_transcripts
    ]