neonwatty committed on
Commit
1a8e416
·
verified ·
1 Parent(s): e411bc6

Upload 11 files

Browse files
data/.DS_Store ADDED
Binary file (6.15 kB). View file
 
data/input/test_input.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ https://www.youtube.com/shorts/xaRRZKgj5aQ
2
+ https://www.youtube.com/shorts/xK9_V9LF4PE
data/output/test_output.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ youtube_url,video_id,transcript
2
+ https://www.youtube.com/shorts/xaRRZKgj5aQ,xaRRZKgj5aQ,"[{'text': 'what happens if you pull a shot of', 'start': 0.08, 'duration': 2.839}, {'text': 'espresso through an orange I thought', 'start': 1.24, 'duration': 3.119}, {'text': 'this could potentially taste pretty good', 'start': 2.919, 'duration': 2.801}, {'text': ""so let's try it out and see if it even"", 'start': 4.359, 'duration': 3.04}, {'text': 'works so I started off by grabbing a', 'start': 5.72, 'duration': 3.16}, {'text': 'small orange and cutting off a small', 'start': 7.399, 'duration': 3.001}, {'text': 'slice and then I measured out a pretty', 'start': 8.88, 'duration': 3.48}, {'text': 'typical 18 G dose of coffee but I', 'start': 10.4, 'duration': 3.279}, {'text': 'grounded a little bit coarser than', 'start': 12.36, 'duration': 2.8}, {'text': 'normal I then did all my pretty normal', 'start': 13.679, 'duration': 3.001}, {'text': 'Puck prep with the only exception being', 'start': 15.16, 'duration': 2.92}, {'text': 'I put an orange at the bottom of the', 'start': 16.68, 'duration': 3.28}, {'text': 'Porta filter now to be honest I was a', 'start': 18.08, 'duration': 3.039}, {'text': 'little worried that this orange was', 'start': 19.96, 'duration': 2.6}, {'text': 'going to completely block the flow of', 'start': 21.119, 'duration': 2.721}, {'text': 'espresso and if you look at what', 'start': 22.56, 'duration': 2.639}, {'text': 'happened to this first shot you can see', 'start': 23.84, 'duration': 2.96}, {'text': ""that's exactly what happened so I had a"", 'start': 25.199, 'duration': 3.281}, {'text': 'different idea I added a paper filter to', 'start': 26.8, 'duration': 2.92}, {'text': 'the bottom of the basket which should', 'start': 28.48, 'duration': 2.68}, {'text': 'allow the espresso to flow through a', 'start': 29.72, 'duration': 2.92}, {'text': ""little bit easier so let's see how the"", 'start': 31.16, 'duration': 2.96}, {'text': 'shot looks and more importantly see how', 'start': 
32.64, 'duration': 3.439}, {'text': ""it tastes it's definitely not the"", 'start': 34.12, 'duration': 3.2}, {'text': ""prettiest looking shot but we can't"", 'start': 36.079, 'duration': 2.64}, {'text': 'dismiss it just on that but how does', 'start': 37.32, 'duration': 3.079}, {'text': 'this espresso', 'start': 38.719, 'duration': 3.761}, {'text': ""taste I'm kind of getting chocolate"", 'start': 40.399, 'duration': 5.281}, {'text': 'covered orange vibes', 'start': 42.48, 'duration': 3.2}]"
3
+ https://www.youtube.com/shorts/xK9_V9LF4PE,xK9_V9LF4PE,"[{'text': 'here are snacks I packed for my flight', 'start': 0.199, 'duration': 4.241}, {'text': 'part 19 yesterday I packed snacks for', 'start': 1.959, 'duration': 4.44}, {'text': ""Doug and today I'm packing snacks for"", 'start': 4.44, 'duration': 3.439}, {'text': ""myself since we're going on separate"", 'start': 6.399, 'duration': 3.841}, {'text': ""trips I'll be departing around lunchtime"", 'start': 7.879, 'duration': 3.961}, {'text': 'so I wanted to make a little meal with', 'start': 10.24, 'duration': 3.8}, {'text': 'protein but also enough to fill me up', 'start': 11.84, 'duration': 3.52}, {'text': ""there's a character on my desk and I"", 'start': 14.04, 'duration': 3.28}, {'text': 'heard him say tuna Mayo so I decided to', 'start': 15.36, 'duration': 4.04}, {'text': ""make tuna mayo musui and I'm using the"", 'start': 17.32, 'duration': 3.4}, {'text': ""musui M that we're currently in"", 'start': 19.4, 'duration': 3.2}, {'text': ""production on we're launching very soon"", 'start': 20.72, 'duration': 4.16}, {'text': 'go to Janelle eats.com musubi for', 'start': 22.6, 'duration': 5.28}, {'text': 'updates I added some moabi QB mayo and', 'start': 24.88, 'duration': 5.12}, {'text': 'pickled daon before wrapping it in rice', 'start': 27.88, 'duration': 4.28}, {'text': ""and Nori for extra snacks I've been on a"", 'start': 30.0, 'duration': 4.719}, {'text': ""golden kiwi kick I'm peeling and slicing"", 'start': 32.16, 'duration': 4.84}, {'text': 'it for easier eating on the plane then', 'start': 34.719, 'duration': 3.801}, {'text': ""I'm packing the rest of the Rainer"", 'start': 37.0, 'duration': 3.239}, {'text': ""cherries so they don't go bad while"", 'start': 38.52, 'duration': 3.8}, {'text': ""we're gone of course at least one"", 'start': 40.239, 'duration': 4.121}, {'text': ""protein bar and I'm bringing this"", 'start': 42.32, 'duration': 4.919}, {'text': 'amazing Sichuan pepper chocolate 
brittle', 'start': 44.36, 'duration': 4.92}, {'text': ""so my family can try it I'm bringing two"", 'start': 47.239, 'duration': 4.0}, {'text': 'musu bees so that my sister can have the', 'start': 49.28, 'duration': 3.52}, {'text': ""other one just in case she didn't pack"", 'start': 51.239, 'duration': 6.241}, {'text': 'any snacks she turns to do that', 'start': 52.8, 'duration': 4.68}]"
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ pandas
2
+ youtube-transcript-api
3
+ streamlit
youtube_shorts_downloader/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ import os
2
+
3
# Absolute path of the directory that contains this package's __init__.py.
base_dir = os.path.split(os.path.abspath(__file__))[0]

# Parent directory of the package directory.
main_dir = os.path.split(base_dir)[0]
youtube_shorts_downloader/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (310 Bytes). View file
 
youtube_shorts_downloader/__pycache__/input_output.cpython-310.pyc ADDED
Binary file (746 Bytes). View file
 
youtube_shorts_downloader/__pycache__/transcripts.cpython-310.pyc ADDED
Binary file (1.82 kB). View file
 
youtube_shorts_downloader/app.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import io
3
+ import pandas as pd
4
+ from youtube_shorts_downloader import main_dir
5
+ from youtube_shorts_downloader.transcripts import get_batch_transcripts
6
+ from youtube_shorts_downloader.input_output import parse_input_file, save_output
7
+
8
+
9
# Page header and usage instructions.
st.title("YT Shorts Transcript Downloader")
st.markdown(
    "instructions: upload a text file with valid youtube urls, one per line, to fetch transcripts"
)


# NOTE(review): the nesting below is reconstructed on the assumption that both
# column rows live inside the bordered container - confirm against the app.
base = st.container(border=True)
with base:
    # Upload row: the outer columns are never written to; they only act as
    # spacers around the uploader, so they get throwaway names instead of
    # rebinding `col2` twice.
    _left_pad, col1, _right_pad = st.columns([3, 20, 5])
    with col1:
        # Only .txt uploads are accepted by the widget.
        uploaded_file = st.file_uploader("Choose a File", type=["txt"])

    # Action row: fetch button on the left, download button (rendered later
    # by button_logic) in col4.
    col2, col3, col4 = st.columns([3, 2, 3])
    with col2:
        trans_button_val = st.button(label="fetch transcripts", type="primary")
    with col3:
        empty_container = st.container()
    with col4:
        placeholder = st.empty()
27
+
28
+
29
@st.cache_data
def convert_df(df: pd.DataFrame) -> bytes:
    """Serialize *df* to CSV and return it as UTF-8 encoded bytes.

    The function is wrapped in ``st.cache_data`` so the conversion is cached
    rather than recomputed on every rerun of the script.

    Args:
        df: The frame to serialize.

    Returns:
        The CSV text encoded as UTF-8 bytes.
    """
    # index=False keeps the download consistent with save_output() and with
    # the sample CSV, whose header is exactly youtube_url,video_id,transcript.
    return df.to_csv(index=False).encode("utf-8")
33
+
34
+
35
def button_logic(youtube_short_urls: list) -> None:
    """Fetch transcripts and expose them as a CSV download.

    Does nothing unless the module-level ``trans_button_val`` is truthy
    (i.e. the "fetch transcripts" button was pressed on this run).

    Args:
        youtube_short_urls: Candidate URLs; they are passed unchanged to
            ``get_batch_transcripts``.
    """
    if not trans_button_val:
        return

    batch_transcripts = get_batch_transcripts(youtube_short_urls)
    if not batch_transcripts:
        # Nothing to download (no file uploaded, no valid URL, or the fetch
        # failed): tell the user instead of offering an empty CSV.
        st.warning("no transcripts could be fetched - check the uploaded urls")
        return

    csv_bytes = convert_df(pd.DataFrame(batch_transcripts))

    # The download button is placed in the right-hand column of the action row.
    with col4:
        st.download_button(
            label="Download transcripts",
            data=csv_bytes,
            file_name="output.csv",
            mime="text/csv",
            disabled=False,
            type="primary",
        )
50
+
51
+
52
# Collect the URLs from the uploaded file; the list stays empty when nothing
# (or something other than plain text) has been uploaded.
youtube_short_urls = []
if uploaded_file is not None and uploaded_file.type == "text/plain":
    # "utf-8-sig" decodes plain UTF-8 as well, but also drops a leading
    # byte-order mark that would otherwise stick to the first URL and make
    # it fail validation.
    uploaded_text = uploaded_file.read().decode("utf-8-sig")
    # One URL per line; surrounding whitespace and blank lines are dropped.
    youtube_short_urls = [
        line.strip() for line in uploaded_text.splitlines() if line.strip()
    ]

with st.spinner(text="transcript pull in progress..."):
    button_logic(youtube_short_urls)
youtube_shorts_downloader/input_output.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
+
4
def parse_input_file(input_file_path: str) -> list:
    """Read a text file of URLs, one per line.

    Args:
        input_file_path: Path of the text file to read.

    Returns:
        The lines of the file with surrounding whitespace removed, in file
        order. Blank lines are skipped.

    Raises:
        OSError: If the file cannot be opened.
    """
    # An explicit encoding avoids depending on the platform default;
    # "utf-8-sig" also strips a byte-order mark if the file starts with one.
    with open(input_file_path, "r", encoding="utf-8-sig") as file:
        stripped_lines = (line.strip() for line in file)
        return [url for url in stripped_lines if url]
10
+
11
+
12
def save_output(data: list, output_file_path: str) -> None:
    """Write *data* to *output_file_path* as CSV, without the index column.

    Args:
        data: Records to write; they are handed to ``pd.DataFrame`` as-is.
        output_file_path: Destination path of the CSV file.
    """
    pd.DataFrame(data).to_csv(output_file_path, index=False)
youtube_shorts_downloader/transcripts.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from typing import List, Dict
3
+ from youtube_transcript_api import YouTubeTranscriptApi
4
+
5
+
6
# youtube video ids are always 11 chars long
_SHORTS_URL_PATTERN = re.compile(
    r"https://www\.youtube\.com/shorts/[A-Za-z0-9_-]{11}"
)


def is_valid_youtube_shorts_url(potential_url: str) -> bool:
    """Return True if *potential_url* is exactly a YouTube Shorts URL.

    The accepted form is ``https://www.youtube.com/shorts/<id>`` where
    ``<id>`` is 11 characters from ``[A-Za-z0-9_-]``.

    Args:
        potential_url: The string to check.

    Returns:
        True when the whole string matches the Shorts URL form, else False.
    """
    # fullmatch instead of match + "$": "$" also matches just before a
    # trailing newline, which would let "...<id>\n" through and put the
    # newline into the video id derived from the URL.
    return _SHORTS_URL_PATTERN.fullmatch(potential_url) is not None
9
+
10
+
11
def get_single_transcript(youtube_url: str) -> dict:
    """Fetch the transcript for one YouTube Shorts URL.

    Args:
        youtube_url: URL to validate and fetch the transcript for.

    Returns:
        A dict with the keys ``youtube_url``, ``video_id`` and ``transcript``
        on success. An empty dict when the URL is not a valid Shorts URL or
        when anything raises; in both cases a FAILURE line is printed.
    """
    try:
        # Guard clause: reject anything that is not a Shorts URL up front.
        if not is_valid_youtube_shorts_url(youtube_url):
            print(f"FAILURE: youtube_url is not valid - {youtube_url}")
            return {}

        # The video id is the last path segment of the URL.
        shorts_id = youtube_url.split("/")[-1]
        fetched = YouTubeTranscriptApi.get_transcript(shorts_id)
        return {
            "youtube_url": youtube_url,
            "video_id": shorts_id,
            "transcript": fetched,
        }
    except Exception as e:
        print(
            f"FAILURE: transcript pull for youtube_url - {youtube_url} - failed with exception {e}"
        )
        return {}
29
+
30
+
31
def get_batch_transcripts(youtube_urls: List[str]) -> List[Dict]:
    """Fetch English transcripts for every valid Shorts URL in *youtube_urls*.

    URLs that are not valid Shorts URLs are ignored. Videos whose transcript
    cannot be retrieved are reported on stdout and left out of the result,
    so one bad video no longer discards the whole batch.

    Args:
        youtube_urls: Candidate URLs, in any order.

    Returns:
        One dict per successfully fetched video, in input order, with the
        keys ``youtube_url``, ``video_id`` and ``transcript``. An empty list
        when there is no valid URL or the batch call itself raises.
    """
    # (url, video_id) pairs; the id is the last path segment of the URL.
    valid_pairs = [
        (url, url.split("/")[-1])
        for url in youtube_urls
        if is_valid_youtube_shorts_url(url)
    ]
    if not valid_pairs:
        # Nothing to fetch - skip the API call entirely.
        return []

    try:
        # get_transcripts returns (transcripts keyed by video id, ids that
        # could not be retrieved). continue_after_error=True makes it collect
        # failures in the second element instead of raising on the first one.
        transcripts_by_id, failed_ids = YouTubeTranscriptApi.get_transcripts(
            [video_id for _, video_id in valid_pairs],
            languages=["en"],
            continue_after_error=True,
        )
    except Exception as e:
        print(f"FAILURE: batch transcription fetch failed with exception {e}")
        return []

    for video_id in failed_ids:
        print(f"FAILURE: transcript pull for video_id - {video_id} - failed")

    return [
        {
            "youtube_url": url,
            "video_id": video_id,
            "transcript": transcripts_by_id[video_id],
        }
        for url, video_id in valid_pairs
        if video_id in transcripts_by_id
    ]