neonwatty committed on
Commit
fcfeb31
·
verified ·
1 Parent(s): 1a8e416

Delete youtube_shorts_downloader

Browse files
youtube_shorts_downloader/__init__.py DELETED
@@ -1,4 +0,0 @@
1
- import os
2
-
3
# Absolute path of the directory that contains this file.
base_dir = os.path.split(os.path.abspath(__file__))[0]
# Parent directory of ``base_dir``.
main_dir = os.path.split(base_dir)[0]
 
 
 
 
 
youtube_shorts_downloader/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (310 Bytes)
 
youtube_shorts_downloader/__pycache__/input_output.cpython-310.pyc DELETED
Binary file (746 Bytes)
 
youtube_shorts_downloader/__pycache__/transcripts.cpython-310.pyc DELETED
Binary file (1.82 kB)
 
youtube_shorts_downloader/app.py DELETED
@@ -1,65 +0,0 @@
1
- import streamlit as st
2
- import io
3
- import pandas as pd
4
- from youtube_shorts_downloader import main_dir
5
- from youtube_shorts_downloader.transcripts import get_batch_transcripts
6
- from youtube_shorts_downloader.input_output import parse_input_file, save_output
7
-
8
-
9
# Page header and usage instructions.
st.title("YT Shorts Transcript Downloader")
st.markdown(
    "instructions: upload a text file with valid youtube urls, one per line, to fetch transcripts"
)


# Bordered panel holding the uploader and the action row.
# NOTE(review): the nesting below is reconstructed from a flattened view of
# the file - confirm that both column rows belong inside ``base``.
base = st.container(border=True)
with base:
    # First row: the file uploader sits in the wide middle column.
    x, col1, col2 = st.columns([3, 20, 5])
    with col1:
        uploaded_file = st.file_uploader("Choose a File", type=["txt"])

    # Second row: ``col2`` is rebound here; ``col4`` is reused later to host
    # the download button.
    col2, col3, col4 = st.columns([3, 2, 3])
    with col2:
        trans_button_val = st.button(label="fetch transcripts", type="primary")
    with col3:
        empty_container = st.container()
    with col4:
        placeholder = st.empty()
27
-
28
-
29
@st.cache_data
def convert_df(df: pd.DataFrame) -> bytes:
    """Serialize *df* to CSV and return it as UTF-8 encoded bytes.

    The function is wrapped in ``st.cache_data`` so the conversion is not
    recomputed on every rerun.

    Args:
        df: The frame to serialize. ``DataFrame.to_csv`` is called with its
            defaults, so the index is written as the first column.

    Returns:
        The CSV text encoded as UTF-8 bytes.
    """
    return df.to_csv().encode("utf-8")
33
-
34
-
35
def button_logic(youtube_short_urls: list) -> None:
    """Fetch transcripts and offer them as a CSV download.

    Does nothing unless the module-level ``trans_button_val`` is truthy.
    Otherwise the transcripts returned by ``get_batch_transcripts`` are put
    into a DataFrame, converted with ``convert_df`` and exposed through a
    download button rendered inside ``col4``.

    Args:
        youtube_short_urls: URLs passed through to ``get_batch_transcripts``.
    """
    if not trans_button_val:
        return

    transcripts_frame = pd.DataFrame(get_batch_transcripts(youtube_short_urls))
    csv_payload = convert_df(transcripts_frame)

    with col4:
        st.download_button(
            label="Download transcripts",
            data=csv_payload,
            file_name="output.csv",
            mime="text/csv",
            disabled=False,
            type="primary",
        )
50
-
51
-
52
# default_file_path = main_dir + "/data/input/test_input.txt"
# Collect one stripped entry per line of the uploaded plain-text file; the
# list stays empty when nothing (or a non "text/plain" file) was uploaded.
youtube_short_urls = []
if uploaded_file is not None and uploaded_file.type == "text/plain":
    from io import StringIO

    upload_text = uploaded_file.read().decode("utf-8")
    stringio = StringIO(upload_text)
    youtube_short_urls.extend(line.strip() for line in stringio)
# else:
#     youtube_short_urls = parse_input_file(default_file_path)

# Show a spinner while the button handler runs.
with st.spinner(text="transcript pull in progress..."):
    button_logic(youtube_short_urls)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
youtube_shorts_downloader/input_output.py DELETED
@@ -1,14 +0,0 @@
1
- import pandas as pd
2
-
3
-
4
def parse_input_file(input_file_path: str) -> list:
    """Read a text file of URLs, one per line.

    The file is decoded explicitly as UTF-8 so the result does not depend on
    the platform's default encoding. Each line is stripped of surrounding
    whitespace, and lines that are empty after stripping are skipped so they
    do not show up as empty-string entries.

    Args:
        input_file_path: Path of the text file to read.

    Returns:
        The non-empty, stripped lines in file order.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(input_file_path, "r", encoding="utf-8") as file:
        stripped_lines = (line.strip() for line in file)
        return [url for url in stripped_lines if url]
10
-
11
-
12
def save_output(data: list, output_file_path: str) -> None:
    """Write *data* to ``output_file_path`` as CSV without the index column.

    Args:
        data: Records used to build a ``pandas.DataFrame``.
        output_file_path: Destination path of the CSV file.
    """
    pd.DataFrame(data).to_csv(output_file_path, index=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
youtube_shorts_downloader/transcripts.py DELETED
@@ -1,53 +0,0 @@
1
- import re
2
- from typing import List, Dict
3
- from youtube_transcript_api import YouTubeTranscriptApi
4
-
5
-
6
def is_valid_youtube_shorts_url(potential_url: str) -> bool:
    """Return True if *potential_url* is exactly a YouTube Shorts URL.

    The accepted form is ``https://www.youtube.com/shorts/<id>`` where
    ``<id>`` is 11 characters drawn from letters, digits, ``_`` and ``-``.

    Args:
        potential_url: The string to check.

    Returns:
        True when the whole string matches the Shorts URL form, else False.
    """
    pattern = r"^https://www\.youtube\.com/shorts/[A-Za-z0-9_-]{11}$"  # youtube video ids are always 11 chars long
    # fullmatch (rather than match) is required: with match, ``$`` also
    # matches just before a trailing newline, so "...<id>\n" would be accepted.
    return re.fullmatch(pattern, potential_url) is not None
9
-
10
-
11
def get_single_transcript(youtube_url: str) -> dict:
    """Fetch the transcript for a single YouTube Shorts URL.

    Args:
        youtube_url: URL to validate with ``is_valid_youtube_shorts_url``.

    Returns:
        A dict with the keys ``youtube_url``, ``video_id`` and ``transcript``
        on success. An empty dict is returned, after a ``FAILURE`` message is
        printed, when the URL is not valid or any exception is raised.
    """
    try:
        if not is_valid_youtube_shorts_url(youtube_url):
            print(f"FAILURE: youtube_url is not valid - {youtube_url}")
            return {}

        # The video id is the last path segment of the URL.
        shorts_id = youtube_url.split("/")[-1]
        fetched_transcript = YouTubeTranscriptApi.get_transcript(shorts_id)
        return {
            "youtube_url": youtube_url,
            "video_id": shorts_id,
            "transcript": fetched_transcript,
        }
    except Exception as e:
        print(
            f"FAILURE: transcript pull for youtube_url - {youtube_url} - failed with exception {e}"
        )
        return {}
29
-
30
-
31
def get_batch_transcripts(youtube_urls: List[str]) -> List[Dict]:
    """Fetch English transcripts for every valid Shorts URL in *youtube_urls*.

    URLs rejected by ``is_valid_youtube_shorts_url`` are ignored. The
    remaining video ids are requested in one
    ``YouTubeTranscriptApi.get_transcripts`` call.

    Args:
        youtube_urls: Candidate URLs.

    Returns:
        One dict per valid URL, in input order, with the keys ``youtube_url``,
        ``video_id`` and ``transcript``. An empty list is returned, after a
        ``FAILURE`` message is printed, if fetching or assembling the entries
        raises any exception.
    """
    valid_urls = [url for url in youtube_urls if is_valid_youtube_shorts_url(url)]
    # The video id is the last path segment of each URL.
    valid_vids = [url.split("/")[-1] for url in valid_urls]
    try:
        # Element 0 of the result is looked up by video id below.
        transcripts_by_id = YouTubeTranscriptApi.get_transcripts(
            valid_vids, languages=["en"]
        )[0]
        return [
            {
                "youtube_url": url,
                "video_id": vid,
                "transcript": transcripts_by_id[vid],
            }
            for url, vid in zip(valid_urls, valid_vids)
        ]
    except Exception as e:
        print(f"FAILURE: batch transcription fetch failed with exception {e}")
        return []