Spaces:

neonwatty
/

youtube_shorts_transcript_downloader

Running

App Files Files Community

neonwatty committed on Jun 13

Commit

5700536

•

1 Parent(s): fcfeb31

Upload 7 files

Browse files

Files changed (7) hide show

youtube_shorts_transcript_downloader/__init__.py +4 -0
youtube_shorts_transcript_downloader/__pycache__/__init__.cpython-310.pyc +0 -0
youtube_shorts_transcript_downloader/__pycache__/input_output.cpython-310.pyc +0 -0
youtube_shorts_transcript_downloader/__pycache__/transcripts.cpython-310.pyc +0 -0
youtube_shorts_transcript_downloader/app.py +62 -0
youtube_shorts_transcript_downloader/input_output.py +14 -0
youtube_shorts_transcript_downloader/transcripts.py +53 -0

youtube_shorts_transcript_downloader/__init__.py ADDED Viewed

	@@ -0,0 +1,4 @@

+import os
# Directory that contains this package's __init__.py (absolute path).
base_dir = os.path.split(os.path.abspath(__file__))[0]
# Directory one level above the package directory.
main_dir = os.path.split(base_dir)[0]

youtube_shorts_transcript_downloader/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (310 Bytes). View file

youtube_shorts_transcript_downloader/__pycache__/input_output.cpython-310.pyc ADDED Viewed

Binary file (746 Bytes). View file

youtube_shorts_transcript_downloader/__pycache__/transcripts.cpython-310.pyc ADDED Viewed

Binary file (1.82 kB). View file

youtube_shorts_transcript_downloader/app.py ADDED Viewed

	@@ -0,0 +1,62 @@

+import streamlit as st
+import pandas as pd
+from youtube_shorts_transcript_downloader.transcripts import get_batch_transcripts
# Page header and a one-line usage hint.
st.title("YT Shorts Transcript Downloader")
st.markdown(
    "instructions: upload a text file with valid youtube urls, one per line, to fetch transcripts"
)

# Bordered card holding the uploader and the action row.
base = st.container(border=True)

# Three columns weighted 3/20/5; only the wide middle column receives widgets.
x, col1, col2 = base.columns([3, 20, 5])

# File picker restricted to .txt uploads.
uploaded_file = col1.file_uploader("Choose a File", type=["txt"])

# Action row nested under the uploader. NOTE: this rebinds `col2`; from here
# on `col2` is the left cell of the action row, not the outer right column.
col2, col3, col4 = col1.columns([3, 2, 3])
trans_button_val = col2.button(label="fetch transcripts", type="primary")
empty_container = col3.container()
# Right-hand cell of the action row; the download button is rendered into `col4` later.
placeholder = col4.empty()
@st.cache_data
def convert_df(df: pd.DataFrame) -> bytes:
    """Serialize *df* to CSV and return the result as UTF-8 encoded bytes.

    The function is wrapped in ``st.cache_data`` so the conversion is cached
    instead of being recomputed on every Streamlit rerun.

    ``DataFrame.to_csv()`` is called with its default arguments, so the
    DataFrame index is written as the first column of the CSV.
    """
    return df.to_csv().encode("utf-8")
def button_logic(youtube_short_urls: list) -> None:
    """Fetch transcripts for the given urls and offer them as a CSV download.

    Does nothing unless the module-level ``trans_button_val`` is truthy
    (i.e. the "fetch transcripts" button was pressed on this run).

    Args:
        youtube_short_urls: candidate urls, passed straight to
            ``get_batch_transcripts``.

    Returns:
        None. On success a download button is rendered inside ``col4``; when
        no transcript could be fetched a warning is shown instead of offering
        an empty CSV.
    """
    if not trans_button_val:
        return

    batch_transcripts = get_batch_transcripts(youtube_short_urls)
    if not batch_transcripts:
        # get_batch_transcripts returns [] on failure or when nothing was
        # retrievable; an empty download would only confuse the user.
        st.warning("No transcripts could be fetched for the uploaded urls.")
        return

    csv_bytes = convert_df(pd.DataFrame(batch_transcripts))
    with col4:
        st.download_button(
            label="Download transcripts",
            data=csv_bytes,
            file_name="output.csv",
            mime="text/csv",
            disabled=False,
            type="primary",
        )
# Collect candidate urls from the uploaded text file (one url per line).
youtube_short_urls = []
if uploaded_file is not None:
    if uploaded_file.type == "text/plain":
        # getvalue() returns the whole upload regardless of the current stream
        # position; "utf-8-sig" also drops a leading BOM, which would
        # otherwise stay glued to the first url and make it fail validation.
        file_text = uploaded_file.getvalue().decode("utf-8-sig")
        # Strip surrounding whitespace and skip blank lines.
        youtube_short_urls = [
            line.strip() for line in file_text.splitlines() if line.strip()
        ]
    with st.spinner(text="transcript pull in progress..."):
        button_logic(youtube_short_urls)

youtube_shorts_transcript_downloader/input_output.py ADDED Viewed

	@@ -0,0 +1,14 @@

+import pandas as pd
def parse_input_file(input_file_path: str) -> list:
    """Read a text file and return its lines with surrounding whitespace removed.

    The file is decoded as UTF-8 explicitly (a leading byte-order mark is
    tolerated and dropped) rather than with the platform default encoding.

    Args:
        input_file_path: path of the text file, expected to hold one url per line.

    Returns:
        One stripped string per line of the file, in file order. Blank lines
        are kept as empty strings.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    with open(input_file_path, "r", encoding="utf-8-sig") as file:
        return [line.strip() for line in file]
def save_output(data: list, output_file_path: str) -> None:
    """Write *data* as a CSV file at *output_file_path*.

    The records are loaded into a pandas DataFrame and written without the
    index column.
    """
    pd.DataFrame(data).to_csv(output_file_path, index=False)

youtube_shorts_transcript_downloader/transcripts.py ADDED Viewed

	@@ -0,0 +1,53 @@

+import re
+from typing import List, Dict
+from youtube_transcript_api import YouTubeTranscriptApi
def is_valid_youtube_shorts_url(potential_url: str) -> bool:
    """Return True when *potential_url* is exactly a YouTube Shorts url.

    The accepted form is ``https://www.youtube.com/shorts/<id>`` where ``<id>``
    is 11 characters from ``[A-Za-z0-9_-]``. Nothing may follow the id: query
    strings, trailing slashes and trailing newlines are all rejected.

    Args:
        potential_url: the candidate string; non-string values yield False.
    """
    if not isinstance(potential_url, str):
        return False
    pattern = r"^https://www\.youtube\.com/shorts/[A-Za-z0-9_-]{11}$"  # youtube video ids are always 11 chars long
    # fullmatch, not match: with match, "$" also succeeds just before a
    # trailing "\n", which would then leak into the extracted video id.
    return re.fullmatch(pattern, potential_url) is not None
def get_single_transcript(youtube_url: str) -> dict:
    """Fetch the transcript for one YouTube Shorts url.

    Returns a dict with the keys ``youtube_url``, ``video_id`` and
    ``transcript`` on success. If the url fails validation, or anything raises
    while fetching, a FAILURE message is printed and an empty dict is returned.
    """
    try:
        if not is_valid_youtube_shorts_url(youtube_url):
            print(f"FAILURE: youtube_url is not valid - {youtube_url}")
            return {}

        # The video id is the last path segment of the url.
        video_id = youtube_url.rsplit("/", 1)[-1]
        return {
            "youtube_url": youtube_url,
            "video_id": video_id,
            "transcript": YouTubeTranscriptApi.get_transcript(video_id),
        }
    except Exception as e:
        print(
            f"FAILURE: transcript pull for youtube_url - {youtube_url} - failed with exception {e}"
        )
        return {}
def get_batch_transcripts(youtube_urls: List[str]) -> List[Dict]:
    """Fetch English transcripts for every valid Shorts url in *youtube_urls*.

    Urls that fail ``is_valid_youtube_shorts_url`` are dropped. Videos whose
    transcript cannot be retrieved are reported and skipped, so one bad video
    no longer discards the results for the rest of the batch.

    Args:
        youtube_urls: candidate urls.

    Returns:
        One dict per successfully fetched video, in input order, with the keys
        ``youtube_url``, ``video_id`` and ``transcript``. An empty list when
        no url is valid or the batch call itself fails.
    """
    # Keep each url together with its video id (the last path segment).
    valid_pairs = [
        (url, url.split("/")[-1])
        for url in youtube_urls
        if is_valid_youtube_shorts_url(url)
    ]
    if not valid_pairs:
        # Nothing to fetch; skip the API call.
        return []

    try:
        # get_transcripts returns (transcripts_by_video_id, unretrievable_ids).
        # continue_after_error=True collects failures in the second element
        # instead of raising on the first one.
        video_transcripts, unretrievable_vids = YouTubeTranscriptApi.get_transcripts(
            [vid for _, vid in valid_pairs],
            languages=["en"],
            continue_after_error=True,
        )
    except Exception as e:
        print(f"FAILURE: batch transcription fetch failed with exception {e}")
        return []

    for vid in unretrievable_vids:
        print(f"FAILURE: transcript pull for video_id - {vid} - failed")

    return [
        {
            "youtube_url": url,
            "video_id": vid,
            "transcript": video_transcripts[vid],
        }
        for url, vid in valid_pairs
        if vid in video_transcripts
    ]