Spaces:

neonwatty
/

youtube_shorts_transcript_downloader

Runtime error

App Files Community

neonwatty committed on Jun 13, 2024

Commit

fcfeb31

verified ·

1 Parent(s): 1a8e416

Delete youtube_shorts_downloader

Browse files

Files changed (7) hide show

youtube_shorts_downloader/__init__.py +0 -4
youtube_shorts_downloader/__pycache__/__init__.cpython-310.pyc +0 -0
youtube_shorts_downloader/__pycache__/input_output.cpython-310.pyc +0 -0
youtube_shorts_downloader/__pycache__/transcripts.cpython-310.pyc +0 -0
youtube_shorts_downloader/app.py +0 -65
youtube_shorts_downloader/input_output.py +0 -14
youtube_shorts_downloader/transcripts.py +0 -53

youtube_shorts_downloader/__init__.py DELETED Viewed

@@ -1,4 +0,0 @@
-import os
-# Absolute path of the directory that contains this __init__.py.
-base_dir = os.path.dirname(os.path.abspath(__file__))
-# Parent directory of base_dir.
-main_dir = os.path.dirname(base_dir)

youtube_shorts_downloader/__pycache__/__init__.cpython-310.pyc DELETED Viewed

Binary file (310 Bytes)

youtube_shorts_downloader/__pycache__/input_output.cpython-310.pyc DELETED Viewed

Binary file (746 Bytes)

youtube_shorts_downloader/__pycache__/transcripts.cpython-310.pyc DELETED Viewed

Binary file (1.82 kB)

youtube_shorts_downloader/app.py DELETED Viewed

@@ -1,65 +0,0 @@
-import streamlit as st
-import io
-import pandas as pd
-from youtube_shorts_downloader import main_dir
-from youtube_shorts_downloader.transcripts import get_batch_transcripts
-from youtube_shorts_downloader.input_output import parse_input_file, save_output
-# Page header and usage instructions.
-st.title("YT Shorts Transcript Downloader")
-st.markdown(
-    "instructions: upload a text file with valid youtube urls, one per line, to fetch transcripts"
-)
-# Bordered container holding the file uploader and the action row.
-base = st.container(border=True)
-with base:
-    # Three columns with relative widths 3/20/5; only col1 is entered in this section.
-    x, col1, col2 = st.columns([3, 20, 5])
-    with col1:
-        # Upload widget restricted to .txt files.
-        uploaded_file = st.file_uploader("Choose a File", type=["txt"])
-        # NOTE(review): this rebinds col2 from the outer st.columns call above;
-        # from here on col2 is the first of the three nested columns.
-        col2, col3, col4 = st.columns([3, 2, 3])
-        with col2:
-            # Value returned by the "fetch transcripts" button; read later by button_logic.
-            trans_button_val = st.button(label="fetch transcripts", type="primary")
-        with col3:
-            empty_container = st.container()
-        with col4:
-            # col4 is also where button_logic renders the download button.
-            placeholder = st.empty()
-@st.cache_data
-def convert_df(df: pd.DataFrame) -> bytes:
-    """Serialize a DataFrame to UTF-8 encoded CSV bytes.
-    Decorated with st.cache_data so the conversion is cached instead of being
-    recomputed on every rerun.
-    Args:
-        df: The DataFrame to serialize.
-    Returns:
-        The text produced by df.to_csv() (called with its default options),
-        encoded as UTF-8.
-    """
-    return df.to_csv().encode("utf-8")
-def button_logic(youtube_short_urls: list) -> None:
-    """Fetch transcripts for the given URLs and offer them as a CSV download.
-    Returns immediately unless trans_button_val (the value of the
-    "fetch transcripts" button) is truthy. Otherwise the batch transcripts
-    are loaded into a DataFrame, converted with convert_df, and exposed
-    through a download button rendered inside col4.
-    Args:
-        youtube_short_urls: URLs passed on to get_batch_transcripts.
-    """
-    if not trans_button_val:
-        return
-    transcripts_frame = pd.DataFrame(get_batch_transcripts(youtube_short_urls))
-    csv_payload = convert_df(transcripts_frame)
-    download_options = {
-        "label": "Download transcripts",
-        "data": csv_payload,
-        "file_name": "output.csv",
-        "mime": "text/csv",
-        "disabled": False,
-        "type": "primary",
-    }
-    with col4:
-        st.download_button(**download_options)
-# default_file_path = main_dir + "/data/input/test_input.txt"
-# Collect one whitespace-stripped entry per line of the uploaded text file,
-# then hand the list to button_logic while a spinner is displayed.
-youtube_short_urls = []
-if uploaded_file is not None:
-    if uploaded_file.type == "text/plain":
-        from io import StringIO
-        uploaded_text = uploaded_file.read().decode("utf-8")
-        youtube_short_urls.extend(raw.strip() for raw in StringIO(uploaded_text))
-    # else:
-    #     youtube_short_urls = parse_input_file(default_file_path)
-    with st.spinner(text="transcript pull in progress..."):
-        button_logic(youtube_short_urls)

youtube_shorts_downloader/input_output.py DELETED Viewed

@@ -1,14 +0,0 @@
-import pandas as pd
-def parse_input_file(input_file_path: str) -> list:
-    """Read a text file and return its lines with surrounding whitespace removed.
-    Args:
-        input_file_path: Path of the text file to read (one URL per line).
-    Returns:
-        One stripped string per line of the file, in file order. Blank lines
-        are kept as empty strings.
-    Raises:
-        OSError: If the file cannot be opened.
-        UnicodeDecodeError: If the file content is not valid UTF-8.
-    """
-    # Decode explicitly as UTF-8 so the result does not depend on the
-    # platform's default encoding.
-    with open(input_file_path, "r", encoding="utf-8") as file:
-        return [line.strip() for line in file]
-def save_output(data: list, output_file_path: str) -> None:
-    """Write the given records to a CSV file, omitting the index column.
-    Args:
-        data: Records used to build a pandas DataFrame.
-        output_file_path: Destination path of the CSV file.
-    """
-    pd.DataFrame(data).to_csv(output_file_path, index=False)

youtube_shorts_downloader/transcripts.py DELETED Viewed

@@ -1,53 +0,0 @@
-import re
-from typing import List, Dict
-from youtube_transcript_api import YouTubeTranscriptApi
-def is_valid_youtube_shorts_url(potential_url: str) -> bool:
-    """Return True when potential_url is exactly a YouTube Shorts URL.
-    The accepted form is https://www.youtube.com/shorts/<id>, where <id> is
-    11 characters drawn from letters, digits, "_" and "-".
-    Args:
-        potential_url: The string to validate.
-    Returns:
-        True if the whole string matches the Shorts URL form, else False.
-    """
-    # YouTube video ids are always 11 chars long.
-    pattern = r"https://www\.youtube\.com/shorts/[A-Za-z0-9_-]{11}"
-    # fullmatch rather than match with "$": "$" also matches just before a
-    # trailing newline, which would let "...<id>\n" through and leave the
-    # newline inside the video id extracted by callers.
-    return re.fullmatch(pattern, potential_url) is not None
-def get_single_transcript(youtube_url: str) -> dict:
-    """Fetch the transcript for a single YouTube Shorts URL.
-    Args:
-        youtube_url: URL checked with is_valid_youtube_shorts_url before use.
-    Returns:
-        A dict with keys "youtube_url", "video_id" and "transcript" on
-        success. An empty dict when the URL is rejected or any exception is
-        raised; in both cases a FAILURE message is printed.
-    """
-    try:
-        if not is_valid_youtube_shorts_url(youtube_url):
-            print(f"FAILURE: youtube_url is not valid - {youtube_url}")
-            return {}
-        # The video id is the last path segment of the URL.
-        video_id = youtube_url.split("/")[-1]
-        video_transcript = YouTubeTranscriptApi.get_transcript(video_id)
-        return {
-            "youtube_url": youtube_url,
-            "video_id": video_id,
-            "transcript": video_transcript,
-        }
-    except Exception as e:
-        print(
-            f"FAILURE: transcript pull for youtube_url - {youtube_url} - failed with exception {e}"
-        )
-        return {}
-def get_batch_transcripts(youtube_urls: List[str]) -> List[Dict]:
-    """Fetch English transcripts for every valid Shorts URL in a batch.
-    URLs rejected by is_valid_youtube_shorts_url are skipped. Videos whose
-    transcript cannot be retrieved are reported on stdout and left out of the
-    result, so a single failing video does not discard the rest of the batch.
-    Args:
-        youtube_urls: Candidate URLs; invalid ones are ignored.
-    Returns:
-        One dict per retrieved video with keys "youtube_url", "video_id" and
-        "transcript", in input order. An empty list when no URL is valid or
-        the batch request itself raises.
-    """
-    valid_urls = [url for url in youtube_urls if is_valid_youtube_shorts_url(url)]
-    if not valid_urls:
-        # Nothing to request, so skip the API call entirely.
-        return []
-    # The video id is the last path segment of each URL.
-    valid_vids = [url.split("/")[-1] for url in valid_urls]
-    try:
-        # get_transcripts returns (transcripts_by_id, unretrievable_ids). With
-        # continue_after_error=True a failing video is listed in the second
-        # element instead of raising and aborting the whole batch.
-        video_transcripts, unretrievable = YouTubeTranscriptApi.get_transcripts(
-            valid_vids, languages=["en"], continue_after_error=True
-        )
-    except Exception as e:
-        print(f"FAILURE: batch transcription fetch failed with exception {e}")
-        return []
-    for vid in unretrievable:
-        print(f"FAILURE: transcript pull for video_id - {vid} - failed")
-    return [
-        {"youtube_url": url, "video_id": vid, "transcript": video_transcripts[vid]}
-        for url, vid in zip(valid_urls, valid_vids)
-        if vid in video_transcripts
-    ]