Spaces:
Runtime error
Runtime error
sradc
committed on
Commit
•
04848c9
1
Parent(s):
3a8e829
Added .gitignore files when downloading videos/video IDs, and added get_video_ids.py to run_pipeline.sh
Browse files
- pipeline/download_videos.py +1 -0
- pipeline/get_video_ids.py +6 -6
- run_pipeline.sh +1 -0
pipeline/download_videos.py
CHANGED
@@ -17,6 +17,7 @@ def get_id(url: str) -> str:
|
|
17 |
|
18 |
def download_videos(video_ids: List[str]) -> None:
|
19 |
VIDEO_DIR.mkdir(exist_ok=True, parents=True)
|
|
|
20 |
for video_id in tqdm(video_ids):
|
21 |
video_url = f"https://www.youtube.com/watch?v={video_id}"
|
22 |
video_path = VIDEO_DIR / f"{video_id}.mp4"
|
|
|
17 |
|
18 |
def download_videos(video_ids: List[str]) -> None:
|
19 |
VIDEO_DIR.mkdir(exist_ok=True, parents=True)
|
20 |
+
(VIDEO_DIR / "gitignore").write_text("*")
|
21 |
for video_id in tqdm(video_ids):
|
22 |
video_url = f"https://www.youtube.com/watch?v={video_id}"
|
23 |
video_path = VIDEO_DIR / f"{video_id}.mp4"
|
pipeline/get_video_ids.py
CHANGED
@@ -6,6 +6,8 @@ from typing import Final, Optional
|
|
6 |
|
7 |
import youtube_dl
|
8 |
|
|
|
|
|
9 |
logging.basicConfig(
|
10 |
level=logging.INFO,
|
11 |
format="%(asctime)s - %(levelname)s - %(message)s",
|
@@ -26,7 +28,7 @@ PLAYLIST_URLS = [
|
|
26 |
"https://www.youtube.com/playlist?list=PLCQCtoOJpI_A5oktQImEdDBJ50BqHXujj", # 495, MTV Classic 2000's music videos (US Version)
|
27 |
]
|
28 |
URL_FILE: Final[Optional[str]] = os.environ.get("URL_FILE")
|
29 |
-
OUTPUT_DIR: Final[str] =
|
30 |
|
31 |
|
32 |
def get_all_video_ids(channel_url: str) -> list[str]:
|
@@ -58,12 +60,10 @@ def get_all_video_ids(channel_url: str) -> list[str]:
|
|
58 |
def process_youtube_url(url: str):
|
59 |
logging.info(f"Processing {url}")
|
60 |
ids = get_all_video_ids(url)
|
61 |
-
|
62 |
-
|
63 |
-
output_dir.mkdir(parents=True, exist_ok=True)
|
64 |
-
|
65 |
output = "\n".join(ids)
|
66 |
-
output_path =
|
67 |
logging.info(f"Writing {len(ids)} video IDs to {output_path}")
|
68 |
with output_path.open(mode="w") as f:
|
69 |
f.write(output)
|
|
|
6 |
|
7 |
import youtube_dl
|
8 |
|
9 |
+
from pipeline.download_videos import DATA_DIR
|
10 |
+
|
11 |
logging.basicConfig(
|
12 |
level=logging.INFO,
|
13 |
format="%(asctime)s - %(levelname)s - %(message)s",
|
|
|
28 |
"https://www.youtube.com/playlist?list=PLCQCtoOJpI_A5oktQImEdDBJ50BqHXujj", # 495, MTV Classic 2000's music videos (US Version)
|
29 |
]
|
30 |
URL_FILE: Final[Optional[str]] = os.environ.get("URL_FILE")
|
31 |
+
OUTPUT_DIR: Final[str] = DATA_DIR / "ids"
|
32 |
|
33 |
|
34 |
def get_all_video_ids(channel_url: str) -> list[str]:
|
|
|
60 |
def process_youtube_url(url: str):
|
61 |
logging.info(f"Processing {url}")
|
62 |
ids = get_all_video_ids(url)
|
63 |
+
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
64 |
+
(OUTPUT_DIR / ".gitignore").write_text("*")
|
|
|
|
|
65 |
output = "\n".join(ids)
|
66 |
+
output_path = OUTPUT_DIR / f"{hashlib.md5(output.encode()).hexdigest()}.txt"
|
67 |
logging.info(f"Writing {len(ids)} video IDs to {output_path}")
|
68 |
with output_path.open(mode="w") as f:
|
69 |
f.write(output)
|
run_pipeline.sh
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
#!/usr/bin/env bash
|
2 |
set -e
|
3 |
|
|
|
4 |
poetry run python pipeline/download_videos.py
|
5 |
poetry run python pipeline/process_videos.py
|
|
|
1 |
#!/usr/bin/env bash
|
2 |
set -e
|
3 |
|
4 |
+
poetry run python pipeline/get_video_ids.py
|
5 |
poetry run python pipeline/download_videos.py
|
6 |
poetry run python pipeline/process_videos.py
|