Spaces:
Running
on
Zero
Running
on
Zero
import json | |
from tqdm import tqdm | |
from pytubefix import YouTube | |
import xml.etree.ElementTree as ET | |
import os | |
# Load the dataset annotations; the download loop further down derives a
# video id from every entry.
with open('VideoInstruct100K.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Collect the ids of videos already present in 'videos' so they can be
# skipped. The directory is only created later in the script, so on a first
# run it may not exist yet; treat that as "nothing downloaded so far" instead
# of failing in os.listdir.
existed_video_id = {}
if os.path.isdir('videos'):
    for video_name in os.listdir('videos'):
        # The id is the file name up to the first dot.
        video_id = video_name.split('.')[0]
        existed_video_id[video_id] = True
def download_video_with_subtitles(video_id):
    """Download a YouTube video and its English caption tracks.

    The video is written to ``videos/<video_id>.mp4``. Every caption track
    whose language code contains ``'en'`` is then downloaded in XML form
    (``srt=False``) into ``subtitles_xml`` with ``video_id`` as its title.

    Args:
        video_id: YouTube video identifier, used to build the watch URL.

    Returns:
        A ``(video_downloaded, caption_downloaded)`` tuple of booleans.
        ``(False, False)`` is returned when the video could not be
        downloaded; captions are not attempted in that case.
    """
    video_filename = f"{video_id}.mp4"
    try:
        # The YouTube object is created inside the try block so that a
        # failure while building it is reported like any other download
        # error instead of aborting the whole batch.
        yt = YouTube(f'https://www.youtube.com/watch?v={video_id}')
        # Get the video stream with the highest resolution and download it.
        stream = yt.streams.get_highest_resolution()
        stream.download(output_path='videos', filename=video_filename)
    except Exception as e:
        # Broad on purpose: this is the per-video boundary of a batch job and
        # the error is reported, not hidden.
        print(f"Error downloading video {video_id}: {str(e)}")
        return False, False

    # Download the available English captions in XML format. A caption
    # failure must not hide the fact that the video itself was saved.
    caption_downloaded = False
    try:
        for caption in yt.captions.all():
            # Select only English captions.
            if 'en' in caption.code:
                caption.download(title=f"{video_id}", output_path='subtitles_xml', srt=False)
                caption_downloaded = True
    except Exception as e:
        print(f"Error downloading captions for video {video_id}: {str(e)}")
    return True, caption_downloaded
def convert_xml_vtt(xml_path, vtt_path):
    """Convert an XML caption file into a WebVTT file.

    ``<p>`` elements are consumed in consecutive pairs. The first element of
    a pair supplies the cue start time (its ``t`` attribute, in
    milliseconds) and the cue text (the stripped text of its ``<s>``
    descendants, joined by single spaces, empty pieces dropped). The second
    element supplies the cue end time (its ``t`` attribute). A trailing
    unpaired ``<p>`` produces no cue.

    Args:
        xml_path: Path of the XML caption file to read.
        vtt_path: Path of the WebVTT file to write (UTF-8).

    Raises:
        xml.etree.ElementTree.ParseError: If the XML cannot be parsed.
        OSError: If ``xml_path`` cannot be read or ``vtt_path`` cannot be
            written.
        TypeError: If a ``<p>`` element used for timing has no ``t``
            attribute.
    """
    root = ET.parse(xml_path).getroot()

    def ms_to_vtt_time(milliseconds):
        """Format milliseconds as ``mm:ss.ttt``, or ``hh:mm:ss.ttt`` from one hour on."""
        seconds, milliseconds = divmod(milliseconds, 1000)
        minutes, seconds = divmod(seconds, 60)
        hours, minutes = divmod(minutes, 60)
        if hours:
            # WebVTT limits the minutes field to 00-59, so long videos need
            # the hours component.
            return f"{hours:02d}:{minutes:02d}:{seconds:02d}.{milliseconds:03d}"
        return f"{minutes:02d}:{seconds:02d}.{milliseconds:03d}"

    paragraphs = root.findall(".//p")
    vtt_subtitle = []
    # Even-indexed elements open a cue, the following element closes it.
    for start_p, end_p in zip(paragraphs[::2], paragraphs[1::2]):
        start_time = int(start_p.get("t"))
        # An <s> element without text has text=None; treat it as empty.
        pieces = ((s.text or "").strip() for s in start_p.findall(".//s"))
        subtitle_text = " ".join(piece for piece in pieces if piece)
        end_time = int(end_p.get("t"))
        vtt_subtitle.append(
            f"{ms_to_vtt_time(start_time)} --> {ms_to_vtt_time(end_time)}\n{subtitle_text}\n"
        )

    # Join the cues under the mandatory WEBVTT header and save the file.
    vtt_content = "WEBVTT\n\n" + "\n".join(vtt_subtitle)
    with open(vtt_path, "w", encoding="utf-8") as vtt_file:
        vtt_file.write(vtt_content)
import os | |
# Make sure every output directory exists before anything is written to it.
os.makedirs('videos', exist_ok=True)
os.makedirs('subtitles_vtt', exist_ok=True)
os.makedirs('subtitles_xml', exist_ok=True)

for video_path in tqdm(data, desc='Downloading videos'):
    # The video id is the last path component up to its first dot.
    video_id = video_path.split('/')[-1].split('.')[0]
    if existed_video_id.get(video_id, False):
        continue
    video_downloaded, caption_downloaded = download_video_with_subtitles(video_id)
    if video_downloaded:
        # Remember the id so a repeated entry in `data` is not downloaded again.
        existed_video_id[video_id] = True
    if not caption_downloaded:
        continue
    # Convert XML to VTT. Only the '(a.en)' caption file is converted.
    # NOTE(review): caption_downloaded is also true when only another
    # English track (a code containing 'en' other than 'a.en') was saved,
    # in which case this file does not exist - confirm the naming against
    # the caption download API.
    xml_file_path = f'subtitles_xml/{video_id} (a.en).xml'
    if not os.path.exists(xml_file_path):
        print(f"No '(a.en)' caption file for video {video_id}; skipping VTT conversion")
        continue
    try:
        convert_xml_vtt(xml_file_path, f'subtitles_vtt/{video_id}.vtt')
    except Exception as e:
        # One unreadable caption file must not stop the remaining downloads.
        print(f"Error converting captions for video {video_id}: {str(e)}")