|
import json |
|
from tqdm import tqdm |
|
from pytubefix import YouTube |
|
|
|
import xml.etree.ElementTree as ET |
|
import os |
|
|
|
# Load the dataset index; the download loop iterates it to derive video ids.
# JSON is read explicitly as UTF-8 so the result does not depend on the
# platform's default encoding.
with open('VideoInstruct100K.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# The directory must exist before it is listed; without this a first run on a
# clean checkout fails with FileNotFoundError.
os.makedirs('videos', exist_ok=True)

# Ids of videos already on disk (file name up to the first '.').  Kept as a
# dict of True values because the download loop looks ids up with .get().
existed_video_id = {
    video_name.split('.')[0]: True
    for video_name in os.listdir('videos')
}
|
|
|
|
|
|
|
def download_video_with_subtitles(video_id):
    """Download a YouTube video and its English caption tracks.

    The video is saved as ``videos/<video_id>.mp4``.  Every caption track
    whose language code contains ``'en'`` is then saved as XML into
    ``subtitles_xml`` using ``video_id`` as the title.

    Args:
        video_id: YouTube video id (the ``v=`` query parameter).

    Returns:
        A ``(video_downloaded, caption_downloaded)`` tuple of booleans.
        ``(False, False)`` is returned as soon as the video itself cannot be
        downloaded; captions are not attempted in that case.  A failure while
        fetching captions is reported and leaves ``caption_downloaded`` at
        whatever was achieved before the error.
    """
    video_filename = f"{video_id}.mp4"
    try:
        # The YouTube object is built inside the try so that any error raised
        # while creating it is reported and skipped like a download error.
        yt = YouTube(f'https://www.youtube.com/watch?v={video_id}')
        stream = yt.streams.get_highest_resolution()
        stream.download(output_path='videos', filename=video_filename)
    except Exception as e:
        print(f"Error downloading video {video_id}: {str(e)}")
        return False, False

    caption_downloaded = False
    try:
        for caption in yt.captions.all():
            if 'en' in caption.code:
                caption.download(title=f"{video_id}", output_path='subtitles_xml', srt=False)
                caption_downloaded = True
    except Exception as e:
        # The video is already on disk at this point; a caption problem must
        # not abort the whole batch, so it is only reported.
        print(f"Error downloading captions for {video_id}: {str(e)}")
    return True, caption_downloaded
|
def _ms_to_vtt_time(milliseconds):
    """Format a millisecond offset as a WebVTT timestamp.

    Offsets below one hour keep the short ``mm:ss.ttt`` form; from one hour on
    the ``hh:mm:ss.ttt`` form is used so the minutes field never exceeds 59.
    """
    seconds, milliseconds = divmod(milliseconds, 1000)
    minutes, seconds = divmod(seconds, 60)
    hours, minutes = divmod(minutes, 60)
    if hours:
        return f"{hours:02d}:{minutes:02d}:{seconds:02d}.{milliseconds:03d}"
    return f"{minutes:02d}:{seconds:02d}.{milliseconds:03d}"


def convert_xml_vtt(xml_path, vtt_path):
    """Convert a caption XML file into a WebVTT file.

    The ``<p>`` elements of the XML are consumed in pairs: the first element
    of a pair supplies the cue start (its ``t`` attribute, in milliseconds)
    and the cue text (the stripped texts of its ``<s>`` descendants joined by
    single spaces); the second element supplies only the cue end (its ``t``).
    A trailing ``<p>`` without a partner produces no cue.

    Args:
        xml_path: Path of the caption XML file to read.
        vtt_path: Path of the WebVTT file to write (UTF-8).
    """
    root = ET.parse(xml_path).getroot()

    vtt_subtitle = []
    start_time = None
    subtitle_text = ""
    for index, p in enumerate(root.findall(".//p")):
        timestamp = int(p.get("t"))
        if index % 2 == 0:
            # First element of a pair: remember where the cue starts and what
            # it says.  An empty <s/> has text None, so fall back to "" and
            # drop pieces that are empty after stripping.
            start_time = timestamp
            pieces = ((s.text or "").strip() for s in p.findall(".//s"))
            subtitle_text = " ".join(piece for piece in pieces if piece)
        else:
            # Second element of a pair: its timestamp closes the cue.
            vtt_subtitle.append(
                f"{_ms_to_vtt_time(start_time)} --> {_ms_to_vtt_time(timestamp)}\n{subtitle_text}\n"
            )

    vtt_content = "WEBVTT\n\n" + "\n".join(vtt_subtitle)

    with open(vtt_path, "w", encoding="utf-8") as vtt_file:
        vtt_file.write(vtt_content)
|
import os |
|
# Output directories for the videos, the raw caption XML and the converted VTT.
os.makedirs('videos', exist_ok=True)
os.makedirs('subtitles_vtt', exist_ok=True)
os.makedirs('subtitles_xml', exist_ok=True)

for video_path in tqdm(data, desc='Downloading videos'):
    # The id is the last path component of the entry, up to its first '.'.
    video_id = video_path.split('/')[-1].split('.')[0]
    if existed_video_id.get(video_id, False):
        continue

    video_downloaded, caption_downloaded = download_video_with_subtitles(video_id)
    if video_downloaded:
        # Remember the id so a repeated entry is not fetched again in this run.
        existed_video_id[video_id] = True
    if not caption_downloaded:
        continue

    # Only the '(a.en)' caption file is converted.  The downloader accepts any
    # caption code containing 'en', so this exact file may be absent.
    # NOTE(review): '(a.en)' is presumably the auto-generated English track -
    # confirm against the caption library's file naming.
    xml_file_path = f'subtitles_xml/{video_id} (a.en).xml'
    if not os.path.exists(xml_file_path):
        print(f"Caption file not found for {video_id}: {xml_file_path}")
        continue

    try:
        convert_xml_vtt(xml_file_path, f'subtitles_vtt/{video_id}.vtt')
    except Exception as e:
        # One unreadable caption file must not stop the remaining downloads.
        print(f"Error converting captions for {video_id}: {str(e)}")
|
|
|
|
|
|