Spaces:
Running
on
Zero
Running
on
Zero
File size: 3,326 Bytes
2ada650 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 |
import json
from tqdm import tqdm
from pytubefix import YouTube
import xml.etree.ElementTree as ET
import os
# Load the collection of video paths that drives the download loop.
# JSON is UTF-8 by specification, so do not rely on the platform default.
with open('VideoInstruct100K.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Index the videos already on disk (keyed by file name up to the first '.')
# so that re-running the script skips them.
existed_video_id = {}
# On a first run the 'videos' directory may not exist yet; in that case
# nothing has been downloaded and the index simply stays empty.
if os.path.isdir('videos'):
    for video_name in os.listdir('videos'):
        video_id = video_name.split('.')[0]
        existed_video_id[video_id] = True
def download_video_with_subtitles(video_id):
    """Download a YouTube video and its English caption tracks.

    The video is saved as ``videos/<video_id>.mp4``. Every caption track whose
    language code contains ``'en'`` is then downloaded in XML form
    (``srt=False``) into ``subtitles_xml``.

    Args:
        video_id: YouTube video ID, used to build the watch URL and file names.

    Returns:
        A tuple ``(video_downloaded, caption_downloaded)`` of booleans.
        ``(False, False)`` is returned as soon as the video itself cannot be
        downloaded; captions are only attempted after a successful video
        download.
    """
    video_filename = f"{video_id}.mp4"
    try:
        # Building the YouTube object can itself fail (e.g. malformed ID), so
        # it lives inside the same handler as the download.
        yt = YouTube(f'https://www.youtube.com/watch?v={video_id}')
        # Get the video stream with the highest resolution and download it.
        stream = yt.streams.get_highest_resolution()
        stream.download(output_path='videos', filename=video_filename)
    except Exception as e:
        # Batch-job boundary: report the failure and let the caller move on.
        print(f"Error downloading video {video_id}: {str(e)}")
        return False, False

    # Download the English captions, if any, in XML format.
    caption_downloaded = False
    try:
        for caption in yt.captions.all():
            # Substring match keeps variants such as 'a.en' or 'en-US'.
            if 'en' in caption.code:
                caption.download(title=f"{video_id}", output_path='subtitles_xml', srt=False)
                caption_downloaded = True
    except Exception as e:
        # A caption failure must not abort the batch; the video is kept.
        print(f"Error downloading captions for {video_id}: {str(e)}")
    return True, caption_downloaded
def convert_xml_vtt(xml_path, vtt_path):
    """Convert an XML caption file into a WebVTT file.

    ``<p>`` elements are consumed in consecutive pairs: the first element of
    a pair supplies the cue start time (its ``t`` attribute, in milliseconds)
    and the cue text (the stripped text of its ``<s>`` descendants joined by
    single spaces); the ``t`` attribute of the second element supplies the
    cue end time. A final unpaired ``<p>`` is emitted only when it carries a
    ``d`` (duration) attribute, ending at ``t + d``.

    Args:
        xml_path: Path of the XML caption file to read.
        vtt_path: Path of the WebVTT file to write (UTF-8).
    """
    # Parse the XML subtitle file.
    root = ET.parse(xml_path).getroot()

    def ms_to_vtt_time(milliseconds):
        """Format a millisecond offset as a WebVTT timestamp."""
        seconds, milliseconds = divmod(milliseconds, 1000)
        minutes, seconds = divmod(seconds, 60)
        hours, minutes = divmod(minutes, 60)
        # WebVTT requires the minutes field to stay below 60; the hours field
        # is optional, so it is only emitted when it is non-zero.
        if hours:
            return f"{hours:02d}:{minutes:02d}:{seconds:02d}.{milliseconds:03d}"
        return f"{minutes:02d}:{seconds:02d}.{milliseconds:03d}"

    def format_cue(start_ms, end_ms, text):
        """Render one cue: timing line followed by the text line."""
        return f"{ms_to_vtt_time(start_ms)} --> {ms_to_vtt_time(end_ms)}\n{text}\n"

    vtt_subtitle = []
    # Holds (start_ms, text, duration_attr) of a cue still waiting for its end.
    pending = None
    for p in root.findall(".//p"):
        if pending is None:
            start_time = int(p.get("t"))
            # An empty <s/> element has text None; treat it as an empty string.
            subtitle_text = " ".join((s.text or "").strip() for s in p.findall(".//s"))
            pending = (start_time, subtitle_text, p.get("d"))
        else:
            start_time, subtitle_text, _ = pending
            vtt_subtitle.append(format_cue(start_time, int(p.get("t")), subtitle_text))
            pending = None

    # Odd number of <p> elements: keep the last cue when its own duration is
    # known instead of silently dropping its text.
    if pending is not None and pending[2] is not None:
        start_time, subtitle_text, duration = pending
        vtt_subtitle.append(format_cue(start_time, start_time + int(duration), subtitle_text))

    # Join the VTT entries into a single document and save it.
    vtt_content = "WEBVTT\n\n" + "\n".join(vtt_subtitle)
    with open(vtt_path, "w", encoding="utf-8") as vtt_file:
        vtt_file.write(vtt_content)
import os
# Make sure every output directory exists before anything is written to it.
for directory in ('videos', 'subtitles_vtt', 'subtitles_xml'):
    os.makedirs(directory, exist_ok=True)

# Download each listed video that is not already on disk, then convert its
# caption file to WebVTT when one was saved.
for video_path in tqdm(data, desc='Downloading videos'):
    # The video ID is the file name of the entry, without its extension.
    video_id = video_path.split('/')[-1].split('.')[0]
    if existed_video_id.get(video_id, False):
        continue
    video_downloaded, caption_downloaded = download_video_with_subtitles(video_id)
    if not caption_downloaded:
        continue
    # convert xml to vtt
    xml_file_path = f'subtitles_xml/{video_id} (a.en).xml'
    # caption_downloaded is True for any track whose code contains 'en', so
    # the '(a.en)' file is not guaranteed to exist; skip instead of crashing.
    if not os.path.isfile(xml_file_path):
        print(f"No '(a.en)' caption file for {video_id}, skipping VTT conversion")
        continue
    try:
        convert_xml_vtt(xml_file_path, f'subtitles_vtt/{video_id}.vtt')
    except Exception as e:
        # Batch-job boundary: one malformed caption file must not stop the
        # remaining downloads.
        print(f"Error converting captions for {video_id}: {str(e)}")
|