File size: 3,326 Bytes
2ada650
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import json 
from tqdm import tqdm
from pytubefix import YouTube

import xml.etree.ElementTree as ET
import os

# Load the dataset entries; the download loop further down iterates over them
# and derives a video id from each entry.
with open('VideoInstruct100K.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Remember which videos are already on disk so they are not downloaded again.
# The directory is created first: os.listdir raises FileNotFoundError for a
# missing directory, which is exactly the situation on a first run.
os.makedirs('videos', exist_ok=True)
existed_video_id = {}
for video_name in os.listdir('videos'):
    # The id is everything before the first dot of the file name.
    video_id = video_name.split('.')[0]
    existed_video_id[video_id] = True



def download_video_with_subtitles(video_id):
    """Download one YouTube video and its English caption tracks.

    The video is saved as ``videos/<video_id>.mp4``. Every caption track whose
    language code contains ``'en'`` is then downloaded in XML form (``srt=False``)
    into ``subtitles_xml`` with ``video_id`` as the title.

    Failures are reported on stdout instead of being raised, so that a single
    bad video does not abort a batch run.

    Args:
        video_id: YouTube video identifier, inserted into the watch URL.

    Returns:
        A ``(video_downloaded, caption_downloaded)`` tuple of booleans.
        ``(False, False)`` when the video could not be downloaded; otherwise
        the second item tells whether at least one English caption track was
        saved.
    """
    video_filename = f"{video_id}.mp4"
    try:
        # The YouTube object is built inside the try block as well, so an
        # error raised while constructing it is handled like any other
        # download failure.
        yt = YouTube(f'https://www.youtube.com/watch?v={video_id}')
        # Get the video stream with the highest resolution and download it.
        stream = yt.streams.get_highest_resolution()
        stream.download(output_path='videos', filename=video_filename)
    except Exception as e:
        print(f"Error downloading video {video_id}: {str(e)}")
        return False, False

    # Captions are optional: the video is already saved at this point, so a
    # caption failure is reported but must not discard that result.
    caption_downloaded = False
    try:
        for caption in yt.captions.all():
            # Select only English captions.
            if 'en' in caption.code:
                caption.download(title=f"{video_id}", output_path='subtitles_xml', srt=False)
                caption_downloaded = True
    except Exception as e:
        print(f"Error downloading captions for video {video_id}: {str(e)}")
    return True, caption_downloaded
def convert_xml_vtt(xml_path, vtt_path):
    """Convert an XML subtitle file into a WebVTT file.

    The ``<p>`` elements of the XML document are consumed in pairs: the first
    element of a pair supplies the cue start time (its ``t`` attribute, in
    milliseconds) and the cue text (the stripped texts of its ``<s>``
    descendants joined by single spaces); the second element only supplies the
    cue end time through its own ``t`` attribute. A trailing unpaired ``<p>``
    produces no cue.

    Args:
        xml_path: Path of the XML subtitle file to read.
        vtt_path: Path of the WebVTT file to write (UTF-8).

    Raises:
        xml.etree.ElementTree.ParseError: If the XML file is malformed.
        TypeError: If a ``<p>`` element has no ``t`` attribute.
        ValueError: If a ``t`` attribute is not an integer.
    """
    root = ET.parse(xml_path).getroot()

    def ms_to_vtt_time(milliseconds):
        """Format a millisecond offset as a WebVTT timestamp."""
        seconds, milliseconds = divmod(milliseconds, 1000)
        minutes, seconds = divmod(seconds, 60)
        hours, minutes = divmod(minutes, 60)
        if hours:
            # WebVTT limits the minutes field to 00-59, so anything past the
            # first hour needs the explicit hours component.
            return f"{hours:02d}:{minutes:02d}:{seconds:02d}.{milliseconds:03d}"
        return f"{minutes:02d}:{seconds:02d}.{milliseconds:03d}"

    vtt_subtitle = []
    pending = None  # (start_time, text) of a cue still waiting for its end time
    for p in root.findall(".//p"):
        timestamp = int(p.get("t"))
        if pending is None:
            # An <s> element without text has ``text`` set to None; treat it
            # as an empty segment instead of failing on ``None.strip()``.
            text = " ".join((s.text or "").strip() for s in p.findall(".//s"))
            pending = (timestamp, text)
        else:
            start_time, subtitle_text = pending
            vtt_subtitle.append(
                f"{ms_to_vtt_time(start_time)} --> {ms_to_vtt_time(timestamp)}\n{subtitle_text}\n"
            )
            pending = None

    # Cues are separated by a blank line, after the mandatory file header.
    vtt_content = "WEBVTT\n\n" + "\n".join(vtt_subtitle)

    with open(vtt_path, "w", encoding="utf-8") as vtt_file:
        vtt_file.write(vtt_content)
import os
# Make sure every output directory exists before anything is written to it.
os.makedirs('videos', exist_ok=True)
os.makedirs('subtitles_vtt', exist_ok=True)
os.makedirs('subtitles_xml', exist_ok=True)
for video_path in tqdm(data,desc='Downloading videos') :
    # The video id is the last path component of the entry, up to its first dot.
    video_id=video_path.split('/')[-1].split('.')[0]
    if existed_video_id.get(video_id,False):
        continue
    video_downloaded,caption_downloaded=download_video_with_subtitles(video_id)
    if not caption_downloaded:
        continue
    # Only the '(a.en)' XML file is converted. caption_downloaded is set for
    # any caption code containing 'en', so that exact file may be absent;
    # skip the conversion then instead of letting the parser fail on a
    # missing file and abort the whole run.
    xml_file_path=f'subtitles_xml/{video_id} (a.en).xml'
    if not os.path.isfile(xml_file_path):
        print(f"No '(a.en)' subtitle file for video {video_id}, skipping VTT conversion")
        continue
    try:
        # convert xml to vtt
        convert_xml_vtt(xml_file_path,f'subtitles_vtt/{video_id}.vtt')
    except ET.ParseError as e:
        # A malformed subtitle file should not stop the remaining downloads.
        print(f"Error converting subtitles for video {video_id}: {str(e)}")