File size: 1,821 Bytes
697eefa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import regex as re
from youtube_transcript_api import YouTubeRequestFailed, YouTubeTranscriptApi

from preprocessing import stride_sentences


def validate_youtube_link(url: str) -> str:
    """
    Validate a YouTube video link and extract its video id.

    input  : url (str)
    outputs: video_id (str), the 11-character id captured from the link
    raises : AssertionError("Invalid YouTube Link") when no id can be
             extracted from the url, or the extracted id is not exactly
             11 characters long
    """
    yt_regex = r"^.*(youtu.be\/|v\/|u\/\w\/|embed\/|watch\?v=|\&v=|\?v=)([^#\&\?]*).*"
    matches = re.findall(yt_regex, url)

    # Raise explicitly rather than via an `assert` statement: asserts are
    # stripped under `python -O`, and a url that did not match at all used to
    # escape as an IndexError from `matches[0]`. AssertionError is kept as the
    # exception type so callers that already catch it keep working.
    if not matches or len(matches[0][1]) != 11:
        raise AssertionError("Invalid YouTube Link")

    video_id: str = matches[0][1]
    return video_id


def zip_transcript(transcript:list) -> dict:
    """
    Split transcript entries into parallel lists of start times and texts.

    input  : transcript (list) of entries indexable by 'start' and 'text'
    outputs: dict with 'timestamps' (the 'start' values) and 'texts'
             (each 'text' stripped, with newlines replaced by spaces),
             both in the order of the input entries
    """
    # Single pass over the input; 'start' is read before 'text' for each entry.
    pairs = [
        (entry['start'], entry['text'].strip().replace('\n',' '))
        for entry in transcript
    ]

    return {
        'timestamps': [start for start, _ in pairs],
        'texts': [text for _, text in pairs],
    }



def full_text(transcript: list) -> str:
    """
    Join the 'text' of every transcript entry into one string.

    input  : transcript (list) of entries indexable by 'text'
    outputs: the texts joined by single spaces, with leading and trailing
             whitespace of the joined result removed
    """
    joined = ' '.join(entry['text'] for entry in transcript)
    return joined.strip()


def fetch_transcript(url: str) -> list:
    """
    Fetch the transcript of the YouTube video behind the given link.

    input  : url (str)
    outputs: transcript (list) as returned by
             YouTubeTranscriptApi.get_transcript for the link's video id
    raises : whatever validate_youtube_link raises for the url;
             Exception('YouTube Request Failed, try again later.') when the
             request raises YouTubeRequestFailed
    """
    video_id = validate_youtube_link(url)

    try:
        transcript: list = YouTubeTranscriptApi.get_transcript(video_id=video_id)
    except YouTubeRequestFailed as err:
        # Chain the library error so the underlying cause stays attached to
        # the friendlier message raised here.
        raise Exception('YouTube Request Failed, try again later.') from err

    return transcript



if __name__ == '__main__':
    # Manual smoke run: fetch one transcript, pass its texts through
    # stride_sentences and print the first element of the result.
    sample = 'https://www.youtube.com/watch?v=t6V9i8fFADI'
    sample2 = 'https://www.youtube.com/watch?v=1nLHIM2IPRY'
    fake_sample = 'https://www.youtube.com/watch?v=asdf3'
    transcript = fetch_transcript(url=sample)

    # zip_transcript returns a dict; tuple-unpacking it would yield the key
    # names ('timestamps', 'texts') rather than the lists, so index it.
    # NOTE(review): assumes stride_sentences expects the list of caption
    # texts — confirm against preprocessing.stride_sentences.
    zipped = zip_transcript(transcript)
    texts = stride_sentences(zipped['texts'])
    print(texts[0])