Oshchepkov committed on
Commit
a88932d
1 Parent(s): 18dce9e

Add application file

Browse files
Files changed (1) hide show
  1. app.py +47 -0
app.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ # https://pypi.org/project/youtube-transcript-api/
3
+ from youtube_transcript_api import YouTubeTranscriptApi
4
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
5
+
6
+
7
def get_video_id(url: str) -> str:
    """Extract the video id from a standard YouTube watch URL.

    Adapted from
    https://github.com/KennethSC/YouTube-Captions-Formatter/blob/master/YouTube_Captions.py

    Only URLs that contain ``https://www.youtube.com/watch?v=`` are accepted.
    The id is the text that follows ``watch?v=`` up to (but not including)
    the next ``&`` or ``#``, so trailing parameters such as ``&t=42s`` or
    ``&list=...`` do not leak into the returned value.

    Args:
        url: Address of the video, e.g. as pasted by the user.

    Returns:
        The video id taken from the ``v`` parameter.

    Raises:
        ValueError: If ``url`` is not a ``www.youtube.com`` watch URL or the
            id part is empty. ``ValueError`` is a subclass of ``Exception``,
            so callers that catch ``Exception`` keep working.
    """
    split_at = 'watch?v='
    if 'https://www.youtube.com/watch?v=' not in url:
        raise ValueError("This is not a valid video URL")

    video_id = url.partition(split_at)[2]
    # Drop any further query parameters or a fragment that follow the id.
    for separator in ('&', '#'):
        video_id = video_id.partition(separator)[0]
    video_id = video_id.strip()

    if not video_id:
        # "…/watch?v=" with nothing after it is not a usable video URL.
        raise ValueError("This is not a valid video URL")
    return video_id
17
+
18
+
19
def get_youtube_subtitle(video_id: str) -> str:
    """Fetch the Russian transcript of a video and join it into one string.

    Transcript entries whose text both starts with ``[`` and ends with ``]``
    are skipped (presumably non-speech cues such as "[music]" -- confirm
    against real transcripts). Every remaining entry is prefixed with a
    single space and the pieces are concatenated, so a non-empty result
    starts with a space.

    Args:
        video_id: YouTube video id to request the transcript for.

    Returns:
        The concatenated caption text; an empty string if every entry was
        skipped or the transcript was empty.

    Raises:
        Exception: If the transcript request fails for any reason. The
            underlying error is attached as ``__cause__``.
    """
    # Keep the try body limited to the API call so that only a failed
    # transcript request is reported as "no captions".
    try:
        entries = YouTubeTranscriptApi.get_transcript(video_id, languages=['ru'])
    except Exception as err:
        raise Exception("This video doesn't have a captions transcript") from err

    pieces = []
    for entry in entries:
        text = entry['text']
        # startswith/endswith are safe on an empty caption, unlike text[0].
        if text.startswith('[') and text.endswith(']'):
            continue
        pieces.append(' ' + text)
    return ''.join(pieces)
30
+
31
+
32
# Page flow: read a video URL, fetch its captions, summarize them, show both.
url = st.text_input('Enter the URL of the Youtube video', 'https://www.youtube.com/watch?v=yR4VmxwZh0s')

# Show a readable message instead of a raw traceback when the URL is invalid
# or the transcript cannot be fetched, then halt the rest of the script.
try:
    video_id = get_video_id(url)
    subtitle = get_youtube_subtitle(video_id)
except Exception as err:
    st.error(str(err))
    st.stop()

st.write('Video_id', video_id)

st.text(subtitle)

# NOTE(review): 'summarize1' is passed straight to from_pretrained; whether
# it is a local directory or a hub model id cannot be told from this file.
m_name = 'summarize1'
tokenizer = AutoTokenizer.from_pretrained(m_name)
# The slice limits the input to the first 1024 *characters* of the captions,
# not 1024 tokens; the rest of the transcript is not summarized.
inputs = tokenizer(subtitle[:1024], return_tensors="pt").input_ids

# NOTE(review): tokenizer and model are loaded each time this script runs;
# consider caching them if the installed Streamlit version supports it.
model = AutoModelForSeq2SeqLM.from_pretrained(m_name)
# Deterministic decoding (no sampling), at most 100 newly generated tokens.
outputs = model.generate(inputs, max_new_tokens=100, do_sample=False)

summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
# The heading sits directly above the generated summary, so label it as such
# (it previously read 'subtitle').
st.write('Summary')
st.text(summary)