Spaces:
Runtime error
Runtime error
alankabisov
committed on
Commit
·
179d87b
1
Parent(s):
9292468
added ui and refactoring
Browse files- .gitignore +1 -0
- .streamlit/config.toml +4 -0
- app.py +92 -49
- requirements.txt +2 -1
.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
.idea/
|
.streamlit/config.toml
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[theme]
|
2 |
+
backgroundColor="#d2e7ee"
|
3 |
+
secondaryBackgroundColor="#79c1ee"
|
4 |
+
textColor="#151516"
|
app.py
CHANGED
@@ -1,22 +1,20 @@
|
|
1 |
import os
|
2 |
|
3 |
-
|
4 |
import streamlit as st
|
5 |
from urllib.parse import urlparse, parse_qs
|
6 |
|
7 |
-
from tqdm import tqdm
|
8 |
from stqdm import stqdm
|
9 |
|
10 |
# https://github.com/pytorch/pytorch/issues/77764
|
11 |
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
|
12 |
|
13 |
-
from youtube_transcript_api import YouTubeTranscriptApi
|
14 |
|
15 |
-
from transformers import pipeline
|
16 |
|
17 |
import torch
|
18 |
|
19 |
-
# Setting device for
|
20 |
if torch.cuda.is_available():
|
21 |
device = torch.device('cuda')
|
22 |
elif torch.has_mps:
|
@@ -25,47 +23,79 @@ else:
|
|
25 |
device = torch.device('cpu')
|
26 |
|
27 |
|
|
|
|
|
|
|
28 |
|
29 |
-
def get_videoid_from_url(url:str):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
url_data = urlparse(url)
|
31 |
query = parse_qs(url_data.query)
|
32 |
|
33 |
-
|
34 |
video_id = query["v"][0]
|
35 |
-
|
36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
|
38 |
return video_id
|
39 |
|
40 |
-
def process_click_callback():
|
41 |
-
st.session_state.process_btn = True
|
42 |
|
43 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
|
45 |
-
transcript_list = YouTubeTranscriptApi.list_transcripts(
|
46 |
|
47 |
try:
|
48 |
transcript = transcript_list.find_manually_created_transcript(['en'])
|
49 |
-
except
|
50 |
-
|
51 |
transcript = transcript_list.find_generated_transcript(['en'])
|
52 |
|
53 |
subtitles = transcript.fetch()
|
54 |
|
55 |
subtitles = [sbt['text'] for sbt in subtitles if sbt['text'] != '[Music]']
|
56 |
-
subtitles_len = [len(sbt) for sbt in subtitles]
|
57 |
-
sbt_mean_len = sum(subtitles_len)/len(subtitles_len)
|
58 |
|
59 |
-
|
60 |
-
|
61 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
62 |
|
63 |
# Number of subtitles per step/summary
|
64 |
# Since number length of transcripts differs
|
65 |
# between generated and manual ones
|
66 |
# we set different step size
|
67 |
n_sbt_per_step = int(400 / (sbt_mean_len / 4))
|
68 |
-
print('Number subtitles per summary: {}'.format(n_sbt_per_step))
|
69 |
|
70 |
n_steps = len(subtitles) // n_sbt_per_step if len(subtitles) % n_sbt_per_step == 0 else \
|
71 |
len(subtitles) // n_sbt_per_step + 1
|
@@ -73,9 +103,7 @@ def process_click_callback():
|
|
73 |
summaries = []
|
74 |
|
75 |
for i in stqdm(range(n_steps)):
|
76 |
-
sbt_txt = ' '.join(subtitles[n_sbt_per_step*i:n_sbt_per_step*(i+1)])
|
77 |
-
# print('length of text: {}'.format(len(sbt_txt)))
|
78 |
-
# print(sbt_txt)
|
79 |
|
80 |
summarizer = pipeline('summarization', model='t5-small', tokenizer='t5-small',
|
81 |
max_length=512, truncation=True)
|
@@ -83,44 +111,59 @@ def process_click_callback():
|
|
83 |
summary = summarizer(sbt_txt, do_sample=False)
|
84 |
summary = summary[0]['summary_text']
|
85 |
|
86 |
-
# print('Summary: ' + summary)
|
87 |
summaries.append(summary)
|
88 |
|
89 |
-
|
90 |
-
print(out)
|
91 |
|
92 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
93 |
st.success('Processing complete!', icon="β
")
|
94 |
-
st.session_state.process_btn = False
|
95 |
|
|
|
96 |
|
97 |
|
98 |
-
|
|
|
|
|
|
|
99 |
st.title('YouTube Video Summary π')
|
100 |
st.markdown('Creates summary for given YouTube video URL based on transcripts.')
|
101 |
-
st.code('https://www.youtube.com/watch?v=
|
102 |
-
st.code('https://youtu.be/
|
103 |
|
104 |
col1, col2 = st.columns(2)
|
105 |
|
106 |
with col1:
|
107 |
-
|
108 |
-
|
109 |
-
st.write(get_videoid_from_url(video_url))
|
110 |
|
111 |
with col2:
|
112 |
-
st.button('Process
|
113 |
-
|
114 |
-
st.text_area(label='', key='summary_output', height=444)
|
115 |
-
|
116 |
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
# x = st.slider('Select a value')
|
122 |
-
# st.write(x, 'squared is', x * x)
|
123 |
-
|
124 |
-
|
125 |
-
if __name__ == "__main__":
|
126 |
-
main()
|
|
|
1 |
import os
|
2 |
|
|
|
3 |
import streamlit as st
|
4 |
from urllib.parse import urlparse, parse_qs
|
5 |
|
|
|
6 |
from stqdm import stqdm
|
7 |
|
8 |
# https://github.com/pytorch/pytorch/issues/77764
|
9 |
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
|
10 |
|
11 |
+
from youtube_transcript_api import YouTubeTranscriptApi, NoTranscriptFound, TranscriptsDisabled
|
12 |
|
13 |
+
from transformers import pipeline
|
14 |
|
15 |
import torch
|
16 |
|
17 |
+
# Setting device for PyTorch
|
18 |
if torch.cuda.is_available():
|
19 |
device = torch.device('cuda')
|
20 |
elif torch.has_mps:
|
|
|
23 |
device = torch.device('cpu')
|
24 |
|
25 |
|
26 |
+
class InvalidURLException(Exception):
    '''Raised when a string cannot be interpreted as a YouTube video URL.'''


def get_videoid_from_url(url: str) -> str:
    '''
    Gets video ID from given YouTube video URL

    Two URL shapes are accepted:
      * standard: https://www.youtube.com/watch?v=<id>
      * short:    https://youtu.be/<id>

    :param url: YouTube video URL in 2 formats (standard and short)
    :return: id of YouTube video
    :raises InvalidURLException: If URL is not valid
    '''
    try:
        url_data = urlparse(url)
    except ValueError as err:
        # urlparse rejects some malformed inputs (e.g. an unterminated
        # IPv6 bracket); report them through the documented exception.
        raise InvalidURLException('Invalid URL') from err

    # hostname drops any port/credentials; it is None when the URL has no
    # network location (for example when the scheme is missing).
    host = (url_data.hostname or '').lower()

    if host == 'youtube.com' or host.endswith('.youtube.com'):
        # parse_qs omits blank values, so "?v=" yields no 'v' key at all.
        video_id = parse_qs(url_data.query).get('v', [''])[0]
    elif host == 'youtu.be' or host.endswith('.youtu.be'):
        # Use the parsed path so query strings ("?t=42") and fragments are
        # not mistaken for part of the ID; tolerate a trailing slash.
        video_id = url_data.path.rstrip('/').rsplit('/', 1)[-1]
    else:
        raise InvalidURLException('Invalid URL')

    if not video_id:
        raise InvalidURLException('Invalid URL')

    return video_id
|
54 |
|
|
|
|
|
55 |
|
56 |
+
def get_transcripts(url: str) -> list:
    '''
    Loads transcripts for given URL

    Prefers a manually created English transcript and falls back to the
    auto-generated English one when no manual transcript exists. Entries
    whose text is exactly '[Music]' are dropped.

    :param url: YouTube video URL
    :return: list, list of subtitles
    :raises InvalidURLException: If the video ID cannot be extracted from url
    '''
    # Resolve the ID from the argument rather than from module-level state,
    # so the function works for any caller-supplied URL.
    video_id = get_videoid_from_url(url)

    transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)

    try:
        transcript = transcript_list.find_manually_created_transcript(['en'])
    except NoTranscriptFound:
        st.info('No manual transcripts were found, trying to load generated ones...')
        # Left unguarded on purpose: if this lookup also fails, the
        # exception propagates to the caller.
        transcript = transcript_list.find_generated_transcript(['en'])

    subtitles = transcript.fetch()

    return [sbt['text'] for sbt in subtitles if sbt['text'] != '[Music]']
|
79 |
+
|
80 |
+
|
81 |
+
def generate_summary(subtitles: list):
|
82 |
+
'''
|
83 |
+
Creates summary based on subtitles of YouTube video.
|
84 |
+
|
85 |
+
Uses T5-small model which shows best results for different topics
|
86 |
+
of videos.
|
87 |
+
|
88 |
+
:param subtitles: list of subtitles strings
|
89 |
+
:return: summary based on subtitles
|
90 |
+
'''
|
91 |
+
subtitles_len = [len(sbt) for sbt in subtitles]
|
92 |
+
sbt_mean_len = sum(subtitles_len) / len(subtitles_len)
|
93 |
|
94 |
# Number of subtitles per step/summary
|
95 |
# Since number length of transcripts differs
|
96 |
# between generated and manual ones
|
97 |
# we set different step size
|
98 |
n_sbt_per_step = int(400 / (sbt_mean_len / 4))
|
|
|
99 |
|
100 |
n_steps = len(subtitles) // n_sbt_per_step if len(subtitles) % n_sbt_per_step == 0 else \
|
101 |
len(subtitles) // n_sbt_per_step + 1
|
|
|
103 |
summaries = []
|
104 |
|
105 |
for i in stqdm(range(n_steps)):
|
106 |
+
sbt_txt = ' '.join(subtitles[n_sbt_per_step * i:n_sbt_per_step * (i + 1)])
|
|
|
|
|
107 |
|
108 |
summarizer = pipeline('summarization', model='t5-small', tokenizer='t5-small',
|
109 |
max_length=512, truncation=True)
|
|
|
111 |
summary = summarizer(sbt_txt, do_sample=False)
|
112 |
summary = summary[0]['summary_text']
|
113 |
|
|
|
114 |
summaries.append(summary)
|
115 |
|
116 |
+
return ' '.join(summaries)
|
|
|
117 |
|
118 |
+
|
119 |
+
def process_click_callback():
    '''
    Callback for process button click

    Reads the module-level ``video_url_inp``, loads the video's subtitles,
    summarises them and stores the result in
    ``st.session_state.summary_output``. The module-level ``is_processing``
    flag is used to ignore a click while a previous one is being handled.
    '''
    global is_processing

    if is_processing:
        return

    is_processing = True

    try:
        subtitles = get_transcripts(video_url_inp)
    except InvalidURLException:
        # Streamlit already reruns the script after a widget callback, so
        # no explicit rerun is requested here; simply stop processing.
        st.error('Invalid YouTube URL, please provide URL in format that is shown on Examples')
    except (TranscriptsDisabled, NoTranscriptFound):
        # NoTranscriptFound can still surface from the generated-transcript
        # fallback inside get_transcripts.
        st.error('Could not retrieve a transcript for given ID')
    else:
        if not subtitles:
            # generate_summary divides by the number of subtitles, so an
            # empty list must not be passed on.
            st.error('Could not retrieve a transcript for given ID')
        else:
            st.session_state.summary_output = generate_summary(subtitles)
            st.success('Processing complete!', icon="✅")
    finally:
        # Always release the flag, including on unexpected errors.
        is_processing = False
|
149 |
|
150 |
|
151 |
+
if __name__ == "__main__":
|
152 |
+
# State of processing
|
153 |
+
is_processing = False
|
154 |
+
|
155 |
st.title('YouTube Video Summary π')
|
156 |
st.markdown('Creates summary for given YouTube video URL based on transcripts.')
|
157 |
+
st.code('https://www.youtube.com/watch?v=skl4OXNA12U')
|
158 |
+
st.code('https://youtu.be/mEQc-iAbEBk')
|
159 |
|
160 |
col1, col2 = st.columns(2)
|
161 |
|
162 |
with col1:
|
163 |
+
video_url_inp = st.text_input('YouTube Video URL:', placeholder='YouTube URL',
|
164 |
+
label_visibility='collapsed')
|
|
|
165 |
|
166 |
with col2:
|
167 |
+
process_btn = st.button('ποΈProcess', key='process_btn', on_click=process_click_callback)
|
|
|
|
|
|
|
168 |
|
169 |
+
summary_out_txt = st.text_area(label='', key='summary_output', height=400)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
requirements.txt
CHANGED
@@ -2,4 +2,5 @@ torch
|
|
2 |
transformers
|
3 |
youtube_transcript_api
|
4 |
tqdm
|
5 |
-
stqdm
|
|
|
|
2 |
transformers
|
3 |
youtube_transcript_api
|
4 |
tqdm
|
5 |
+
stqdm
|
6 |
+
streamlit
|