Spaces:
Runtime error
Runtime error
Commit
·
3299970
1
Parent(s):
2937856
Update app.py
Browse files
app.py
CHANGED
@@ -1,17 +1,29 @@
|
|
|
|
1 |
import gradio as gr
|
2 |
import whisper
|
3 |
from pytube import YouTube
|
4 |
from transformers import pipeline, T5Tokenizer, T5ForConditionalGeneration
|
5 |
from wordcloud import WordCloud
|
6 |
|
7 |
-
|
8 |
class GradioInference:
|
9 |
def __init__(self):
|
|
|
|
|
10 |
self.sizes = list(whisper._MODELS.keys())
|
|
|
|
|
11 |
self.langs = ["none"] + sorted(list(whisper.tokenizer.LANGUAGES.values()))
|
|
|
|
|
12 |
self.current_size = "base"
|
|
|
|
|
13 |
self.loaded_model = whisper.load_model(self.current_size)
|
|
|
|
|
14 |
self.yt = None
|
|
|
|
|
15 |
self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
|
16 |
|
17 |
# Initialize VoiceLabT5 model and tokenizer
|
@@ -26,8 +38,20 @@ class GradioInference:
|
|
26 |
self.classifier = pipeline("text-classification")
|
27 |
|
28 |
def __call__(self, link, lang, size):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
if self.yt is None:
|
30 |
self.yt = YouTube(link)
|
|
|
|
|
31 |
path = self.yt.streams.filter(only_audio=True)[0].download(filename="tmp.mp4")
|
32 |
|
33 |
if lang == "none":
|
@@ -37,6 +61,7 @@ class GradioInference:
|
|
37 |
self.loaded_model = whisper.load_model(size)
|
38 |
self.current_size = size
|
39 |
|
|
|
40 |
results = self.loaded_model.transcribe(path, language=lang)
|
41 |
|
42 |
# Perform summarization on the transcription
|
@@ -56,8 +81,13 @@ class GradioInference:
|
|
56 |
predicted = self.keyword_tokenizer.decode(output[0], skip_special_tokens=True)
|
57 |
keywords = [x.strip() for x in predicted.split(",") if x.strip()]
|
58 |
|
|
|
59 |
label = self.classifier(results["text"])[0]["label"]
|
|
|
|
|
60 |
wordcloud = WordCloud().generate(results["text"])
|
|
|
|
|
61 |
wordcloud_image = wordcloud.to_image()
|
62 |
|
63 |
return (
|
@@ -69,10 +99,24 @@ class GradioInference:
|
|
69 |
)
|
70 |
|
71 |
def populate_metadata(self, link):
|
|
|
|
|
|
|
|
|
|
|
72 |
self.yt = YouTube(link)
|
73 |
return self.yt.thumbnail_url, self.yt.title
|
74 |
|
75 |
def from_audio_input(self, lang, size, audio_file):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
76 |
if lang == "none":
|
77 |
lang = None
|
78 |
|
@@ -99,7 +143,10 @@ class GradioInference:
|
|
99 |
predicted = self.keyword_tokenizer.decode(output[0], skip_special_tokens=True)
|
100 |
keywords = [x.strip() for x in predicted.split(",") if x.strip()]
|
101 |
|
|
|
102 |
label = self.classifier(results["text"])[0]["label"]
|
|
|
|
|
103 |
wordcloud = WordCloud().generate(
|
104 |
results["text"]
|
105 |
)
|
@@ -161,12 +208,10 @@ with block as demo:
|
|
161 |
label="Keywords", placeholder="Keywords Output...", lines=5
|
162 |
).style(show_copy_button=True, container=True)
|
163 |
label = gr.Label(label="Sentiment Analysis")
|
164 |
-
with gr.Row().style(equal_height=True):
|
165 |
-
# Display the Word Cloud
|
166 |
wordcloud_image = gr.Image()
|
167 |
with gr.Row().style(equal_height=True):
|
168 |
clear = gr.ClearButton(
|
169 |
-
[link, title, img, text, summary, keywords, label], scale=1
|
170 |
)
|
171 |
btn = gr.Button("Get video insights", variant="primary", scale=1)
|
172 |
btn.click(
|
@@ -200,11 +245,12 @@ with block as demo:
|
|
200 |
label="Keywords", placeholder="Keywords Output", lines=5
|
201 |
)
|
202 |
label = gr.Label(label="Sentiment Analysis")
|
|
|
203 |
with gr.Row().style(equal_height=True):
|
204 |
-
clear = gr.ClearButton([text], scale=1)
|
205 |
btn = gr.Button(
|
206 |
"Get video insights", variant="primary", scale=1
|
207 |
-
)
|
208 |
btn.click(
|
209 |
gio.from_audio_input,
|
210 |
inputs=[lang, size, audio_file],
|
|
|
1 |
+
# Imports
|
2 |
import gradio as gr
|
3 |
import whisper
|
4 |
from pytube import YouTube
|
5 |
from transformers import pipeline, T5Tokenizer, T5ForConditionalGeneration
|
6 |
from wordcloud import WordCloud
|
7 |
|
|
|
8 |
class GradioInference:
|
9 |
def __init__(self):
|
10 |
+
|
11 |
+
# OpenAI's Whisper model sizes
|
12 |
self.sizes = list(whisper._MODELS.keys())
|
13 |
+
|
14 |
+
# Whisper's available languages for ASR
|
15 |
self.langs = ["none"] + sorted(list(whisper.tokenizer.LANGUAGES.values()))
|
16 |
+
|
17 |
+
# Default size
|
18 |
self.current_size = "base"
|
19 |
+
|
20 |
+
# Load the Whisper model at the default size
|
21 |
self.loaded_model = whisper.load_model(self.current_size)
|
22 |
+
|
23 |
+
# Initialize Pytube Object
|
24 |
self.yt = None
|
25 |
+
|
26 |
+
# Initialize summary model
|
27 |
self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
|
28 |
|
29 |
# Initialize VoiceLabT5 model and tokenizer
|
|
|
38 |
self.classifier = pipeline("text-classification")
|
39 |
|
40 |
def __call__(self, link, lang, size):
|
41 |
+
"""
|
42 |
+
Call the Gradio Inference python class.
|
43 |
+
This method accesses a YouTube video through the Pytube library and downloads its audio.
|
44 |
+
Then it uses the Whisper model to perform Automatic Speech Recognition (i.e., Speech-to-Text).
|
45 |
+
Once the function has the transcription of the video, it processes it to obtain:
|
46 |
+
- Summary: using Facebook's BART transformer.
|
47 |
+
- KeyWords: using VoiceLabT5 keyword extractor.
|
48 |
+
- Sentiment Analysis: using Hugging Face's default sentiment classifier
|
49 |
+
- WordCloud: using the wordcloud python library.
|
50 |
+
"""
|
51 |
if self.yt is None:
|
52 |
self.yt = YouTube(link)
|
53 |
+
|
54 |
+
# Use the Pytube library to access the YouTube audio stream
|
55 |
path = self.yt.streams.filter(only_audio=True)[0].download(filename="tmp.mp4")
|
56 |
|
57 |
if lang == "none":
|
|
|
61 |
self.loaded_model = whisper.load_model(size)
|
62 |
self.current_size = size
|
63 |
|
64 |
+
# Transcribe the audio extracted from pytube
|
65 |
results = self.loaded_model.transcribe(path, language=lang)
|
66 |
|
67 |
# Perform summarization on the transcription
|
|
|
81 |
predicted = self.keyword_tokenizer.decode(output[0], skip_special_tokens=True)
|
82 |
keywords = [x.strip() for x in predicted.split(",") if x.strip()]
|
83 |
|
84 |
+
# Sentiment label
|
85 |
label = self.classifier(results["text"])[0]["label"]
|
86 |
+
|
87 |
+
# Generate WordCloud object
|
88 |
wordcloud = WordCloud().generate(results["text"])
|
89 |
+
|
90 |
+
# WordCloud image to display
|
91 |
wordcloud_image = wordcloud.to_image()
|
92 |
|
93 |
return (
|
|
|
99 |
)
|
100 |
|
101 |
def populate_metadata(self, link):
|
102 |
+
"""
|
103 |
+
Access the YouTube video's title and thumbnail image so they can be displayed.
|
104 |
+
params:
|
105 |
+
- link: a YouTube URL.
|
106 |
+
"""
|
107 |
self.yt = YouTube(link)
|
108 |
return self.yt.thumbnail_url, self.yt.title
|
109 |
|
110 |
def from_audio_input(self, lang, size, audio_file):
|
111 |
+
"""
|
112 |
+
Call the Gradio Inference python class.
|
113 |
+
Uses the Whisper model directly to perform Automatic Speech Recognition (i.e., Speech-to-Text).
|
114 |
+
Once the function has the transcription of the audio, it processes it to obtain:
|
115 |
+
- Summary: using Facebook's BART transformer.
|
116 |
+
- KeyWords: using VoiceLabT5 keyword extractor.
|
117 |
+
- Sentiment Analysis: using Hugging Face's default sentiment classifier
|
118 |
+
- WordCloud: using the wordcloud python library.
|
119 |
+
"""
|
120 |
if lang == "none":
|
121 |
lang = None
|
122 |
|
|
|
143 |
predicted = self.keyword_tokenizer.decode(output[0], skip_special_tokens=True)
|
144 |
keywords = [x.strip() for x in predicted.split(",") if x.strip()]
|
145 |
|
146 |
+
# Sentiment label
|
147 |
label = self.classifier(results["text"])[0]["label"]
|
148 |
+
|
149 |
+
# WordCloud object
|
150 |
wordcloud = WordCloud().generate(
|
151 |
results["text"]
|
152 |
)
|
|
|
208 |
label="Keywords", placeholder="Keywords Output...", lines=5
|
209 |
).style(show_copy_button=True, container=True)
|
210 |
label = gr.Label(label="Sentiment Analysis")
|
|
|
|
|
211 |
wordcloud_image = gr.Image()
|
212 |
with gr.Row().style(equal_height=True):
|
213 |
clear = gr.ClearButton(
|
214 |
+
[link, title, img, text, summary, keywords, label, wordcloud_image], scale=1
|
215 |
)
|
216 |
btn = gr.Button("Get video insights", variant="primary", scale=1)
|
217 |
btn.click(
|
|
|
245 |
label="Keywords", placeholder="Keywords Output", lines=5
|
246 |
)
|
247 |
label = gr.Label(label="Sentiment Analysis")
|
248 |
+
wordcloud_image = gr.Image()
|
249 |
with gr.Row().style(equal_height=True):
|
250 |
+
clear = gr.ClearButton([audio_file,text, summary, keywords, label, wordcloud_image], scale=1)
|
251 |
btn = gr.Button(
|
252 |
"Get video insights", variant="primary", scale=1
|
253 |
+
)
|
254 |
btn.click(
|
255 |
gio.from_audio_input,
|
256 |
inputs=[lang, size, audio_file],
|