Spaces:
Sleeping
Sleeping
import yt_dlp | |
import os | |
import streamlit as st | |
import transformers | |
from transformers import pipeline | |
from transformers import AutoTokenizer | |
import nltk | |
#Download youtube video | |
def download_audio(link):
    """Download the best available audio stream of a video.

    The result is written to the fixed output template ``video.mp3`` in the
    current working directory.

    Args:
        link: URL of the video to download (a single URL string).

    Note:
        NOTE(review): no post-processor is configured here, so the saved
        file presumably keeps whatever container ``bestaudio`` resolves to
        and is only *named* ``.mp3`` -- confirm before relying on the format.
        NOTE(review): ``'extract_audio'`` does not look like a documented
        ``YoutubeDL`` option and is kept only for parity -- confirm.
    """
    options = {
        'extract_audio': True,
        'format': 'bestaudio',
        'outtmpl': 'video.mp3',
    }
    with yt_dlp.YoutubeDL(options) as downloader:
        # download() is documented to take a list of URLs; wrap the single
        # link instead of relying on the library coercing a bare string.
        downloader.download([link])
#Load Whisper pipeline via HuggingFace | |
def load_whisper(seconds):
    """Build the Hugging Face speech-recognition pipeline for Whisper tiny.

    Args:
        seconds: Value forwarded to the pipeline as ``chunk_length_s``.

    Returns:
        The object returned by ``transformers.pipeline`` for the
        ``"automatic-speech-recognition"`` task.
    """
    asr_settings = {
        "model": "openai/whisper-tiny",
        "chunk_length_s": seconds,
    }
    return pipeline("automatic-speech-recognition", **asr_settings)
#Load Extractive Summarizer pipeline via HuggingFace | |
def load_extractive():
    """Build the Hugging Face summarization pipeline for the extractive model.

    The tokenizer is loaded explicitly from the same checkpoint as the model
    and handed to the pipeline, with ``trust_remote_code`` enabled.

    Returns:
        The object returned by ``transformers.pipeline`` for the
        ``"summarization"`` task.
    """
    checkpoint = "NotXia/longformer-bio-ext-summ"
    summ_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    extractive_pipeline = pipeline(
        "summarization",
        model=checkpoint,
        tokenizer=summ_tokenizer,
        trust_remote_code=True,
    )
    return extractive_pipeline
''' | |
def get_transcription_en(mp3_file): | |
model = whisper.load_model("tiny.en") | |
directory = os.getcwd() | |
result = model.transcribe(os.path.join(directory, mp3_file)) | |
return result["text"] | |
#def portuguese_sum_pipeline(link): | |
# video_title = download_audio(link) | |
def english_sum_pipeline(link): | |
download_audio(link) | |
transcript_text = get_transcription_en("video.mp3") | |
#extractive summarization | |
extractive_model = Summarizer() | |
extractive = extractive_model(transcript_text, num_sentences=15) | |
#abstractive summarization | |
device_num = 0 if torch.cuda.is_available() else -1 | |
abstractive_summarizer = pipeline("summarization", model="facebook/bart-large-cnn", tokenizer="facebook/bart-large-cnn", device=device_num) | |
output_text = abstractive_summarizer(extractive)[0]["summary_text"] | |
return transcript_text, output_text | |
def english_qa_pipeline(questions, context): | |
nlp = pipeline("question-answering", model='distilbert-base-uncased-distilled-squad') | |
answers = [] | |
for question in questions: | |
result = nlp(question=question, context=context) | |
answers.append(result["answer"]) | |
return answers | |
''' | |
def main():
    """Render the sidebar with the app title and the data-collection form.

    The form holds a compression-rate slider and a submit button. A success
    or error banner reports whether the form was submitted on this run.
    """
    # st.sidebar is a container object, not a function: it has to be entered
    # as ``with st.sidebar:``. Calling it (``st.sidebar()``) fails at runtime.
    with st.sidebar:
        st.title(":blue[Turing]Videos")
        with st.form("data_collection"):
            # The chosen rate is collected here but not consumed yet by any
            # step visible in this function.
            compression_rate = st.slider(
                "Selecione a taxa de compressão:",
                min_value=0.1, max_value=0.9,
                value=0.2, step=0.05,
            )
            # Every form must have a submit button.
            submitted = st.form_submit_button("Submit")
            if submitted:
                st.success('Dados coletados!', icon="✅")
            else:
                st.error('Dados ainda não coletados!', icon="🚨")


main()
''' | |
#Collect inputs and create the interface | |
def main(): | |
header = st.container() | |
model = st.container() | |
model_1, model_2 = st.columns(2) | |
qa = st.container() | |
qa_1, qa_2 = st.columns(2) | |
with header: | |
st.title("TuringVideos") | |
st.write("Este trabalho visa a criação de uma interface capaz de sumarizar e responder perguntas sobre um determinado vídeo em português ou inglês!") | |
with model: | |
st.header("Modelo para sumarização") | |
with model_1: | |
language = st.selectbox('Qual a linguagem do seu modelo?', ('Português (pt)', 'Inglês (en)', 'Outra')) | |
link = st.text_area(label="Coloque o link do seu vídeo do YouTube!", height=25, placeholder="Digite seu link...") | |
questions = st.text_area(label="Coloque suas perguntas separadas por vírgula!", height=50, placeholder="Digite suas perguntas...").split(",") | |
submit_1 = st.button('Gerar soluções!') | |
with model_2: | |
if submit_1: | |
with st.spinner('Wait for it...'): | |
if language == 'Português (pt)': | |
#outputs = portuguese_sum_pipeline(link) | |
st.write("Modelo ainda não implementado.") | |
elif language == 'Inglês (en)': | |
outputs = english_sum_pipeline(link) | |
answers = english_qa_pipeline(questions, outputs[0]) | |
else: | |
st.write("Erro na seleção de linguagem.") | |
st.write("Sumário.....................................................................: \n {} \n \n".format(outputs[1])) | |
st.write("Resposta....................................................................: \n") | |
for i in range(len(answers)): | |
st.write(questions[i] + ": " + answers[i]) | |
main() | |
''' |