# Spaces: Runtime error  (page-header text captured along with the source; not part of the program)
import streamlit as st
import torch
import torchaudio
from transformers import AutoProcessor, WhisperForConditionalGeneration
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import BartForConditionalGeneration, BartTokenizer
from transformers import pipeline
# Streamlit app: upload an audio file, transcribe it with a speech-recognition
# model chosen by language, then summarise the transcript.

# Sample rate (Hz) every upload is resampled to before transcription.
TARGET_SAMPLE_RATE = 16000

# Window length (seconds) the ASR pipeline uses to split long recordings, so
# uploads longer than a single model window are still transcribed in full.
ASR_CHUNK_SECONDS = 30


def _load_english_summarizer():
    """Return the (tokenizer, model) pair used to summarise English text."""
    tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
    model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
    return tokenizer, model


def _load_spanish_summarizer():
    """Return the (tokenizer, model) pair used to summarise Spanish text."""
    # NOTE(review): "facebook/mbart-large-50" looks like a general pretrained
    # checkpoint rather than one fine-tuned for summarisation -- confirm that
    # its output quality is acceptable or swap in a summarisation checkpoint.
    tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50", src_lang="es_XX")
    model = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-50")
    return tokenizer, model


# Per-language configuration.  Languages offered in the select box but absent
# from this table are reported to the user as unsupported.
LANGUAGE_SETTINGS = {
    "English": {
        "asr_model": "facebook/wav2vec2-base-960h",
        "load_summarizer": _load_english_summarizer,
        "transcript_label": "Transcription",
        "summary_heading": "Here is your summary!",
    },
    "Spanish": {
        "asr_model": "Sandiago21/whisper-large-v2-spanish",
        "load_summarizer": _load_spanish_summarizer,
        "transcript_label": "Aqui tienes tu transcripción:",
        "summary_heading": "Aqui tienes tu resumen!",
    },
}


# Streamlit re-executes the script on every widget interaction; caching keeps
# the models from being re-instantiated each time.  max_entries=1 bounds
# memory to the most recently used checkpoint.
@st.cache_resource(max_entries=1)
def _load_asr_pipeline(model_name):
    """Build the speech-recognition pipeline for the checkpoint *model_name*."""
    return pipeline("automatic-speech-recognition", model=model_name)


@st.cache_resource(max_entries=1)
def _load_summarizer(language):
    """Load the summarisation (tokenizer, model) pair configured for *language*."""
    return LANGUAGE_SETTINGS[language]["load_summarizer"]()


def transcribe_audio(audio_file, model_name="facebook/wav2vec2-base-960h"):
    """Transcribe an audio file with a Hugging Face ASR pipeline.

    Args:
        audio_file: Anything ``torchaudio.load`` accepts (here: the object
            returned by ``st.file_uploader``).
        model_name: ASR checkpoint to use; defaults to the English model.

    Returns:
        The pipeline's result; the caller reads its ``"text"`` entry.
    """
    waveform, sample_rate = torchaudio.load(audio_file)

    # Down-mix multi-channel audio to a single channel by averaging.
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    if sample_rate != TARGET_SAMPLE_RATE:
        resample = torchaudio.transforms.Resample(sample_rate, TARGET_SAMPLE_RATE)
        waveform = resample(waveform)

    # Drop the channel axis; the samples are already at TARGET_SAMPLE_RATE,
    # which is what the pipeline assumes for a bare array.
    samples = waveform.squeeze(0).numpy()

    asr_pipeline = _load_asr_pipeline(model_name)
    return asr_pipeline(samples, chunk_length_s=ASR_CHUNK_SECONDS)


def summarize_text(text, model, tokenizer, max_length=100):
    """Summarise *text* with a seq2seq *model* and its *tokenizer*.

    Args:
        text: Text to summarise.
        model: Model exposing ``generate``.
        tokenizer: Tokenizer exposing ``encode`` and ``decode``.
        max_length: Upper bound passed to ``generate`` for the summary length.

    Returns:
        The decoded summary, or ``""`` when *text* is empty or whitespace.
    """
    if not text or not text.strip():
        return ""

    # Truncate to the tokenizer's maximum length so a long transcript cannot
    # overflow the model's input window.
    input_ids = tokenizer.encode(text, return_tensors="pt", truncation=True)
    summary_ids = model.generate(
        input_ids,
        max_length=max_length,
        num_beams=4,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


# --- Page flow ---------------------------------------------------------------

# Load your own audio file
audio = st.file_uploader(label="Upload your audio file here", type=["wav", "mp3"])
option_language = st.selectbox(
    "Select the language of your audio",
    ("English", "Spanish", "German", "French", "Chinese"),
)

if audio is None:
    st.write("Please upload the audio in the box above")
elif option_language not in LANGUAGE_SETTINGS:
    # Previously these selections fell through and showed nothing at all.
    st.write(f"Sorry, {option_language} audio is not supported yet.")
else:
    settings = LANGUAGE_SETTINGS[option_language]

    transcription = transcribe_audio(audio, settings["asr_model"])
    print(settings["transcript_label"], transcription)

    tokenizer_summary, model_summary = _load_summarizer(option_language)
    summary = summarize_text(transcription["text"], model_summary, tokenizer_summary)

    st.write(settings["summary_heading"])
    st.write(summary)