Spaces:
Runtime error
Runtime error
File size: 4,137 Bytes
d8e07ba |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 |
from transformers import BartForConditionalGeneration, BartTokenizer
import streamlit as st
import torch
from transformers import AutoProcessor, WhisperForConditionalGeneration
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torchaudio
from transformers import pipeline
# Sample rate (Hz) the waveform is resampled to before it is handed to the
# speech-recognition pipeline.
TARGET_SAMPLE_RATE = 16000

# Per-language settings: which checkpoints/classes to load and which strings
# to show.  A language that is offered in the selector below but has no entry
# here is reported as unsupported instead of being silently ignored.
LANGUAGE_SETTINGS = {
    "English": {
        "asr_model": "facebook/wav2vec2-base-960h",
        "summary_model": "facebook/bart-large-cnn",
        "tokenizer_cls": BartTokenizer,
        "model_cls": BartForConditionalGeneration,
        "tokenizer_kwargs": {},
        "transcript_label": "Transcription",
        "summary_heading": "Here is your summary!",
    },
    "Spanish": {
        "asr_model": "Sandiago21/whisper-large-v2-spanish",
        "summary_model": "facebook/mbart-large-50",
        "tokenizer_cls": AutoTokenizer,
        "model_cls": AutoModelForSeq2SeqLM,
        # NOTE(review): mBART-50 generation usually also expects a
        # forced_bos_token_id for the target language -- confirm against the
        # model documentation before relying on the Spanish summaries.
        "tokenizer_kwargs": {"src_lang": "es_XX"},
        "transcript_label": "Aqui tienes tu transcripción:",
        "summary_heading": "Aqui tienes tu resumen!",
    },
}


def transcribe_audio(audio_file, model_name="facebook/wav2vec2-base-960h"):
    """Transcribe an audio file with a Hugging Face ASR pipeline.

    Args:
        audio_file: Anything ``torchaudio.load`` accepts; here it is the
            object returned by ``st.file_uploader``.
        model_name: Checkpoint passed to
            ``pipeline("automatic-speech-recognition", ...)``.

    Returns:
        Whatever the pipeline returns for the waveform; the caller reads its
        ``'text'`` entry.
    """
    waveform, sample_rate = torchaudio.load(audio_file)

    # Down-mix multi-channel audio to a single channel by averaging.
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    # Resample only when the file is not already at the target rate.
    if sample_rate != TARGET_SAMPLE_RATE:
        resampler = torchaudio.transforms.Resample(sample_rate, TARGET_SAMPLE_RATE)
        waveform = resampler(waveform)

    # NOTE(review): the pipeline (and its weights) is rebuilt on every call;
    # consider caching it if the Streamlit version in use offers a resource
    # cache.
    asr_pipeline = pipeline("automatic-speech-recognition", model=model_name)

    # The waveform keeps a leading channel axis of size 1; index it away so
    # the pipeline receives a 1-D array of samples.
    return asr_pipeline(waveform.numpy()[0])


def summarize_text(text, model, tokenizer, max_length=100):
    """Summarize ``text`` with a seq2seq ``model`` and its ``tokenizer``.

    Args:
        text: The text to summarize.
        model: Object exposing ``generate(input_ids, ...)``.
        tokenizer: Object exposing ``encode`` and ``decode``.
        max_length: Upper bound on the generated summary length (in tokens),
            forwarded to ``model.generate``.

    Returns:
        The decoded summary string with special tokens removed.
    """
    # truncation=True clips over-long transcripts to the tokenizer's maximum
    # input length instead of letting generate() fail on them.
    input_ids = tokenizer.encode(text, return_tensors="pt", truncation=True)
    summary_ids = model.generate(
        input_ids,
        max_length=max_length,
        num_beams=4,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


# Load your own audio file
audio = st.file_uploader(label="Upload your audio file here", type=["wav", 'mp3'])
option_language = st.selectbox(
    'Select the language of your audio',
    ('English', 'Spanish', 'German', 'French', 'Chinese'))

if audio is None:
    st.write("Please upload the audio in the box above")
elif option_language not in LANGUAGE_SETTINGS:
    # German, French and Chinese are selectable but have no models configured.
    st.write(f"Sorry, {option_language} audio is not supported yet.")
else:
    settings = LANGUAGE_SETTINGS[option_language]

    transcription = transcribe_audio(audio, settings["asr_model"])
    # Goes to the server's stdout, not to the page.
    print(settings["transcript_label"], transcription)

    ## Initiate Summary Model
    tokenizer_summary = settings["tokenizer_cls"].from_pretrained(
        settings["summary_model"], **settings["tokenizer_kwargs"])
    model_summary = settings["model_cls"].from_pretrained(settings["summary_model"])

    summary = summarize_text(transcription['text'], model_summary, tokenizer_summary)
    st.write(settings["summary_heading"])
    st.write(summary)