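"""Streamlit app that transcribes an uploaded audio file with a Hugging Face
automatic-speech-recognition pipeline and then summarizes the transcript with a
seq2seq model. English and Spanish are the languages implemented below."""
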
import streamlit as st
import torch
import torchaudio
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    BartForConditionalGeneration,
    BartTokenizer,
    pipeline,
)

# Let the user upload an audio file (WAV or MP3)
audio = st.file_uploader(label="Upload your audio file here", type=["wav", "mp3"])

option_language = st.selectbox(
    'Select the language of your audio',
    ('English', 'Spanish', 'German', 'French', 'Chinese'))
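# Only English and Spanish are handled below; other selections currently produce no output.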

if audio is None:
    st.write("Please upload the audio in the box above")

else:
    if option_language == "English":
        def transcribe_audio(audio_file):
            # Load the audio file
            waveform, sample_rate = torchaudio.load(audio_file)

            # Ensure mono-channel audio
            if waveform.shape[0] > 1:
                waveform = torch.mean(waveform, dim=0, keepdim=True)

            # Resample to 16 kHz if needed (the rate the ASR model expects)
            if sample_rate != 16000:
                waveform = torchaudio.transforms.Resample(sample_rate, 16000)(waveform)
            
            # Use Hugging Face's ASR pipeline
            asr_pipeline = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h")

            # Transcribe the mono 16 kHz waveform (a float32 NumPy array);
            # the pipeline returns a dict with a "text" key
            transcript = asr_pipeline(waveform.numpy()[0])

            return transcript
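        # Note: the ASR pipeline above is rebuilt on every Streamlit rerun. Wrapping the
        # pipeline construction in a function decorated with st.cache_resource (available
        # in recent Streamlit releases) would avoid reloading the model each time.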

        transcription = transcribe_audio(audio)
        print("Transcription",transcription)

        ## Initialize the summarization model (BART fine-tuned on CNN/DailyMail)
        tokenizer_summary = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
        model_summary = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")


        def summarize_text(text, model, tokenizer, max_length=100):
            # Truncate the input to the model's maximum length so long transcripts don't raise errors
            input_ids = tokenizer.encode(text, return_tensors="pt", truncation=True)
            summary_ids = model.generate(input_ids, max_length=max_length, num_beams=4, early_stopping=True)
            return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


        summary = summarize_text(transcription['text'], model_summary, tokenizer_summary)
        st.write("Here is your summary!")
        st.write(summary)


    elif option_language == 'Spanish':

        def transcribe_audio(audio_file):
            
            # Load the audio file
            waveform, sample_rate = torchaudio.load(audio_file)

            # Ensure mono-channel audio
            if waveform.shape[0] > 1:
                waveform = torch.mean(waveform, dim=0, keepdim=True)

            # Resample to 16 kHz if needed (the rate the ASR model expects)
            if sample_rate != 16000:
                waveform = torchaudio.transforms.Resample(sample_rate, 16000)(waveform)
            
            # Use Hugging Face's ASR pipeline with a Spanish Whisper checkpoint
            asr_pipeline = pipeline("automatic-speech-recognition", model="Sandiago21/whisper-large-v2-spanish")

            # Transcribe the mono 16 kHz waveform (a float32 NumPy array);
            # the pipeline returns a dict with a "text" key
            transcript = asr_pipeline(waveform.numpy()[0])

            return transcript

        transcription = transcribe_audio(audio)
        print("Aqui tienes tu transcripción:",transcription)

        ## Initialize the summarization model (multilingual mBART-50)
        
        tokenizer_summary = AutoTokenizer.from_pretrained("facebook/mbart-large-50", src_lang="es_XX")
        model_summary = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-50")


        def summarize_text(text, model, tokenizer, max_length=100):
            # Truncate the input to the model's maximum length so long transcripts don't raise errors
            input_ids = tokenizer.encode(text, return_tensors="pt", truncation=True)
            summary_ids = model.generate(input_ids, max_length=max_length, num_beams=4, early_stopping=True)
            return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


        summary = summarize_text(transcription['text'], model_summary, tokenizer_summary)
        st.write("Aqui tienes tu resumen!")
        st.write(summary)
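
# To run the app locally (assuming this file is saved as app.py):
#   streamlit run app.py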