ajchri5 committed
Commit 0b2a06f · verified · 1 Parent(s): fb85ae9

Upload 3 files

Files changed (3)
  1. app.py +159 -0
  2. model_download_py.py +15 -0
  3. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,159 @@
+ # -*- coding: utf-8 -*-
+ """Assignment-2-IT164_ajchri5
+
+ Automatically generated by Colab.
+
+ Original file is located at
+ https://colab.research.google.com/drive/1RtE7mmtyUWwiuowgyQq4eCuH-ep_D1QQ
+ """
+
+ # mount Google Drive (Colab-only; skipped when google.colab is unavailable)
+ try:
+     from google.colab import drive
+     drive.mount('/content/drive')
+ except ImportError:
+     pass
+
+ # Commented out IPython magic to ensure Python compatibility.
+ # # Hugging Face token
+ # %%capture
+ # from google.colab import userdata
+ # hftoken=userdata.get('hftoken')
+
+ # Commented out IPython magic to ensure Python compatibility.
+ # # pip installs
+ # %%capture
+ # !pip install gradio
+ # !pip install huggingface_hub
+
+ # Colab-only setup: raw "!" shell commands are a SyntaxError in a plain .py
+ # file, so they are commented out here; requirements.txt covers these
+ # installs when the file runs as an app.
+ # !pip install gradio
+ # !pip install transformers
+ # !pip install torchaudio
+ # !pip install fasttext
+
+ # fastText model for language detection (fetched below if missing)
+ # !wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin
+
+ # imports
+ import gradio as gr
+ from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline
+ import torchaudio
+ import warnings
+ import fasttext
+ import pandas as pd
+ import csv
+ import os
+
+ # hide PySoundFile fallback warnings from torchaudio
+ warnings.filterwarnings("ignore", category=UserWarning, message="PySoundFile failed.*")
+
+ # model 1: Whisper for speech transcription
+ whisper_model_name = "openai/whisper-large"
+ processor = WhisperProcessor.from_pretrained(whisper_model_name)
+ whisper_model = WhisperForConditionalGeneration.from_pretrained(whisper_model_name)
+
+ # model 2: Marian for translation (Romance languages to English)
+ translation_model = pipeline("translation", model="Helsinki-NLP/opus-mt-ROMANCE-en")
+
+ # model 3: pre-trained fastText model for language detection
+ # (download it first if missing, mirroring model_download_py.py)
+ if not os.path.exists('lid.176.bin'):
+     os.system('wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin')
+ lang_model = fasttext.load_model('lid.176.bin')
+
+ # app usage history
+ history_data = []
+
+ # save a result row to CSV
+ def saveData(text, language, translated_text, confidence_score):
+     # Google Drive path (assumes Colab; fall back to a local file otherwise)
+     file_path = '/content/drive/MyDrive/IT164/a2prompt.csv'
+     if not os.path.isdir('/content/drive/MyDrive'):
+         file_path = 'a2prompt.csv'
+
+     # check whether the file already exists, so headers are written only once
+     file_exists = os.path.isfile(file_path)
+
+     # open the CSV file and append the data
+     with open(file_path, 'a', newline='', encoding='utf-8') as f:
+         w = csv.writer(f)
+         if not file_exists:
+             # write the header row when the file is first created
+             w.writerow(['Text', 'Language', 'Translation', 'Confidence Score'])
+         # write the new data row
+         w.writerow([text, language, translated_text, confidence_score])
+
+ # load an audio file and transcribe it with Whisper
+ def transcribe_audio(audio_file):
+     # load the audio file with torchaudio
+     waveform, sr = torchaudio.load(audio_file, normalize=True)
+
+     # downmix multi-channel recordings to mono
+     if waveform.shape[0] > 1:
+         waveform = waveform.mean(dim=0, keepdim=True)
+
+     # Whisper expects 16 kHz input, so resample if needed
+     if sr != 16000:
+         transform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
+         waveform = transform(waveform)
+         sr = 16000
+
+     # extract input features with the Whisper processor
+     inputs = processor(waveform.squeeze(0).numpy(), return_tensors="pt", sampling_rate=sr)
+
+     # generate the transcription
+     generated_ids = whisper_model.generate(inputs["input_features"])
+
+     return processor.decode(generated_ids[0], skip_special_tokens=True)
+
+ # detect language using fastText
+ def detect_language(text):
+     # fastText's predict expects single-line input, so strip newlines first
+     result = lang_model.predict(text.replace('\n', ' '))
+     language = result[0][0].replace('__label__', '')  # extract the predicted language label
+     score = result[1][0]  # confidence score
+     return language, score
+
+ # translate text to English
+ def translate_text_to_english(text):
+     # opus-mt-ROMANCE-en is a fixed Romance-to-English pair,
+     # so the pipeline needs no source-language tag
+     translation = translation_model(text)
+     return translation[0]['translation_text']
+
+ # track history (append results to the in-memory list and persist to CSV)
+ def save_to_history(text, language, translation, confidence_score):
+     history_data.append([text, language, translation, confidence_score])
+     # persist to CSV
+     saveData(text, language, translation, confidence_score)
+
+ # process audio: transcribe, detect language, and translate
+ def process_audio(audio_file):
+     transcription = transcribe_audio(audio_file)  # transcribe with Whisper
+     language, score = detect_language(transcription)  # detect the language of the transcription
+     translated_text = translate_text_to_english(transcription)  # translate to English
+     save_to_history(transcription, language, translated_text, score)  # save the results
+     return transcription, language, score, translated_text
+
+ # update visibility of the history table in Gradio
+ def update_vis(radio_value):
+     # rebuild the table from the history list and toggle its visibility
+     df = pd.DataFrame(history_data, columns=["Text", "Language", "Translation", "Confidence Score"])
+     return gr.DataFrame(df, visible=(radio_value == 'show'))
+
+ # Gradio interface
+ with gr.Blocks() as demo:
+     with gr.Row():
+         with gr.Column():
+             audio_input = gr.Audio(label="Record your voice", type="filepath")  # audio input
+             transcription_output = gr.Textbox(label="Transcription")  # transcription output
+             language_output = gr.Textbox(label="Detected Language")  # detected language output
+             score_output = gr.Textbox(label="Confidence Score")  # confidence score output
+             translated_output = gr.Textbox(label="Translated Text to English")  # translated text output
+             process_button = gr.Button("Process Audio")  # button to process the audio
+
+         with gr.Column():
+             history = gr.Radio(['show', 'hide'], label="App usage history")  # "show" or "hide" the history
+             dataframe = gr.DataFrame(pd.DataFrame(history_data, columns=["Text", "Language", "Translation", "Confidence Score"]), visible=False)
+
+     # button click: process the audio and display the outputs
+     process_button.click(fn=process_audio, inputs=[audio_input], outputs=[transcription_output, language_output, score_output, translated_output])
+     history.change(fn=update_vis, inputs=history, outputs=dataframe)
+
+ demo.launch(debug=True)
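For a quick check of the processing chain outside the UI, a minimal smoke test might look like the sketch below. It assumes the functions and models above are already loaded, and that a local clip named sample.wav exists (a hypothetical filename):

# hypothetical smoke test for the functions defined in app.py
text = transcribe_audio("sample.wav")  # sample.wav is an assumed local clip
lang, score = detect_language(text)
print(text, lang, score, translate_text_to_english(text))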
model_download_py.py ADDED
@@ -0,0 +1,15 @@
+ # -*- coding: utf-8 -*-
+ """model_download.py
+
+ Automatically generated by Colab.
+
+ Original file is located at
+ https://colab.research.google.com/drive/1Y_JvDuAVDbA_d7NCISXd_6nbyLn3yDZa
+ """
+
+ import os
+
+ # check if the model is already downloaded
+ if not os.path.exists('lid.176.bin'):
+     print("Downloading fastText language detection model...")
+     os.system('wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin')
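Since wget is not guaranteed to be on PATH in every runtime, a stdlib-only variant of the same check-then-download step is sketched below; the URL and filename come from the file above, the rest is an assumption:

# portable sketch: download with urllib instead of shelling out to wget
import os
import urllib.request

MODEL_URL = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin"

if not os.path.exists("lid.176.bin"):
    print("Downloading fastText language detection model...")
    urllib.request.urlretrieve(MODEL_URL, "lid.176.bin")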
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ gradio
+ transformers
+ torchaudio
+ fasttext
+ pandas
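One likely gap, flagged as an assumption: the Marian translation model's tokenizer needs sentencepiece, which transformers does not install by default (sacremoses is also recommended for Marian models). A fuller list would add:

sentencepiece
sacremoses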