Spaces:

ajchri5
/

164-S2-Assignment_2

Runtime error

App Files Files Community

ajchri5 committed on Nov 30, 2024

Commit

0b2a06f

verified ·

1 Parent(s): fb85ae9

Upload 3 files

Browse files

Files changed (3) hide show

app.py +159 -0
model_download_py.py +15 -0
requirements.txt +5 -0

app.py ADDED Viewed

	@@ -0,0 +1,159 @@

+# -*- coding: utf-8 -*-
+"""Assignment-2-IT164_ajchri5
+Automatically generated by Colab.
+Original file is located at
+    https://colab.research.google.com/drive/1RtE7mmtyUWwiuowgyQq4eCuH-ep_D1QQ
+"""
+# mount gd
+from google.colab import drive
+drive.mount('/content/drive')
+# Commented out IPython magic to ensure Python compatibility.
+# # token
+# %%capture
+# from google.colab import userdata
+# hftoken=userdata.get('hftoken')
+# Commented out IPython magic to ensure Python compatibility.
+# # pi
+# %%capture
+# !pip install gradio
+# !pip install huggingface_hub
+# packages required for colab
+!pip install gradio
+!pip install transformers
+!pip install torchaudio
+!pip install fasttext
+# fastText for language detection
+!wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin
+# imports required for colab
+import gradio as gr
+from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline, EncoderDecoderCache
+import torchaudio
+import warnings
+import fasttext
+import pandas as pd
+import csv
+import os
+# Silence the UserWarning whose message starts with "PySoundFile failed".
+warnings.filterwarnings("ignore", category=UserWarning, message="PySoundFile failed.*")
+# Model 1 (transcription): Whisper processor and generation model, both
+# loaded from the same checkpoint name.
+whisper_model_name = "openai/whisper-large"
+processor = WhisperProcessor.from_pretrained(whisper_model_name)
+whisper_model = WhisperForConditionalGeneration.from_pretrained(whisper_model_name)
+# Model 2 (translation): transformers translation pipeline.
+translation_model = pipeline("translation", model="Helsinki-NLP/opus-mt-ROMANCE-en")
+# Model 3 (language detection): fastText model read from the local
+# lid.176.bin file, which must already exist in the working directory.
+lang_model = fasttext.load_model('lid.176.bin')  # pre-trained model
+# In-memory app usage history; save_to_history appends one
+# [text, language, translation, confidence score] row per processed clip.
+history_data = []
+# save data csv
+def saveData(text, language, translated_text, confidence_score):
+    # gd path
+    file_path = '/content/drive/MyDrive/IT164/a2prompt.csv'
+    # check if file exists, if not make new one with headers
+    file_exists = os.path.isfile(file_path)
+    # open csv file to append data
+    with open(file_path, 'a', newline='', encoding='utf-8') as f:
+        w = csv.writer(f)
+        if not file_exists:
+            # write header if file is created
+            w.writerow(['Text', 'Language', 'Translation', 'Confidence Score'])
+        # write new data row
+        w.writerow([text, language, translated_text, confidence_score])
+# load audio input and transcribe
+def transcribe_audio(audio_file, sampling_rate=48000):  # set to 48 kHz
+    # load audio file with torchaudio
+    waveform, sr = torchaudio.load(audio_file, normalize=True)
+    # max 16kHz (resample)
+    if sr != 16000:
+        transform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)  # resample to 16 kHz
+        waveform = transform(waveform)
+        sr = 16000  # update as 16 kHz
+    # whisperprocessor
+    inputs = processor(waveform.squeeze(0).numpy(), return_tensors="pt", sampling_rate=sr)
+    # generate transcription and handle "past_key_values deprecation" error
+    past_key_values = None
+    generated_ids = whisper_model.generate(
+        inputs["input_features"],
+        past_key_values=past_key_values
+    )
+    # encoderdecodercache (to handle past_key_values)
+    if past_key_values is not None:
+        past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values)
+    return processor.decode(generated_ids[0], skip_special_tokens=True)
+# detect language using fastText
+def detect_language(text):
+    result = lang_model.predict(text)  # predict language with fasttext
+    language = result[0][0].replace('__label__', '')  # extract the predicted language label
+    score = result[1][0]  # confidence score
+    return language, score
+# translate text (to english)
+def translate_text_to_english(text, source_lang="fr"):
+    # translate detected language
+    translation = translation_model(text, src_lang=source_lang, tgt_lang="en")
+    return translation[0]['translation_text']
+# function to track history (save results to the list and save to csv)
+def save_to_history(text, language, translation, confidence_score):
+    history_data.append([text, language, translation, confidence_score])
+    # save csv
+    saveData(text, language, translation, confidence_score)
+# process audio, transcribe, detect language, and translate
+def process_audio(audio_file):
+    transcription = transcribe_audio(audio_file, sampling_rate=48000)  # use 48 kHz initially (mac rate)
+    language, score = detect_language(transcription)  # detect language of the transcription
+    translated_text = translate_text_to_english(transcription, source_lang=language)  # translate
+    save_to_history(transcription, language, translated_text, score)  # save results
+    return transcription, language, score, translated_text
+# update visibility of the history table in gradio
+def update_vis(radio_value):
+    if radio_value == 'show':
+        return gr.DataFrame(pd.DataFrame(history_data, columns=["Text", "Language", "Translation", "Confidence Score"]), visible=True)
+    else:
+        return gr.DataFrame(pd.DataFrame(history_data, columns=["Text", "Language", "Translation", "Confidence Score"]), visible=False)
+# gradio interface
+with gr.Blocks() as demo:
+    with gr.Row():
+        with gr.Column():
+            audio_input = gr.Audio(label="Record your voice", type="filepath")  # audio input
+            transcription_output = gr.Textbox(label="Transcription")  # transcription output
+            language_output = gr.Textbox(label="Detected Language")  # detected language output
+            score_output = gr.Textbox(label="Confidence Score")  # confidence score output
+            translated_output = gr.Textbox(label="Translated Text to English")  # translated text output
+            process_button = gr.Button("Process Audio")  # button to process the audio
+        with gr.Column():
+            history = gr.Radio(['show', 'hide'], label="App usage history")  # "show" or "hide" (history)
+            dataframe = gr.DataFrame(pd.DataFrame(history_data, columns=["Text", "Language", "Translation", "Confidence Score"]), visible=False)
+        # button click (process audio and display output)
+        process_button.click(fn=process_audio, inputs=[audio_input], outputs=[transcription_output, language_output, score_output, translated_output])
+        history.change(fn=update_vis, inputs=history, outputs=dataframe)
+demo.launch(debug=True)

model_download_py.py ADDED Viewed

	@@ -0,0 +1,15 @@

+# -*- coding: utf-8 -*-
+"""model_download.py
+Automatically generated by Colab.
+Original file is located at
+    https://colab.research.google.com/drive/1Y_JvDuAVDbA_d7NCISXd_6nbyLn3yDZa
+"""
+import os
+# Check if the model is already downloaded
+if not os.path.exists('lid.176.bin'):
+    print("Downloading fastText language detection model...")
+    os.system('wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin')

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+gradio
+transformers
+torchaudio
+fasttext
+pandas