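"""Hugging Face Space: a speech recognition and text-to-speech demo for
Kadazandusun (dtp), built on Meta's Massive Multilingual Speech models."""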
import os

# Runtime setup: install dependencies when the Space boots
os.system("pip install --upgrade pip")
os.system("curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y")  # Install Rust manually; -y skips the interactive prompt
os.system("pip install transformers==3.4.0")  # Works around an interoperability issue with Wav2Vec2CTCTokenizer; superseded by the git install below
os.system("pip install numpy==1.23.0")  # Numba requires NumPy < 1.24; pin 1.23 because librosa still uses np.complex, which NumPy 1.24 removed
os.system("pip install git+https://github.com/huggingface/transformers datasets[torch]")
os.system("pip install torch accelerate torchaudio datasets librosa easymms")
import gradio as gr
from transformers import Wav2Vec2ForCTC, AutoProcessor
import torch
import librosa  # For resampling recordings to the 16 kHz the model expects
from easymms.models.tts import TTSModel  # For TTS inference with EasyMMS
LANG = "dtp" #Change to tih for Timugon Murut or iba for Iban | |
model_id = "facebook/mms-1b-all" | |
processor = AutoProcessor.from_pretrained(model_id) | |
model = Wav2Vec2ForCTC.from_pretrained(model_id).to("cpu") | |
processor.tokenizer.set_target_lang(LANG) | |
model.load_adapter(LANG) | |
asr_pipeline = pipeline(task = "automatic-speech-recognition", model = model_id) #Function that returns a dict, transcription stored in item with key "text" | |
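# The same checkpoint serves the other supported languages by swapping the
# adapter at runtime, e.g. for Iban (a sketch using the two calls above):
#   processor.tokenizer.set_target_lang("iba")
#   model.load_adapter("iba")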
def preprocess(path):  # Decode a recording and return it as a 16 kHz numpy ndarray
    speech, sample_rate = librosa.load(path, sr=None)  # sr=None keeps the native rate instead of librosa's 22.05 kHz default
    speech = librosa.resample(speech, orig_sr=sample_rate, target_sr=16000)
    return speech
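# Example (hypothetical file name): preprocess("recording.wav") returns a
# mono float32 array at 16 kHz, ready for the processor below.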
def run(audio_array):  # Run ASR inference on a 16 kHz audio array
    inputs = processor(audio_array, sampling_rate=16_000, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs).logits
    ids = torch.argmax(outputs, dim=-1)[0]  # Greedy CTC decoding: most likely token per frame
    transcription = processor.decode(ids)
    return transcription
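# For several clips at once, processor.batch_decode is the batched counterpart
# of processor.decode (a sketch reusing the logits computed inside run):
#   ids = torch.argmax(outputs, dim=-1)           # keep all rows, not just [0]
#   transcriptions = processor.batch_decode(ids)  # list of strings, one per clip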
def transcribe(path):  # Gradio wrapper: audio file path in, transcription out
    audio_array = preprocess(path)
    return run(audio_array)
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.HTML(
        """
        <h1 align="center">Ponutun Tuturan om Pomorolou Sinuat Boros Dusun</h1>
        <h5 align="center">Poomitanan kopogunaan do somit tutun tuturan om pomorolou sinuat (speech recognition and text-to-speech models)
        pinoluda' di Woyotanud Tuturan Gumukabang Tagayo di Meta (Meta Massive Multilingual Speech Project)</h5>
        <h6 align="center">Guguno (app) diti winonsoi di Ander © 2023 id Universiti Teknologi PETRONAS</h6>
        <div style="display:flex; gap: 0.25rem;">
            <div class="image"><a href='https://github.com/andergisomon/dtp-nlp-demo'><img src='https://img.shields.io/badge/Github-Code-success'></a></div>
            <div class="image"><a href='https://huggingface.co/spaces/anderbogia/dtp-asr-demo-v2/'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue'></a></div>
        </div>
        """)
    tts = TTSModel(LANG)

    def synthesize(text):  # Gradio wrapper for TTS
        data, sample_rate = tts.synthesize(text)  # EasyMMS returns (data, sample_rate)
        return (sample_rate, data)  # gr.Audio expects (sample_rate, data), so flip the pair
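    # Example (hypothetical input text): synthesize("Kopivosian!") returns
    # (sample_rate, waveform) in the order gr.Audio expects.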
    with gr.Row():
        with gr.Column(scale=1):
            gr.HTML("""<h1 align="center"><img src="https://user-images.githubusercontent.com/120112847/249789954-8dbadc59-4f39-48fa-a97c-a70998f2c551.png" alt="" border="0" style="margin: 0 auto; height: 200px;" /></h1>""")
            gr.Markdown("""
            **Huminodun, nulai di somit pongulai kikito DALL-E**

            *Huminodun, generated by the image generation model DALL-E*
            """)
        with gr.Column(scale=4):
            with gr.Tab("Rolou kumaa ginarit"):
                input_audio = gr.Audio(source="microphone", type="filepath", label="Gakamai rolou nu")
                output_text = gr.Textbox(label="Dalinsuat")
                button1 = gr.Button("Dalinsuato' | Transcribe")
                button1.click(transcribe, inputs=input_audio, outputs=output_text)

            with gr.Tab("Ginarit kumaa rolou"):
                input_text = gr.Textbox(label="Ginarit", placeholder="Potutakai suat nu hiti")
                button2 = gr.Button("Poulayo' | Synthesize")
                output_audio = gr.Audio(label="Rolou pinoulai")
                button2.click(synthesize, inputs=input_text, outputs=output_audio)

demo.launch(debug=True)