File size: 4,625 Bytes
0cea3a7
2854844
0eeec49
449112a
2854844
8a04c2b
f524090
2abf6fe
b1508bf
2854844
0cea3a7
431e989
0cea3a7
 
9cff099
0cea3a7
cac0a2c
0cea3a7
 
 
 
040ebdb
 
0cea3a7
 
 
431e989
 
 
0cea3a7
 
 
 
 
 
 
 
040ebdb
0cea3a7
 
 
 
 
 
040ebdb
 
 
 
0cea3a7
9cff099
 
 
cac0a2c
 
 
433efc6
cac0a2c
 
 
 
 
9cff099
 
 
431e989
32a72a1
0769ed0
431e989
32a72a1
2854844
 
431e989
 
9cff099
 
 
9c0b499
9cff099
 
 
 
 
 
 
 
f649f40
67d3502
9cff099
67d3502
9cff099
 
449112a
9cff099
499b1df
431e989
9cff099
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import os
# --- Runtime environment setup (executes at import time) ---
# NOTE(review): installing packages via os.system at runtime is fragile; this
# assumes a pip-capable hosted environment (e.g. a Hugging Face Space), and the
# install order matters (the numpy pin must land before packages that use it).
os.system("pip install --upgrade transformers accelerate")
os.system("pip install tokenizers fairseq")
os.system("pip install numpy==1.24") #NumPy 1.24 or less needed by Numba
os.system("pip install torch transformers accelerate torchaudio datasets")
os.system("pip install librosa==0.9.0")
# os.system("pip install gradio==4.16.0") # Rollback to pre 4.17.0 due to gr Audio playback issues
os.system("pip install --upgrade gradio")

import scipy
import gradio as gr
from transformers import pipeline, Wav2Vec2ForCTC, AutoProcessor, VitsModel, AutoTokenizer
from datasets import load_dataset, Audio, Dataset
import torch
import librosa #For converting audio sample rate to 16k

# Target language adapter for the MMS ASR model.
LANG = "dtp" #Change to tih for Timugon Murut or iba for Iban
model_id = "facebook/mms-1b-all"  # Meta MMS multilingual ASR checkpoint

# Load the shared ASR model on CPU, then switch both the tokenizer's target
# language and the model's adapter weights to LANG.
processor = AutoProcessor.from_pretrained(model_id)
model = Wav2Vec2ForCTC.from_pretrained(model_id).to("cpu")
processor.tokenizer.set_target_lang(LANG)
model.load_adapter(LANG)

# NOTE(review): asr_pipeline is never used below — transcription goes through
# run() with the model/processor above. Candidate for removal (it loads the
# checkpoint a second time).
asr_pipeline = pipeline(task = "automatic-speech-recognition", model = model_id) #Function that returns a dict, transcription stored in item with key "text"

# Text-to-speech model + tokenizer (MMS-TTS, Dusun).
model_tts = VitsModel.from_pretrained("facebook/mms-tts-dtp")
tokenizer_tts = AutoTokenizer.from_pretrained("facebook/mms-tts-dtp")

def preprocess(input): #Sets recording sampling rate to 16k and returns numpy ndarray from audio
  """Load an audio file and return it as a mono float array resampled to 16 kHz.

  Parameters:
    input: path to the recorded audio file (as supplied by gr.Audio with
      type="filepath").

  Returns:
    1-D numpy.ndarray of audio samples at 16 kHz, ready for the Wav2Vec2
    processor.
  """
  # Fix: the original computed this load+resample, then DISCARDED the result and
  # re-decoded the same file through a throwaway datasets.Dataset cast to
  # Audio(sampling_rate=16000) — the same work done twice. We keep the librosa
  # path and return it directly.
  # sr=None loads at the file's native rate (avoiding librosa's default
  # 22 050 Hz intermediate resample); mono=True matches the model's input.
  speech, sample_rate = librosa.load(input, sr=None, mono=True)
  return librosa.resample(speech, orig_sr=sample_rate, target_sr=16000)

def run(input):
    """Transcribe a 16 kHz audio array with the MMS ASR model.

    input: 1-D numpy array of audio samples at 16 kHz (from preprocess()).
    Returns the decoded transcription as a string.
    """
    features = processor(input, sampling_rate=16_000, return_tensors="pt")
    with torch.no_grad():
        logits = model(**features).logits
    # Greedy CTC decoding: best token id per frame, first batch element.
    predicted_ids = logits.argmax(dim=-1)[0]
    return processor.decode(predicted_ids)

def transcribe(input): #Gradio UI wrapper function
    """Gradio callback: audio file path -> transcription text."""
    # Resample to 16 kHz, then run greedy CTC decoding.
    return run(preprocess(input))

# --- Gradio UI ---
# Two-column layout: mascot image on the left; ASR ("Rolou kumaa ginarit") and
# TTS ("Ginarit kumaa rolou") tabs on the right.
with gr.Blocks(theme = gr.themes.Soft()) as demo:
    # Header: app title/credits (in Dusun) plus badge links to the repo and Space.
    gr.HTML(
        """
            <h1 align="center">Ponutun Tuturan om Pomorolou Sinuat Boros Dusun</h1>
            <h5 align="center">  Poomitanan kopogunaan do somit tutun tuturan om pomorolou sinuat (speech recognition and text-to-speech models)
              pinoluda' di Woyotanud Tuturan Gumukabang Tagayo di Meta (Meta Massive Multilingual Speech Project)</h5>
            <h6 align = "center">Guguno (app) diti winonsoi di Ander © 2023-2024 id Universiti Teknologi PETRONAS</h6>

            <div style='display:flex; gap: 0.25rem; '>
                <div class = "image"> <a href='https://github.com/andergisomon/dtp-nlp-demo'><img src='https://img.shields.io/badge/Github-Code-success'></a> </div>
                <div class = "image"> <a href='https://huggingface.co/spaces/anderbogia/dtp-asr-demo-v2/'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue'></a> </div>
            </div>
        """)


    def tts_run(input):
        """Gradio callback: Dusun text -> (sample_rate, waveform) for gr.Audio."""
        tokenized_input = tokenizer_tts(input, return_tensors="pt")
        
        with torch.no_grad():
            output = model_tts(**tokenized_input).waveform
        
        # gr.Audio expects a (sample_rate_hz, 1-D numpy array) tuple.
        # NOTE(review): 16000 is assumed to match the MMS-TTS output rate
        # (model_tts.config.sampling_rate) — confirm.
        gradio_tuple = (16000, output[0].detach().cpu().numpy())
        
        return gradio_tuple

    with gr.Row():
      with gr.Column(scale = 1):
          gr.HTML("""<h1 align="center"><img src="https://user-images.githubusercontent.com/120112847/249789954-8dbadc59-4f39-48fa-a97c-a70998f2c551.png", alt="" border="0" style="margin: 0 auto; height: 200px;" /></a></h1>""")

          gr.Markdown("""
          **Huminodun, nulai di somit pongulai kikito DALL-E**

          *Huminodun, generated by the image generation model DALL-E*
          """)
      with gr.Column(scale = 4):
          # Speech-to-text tab: record from microphone, transcribe on click.
          with gr.Tab("Rolou kumaa ginarit"):
              input_audio = gr.Audio(sources = ["microphone"], type = "filepath", label = "Gakamai rolou nu", format = "wav")
              output_text = gr.components.Textbox(label = "Dalinsuat")
              button1 = gr.Button("Dalinsuato' | Transcribe")
              button1.click(transcribe, inputs = input_audio, outputs = output_text)

          # Text-to-speech tab: type text, synthesize audio on click.
          with gr.Tab("Ginarit kumaa rolou"):
              input_text = gr.components.Textbox(label = "Ginarit", placeholder = "Popupukai suat nu hiti")
              button2 = gr.Button("Poulayo'")
              output_audio = gr.Audio(label = "Rolou pinoulai")
              button2.click(tts_run, inputs = input_text, outputs = output_audio)

# debug=True blocks and streams logs — useful in a notebook/Space console.
demo.launch(debug = True)