import gradio as gr
import numpy as np
import soundfile as sf
import spaces
import torch
import torchaudio

from sv import process_audio


@spaces.GPU
def model_inference(input_wav, language):
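    """Resample the input to 16 kHz mono and transcribe it with sv.process_audio."""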
    # Fall back to automatic language detection when no language is selected.
    language = language if language else "auto"
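
    # Gradio's Audio component delivers audio as a (sample_rate, np.ndarray) tuple.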
    if isinstance(input_wav, tuple):
        fs, input_wav = input_wav
        # Convert int16 PCM samples to float32 in [-1.0, 1.0].
        input_wav = input_wav.astype(np.float32) / np.iinfo(np.int16).max
        # Downmix multi-channel audio to mono.
        input_wav = input_wav.mean(-1) if len(input_wav.shape) > 1 else input_wav
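        # Resample to the 16 kHz rate the downstream model expects.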
        if fs != 16000:
            resampler = torchaudio.transforms.Resample(fs, 16000)
            input_wav = resampler(torch.from_numpy(input_wav).float()[None, :])[0].numpy()
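
    # Write the normalized audio to a temporary WAV file for process_audio.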
    with sf.SoundFile("temp.wav", "w", samplerate=16000, channels=1) as f:
        f.write(input_wav)
    result = process_audio("temp.wav", language=language)

    return result


def launch():
    custom_css = """
    .gradio-container {color: rgb(70, 70, 70);}
    """

    with gr.Blocks(css=custom_css) as demo:
        gr.Markdown("# Cantonese Call Transcriber")
        gr.Markdown(
            """
            This tool transcribes Cantonese audio calls into text.

            ## How to use:
            1. Upload an audio file or use the example provided at the bottom of the page.
            2. Click the 'Process Audio' button.
            3. The transcription will appear in the output box.
            """
        )
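
        # Build the widgets without rendering them yet (render=False) so they
        # can be placed inside the row/column layout below.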
        audio_input = gr.Audio(label="Input", render=False)
        text_output = gr.Textbox(lines=10, label="Output", render=False)
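
        # Two-column layout: audio input and button on the left, transcript on the right.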
        with gr.Row():
            with gr.Column(scale=2):
                audio_input.render()
                fn_button = gr.Button("Process Audio", variant="primary")

            with gr.Column(scale=3):
                text_output.render()
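
        # Always transcribe as Cantonese ("yue") when the button is clicked.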
        fn_button.click(
            fn=lambda x: model_inference(x, "yue"),
            inputs=[audio_input],
            outputs=[text_output],
        )

        # Render the example clip at the bottom of the page, matching the
        # instructions above; clicking it loads example/scb.mp3 into the input.
        gr.Examples(
            examples=[["example/scb.mp3"]],
            inputs=[audio_input],
            outputs=[text_output],
            fn=lambda x: model_inference(x, "yue"),
            examples_per_page=1,
        )

    demo.launch()


if __name__ == "__main__":
    launch()