File size: 2,599 Bytes
5cb9c90
 
57d9268
5cb9c90
9ecefd1
57d9268
5cb9c90
 
d3490a8
103d57b
5cb9c90
 
27c943a
103d57b
 
 
57d9268
9ecefd1
57d9268
 
 
103d57b
57d9268
 
103d57b
 
 
57d9268
103d57b
 
 
 
57d9268
9ecefd1
5cb9c90
 
 
d3490a8
e41a956
 
 
d3490a8
e41a956
98c06b0
d199239
 
 
 
 
d4123fe
d199239
 
 
 
98c06b0
1561f9d
94ef5c0
 
98c06b0
1e836f3
 
 
 
 
1561f9d
 
94ef5c0
 
1561f9d
 
 
57d9268
b8c0ef3
2fdad2f
 
94ef5c0
2fdad2f
 
 
94ef5c0
b8c0ef3
 
 
 
94ef5c0
 
b8c0ef3
57d9268
 
5cb9c90
 
 
57d9268
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# coding=utf-8

import gradio as gr
import numpy as np
import soundfile as sf
import spaces
import torch
import torchaudio
from gradio.themes import Base
from sv import process_audio


@spaces.GPU
def model_inference(input_wav, language):
    """Transcribe one audio clip with the `process_audio` pipeline.

    Args:
        input_wav: Audio input. Typically a Gradio-style
            ``(sample_rate, numpy_array)`` tuple; any other value is passed
            through to the temp-file write unchanged.
        language: Language hint for the recognizer; falsy values fall back
            to ``"auto"``.

    Returns:
        Whatever ``process_audio`` returns for the written 16 kHz mono file.
    """
    import os
    import tempfile

    # Fall back to automatic language detection when no hint is given.
    language = language if language else "auto"

    # Gradio delivers recorded/uploaded audio as (sample_rate, samples).
    if isinstance(input_wav, tuple):
        fs, input_wav = input_wav
        # Normalize integer PCM to [-1.0, 1.0] using the *actual* dtype's
        # range; the previous code always divided by the int16 max, which
        # corrupted float or non-int16 input.
        if np.issubdtype(input_wav.dtype, np.integer):
            input_wav = input_wav.astype(np.float32) / np.iinfo(input_wav.dtype).max
        else:
            input_wav = input_wav.astype(np.float32)
        # Down-mix multi-channel audio to mono.
        if input_wav.ndim > 1:
            input_wav = input_wav.mean(-1)
        # The recognizer expects 16 kHz input.
        if fs != 16000:
            resampler = torchaudio.transforms.Resample(fs, 16000)
            input_wav = resampler(torch.from_numpy(input_wav).float()[None, :])[
                0
            ].numpy()

    # Write to a unique temporary file instead of a shared "temp.wav" in the
    # CWD: the fixed name raced between concurrent requests and was never
    # cleaned up.
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    try:
        tmp.close()
        sf.write(tmp.name, input_wav, samplerate=16000)
        result = process_audio(tmp.name, language=language)
    finally:
        os.unlink(tmp.name)

    return result


def launch():
    """Build the Gradio Blocks UI for the transcriber and launch it."""
    # Create a custom theme
    custom_css = """
        .gradio-container {color: rgb(70, 70, 70);}
    """

    with gr.Blocks(css=custom_css) as demo:
        gr.Markdown("# Cantonese Call Transcriber")
        gr.Markdown(
            """
        This tool transcribes Cantonese audio calls into text.
        
        ## How to use:
        1. Upload an audio file or use the example provided at the bottom of the page.
        2. Click the 'Process Audio' button.
        3. The transcription will appear in the output box.
        """
        )

        # Create the components unrendered so they can be placed inside the
        # Row/Column layout below. Previously they rendered here (at their
        # creation point) and the bare variable references inside the columns
        # were no-ops, leaving the layout's columns empty except the button.
        audio_input = gr.Audio(label="Input", render=False)
        text_output = gr.Textbox(lines=10, label="Output", render=False)

        # Main interface
        with gr.Row():
            with gr.Column(scale=2):
                audio_input.render()
                fn_button = gr.Button("Process Audio", variant="primary")

            with gr.Column(scale=3):
                text_output.render()

        # Examples go below the main interface, matching the help text's
        # "provided at the bottom of the page".
        gr.Examples(
            examples=[["example/scb.mp3"]],
            inputs=[audio_input],
            outputs=[text_output],
            fn=lambda x: model_inference(x, "yue"),
            examples_per_page=1,
        )

        # Set up event handler
        fn_button.click(
            fn=lambda x: model_inference(x, "yue"),
            inputs=[audio_input],
            outputs=[text_output],
        )

    demo.launch()


# Start the web UI only when executed as a script, not on import.
if __name__ == "__main__":
    launch()