File size: 2,732 Bytes
20aa839
3a18b3b
bef8623
660776b
20aa839
448bf1b
 
 
 
 
 
 
 
 
 
 
c492cbb
448bf1b
 
 
 
 
 
 
 
 
03dc51e
 
 
9a0faf6
03dc51e
 
 
448bf1b
668fb3c
448bf1b
 
9db718b
448bf1b
 
 
 
 
 
 
c492cbb
448bf1b
9db718b
448bf1b
 
 
 
 
 
03dc51e
 
 
9a0faf6
03dc51e
 
 
448bf1b
668fb3c
448bf1b
 
20aa839
448bf1b
 
501d3b8
448bf1b
20aa839
 
3493c42
 
 
ca72173
3493c42
 
ca72173
3493c42
6a213c1
ffbaf39
da61140
 
3493c42
 
448bf1b
20aa839
 
9a0faf6
20aa839
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import gradio as gr
import asr
import tts
import util

# Speech-to-text tab. The audio widget passes asr.transcribe a file path
# (type="filepath") along with the model name picked in the dropdown; the two
# textboxes display the Arabic-script and Latin-script results.
mms_transcribe = gr.Interface(
    fn=asr.transcribe,
    inputs=[
        gr.Audio(
            label="Record or Upload Uyghur Audio",
            sources=["microphone", "upload"],
            type="filepath",
        ),
        gr.Dropdown(
            # Iterating asr.models_info yields the selectable entries
            # (presumably a mapping keyed by model name -- confirm in asr).
            choices=list(asr.models_info),
            label="Select a Model",
            value="Ixxan-FineTuned-MMS",
            interactive=True,
        ),
    ],
    outputs=[
        gr.Textbox(label="Uyghur Arabic Transcription"),
        gr.Textbox(label="Uyghur Latin Transcription"),
    ],
    examples=util.asr_examples,
    description=(
        """
        Transcribe Uyghur audio from a microphone or input file.

        Click on examples below for sample usage.

        Please keep the audio length under 10 seconds for faster processing since this space is running on CPU basic.
        """
    ),
    article=util.asr_notes,
    allow_flagging="never",
)

# Text-to-speech tab. tts.synthesize receives the typed text and the model
# name picked in the dropdown; its result is played back in the audio widget.
mms_synthesize = gr.Interface(
    fn=tts.synthesize,
    inputs=[
        gr.Text(label="Input text"),
        gr.Dropdown(
            # Iterating tts.models_info yields the selectable entries
            # (presumably a mapping keyed by model name -- confirm in tts).
            choices=list(tts.models_info),
            label="Select a Model",
            value="Ixxan-FineTuned-MMS",
            interactive=True,
        ),
    ],
    outputs=[
        gr.Audio(label="Generated Audio"),
    ],
    examples=util.tts_examples,
    description=(
        """
        Generate audio from input Uyghur text.

        Click on examples below for sample usage.

        Please keep the input text length under 200 characters for faster processing since this space is running on CPU basic.
        """
    ),
    article=util.tts_notes,
    allow_flagging="never",
)

# Combine both interfaces into one widget; tab titles pair up with the
# interfaces by position.
tabbed_interface = gr.TabbedInterface(
    interface_list=[mms_transcribe, mms_synthesize],
    tab_names=["Speech-To-Text", "Text-To-Speech"],
)

# Top-level page: a header with project links, followed by the tabbed
# interface built above.
demo = gr.Blocks()
with demo:
    gr.Markdown(
        """
        <h1 style="text-align: center; font-size: 28px; color: #4A90E2;">
            Uyghur Speech-To-Text (STT) and Text-To-Speech (TTS) Models
        </h1>
        <p style="text-align: center; font-size: 16px; color: #555;">
            Comparisons of existing and fine-tuned speech models for transcribing and synthesizing Uyghur speech.
        </p>
        
        To learn more about Uyghur Speech Technology, please check out my [blog post](https://ixxan.github.io/blog/low-resource-speech-uyghur).

        To see the model fine-tuning code, please visit my [GitHub repository](https://github.com/ixxan/ug-speech).
        """
    )
    # The tabbed interface was constructed outside this context, so it has to
    # be rendered explicitly to appear on the page.
    tabbed_interface.render()

if __name__ == "__main__":
    # Explicit (non-default) queue limits: event handlers run at most two at a
    # time unless an event sets its own limit, and at most 20 requests may
    # wait in the queue.
    demo.queue(default_concurrency_limit=2, max_size=20)
    demo.launch()