import string
import torch
from transformers import pipeline
import gradio as gr

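# PhoWhisper-large is VinAI's Whisper model fine-tuned for Vietnamese speech
# recognition; BATCH_SIZE controls how many audio chunks are decoded per pass.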
MODEL_NAME = "vinai/PhoWhisper-large"
BATCH_SIZE = 8

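# Use the first GPU if one is available; the pipeline accepts either a
# device index or the string "cpu".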
device = 0 if torch.cuda.is_available() else "cpu"

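# chunk_length_s=30 enables chunked long-form transcription, since Whisper
# models process audio in 30-second windows.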
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)

desc = f"""
This Space transcribes Vietnamese words, phrases, and sentences from your microphone or from audio files, then compares what the model hears to the text you entered.
You will then be given a pass/fail grade telling you whether your spoken audio matches the text you entered.
[{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) is the Vietnamese speech-to-text model that powers the analysis of the audio.
"""

# Copied from https://github.com/openai/whisper/blob/c09a7ae299c4c34c5839a76380ae407e7d785914/whisper/utils.py#L50
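# Converts a float number of seconds into an "HH:MM:SS.mmm" string; the hours
# field is omitted when zero unless always_include_hours is set.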
def format_timestamp(seconds: float, always_include_hours: bool = False, decimal_marker: str = "."):
    if seconds is not None:
        milliseconds = round(seconds * 1000.0)

        hours = milliseconds // 3_600_000
        milliseconds -= hours * 3_600_000

        minutes = milliseconds // 60_000
        milliseconds -= minutes * 60_000

        seconds = milliseconds // 1_000
        milliseconds -= seconds * 1_000

        hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
        return f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
    else:
        # we have a malformed timestamp so just return it as is
        return seconds


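# Run ASR on the recorded/uploaded audio, grade the transcription against the
# learner's target text, and optionally format per-chunk timestamps.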
def transcribe(file, og_text, return_timestamps):
    outputs = pipe(file, batch_size=BATCH_SIZE, return_timestamps=return_timestamps)
    text = outputs["text"]
    # Grade on the raw transcription (before any timestamp formatting),
    # ignoring punctuation, case, and surrounding whitespace on both sides.
    text_nopunc = text.translate(str.maketrans('', '', string.punctuation))
    og_nopunc = og_text.translate(str.maketrans('', '', string.punctuation))
    if text_nopunc.strip().lower() == og_nopunc.strip().lower():
        grade = "good!"
    else:
        grade = "could use some work..."
    if return_timestamps:
        chunks = outputs["chunks"]
        lines = [
            f"[{format_timestamp(chunk['timestamp'][0])} -> {format_timestamp(chunk['timestamp'][1])}] {chunk['text']}"
            for chunk in chunks
        ]
        text = "\n".join(lines)
    return text, grade


demo = gr.Blocks()

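# Tab 1: record pronunciation directly from the browser microphone.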
mic_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources=["microphone"], type="filepath"),
        gr.Textbox(label="Word/Phrase"),
        gr.Checkbox(value=False, label="Return timestamps"),
    ],
    outputs=[gr.Textbox(label="What I heard..."), gr.Textbox(label="Grade")],
    title="Vietnamese Pronunciation Checker",
    description=desc,
    allow_flagging="never",
)

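# Tab 2: upload a pre-recorded audio file instead of using the microphone.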
file_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources=["upload"], type="filepath", label="Audio file"),
        gr.Textbox(label="Word/Phrase"),
        gr.Checkbox(value=False, label="Return timestamps"),
    ],
    outputs=[gr.Textbox(label="What I heard..."), gr.Textbox(label="Grade")],
    title="Vietnamese Pronunciation Checker",
    description=desc,
    examples=[
        ["./example.flac", "transcribe", False],
        ["./example.flac", "transcribe", True],
    ],
    cache_examples=True,
    allow_flagging="never",
)

with demo:
    gr.TabbedInterface([mic_transcribe, file_transcribe], ["Pronounce via Microphone", "Pronounce via Audio File"])

demo.launch()