Spaces:
Runtime error
Runtime error
File size: 3,657 Bytes
5ed89cf dde51bf e1a80bb dde51bf 47c3f01 dde51bf 2cb1676 6f88a4c dde51bf 5ed89cf 0b96a3d 2cb1676 e1a80bb dde51bf e2aa46f dde51bf 5ed89cf cb06d8f e1a80bb 5ed89cf 47c3f01 dde51bf e2aa46f dde51bf 5ed89cf cb06d8f e1a80bb 42533cf 47c3f01 36dc535 dde51bf 5ed89cf dde51bf |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 |
import string
import torch
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_read
import gradio as gr
# Vietnamese Whisper checkpoint on the Hugging Face Hub used for transcription.
MODEL_NAME = "vinai/PhoWhisper-large"
# Number of audio chunks the ASR pipeline processes per forward pass.
BATCH_SIZE = 8
# Run on the first CUDA GPU when available, otherwise fall back to CPU.
device = 0 if torch.cuda.is_available() else "cpu"
# Chunked speech-recognition pipeline: audio is split into 30 s windows so
# inputs longer than the model's context can still be transcribed.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)
# Markdown blurb rendered as the description of both Gradio interfaces below.
desc = f"""
This space transcribes Vietnamese words, phrases, and sentences via microphone or audio files then compares the user's text input to what the language model hears.
You will then be given a PASS/FAIL grade to tell you if your spoken audio matches the text you entered.
[{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) is the Vietnamese Speech-to-Text model that powers the analysis of the audio files.
"""
# Copied from https://github.com/openai/whisper/blob/c09a7ae299c4c34c5839a76380ae407e7d785914/whisper/utils.py#L50
def format_timestamp(seconds: float, always_include_hours: bool = False, decimal_marker: str = "."):
    """Render a timestamp in seconds as ``[HH:]MM:SS<marker>mmm``.

    The hours field is emitted only when non-zero, unless
    ``always_include_hours`` is set. ``decimal_marker`` separates whole
    seconds from milliseconds. A ``None`` input (a malformed timestamp)
    is returned unchanged.
    """
    if seconds is None:
        # we have a malformed timestamp so just return it as is
        return seconds
    total_ms = round(seconds * 1000.0)
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    whole_seconds, millis = divmod(remainder, 1_000)
    prefix = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
    return f"{prefix}{minutes:02d}:{whole_seconds:02d}{decimal_marker}{millis:03d}"
def transcribe(file, og_text, return_timestamps):
    """Transcribe an audio file and grade it against the user's typed text.

    Parameters
    ----------
    file : str
        Path to the recorded/uploaded audio (Gradio supplies a filepath).
    og_text : str
        The word/phrase the user claims to have spoken.
    return_timestamps : bool
        When True, the displayed transcript includes per-chunk timestamps.

    Returns
    -------
    tuple[str, str]
        (transcript shown to the user, pass/fail-style grade string).
    """
    outputs = pipe(file, batch_size=BATCH_SIZE, return_timestamps=return_timestamps)
    transcript = outputs["text"]
    display_text = transcript
    if return_timestamps:
        display_text = "\n".join(
            f"[{format_timestamp(chunk['timestamp'][0])} -> "
            f"{format_timestamp(chunk['timestamp'][1])}] {chunk['text']}"
            for chunk in outputs["chunks"]
        )
    # Bug fix: grade against the raw transcript, not the timestamped display
    # text -- previously the timestamp markup was part of the comparison, so
    # the grade could essentially never match when timestamps were requested.
    # Punctuation, case, and surrounding whitespace are ignored.
    transcript_nopunc = transcript.translate(
        str.maketrans('', '', string.punctuation)
    ).strip()
    if transcript_nopunc.lower() == og_text.strip().lower():
        grade = "good!"
    else:
        grade = "could use some work..."
    return display_text, grade
# Container app; the two interfaces are mounted into it as tabs (see bottom of file).
demo = gr.Blocks()

# Tab 1: record pronunciation via microphone and grade it against typed text.
# NOTE(review): gr.inputs.*, layout=, and theme= are the legacy Gradio 2.x
# API -- this assumes the Space pins an old gradio version; confirm.
mic_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.inputs.Audio(source="microphone", type="filepath", optional=True),
        gr.inputs.Textbox(label="Word/Phrase"),
        gr.inputs.Checkbox(default=False, label="Return timestamps"),
    ],
    outputs=[gr.Textbox(label="What I heard..."), gr.Textbox(label="Grade")],
    layout="vertical",
    theme="huggingface",
    # Fix: user-facing typo "Pronounciation" -> "Pronunciation".
    title="Vietnamese Pronunciation Checker",
    description=(desc),
    allow_flagging="never",
)
# Tab 2: upload an audio file instead of recording; same transcribe + grade flow.
# NOTE(review): uses the legacy Gradio 2.x component API, matching the
# microphone interface above -- confirm the pinned gradio version.
file_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.inputs.Audio(source="upload", optional=True, label="Audio file", type="filepath"),
        gr.inputs.Textbox(label="Word/Phrase"),
        gr.inputs.Checkbox(default=False, label="Return timestamps"),
    ],
    outputs=[gr.Textbox(label="What I heard..."), gr.Textbox(label="Grade")],
    layout="vertical",
    theme="huggingface",
    # Fix: user-facing typo "Pronounciation" -> "Pronunciation".
    title="Vietnamese Pronunciation Checker",
    description=(desc),
    examples=[
        ["./example.flac", "transcribe", False],
        ["./example.flac", "transcribe", True],
    ],
    cache_examples=True,
    allow_flagging="never",
)
# Mount both interfaces as tabs inside the Blocks app and start the server.
# Fix: removed a stray trailing "|" (page-scrape residue) after launch() that
# made this line a syntax error.
with demo:
    gr.TabbedInterface(
        [mic_transcribe, file_transcribe],
        ["Pronounce via Microphone", "Pronounce via Audio File"],
    )
# enable_queue serializes requests so long transcriptions don't time out.
demo.launch(enable_queue=True)