|
import os |
|
import time |
|
import warnings |
|
from pathlib import Path |
|
|
|
import gradio as gr |
|
import librosa |
|
import spaces |
|
import torch |
|
from transformers import pipeline, WhisperConfig |
|
from parse_accent import parse_pitch_accent |
|
from surface2katakana_with_acc import katakana_to_phones |
|
warnings.filterwarnings("ignore") |
|
|
|
# True only when the SYSTEM environment variable is exactly "spaces".
# NOTE(review): presumably set by the Hugging Face Spaces runtime — confirm.
is_hf = os.environ.get("SYSTEM") == "spaces"
|
|
|
|
|
# Generation options forwarded to the ASR pipeline on every transcribe() call.
generate_kwargs = dict(max_new_tokens=256)
|
|
|
# Speech-recognition pipeline shared by every request; built once at import.
# The model id is a runtime identifier — its spelling ("japense") must be
# kept exactly as written.
pipe = pipeline(
    task="automatic-speech-recognition",
    model="AkitoP/whisper-large-v3-japense-phone_accent",
    # Use the GPU when CUDA is available, otherwise fall back to the CPU.
    device="cuda" if torch.cuda.is_available() else "cpu",
    chunk_length_s=30,
)
|
|
|
|
|
@spaces.GPU
def transcribe(audio: str) -> tuple:
    """Transcribe an audio file and derive the two accent notations.

    Args:
        audio: Path to the audio file. The ``gr.Audio`` input is created
            with ``type="filepath"``, so this is ``None`` when the user has
            not uploaded or recorded anything.

    Returns:
        A 3-tuple, in the order expected by the three output textboxes:
        the pipeline's ``"text"`` transcription, ``parse_pitch_accent`` of
        that text, and ``katakana_to_phones`` of that text.

    Raises:
        gr.Error: If no audio was supplied.
    """
    # Fail with a readable message instead of an opaque pipeline error.
    if not audio:
        raise gr.Error("Please upload or record audio before transcribing.")

    text = pipe(audio, generate_kwargs=generate_kwargs)["text"]
    print(text)  # keep the raw transcription in the server log
    return text, parse_pitch_accent(text), katakana_to_phones(text)
|
|
|
|
|
# Markdown shown at the top of the demo page (heading plus a short summary).
initial_md = (
    "\n"
    "# Whisper Large V3 Japanese Phone Accent\n"
    "\n"
    "A Whisper model fine-tuned to transcribe Japanese speech into Katakana "
    "with pitch accent annotations. Built on whisper-large-v3-turbo, it uses "
    "a subset (1/20) of the Galgame-Speech dataset and the jsut-5000 dataset."
    "\n"
)
|
|
|
# Build the demo page: intro text, an audio input, a trigger button and one
# textbox for each of the three values returned by transcribe().
with gr.Blocks() as app:
    gr.Markdown(initial_md)

    # type="filepath" means transcribe() receives a path to the audio file.
    audio = gr.Audio(type="filepath")
    transcribe_btn = gr.Button("Transcribe")

    output = gr.Textbox(label="Result")
    output_HL_style = gr.Textbox(label="HL Result (SBV2 style)")
    # NOTE(review): "ββ" looks like a mis-encoded "↑↓" (the variable is named
    # UPDOWN) — confirm against the original file before changing the label.
    output_UPDOWN_style = gr.Textbox(label="ββ Result (GSV style)")

    # The order of `outputs` must match the tuple returned by transcribe().
    transcribe_btn.click(
        fn=transcribe,
        inputs=[audio],
        outputs=[output, output_HL_style, output_UPDOWN_style],
    )

# Start the server and open the page in the default browser.
app.launch(inbrowser=True)