import os
import time
import warnings
from pathlib import Path
import gradio as gr
import librosa
import spaces
import torch
from transformers import pipeline, WhisperConfig
from parse_accent import parse_pitch_accent
from surface2katakana_with_acc import katakana_to_phones
warnings.filterwarnings("ignore")
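
# True when the app runs on Hugging Face Spaces (the platform sets SYSTEM=spaces).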
is_hf = os.getenv("SYSTEM") == "spaces"
# generation settings referenced from litagin/galgame-whisper-wip
generate_kwargs = {
    "max_new_tokens": 256,
}
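
# Load the fine-tuned checkpoint as a transformers ASR pipeline; long audio is
# processed in 30-second chunks, and a GPU is used when one is available.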
pipe = pipeline(
    "automatic-speech-recognition",
    model="AkitoP/whisper-large-v3-japense-phone_accent",
    chunk_length_s=30,
    device="cuda" if torch.cuda.is_available() else "cpu",
)
@spaces.GPU
def transcribe(audio: str) -> tuple[str, str, str]:
    # Run ASR, then derive the two downstream accent notations from the katakana output.
    result = pipe(audio, generate_kwargs=generate_kwargs)["text"]
    print(result)
    return result, parse_pitch_accent(result), katakana_to_phones(result)
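
# Minimal local usage sketch (assumption: "sample.wav" is a hypothetical audio
# path; the Gradio UI below is the intended entry point):
#
#   kana, hl_accent, updown_accent = transcribe("sample.wav")
#   # kana          -> katakana transcription with accent annotations
#   # hl_accent     -> H/L pitch pattern (SBV2 style)
#   # updown_accent -> ↑/↓ pitch pattern (GSV style)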
initial_md = """
# Whisper Large V3 Japanese Phone Accent
A Whisper model fine-tuned to transcribe Japanese speech into katakana with pitch-accent annotations. Built on whisper-large-v3-turbo, it was trained on a 1/20 subset of the Galgame-Speech dataset together with the jsut-5000 dataset.
"""
with gr.Blocks() as app:
    gr.Markdown(initial_md)
    audio = gr.Audio(type="filepath")
    transcribe_btn = gr.Button("Transcribe")
    output = gr.Textbox(label="Result")
    output_HL_style = gr.Textbox(label="HL Result (SBV2 style)")
    output_UPDOWN_style = gr.Textbox(label="↑↓ Result (GSV style)")
    transcribe_btn.click(
        fn=transcribe,
        inputs=[audio],
        outputs=[output, output_HL_style, output_UPDOWN_style],
    )
app.launch(inbrowser=True)