import os
import time
import warnings
from pathlib import Path

import gradio as gr
import librosa
import spaces
import torch
from transformers import pipeline, WhisperConfig

from parse_accent import parse_pitch_accent
from surface2katakana_with_acc import katakana_to_phones

warnings.filterwarnings("ignore")

# True when the SYSTEM environment variable is set to "spaces".
is_hf = os.getenv("SYSTEM") == "spaces"

# Reference: litagin / galgame-whisper-wip
# Extra keyword arguments forwarded to generation on every pipeline call.
generate_kwargs = {
    "max_new_tokens": 256,
}

# The ASR pipeline is built once at import time and reused by every request.
pipe = pipeline(
    "automatic-speech-recognition",
    model="AkitoP/whisper-large-v3-japense-phone_accent",
    chunk_length_s=30,
    device="cuda" if torch.cuda.is_available() else "cpu",
)


@spaces.GPU
def transcribe(audio: str) -> tuple:
    """Transcribe an audio file and derive the two accent notations.

    Args:
        audio: Path of the audio file to transcribe (the UI supplies it via
            ``gr.Audio(type="filepath")``). Gradio passes ``None`` when no
            clip has been recorded or uploaded.

    Returns:
        A 3-tuple, one element per output textbox, in order:
        the raw text produced by the pipeline, ``parse_pitch_accent(text)``
        and ``katakana_to_phones(text)``.

    Raises:
        gr.Error: If ``audio`` is empty/``None``; shown to the user in the UI
            instead of an opaque pipeline failure.
    """
    if not audio:
        raise gr.Error("Please record or upload an audio clip first.")

    text = pipe(audio, generate_kwargs=generate_kwargs)["text"]
    # Echo the raw transcription to stdout so it shows up in the app logs.
    print(text)
    return text, parse_pitch_accent(text), katakana_to_phones(text)


initial_md = """
# Whisper Large V3 Japanese Phone Accent

A Whisper model fine-tuned to transcribe Japanese speech into Katakana with pitch accent annotations. Built on whisper-large-v3-turbo, it uses a subset (1/20) of the Galgame-Speech dataset and the jsut-5000 dataset.
"""

with gr.Blocks() as app:
    gr.Markdown(initial_md)
    audio = gr.Audio(type="filepath")
    transcribe_btn = gr.Button("Transcribe")
    output = gr.Textbox(label="Result")
    output_HL_style = gr.Textbox(label="HL Result (SBV2 style)")
    output_UPDOWN_style = gr.Textbox(label="↑↓ Result (GSV style)")
    # The three outputs line up with the 3-tuple returned by transcribe().
    transcribe_btn.click(
        fn=transcribe,
        inputs=[audio],
        outputs=[output, output_HL_style, output_UPDOWN_style],
    )

app.launch(inbrowser=True)