"""Gradio demo for a Whisper model that transcribes Japanese speech.

The script loads the ``AkitoP/whisper-large-v3-japense-phone_accent``
automatic-speech-recognition pipeline once at import time, exposes a single
``transcribe`` callback, and launches a small Gradio UI around it.
"""

import os
import time
import warnings
from pathlib import Path
from typing import Optional

import gradio as gr
import librosa
import spaces
import torch
from transformers import pipeline

warnings.filterwarnings("ignore")

# True when the SYSTEM environment variable equals "spaces".
is_hf = os.getenv("SYSTEM") == "spaces"

# reference from litagin / galgame-whisper-wip
generate_kwargs = {
    "max_new_tokens": 256,
}

# Built once at module load so every request reuses the same pipeline object.
pipe = pipeline(
    "automatic-speech-recognition",
    model="AkitoP/whisper-large-v3-japense-phone_accent",
    device="cuda" if torch.cuda.is_available() else "cpu",
)


@spaces.GPU
def transcribe(audio: Optional[str]) -> str:
    """Transcribe an audio file with the module-level ASR pipeline.

    Args:
        audio: Path to the audio file, as supplied by the
            ``gr.Audio(type="filepath")`` component. ``None`` when the user
            has not provided any audio.

    Returns:
        The ``"text"`` field of the pipeline result.

    Raises:
        gr.Error: If no audio was provided, so the UI shows a readable
            message instead of an opaque pipeline failure.
    """
    if audio is None:
        raise gr.Error("Please upload or record an audio clip before transcribing.")

    result = pipe(audio, generate_kwargs=generate_kwargs)["text"]
    return result


# The blank line after the heading keeps the description out of the <h1>.
initial_md = """
# Whisper Large V3 Japanese Phone Accent

A Whisper model fine-tuned to transcribe Japanese speech into Katakana with pitch accent annotations. Built on whisper-large-v3-turbo, it uses a subset (1/20) of the Galgame-Speech dataset and the jsut-5000 dataset.
"""

with gr.Blocks() as app:
    gr.Markdown(initial_md)
    audio = gr.Audio(type="filepath")
    transcribe_btn = gr.Button("Transcribe")
    output = gr.Textbox(label="Result")

    transcribe_btn.click(fn=transcribe, inputs=[audio], outputs=[output])

app.launch(inbrowser=True)