Spaces:

AkitoP
/

whisper-japanese-phone-demo

Running

Update app.py

ab2fef5 verified 2 months ago

1.59 kB

	import os
	import time
	import warnings
	from pathlib import Path

	import gradio as gr
	import librosa
	import spaces
	import torch
	from transformers import pipeline, WhisperConfig
	from parse_accent import parse_pitch_accent
	from surface2katakana_with_acc import katakana_to_phones
	# Install a blanket "ignore" filter so warnings issued through the
	# `warnings` module after this point are not shown in the app's logs.
	warnings.filterwarnings("ignore")

	# True when the SYSTEM environment variable equals "spaces"
	# (presumably set by the Hugging Face Spaces runtime -- confirm).
	is_hf = os.environ.get("SYSTEM") == "spaces"
	# Reference: litagin / galgame-whisper-wip

	# Generation options forwarded to the pipeline on every transcription call.
	generate_kwargs = dict(max_new_tokens=256)

	# Build the speech-recognition pipeline once at import time so every
	# request reuses the same loaded model. CUDA is used when torch reports
	# it as available; otherwise the pipeline runs on the CPU.
	pipe = pipeline(
	    task="automatic-speech-recognition",
	    model="AkitoP/whisper-large-v3-japense-phone_accent",
	    device="cpu" if not torch.cuda.is_available() else "cuda",
	    chunk_length_s=30,
	)


	@spaces.GPU
	def transcribe(audio: str) -> tuple:
	    """Transcribe an audio file and derive the two accent notations.

	    Args:
	        audio: Path of the audio file to transcribe, as delivered by the
	            ``gr.Audio(type="filepath")`` input. Gradio passes ``None``
	            when the user has not recorded or uploaded anything.

	    Returns:
	        A 3-tuple, in the order expected by the click handler's outputs:
	        the raw text produced by the pipeline, the value returned by
	        ``parse_pitch_accent`` for that text, and the value returned by
	        ``katakana_to_phones`` for that text.

	    Raises:
	        gr.Error: If no audio was supplied, so the UI shows a readable
	            message instead of an opaque pipeline failure.
	    """
	    # Guard the empty-input case before touching the model.
	    if not audio:
	        raise gr.Error("Please record or upload an audio clip first.")

	    result = pipe(audio, generate_kwargs=generate_kwargs)["text"]
	    # Echo the raw transcription to the process log.
	    print(result)
	    return result, parse_pitch_accent(result), katakana_to_phones(result)


	# Markdown introduction for the demo page; passed to gr.Markdown when the
	# UI is built.
	initial_md = """
	# Whisper Large V3 Japanese Phone Accent

	A Whisper model fine-tuned to transcribe Japanese speech into Katakana with pitch accent annotations. Built on whisper-large-v3-turbo, it uses a subset (1/20) of the Galgame-Speech dataset and the jsut-5000 dataset.
	"""

	# Page layout: intro text, one audio input, a trigger button, and three
	# text boxes that receive the three values returned by `transcribe`.
	with gr.Blocks() as app:
	    gr.Markdown(value=initial_md)
	    audio = gr.Audio(type="filepath")
	    transcribe_btn = gr.Button(value="Transcribe")
	    output = gr.Textbox(label="Result")
	    output_HL_style = gr.Textbox(label="HL Result (SBV2 style)")
	    output_UPDOWN_style = gr.Textbox(label="↑↓ Result (GSV style)")
	    # Output order must match the order of the tuple `transcribe` returns.
	    transcribe_btn.click(
	        fn=transcribe,
	        inputs=[audio],
	        outputs=[
	            output,
	            output_HL_style,
	            output_UPDOWN_style,
	        ],
	    )

	# Start the web server and ask Gradio to open the page in a browser tab.
	app.launch(inbrowser=True)