Spaces:

AkitoP
/

whisper-japanese-phone-demo

Running

App Files Community

AkitoP committed on Oct 15, 2024

Commit

89f31d9

verified ·

1 Parent(s): 485039d

Upload app.py

Browse files

Files changed (1) hide show

app.py +63 -0

app.py ADDED Viewed

	@@ -0,0 +1,63 @@

+import os
+import time
+import warnings
+from pathlib import Path
+import gradio as gr
+import librosa
+import spaces
+import torch
+from loguru import logger
+from transformers import pipeline
+warnings.filterwarnings("ignore")
+is_hf = os.getenv("SYSTEM") == "spaces"
+# reference from litagin / galgame-whisper-wip
+generate_kwargs = {
+    "language": "Japanese",
+    "max_new_tokens": 256,
+}
+pipe = pipeline(
+    "automatic-speech-recognition",
+    model="AkitoP/whisper-large-v3-japense-phone_accent",
+    device="cuda" if torch.cuda.is_available() else "cpu",
+)
+@spaces.GPU
+def transcribe(audio: str, model: str) -> tuple[str, float]:
+    if not audio:
+        return "No audio file", 0
+    filename = Path(audio).name
+    logger.info(f"Model: {model}")
+    logger.info(f"Audio: {filename}")
+    # Read and resample audio to 16kHz
+    y, sr = librosa.load(audio, mono=True, sr=16000)
+    # Get duration of audio
+    duration = librosa.get_duration(y=y, sr=sr)
+    logger.info(f"Duration: {duration:.2f}s")
+    start_time = time.time()
+    result = pipe(y, generate_kwargs=generate_kwargs)["text"]
+    end_time = time.time()
+    return result
+initial_md = """
+# Whisper Large V3 Japanese Phone Accent
+A Whisper model fine-tuned to transcribe Japanese speech into Katakana with pitch accent annotations. Built on whisper-large-v3-turbo, it uses a subset (1/20) of the Galgame-Speech dataset and the jsut-5000 dataset.
+"""
+with gr.Blocks() as app:
+    gr.Markdown(initial_md)
+    audio = gr.Audio(type="filepath")
+    transcript = gr.Button("Transcribe with Galgame-Whisper (WIP)")
+    output = gr.Textbox(label="Result")
+    transcript.click(transcribe(audio=audio, model="AkitoP/whisper-large-v3-japense-phone_accent"), outputs=[output])
+    # app.load(warmup, inputs=[], outputs=[warmup_result], queue=True)
+app.launch(inbrowser=True)