AkitoP committed on
Commit
89f31d9
·
verified ·
1 Parent(s): 485039d

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +63 -0
app.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+ import warnings
4
+ from pathlib import Path
5
+
6
+ import gradio as gr
7
+ import librosa
8
+ import spaces
9
+ import torch
10
+ from loguru import logger
11
+ from transformers import pipeline
12
+
13
# Suppress every Python warning for the lifetime of the process.
warnings.filterwarnings("ignore")

# True when the SYSTEM environment variable is exactly "spaces".
is_hf = os.environ.get("SYSTEM") == "spaces"
# reference from litagin / galgame-whisper-wip

# Keyword arguments forwarded to the pipeline's generation step on each call.
generate_kwargs = dict(language="Japanese", max_new_tokens=256)

# Build the speech-recognition pipeline once at import time; it is placed on
# "cuda" when torch reports a CUDA device and on "cpu" otherwise.
pipe = pipeline(
    task="automatic-speech-recognition",
    model="AkitoP/whisper-large-v3-japense-phone_accent",
    device="cuda" if torch.cuda.is_available() else "cpu",
)
28
+
29
+
30
@spaces.GPU
def transcribe(audio: str, model: str) -> str:
    """Transcribe an audio file with the module-level ``pipe``.

    Args:
        audio: Path of the audio file to transcribe. A falsy value (for
            example when nothing was uploaded) short-circuits the call.
        model: Model identifier. It is only written to the log; the
            module-level ``pipe`` is always the one that runs.

    Returns:
        The ``"text"`` field of the pipeline result, or the string
        ``"No audio file"`` when ``audio`` is falsy.
    """
    if not audio:
        # Return a single string, the same shape as the normal path, so a
        # single output component can display it.
        return "No audio file"

    logger.info(f"Model: {model}")
    logger.info(f"Audio: {Path(audio).name}")

    # Read the file as mono and resample it to 16 kHz.
    y, sr = librosa.load(audio, mono=True, sr=16000)
    duration = librosa.get_duration(y=y, sr=sr)
    logger.info(f"Duration: {duration:.2f}s")

    # perf_counter is monotonic, so the measured interval cannot go negative
    # if the wall clock is adjusted during inference.
    start_time = time.perf_counter()
    result = pipe(y, generate_kwargs=generate_kwargs)["text"]
    elapsed = time.perf_counter() - start_time
    logger.info(f"Transcription took {elapsed:.2f}s")
    return result
46
+
47
+
48
# Markdown rendered at the top of the page.
initial_md = """
# Whisper Large V3 Japanese Phone Accent

A Whisper model fine-tuned to transcribe Japanese speech into Katakana with pitch accent annotations. Built on whisper-large-v3-turbo, it uses a subset (1/20) of the Galgame-Speech dataset and the jsut-5000 dataset.
"""

with gr.Blocks() as app:
    gr.Markdown(initial_md)
    audio = gr.Audio(type="filepath")
    # Constant model id, supplied to the handler as its second input.
    model_name = gr.State("AkitoP/whisper-large-v3-japense-phone_accent")
    transcript = gr.Button("Transcribe with Galgame-Whisper (WIP)")
    output = gr.Textbox(label="Result")
    # Register the callable itself. Calling transcribe(...) here would run it
    # once while the UI is being built, with the Audio component object in
    # place of a file path, and would register its return value as the handler.
    transcript.click(
        transcribe,
        inputs=[audio, model_name],
        outputs=[output],
    )


# app.load(warmup, inputs=[], outputs=[warmup_result], queue=True)
app.launch(inbrowser=True)