asahi417 committed on
Commit 714e414 • 1 Parent(s): 5befb0c
Files changed (5)
  1. README.md +8 -1
  2. app.py +85 -0
  3. packages.txt +1 -0
  4. requirements.txt +6 -0
  5. sample_diarization_japanese.mp3 +0 -0
README.md CHANGED
@@ -4,9 +4,16 @@ emoji: πŸ‘
  colorFrom: gray
  colorTo: pink
  sdk: gradio
- sdk_version: 5.1.0
+ sdk_version: 4.39.0
  app_file: app.py
  pinned: false
+ hf_oauth: true
+ hf_oauth_expiration_minutes: 480
+ hf_oauth_scopes:
+ - read-repos
+ - write-repos
+ - manage-repos
+ - inference-api
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
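The added `hf_oauth` keys enable "Sign in with Hugging Face" for the Space, with 8-hour tokens scoped to repo access and the Inference API. Nothing in this commit consumes the login yet; below is a minimal sketch of how app code could read it, assuming Gradio's `gr.LoginButton` / `gr.OAuthProfile` helpers (neither is used in this commit):

```python
from typing import Optional

import gradio as gr

def whoami(profile: Optional[gr.OAuthProfile]) -> str:
    # Gradio injects the OAuth profile based on the type annotation;
    # it is None until the visitor signs in.
    if profile is None:
        return "Not signed in."
    return f"Signed in as {profile.username}"

with gr.Blocks() as demo:
    gr.LoginButton()
    status = gr.Markdown()
    demo.load(whoami, inputs=None, outputs=status)

demo.launch()
```

The injection only works when the app runs inside a Space with `hf_oauth: true`.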
app.py ADDED
@@ -0,0 +1,85 @@
+ import os
+ from math import floor
+ from typing import Optional
+
+ import spaces
+ import torch
+ import gradio as gr
+ from transformers import pipeline
+ from transformers.pipelines.audio_utils import ffmpeg_read
+
+ # config
+ model_name = "kotoba-tech/kotoba-whisper-v2.2"
+ example_file = "sample_diarization_japanese.mp3"
+
+ # device setting
+ if torch.cuda.is_available():
+     torch_dtype = torch.bfloat16
+     device = "cuda"
+     model_kwargs = {'attn_implementation': 'sdpa'}
+ else:
+     torch_dtype = torch.float32
+     device = "cpu"
+     model_kwargs = {}
+
+ # define the pipeline
+ pipe = pipeline(
+     model=model_name,
+     chunk_length_s=15,
+     batch_size=16,
+     torch_dtype=torch_dtype,
+     device=device,
+     model_kwargs=model_kwargs,
+     trust_remote_code=True
+ )
+ sampling_rate = pipe.feature_extractor.sampling_rate
+
+
+ def format_time(start: Optional[float], end: Optional[float]):
+
+     def _format_time(seconds: Optional[float]):
+         if seconds is None:
+             return "[no timestamp available]"
+         hours = floor(seconds / 3600)
+         minutes = floor(seconds / 60) % 60  # minutes within the current hour
+         seconds = seconds - hours * 3600 - minutes * 60
+         m_seconds = floor(round(seconds - floor(seconds), 1) * 10)  # tenths of a second
+         seconds = floor(seconds)
+         return f'{minutes:02}:{seconds:02}.{m_seconds:01}'
+
+     return f"[{_format_time(start)} -> {_format_time(end)}]:"
+
+
+ @spaces.GPU
+ def get_prediction(inputs):
+     return pipe(inputs, generate_kwargs={"language": "ja", "task": "transcribe"})
+
+
+ def transcribe(inputs: str):
+     if inputs is None:
+         raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
+     with open(inputs, "rb") as f:
+         inputs = f.read()
+     prediction = get_prediction({"array": ffmpeg_read(inputs, sampling_rate), "sampling_rate": sampling_rate})
+     output = ""
+     for n, s in enumerate(prediction["speakers"]):
+         text_timestamped = "\n".join([f"- **{format_time(*c['timestamp'])}** {c['text']}" for c in prediction[f"chunks/{s}"]])
+         output += f'### Speaker {n+1} \n{text_timestamped}\n'
+     return output
+
+
+ description = (f"Transcribe and diarize long-form microphone or audio inputs with the click of a button! Demo uses "
+                f"Kotoba-Whisper [{model_name}](https://huggingface.co/{model_name}).")
+ title = f"Audio Transcription and Diarization with {os.path.basename(model_name)}"
+ shared_config = {"fn": transcribe, "title": title, "description": description, "allow_flagging": "never", "examples": [example_file]}
+ o_upload = gr.Markdown()
+ o_mic = gr.Markdown()
+ i_upload = gr.Interface(
+     inputs=[gr.Audio(sources="upload", type="filepath", label="Audio file")], outputs=gr.Markdown(), **shared_config
+ )
+ i_mic = gr.Interface(
+     inputs=[gr.Audio(sources="microphone", type="filepath", label="Microphone input")], outputs=gr.Markdown(), **shared_config
+ )
+ with gr.Blocks() as demo:
+     gr.TabbedInterface([i_upload, i_mic], ["Audio file", "Microphone"])
+ demo.queue(api_open=False, default_concurrency_limit=40).launch(show_api=False, show_error=True)
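For reference, the `speakers` and `chunks/{speaker}` keys that `transcribe` reads are produced by the model's remote-code pipeline. A minimal local sketch of the same call, assuming the pipeline accepts a file path directly the way stock ASR pipelines do (no `spaces.GPU` wrapper is needed off Spaces):

```python
# Local smoke test for the diarization pipeline defined in app.py.
import torch
from transformers import pipeline

pipe = pipeline(
    model="kotoba-tech/kotoba-whisper-v2.2",
    torch_dtype=torch.float32,
    device="cpu",
    trust_remote_code=True,
)
result = pipe("sample_diarization_japanese.mp3",
              generate_kwargs={"language": "ja", "task": "transcribe"})
# Print each speaker's chunks with their raw start/end timestamps.
for speaker in result["speakers"]:
    for chunk in result[f"chunks/{speaker}"]:
        start, end = chunk["timestamp"]
        print(f"{speaker} [{start} -> {end}]: {chunk['text']}")
```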
packages.txt ADDED
@@ -0,0 +1 @@
+ ffmpeg
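The single entry installs the `ffmpeg` system binary via apt on Spaces; `ffmpeg_read` in app.py shells out to it to decode uploaded audio. A one-line sanity check for local runs:

```python
import shutil

# ffmpeg_read spawns the ffmpeg executable, so it must be on PATH
assert shutil.which("ffmpeg") is not None, "ffmpeg binary not found on PATH"
```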
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ git+https://github.com/huggingface/transformers
+ git+https://github.com/huggingface/diarizers
+ torchaudio
+ torch
+ punctuators==0.0.5
+ pyannote.audio
sample_diarization_japanese.mp3 ADDED
Binary file (780 kB)