asahi417 committed
Commit b683426 · Parent(s): dbdfadb
Files changed (1):
  1. app.py +56 -32
app.py CHANGED
@@ -5,35 +5,24 @@ from typing import Optional
 import spaces
 import torch
 import gradio as gr
-import numpy as np
 from transformers import pipeline
 from transformers.pipelines.audio_utils import ffmpeg_read

 # config
 model_name = "kotoba-tech/kotoba-whisper-v2.2"
 example_file = "sample_diarization_japanese.mp3"
-
-# device setting
 if torch.cuda.is_available():
-    torch_dtype = torch.bfloat16
-    device = "cuda"
-    model_kwargs = {'attn_implementation': 'sdpa'}
+    pipe = pipeline(
+        model=model_name,
+        chunk_length_s=15,
+        batch_size=16,
+        torch_dtype=torch.bfloat16,
+        device="cuda",
+        model_kwargs={'attn_implementation': 'sdpa'},
+        trust_remote_code=True
+    )
 else:
-    torch_dtype = torch.float32
-    device = "cpu"
-    model_kwargs = {}
-
-# define the pipeline
-pipe = pipeline(
-    model=model_name,
-    chunk_length_s=15,
-    batch_size=16,
-    torch_dtype=torch_dtype,
-    device=device,
-    model_kwargs=model_kwargs,
-    trust_remote_code=True
-)
-sampling_rate = pipe.feature_extractor.sampling_rate
+    pipe = pipeline(model=model_name, chunk_length_s=15, batch_size=16, trust_remote_code=True)


 def format_time(start: Optional[float], end: Optional[float]):
@@ -52,23 +41,35 @@ def format_time(start: Optional[float], end: Optional[float]):


 @spaces.GPU
-def get_prediction(inputs):
-    return pipe(inputs, generate_kwargs={"language": "ja", "task": "transcribe"})
+def get_prediction(inputs, **kwargs):
+    return pipe(inputs, **kwargs)


-def transcribe(inputs: str):
+def transcribe(inputs: str,
+               add_punctuation: bool,
+               num_speakers: Optional[float],
+               min_speakers: Optional[float],
+               max_speakers: Optional[float],
+               add_silence_end: Optional[float],
+               add_silence_start: Optional[float]):
     if inputs is None:
         raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
     with open(inputs, "rb") as f:
         inputs = f.read()
-    inputs = ffmpeg_read(inputs, sampling_rate)
-    array_pad = np.zeros(int(pipe.feature_extractor.sampling_rate * 0.5))
-    inputs = np.concatenate([array_pad, inputs, array_pad])
-    prediction = get_prediction({"array": inputs, "sampling_rate": sampling_rate})
+    array = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
+    prediction = get_prediction(
+        inputs={"array": array, "sampling_rate": pipe.feature_extractor.sampling_rate},
+        add_punctuation=add_punctuation,
+        num_speakers=int(num_speakers) if num_speakers != 0 else None,
+        min_speakers=int(min_speakers) if min_speakers != 0 else None,
+        max_speakers=int(max_speakers) if max_speakers != 0 else None,
+        add_silence_end=add_silence_end if add_silence_end != 0 else None,
+        add_silence_start=add_silence_start if add_silence_start != 0 else None
+    )
     output = ""
-    for n, s in enumerate(prediction["speakers"]):
+    for n, s in enumerate(prediction["speaker_ids"]):
         text_timestamped = "\n".join([f"- **{format_time(*c['timestamp'])}** {c['text']}" for c in prediction[f"chunks/{s}"]])
-        output += f'### Speaker {n+1} \n{text_timestamped}\n'
+        output += f'### Speaker {n+1} \n{prediction[f"text/{s}"]}\n\n{text_timestamped}\n'
     return output


@@ -78,11 +79,34 @@ title = f"Audio Transcription and Diarization with {os.path.basename(model_name)}"
 shared_config = {"fn": transcribe, "title": title, "description": description, "allow_flagging": "never", "examples": [example_file]}
 o_upload = gr.Markdown()
 o_mic = gr.Markdown()
+options = [
+
+]
 i_upload = gr.Interface(
-    inputs=[gr.Audio(sources="upload", type="filepath", label="Audio file")], outputs=gr.Markdown(), **shared_config
+    inputs=[
+        gr.Audio(sources="upload", type="filepath", label="Audio file"),
+        gr.Checkbox(label="add punctuation", value=True),
+        gr.Slider(0, 10, label="num speakers (set 0 for auto-detect mode)", value=0, step=1),
+        gr.Slider(0, 10, label="min speakers (set 0 for auto-detect mode)", value=0, step=1),
+        gr.Slider(0, 10, label="max speakers (set 0 for auto-detect mode)", value=0, step=1),
+        gr.Slider(0, 0.5, label="silence at the end", value=0.5, step=0.05),
+        gr.Slider(0, 0.5, label="silence at the start", value=0.5, step=0.05),
+    ],
+    outputs=gr.Markdown(),
+    **shared_config
 )
 i_mic = gr.Interface(
-    inputs=[gr.Audio(sources="microphone", type="filepath", label="Microphone input")], outputs=gr.Markdown(), **shared_config
+    inputs=[
+        gr.Audio(sources="microphone", type="filepath", label="Microphone input"),
+        gr.Checkbox(label="add punctuation", value=True),
+        gr.Slider(0, 10, label="num speakers (set 0 for auto-detect mode)", value=0, step=1),
+        gr.Slider(0, 10, label="min speakers (set 0 for auto-detect mode)", value=0, step=1),
+        gr.Slider(0, 10, label="max speakers (set 0 for auto-detect mode)", value=0, step=1),
+        gr.Slider(0, 0.5, label="silence at the end", value=0.5, step=0.05),
+        gr.Slider(0, 0.5, label="silence at the start", value=0.5, step=0.05),
+    ],
+    outputs=gr.Markdown(),
+    **shared_config
 )
 with gr.Blocks() as demo:
     gr.TabbedInterface([i_upload, i_mic], ["Audio file", "Microphone"])
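
For context, a minimal sketch of driving the updated pipeline outside the Gradio UI. The call-time keyword arguments and the prediction keys (speaker_ids, text/{s}, chunks/{s}) are taken from the diff above; passing a file path directly to the pipeline and the exact CPU fallback behavior are assumptions about the model's custom pipeline code (loaded via trust_remote_code=True), not something this commit exercises.

import torch
from transformers import pipeline

# Build the pipeline the same way the new app.py does (GPU branch shown;
# on CPU the commit drops torch_dtype, device, and model_kwargs).
pipe = pipeline(
    model="kotoba-tech/kotoba-whisper-v2.2",
    chunk_length_s=15,
    batch_size=16,
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_kwargs={"attn_implementation": "sdpa"},
    trust_remote_code=True,
)

# Keyword arguments mirror transcribe(), where a slider value of 0 is mapped
# to None for auto-detection of the speaker count.
prediction = pipe(
    "sample_diarization_japanese.mp3",  # assumed: the custom pipeline also accepts a path
    add_punctuation=True,
    num_speakers=None,
    min_speakers=None,
    max_speakers=None,
    add_silence_start=0.5,  # replaces the manual np.zeros() padding removed by this commit
    add_silence_end=0.5,
)

# Per-speaker results, keyed as in the updated transcribe() loop.
for s in prediction["speaker_ids"]:
    print(prediction[f"text/{s}"])
    for c in prediction[f"chunks/{s}"]:
        print(c["timestamp"], c["text"])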