asahi417 committed
Commit 4f38470
1 Parent(s): 429df62
README.md CHANGED
@@ -9,7 +9,7 @@ tags:
 widget:
 - example_title: Sample 1
   src: >-
-    https://huggingface.co/datasets/japanese-asr/ja_asr.common_voice_8_0/resolve/main/sample.flac
+    https://huggingface.co/kotoba-tech/kotoba-whisper-v2.2/resolve/main/sample_audio/sample_diarization_japanese.mp3
 pipeline_tag: automatic-speech-recognition
 ---
 
@@ -58,8 +58,9 @@ install the latest version of Transformers.
 ```bash
 pip install --upgrade pip
 pip install --upgrade transformers accelerate torchaudio
-pip install stable-ts==2.16.0
-pip install punctuators==0.0.5
+pip install "punctuators==0.0.5"
+pip install "pyannote.audio"
+pip install git+https://github.com/huggingface/diarizers.git
 ```
 
 ### Transcription
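
The install block now pulls in punctuators, pyannote.audio, and diarizers instead of stable-ts. For orientation, a minimal usage sketch of the resulting pipeline, mirroring pipeline/test_pipeline.py later in this commit (the chunk_length_s and batch_size values are simply the ones used there):

```python
from pprint import pprint
from transformers.pipelines import pipeline

# The diarization-aware pipeline class ships inside the model repo,
# so trust_remote_code=True is required when loading it.
pipe = pipeline(
    model="kotoba-tech/kotoba-whisper-v2.2",
    chunk_length_s=None,
    batch_size=16,
    trust_remote_code=True,
)

# Transcribe and diarize the sample audio referenced by the new widget URL.
output = pipe("sample_diarization_japanese.mp3")
pprint(output)
```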
pipeline/kotoba_whisper.py CHANGED
@@ -13,6 +13,7 @@ from transformers.feature_extraction_sequence_utils import SequenceFeatureExtrac
 from pyannote.audio import Pipeline
 from pyannote.core.annotation import Annotation
 from punctuators.models import PunctCapSegModelONNX
+from diarizers import SegmentationModel
 
 
 class Punctuator:
@@ -45,25 +46,29 @@ class Punctuator:
 
 class SpeakerDiarization:
 
-    def __init__(self, model_id: str, device: torch.device):
+    def __init__(self,
+                 device: torch.device,
+                 model_id: str = "pyannote/speaker-diarization-3.1",
+                 model_id_diarizers: Optional[str] = None):
         self.device = device
         self.pipeline = Pipeline.from_pretrained(model_id)
         self.pipeline = self.pipeline.to(self.device)
-
-    def __call__(self,
-                 audio: Union[str, torch.Tensor, np.ndarray],
-                 sampling_rate: Optional[int] = None) -> Annotation:
-        if type(audio) is torch.Tensor or type(audio) is np.ndarray:
-            if sampling_rate is None:
-                raise ValueError("sampling_rate must be provided")
-            if type(audio) is np.ndarray:
-                audio = torch.as_tensor(audio)
-            audio = torch.as_tensor(audio, dtype=torch.float32)
-            if len(audio.shape) == 1:
-                audio = audio.unsqueeze(0)
-            elif len(audio.shape) > 3:
-                raise ValueError("audio shape must be (channel, time)")
-            audio = {"waveform": audio.to(self.device), "sample_rate": sampling_rate}
+        if model_id_diarizers:
+            self.pipeline._segmentation.model = SegmentationModel().from_pretrained(
+                model_id_diarizers
+            ).to_pyannote_model().to(self.device)
+
+    def __call__(self, audio: Union[torch.Tensor, np.ndarray], sampling_rate: int) -> Annotation:
+        if sampling_rate is None:
+            raise ValueError("sampling_rate must be provided")
+        if type(audio) is np.ndarray:
+            audio = torch.as_tensor(audio)
+        audio = torch.as_tensor(audio, dtype=torch.float32)
+        if len(audio.shape) == 1:
+            audio = audio.unsqueeze(0)
+        elif len(audio.shape) > 3:
+            raise ValueError("audio shape must be (channel, time)")
+        audio = {"waveform": audio.to(self.device), "sample_rate": sampling_rate}
         output = self.pipeline(audio)
         return output
 
@@ -72,23 +77,28 @@ class KotobaWhisperPipeline(AutomaticSpeechRecognitionPipeline):
 
     def __init__(self,
                  model: "PreTrainedModel",
-                 model_diarizarization: str="pyannote/speaker-diarization-3.1",
+                 model_pyannote: str = "pyannote/speaker-diarization-3.1",
+                 model_diarizers: Optional[str] = "diarizers-community/speaker-segmentation-fine-tuned-callhome-jpn",
                  feature_extractor: Union["SequenceFeatureExtractor", str] = None,
                  tokenizer: Optional[PreTrainedTokenizer] = None,
                  device: Union[int, "torch.device"] = None,
-                 device_diarizarization: Union[int, "torch.device"] = None,
+                 device_pyannote: Union[int, "torch.device"] = None,
                  torch_dtype: Optional[Union[str, "torch.dtype"]] = None,
-                 return_unique_speaker: bool = False,
+                 return_unique_speaker: bool = True,
                  punctuator: bool = False,
                  **kwargs):
         self.type = "seq2seq_whisper"
         if device is None:
             device = "cpu"
-        if device_diarizarization is None:
-            device_diarizarization = device
-        if type(device_diarizarization) is str:
-            device_diarizarization = torch.device(device_diarizarization)
-        self.model_speaker_diarization = SpeakerDiarization(model_diarizarization, device_diarizarization)
+        if device_pyannote is None:
+            device_pyannote = device
+        if type(device_pyannote) is str:
+            device_pyannote = torch.device(device_pyannote)
+        self.model_speaker_diarization = SpeakerDiarization(
+            device=device_pyannote,
+            model_id=model_pyannote,
+            model_id_diarizers=model_diarizers
+        )
         self.return_unique_speaker = return_unique_speaker
         if punctuator:
             self.punctuator = Punctuator()
@@ -308,8 +318,13 @@ class KotobaWhisperPipeline(AutomaticSpeechRecognitionPipeline):
         outputs["text"] = "".join([c["text"] for c in outputs["chunks"]])
         outputs["speakers"] = sd.labels()
         outputs.pop("audio_array")
+        speakers = []
         for s in outputs["speakers"]:
-            outputs[f"text/{s}"] = "".join([c["text"] for c in outputs["chunks"] if s in c["speaker"]])
-            outputs[f"chunks/{s}"] = [c for c in outputs["chunks"] if s in c["speaker"]]
+            chunk_s = [c for c in outputs["chunks"] if s in c["speaker"]]
+            if len(chunk_s) != 0:
+                outputs[f"chunks/{s}"] = chunk_s
+                outputs[f"text/{s}"] = "".join([c["text"] for c in outputs["chunks"] if s in c["speaker"]])
+                speakers.append(s)
+        outputs["speakers"] = speakers
        return outputs
 
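The post-processing hunk above now drops speakers that end up with no chunks and rebuilds outputs["speakers"] accordingly. A hedged illustration of the resulting key layout (values are invented; only the keys follow the code in this hunk, and the per-chunk fields assume the usual Transformers ASR chunk format):

```python
# Illustrative only: speaker labels, timestamps, and text are made up.
output = {
    "text": "こんにちは。はい、そうです。",
    "speakers": ["SPEAKER_00", "SPEAKER_01"],  # only speakers that kept at least one chunk
    "chunks": [
        {"speaker": ["SPEAKER_00"], "timestamp": (0.0, 2.1), "text": "こんにちは。"},
        {"speaker": ["SPEAKER_01"], "timestamp": (2.1, 3.8), "text": "はい、そうです。"},
    ],
    # Per-speaker views, present only when that speaker has chunks:
    "chunks/SPEAKER_00": [{"speaker": ["SPEAKER_00"], "timestamp": (0.0, 2.1), "text": "こんにちは。"}],
    "text/SPEAKER_00": "こんにちは。",
    "chunks/SPEAKER_01": [{"speaker": ["SPEAKER_01"], "timestamp": (2.1, 3.8), "text": "はい、そうです。"}],
    "text/SPEAKER_01": "はい、そうです。",
}
```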
pipeline/push_pipeline.py CHANGED
@@ -1,9 +1,11 @@
+"""wget https://huggingface.co/kotoba-tech/kotoba-whisper-v2.2/resolve/main/sample_audio/sample_diarization_japanese.mp3"""
 from pprint import pprint
 from kotoba_whisper import KotobaWhisperPipeline
 from transformers.pipelines import PIPELINE_REGISTRY, pipeline
 from transformers import WhisperForConditionalGeneration, TFWhisperForConditionalGeneration
 
 
+test_audio = "sample_diarization_japanese.mp3"
 model_alias = "kotoba-tech/kotoba-whisper-v2.2"
 PIPELINE_REGISTRY.register_pipeline(
     "kotoba-whisper",
@@ -11,13 +13,9 @@ PIPELINE_REGISTRY.register_pipeline(
     pt_model=WhisperForConditionalGeneration,
     tf_model=TFWhisperForConditionalGeneration
 )
-test_audio = "/Users/asahiu/Desktop/speaker_diariazation_sample_1.wav"
-pipe = pipeline(task="kotoba-whisper", model="kotoba-tech/kotoba-whisper-v2.0", chunk_length_s=15, batch_size=16, return_unique_speaker=True)
-output = pipe(test_audio)
-pprint(output)
 pipe = pipeline(task="kotoba-whisper", model="kotoba-tech/kotoba-whisper-v2.0", chunk_length_s=15, batch_size=16)
-output = pipe(test_audio)
-pprint(output)
+# output = pipe(test_audio)
+# pprint(output)
 pipe.push_to_hub(model_alias)
 
 
pipeline/test_pipeline.py CHANGED
@@ -1,7 +1,11 @@
+"""wget https://huggingface.co/kotoba-tech/kotoba-whisper-v2.2/resolve/main/sample_audio/sample_diarization_japanese.mp3"""
 from pprint import pprint
 from transformers.pipelines import pipeline
 
-test_audio = "/Users/asahiu/Desktop/speaker_diariazation_sample_1.wav"
-pipe = pipeline(model="kotoba-tech/kotoba-whisper-v2.2", chunk_length_s=15, batch_size=16, trust_remote_code=True)
-output = pipe(test_audio)
+pipe = pipeline(model="kotoba-tech/kotoba-whisper-v2.2", chunk_length_s=None, batch_size=16, trust_remote_code=True)
+output = pipe("sample_diarization_japanese.mp3")
+pprint(output)
+
+pipe = pipeline(model="kotoba-tech/kotoba-whisper-v2.2", chunk_length_s=None, batch_size=16, trust_remote_code=True, return_unique_speaker=False)
+output = pipe("sample_diarization_japanese.mp3")
 pprint(output)
pipeline/test_speaker_diarization.py CHANGED
@@ -3,46 +3,42 @@
 # Requirement: Submit access request for the following models.
 # https://huggingface.co/pyannote/speaker-diarization-3.1
 # https://huggingface.co/pyannote/segmentation-3.0
+# wget https://huggingface.co/kotoba-tech/kotoba-whisper-v2.2/resolve/main/sample_audio/sample_diarization_japanese.mp3
 import soundfile as sf
 import numpy as np
-from typing import Union, Optional, Dict, List
+from typing import Union, Dict, List
 
 import torch
 from pyannote.audio import Pipeline
+from diarizers import SegmentationModel
 
 
 class SpeakerDiarization:
 
-    def __init__(self, model_id: str):
-        self.pipeline = Pipeline.from_pretrained(model_id)
+    def __init__(self):
+        self.pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1")
+        self.pipeline._segmentation.model = SegmentationModel().from_pretrained(
+            'diarizers-community/speaker-segmentation-fine-tuned-callhome-jpn'
+        ).to_pyannote_model()
 
     def __call__(self,
-                 audio: Union[str, torch.Tensor, np.ndarray],
-                 sampling_rate: Optional[int] = None) -> Dict[str, List[List[float]]]:
-        if type(audio) is torch.Tensor or type(audio) is np.ndarray:
-            if sampling_rate is None:
-                raise ValueError("sampling_rate must be provided")
-            if type(audio) is np.ndarray:
-                audio = torch.as_tensor(audio)
-            audio = torch.as_tensor(audio, dtype=torch.float32)
-            if len(audio.shape) == 1:
-                audio = audio.unsqueeze(0)
-            elif len(audio.shape) > 3:
-                raise ValueError("audio shape must be (channel, time)")
-            audio = {"waveform": audio, "sample_rate": sampling_rate}
+                 audio: Union[torch.Tensor, np.ndarray],
+                 sampling_rate: int) -> Dict[str, List[List[float]]]:
+        if sampling_rate is None:
+            raise ValueError("sampling_rate must be provided")
+        if type(audio) is np.ndarray:
+            audio = torch.as_tensor(audio)
+        audio = torch.as_tensor(audio, dtype=torch.float32)
+        if len(audio.shape) == 1:
+            audio = audio.unsqueeze(0)
+        elif len(audio.shape) > 3:
+            raise ValueError("audio shape must be (channel, time)")
+        audio = {"waveform": audio, "sample_rate": sampling_rate}
         output = self.pipeline(audio)
-        # dictionary: {speaker_id: [[start, end],...]}
         return {s: [[i.start, i.end] for i in output.label_timeline(s)] for s in output.labels()}
 
 
-pipeline = SpeakerDiarization("pyannote/speaker-diarization-3.1")
-root_dir = "/Users/asahiu/Desktop"
-sample_audio_files = ["speaker_diariazation_sample_1.wav", "speaker_diariazation_sample_2.wav"]
-for sample_audio_file in sample_audio_files:
-    print(sample_audio_file)
-    a, sr = sf.read(f"{root_dir}/{sample_audio_file}")
-    output = pipeline(a, sampling_rate=sr)
-    print(output)
-    output = pipeline(f"{root_dir}/{sample_audio_file}")
-    print(output)
-    print()
+pipeline = SpeakerDiarization()
+a, sr = sf.read("sample_diarization_japanese.mp3")
+print(pipeline(a.T, sampling_rate=sr))
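
A note on the rewritten test driver: soundfile.read returns audio as (frames, channels) for multi-channel files, while pyannote's Pipeline expects a (channel, time) waveform, which is why the new code passes a.T. A minimal sketch of that shape handling (assuming the same sample file; the mono branch is just the degenerate case the class already handles via unsqueeze):

```python
import soundfile as sf

# sf.read yields shape (frames,) for mono and (frames, channels) otherwise;
# pyannote wants (channel, time), so transpose only the multi-channel case.
a, sr = sf.read("sample_diarization_japanese.mp3")
waveform = a if a.ndim == 1 else a.T
print(waveform.shape, sr)
```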