init
- README.md +4 -3
- pipeline/kotoba_whisper.py +41 -26
- pipeline/push_pipeline.py +4 -6
- pipeline/test_pipeline.py +7 -3
- pipeline/test_speaker_diarization.py +23 -27
README.md
CHANGED
@@ -9,7 +9,7 @@ tags:
 widget:
 - example_title: Sample 1
   src: >-
-    https://huggingface.co/
+    https://huggingface.co/kotoba-tech/kotoba-whisper-v2.2/resolve/main/sample_audio/sample_diarization_japanese.mp3
 pipeline_tag: automatic-speech-recognition
 ---
 
@@ -58,8 +58,9 @@ install the latest version of Transformers.
 ```bash
 pip install --upgrade pip
 pip install --upgrade transformers accelerate torchaudio
-pip install
-pip install
+pip install "punctuators==0.0.5"
+pip install "pyannote.audio"
+pip install git+https://github.com/huggingface/diarizers.git
 ```
 
 ### Transcription
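For context, a minimal usage sketch of the pipeline that the Transcription section goes on to document. It mirrors pipeline/test_pipeline.py further down in this commit, so the sample file name and keyword arguments are taken from that script rather than from the README text:

```python
# Minimal sketch: load the custom pipeline from the Hub and run it on the sample clip.
# Assumes the packages installed above and the sample audio downloaded with the wget
# command quoted in the scripts below.
from pprint import pprint
from transformers import pipeline

pipe = pipeline(model="kotoba-tech/kotoba-whisper-v2.2", batch_size=16, trust_remote_code=True)
output = pipe("sample_diarization_japanese.mp3")
pprint(output)
```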
pipeline/kotoba_whisper.py
CHANGED
@@ -13,6 +13,7 @@ from transformers.feature_extraction_sequence_utils import SequenceFeatureExtrac
 from pyannote.audio import Pipeline
 from pyannote.core.annotation import Annotation
 from punctuators.models import PunctCapSegModelONNX
+from diarizers import SegmentationModel
 
 
 class Punctuator:
 
@@ -45,25 +46,29 @@ class Punctuator:
 
 class SpeakerDiarization:
 
-    def __init__(self,
+    def __init__(self,
+                 device: torch.device,
+                 model_id: str = "pyannote/speaker-diarization-3.1",
+                 model_id_diarizers: Optional[str] = None):
         self.device = device
         self.pipeline = Pipeline.from_pretrained(model_id)
         self.pipeline = self.pipeline.to(self.device)
-        audio = torch.as_tensor(audio
+        if model_id_diarizers:
+            self.pipeline._segmentation.model = SegmentationModel().from_pretrained(
+                model_id_diarizers
+            ).to_pyannote_model().to(self.device)
+
+    def __call__(self, audio: Union[torch.Tensor, np.ndarray], sampling_rate: int) -> Annotation:
+        if sampling_rate is None:
+            raise ValueError("sampling_rate must be provided")
+        if type(audio) is np.ndarray:
+            audio = torch.as_tensor(audio)
+        audio = torch.as_tensor(audio, dtype=torch.float32)
+        if len(audio.shape) == 1:
+            audio = audio.unsqueeze(0)
+        elif len(audio.shape) > 3:
+            raise ValueError("audio shape must be (channel, time)")
+        audio = {"waveform": audio.to(self.device), "sample_rate": sampling_rate}
         output = self.pipeline(audio)
         return output
 
@@ -72,23 +77,28 @@ class KotobaWhisperPipeline(AutomaticSpeechRecognitionPipeline):
 
     def __init__(self,
                  model: "PreTrainedModel",
+                 model_pyannote: str = "pyannote/speaker-diarization-3.1",
+                 model_diarizers: Optional[str] = "diarizers-community/speaker-segmentation-fine-tuned-callhome-jpn",
                  feature_extractor: Union["SequenceFeatureExtractor", str] = None,
                  tokenizer: Optional[PreTrainedTokenizer] = None,
                  device: Union[int, "torch.device"] = None,
+                 device_pyannote: Union[int, "torch.device"] = None,
                  torch_dtype: Optional[Union[str, "torch.dtype"]] = None,
-                 return_unique_speaker: bool =
+                 return_unique_speaker: bool = True,
                  punctuator: bool = False,
                  **kwargs):
         self.type = "seq2seq_whisper"
         if device is None:
             device = "cpu"
-        self.model_speaker_diarization = SpeakerDiarization(
+        if device_pyannote is None:
+            device_pyannote = device
+        if type(device_pyannote) is str:
+            device_pyannote = torch.device(device_pyannote)
+        self.model_speaker_diarization = SpeakerDiarization(
+            device=device_pyannote,
+            model_id=model_pyannote,
+            model_id_diarizers=model_diarizers
+        )
         self.return_unique_speaker = return_unique_speaker
         if punctuator:
             self.punctuator = Punctuator()
 
@@ -308,8 +318,13 @@ class KotobaWhisperPipeline(AutomaticSpeechRecognitionPipeline):
         outputs["text"] = "".join([c["text"] for c in outputs["chunks"]])
         outputs["speakers"] = sd.labels()
         outputs.pop("audio_array")
+        speakers = []
         for s in outputs["speakers"]:
+            chunk_s = [c for c in outputs["chunks"] if s in c["speaker"]]
+            if len(chunk_s) != 0:
+                outputs[f"chunks/{s}"] = chunk_s
+                outputs[f"text/{s}"] = "".join([c["text"] for c in outputs["chunks"] if s in c["speaker"]])
+                speakers.append(s)
+        outputs["speakers"] = speakers
         return outputs
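The last hunk above groups the transcription chunks by speaker and drops speakers that end up with no chunk. Below is a self-contained toy illustration of that grouping; the chunk texts and speaker labels are invented for demonstration and are not part of the diff:

```python
# Toy data standing in for the pipeline's intermediate output.
outputs = {
    "chunks": [
        {"text": "こんにちは。", "timestamp": (0.0, 1.2), "speaker": ["SPEAKER_00"]},
        {"text": "はい、どうも。", "timestamp": (1.2, 2.5), "speaker": ["SPEAKER_01"]},
    ],
    "speakers": ["SPEAKER_00", "SPEAKER_01", "SPEAKER_02"],  # SPEAKER_02 owns no chunk
}

# Same grouping as the new postprocess block: keep only speakers with at least one chunk
# and expose per-speaker "chunks/<speaker>" and "text/<speaker>" keys.
speakers = []
for s in outputs["speakers"]:
    chunk_s = [c for c in outputs["chunks"] if s in c["speaker"]]
    if len(chunk_s) != 0:
        outputs[f"chunks/{s}"] = chunk_s
        outputs[f"text/{s}"] = "".join([c["text"] for c in chunk_s])
        speakers.append(s)
outputs["speakers"] = speakers

print(outputs["text/SPEAKER_00"])  # こんにちは。
print(outputs["speakers"])         # ['SPEAKER_00', 'SPEAKER_01']; SPEAKER_02 is dropped
```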
pipeline/push_pipeline.py
CHANGED
@@ -1,9 +1,11 @@
+"""wget https://huggingface.co/kotoba-tech/kotoba-whisper-v2.2/resolve/main/sample_audio/sample_diarization_japanese.mp3"""
 from pprint import pprint
 from kotoba_whisper import KotobaWhisperPipeline
 from transformers.pipelines import PIPELINE_REGISTRY, pipeline
 from transformers import WhisperForConditionalGeneration, TFWhisperForConditionalGeneration
 
 
+test_audio = "sample_diarization_japanese.mp3"
 model_alias = "kotoba-tech/kotoba-whisper-v2.2"
 PIPELINE_REGISTRY.register_pipeline(
     "kotoba-whisper",
 
@@ -11,13 +13,9 @@ PIPELINE_REGISTRY.register_pipeline(
     pt_model=WhisperForConditionalGeneration,
     tf_model=TFWhisperForConditionalGeneration
 )
-test_audio = "/Users/asahiu/Desktop/speaker_diariazation_sample_1.wav"
-pipe = pipeline(task="kotoba-whisper", model="kotoba-tech/kotoba-whisper-v2.0", chunk_length_s=15, batch_size=16, return_unique_speaker=True)
-output = pipe(test_audio)
-pprint(output)
 pipe = pipeline(task="kotoba-whisper", model="kotoba-tech/kotoba-whisper-v2.0", chunk_length_s=15, batch_size=16)
-output = pipe(test_audio)
-pprint(output)
+# output = pipe(test_audio)
+# pprint(output)
 pipe.push_to_hub(model_alias)
pipeline/test_pipeline.py
CHANGED
@@ -1,7 +1,11 @@
+"""wget https://huggingface.co/kotoba-tech/kotoba-whisper-v2.2/resolve/main/sample_audio/sample_diarization_japanese.mp3"""
 from pprint import pprint
 from transformers.pipelines import pipeline
 
+pipe = pipeline(model="kotoba-tech/kotoba-whisper-v2.2", chunk_length_s=None, batch_size=16, trust_remote_code=True)
+output = pipe("sample_diarization_japanese.mp3")
+pprint(output)
+
+pipe = pipeline(model="kotoba-tech/kotoba-whisper-v2.2", chunk_length_s=None, batch_size=16, trust_remote_code=True, return_unique_speaker=False)
+output = pipe("sample_diarization_japanese.mp3")
 pprint(output)
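A possible follow-up to the script above for reading the result per speaker. The "chunks/<speaker>" and "text/<speaker>" keys come from the postprocess code in pipeline/kotoba_whisper.py, while the per-chunk fields ("text", "timestamp") are assumed to follow the usual Whisper pipeline layout, so treat this as a sketch:

```python
# Sketch: walk the per-speaker keys produced by the pipeline's postprocess step.
from transformers import pipeline

pipe = pipeline(model="kotoba-tech/kotoba-whisper-v2.2", batch_size=16, trust_remote_code=True)
output = pipe("sample_diarization_japanese.mp3")
for speaker in output["speakers"]:
    print(f"--- {speaker} ---")
    print(output[f"text/{speaker}"])            # concatenated text for this speaker
    for chunk in output[f"chunks/{speaker}"]:
        print(chunk["timestamp"], chunk["text"])
```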
pipeline/test_speaker_diarization.py
CHANGED
@@ -3,46 +3,42 @@
 # Requirement: Submit access request for the following models.
 # https://huggingface.co/pyannote/speaker-diarization-3.1
 # https://huggingface.co/pyannote/segmentation-3.0
+# wget https://huggingface.co/kotoba-tech/kotoba-whisper-v2.2/resolve/main/sample_audio/sample_diarization_japanese.mp3
 import soundfile as sf
 import numpy as np
-from typing import Union,
+from typing import Union, Dict, List
 
 import torch
 from pyannote.audio import Pipeline
+from diarizers import SegmentationModel
 
 
 class SpeakerDiarization:
 
-    def __init__(self
-        self.pipeline = Pipeline.from_pretrained(
+    def __init__(self):
+        self.pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1")
+        self.pipeline._segmentation.model = SegmentationModel().from_pretrained(
+            'diarizers-community/speaker-segmentation-fine-tuned-callhome-jpn'
+        ).to_pyannote_model()
 
     def __call__(self,
-                 audio: Union[
-                 sampling_rate:
-        audio = {"waveform": audio, "sample_rate": sampling_rate}
+                 audio: Union[torch.Tensor, np.ndarray],
+                 sampling_rate: int) -> Dict[str, List[List[float]]]:
+        if sampling_rate is None:
+            raise ValueError("sampling_rate must be provided")
+        if type(audio) is np.ndarray:
+            audio = torch.as_tensor(audio)
+        audio = torch.as_tensor(audio, dtype=torch.float32)
+        if len(audio.shape) == 1:
+            audio = audio.unsqueeze(0)
+        elif len(audio.shape) > 3:
+            raise ValueError("audio shape must be (channel, time)")
+        audio = {"waveform": audio, "sample_rate": sampling_rate}
         output = self.pipeline(audio)
-        # dictionary: {speaker_id: [[start, end],...]}
         return {s: [[i.start, i.end] for i in output.label_timeline(s)] for s in output.labels()}
 
 
-pipeline = SpeakerDiarization(
-print(sample_audio_file)
-a, sr = sf.read(f"{root_dir}/{sample_audio_file}")
-output = pipeline(a, sampling_rate=sr)
-print(output)
-output = pipeline(f"{root_dir}/{sample_audio_file}")
-print(output)
-print()
+pipeline = SpeakerDiarization()
+a, sr = sf.read("sample_diarization_japanese.mp3")
+print(pipeline(a.T, sampling_rate=sr))
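The script prints the raw mapping returned by __call__. A small, purely illustrative follow-up for turning a result of that shape into readable lines; the segment values below are made up:

```python
# A diarization result of the shape returned above: {speaker: [[start, end], ...]}.
segments = {"SPEAKER_00": [[0.0, 1.2], [3.4, 5.0]], "SPEAKER_01": [[1.2, 3.4]]}  # toy values

for speaker, spans in segments.items():
    for start, end in spans:
        print(f"{speaker}: {start:.2f}s to {end:.2f}s")
```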