File size: 3,633 Bytes
b2f67be 0898318 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 |
---
license: cc-by-nc-4.0
language:
- en
---
### English ASR sequence-to-sequence model. This model supports output normalizing text, labeling timestamps, and segmenting multiple speakers.
```python
# !pip install transformers sentencepiece
from transformers import SpeechEncoderDecoderModel
from transformers import AutoFeatureExtractor, AutoTokenizer, GenerationConfig
import torchaudio
import torch
model_path = 'nguyenvulebinh/wav2vec2-bartpho'
model = SpeechEncoderDecoderModel.from_pretrained(model_path).eval()
feature_extractor = AutoFeatureExtractor.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
if torch.cuda.is_available():
model = model.cuda()
def decode_tokens(token_ids, skip_special_tokens=True, time_precision=0.02):
timestamp_begin = tokenizer.vocab_size
outputs = [[]]
for token in token_ids:
if token >= timestamp_begin:
timestamp = f" |{(token - timestamp_begin) * time_precision:.2f}| "
outputs.append(timestamp)
outputs.append([])
else:
outputs[-1].append(token)
outputs = [
s if isinstance(s, str) else tokenizer.decode(s, skip_special_tokens=skip_special_tokens) for s in outputs
]
return "".join(outputs).replace("< |", "<|").replace("| >", "|>")
def decode_wav(audio_wavs, asr_model, prefix=""):
device = next(asr_model.parameters()).device
input_values = feature_extractor.pad(
[{"input_values": feature} for feature in audio_wavs],
padding=True,
max_length=None,
pad_to_multiple_of=None,
return_tensors="pt",
)
output_beam_ids = asr_model.generate(
input_values['input_values'].to(device),
attention_mask=input_values['attention_mask'].to(device),
decoder_input_ids=tokenizer.batch_encode_plus([prefix] * len(audio_wavs), return_tensors="pt")['input_ids'][..., :-1].to(device),
generation_config=GenerationConfig(decoder_start_token_id=tokenizer.bos_token_id),
max_length=250,
num_beams=25,
no_repeat_ngram_size=4,
num_return_sequences=1,
early_stopping=True,
return_dict_in_generate=True,
output_scores=True,
)
output_text = [decode_tokens(sequence) for sequence in output_beam_ids.sequences]
return output_text
# https://huggingface.co/nguyenvulebinh/wavlm-bart/resolve/main/sample.wav
print(decode_wav([torchaudio.load('sample.wav')[0].squeeze()], model))
# <|0.06| What are the many parts that make a machine learning system feel like it works so magically cheap? |5.86|>
# <|5.68| Explletability factors important, so they tend to gear towards more simpler models with less parameters, but easier to explain, and on the other spectrum there are |15.86|>
```
### Citation
This repository uses the idea from the following paper. Please cite the paper if this model is used to help produce published results or is incorporated into other software.
```text
@INPROCEEDINGS{10446589,
author={Nguyen, Thai-Binh and Waibel, Alexander},
booktitle={ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
title={Synthetic Conversations Improve Multi-Talker ASR},
year={2024},
volume={},
number={},
pages={10461-10465},
keywords={Systematics;Error analysis;Knowledge based systems;Oral communication;Signal processing;Data models;Acoustics;multi-talker;asr;synthetic conversation},
doi={10.1109/ICASSP48485.2024.10446589}
}
```
### Contact
nguyenvulebinh@gmail.com
[![Follow](https://img.shields.io/twitter/follow/nguyenvulebinh?style=social)](https://twitter.com/intent/follow?screen_name=nguyenvulebinh) |