---
license: cc-by-nc-4.0
language:
- en
---

### English ASR sequence-to-sequence model

This model transcribes English speech with support for output text normalization, timestamp labeling, and multi-speaker segmentation: transcripts are split per speaker turn and delimited with start/end timestamps, e.g. `<|0.06| ... |5.86|>`.

```python
# !pip install transformers sentencepiece

import torch
import torchaudio
from transformers import AutoFeatureExtractor, AutoTokenizer, GenerationConfig, SpeechEncoderDecoderModel

model_path = 'nguyenvulebinh/wavlm-bart'  # English multi-talker ASR checkpoint
model = SpeechEncoderDecoderModel.from_pretrained(model_path).eval()
feature_extractor = AutoFeatureExtractor.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
if torch.cuda.is_available():
    model = model.cuda()


def decode_tokens(token_ids, skip_special_tokens=True, time_precision=0.02):
    # Timestamp tokens occupy the ids above the text vocabulary; each unit is
    # 20 ms, so id (vocab_size + 293) decodes to the timestamp 5.86 s.
    timestamp_begin = tokenizer.vocab_size
    outputs = [[]]
    for token in token_ids:
        if token >= timestamp_begin:
            # Emit the timestamp string and start collecting the next text segment.
            timestamp = f" |{(token - timestamp_begin) * time_precision:.2f}| "
            outputs.append(timestamp)
            outputs.append([])
        else:
            outputs[-1].append(token)
    # Decode the buffered text segments; timestamp strings pass through as-is.
    outputs = [
        s if isinstance(s, str) else tokenizer.decode(s, skip_special_tokens=skip_special_tokens) for s in outputs
    ]
    # Glue the '<' / '>' segment markers in the decoded text onto the adjacent
    # timestamps, so segment boundaries render as '<|start|' and '|end|>'.
    return "".join(outputs).replace("< |", "<|").replace("| >", "|>")


def decode_wav(audio_wavs, asr_model, prefix=""):
    device = next(asr_model.parameters()).device
    # Pad the batch of raw 1-D waveforms and build the attention mask.
    input_values = feature_extractor.pad(
        [{"input_values": feature} for feature in audio_wavs],
        padding=True,
        max_length=None,
        pad_to_multiple_of=None,
        return_tensors="pt",
    )

    output_beam_ids = asr_model.generate(
        input_values['input_values'].to(device),
        attention_mask=input_values['attention_mask'].to(device),
        # Optional text prefix to force the start of the transcript;
        # [..., :-1] drops the trailing </s> from the encoded prefix.
        decoder_input_ids=tokenizer.batch_encode_plus([prefix] * len(audio_wavs), return_tensors="pt")['input_ids'][..., :-1].to(device),
        generation_config=GenerationConfig(decoder_start_token_id=tokenizer.bos_token_id),
        max_length=250,
        num_beams=25,
        no_repeat_ngram_size=4,
        num_return_sequences=1,
        early_stopping=True,
        return_dict_in_generate=True,
        output_scores=True,
    )

    output_text = [decode_tokens(sequence) for sequence in output_beam_ids.sequences]
    return output_text


# Example audio: https://huggingface.co/nguyenvulebinh/wavlm-bart/resolve/main/sample.wav
print(decode_wav([torchaudio.load('sample.wav')[0].squeeze()], model))

# Output (one timestamped segment per speaker turn):
# <|0.06| What are the many parts that make a machine learning system feel like it works so magically cheap? |5.86|>
# <|5.68| Explainability factors important, so they tend to gear towards more simpler models with less parameters, but easier to explain, and on the other spectrum there are |15.86|>
```
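
The sample file is already 16 kHz mono, which is what wav2vec2/WavLM-style feature extractors typically expect (check `feature_extractor.sampling_rate`). For arbitrary audio files, loading could look like the sketch below; `load_wav_16k` is a hypothetical helper for illustration, not part of this repository:

```python
import torchaudio

def load_wav_16k(path, target_sr=16_000):
    # Hypothetical helper: load an audio file and convert it to the mono,
    # 16 kHz, 1-D waveform that decode_wav above expects.
    wav, sr = torchaudio.load(path)  # wav: [channels, frames]
    if wav.size(0) > 1:
        wav = wav.mean(dim=0, keepdim=True)  # downmix to mono
    if sr != target_sr:
        wav = torchaudio.functional.resample(wav, orig_freq=sr, new_freq=target_sr)
    return wav.squeeze(0)

# decode_wav takes a list, so several files can be transcribed in one padded batch:
# print(decode_wav([load_wav_16k('a.wav'), load_wav_16k('b.wav')], model))
```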

### Citation

This model builds on the idea in the following paper. Please cite the paper if the model is used to help produce published results or is incorporated into other software.

```text
@INPROCEEDINGS{10446589,
  author={Nguyen, Thai-Binh and Waibel, Alexander},
  booktitle={ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  title={Synthetic Conversations Improve Multi-Talker ASR},
  year={2024},
  pages={10461-10465},
  keywords={Systematics;Error analysis;Knowledge based systems;Oral communication;Signal processing;Data models;Acoustics;multi-talker;asr;synthetic conversation},
  doi={10.1109/ICASSP48485.2024.10446589}
}
```

### Contact

nguyenvulebinh@gmail.com

[![Follow](https://img.shields.io/twitter/follow/nguyenvulebinh?style=social)](https://twitter.com/intent/follow?screen_name=nguyenvulebinh)