Wav2vec2-CTC-based French Phonemizer
Usage
Infer audio
import soundfile as sf
import torch
from transformers import AutoModelForCTC, AutoProcessor, pipeline
# Use the first GPU in half precision when CUDA is available; otherwise CPU in float32
device, torch_dtype = (
    ("cuda:0", torch.float16) if torch.cuda.is_available() else ("cpu", torch.float32)
)

# Load the processor and the CTC model from the same checkpoint, then move the model to the device
model_name_or_path = "bofenghuang/phonemizer-wav2vec2-ctc-french"
processor = AutoProcessor.from_pretrained(model_name_or_path)
model = AutoModelForCTC.from_pretrained(model_name_or_path, torch_dtype=torch_dtype).to(device)

# Sampling rate the processor's feature extractor is configured for
model_sample_rate = processor.feature_extractor.sampling_rate
# Build a speech-recognition pipeline around the already-loaded model and processor parts
pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    device=device,
    torch_dtype=torch_dtype,
)

# Placeholder path: point this at a real audio file
audio_file_path = "/path/to/example/wav/file"

# Run the pipeline on the file and print the "text" field of its output
result = pipe(audio_file_path)
print(result["text"])
# Infer with the lower-level API (processor + model called directly, no pipeline)
waveform, sample_rate = sf.read(audio_file_path, start=0, frames=-1, dtype="float32", always_2d=False)

# With always_2d=False a multi-channel file comes back as (frames, channels);
# average the channels so the processor receives a single 1-D mono signal
if waveform.ndim > 1:
    waveform = waveform.mean(axis=1)

# Nothing in this snippet resamples, so refuse audio whose rate differs from the
# model's rather than silently decoding it at the wrong speed
if sample_rate != model_sample_rate:
    raise ValueError(
        f"Audio is sampled at {sample_rate} Hz but the model expects {model_sample_rate} Hz; "
        "resample the audio before running inference."
    )

# Pass the file's real sampling rate so the processor can validate it as well
input_dict = processor(waveform, sampling_rate=sample_rate, return_tensors="pt")

with torch.inference_mode():
    # Match the model's device and dtype (float16 on GPU, float32 on CPU)
    input_values = input_dict.input_values.to(device, dtype=torch_dtype)
    logits = model(input_values).logits
    # Pick the highest-scoring token id at every output frame
    predicted_ids = torch.argmax(logits, dim=-1)

# Decode the id sequences to text; the batch holds a single item
predicted_text = processor.batch_decode(predicted_ids)[0]
print(predicted_text)
Phonemes were generated using the following code snippet:
# !pip install phonemizer
from phonemizer.backend import EspeakBackend
from phonemizer.separator import Separator
# Initialize the espeak backend for French
backend = EspeakBackend("fr-fr", language_switch="remove-flags")

# Separate words with a single space; put no separator between phones or syllables
separator = Separator(phone=None, word=" ", syllable="")


def phonemize_text_phonemizer(s: str) -> str:
    """Return the phonemization of *s* produced by the module-level espeak backend.

    The text is passed as a one-item list and the single result is unwrapped,
    using the module-level ``separator`` with ``strip=True`` and one job.
    """
    return backend.phonemize([s], separator=separator, strip=True, njobs=1)[0]


input_str = "ce modèle est utilisé pour identifier les phonèmes dans l'audio entrant"
print(phonemize_text_phonemizer(input_str))
# 'sə modɛl ɛt ytilize puʁ idɑ̃tifje le fonɛm dɑ̃ lodjo ɑ̃tʁɑ̃'
Acknowledgement
Inspired by Cnam-LMSSC/wav2vec2-french-phonemizer
- Downloads last month
- 146
This model does not have enough activity to be deployed to Inference API (serverless) yet. Increase its social
visibility and check back later, or deploy to Inference Endpoints (dedicated)
instead.