bofenghuang committed on
Commit
1cdb121
·
1 Parent(s): a3037d1
Files changed (1) hide show
  1. README.md +84 -0
README.md CHANGED
@@ -1,3 +1,87 @@
1
  ---
2
  license: mit
 
 
 
 
 
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  license: mit
3
+ language: fr
4
+ datasets:
5
+ - mozilla-foundation/common_voice_13_0
6
+ tags:
7
+ - automatic-speech-recognition
8
  ---
9
+
10
+ # Wav2vec2-CTC-based French Phonemizer
11
+
12
+ ## Usage
13
+
14
+ *Run inference on an audio file*
15
+
16
+ ```python
17
+ import soundfile as sf
18
+ import torch
19
+ from transformers import AutoModelForCTC, AutoProcessor, pipeline
20
+
21
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
22
+ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
23
+
24
+ # Load model
25
+ model_name_or_path = "bofenghuang/phonemizer-wav2vec2-ctc-french"
26
+ processor = AutoProcessor.from_pretrained(model_name_or_path)
27
+ model_sample_rate = processor.feature_extractor.sampling_rate
28
+ model = AutoModelForCTC.from_pretrained(model_name_or_path, torch_dtype=torch_dtype)
29
+ model.to(device)
30
+
31
+ # Init pipeline
32
+ pipe = pipeline(
33
+ "automatic-speech-recognition",
34
+ model=model,
35
+ feature_extractor=processor.feature_extractor,
36
+ tokenizer=processor.tokenizer,
37
+ torch_dtype=torch_dtype,
38
+ device=device,
39
+ )
40
+
41
+ # Example audio
42
+ audio_file_path = "/path/to/example/wav/file"
43
+
44
+ # Infer with pipeline
45
+ result = pipe(audio_file_path)
46
+ print(result["text"])
47
+
48
+ # Infer with the lower-level API (assumes the audio file is already sampled at model_sample_rate; no resampling is done here)
49
+ waveform, sample_rate = sf.read(audio_file_path, start=0, frames=-1, dtype="float32", always_2d=False)
50
+
51
+ input_dict = processor(waveform, sampling_rate=model_sample_rate, return_tensors="pt")
52
+
53
+ with torch.inference_mode():
54
+ input_values = input_dict.input_values.to(device, dtype=torch_dtype)
55
+ logits = model(input_values).logits
56
+
57
+ predicted_ids = torch.argmax(logits, dim=-1)
58
+ predicted_text = processor.batch_decode(predicted_ids)[0]
59
+ print(predicted_text)
60
+ ```
61
+
62
+
63
+
64
+ *Phonemes were generated using the following code snippet:*
65
+
66
+ ```python
67
+ # !pip install phonemizer
68
+ from phonemizer.backend import EspeakBackend
69
+ from phonemizer.separator import Separator
70
+
71
+ # initialize the espeak backend for French
72
+ backend = EspeakBackend("fr-fr", language_switch="remove-flags")
73
+
74
+ # separate words by a space, with no separator between phones or syllables
75
+ separator = Separator(phone=None, word=" ", syllable="")
76
+
77
+ def phonemize_text_phonemizer(s):
78
+ return backend.phonemize([s], separator=separator, strip=True, njobs=1)[0]
79
+
80
+ input_str = "ce modèle est utilisé pour identifier les phonèmes dans l'audio entrant"
81
+ print(phonemize_text_phonemizer(input_str))
82
+ # 'sə modɛl ɛt ytilize puʁ idɑ̃tifje le fonɛm dɑ̃ lodjo ɑ̃tʁɑ̃'
83
+ ```
84
+
85
+ ## Acknowledgement
86
+
87
+ Inspired by [Cnam-LMSSC/wav2vec2-french-phonemizer](https://huggingface.co/Cnam-LMSSC/wav2vec2-french-phonemizer)