cahya
/

wav2vec2-large-xlsr-indonesian-mix

@@ -23,7 +23,7 @@ model-index:
     metrics:
        - name: Test WER
          type: wer
-         value: 22.26
 ---
 # Wav2Vec2-Large-XLSR-Indonesian
@@ -47,12 +47,12 @@ test_dataset = load_dataset("common_voice", "id", split="test[:2%]")
 processor = Wav2Vec2Processor.from_pretrained("cahya/wav2vec2-large-xlsr-indonesian")
 model = Wav2Vec2ForCTC.from_pretrained("cahya/wav2vec2-large-xlsr-indonesian")
-resampler = torchaudio.transforms.Resample(48_000, 16_000)
 # Preprocessing the datasets.
 # We need to read the audio files as arrays
 def speech_file_to_array_fn(batch):
     speech_array, sampling_rate = torchaudio.load(batch["path"])
     batch["speech"] = resampler(speech_array).squeeze().numpy()
     return batch
@@ -89,13 +89,13 @@ model.to("cuda")
 chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\'\”\�]'
-resampler = torchaudio.transforms.Resample(48_000, 16_000)
 # Preprocessing the datasets.
 # We need to read the audio files as arrays
 def speech_file_to_array_fn(batch):
     batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
     speech_array, sampling_rate = torchaudio.load(batch["path"])
     batch["speech"] = resampler(speech_array).squeeze().numpy()
     return batch
@@ -118,7 +118,7 @@ result = test_dataset.map(evaluate, batched=True, batch_size=8)
 print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
 ```
-**Test Result**: 22.26 %
 ## Training

     metrics:
        - name: Test WER
          type: wer
+         value: 19.37
 ---
 # Wav2Vec2-Large-XLSR-Indonesian
 processor = Wav2Vec2Processor.from_pretrained("cahya/wav2vec2-large-xlsr-indonesian")
 model = Wav2Vec2ForCTC.from_pretrained("cahya/wav2vec2-large-xlsr-indonesian")
 # Preprocessing the datasets.
 # We need to read the audio files as arrays
 def speech_file_to_array_fn(batch):
     speech_array, sampling_rate = torchaudio.load(batch["path"])
+    resampler = torchaudio.transforms.Resample(sampling_rate, 16_000)
     batch["speech"] = resampler(speech_array).squeeze().numpy()
     return batch
 chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\'\”\�]'
 # Preprocessing the datasets.
 # We need to read the audio files as arrays
 def speech_file_to_array_fn(batch):
     batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
     speech_array, sampling_rate = torchaudio.load(batch["path"])
+    resampler = torchaudio.transforms.Resample(sampling_rate, 16_000)
     batch["speech"] = resampler(speech_array).squeeze().numpy()
     return batch
 print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
 ```
+**Test Result**: 19.37 %
 ## Training