bofenghuang committed
Commit
9ae2cb9
1 Parent(s): d2dc4e5
Files changed (31)
  1. README.md +166 -0
  2. added_tokens.json +4 -0
  3. alphabet.json +1 -0
  4. config.json +107 -0
  5. eval.py +182 -0
  6. language_model/5gram.bin +3 -0
  7. language_model/attrs.json +1 -0
  8. language_model/unigrams.txt +0 -0
  9. preprocessor_config.json +10 -0
  10. pytorch_model.bin +3 -0
  11. results_mozilla-foundatio_common_voice_9_0/log_mozilla-foundation_common_voice_9_0_fr_test_predictions.txt +0 -0
  12. results_mozilla-foundatio_common_voice_9_0/log_mozilla-foundation_common_voice_9_0_fr_test_targets.txt +0 -0
  13. results_mozilla-foundatio_common_voice_9_0/mozilla-foundation_common_voice_9_0_fr_test_eval_results.txt +2 -0
  14. results_mozilla-foundatio_common_voice_9_0_with_lm/log_mozilla-foundation_common_voice_9_0_fr_test_predictions.txt +0 -0
  15. results_mozilla-foundatio_common_voice_9_0_with_lm/log_mozilla-foundation_common_voice_9_0_fr_test_targets.txt +0 -0
  16. results_mozilla-foundatio_common_voice_9_0_with_lm/mozilla-foundation_common_voice_9_0_fr_test_eval_results.txt +2 -0
  17. results_polinaeterna_voxpopuli/log_polinaeterna_voxpopuli_fr_test_predictions.txt +0 -0
  18. results_polinaeterna_voxpopuli/log_polinaeterna_voxpopuli_fr_test_targets.txt +0 -0
  19. results_polinaeterna_voxpopuli/polinaeterna_voxpopuli_fr_test_eval_results.txt +2 -0
  20. results_polinaeterna_voxpopuli_with_lm/log_polinaeterna_voxpopuli_fr_test_predictions.txt +0 -0
  21. results_polinaeterna_voxpopuli_with_lm/log_polinaeterna_voxpopuli_fr_test_targets.txt +0 -0
  22. results_polinaeterna_voxpopuli_with_lm/polinaeterna_voxpopuli_fr_test_eval_results.txt +2 -0
  23. results_speech-recognition-community-v2_dev_data/log_speech-recognition-community-v2_dev_data_fr_validation_predictions.txt +0 -0
  24. results_speech-recognition-community-v2_dev_data/log_speech-recognition-community-v2_dev_data_fr_validation_targets.txt +0 -0
  25. results_speech-recognition-community-v2_dev_data/speech-recognition-community-v2_dev_data_fr_validation_eval_results.txt +2 -0
  26. results_speech-recognition-community-v2_dev_data_with_lm/log_speech-recognition-community-v2_dev_data_fr_validation_predictions.txt +0 -0
  27. results_speech-recognition-community-v2_dev_data_with_lm/log_speech-recognition-community-v2_dev_data_fr_validation_targets.txt +0 -0
  28. results_speech-recognition-community-v2_dev_data_with_lm/speech-recognition-community-v2_dev_data_fr_validation_eval_results.txt +2 -0
  29. special_tokens_map.json +36 -0
  30. tokenizer_config.json +13 -0
  31. vocab.json +46 -0
README.md ADDED
@@ -0,0 +1,166 @@
+ ---
+ language:
+ - fr
+ license: apache-2.0
+ tags:
+ - automatic-speech-recognition
+ - polinaeterna/voxpopuli
+ - generated_from_trainer
+ - hf-asr-leaderboard
+ - robust-speech-event
+ datasets:
+ - polinaeterna/voxpopuli
+ model-index:
+ - name: Fine-tuned Wav2Vec2 XLS-R 1B model for ASR in French
+   results:
+   - task:
+       name: Automatic Speech Recognition
+       type: automatic-speech-recognition
+     dataset:
+       name: Voxpopuli
+       type: polinaeterna/voxpopuli
+       args: fr
+     metrics:
+     - name: Test WER
+       type: wer
+       value: 11.70
+     - name: Test CER
+       type: cer
+       value: 5.80
+     - name: Test WER (+LM)
+       type: wer
+       value: 10.01
+     - name: Test CER (+LM)
+       type: cer
+       value: 5.63
+   - task:
+       name: Automatic Speech Recognition
+       type: automatic-speech-recognition
+     dataset:
+       name: Common Voice 9
+       type: mozilla-foundation/common_voice_9_0
+       args: fr
+     metrics:
+     - name: Test WER
+       type: wer
+       value: 45.74
+     - name: Test CER
+       type: cer
+       value: 22.99
+     - name: Test WER (+LM)
+       type: wer
+       value: 38.81
+     - name: Test CER (+LM)
+       type: cer
+       value: 23.25
+   - task:
+       name: Automatic Speech Recognition
+       type: automatic-speech-recognition
+     dataset:
+       name: Robust Speech Event - Dev Data
+       type: speech-recognition-community-v2/dev_data
+       args: fr
+     metrics:
+     - name: Test WER
+       type: wer
+       value: 27.86
+     - name: Test CER
+       type: cer
+       value: 13.20
+     - name: Test WER (+LM)
+       type: wer
+       value: 22.53
+     - name: Test CER (+LM)
+       type: cer
+       value: 12.82
+ ---
+ 
+ 
+ # Fine-tuned Wav2Vec2 XLS-R 1B model for ASR in French
+ 
+ This model is a fine-tuned version of [facebook/wav2vec2-xls-r-1b](https://huggingface.co/facebook/wav2vec2-xls-r-1b) on the [polinaeterna/voxpopuli](https://huggingface.co/datasets/polinaeterna/voxpopuli) `fr` dataset.
+ It achieves the following results on the evaluation set:
+ - Loss: 0.2906
+ - Wer: 0.1093
+ 
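+ ## Usage
+ 
+ A minimal inference sketch with the bundled 5-gram language model (this assumes `pyctcdecode` and `kenlm` are installed for LM decoding, and that you can stream one `fr` test utterance; any 16 kHz mono waveform works in its place):
+ 
+ ```python
+ import torch
+ from datasets import Audio, load_dataset
+ from transformers import AutoModelForCTC, Wav2Vec2ProcessorWithLM
+ 
+ model_id = "bhuang/wav2vec2-xls-r-1b-voxpopuli-fr"
+ processor = Wav2Vec2ProcessorWithLM.from_pretrained(model_id)
+ model = AutoModelForCTC.from_pretrained(model_id)
+ 
+ # take one test utterance; any 16 kHz mono waveform works
+ dataset = load_dataset("polinaeterna/voxpopuli", "fr", split="test", streaming=True)
+ dataset = dataset.cast_column("audio", Audio(sampling_rate=16_000))
+ sample = next(iter(dataset))["audio"]
+ 
+ inputs = processor(sample["array"], sampling_rate=16_000, return_tensors="pt")
+ with torch.no_grad():
+     logits = model(**inputs).logits
+ 
+ # beam-search decoding through the 5-gram LM shipped in language_model/
+ transcription = processor.batch_decode(logits.numpy()).text[0]
+ print(transcription)
+ ```
+ 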
+ ## Training procedure
+ 
+ ### Training hyperparameters
+ 
+ The following hyperparameters were used during training:
+ - learning_rate: 0.0001
+ - train_batch_size: 16
+ - eval_batch_size: 8
+ - seed: 42
+ - gradient_accumulation_steps: 8
+ - total_train_batch_size: 128
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+ - lr_scheduler_type: linear
+ - lr_scheduler_warmup_ratio: 0.1
+ - num_epochs: 12.0
+ - mixed_precision_training: Native AMP
+ 
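+ Expressed as 🤗 `TrainingArguments`, the list above corresponds roughly to the following (a hypothetical reconstruction for illustration; the actual training script is not part of this commit):
+ 
+ ```python
+ from transformers import TrainingArguments
+ 
+ # hypothetical mapping of the hyperparameters listed above
+ training_args = TrainingArguments(
+     output_dir="outputs/voxpopuli/wav2vec2-xls-r-1b-ft",  # path as in tokenizer_config.json
+     learning_rate=1e-4,
+     per_device_train_batch_size=16,
+     per_device_eval_batch_size=8,
+     seed=42,
+     gradient_accumulation_steps=8,  # 16 x 8 = total train batch size 128
+     lr_scheduler_type="linear",
+     warmup_ratio=0.1,
+     num_train_epochs=12.0,
+     fp16=True,  # "Native AMP" mixed precision
+ )
+ ```
+ 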
+ ### Training results
+ 
+ | Training Loss | Epoch | Step | Validation Loss | Wer    |
+ |:-------------:|:-----:|:----:|:---------------:|:------:|
+ | 0.4628        | 0.93  | 500  | 0.3834          | 0.1625 |
+ | 0.3577        | 1.85  | 1000 | 0.3231          | 0.1367 |
+ | 0.3103        | 2.78  | 1500 | 0.2918          | 0.1287 |
+ | 0.2884        | 3.7   | 2000 | 0.2845          | 0.1227 |
+ | 0.2615        | 4.63  | 2500 | 0.2819          | 0.1189 |
+ | 0.242         | 5.56  | 3000 | 0.2915          | 0.1165 |
+ | 0.2268        | 6.48  | 3500 | 0.2768          | 0.1187 |
+ | 0.2188        | 7.41  | 4000 | 0.2719          | 0.1128 |
+ | 0.1979        | 8.33  | 4500 | 0.2741          | 0.1134 |
+ | 0.1834        | 9.26  | 5000 | 0.2827          | 0.1096 |
+ | 0.1719        | 10.19 | 5500 | 0.2906          | 0.1093 |
+ | 0.1723        | 11.11 | 6000 | 0.2868          | 0.1104 |
+ 
+ ### Framework versions
+ 
+ - Transformers 4.23.0.dev0
+ - Pytorch 1.12.0+cu113
+ - Datasets 2.4.0
+ - Tokenizers 0.12.1
+ 
+ 
+ ## Evaluation
+ 
+ 1. To evaluate on `polinaeterna/voxpopuli`
+ 
+ ```bash
+ python eval.py \
+     --model_id "bhuang/wav2vec2-xls-r-1b-voxpopuli-fr" \
+     --dataset "polinaeterna/voxpopuli" \
+     --config "fr" \
+     --split "test" \
+     --log_outputs \
+     --outdir "outputs/results_polinaeterna_voxpopuli_with_lm"
+ ```
+ 
+ 2. To evaluate on `mozilla-foundation/common_voice_9_0`
+ 
+ ```bash
+ python eval.py \
+     --model_id "bhuang/wav2vec2-xls-r-1b-voxpopuli-fr" \
+     --dataset "mozilla-foundation/common_voice_9_0" \
+     --config "fr" \
+     --split "test" \
+     --log_outputs \
+     --outdir "outputs/results_mozilla-foundatio_common_voice_9_0_with_lm"
+ ```
+ 
+ 3. To evaluate on `speech-recognition-community-v2/dev_data`
+ 
+ ```bash
+ python eval.py \
+     --model_id "bhuang/wav2vec2-xls-r-1b-voxpopuli-fr" \
+     --dataset "speech-recognition-community-v2/dev_data" \
+     --config "fr" \
+     --split "validation" \
+     --chunk_length_s 5.0 \
+     --stride_length_s 1.0 \
+     --log_outputs \
+     --outdir "outputs/results_speech-recognition-community-v2_dev_data_with_lm"
+ ```
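+ 
+ To evaluate without the language model (plain greedy CTC decoding), add the script's `--greedy` flag, e.g.:
+ 
+ ```bash
+ python eval.py \
+     --model_id "bhuang/wav2vec2-xls-r-1b-voxpopuli-fr" \
+     --dataset "polinaeterna/voxpopuli" \
+     --config "fr" \
+     --split "test" \
+     --greedy \
+     --log_outputs \
+     --outdir "outputs/results_polinaeterna_voxpopuli"
+ ```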
added_tokens.json ADDED
@@ -0,0 +1,4 @@
+ {
+   "</s>": 45,
+   "<s>": 44
+ }
alphabet.json ADDED
@@ -0,0 +1 @@
+ {"labels": [" ", "'", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "\u00e0", "\u00e2", "\u00e7", "\u00e8", "\u00e9", "\u00ea", "\u00eb", "\u00ee", "\u00ef", "\u00f4", "\u00f9", "\u00fb", "\u00fc", "\u0153", "\u2047", "", "<s>", "</s>"], "is_bpe": false}
config.json ADDED
@@ -0,0 +1,107 @@
+ {
+   "_name_or_path": "facebook/wav2vec2-xls-r-1b",
+   "activation_dropout": 0.1,
+   "adapter_kernel_size": 3,
+   "adapter_stride": 2,
+   "add_adapter": false,
+   "apply_spec_augment": true,
+   "architectures": [
+     "Wav2Vec2ForCTC"
+   ],
+   "attention_dropout": 0.0,
+   "bos_token_id": 1,
+   "classifier_proj_size": 256,
+   "codevector_dim": 1024,
+   "contrastive_logits_temperature": 0.1,
+   "conv_bias": true,
+   "conv_dim": [
+     512,
+     512,
+     512,
+     512,
+     512,
+     512,
+     512
+   ],
+   "conv_kernel": [
+     10,
+     3,
+     3,
+     3,
+     3,
+     2,
+     2
+   ],
+   "conv_stride": [
+     5,
+     2,
+     2,
+     2,
+     2,
+     2,
+     2
+   ],
+   "ctc_loss_reduction": "mean",
+   "ctc_zero_infinity": true,
+   "diversity_loss_weight": 0.1,
+   "do_stable_layer_norm": true,
+   "eos_token_id": 2,
+   "feat_extract_activation": "gelu",
+   "feat_extract_dropout": 0.0,
+   "feat_extract_norm": "layer",
+   "feat_proj_dropout": 0.0,
+   "feat_quantizer_dropout": 0.0,
+   "final_dropout": 0.0,
+   "hidden_act": "gelu",
+   "hidden_dropout": 0.0,
+   "hidden_size": 1280,
+   "initializer_range": 0.02,
+   "intermediate_size": 5120,
+   "layer_norm_eps": 1e-05,
+   "layerdrop": 0.1,
+   "mask_feature_length": 10,
+   "mask_feature_min_masks": 0,
+   "mask_feature_prob": 0.0,
+   "mask_time_length": 10,
+   "mask_time_min_masks": 2,
+   "mask_time_prob": 0.05,
+   "model_type": "wav2vec2",
+   "num_adapter_layers": 3,
+   "num_attention_heads": 16,
+   "num_codevector_groups": 2,
+   "num_codevectors_per_group": 320,
+   "num_conv_pos_embedding_groups": 16,
+   "num_conv_pos_embeddings": 128,
+   "num_feat_extract_layers": 7,
+   "num_hidden_layers": 48,
+   "num_negatives": 100,
+   "output_hidden_size": 1280,
+   "pad_token_id": 43,
+   "proj_codevector_dim": 1024,
+   "tdnn_dilation": [
+     1,
+     2,
+     3,
+     1,
+     1
+   ],
+   "tdnn_dim": [
+     512,
+     512,
+     512,
+     512,
+     1500
+   ],
+   "tdnn_kernel": [
+     5,
+     3,
+     3,
+     1,
+     1
+   ],
+   "torch_dtype": "float32",
+   "transformers_version": "4.23.0.dev0",
+   "use_weighted_layer_sum": false,
+   "vocab_size": 46,
+   "xvector_output_dim": 512
+ }
eval.py ADDED
@@ -0,0 +1,182 @@
+ #!/usr/bin/env python
+ 
+ import argparse
+ import os
+ import re
+ 
+ import torch
+ from datasets import Audio, Dataset, load_dataset, load_metric
+ from transformers import (
+     AutoConfig,
+     AutoModelForCTC,
+     AutoTokenizer,
+     Wav2Vec2Processor,
+     Wav2Vec2ProcessorWithLM,
+     pipeline,
+ )
+ 
+ 
+ def log_results(result: Dataset, args: argparse.Namespace):
+     """DO NOT CHANGE. This function computes and logs the result metrics."""
+ 
+     log_outputs = args.log_outputs
+     dataset_id = "_".join(args.dataset.split("/") + [args.config, args.split])
+ 
+     # all result and log files are written under args.outdir
+     os.makedirs(args.outdir, exist_ok=True)
+ 
+     # load metrics
+     wer = load_metric("wer")
+     cer = load_metric("cer")
+ 
+     # compute metrics
+     wer_result = wer.compute(references=result["target"], predictions=result["prediction"])
+     cer_result = cer.compute(references=result["target"], predictions=result["prediction"])
+ 
+     # print & log results
+     result_str = f"WER: {wer_result}\nCER: {cer_result}"
+     print(result_str)
+ 
+     with open(os.path.join(args.outdir, f"{dataset_id}_eval_results.txt"), "w") as f:
+         f.write(result_str)
+ 
+     # log all results in text files, possibly interesting for analysis
+     if log_outputs:
+         pred_file = os.path.join(args.outdir, f"log_{dataset_id}_predictions.txt")
+         target_file = os.path.join(args.outdir, f"log_{dataset_id}_targets.txt")
+ 
+         with open(pred_file, "w") as p, open(target_file, "w") as t:
+ 
+             # mapping function to write output
+             def write_to_file(batch, i):
+                 p.write(f"{i}\n")
+                 p.write(batch["prediction"] + "\n")
+                 t.write(f"{i}\n")
+                 t.write(batch["target"] + "\n")
+ 
+             result.map(write_to_file, with_indices=True)
+ 
+ 
+ def normalize_text(text: str, invalid_chars_regex: str) -> str:
+     """DO ADAPT FOR YOUR USE CASE. This function normalizes the target text."""
+ 
+     text = text.lower()
+     text = re.sub(r"’", "'", text)
+     text = re.sub(invalid_chars_regex, " ", text)
+     text = re.sub(r"\s+", " ", text).strip()
+ 
+     return text
+ 
+ 
+ def main(args):
+     # load dataset
+     dataset = load_dataset(args.dataset, args.config, split=args.split, use_auth_token=True)
+ 
+     # the target text column differs between datasets (e.g. "sentence" for
+     # Common Voice, "normalized_text" for VoxPopuli); unify it here
+     if "sentence" not in dataset.column_names:
+         for candidate in ("normalized_text", "text"):
+             if candidate in dataset.column_names:
+                 dataset = dataset.rename_column(candidate, "sentence")
+                 break
+ 
+     # for testing: only process the first few examples
+     # dataset = dataset.select(range(10))
+ 
+     # load processor: plain CTC processor for greedy decoding,
+     # processor with n-gram LM for beam-search decoding
+     if args.greedy:
+         processor = Wav2Vec2Processor.from_pretrained(args.model_id)
+         decoder = None
+     else:
+         processor = Wav2Vec2ProcessorWithLM.from_pretrained(args.model_id)
+         decoder = processor.decoder
+ 
+     feature_extractor = processor.feature_extractor
+     tokenizer = processor.tokenizer
+     sampling_rate = feature_extractor.sampling_rate
+ 
+     # resample audio
+     dataset = dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))
+ 
+     # load eval pipeline
+     if args.device is None:
+         args.device = 0 if torch.cuda.is_available() else -1
+ 
+     config = AutoConfig.from_pretrained(args.model_id)
+     model = AutoModelForCTC.from_pretrained(args.model_id)
+ 
+     asr = pipeline(
+         "automatic-speech-recognition",
+         config=config,
+         model=model,
+         tokenizer=tokenizer,
+         feature_extractor=feature_extractor,
+         decoder=decoder,
+         device=args.device,
+     )
+ 
+     # build normalizer config
+     tokenizer = AutoTokenizer.from_pretrained(args.model_id)
+     tokens = tokenizer.convert_ids_to_tokens(list(range(tokenizer.vocab_size)))
+     special_tokens = [
+         tokenizer.pad_token,
+         tokenizer.word_delimiter_token,
+         tokenizer.unk_token,
+         tokenizer.bos_token,
+         tokenizer.eos_token,
+     ]
+     non_special_tokens = [x for x in tokens if x not in special_tokens]
+     invalid_chars_regex = rf"[^\s{re.escape(''.join(set(non_special_tokens)))}]"
+ 
+     # map function to decode audio
+     def map_to_pred(batch):
+         prediction = asr(
+             batch["audio"]["array"], chunk_length_s=args.chunk_length_s, stride_length_s=args.stride_length_s
+         )
+ 
+         batch["prediction"] = prediction["text"]
+         batch["target"] = normalize_text(batch["sentence"], invalid_chars_regex)
+         return batch
+ 
+     # run inference on all examples
+     result = dataset.map(map_to_pred, remove_columns=dataset.column_names)
+ 
+     # filter out empty targets
+     result = result.filter(lambda example: example["target"] != "")
+ 
+     # compute and log results
+     # do not change function below
+     log_results(result, args)
+ 
+ 
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+ 
+     parser.add_argument("--model_id", type=str, required=True, help="Model identifier. Should be loadable with 🤗 Transformers")
+     parser.add_argument(
+         "--dataset",
+         type=str,
+         required=True,
+         help="Dataset name to evaluate the `model_id`. Should be loadable with 🤗 Datasets",
+     )
+     parser.add_argument("--config", type=str, required=True, help="Config of the dataset. *E.g.* `'en'` for Common Voice")
+     parser.add_argument("--split", type=str, required=True, help="Split of the dataset. *E.g.* `'test'`")
+     parser.add_argument(
+         "--chunk_length_s",
+         type=float,
+         default=None,
+         help="Chunk length in seconds. Defaults to None. For long audio files a good value would be 5.0 seconds.",
+     )
+     parser.add_argument(
+         "--stride_length_s",
+         type=float,
+         default=None,
+         help="Stride of the audio chunks. Defaults to None. For long audio files a good value would be 1.0 seconds.",
+     )
+     parser.add_argument("--log_outputs", action="store_true", help="If defined, write outputs to log files for analysis.")
+     parser.add_argument("--outdir", type=str, default=".", help="Directory where result and log files are written.")
+     parser.add_argument("--greedy", action="store_true", help="If defined, the LM will be ignored during inference.")
+     parser.add_argument(
+         "--device",
+         type=int,
+         default=None,
+         help="The device to run the pipeline on. -1 for CPU (default), 0 for the first GPU and so on.",
+     )
+     args = parser.parse_args()
+ 
+     main(args)
language_model/5gram.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e840f5bdc9863f9a9dc589c4fbdbc4d0ceba5ec9308b51f5de8df801afdb430
+ size 110696534
language_model/attrs.json ADDED
@@ -0,0 +1 @@
+ {"alpha": 0.5, "beta": 1.5, "unk_score_offset": -10.0, "score_boundary": true}
language_model/unigrams.txt ADDED
The diff for this file is too large to render. See raw diff
 
preprocessor_config.json ADDED
@@ -0,0 +1,10 @@
+ {
+   "do_normalize": true,
+   "feature_extractor_type": "Wav2Vec2FeatureExtractor",
+   "feature_size": 1,
+   "padding_side": "right",
+   "padding_value": 0,
+   "processor_class": "Wav2Vec2ProcessorWithLM",
+   "return_attention_mask": true,
+   "sampling_rate": 16000
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b5c1a54580151b88335c8297a0dcc3c403aed8a91340048de1187c2fa8282464
+ size 3850500657
results_mozilla-foundatio_common_voice_9_0/log_mozilla-foundation_common_voice_9_0_fr_test_predictions.txt ADDED
The diff for this file is too large to render. See raw diff
 
results_mozilla-foundatio_common_voice_9_0/log_mozilla-foundation_common_voice_9_0_fr_test_targets.txt ADDED
The diff for this file is too large to render. See raw diff
 
results_mozilla-foundatio_common_voice_9_0/mozilla-foundation_common_voice_9_0_fr_test_eval_results.txt ADDED
@@ -0,0 +1,2 @@
+ WER: 0.4574742025825611
+ CER: 0.2299324803692149
results_mozilla-foundatio_common_voice_9_0_with_lm/log_mozilla-foundation_common_voice_9_0_fr_test_predictions.txt ADDED
The diff for this file is too large to render. See raw diff
 
results_mozilla-foundatio_common_voice_9_0_with_lm/log_mozilla-foundation_common_voice_9_0_fr_test_targets.txt ADDED
The diff for this file is too large to render. See raw diff
 
results_mozilla-foundatio_common_voice_9_0_with_lm/mozilla-foundation_common_voice_9_0_fr_test_eval_results.txt ADDED
@@ -0,0 +1,2 @@
+ WER: 0.38810468081373856
+ CER: 0.2325219729607091
results_polinaeterna_voxpopuli/log_polinaeterna_voxpopuli_fr_test_predictions.txt ADDED
The diff for this file is too large to render. See raw diff
 
results_polinaeterna_voxpopuli/log_polinaeterna_voxpopuli_fr_test_targets.txt ADDED
The diff for this file is too large to render. See raw diff
 
results_polinaeterna_voxpopuli/polinaeterna_voxpopuli_fr_test_eval_results.txt ADDED
@@ -0,0 +1,2 @@
+ WER: 0.11708419585146382
+ CER: 0.05802534895790652
results_polinaeterna_voxpopuli_with_lm/log_polinaeterna_voxpopuli_fr_test_predictions.txt ADDED
The diff for this file is too large to render. See raw diff
 
results_polinaeterna_voxpopuli_with_lm/log_polinaeterna_voxpopuli_fr_test_targets.txt ADDED
The diff for this file is too large to render. See raw diff
 
results_polinaeterna_voxpopuli_with_lm/polinaeterna_voxpopuli_fr_test_eval_results.txt ADDED
@@ -0,0 +1,2 @@
+ WER: 0.10014363567234168
+ CER: 0.056365920948400663
results_speech-recognition-community-v2_dev_data/log_speech-recognition-community-v2_dev_data_fr_validation_predictions.txt ADDED
The diff for this file is too large to render. See raw diff
 
results_speech-recognition-community-v2_dev_data/log_speech-recognition-community-v2_dev_data_fr_validation_targets.txt ADDED
The diff for this file is too large to render. See raw diff
 
results_speech-recognition-community-v2_dev_data/speech-recognition-community-v2_dev_data_fr_validation_eval_results.txt ADDED
@@ -0,0 +1,2 @@
+ WER: 0.27865348650814853
+ CER: 0.13207079892578805
results_speech-recognition-community-v2_dev_data_with_lm/log_speech-recognition-community-v2_dev_data_fr_validation_predictions.txt ADDED
The diff for this file is too large to render. See raw diff
 
results_speech-recognition-community-v2_dev_data_with_lm/log_speech-recognition-community-v2_dev_data_fr_validation_targets.txt ADDED
The diff for this file is too large to render. See raw diff
 
results_speech-recognition-community-v2_dev_data_with_lm/speech-recognition-community-v2_dev_data_fr_validation_eval_results.txt ADDED
@@ -0,0 +1,2 @@
+ WER: 0.22535399412236173
+ CER: 0.1283249878106353
special_tokens_map.json ADDED
@@ -0,0 +1,36 @@
+ {
+   "additional_special_tokens": [
+     {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     }
+   ],
+   "bos_token": "<s>",
+   "eos_token": "</s>",
+   "pad_token": "[PAD]",
+   "unk_token": "[UNK]"
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,13 @@
+ {
+   "bos_token": "<s>",
+   "do_lower_case": false,
+   "eos_token": "</s>",
+   "name_or_path": "outputs/voxpopuli/wav2vec2-xls-r-1b-ft",
+   "pad_token": "[PAD]",
+   "processor_class": "Wav2Vec2ProcessorWithLM",
+   "replace_word_delimiter_char": " ",
+   "special_tokens_map_file": null,
+   "tokenizer_class": "Wav2Vec2CTCTokenizer",
+   "unk_token": "[UNK]",
+   "word_delimiter_token": "|"
+ }
vocab.json ADDED
@@ -0,0 +1,46 @@
+ {
+   "'": 1,
+   "[PAD]": 43,
+   "[UNK]": 42,
+   "a": 2,
+   "b": 3,
+   "c": 4,
+   "d": 5,
+   "e": 6,
+   "f": 7,
+   "g": 8,
+   "h": 9,
+   "i": 10,
+   "j": 11,
+   "k": 12,
+   "l": 13,
+   "m": 14,
+   "n": 15,
+   "o": 16,
+   "p": 17,
+   "q": 18,
+   "r": 19,
+   "s": 20,
+   "t": 21,
+   "u": 22,
+   "v": 23,
+   "w": 24,
+   "x": 25,
+   "y": 26,
+   "z": 27,
+   "|": 0,
+   "à": 28,
+   "â": 29,
+   "ç": 30,
+   "è": 31,
+   "é": 32,
+   "ê": 33,
+   "ë": 34,
+   "î": 35,
+   "ï": 36,
+   "ô": 37,
+   "ù": 38,
+   "û": 39,
+   "ü": 40,
+   "œ": 41
+ }