Training in progress, step 500
Browse files
- .ipynb_checkpoints/eval-checkpoint.py +128 -0
- .ipynb_checkpoints/run-checkpoint.sh +38 -0
- config.json +3 -3
- log_mozilla-foundation_common_voice_8_0_ky_test_predictions.txt +0 -0
- log_mozilla-foundation_common_voice_8_0_ky_test_targets.txt +0 -0
- pytorch_model.bin +1 -1
- run.sh +5 -5
- runs/Feb04_19-31-13_job-699ba53c-fea9-4eb2-81af-a97f440eaa45/1644003121.6455085/events.out.tfevents.1644003121.job-699ba53c-fea9-4eb2-81af-a97f440eaa45.1462870.1 +3 -0
- runs/Feb04_19-31-13_job-699ba53c-fea9-4eb2-81af-a97f440eaa45/events.out.tfevents.1644003121.job-699ba53c-fea9-4eb2-81af-a97f440eaa45.1462870.0 +3 -0
- runs/Feb04_19-35-31_job-699ba53c-fea9-4eb2-81af-a97f440eaa45/1644003377.3563116/events.out.tfevents.1644003377.job-699ba53c-fea9-4eb2-81af-a97f440eaa45.1464599.1 +3 -0
- runs/Feb04_19-35-31_job-699ba53c-fea9-4eb2-81af-a97f440eaa45/events.out.tfevents.1644003377.job-699ba53c-fea9-4eb2-81af-a97f440eaa45.1464599.0 +3 -0
- special_tokens_map.json +1 -1
- training_args.bin +1 -1
.ipynb_checkpoints/eval-checkpoint.py
ADDED
@@ -0,0 +1,128 @@
+#!/usr/bin/env python3
+import argparse
+import re
+from typing import Dict
+
+from datasets import Audio, Dataset, load_dataset, load_metric
+
+from transformers import AutoFeatureExtractor, pipeline
+
+
+def log_results(result: Dataset, args: Dict[str, str]):
+    """DO NOT CHANGE. This function computes and logs the result metrics."""
+
+    log_outputs = args.log_outputs
+    dataset_id = "_".join(args.dataset.split("/") + [args.config, args.split])
+
+    # load metric
+    wer = load_metric("wer")
+    cer = load_metric("cer")
+
+    # compute metrics
+    wer_result = wer.compute(references=result["target"], predictions=result["prediction"])
+    cer_result = cer.compute(references=result["target"], predictions=result["prediction"])
+
+    # print & log results
+    result_str = f"WER: {wer_result}\n" f"CER: {cer_result}"
+    print(result_str)
+
+    with open(f"{dataset_id}_eval_results.txt", "w") as f:
+        f.write(result_str)
+
+    # log all results in text file. Possibly interesting for analysis
+    if log_outputs is not None:
+        pred_file = f"log_{dataset_id}_predictions.txt"
+        target_file = f"log_{dataset_id}_targets.txt"
+
+        with open(pred_file, "w") as p, open(target_file, "w") as t:
+
+            # mapping function to write output
+            def write_to_file(batch, i):
+                p.write(f"{i}" + "\n")
+                p.write(batch["prediction"] + "\n")
+                t.write(f"{i}" + "\n")
+                t.write(batch["target"] + "\n")
+
+            result.map(write_to_file, with_indices=True)
+
+
+def normalize_text(text: str) -> str:
+    """DO ADAPT FOR YOUR USE CASE. this function normalizes the target text."""
+
+    chars_to_ignore_regex = '[!"%,.:;?\\_|©«¬»،؛؟‒–—’“”„…‹›−☺♂�\\\\-]'  # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training
+
+    text = re.sub(chars_to_ignore_regex, "", text.lower())
+
+    # In addition, we can normalize the target text, e.g. removing new lines characters etc...
+    # note that order is important here!
+    token_sequences_to_ignore = ["\n\n", "\n", "   ", "  "]
+
+    for t in token_sequences_to_ignore:
+        text = " ".join(text.split(t))
+
+    return text
+
+
+def main(args):
+    # load dataset
+    dataset = load_dataset(args.dataset, args.config, split=args.split, use_auth_token=True)
+
+    # for testing: only process the first two examples as a test
+    # dataset = dataset.select(range(10))
+
+    # load processor
+    feature_extractor = AutoFeatureExtractor.from_pretrained(args.model_id)
+    sampling_rate = feature_extractor.sampling_rate
+
+    # resample audio
+    dataset = dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))
+
+    # load eval pipeline
+    asr = pipeline("automatic-speech-recognition", model=args.model_id)
+
+    # map function to decode audio
+    def map_to_pred(batch):
+        prediction = asr(
+            batch["audio"]["array"], chunk_length_s=args.chunk_length_s, stride_length_s=args.stride_length_s
+        )
+
+        batch["prediction"] = prediction["text"]
+        batch["target"] = normalize_text(batch["sentence"])
+        return batch
+
+    # run inference on all examples
+    result = dataset.map(map_to_pred, remove_columns=dataset.column_names)
+
+    # compute and log_results
+    # do not change function below
+    log_results(result, args)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--model_id", type=str, required=True, help="Model identifier. Should be loadable with 🤗 Transformers"
+    )
+    parser.add_argument(
+        "--dataset",
+        type=str,
+        required=True,
+        help="Dataset name to evaluate the `model_id`. Should be loadable with 🤗 Datasets",
+    )
+    parser.add_argument(
+        "--config", type=str, required=True, help="Config of the dataset. *E.g.* `'en'` for Common Voice"
+    )
+    parser.add_argument("--split", type=str, required=True, help="Split of the dataset. *E.g.* `'test'`")
+    parser.add_argument(
+        "--chunk_length_s", type=float, default=None, help="Chunk length in seconds. Defaults to 5 seconds."
+    )
+    parser.add_argument(
+        "--stride_length_s", type=float, default=None, help="Stride of the audio chunks. Defaults to 1 second."
+    )
+    parser.add_argument(
+        "--log_outputs", action="store_true", help="If defined, write outputs to log file for analysis."
+    )
+    args = parser.parse_args()
+
+    main(args)
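For reference, running this checkpoint script against the Common Voice 8.0 Kyrgyz test split would produce the prediction/target logs added in this commit. A minimal sketch of the invocation, assuming the model to evaluate is the local checkpoint in ./xls-r-kyrgiz-cv8 (the --output_dir from run.sh; substitute the Hub repo id if evaluating the pushed model):

python .ipynb_checkpoints/eval-checkpoint.py \
    --model_id="./xls-r-kyrgiz-cv8" \
    --dataset="mozilla-foundation/common_voice_8_0" \
    --config="ky" \
    --split="test" \
    --log_outputs

With --log_outputs set, log_results derives dataset_id = "mozilla-foundation_common_voice_8_0_ky_test" and writes exactly the log_mozilla-foundation_common_voice_8_0_ky_test_predictions.txt and log_mozilla-foundation_common_voice_8_0_ky_test_targets.txt files listed in this commit.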
.ipynb_checkpoints/run-checkpoint.sh
ADDED
@@ -0,0 +1,38 @@
+python kyrgiz/run_speech_recognition_ctc.py \
+    --dataset_name="mozilla-foundation/common_voice_8_0" \
+    --model_name_or_path="facebook/wav2vec2-xls-r-300m" \
+    --dataset_config_name="ky" \
+    --train_split_name="train+validation[:50%]" \
+    --eval_split_name="validation[50%:]" \
+    --output_dir="./xls-r-kyrgiz-cv8" \
+    --overwrite_output_dir \
+    --num_train_epochs="50" \
+    --per_device_train_batch_size="16" \
+    --per_device_eval_batch_size="8" \
+    --gradient_accumulation_steps="4" \
+    --learning_rate="1e-4" \
+    --warmup_steps="250" \
+    --length_column_name="input_length" \
+    --evaluation_strategy="steps" \
+    --text_column_name="sentence" \
+    --chars_to_ignore , ? . ! \- \; \: \\ _ \| ‒ ☺ ♂ © « ¬ » \" „ “ % ” � — ’ ، ؛ ؟ ‹ › − … – \
+    --eval_metrics="wer" \
+    --save_steps="500" \
+    --eval_steps="500" \
+    --logging_steps="100" \
+    --min_duration_in_seconds="0.2" \
+    --layerdrop="0.01" \
+    --activation_dropout="0.1" \
+    --save_total_limit="3" \
+    --freeze_feature_encoder \
+    --feat_proj_dropout="0.01" \
+    --mask_time_prob="0.50" \
+    --mask_time_length="10" \
+    --mask_feature_prob="0.25" \
+    --mask_feature_length="64" \
+    --gradient_checkpointing \
+    --use_auth_token \
+    --fp16 \
+    --group_by_length \
+    --do_train --do_eval \
+    --push_to_hub
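Note: with --per_device_train_batch_size="16" and --gradient_accumulation_steps="4", the effective training batch size is 16 × 4 = 64 samples per optimizer step on each device.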
config.json
CHANGED
@@ -49,7 +49,7 @@
   "feat_extract_activation": "gelu",
   "feat_extract_dropout": 0.0,
   "feat_extract_norm": "layer",
-  "feat_proj_dropout": 0.
+  "feat_proj_dropout": 0.01,
   "feat_quantizer_dropout": 0.0,
   "final_dropout": 0.0,
   "hidden_act": "gelu",
@@ -58,13 +58,13 @@
   "initializer_range": 0.02,
   "intermediate_size": 4096,
   "layer_norm_eps": 1e-05,
-  "layerdrop": 0.
+  "layerdrop": 0.01,
   "mask_feature_length": 64,
   "mask_feature_min_masks": 0,
   "mask_feature_prob": 0.25,
   "mask_time_length": 10,
   "mask_time_min_masks": 2,
-  "mask_time_prob": 0.
+  "mask_time_prob": 0.5,
   "model_type": "wav2vec2",
   "num_adapter_layers": 3,
   "num_attention_heads": 16,
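The three updated values (feat_proj_dropout 0.01, layerdrop 0.01, mask_time_prob 0.5) match the training flags changed in run.sh below; the fine-tuning script presumably copies these regularization arguments into the saved model config.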
log_mozilla-foundation_common_voice_8_0_ky_test_predictions.txt
ADDED
The diff for this file is too large to render. See raw diff.
log_mozilla-foundation_common_voice_8_0_ky_test_targets.txt
ADDED
The diff for this file is too large to render. See raw diff.
pytorch_model.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:6199b399576e56ccf0b75f137f2c3b014f6ed6fc8036caf25a21c670c49ffc76
 size 1262095857
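pytorch_model.bin is tracked with Git LFS, so the diff above is over the three-line pointer file: only the sha256 object id changes when the checkpoint weights are replaced, while the size (1262095857 bytes, about 1.26 GB) stays the same since the architecture is unchanged.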
run.sh
CHANGED
@@ -6,12 +6,12 @@ python kyrgiz/run_speech_recognition_ctc.py \
     --eval_split_name="validation[50%:]" \
     --output_dir="./xls-r-kyrgiz-cv8" \
     --overwrite_output_dir \
-    --num_train_epochs="
+    --num_train_epochs="50" \
     --per_device_train_batch_size="16" \
     --per_device_eval_batch_size="8" \
     --gradient_accumulation_steps="4" \
     --learning_rate="1e-4" \
-    --warmup_steps="
+    --warmup_steps="250" \
     --length_column_name="input_length" \
     --evaluation_strategy="steps" \
     --text_column_name="sentence" \
@@ -21,12 +21,12 @@ python kyrgiz/run_speech_recognition_ctc.py \
     --eval_steps="500" \
     --logging_steps="100" \
     --min_duration_in_seconds="0.2" \
-    --layerdrop="0.
+    --layerdrop="0.01" \
     --activation_dropout="0.1" \
     --save_total_limit="3" \
     --freeze_feature_encoder \
-    --feat_proj_dropout="0.
-    --mask_time_prob="0.
+    --feat_proj_dropout="0.01" \
+    --mask_time_prob="0.50" \
     --mask_time_length="10" \
     --mask_feature_prob="0.25" \
     --mask_feature_length="64" \
runs/Feb04_19-31-13_job-699ba53c-fea9-4eb2-81af-a97f440eaa45/1644003121.6455085/events.out.tfevents.1644003121.job-699ba53c-fea9-4eb2-81af-a97f440eaa45.1462870.1
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8684f706f1b6d98f1f2811df032c25d412d6d070d7b0c81d8687661090103c97
+size 4802
runs/Feb04_19-31-13_job-699ba53c-fea9-4eb2-81af-a97f440eaa45/events.out.tfevents.1644003121.job-699ba53c-fea9-4eb2-81af-a97f440eaa45.1462870.0
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2e6cc2357c5c3b93d68aa44f852cec869b2128d20e9d741ec82a1789e6a3afa9
+size 4756
runs/Feb04_19-35-31_job-699ba53c-fea9-4eb2-81af-a97f440eaa45/1644003377.3563116/events.out.tfevents.1644003377.job-699ba53c-fea9-4eb2-81af-a97f440eaa45.1464599.1
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ed147ab6a4afce4c3e6fb99e4d8f47da146b3db51412c4c65c6a8d03d50da309
+size 4802
runs/Feb04_19-35-31_job-699ba53c-fea9-4eb2-81af-a97f440eaa45/events.out.tfevents.1644003377.job-699ba53c-fea9-4eb2-81af-a97f440eaa45.1464599.0
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d99720c701214c1653a05e1396a33a187eb008fdc29e97e62c9a9607f1ad7823
+size 5856
special_tokens_map.json
CHANGED
@@ -1 +1 @@
-{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
+{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
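The only change here is that additional_special_tokens grows from three to five duplicated <s>/</s> pairs; this looks like the tokenizer re-appending its BOS/EOS entries each time it is saved and pushed, rather than an intentional vocabulary change.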
training_args.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:2b4de827673e3dfe665950f44fa7245a141f080f380ad5cea7b1833a16096357
 size 3055