lucio committed on
Commit
90ad0d7
1 Parent(s): 01b2793

Training in progress, step 500

Browse files
.ipynb_checkpoints/added_tokens-checkpoint.json CHANGED
@@ -1 +1 @@
1
- {"<s>": 42, "</s>": 43}
 
1
+ {"<s>": 36, "</s>": 37}
.ipynb_checkpoints/run-checkpoint.sh ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ python run_speech_recognition_ctc.py \
2
+ --dataset_name="mozilla-foundation/common_voice_8_0" \
3
+ --model_name_or_path="facebook/wav2vec2-xls-r-300m" \
4
+ --dataset_config_name="ug" \
5
+ --output_dir="./xls-r-uyghur-cv8" \
6
+ --overwrite_output_dir \
7
+ --num_train_epochs="100" \
8
+ --per_device_train_batch_size="8" \
9
+ --per_device_eval_batch_size="8" \
10
+ --gradient_accumulation_steps="4" \
11
+ --learning_rate="1e-4" \
12
+ --warmup_steps="2000" \
13
+ --length_column_name="input_length" \
14
+ --evaluation_strategy="steps" \
15
+ --text_column_name="sentence" \
16
+ --chars_to_ignore , ? . ! \- \; \: \\ _ \| ‒ ☺ ♂ © « ¬ » \" „ “ % ” � — ’ ، ؛ ؟ ‹ › − … – \
17
+ --save_steps="500" \
18
+ --eval_steps="500" \
19
+ --logging_steps="100" \
20
+ --layerdrop="0.0" \
21
+ --activation_dropout="0.1" \
22
+ --save_total_limit="3" \
23
+ --freeze_feature_encoder \
24
+ --feat_proj_dropout="0.0" \
25
+ --mask_time_prob="0.75" \
26
+ --mask_time_length="10" \
27
+ --mask_feature_prob="0.25" \
28
+ --mask_feature_length="64" \
29
+ --gradient_checkpointing \
30
+ --use_auth_token \
31
+ --fp16 \
32
+ --group_by_length \
33
+ --do_train --do_eval \
34
+ --push_to_hub
.ipynb_checkpoints/special_tokens_map-checkpoint.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
added_tokens.json CHANGED
@@ -1 +1 @@
1
- {"<s>": 42, "</s>": 43}
 
1
+ {"<s>": 36, "</s>": 37}
config.json CHANGED
@@ -76,7 +76,7 @@
76
  "num_hidden_layers": 24,
77
  "num_negatives": 100,
78
  "output_hidden_size": 1024,
79
- "pad_token_id": 41,
80
  "proj_codevector_dim": 768,
81
  "tdnn_dilation": [
82
  1,
@@ -102,6 +102,6 @@
102
  "torch_dtype": "float32",
103
  "transformers_version": "4.16.0.dev0",
104
  "use_weighted_layer_sum": false,
105
- "vocab_size": 44,
106
  "xvector_output_dim": 512
107
  }
 
76
  "num_hidden_layers": 24,
77
  "num_negatives": 100,
78
  "output_hidden_size": 1024,
79
+ "pad_token_id": 35,
80
  "proj_codevector_dim": 768,
81
  "tdnn_dilation": [
82
  1,
 
102
  "torch_dtype": "float32",
103
  "transformers_version": "4.16.0.dev0",
104
  "use_weighted_layer_sum": false,
105
+ "vocab_size": 38,
106
  "xvector_output_dim": 512
107
  }
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c1116f3ee11641a58b4d05b479eab62bd54209a9260c5cda2e23d53a2bde8d25
3
- size 1262104049
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a3ad97ece4793bda34e8150646674d77e896042808a6b9fc05701079d65c0e6
3
+ size 1262079473
run.sh ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ python run_speech_recognition_ctc.py \
2
+ --dataset_name="mozilla-foundation/common_voice_8_0" \
3
+ --model_name_or_path="facebook/wav2vec2-xls-r-300m" \
4
+ --dataset_config_name="ug" \
5
+ --output_dir="./xls-r-uyghur-cv8" \
6
+ --overwrite_output_dir \
7
+ --num_train_epochs="100" \
8
+ --per_device_train_batch_size="8" \
9
+ --per_device_eval_batch_size="8" \
10
+ --gradient_accumulation_steps="4" \
11
+ --learning_rate="1e-4" \
12
+ --warmup_steps="2000" \
13
+ --length_column_name="input_length" \
14
+ --evaluation_strategy="steps" \
15
+ --text_column_name="sentence" \
16
+ --chars_to_ignore , ? . ! \- \; \: \\ _ \| ‒ ☺ ♂ © « ¬ » \" „ “ % ” � — ’ ، ؛ ؟ ‹ › − … – \
17
+ --save_steps="500" \
18
+ --eval_steps="500" \
19
+ --logging_steps="100" \
20
+ --layerdrop="0.0" \
21
+ --activation_dropout="0.1" \
22
+ --save_total_limit="3" \
23
+ --freeze_feature_encoder \
24
+ --feat_proj_dropout="0.0" \
25
+ --mask_time_prob="0.75" \
26
+ --mask_time_length="10" \
27
+ --mask_feature_prob="0.25" \
28
+ --mask_feature_length="64" \
29
+ --gradient_checkpointing \
30
+ --use_auth_token \
31
+ --fp16 \
32
+ --group_by_length \
33
+ --do_train --do_eval \
34
+ --push_to_hub
runs/Jan29_15-39-53_job-0074bb36-c67f-4775-b1b6-176eb09b0ba4/1643470896.2816308/events.out.tfevents.1643470896.job-0074bb36-c67f-4775-b1b6-176eb09b0ba4.1361704.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e47b8b81878370179f320a4a41c2d9e135216183adc90744dd0b29f3284622f8
3
+ size 4802
runs/Jan29_15-39-53_job-0074bb36-c67f-4775-b1b6-176eb09b0ba4/events.out.tfevents.1643470896.job-0074bb36-c67f-4775-b1b6-176eb09b0ba4.1361704.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:26425c50ab8b2485d1f2e80976ab298af47009398b653416088af305c50e5f98
3
+ size 5831
special_tokens_map.json CHANGED
@@ -1 +1 @@
1
- {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
 
1
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:81c852d75cc7308b223dd2f8668f73ae311d16d0fb416266206ec9a8c702837a
3
  size 3055
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4b6c2e5784857823e08cb2de88b60d0e851153e8fa91b898a9b696464a846526
3
  size 3055
vocab.json CHANGED
@@ -1 +1 @@
1
- {"،": 1, "؛": 2, "؟": 3, "ئ": 4, "ا": 5, "ب": 6, "ت": 7, "ج": 8, "خ": 9, "د": 10, "ر": 11, "ز": 12, "س": 13, "ش": 14, "غ": 15, "ف": 16, "ق": 17, "ك": 18, "ل": 19, "م": 20, "ن": 21, "و": 22, "ى": 23, "ي": 24, "پ": 25, "چ": 26, "ژ": 27, "ڭ": 28, "گ": 29, "ھ": 30, "ۆ": 31, "ۇ": 32, "ۈ": 33, "ۋ": 34, "ې": 35, "ە": 36, "‹": 37, "›": 38, "−": 39, "|": 0, "[UNK]": 40, "[PAD]": 41}
 
1
+ {"ئ": 1, "ا": 2, "ب": 3, "ت": 4, "ج": 5, "خ": 6, "د": 7, "ر": 8, "ز": 9, "س": 10, "ش": 11, "غ": 12, "ف": 13, "ق": 14, "ك": 15, "ل": 16, "م": 17, "ن": 18, "و": 19, "ى": 20, "ي": 21, "پ": 22, "چ": 23, "ژ": 24, "ڭ": 25, "گ": 26, "ھ": 27, "ۆ": 28, "ۇ": 29, "ۈ": 30, "ۋ": 31, "ې": 32, "ە": 33, "|": 0, "[UNK]": 34, "[PAD]": 35}