Model save

Browse files

Files changed (11) hide show

README.md +59 -0
added_tokens.json +4 -0
breeze-listen-w2v2-kn-GF.log +122 -0
config.json +108 -0
model.safetensors +3 -0
preprocessor_config.json +10 -0
special_tokens_map.json +30 -0
tokenizer_config.json +48 -0
train-ctc-model.sh +108 -0
training_args.bin +3 -0
vocab.json +106 -0

README.md ADDED Viewed

	@@ -0,0 +1,59 @@

+---
+license: cc-by-nc-4.0
+base_model: facebook/mms-1b-all
+tags:
+- generated_from_trainer
+datasets:
+- fleurs
+model-index:
+- name: breeze-listen-w2v2-kn-GF
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+# breeze-listen-w2v2-kn-GF
+This model is a fine-tuned version of [facebook/mms-1b-all](https://huggingface.co/facebook/mms-1b-all) on the Kannada (`kn_in`) configuration of the [google/fleurs](https://huggingface.co/datasets/google/fleurs) dataset.
+## Model description
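+A Wav2Vec2 CTC (`Wav2Vec2ForCTC`) automatic speech recognition model for Kannada (`kan`), fine-tuned from the MMS checkpoint `facebook/mms-1b-all`. It expects 16 kHz audio and predicts text over a character-level vocabulary of 104 tokens.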
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 0.001
+- train_batch_size: 1
+- eval_batch_size: 8
+- seed: 42
+- distributed_type: multi-GPU
+- gradient_accumulation_steps: 32
+- total_train_batch_size: 32
+- optimizer: 8-bit AdamW from bitsandbytes (`adamw_bnb_8bit`) with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: linear
+- lr_scheduler_warmup_steps: 100
+- num_epochs: 4.0
+- mixed_precision_training: Native AMP
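+### Training results
+Training finished after 3.99 epochs with an average training loss of 3.3647 (runtime 13678.9 s, 0.723 samples/s). The training log contains no evaluation metrics.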
+### Framework versions
+- Transformers 4.38.0.dev0
+- Pytorch 2.1.2+cu121
+- Datasets 2.16.1
+- Tokenizers 0.15.1

added_tokens.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+  "</s>": 103,
+  "<s>": 102
+}

breeze-listen-w2v2-kn-GF.log ADDED Viewed

	@@ -0,0 +1,122 @@

+02/04/2024 13:54:35 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 1, distributed training: True, 16-bits training: True
+02/04/2024 13:54:35 - INFO - __main__ - Training/evaluation parameters TrainingArguments(
+_n_gpu=1,
+adafactor=False,
+adam_beta1=0.9,
+adam_beta2=0.999,
+adam_epsilon=1e-08,
+auto_find_batch_size=False,
+bf16=False,
+bf16_full_eval=False,
+data_seed=None,
+dataloader_drop_last=False,
+dataloader_num_workers=0,
+dataloader_persistent_workers=False,
+dataloader_pin_memory=True,
+dataloader_prefetch_factor=None,
+ddp_backend=None,
+ddp_broadcast_buffers=None,
+ddp_bucket_cap_mb=None,
+ddp_find_unused_parameters=None,
+ddp_timeout=1800,
+debug=[],
+deepspeed=None,
+disable_tqdm=False,
+dispatch_batches=None,
+do_eval=True,
+do_predict=False,
+do_train=True,
+eval_accumulation_steps=None,
+eval_delay=0,
+eval_steps=1000,
+evaluation_strategy=IntervalStrategy.STEPS,
+fp16=True,
+fp16_backend=auto,
+fp16_full_eval=False,
+fp16_opt_level=O1,
+fsdp=[],
+fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
+fsdp_min_num_params=0,
+fsdp_transformer_layer_cls_to_wrap=None,
+full_determinism=False,
+gradient_accumulation_steps=32,
+gradient_checkpointing=True,
+gradient_checkpointing_kwargs=None,
+greater_is_better=None,
+group_by_length=True,
+half_precision_backend=auto,
+hub_always_push=False,
+hub_model_id=simpragma/breeze-listen-w2v2-kn-GF,
+hub_private_repo=False,
+hub_strategy=HubStrategy.EVERY_SAVE,
+hub_token=<HUB_TOKEN>,
+ignore_data_skip=False,
+include_inputs_for_metrics=False,
+include_num_input_tokens_seen=False,
+include_tokens_per_second=False,
+jit_mode_eval=False,
+label_names=None,
+label_smoothing_factor=0.0,
+learning_rate=0.001,
+length_column_name=input_length,
+load_best_model_at_end=False,
+local_rank=0,
+log_level=passive,
+log_level_replica=warning,
+log_on_each_node=True,
+logging_dir=/cosmos/home/sp-operator/ai/training/models/simpragma/breeze-listen-w2v2-kn-GF/runs/Feb04_13-54-35_knight,
+logging_first_step=False,
+logging_nan_inf_filter=True,
+logging_steps=500,
+logging_strategy=IntervalStrategy.STEPS,
+lr_scheduler_kwargs={},
+lr_scheduler_type=SchedulerType.LINEAR,
+max_grad_norm=1.0,
+max_steps=-1,
+metric_for_best_model=None,
+mp_parameters=,
+neftune_noise_alpha=None,
+no_cuda=False,
+num_train_epochs=4.0,
+optim=OptimizerNames.ADAMW_BNB,
+optim_args=None,
+output_dir=/cosmos/home/sp-operator/ai/training/models/simpragma/breeze-listen-w2v2-kn-GF,
+overwrite_output_dir=True,
+past_index=-1,
+per_device_eval_batch_size=8,
+per_device_train_batch_size=1,
+prediction_loss_only=False,
+push_to_hub=True,
+push_to_hub_model_id=None,
+push_to_hub_organization=None,
+push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
+ray_scope=last,
+remove_unused_columns=True,
+report_to=[],
+resume_from_checkpoint=None,
+run_name=/cosmos/home/sp-operator/ai/training/models/simpragma/breeze-listen-w2v2-kn-GF,
+save_on_each_node=False,
+save_only_model=False,
+save_safetensors=True,
+save_steps=1000,
+save_strategy=IntervalStrategy.STEPS,
+save_total_limit=3,
+seed=42,
+skip_memory_metrics=True,
+split_batches=False,
+tf32=None,
+torch_compile=False,
+torch_compile_backend=None,
+torch_compile_mode=None,
+torchdynamo=None,
+tpu_metrics_debug=False,
+tpu_num_cores=None,
+use_cpu=False,
+use_ipex=False,
+use_legacy_prediction_loop=False,
+use_mps_device=False,
+warmup_ratio=0.0,
+warmup_steps=100,
+weight_decay=0.0,
+)
+{'train_runtime': 13678.922, 'train_samples_per_second': 0.723, 'train_steps_per_second': 0.023, 'train_loss': 3.364711216517857, 'epoch': 3.99}

config.json ADDED Viewed

	@@ -0,0 +1,108 @@

+{
+  "_name_or_path": "facebook/mms-1b-all",
+  "activation_dropout": 0.05,
+  "adapter_attn_dim": 16,
+  "adapter_kernel_size": 3,
+  "adapter_stride": 2,
+  "add_adapter": false,
+  "apply_spec_augment": true,
+  "architectures": [
+    "Wav2Vec2ForCTC"
+  ],
+  "attention_dropout": 0.05,
+  "bos_token_id": 1,
+  "classifier_proj_size": 256,
+  "codevector_dim": 1024,
+  "contrastive_logits_temperature": 0.1,
+  "conv_bias": true,
+  "conv_dim": [
+    512,
+    512,
+    512,
+    512,
+    512,
+    512,
+    512
+  ],
+  "conv_kernel": [
+    10,
+    3,
+    3,
+    3,
+    3,
+    2,
+    2
+  ],
+  "conv_stride": [
+    5,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2
+  ],
+  "ctc_loss_reduction": "mean",
+  "ctc_zero_infinity": false,
+  "diversity_loss_weight": 0.1,
+  "do_stable_layer_norm": true,
+  "eos_token_id": 2,
+  "feat_extract_activation": "gelu",
+  "feat_extract_dropout": 0.0,
+  "feat_extract_norm": "layer",
+  "feat_proj_dropout": 0.05,
+  "feat_quantizer_dropout": 0.0,
+  "final_dropout": 0.0,
+  "hidden_act": "gelu",
+  "hidden_dropout": 0.05,
+  "hidden_size": 1280,
+  "initializer_range": 0.02,
+  "intermediate_size": 5120,
+  "layer_norm_eps": 1e-05,
+  "layerdrop": 0.0,
+  "mask_feature_length": 10,
+  "mask_feature_min_masks": 0,
+  "mask_feature_prob": 0.0,
+  "mask_time_length": 10,
+  "mask_time_min_masks": 2,
+  "mask_time_prob": 0.05,
+  "model_type": "wav2vec2",
+  "num_adapter_layers": 3,
+  "num_attention_heads": 16,
+  "num_codevector_groups": 2,
+  "num_codevectors_per_group": 320,
+  "num_conv_pos_embedding_groups": 16,
+  "num_conv_pos_embeddings": 128,
+  "num_feat_extract_layers": 7,
+  "num_hidden_layers": 48,
+  "num_negatives": 100,
+  "output_hidden_size": 1280,
+  "pad_token_id": 101,
+  "proj_codevector_dim": 1024,
+  "tdnn_dilation": [
+    1,
+    2,
+    3,
+    1,
+    1
+  ],
+  "tdnn_dim": [
+    512,
+    512,
+    512,
+    512,
+    1500
+  ],
+  "tdnn_kernel": [
+    5,
+    3,
+    3,
+    1,
+    1
+  ],
+  "torch_dtype": "float32",
+  "transformers_version": "4.38.0.dev0",
+  "use_weighted_layer_sum": false,
+  "vocab_size": 104,
+  "xvector_output_dim": 512
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:84fe74380883a43f5a315e535ef7668f1661b11fa85249c1f5ea0efd5b201db3
+size 3859264976

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "do_normalize": true,
+  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
+  "feature_size": 1,
+  "padding_side": "right",
+  "padding_value": 0,
+  "processor_class": "Wav2Vec2Processor",
+  "return_attention_mask": true,
+  "sampling_rate": 16000
+}

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": true,
+    "normalized": false,
+    "rstrip": true,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "[UNK]",
+    "lstrip": true,
+    "normalized": false,
+    "rstrip": true,
+    "single_word": false
+  }
+}

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,48 @@

+{
+  "added_tokens_decoder": {
+    "100": {
+      "content": "[UNK]",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": false
+    },
+    "101": {
+      "content": "[PAD]",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": false
+    },
+    "102": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "103": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": true,
+  "do_lower_case": false,
+  "eos_token": "</s>",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "processor_class": "Wav2Vec2Processor",
+  "replace_word_delimiter_char": " ",
+  "target_lang": "kan",
+  "tokenizer_class": "Wav2Vec2CTCTokenizer",
+  "unk_token": "[UNK]",
+  "word_delimiter_token": "|"
+}

train-ctc-model.sh ADDED Viewed

	@@ -0,0 +1,108 @@

+#! /usr/bin/bash
+#
+# This script runs the speech recognition training using DeepSpeed
+#
+# CHANGE THESE AS PER YOUR REQUIREMENTS
+# LANG as it is referred in the dataset
+#LANG=te			# 2 letter ISO code for the language
+LANG=kn_in		# 2 letter ISO code for the language with locale (some datasets like Google/Fleurs require this)
+LANG_ISO_3=kan		# 3 letter ISO code for the language
+LANGUAGE=Kannada	# Full language name as per Whisper convention
+# For Mozilla Commonvoice datasets, uncomment the following
+#DATASET="mozilla-foundation/common_voice_16_0"
+#TEXT_COLUMN="sentence"
+# For Google Fleurs datasets, uncomment the following
+DATASET="google/fleurs"
+TEXT_COLUMN="transcription"
+# Custom datasets
+#DATASET="parambharat/kannada_asr_corpus"
+#TEXT_COLUMN=${TEXT_COLUMN:-"sentence"}
+#
+# Main
+#
+SCRIPT_PATH=$(realpath "${BASH_SOURCE[0]}")
+SCRIPT_DIR=$(realpath $(dirname "${BASH_SOURCE[0]}"))
+# Port to use
+export MASTER_PORT="${MASTER_PORT:-29500}"
+echo "Using master_port for deepspeech: ${MASTER_PORT}"
+export "MASTER_ADDR"="localhost"
+export "RANK"="0"
+export "LOCAL_RANK"="0"
+export "WORLD_SIZE"="1"
+# Base model variant
+MODEL=w2v2
+# Model names and other stuff
+BASE_MODEL="facebook/mms-1b-all"
+JUST_LANG=${LANG%%_*}
+MY_MODEL="breeze-listen-${MODEL}-${JUST_LANG}-GF"
+OUTDIR="/cosmos/home/sp-operator/ai/training/models/simpragma/${MY_MODEL}"
+echo "OUTDIR: ${OUTDIR}"
+# Training parameters you can tweak. Feel free to directly change any of the parameters below.
+MAX_EPOCHS=4
+TRAIN_BATCH_SIZE=2
+EVAL_BATCH_SIZE=2
+LEARNING_RATE="1e-3"
+EVAL_STEPS="1000"
+SAVE_STEPS="1000"
+# Create dir
+mkdir -p ${OUTDIR}
+#	--overwrite_output_dir \
+# If you want to resume from existing checkpoint, include the following argument as well. Modify the checkpoint directory.
+# --resume_from_checkpoint="${MY_MODEL}/checkpoint-400" \
+echo "================ TRAINING: START ================"
+python ${SCRIPT_DIR}/run_speech_recognition_ctc_adapter.py \
+	--dataset_name="${DATASET}" \
+	--model_name_or_path="${BASE_MODEL}" \
+	--dataset_config_name="${LANG}" \
+	--target_language="${LANG_ISO_3}"	\
+	--output_dir="${OUTDIR}" \
+	--num_train_epochs="${MAX_EPOCHS}" \
+	--per_device_train_batch_size="${TRAIN_BATCH_SIZE}" \
+	--learning_rate="${LEARNING_RATE}" \
+	--warmup_steps="100" \
+	--evaluation_strategy="steps" \
+	--text_column_name="${TEXT_COLUMN}" \
+	--length_column_name="input_length" \
+	--save_steps="${SAVE_STEPS}" \
+	--eval_steps="${EVAL_STEPS}" \
+	--save_total_limit="3" \
+	--optim="adamw_bnb_8bit"	\
+	--hub_model_id "simpragma/${MY_MODEL}" \
+	--gradient_checkpointing \
+	--chars_to_ignore , ? . ! - \; \: \" “ % ‘ ” � \
+	--fp16 \
+	--group_by_length \
+	--do_train 	\
+	--do_eval \
+	--push_to_hub	\
+	--overwrite_output_dir	\
+	| tee ${OUTDIR}/${MY_MODEL}.log
+# Copy the script to the output directory so that we can recreate the model
+cp ${SCRIPT_PATH} ${OUTDIR}
+echo "================ TRAINING: DONE ================"
+exit 0

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:889214cde0e59492b6da27c312fdf4c9aa1ba5439a639a7b5f6e3a9dc0f91c73
+size 4856

vocab.json ADDED Viewed

	@@ -0,0 +1,106 @@

+{
+  "kan": {
+    "[": 1,
+    "[PAD]": 101,
+    "[UNK]": 100,
+    "]": 2,
+    "a": 3,
+    "b": 4,
+    "c": 5,
+    "d": 6,
+    "e": 7,
+    "f": 8,
+    "g": 9,
+    "h": 10,
+    "i": 11,
+    "j": 12,
+    "k": 13,
+    "l": 14,
+    "m": 15,
+    "n": 16,
+    "o": 17,
+    "p": 18,
+    "q": 19,
+    "r": 20,
+    "s": 21,
+    "t": 22,
+    "u": 23,
+    "v": 24,
+    "w": 25,
+    "x": 26,
+    "y": 27,
+    "z": 28,
+    "|": 0,
+    "°": 29,
+    "²": 30,
+    "½": 31,
+    "¾": 32,
+    "õ": 33,
+    "ಂ": 34,
+    "ಃ": 35,
+    "ಅ": 36,
+    "ಆ": 37,
+    "ಇ": 38,
+    "ಈ": 39,
+    "ಉ": 40,
+    "ಊ": 41,
+    "ಋ": 42,
+    "ಎ": 43,
+    "ಏ": 44,
+    "ಐ": 45,
+    "ಒ": 46,
+    "ಓ": 47,
+    "ಔ": 48,
+    "ಕ": 49,
+    "ಖ": 50,
+    "ಗ": 51,
+    "ಘ": 52,
+    "ಚ": 53,
+    "ಛ": 54,
+    "ಜ": 55,
+    "ಝ": 56,
+    "ಞ": 57,
+    "ಟ": 58,
+    "ಠ": 59,
+    "ಡ": 60,
+    "ಢ": 61,
+    "ಣ": 62,
+    "ತ": 63,
+    "ಥ": 64,
+    "ದ": 65,
+    "ಧ": 66,
+    "ನ": 67,
+    "ಪ": 68,
+    "ಫ": 69,
+    "ಬ": 70,
+    "ಭ": 71,
+    "ಮ": 72,
+    "ಯ": 73,
+    "ರ": 74,
+    "ಲ": 75,
+    "ಳ": 76,
+    "ವ": 77,
+    "ಶ": 78,
+    "ಷ": 79,
+    "ಸ": 80,
+    "ಹ": 81,
+    "ಾ": 82,
+    "ಿ": 83,
+    "ೀ": 84,
+    "ು": 85,
+    "ೂ": 86,
+    "ೃ": 87,
+    "ೆ": 88,
+    "ೇ": 89,
+    "ೈ": 90,
+    "ೊ": 91,
+    "ೋ": 92,
+    "ೌ": 93,
+    "್": 94,
+    "೪": 95,
+    "": 96,
+    "‌": 97,
+    "‍": 98,
+    "–": 99
+  }
+}