ameerazam08 committed
Commit
e116e32
1 Parent(s): 34cfb62

Upload folder using huggingface_hub

Files changed (39)
  1. .gitattributes +1 -0
  2. gemma-jokes-gemma/checkpoint-100/README.md +204 -0
  3. gemma-jokes-gemma/checkpoint-100/adapter_config.json +33 -0
  4. gemma-jokes-gemma/checkpoint-100/adapter_model.safetensors +3 -0
  5. gemma-jokes-gemma/checkpoint-100/optimizer.pt +3 -0
  6. gemma-jokes-gemma/checkpoint-100/rng_state.pth +3 -0
  7. gemma-jokes-gemma/checkpoint-100/scheduler.pt +3 -0
  8. gemma-jokes-gemma/checkpoint-100/trainer_state.json +81 -0
  9. gemma-jokes-gemma/checkpoint-100/training_args.bin +3 -0
  10. gemma-jokes-gemma/checkpoint-125/README.md +204 -0
  11. gemma-jokes-gemma/checkpoint-125/adapter_config.json +33 -0
  12. gemma-jokes-gemma/checkpoint-125/adapter_model.safetensors +3 -0
  13. gemma-jokes-gemma/checkpoint-125/optimizer.pt +3 -0
  14. gemma-jokes-gemma/checkpoint-125/rng_state.pth +3 -0
  15. gemma-jokes-gemma/checkpoint-125/scheduler.pt +3 -0
  16. gemma-jokes-gemma/checkpoint-125/trainer_state.json +96 -0
  17. gemma-jokes-gemma/checkpoint-125/training_args.bin +3 -0
  18. gemma-jokes-gemma/checkpoint-150/README.md +204 -0
  19. gemma-jokes-gemma/checkpoint-150/adapter_config.json +33 -0
  20. gemma-jokes-gemma/checkpoint-150/adapter_model.safetensors +3 -0
  21. gemma-jokes-gemma/checkpoint-150/optimizer.pt +3 -0
  22. gemma-jokes-gemma/checkpoint-150/rng_state.pth +3 -0
  23. gemma-jokes-gemma/checkpoint-150/scheduler.pt +3 -0
  24. gemma-jokes-gemma/checkpoint-150/trainer_state.json +111 -0
  25. gemma-jokes-gemma/checkpoint-150/training_args.bin +3 -0
  26. install.sh +2 -1
  27. test.py +46 -0
  28. train.py +187 -0
  29. wandb/debug-internal.log +0 -0
  30. wandb/debug.log +27 -0
  31. wandb/run-20240223_032422-b657btrg/files/conda-environment.yaml +123 -0
  32. wandb/run-20240223_032422-b657btrg/files/config.yaml +682 -0
  33. wandb/run-20240223_032422-b657btrg/files/output.log +1198 -0
  34. wandb/run-20240223_032422-b657btrg/files/requirements.txt +101 -0
  35. wandb/run-20240223_032422-b657btrg/files/wandb-metadata.json +202 -0
  36. wandb/run-20240223_032422-b657btrg/files/wandb-summary.json +1 -0
  37. wandb/run-20240223_032422-b657btrg/logs/debug-internal.log +0 -0
  38. wandb/run-20240223_032422-b657btrg/logs/debug.log +27 -0
  39. wandb/run-20240223_032422-b657btrg/run-b657btrg.wandb +3 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20240223_032422-b657btrg/run-b657btrg.wandb filter=lfs diff=lfs merge=lfs -text
gemma-jokes-gemma/checkpoint-100/README.md ADDED
@@ -0,0 +1,204 @@
+ ---
+ library_name: peft
+ base_model: google/gemma-2b
+ ---
+
+ # Model Card for Model ID
+
+ <!-- Provide a quick summary of what the model is/does. -->
+
+
+
+ ## Model Details
+
+ ### Model Description
+
+ <!-- Provide a longer summary of what this model is. -->
+
+
+
+ - **Developed by:** [More Information Needed]
+ - **Funded by [optional]:** [More Information Needed]
+ - **Shared by [optional]:** [More Information Needed]
+ - **Model type:** [More Information Needed]
+ - **Language(s) (NLP):** [More Information Needed]
+ - **License:** [More Information Needed]
+ - **Finetuned from model [optional]:** [More Information Needed]
+
+ ### Model Sources [optional]
+
+ <!-- Provide the basic links for the model. -->
+
+ - **Repository:** [More Information Needed]
+ - **Paper [optional]:** [More Information Needed]
+ - **Demo [optional]:** [More Information Needed]
+
+ ## Uses
+
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+ ### Direct Use
+
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+ [More Information Needed]
+
+ ### Downstream Use [optional]
+
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+ [More Information Needed]
+
+ ### Out-of-Scope Use
+
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+ [More Information Needed]
+
+ ## Bias, Risks, and Limitations
+
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+ [More Information Needed]
+
+ ### Recommendations
+
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+ ## How to Get Started with the Model
+
+ Use the code below to get started with the model.
+
+ [More Information Needed]
+
+ ## Training Details
+
+ ### Training Data
+
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+ [More Information Needed]
+
+ ### Training Procedure
+
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+ #### Preprocessing [optional]
+
+ [More Information Needed]
+
+
+ #### Training Hyperparameters
+
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+ #### Speeds, Sizes, Times [optional]
+
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+ [More Information Needed]
+
+ ## Evaluation
+
+ <!-- This section describes the evaluation protocols and provides the results. -->
+
+ ### Testing Data, Factors & Metrics
+
+ #### Testing Data
+
+ <!-- This should link to a Dataset Card if possible. -->
+
+ [More Information Needed]
+
+ #### Factors
+
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+ [More Information Needed]
+
+ #### Metrics
+
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+ [More Information Needed]
+
+ ### Results
+
+ [More Information Needed]
+
+ #### Summary
+
+
+
+ ## Model Examination [optional]
+
+ <!-- Relevant interpretability work for the model goes here -->
+
+ [More Information Needed]
+
+ ## Environmental Impact
+
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+ - **Hardware Type:** [More Information Needed]
+ - **Hours used:** [More Information Needed]
+ - **Cloud Provider:** [More Information Needed]
+ - **Compute Region:** [More Information Needed]
+ - **Carbon Emitted:** [More Information Needed]
+
+ ## Technical Specifications [optional]
+
+ ### Model Architecture and Objective
+
+ [More Information Needed]
+
+ ### Compute Infrastructure
+
+ [More Information Needed]
+
+ #### Hardware
+
+ [More Information Needed]
+
+ #### Software
+
+ [More Information Needed]
+
+ ## Citation [optional]
+
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+ **BibTeX:**
+
+ [More Information Needed]
+
+ **APA:**
+
+ [More Information Needed]
+
+ ## Glossary [optional]
+
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+ [More Information Needed]
+
+ ## More Information [optional]
+
+ [More Information Needed]
+
+ ## Model Card Authors [optional]
+
+ [More Information Needed]
+
+ ## Model Card Contact
+
+ [More Information Needed]
+
+
+ ### Framework versions
+
+ - PEFT 0.8.2
gemma-jokes-gemma/checkpoint-100/adapter_config.json ADDED
@@ -0,0 +1,33 @@
+ {
+   "alpha_pattern": {},
+   "auto_mapping": null,
+   "base_model_name_or_path": "google/gemma-2b",
+   "bias": "none",
+   "fan_in_fan_out": false,
+   "inference_mode": true,
+   "init_lora_weights": true,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "loftq_config": {},
+   "lora_alpha": 64,
+   "lora_dropout": 0.05,
+   "megatron_config": null,
+   "megatron_core": "megatron.core",
+   "modules_to_save": null,
+   "peft_type": "LORA",
+   "r": 32,
+   "rank_pattern": {},
+   "revision": null,
+   "target_modules": [
+     "o_proj",
+     "up_proj",
+     "gate_proj",
+     "k_proj",
+     "q_proj",
+     "down_proj",
+     "lm_head",
+     "v_proj"
+   ],
+   "task_type": "CAUSAL_LM",
+   "use_rslora": false
+ }
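This adapter_config.json is what PEFT reads back when the checkpoint is reloaded. A minimal sketch for inspecting it programmatically, assuming a local checkout of this repo (note the effective LoRA scaling here is lora_alpha / r = 64 / 32 = 2):

# Sketch: inspect this adapter's config with PEFT (path assumes a local clone of the repo).
from peft import PeftConfig

cfg = PeftConfig.from_pretrained("./gemma-jokes-gemma/checkpoint-100")
print(cfg.r, cfg.lora_alpha)    # 32, 64
print(cfg.lora_alpha / cfg.r)   # 2.0 -- the scaling applied to the LoRA update
print(cfg.target_modules)       # every attention/MLP projection plus lm_head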
gemma-jokes-gemma/checkpoint-100/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ba1299f3415bf95b0644b47cb71b5138366ce6f53cf9164b0e8a1cc5533b9496
+ size 2287110224
gemma-jokes-gemma/checkpoint-100/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ce5d8346185649c69fd43648077312ca54b94f012fecbd6392b9c5d34919ce2a
+ size 95447052
gemma-jokes-gemma/checkpoint-100/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e82555c41335904322e09b317647eeaf90302b11e9543ea1d4b6183e81292a78
+ size 14244
gemma-jokes-gemma/checkpoint-100/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:223c492754e7d0cc7c6aeaee1fe7206a93415033615b3379f11649bd0f09644e
+ size 1064
gemma-jokes-gemma/checkpoint-100/trainer_state.json ADDED
@@ -0,0 +1,81 @@
+ {
+   "best_metric": null,
+   "best_model_checkpoint": null,
+   "epoch": 0.0009592694204094162,
+   "eval_steps": 25,
+   "global_step": 100,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {
+       "epoch": 0.0,
+       "grad_norm": 6.853318691253662,
+       "learning_rate": 2.3797595190380762e-05,
+       "loss": 4.0257,
+       "step": 25
+     },
+     {
+       "epoch": 0.0,
+       "eval_loss": 3.5309877395629883,
+       "eval_runtime": 335.3635,
+       "eval_samples_per_second": 69.077,
+       "eval_steps_per_second": 8.635,
+       "step": 25
+     },
+     {
+       "epoch": 0.0,
+       "grad_norm": 5.111692905426025,
+       "learning_rate": 2.2545090180360722e-05,
+       "loss": 3.3251,
+       "step": 50
+     },
+     {
+       "epoch": 0.0,
+       "eval_loss": 3.3726470470428467,
+       "eval_runtime": 336.3014,
+       "eval_samples_per_second": 68.885,
+       "eval_steps_per_second": 8.611,
+       "step": 50
+     },
+     {
+       "epoch": 0.0,
+       "grad_norm": 5.772947788238525,
+       "learning_rate": 2.1292585170340683e-05,
+       "loss": 3.2615,
+       "step": 75
+     },
+     {
+       "epoch": 0.0,
+       "eval_loss": 3.2642982006073,
+       "eval_runtime": 362.3653,
+       "eval_samples_per_second": 63.93,
+       "eval_steps_per_second": 7.992,
+       "step": 75
+     },
+     {
+       "epoch": 0.0,
+       "grad_norm": 5.64016056060791,
+       "learning_rate": 2.0040080160320643e-05,
+       "loss": 3.4283,
+       "step": 100
+     },
+     {
+       "epoch": 0.0,
+       "eval_loss": 3.2154319286346436,
+       "eval_runtime": 342.7157,
+       "eval_samples_per_second": 67.595,
+       "eval_steps_per_second": 8.45,
+       "step": 100
+     }
+   ],
+   "logging_steps": 25,
+   "max_steps": 500,
+   "num_input_tokens_seen": 0,
+   "num_train_epochs": 1,
+   "save_steps": 25,
+   "total_flos": 121761914880000.0,
+   "train_batch_size": 2,
+   "trial_name": null,
+   "trial_params": null
+ }
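log_history above alternates a training record (loss, grad_norm, learning_rate) with an evaluation record every 25 steps, so the eval-loss curve (3.53 -> 3.37 -> 3.26 -> 3.22 over steps 25-100) can be read straight out of this file. A small sketch, assuming a local checkout; matplotlib is already installed by install.sh:

# Sketch: plot eval_loss from trainer_state.json (path assumes a local clone of the repo).
import json
import matplotlib.pyplot as plt

with open("gemma-jokes-gemma/checkpoint-100/trainer_state.json") as f:
    state = json.load(f)

# Keep only the evaluation records; training records have no "eval_loss" key.
evals = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]
steps, losses = zip(*evals)
plt.plot(steps, losses, marker="o")
plt.xlabel("step"); plt.ylabel("eval_loss")
plt.savefig("eval_loss.png")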
gemma-jokes-gemma/checkpoint-100/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f059017caffcb1fb28aff1f4c982294af4d4d3d017c7f0167808226f9c5b68d7
+ size 4856
gemma-jokes-gemma/checkpoint-125/README.md ADDED
Identical to gemma-jokes-gemma/checkpoint-100/README.md above (204 lines of the same auto-generated PEFT model-card template).
gemma-jokes-gemma/checkpoint-125/adapter_config.json ADDED
Identical to gemma-jokes-gemma/checkpoint-100/adapter_config.json above (33 lines).
gemma-jokes-gemma/checkpoint-125/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0c46879da55a72aa71a9af05d77fd4e78c252169a290ed4e1930627993d2f8fb
+ size 2287110224
gemma-jokes-gemma/checkpoint-125/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cd28c053a5bbb064b50f81bd640d4002016fcf9e68145f1bd0871b45e1e0a88a
+ size 95447052
gemma-jokes-gemma/checkpoint-125/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ae300081bb1f69bdfc9a63609c207f5532ae1014058470231e302ebe8926a3a9
+ size 14244
gemma-jokes-gemma/checkpoint-125/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:66c553965e43a5bbbc5dd40791eefe844e774f93abdd4ce992e718809b1731a0
+ size 1064
gemma-jokes-gemma/checkpoint-125/trainer_state.json ADDED
@@ -0,0 +1,96 @@
+ {
+   "best_metric": null,
+   "best_model_checkpoint": null,
+   "epoch": 0.0011990867755117703,
+   "eval_steps": 25,
+   "global_step": 125,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {
+       "epoch": 0.0,
+       "grad_norm": 6.853318691253662,
+       "learning_rate": 2.3797595190380762e-05,
+       "loss": 4.0257,
+       "step": 25
+     },
+     {
+       "epoch": 0.0,
+       "eval_loss": 3.5309877395629883,
+       "eval_runtime": 335.3635,
+       "eval_samples_per_second": 69.077,
+       "eval_steps_per_second": 8.635,
+       "step": 25
+     },
+     {
+       "epoch": 0.0,
+       "grad_norm": 5.111692905426025,
+       "learning_rate": 2.2545090180360722e-05,
+       "loss": 3.3251,
+       "step": 50
+     },
+     {
+       "epoch": 0.0,
+       "eval_loss": 3.3726470470428467,
+       "eval_runtime": 336.3014,
+       "eval_samples_per_second": 68.885,
+       "eval_steps_per_second": 8.611,
+       "step": 50
+     },
+     {
+       "epoch": 0.0,
+       "grad_norm": 5.772947788238525,
+       "learning_rate": 2.1292585170340683e-05,
+       "loss": 3.2615,
+       "step": 75
+     },
+     {
+       "epoch": 0.0,
+       "eval_loss": 3.2642982006073,
+       "eval_runtime": 362.3653,
+       "eval_samples_per_second": 63.93,
+       "eval_steps_per_second": 7.992,
+       "step": 75
+     },
+     {
+       "epoch": 0.0,
+       "grad_norm": 5.64016056060791,
+       "learning_rate": 2.0040080160320643e-05,
+       "loss": 3.4283,
+       "step": 100
+     },
+     {
+       "epoch": 0.0,
+       "eval_loss": 3.2154319286346436,
+       "eval_runtime": 342.7157,
+       "eval_samples_per_second": 67.595,
+       "eval_steps_per_second": 8.45,
+       "step": 100
+     },
+     {
+       "epoch": 0.0,
+       "grad_norm": 7.610867977142334,
+       "learning_rate": 1.87875751503006e-05,
+       "loss": 3.1832,
+       "step": 125
+     },
+     {
+       "epoch": 0.0,
+       "eval_loss": 3.1817972660064697,
+       "eval_runtime": 339.5184,
+       "eval_samples_per_second": 68.232,
+       "eval_steps_per_second": 8.53,
+       "step": 125
+     }
+   ],
+   "logging_steps": 25,
+   "max_steps": 500,
+   "num_input_tokens_seen": 0,
+   "num_train_epochs": 1,
+   "save_steps": 25,
+   "total_flos": 152202393600000.0,
+   "train_batch_size": 2,
+   "trial_name": null,
+   "trial_params": null
+ }
gemma-jokes-gemma/checkpoint-125/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f059017caffcb1fb28aff1f4c982294af4d4d3d017c7f0167808226f9c5b68d7
+ size 4856
gemma-jokes-gemma/checkpoint-150/README.md ADDED
Identical to gemma-jokes-gemma/checkpoint-100/README.md above (204 lines of the same auto-generated PEFT model-card template).
gemma-jokes-gemma/checkpoint-150/adapter_config.json ADDED
Identical to gemma-jokes-gemma/checkpoint-100/adapter_config.json above (33 lines).
gemma-jokes-gemma/checkpoint-150/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a9e5f2eb55b05c5660819606699071231e7711d3e6656f6ab5a44303231edddc
+ size 2287110224
gemma-jokes-gemma/checkpoint-150/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ca4ea0a3ecc9381d273184c1c8ffea8460bc7cd78d55b40933f341bf7da03dba
+ size 95447052
gemma-jokes-gemma/checkpoint-150/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:911f542961bfe159c4260beec26575c38298b01e202acf750fe9ac00888e42c3
+ size 14244
gemma-jokes-gemma/checkpoint-150/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:75963aea35a682aa234e18318cad27399bd95e2bebcf3b0d847043464e637baa
+ size 1064
gemma-jokes-gemma/checkpoint-150/trainer_state.json ADDED
@@ -0,0 +1,111 @@
+ {
+   "best_metric": null,
+   "best_model_checkpoint": null,
+   "epoch": 0.0014389041306141243,
+   "eval_steps": 25,
+   "global_step": 150,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {
+       "epoch": 0.0,
+       "grad_norm": 6.853318691253662,
+       "learning_rate": 2.3797595190380762e-05,
+       "loss": 4.0257,
+       "step": 25
+     },
+     {
+       "epoch": 0.0,
+       "eval_loss": 3.5309877395629883,
+       "eval_runtime": 335.3635,
+       "eval_samples_per_second": 69.077,
+       "eval_steps_per_second": 8.635,
+       "step": 25
+     },
+     {
+       "epoch": 0.0,
+       "grad_norm": 5.111692905426025,
+       "learning_rate": 2.2545090180360722e-05,
+       "loss": 3.3251,
+       "step": 50
+     },
+     {
+       "epoch": 0.0,
+       "eval_loss": 3.3726470470428467,
+       "eval_runtime": 336.3014,
+       "eval_samples_per_second": 68.885,
+       "eval_steps_per_second": 8.611,
+       "step": 50
+     },
+     {
+       "epoch": 0.0,
+       "grad_norm": 5.772947788238525,
+       "learning_rate": 2.1292585170340683e-05,
+       "loss": 3.2615,
+       "step": 75
+     },
+     {
+       "epoch": 0.0,
+       "eval_loss": 3.2642982006073,
+       "eval_runtime": 362.3653,
+       "eval_samples_per_second": 63.93,
+       "eval_steps_per_second": 7.992,
+       "step": 75
+     },
+     {
+       "epoch": 0.0,
+       "grad_norm": 5.64016056060791,
+       "learning_rate": 2.0040080160320643e-05,
+       "loss": 3.4283,
+       "step": 100
+     },
+     {
+       "epoch": 0.0,
+       "eval_loss": 3.2154319286346436,
+       "eval_runtime": 342.7157,
+       "eval_samples_per_second": 67.595,
+       "eval_steps_per_second": 8.45,
+       "step": 100
+     },
+     {
+       "epoch": 0.0,
+       "grad_norm": 7.610867977142334,
+       "learning_rate": 1.87875751503006e-05,
+       "loss": 3.1832,
+       "step": 125
+     },
+     {
+       "epoch": 0.0,
+       "eval_loss": 3.1817972660064697,
+       "eval_runtime": 339.5184,
+       "eval_samples_per_second": 68.232,
+       "eval_steps_per_second": 8.53,
+       "step": 125
+     },
+     {
+       "epoch": 0.0,
+       "grad_norm": 5.710243225097656,
+       "learning_rate": 1.7535070140280564e-05,
+       "loss": 3.1834,
+       "step": 150
+     },
+     {
+       "epoch": 0.0,
+       "eval_loss": 3.162160873413086,
+       "eval_runtime": 340.5303,
+       "eval_samples_per_second": 68.029,
+       "eval_steps_per_second": 8.504,
+       "step": 150
+     }
+   ],
+   "logging_steps": 25,
+   "max_steps": 500,
+   "num_input_tokens_seen": 0,
+   "num_train_epochs": 1,
+   "save_steps": 25,
+   "total_flos": 182642872320000.0,
+   "train_batch_size": 2,
+   "trial_name": null,
+   "trial_params": null
+ }
gemma-jokes-gemma/checkpoint-150/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f059017caffcb1fb28aff1f4c982294af4d4d3d017c7f0167808226f9c5b68d7
+ size 4856
install.sh CHANGED
@@ -2,4 +2,5 @@ pip install -U bitsandbytes
  pip install -U git+https://github.com/huggingface/transformers.git
  pip install -U git+https://github.com/huggingface/peft.git
  pip install -U git+https://github.com/huggingface/accelerate.git
- pip install -U datasets scipy ipywidgets matplotlib
+ pip install -U datasets scipy ipywidgets matplotlib
+ pip install wandb
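A quick sanity check after running install.sh can save a failed training run later; a sketch that only reads each package's version attribute and runs the standard CUDA check (Gemma support requires the transformers dev build installed above):

# Sketch: verify the environment set up by install.sh.
import torch, transformers, peft, accelerate, bitsandbytes, datasets, wandb

print("torch", torch.__version__, "| CUDA available:", torch.cuda.is_available())
print("transformers", transformers.__version__)  # needs a 4.39 dev build for Gemma
print("peft", peft.__version__, "| accelerate", accelerate.__version__)
print("bitsandbytes", bitsandbytes.__version__, "| datasets", datasets.__version__)
print("wandb", wandb.__version__)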
test.py ADDED
@@ -0,0 +1,46 @@
+ import torch
+ from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+
+ base_model_id = "google/gemma-2b"
+ bnb_config = BitsAndBytesConfig(
+     load_in_4bit=True,
+     bnb_4bit_use_double_quant=True,
+     bnb_4bit_quant_type="nf4",
+     bnb_4bit_compute_dtype=torch.bfloat16
+ )
+
+ base_model = AutoModelForCausalLM.from_pretrained(
+     base_model_id,                   # Gemma base model
+     quantization_config=bnb_config,  # same quantization config as in train.py
+     device_map="auto",
+     trust_remote_code=True,
+ )
+
+ eval_tokenizer = AutoTokenizer.from_pretrained(base_model_id, add_bos_token=True, trust_remote_code=True)
+
+
+ from peft import PeftModel
+
+ # Attach the LoRA adapter saved at step 150 to the quantized base model.
+ ft_model = PeftModel.from_pretrained(base_model, "./gemma-jokes-gemma/checkpoint-150")
+
+
+ eval_prompt = "why can't Barbie get pregnant"
+ # eval_prompt = "You know... When someone says to you Jesus loves you It's always comforting. Unless you are in a Mexican jail."
+ model_input = eval_tokenizer(eval_prompt, return_tensors="pt").to("cuda:0")
+
+ ft_model.eval()
+ with torch.no_grad():
+     print(eval_tokenizer.decode(ft_model.generate(**model_input, max_new_tokens=100, repetition_penalty=1.15)[0], skip_special_tokens=True))
+
+ # Result
+ # why can't Barbie get pregnant? Because she has no eggs.
+
+ # Why did the chicken cross the road? To get to the other side of the egg.
+
+ # Why do chickens lay eggs in their sleep? Because they don't want to wake up and find out they're dead.
+
+ # Why do chickens wear glasses? Because they have a hard time seeing the yolk.
+
+ # Why do chickens eat so much? Because they are always hungry.
+
+ # Why do chickens like to go to the beach? Because they love laying eggs
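test.py keeps the adapter separate from the 4-bit base model. For serving without PEFT installed, the LoRA weights can be folded back into the base; a sketch using PEFT's merge_and_unload, loading the base in bfloat16 here (an assumption made to keep the merge simple, rather than merging into quantized weights):

# Sketch: merge the checkpoint-150 adapter into plain Gemma weights for standalone use.
import torch
from transformers import AutoModelForCausalLM
from peft import PeftModel

base = AutoModelForCausalLM.from_pretrained("google/gemma-2b", torch_dtype=torch.bfloat16)
merged = PeftModel.from_pretrained(base, "./gemma-jokes-gemma/checkpoint-150").merge_and_unload()
merged.save_pretrained("./gemma-jokes-merged")  # hypothetical output dir; loads without PEFT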
train.py ADDED
@@ -0,0 +1,187 @@
+ import torch
+ from datasets import load_dataset
+ from transformers import AutoTokenizer, AutoModelForCausalLM, AdamW, get_linear_schedule_with_warmup, BitsAndBytesConfig
+ import transformers
+ import warnings
+ warnings.filterwarnings("ignore")
+ base_model_id = "google/gemma-2b"
+ torch.cuda.set_device(0)
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
+ print("Using device:", device)
+ # Load the jokes dataset
+ dataset = load_dataset("ysharma/short_jokes")
+ # Accessing the train split
+ train_data = dataset['train']
+ # Shuffle the dataset and select 20% of the data
+ # (this subset is computed but never used below; training uses the 90/10 split further down)
+ twenty_percent_size = int(0.2 * len(train_data))
+ subset = train_data.shuffle(seed=42)[:twenty_percent_size]
+
+ print("Available devices:", torch.cuda.device_count())
+ print("Current device:", torch.cuda.current_device())
+
+ # accelerate
+ from accelerate import FullyShardedDataParallelPlugin, Accelerator
+ from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig
+
+ fsdp_plugin = FullyShardedDataParallelPlugin(
+     state_dict_config=FullStateDictConfig(offload_to_cpu=True, rank0_only=False),
+     optim_state_dict_config=FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False),
+ )
+
+ tokenizer = AutoTokenizer.from_pretrained(
+     base_model_id,
+     padding_side="left",
+     add_eos_token=True,
+     add_bos_token=True,
+ )
+ tokenizer.pad_token = tokenizer.eos_token
+
+ # Leftover from a notes-dataset example; unused here -- tokenize_function below is what is applied.
+ def formatting_func(example):
+     text = f"### The following is a note by Eevee the Dog: {example['note']}"
+     return text
+ def generate_and_tokenize_prompt(prompt):
+     return tokenizer(formatting_func(prompt))
+
+ bnb_config = BitsAndBytesConfig(
+     load_in_4bit=True,
+     bnb_4bit_use_double_quant=True,
+     bnb_4bit_quant_type="nf4",
+     bnb_4bit_compute_dtype=torch.bfloat16
+ )
+
+ model = AutoModelForCausalLM.from_pretrained(base_model_id, quantization_config=bnb_config, device_map="auto")
+
+ def tokenize_function(examples):
+     return tokenizer(examples["Joke"], padding="max_length", truncation=True, max_length=50)
+
+ dataset = load_dataset("ysharma/short_jokes")
+ # print("train_data ",train_data, "subset ",subset)
+
+ train_test_split = dataset['train'].train_test_split(test_size=0.1)
+ train_data = train_test_split['train']
+ test_data = train_test_split['test']
+
+ # Now, tokenize the newly split datasets
+ tokenized_train_data = train_data.map(tokenize_function, batched=True)
+ tokenized_test_data = test_data.map(tokenize_function, batched=True)
+
+ eval_prompt = " why man are "
+
+ eval_tokenizer = AutoTokenizer.from_pretrained(
+     base_model_id,
+     add_bos_token=True,
+ )
+
+ model_input = eval_tokenizer(eval_prompt, return_tensors="pt").to("cuda")
+
+ # Baseline generation from the not-yet-finetuned model
+ model.eval()
+ with torch.no_grad():
+     print(eval_tokenizer.decode(model.generate(**model_input, max_new_tokens=50, repetition_penalty=1.15)[0], skip_special_tokens=True))
+
+ from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
+
+ model.gradient_checkpointing_enable()
+ model = prepare_model_for_kbit_training(model)
+
+ def print_trainable_parameters(model):
+     """
+     Prints the number of trainable parameters in the model.
+     """
+     trainable_params = 0
+     all_param = 0
+     for _, param in model.named_parameters():
+         all_param += param.numel()
+         if param.requires_grad:
+             trainable_params += param.numel()
+     print(
+         f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
+     )
+
+ config = LoraConfig(
+     r=32,
+     lora_alpha=64,
+     target_modules=[
+         "q_proj",
+         "k_proj",
+         "v_proj",
+         "o_proj",
+         "gate_proj",
+         "up_proj",
+         "down_proj",
+         "lm_head",
+     ],
+     bias="none",
+     lora_dropout=0.05,  # Conventional
+     task_type="CAUSAL_LM",
+ )
+
+ model = get_peft_model(model, config)
+ print_trainable_parameters(model)
+
+ # if torch.cuda.device_count() > 1:  # If more than 1 GPU
+ #     model.is_parallelizable = True
+ #     model.model_parallel = True
+
+ model.to(device)
+ accelerator = Accelerator(fsdp_plugin=fsdp_plugin)
+ print("Accelerator device:", accelerator.device)
+ model = accelerator.prepare_model(model)
+
+ from datetime import datetime
+
+ project = "jokes-gemma"
+ base_model_name = "gemma"
+ run_name = base_model_name + "-" + project
+ output_dir = "./" + run_name
+
+ trainer = transformers.Trainer(
+     model=model,
+     train_dataset=tokenized_train_data,
+     eval_dataset=tokenized_test_data,
+     args=transformers.TrainingArguments(
+         output_dir=output_dir,
+         warmup_steps=1,
+         per_device_train_batch_size=2,
+         gradient_accumulation_steps=1,
+         gradient_checkpointing=True,
+         max_steps=500,
+         learning_rate=2.5e-5,  # Want a small lr for finetuning
+         bf16=True,
+         optim="paged_adamw_8bit",
+         logging_steps=25,  # When to start reporting loss
+         logging_dir="./logs",  # Directory for storing logs
+         save_strategy="steps",  # Save the model checkpoint every logging step
+         save_steps=25,  # Save a checkpoint every 25 steps
+         evaluation_strategy="steps",  # Evaluate the model every logging step
+         eval_steps=25,  # Evaluate every 25 steps
+         do_eval=True,  # Perform evaluation at the end of training
+         report_to="wandb",  # Comment this out if you don't want to use Weights & Biases
+         run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}"  # Name of the W&B run (optional)
+     ),
+     data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
+ )
+
+ model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
+ trainer.train()
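Because save_steps=25 writes optimizer.pt, scheduler.pt, and rng_state.pth next to each adapter (the files listed in the checkpoint directories above), an interrupted run can be resumed rather than restarted; a sketch, assuming trainer is constructed exactly as in train.py:

# Sketch: resume from a saved checkpoint instead of step 0.
trainer.train(resume_from_checkpoint="./gemma-jokes-gemma/checkpoint-150")
# or let Trainer pick the newest checkpoint in output_dir:
# trainer.train(resume_from_checkpoint=True)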
wandb/debug-internal.log ADDED
The diff for this file is too large to render. See raw diff
 
wandb/debug.log ADDED
@@ -0,0 +1,27 @@
+ 2024-02-23 03:24:22,980 INFO MainThread:116027 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
+ 2024-02-23 03:24:22,980 INFO MainThread:116027 [wandb_setup.py:_flush():76] Configure stats pid to 116027
+ 2024-02-23 03:24:22,980 INFO MainThread:116027 [wandb_setup.py:_flush():76] Loading settings from /home/rnd/.config/wandb/settings
+ 2024-02-23 03:24:22,980 INFO MainThread:116027 [wandb_setup.py:_flush():76] Loading settings from /home/rnd/Documents/Ameer/gemma/wandb/settings
+ 2024-02-23 03:24:22,980 INFO MainThread:116027 [wandb_setup.py:_flush():76] Loading settings from environment variables: {}
+ 2024-02-23 03:24:22,980 INFO MainThread:116027 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
+ 2024-02-23 03:24:22,980 INFO MainThread:116027 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'new.py', 'program_abspath': '/home/rnd/Documents/Ameer/gemma/new.py', 'program': '/home/rnd/Documents/Ameer/gemma/new.py'}
+ 2024-02-23 03:24:22,980 INFO MainThread:116027 [wandb_init.py:_log_setup():526] Logging user logs to /home/rnd/Documents/Ameer/gemma/wandb/run-20240223_032422-b657btrg/logs/debug.log
+ 2024-02-23 03:24:22,980 INFO MainThread:116027 [wandb_init.py:_log_setup():527] Logging internal logs to /home/rnd/Documents/Ameer/gemma/wandb/run-20240223_032422-b657btrg/logs/debug-internal.log
+ 2024-02-23 03:24:22,981 INFO MainThread:116027 [wandb_init.py:init():566] calling init triggers
+ 2024-02-23 03:24:22,981 INFO MainThread:116027 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
+ config: {}
+ 2024-02-23 03:24:22,981 INFO MainThread:116027 [wandb_init.py:init():616] starting backend
+ 2024-02-23 03:24:22,981 INFO MainThread:116027 [wandb_init.py:init():620] setting up manager
+ 2024-02-23 03:24:22,982 INFO MainThread:116027 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+ 2024-02-23 03:24:22,984 INFO MainThread:116027 [wandb_init.py:init():628] backend started and connected
+ 2024-02-23 03:24:22,986 INFO MainThread:116027 [wandb_init.py:init():720] updated telemetry
+ 2024-02-23 03:24:22,986 INFO MainThread:116027 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
+ 2024-02-23 03:24:23,690 INFO MainThread:116027 [wandb_run.py:_on_init():2262] communicating current version
+ 2024-02-23 03:24:23,800 INFO MainThread:116027 [wandb_run.py:_on_init():2271] got version response
+ 2024-02-23 03:24:23,800 INFO MainThread:116027 [wandb_init.py:init():804] starting run threads in backend
+ 2024-02-23 03:24:24,871 INFO MainThread:116027 [wandb_run.py:_console_start():2241] atexit reg
+ 2024-02-23 03:24:24,871 INFO MainThread:116027 [wandb_run.py:_redirect():2096] redirect: wrap_raw
+ 2024-02-23 03:24:24,871 INFO MainThread:116027 [wandb_run.py:_redirect():2161] Wrapping output streams.
+ 2024-02-23 03:24:24,871 INFO MainThread:116027 [wandb_run.py:_redirect():2186] Redirects installed.
+ 2024-02-23 03:24:24,871 INFO MainThread:116027 [wandb_init.py:init():847] run started, returning control to user process
+ 2024-02-23 03:24:24,872 INFO MainThread:116027 [wandb_run.py:_config_callback():1343] config_cb None None {'vocab_size': 256000, 'max_position_embeddings': 8192, 'hidden_size': 2048, 'intermediate_size': 16384, 'num_hidden_layers': 18, 'num_attention_heads': 8, 'head_dim': 256, 'num_key_value_heads': 1, 'hidden_act': 'gelu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'rope_theta': 10000.0, 'attention_bias': False, 'attention_dropout': 0.0, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': True, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['GemmaForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 2, 'pad_token_id': 0, 'eos_token_id': 1, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'google/gemma-2b', 'transformers_version': '4.39.0.dev0', 'model_type': 'gemma', 'rope_scaling': None, 'quantization_config': {'quant_method': 'QuantizationMethod.BITS_AND_BYTES', '_load_in_8bit': False, '_load_in_4bit': True, 'llm_int8_threshold': 6.0, 'llm_int8_skip_modules': None, 'llm_int8_enable_fp32_cpu_offload': False, 'llm_int8_has_fp16_weight': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': True, 'bnb_4bit_compute_dtype': 'bfloat16', 'load_in_4bit': True, 'load_in_8bit': False}, 'output_dir': './gemma-jokes-gemma', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 'do_predict': False, 'evaluation_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 2, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'learning_rate': 2.5e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3.0, 'max_steps': 500, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 1, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': './logs', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 25, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 25, 'save_total_limit': None, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': 
False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 25, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': 'gemma-jokes-gemma-2024-02-23-03-24', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'paged_adamw_8bit', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None}
wandb/run-20240223_032422-b657btrg/files/conda-environment.yaml ADDED
@@ -0,0 +1,123 @@
+ name: gemma
+ channels:
+   - defaults
+ dependencies:
+   - _libgcc_mutex=0.1=main
+   - _openmp_mutex=5.1=1_gnu
+   - ca-certificates=2023.12.12=h06a4308_0
+   - ld_impl_linux-64=2.38=h1181459_1
+   - libffi=3.4.4=h6a678d5_0
+   - libgcc-ng=11.2.0=h1234567_1
+   - libgomp=11.2.0=h1234567_1
+   - libstdcxx-ng=11.2.0=h1234567_1
+   - ncurses=6.4=h6a678d5_0
+   - openssl=3.0.13=h7f8727e_0
+   - pip=23.3.1=py39h06a4308_0
+   - python=3.9.18=h955ad1f_0
+   - readline=8.2=h5eee18b_0
+   - setuptools=68.2.2=py39h06a4308_0
+   - sqlite=3.41.2=h5eee18b_0
+   - tk=8.6.12=h1ccaba5_0
+   - wheel=0.41.2=py39h06a4308_0
+   - xz=5.4.5=h5eee18b_0
+   - zlib=1.2.13=h5eee18b_0
+   - pip:
+     - accelerate==0.28.0.dev0
+     - aiohttp==3.9.3
+     - aiosignal==1.3.1
+     - appdirs==1.4.4
+     - asttokens==2.4.1
+     - async-timeout==4.0.3
+     - attrs==23.2.0
+     - bitsandbytes==0.42.0
+     - certifi==2024.2.2
+     - charset-normalizer==3.3.2
+     - click==8.1.7
+     - comm==0.2.1
+     - contourpy==1.2.0
+     - cycler==0.12.1
+     - datasets==2.17.1
+     - decorator==5.1.1
+     - dill==0.3.8
+     - docker-pycreds==0.4.0
+     - exceptiongroup==1.2.0
+     - executing==2.0.1
+     - filelock==3.13.1
+     - fonttools==4.49.0
+     - frozenlist==1.4.1
+     - fsspec==2023.10.0
+     - gitdb==4.0.11
+     - gitpython==3.1.42
+     - huggingface-hub==0.20.3
+     - idna==3.6
+     - importlib-resources==6.1.1
+     - ipython==8.18.1
+     - ipywidgets==8.1.2
+     - jedi==0.19.1
+     - jinja2==3.1.3
+     - jupyterlab-widgets==3.0.10
+     - kiwisolver==1.4.5
+     - markupsafe==2.1.5
+     - matplotlib==3.8.3
+     - matplotlib-inline==0.1.6
+     - mpmath==1.3.0
+     - multidict==6.0.5
+     - multiprocess==0.70.16
+     - networkx==3.2.1
+     - numpy==1.26.4
+     - nvidia-cublas-cu12==12.1.3.1
+     - nvidia-cuda-cupti-cu12==12.1.105
+     - nvidia-cuda-nvrtc-cu12==12.1.105
+     - nvidia-cuda-runtime-cu12==12.1.105
+     - nvidia-cudnn-cu12==8.9.2.26
+     - nvidia-cufft-cu12==11.0.2.54
+     - nvidia-curand-cu12==10.3.2.106
+     - nvidia-cusolver-cu12==11.4.5.107
+     - nvidia-cusparse-cu12==12.1.0.106
+     - nvidia-nccl-cu12==2.19.3
+     - nvidia-nvjitlink-cu12==12.3.101
+     - nvidia-nvtx-cu12==12.1.105
+     - packaging==23.2
+     - pandas==2.2.0
+     - parso==0.8.3
+     - peft==0.8.2
+     - pexpect==4.9.0
+     - pillow==10.2.0
+     - prompt-toolkit==3.0.43
+     - protobuf==4.25.3
+     - psutil==5.9.8
+     - ptyprocess==0.7.0
+     - pure-eval==0.2.2
+     - pyarrow==15.0.0
+     - pyarrow-hotfix==0.6
+     - pygments==2.17.2
+     - pyparsing==3.1.1
+     - python-dateutil==2.8.2
+     - pytz==2024.1
+     - pyyaml==6.0.1
+     - regex==2023.12.25
+     - requests==2.31.0
+     - safetensors==0.4.2
+     - scipy==1.12.0
+     - sentry-sdk==1.40.5
+     - setproctitle==1.3.3
+     - six==1.16.0
+     - smmap==5.0.1
+     - stack-data==0.6.3
+     - sympy==1.12
+     - tokenizers==0.15.2
+     - torch==2.2.1
+     - tqdm==4.66.2
+     - traitlets==5.14.1
+     - transformers==4.39.0.dev0
+     - triton==2.2.0
+     - typing-extensions==4.9.0
+     - tzdata==2024.1
+     - urllib3==2.2.1
+     - wandb==0.16.3
+     - wcwidth==0.2.13
+     - widgetsnbextension==4.0.10
+     - xxhash==3.4.1
+     - yarl==1.9.4
+     - zipp==3.17.0
+ prefix: /home/rnd/miniconda3/envs/gemma
wandb/run-20240223_032422-b657btrg/files/config.yaml ADDED
@@ -0,0 +1,682 @@
+ wandb_version: 1
+
+ _wandb:
+ desc: null
+ value:
+ python_version: 3.9.18
+ cli_version: 0.16.3
+ framework: huggingface
+ huggingface_version: 4.39.0.dev0
+ is_jupyter_run: false
+ is_kaggle_kernel: false
+ start_time: 1708638862.984978
+ t:
+ 1:
+ - 1
+ - 11
+ - 49
+ - 51
+ - 55
+ - 71
+ - 98
+ 2:
+ - 1
+ - 11
+ - 49
+ - 51
+ - 55
+ - 71
+ - 98
+ 3:
+ - 7
+ - 13
+ - 23
+ 4: 3.9.18
+ 5: 0.16.3
+ 6: 4.39.0.dev0
+ 8:
+ - 5
+ 9:
+ 1: transformers_trainer
+ 13: linux-x86_64
+ m:
+ - 1: train/global_step
+ 6:
+ - 3
+ - 1: train/loss
+ 5: 1
+ 6:
+ - 1
+ - 1: train/grad_norm
+ 5: 1
+ 6:
+ - 1
+ - 1: train/learning_rate
+ 5: 1
+ 6:
+ - 1
+ - 1: train/epoch
+ 5: 1
+ 6:
+ - 1
+ - 1: eval/loss
+ 5: 1
+ 6:
+ - 1
+ - 1: eval/runtime
+ 5: 1
+ 6:
+ - 1
+ - 1: eval/samples_per_second
+ 5: 1
+ 6:
+ - 1
+ - 1: eval/steps_per_second
+ 5: 1
+ 6:
+ - 1
+ vocab_size:
+ desc: null
+ value: 256000
+ max_position_embeddings:
+ desc: null
+ value: 8192
+ hidden_size:
+ desc: null
+ value: 2048
+ intermediate_size:
+ desc: null
+ value: 16384
+ num_hidden_layers:
+ desc: null
+ value: 18
+ num_attention_heads:
+ desc: null
+ value: 8
+ head_dim:
+ desc: null
+ value: 256
+ num_key_value_heads:
+ desc: null
+ value: 1
+ hidden_act:
+ desc: null
+ value: gelu
+ initializer_range:
+ desc: null
+ value: 0.02
+ rms_norm_eps:
+ desc: null
+ value: 1.0e-06
+ use_cache:
+ desc: null
+ value: false
+ rope_theta:
+ desc: null
+ value: 10000.0
+ attention_bias:
+ desc: null
+ value: false
+ attention_dropout:
+ desc: null
+ value: 0.0
+ return_dict:
+ desc: null
+ value: true
+ output_hidden_states:
+ desc: null
+ value: false
+ output_attentions:
+ desc: null
+ value: false
+ torchscript:
+ desc: null
+ value: false
+ torch_dtype:
+ desc: null
+ value: bfloat16
+ use_bfloat16:
+ desc: null
+ value: false
+ tf_legacy_loss:
+ desc: null
+ value: false
+ pruned_heads:
+ desc: null
+ value: {}
+ tie_word_embeddings:
+ desc: null
+ value: true
+ chunk_size_feed_forward:
+ desc: null
+ value: 0
+ is_encoder_decoder:
+ desc: null
+ value: false
+ is_decoder:
+ desc: null
+ value: false
+ cross_attention_hidden_size:
+ desc: null
+ value: null
+ add_cross_attention:
+ desc: null
+ value: false
+ tie_encoder_decoder:
+ desc: null
+ value: false
+ max_length:
+ desc: null
+ value: 20
+ min_length:
+ desc: null
+ value: 0
+ do_sample:
+ desc: null
+ value: false
+ early_stopping:
+ desc: null
+ value: false
+ num_beams:
+ desc: null
+ value: 1
+ num_beam_groups:
+ desc: null
+ value: 1
+ diversity_penalty:
+ desc: null
+ value: 0.0
+ temperature:
+ desc: null
+ value: 1.0
+ top_k:
+ desc: null
+ value: 50
+ top_p:
+ desc: null
+ value: 1.0
+ typical_p:
+ desc: null
+ value: 1.0
+ repetition_penalty:
+ desc: null
+ value: 1.0
+ length_penalty:
+ desc: null
+ value: 1.0
+ no_repeat_ngram_size:
+ desc: null
+ value: 0
+ encoder_no_repeat_ngram_size:
+ desc: null
+ value: 0
+ bad_words_ids:
+ desc: null
+ value: null
+ num_return_sequences:
+ desc: null
+ value: 1
+ output_scores:
+ desc: null
+ value: false
+ return_dict_in_generate:
+ desc: null
+ value: false
+ forced_bos_token_id:
+ desc: null
+ value: null
+ forced_eos_token_id:
+ desc: null
+ value: null
+ remove_invalid_values:
+ desc: null
+ value: false
+ exponential_decay_length_penalty:
+ desc: null
+ value: null
+ suppress_tokens:
+ desc: null
+ value: null
+ begin_suppress_tokens:
+ desc: null
+ value: null
+ architectures:
+ desc: null
+ value:
+ - GemmaForCausalLM
+ finetuning_task:
+ desc: null
+ value: null
+ id2label:
+ desc: null
+ value:
+ '0': LABEL_0
+ '1': LABEL_1
+ label2id:
+ desc: null
+ value:
+ LABEL_0: 0
+ LABEL_1: 1
+ tokenizer_class:
+ desc: null
+ value: null
+ prefix:
+ desc: null
+ value: null
+ bos_token_id:
+ desc: null
+ value: 2
+ pad_token_id:
+ desc: null
+ value: 0
+ eos_token_id:
+ desc: null
+ value: 1
+ sep_token_id:
+ desc: null
+ value: null
+ decoder_start_token_id:
+ desc: null
+ value: null
+ task_specific_params:
+ desc: null
+ value: null
+ problem_type:
+ desc: null
+ value: null
+ _name_or_path:
+ desc: null
+ value: google/gemma-2b
+ transformers_version:
+ desc: null
+ value: 4.39.0.dev0
+ model_type:
+ desc: null
+ value: gemma
+ rope_scaling:
+ desc: null
+ value: null
+ quantization_config:
+ desc: null
+ value:
+ quant_method: QuantizationMethod.BITS_AND_BYTES
+ _load_in_8bit: false
+ _load_in_4bit: true
+ llm_int8_threshold: 6.0
+ llm_int8_skip_modules: null
+ llm_int8_enable_fp32_cpu_offload: false
+ llm_int8_has_fp16_weight: false
+ bnb_4bit_quant_type: nf4
+ bnb_4bit_use_double_quant: true
+ bnb_4bit_compute_dtype: bfloat16
+ load_in_4bit: true
+ load_in_8bit: false
+ output_dir:
+ desc: null
+ value: ./gemma-jokes-gemma
+ overwrite_output_dir:
+ desc: null
+ value: false
+ do_train:
+ desc: null
+ value: false
+ do_eval:
+ desc: null
+ value: true
+ do_predict:
+ desc: null
+ value: false
+ evaluation_strategy:
+ desc: null
+ value: steps
+ prediction_loss_only:
+ desc: null
+ value: false
+ per_device_train_batch_size:
+ desc: null
+ value: 2
+ per_device_eval_batch_size:
+ desc: null
+ value: 8
+ per_gpu_train_batch_size:
+ desc: null
+ value: null
+ per_gpu_eval_batch_size:
+ desc: null
+ value: null
+ gradient_accumulation_steps:
+ desc: null
+ value: 1
+ eval_accumulation_steps:
+ desc: null
+ value: null
+ eval_delay:
+ desc: null
+ value: 0
+ learning_rate:
+ desc: null
+ value: 2.5e-05
+ weight_decay:
+ desc: null
+ value: 0.0
+ adam_beta1:
+ desc: null
+ value: 0.9
+ adam_beta2:
+ desc: null
+ value: 0.999
+ adam_epsilon:
+ desc: null
+ value: 1.0e-08
+ max_grad_norm:
+ desc: null
+ value: 1.0
+ num_train_epochs:
+ desc: null
+ value: 3.0
+ max_steps:
+ desc: null
+ value: 500
+ lr_scheduler_type:
+ desc: null
+ value: linear
+ lr_scheduler_kwargs:
+ desc: null
+ value: {}
+ warmup_ratio:
+ desc: null
+ value: 0.0
+ warmup_steps:
+ desc: null
+ value: 1
+ log_level:
+ desc: null
+ value: passive
+ log_level_replica:
+ desc: null
+ value: warning
+ log_on_each_node:
+ desc: null
+ value: true
+ logging_dir:
+ desc: null
+ value: ./logs
+ logging_strategy:
+ desc: null
+ value: steps
+ logging_first_step:
+ desc: null
+ value: false
+ logging_steps:
+ desc: null
+ value: 25
+ logging_nan_inf_filter:
+ desc: null
+ value: true
+ save_strategy:
+ desc: null
+ value: steps
+ save_steps:
+ desc: null
+ value: 25
+ save_total_limit:
+ desc: null
+ value: null
+ save_safetensors:
+ desc: null
+ value: true
+ save_on_each_node:
+ desc: null
+ value: false
+ save_only_model:
+ desc: null
+ value: false
+ no_cuda:
+ desc: null
+ value: false
+ use_cpu:
+ desc: null
+ value: false
+ use_mps_device:
+ desc: null
+ value: false
+ seed:
+ desc: null
+ value: 42
+ data_seed:
+ desc: null
+ value: null
+ jit_mode_eval:
+ desc: null
+ value: false
+ use_ipex:
+ desc: null
+ value: false
+ bf16:
+ desc: null
+ value: true
+ fp16:
+ desc: null
+ value: false
+ fp16_opt_level:
+ desc: null
+ value: O1
+ half_precision_backend:
+ desc: null
+ value: auto
+ bf16_full_eval:
+ desc: null
+ value: false
+ fp16_full_eval:
+ desc: null
+ value: false
+ tf32:
+ desc: null
+ value: null
+ local_rank:
+ desc: null
+ value: 0
+ ddp_backend:
+ desc: null
+ value: null
+ tpu_num_cores:
+ desc: null
+ value: null
+ tpu_metrics_debug:
+ desc: null
+ value: false
+ debug:
+ desc: null
+ value: []
+ dataloader_drop_last:
+ desc: null
+ value: false
+ eval_steps:
+ desc: null
+ value: 25
+ dataloader_num_workers:
+ desc: null
+ value: 0
+ dataloader_prefetch_factor:
+ desc: null
+ value: null
+ past_index:
+ desc: null
+ value: -1
+ run_name:
+ desc: null
+ value: gemma-jokes-gemma-2024-02-23-03-24
+ disable_tqdm:
+ desc: null
+ value: false
+ remove_unused_columns:
+ desc: null
+ value: true
+ label_names:
+ desc: null
+ value: null
+ load_best_model_at_end:
+ desc: null
+ value: false
+ metric_for_best_model:
+ desc: null
+ value: null
+ greater_is_better:
+ desc: null
+ value: null
+ ignore_data_skip:
+ desc: null
+ value: false
+ fsdp:
+ desc: null
+ value: []
+ fsdp_min_num_params:
+ desc: null
+ value: 0
+ fsdp_config:
+ desc: null
+ value:
+ min_num_params: 0
+ xla: false
+ xla_fsdp_v2: false
+ xla_fsdp_grad_ckpt: false
+ fsdp_transformer_layer_cls_to_wrap:
+ desc: null
+ value: null
+ accelerator_config:
+ desc: null
+ value:
+ split_batches: false
+ dispatch_batches: null
+ even_batches: true
+ use_seedable_sampler: true
+ deepspeed:
+ desc: null
+ value: null
+ label_smoothing_factor:
+ desc: null
+ value: 0.0
+ optim:
+ desc: null
+ value: paged_adamw_8bit
+ optim_args:
+ desc: null
+ value: null
+ adafactor:
+ desc: null
+ value: false
+ group_by_length:
+ desc: null
+ value: false
+ length_column_name:
+ desc: null
+ value: length
+ report_to:
+ desc: null
+ value:
+ - wandb
+ ddp_find_unused_parameters:
+ desc: null
+ value: null
+ ddp_bucket_cap_mb:
+ desc: null
+ value: null
+ ddp_broadcast_buffers:
+ desc: null
+ value: null
+ dataloader_pin_memory:
+ desc: null
+ value: true
+ dataloader_persistent_workers:
+ desc: null
+ value: false
+ skip_memory_metrics:
+ desc: null
+ value: true
+ use_legacy_prediction_loop:
+ desc: null
+ value: false
+ push_to_hub:
+ desc: null
+ value: false
+ resume_from_checkpoint:
+ desc: null
+ value: null
+ hub_model_id:
+ desc: null
+ value: null
+ hub_strategy:
+ desc: null
+ value: every_save
+ hub_token:
+ desc: null
+ value: <HUB_TOKEN>
+ hub_private_repo:
+ desc: null
+ value: false
+ hub_always_push:
+ desc: null
+ value: false
+ gradient_checkpointing:
+ desc: null
+ value: true
+ gradient_checkpointing_kwargs:
+ desc: null
+ value: null
+ include_inputs_for_metrics:
+ desc: null
+ value: false
+ fp16_backend:
+ desc: null
+ value: auto
+ push_to_hub_model_id:
+ desc: null
+ value: null
+ push_to_hub_organization:
+ desc: null
+ value: null
+ push_to_hub_token:
+ desc: null
+ value: <PUSH_TO_HUB_TOKEN>
+ mp_parameters:
+ desc: null
+ value: ''
+ auto_find_batch_size:
+ desc: null
+ value: false
+ full_determinism:
+ desc: null
+ value: false
+ torchdynamo:
+ desc: null
+ value: null
+ ray_scope:
+ desc: null
+ value: last
+ ddp_timeout:
+ desc: null
+ value: 1800
+ torch_compile:
+ desc: null
+ value: false
+ torch_compile_backend:
+ desc: null
+ value: null
+ torch_compile_mode:
+ desc: null
+ value: null
+ dispatch_batches:
+ desc: null
+ value: null
+ split_batches:
+ desc: null
+ value: null
+ include_tokens_per_second:
+ desc: null
+ value: false
+ include_num_input_tokens_seen:
+ desc: null
+ value: false
+ neftune_noise_alpha:
+ desc: null
+ value: null
wandb/run-20240223_032422-b657btrg/files/output.log ADDED
@@ -0,0 +1,1198 @@
+ 5%|███▎ | 25/500 [00:03<01:11, 6.69it/s]
+ {'loss': 4.0257, 'grad_norm': 6.853318691253662, 'learning_rate': 2.3797595190380762e-05, 'epoch': 0.0}
+ 100%|███████████████████████████████████████████████████████████████▊| 2888/2896 [05:34<00:00, 8.65it/s]
+ 9%|██████▏ | 46/500 [05:44<01:44, 4.35it/s]
+ 10%|██████▋ | 50/500 [05:44<01:16, 5.90it/s]
+ 100%|███████████████████████████████████████████████████████████████████████████████████▉| 2893/2896 [05:35<00:00, 8.65it/s]
+ 15%|█████████████ | 75/500 [11:26<01:12, 5.88it/s]
+ {'loss': 3.2615, 'grad_norm': 5.772947788238525, 'learning_rate': 2.1292585170340683e-05, 'epoch': 0.0}
+ 99%|███████████████████████████████████████████████████████████████████████████████████▌| 2880/2896 [06:00<00:01, 8.69it/s]
+ 20%|█████████████████▏ | 100/500 [17:34<01:08, 5.83it/s]
+ 0%| | 4/2896 [00:00<04:43, 10.21it/s]
+ 99%|███████████████████████████████████████████████████████████████████████████████████▌| 2881/2896 [05:40<00:01, 8.69it/s]
+ 23%|████████████████████ | 117/500 [23:21<03:08, 2.03it/s]
+ 25%|█████████████████████▌ | 125/500 [23:22<01:03, 5.91it/s]
+ 100%|███████████████████████████████████████████████████████████████████████████████████▉| 2894/2896 [05:39<00:00, 8.67it/s]
+ 30%|█████████████████████████▊ | 150/500 [29:07<00:59, 5.91it/s]
+ 0%| | 2/2896 [00:00<02:46, 17.41it/s]
+ 99%|███████████████████████████████████████████████████████████████████████████████████▌| 2880/2896 [05:38<00:01, 8.68it/s]
+ 35%|██████████████████████████████ | 175/500 [34:54<00:55, 5.88it/s]
+ 0%| | 4/2896 [00:00<04:25, 10.91it/s]
+ File "/home/rnd/Documents/Ameer/gemma/new.py", line 184, in <module> | 1763/2896 [03:27<02:10, 8.69it/s]
+ trainer.train()
+ File "/home/rnd/miniconda3/envs/gemma/lib/python3.9/site-packages/transformers/trainer.py", line 1624, in train
+ return inner_training_loop(
+ File "/home/rnd/miniconda3/envs/gemma/lib/python3.9/site-packages/transformers/trainer.py", line 2029, in _inner_training_loop
+ self._maybe_log_save_evaluate(tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval)
+ File "/home/rnd/miniconda3/envs/gemma/lib/python3.9/site-packages/transformers/trainer.py", line 2412, in _maybe_log_save_evaluate
+ metrics = self.evaluate(ignore_keys=ignore_keys_for_eval)
+ File "/home/rnd/miniconda3/envs/gemma/lib/python3.9/site-packages/transformers/trainer.py", line 3229, in evaluate
+ output = eval_loop(
+ File "/home/rnd/miniconda3/envs/gemma/lib/python3.9/site-packages/transformers/trainer.py", line 3418, in evaluation_loop
+ loss, logits, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys)
+ File "/home/rnd/miniconda3/envs/gemma/lib/python3.9/site-packages/transformers/trainer.py", line 3635, in prediction_step
+ loss, outputs = self.compute_loss(model, inputs, return_outputs=True)
+ File "/home/rnd/miniconda3/envs/gemma/lib/python3.9/site-packages/transformers/trainer.py", line 2925, in compute_loss
+ outputs = model(**inputs)
+ File "/home/rnd/miniconda3/envs/gemma/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
+ return self._call_impl(*args, **kwargs)
+ File "/home/rnd/miniconda3/envs/gemma/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
+ return forward_call(*args, **kwargs)
+ File "/home/rnd/miniconda3/envs/gemma/lib/python3.9/site-packages/accelerate/utils/operations.py", line 829, in forward
+ return model_forward(*args, **kwargs)
+ File "/home/rnd/miniconda3/envs/gemma/lib/python3.9/site-packages/accelerate/utils/operations.py", line 817, in __call__
+ return convert_to_fp32(self.model_forward(*args, **kwargs))
+ File "/home/rnd/miniconda3/envs/gemma/lib/python3.9/site-packages/torch/amp/autocast_mode.py", line 16, in decorate_autocast
+ return func(*args, **kwargs)
+ File "/home/rnd/miniconda3/envs/gemma/lib/python3.9/site-packages/peft/peft_model.py", line 1091, in forward
+ return self.base_model(
+ File "/home/rnd/miniconda3/envs/gemma/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
+ return self._call_impl(*args, **kwargs)
+ File "/home/rnd/miniconda3/envs/gemma/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
+ return forward_call(*args, **kwargs)
+ File "/home/rnd/miniconda3/envs/gemma/lib/python3.9/site-packages/peft/tuners/tuners_utils.py", line 160, in forward
+ return self.model.forward(*args, **kwargs)
+ File "/home/rnd/miniconda3/envs/gemma/lib/python3.9/site-packages/accelerate/hooks.py", line 166, in new_forward
+ output = module._old_forward(*args, **kwargs)
+ File "/home/rnd/miniconda3/envs/gemma/lib/python3.9/site-packages/transformers/models/gemma/modeling_gemma.py", line 1070, in forward
+ outputs = self.model(
+ File "/home/rnd/miniconda3/envs/gemma/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
+ return self._call_impl(*args, **kwargs)
+ File "/home/rnd/miniconda3/envs/gemma/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
+ return forward_call(*args, **kwargs)
+ File "/home/rnd/miniconda3/envs/gemma/lib/python3.9/site-packages/accelerate/hooks.py", line 166, in new_forward
+ output = module._old_forward(*args, **kwargs)
+ File "/home/rnd/miniconda3/envs/gemma/lib/python3.9/site-packages/transformers/models/gemma/modeling_gemma.py", line 875, in forward
+ causal_mask = self._update_causal_mask(attention_mask, inputs_embeds)
+ File "/home/rnd/miniconda3/envs/gemma/lib/python3.9/site-packages/transformers/models/gemma/modeling_gemma.py", line 979, in _update_causal_mask
+ if not is_tracing and torch.any(attention_mask != 1):
+ KeyboardInterrupt
wandb/run-20240223_032422-b657btrg/files/requirements.txt ADDED
@@ -0,0 +1,101 @@
+ accelerate==0.28.0.dev0
+ aiohttp==3.9.3
+ aiosignal==1.3.1
+ appdirs==1.4.4
+ asttokens==2.4.1
+ async-timeout==4.0.3
+ attrs==23.2.0
+ bitsandbytes==0.42.0
+ certifi==2024.2.2
+ charset-normalizer==3.3.2
+ click==8.1.7
+ comm==0.2.1
+ contourpy==1.2.0
+ cycler==0.12.1
+ datasets==2.17.1
+ decorator==5.1.1
+ dill==0.3.8
+ docker-pycreds==0.4.0
+ exceptiongroup==1.2.0
+ executing==2.0.1
+ filelock==3.13.1
+ fonttools==4.49.0
+ frozenlist==1.4.1
+ fsspec==2023.10.0
+ gitdb==4.0.11
+ gitpython==3.1.42
+ huggingface-hub==0.20.3
+ idna==3.6
+ importlib-resources==6.1.1
+ ipython==8.18.1
+ ipywidgets==8.1.2
+ jedi==0.19.1
+ jinja2==3.1.3
+ jupyterlab-widgets==3.0.10
+ kiwisolver==1.4.5
+ markupsafe==2.1.5
+ matplotlib-inline==0.1.6
+ matplotlib==3.8.3
+ mpmath==1.3.0
+ multidict==6.0.5
+ multiprocess==0.70.16
+ networkx==3.2.1
+ numpy==1.26.4
+ nvidia-cublas-cu12==12.1.3.1
+ nvidia-cuda-cupti-cu12==12.1.105
+ nvidia-cuda-nvrtc-cu12==12.1.105
+ nvidia-cuda-runtime-cu12==12.1.105
+ nvidia-cudnn-cu12==8.9.2.26
+ nvidia-cufft-cu12==11.0.2.54
+ nvidia-curand-cu12==10.3.2.106
+ nvidia-cusolver-cu12==11.4.5.107
+ nvidia-cusparse-cu12==12.1.0.106
+ nvidia-nccl-cu12==2.19.3
+ nvidia-nvjitlink-cu12==12.3.101
+ nvidia-nvtx-cu12==12.1.105
+ packaging==23.2
+ pandas==2.2.0
+ parso==0.8.3
+ peft==0.8.2
+ pexpect==4.9.0
+ pillow==10.2.0
+ pip==23.3.1
+ prompt-toolkit==3.0.43
+ protobuf==4.25.3
+ psutil==5.9.8
+ ptyprocess==0.7.0
+ pure-eval==0.2.2
+ pyarrow-hotfix==0.6
+ pyarrow==15.0.0
+ pygments==2.17.2
+ pyparsing==3.1.1
+ python-dateutil==2.8.2
+ pytz==2024.1
+ pyyaml==6.0.1
+ regex==2023.12.25
+ requests==2.31.0
+ safetensors==0.4.2
+ scipy==1.12.0
+ sentry-sdk==1.40.5
+ setproctitle==1.3.3
+ setuptools==68.2.2
+ six==1.16.0
+ smmap==5.0.1
+ stack-data==0.6.3
+ sympy==1.12
+ tokenizers==0.15.2
+ torch==2.2.1
+ tqdm==4.66.2
+ traitlets==5.14.1
+ transformers==4.39.0.dev0
+ triton==2.2.0
+ typing-extensions==4.9.0
+ tzdata==2024.1
+ urllib3==2.2.1
+ wandb==0.16.3
+ wcwidth==0.2.13
+ wheel==0.41.2
+ widgetsnbextension==4.0.10
+ xxhash==3.4.1
+ yarl==1.9.4
+ zipp==3.17.0
wandb/run-20240223_032422-b657btrg/files/wandb-metadata.json ADDED
@@ -0,0 +1,202 @@
+ {
+ "os": "Linux-5.4.0-172-generic-x86_64-with-glibc2.31",
+ "python": "3.9.18",
+ "heartbeatAt": "2024-02-22T21:54:23.837753",
+ "startedAt": "2024-02-22T21:54:22.976651",
+ "docker": null,
+ "cuda": null,
+ "args": [],
+ "state": "running",
+ "program": "/home/rnd/Documents/Ameer/gemma/new.py",
+ "codePathLocal": "new.py",
+ "codePath": "new.py",
+ "host": "rnd-System-Product-Name",
+ "username": "rnd",
+ "executable": "/home/rnd/miniconda3/envs/gemma/bin/python",
+ "cpu_count": 24,
+ "cpu_count_logical": 32,
+ "cpu_freq": {
+ "current": 4938.592250000001,
+ "min": 800.0,
+ "max": 5700.0
+ },
+ "cpu_freq_per_core": [
+ {
+ "current": 5500.0,
+ "min": 800.0,
+ "max": 7000.0
+ },
+ {
+ "current": 5500.0,
+ "min": 800.0,
+ "max": 7000.0
+ },
+ {
+ "current": 5500.0,
+ "min": 800.0,
+ "max": 7000.0
+ },
+ {
+ "current": 5504.691,
+ "min": 800.0,
+ "max": 7000.0
+ },
+ {
+ "current": 5500.0,
+ "min": 800.0,
+ "max": 7000.0
+ },
+ {
+ "current": 5526.894,
+ "min": 800.0,
+ "max": 7000.0
+ },
+ {
+ "current": 5500.0,
+ "min": 800.0,
+ "max": 7000.0
+ },
+ {
+ "current": 5516.923,
+ "min": 800.0,
+ "max": 7000.0
+ },
+ {
+ "current": 5800.0,
+ "min": 800.0,
+ "max": 7400.0
+ },
+ {
+ "current": 5819.727,
+ "min": 800.0,
+ "max": 7400.0
+ },
+ {
+ "current": 5684.191,
+ "min": 800.0,
+ "max": 7400.0
+ },
+ {
+ "current": 5815.223,
+ "min": 800.0,
+ "max": 7400.0
+ },
+ {
+ "current": 5500.0,
+ "min": 800.0,
+ "max": 7000.0
+ },
+ {
+ "current": 5500.0,
+ "min": 800.0,
+ "max": 7000.0
+ },
+ {
+ "current": 5500.0,
+ "min": 800.0,
+ "max": 7000.0
+ },
+ {
+ "current": 5536.423,
+ "min": 800.0,
+ "max": 7000.0
+ },
+ {
+ "current": 4296.422,
+ "min": 800.0,
+ "max": 4300.0
+ },
+ {
+ "current": 4302.26,
+ "min": 800.0,
+ "max": 4300.0
+ },
+ {
+ "current": 4292.475,
+ "min": 800.0,
+ "max": 4300.0
+ },
+ {
+ "current": 4279.054,
+ "min": 800.0,
+ "max": 4300.0
+ },
+ {
+ "current": 4283.433,
+ "min": 800.0,
+ "max": 4300.0
+ },
+ {
+ "current": 4300.044,
+ "min": 800.0,
+ "max": 4300.0
+ },
+ {
+ "current": 4368.421,
+ "min": 800.0,
+ "max": 4300.0
+ },
+ {
+ "current": 4280.523,
+ "min": 800.0,
+ "max": 4300.0
+ },
+ {
+ "current": 4291.353,
+ "min": 800.0,
+ "max": 4300.0
+ },
+ {
+ "current": 4314.482,
+ "min": 800.0,
+ "max": 4300.0
+ },
+ {
+ "current": 4299.578,
+ "min": 800.0,
+ "max": 4300.0
+ },
+ {
+ "current": 4300.072,
+ "min": 800.0,
+ "max": 4300.0
+ },
+ {
+ "current": 4301.981,
+ "min": 800.0,
+ "max": 4300.0
+ },
+ {
+ "current": 4311.285,
+ "min": 800.0,
+ "max": 4300.0
+ },
+ {
+ "current": 4302.597,
+ "min": 800.0,
+ "max": 4300.0
+ },
+ {
+ "current": 4306.9,
+ "min": 800.0,
+ "max": 4300.0
+ }
+ ],
+ "disk": {
+ "/": {
+ "total": 1832.2072448730469,
+ "used": 1698.8227272033691
+ }
+ },
+ "gpu": "NVIDIA GeForce RTX 3090 Ti",
+ "gpu_count": 1,
+ "gpu_devices": [
+ {
+ "name": "NVIDIA GeForce RTX 3090 Ti",
+ "memory_total": 25757220864
+ }
+ ],
+ "memory": {
+ "total": 62.508731842041016
+ }
+ }
wandb/run-20240223_032422-b657btrg/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
+ {"train/loss": 2.9901, "train/grad_norm": 5.66660213470459, "train/learning_rate": 1.628256513026052e-05, "train/epoch": 0.0, "train/global_step": 175, "_timestamp": 1708640958.932987, "_runtime": 2095.9480090141296, "_step": 12, "eval/loss": 3.162160873413086, "eval/runtime": 340.5303, "eval/samples_per_second": 68.029, "eval/steps_per_second": 8.504, "_wandb": {"runtime": 2302}}
wandb/run-20240223_032422-b657btrg/logs/debug-internal.log ADDED
The diff for this file is too large to render. See raw diff
wandb/run-20240223_032422-b657btrg/logs/debug.log ADDED
@@ -0,0 +1,27 @@
+ 2024-02-23 03:24:22,980 INFO MainThread:116027 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
+ 2024-02-23 03:24:22,980 INFO MainThread:116027 [wandb_setup.py:_flush():76] Configure stats pid to 116027
+ 2024-02-23 03:24:22,980 INFO MainThread:116027 [wandb_setup.py:_flush():76] Loading settings from /home/rnd/.config/wandb/settings
+ 2024-02-23 03:24:22,980 INFO MainThread:116027 [wandb_setup.py:_flush():76] Loading settings from /home/rnd/Documents/Ameer/gemma/wandb/settings
+ 2024-02-23 03:24:22,980 INFO MainThread:116027 [wandb_setup.py:_flush():76] Loading settings from environment variables: {}
+ 2024-02-23 03:24:22,980 INFO MainThread:116027 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
+ 2024-02-23 03:24:22,980 INFO MainThread:116027 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'new.py', 'program_abspath': '/home/rnd/Documents/Ameer/gemma/new.py', 'program': '/home/rnd/Documents/Ameer/gemma/new.py'}
+ 2024-02-23 03:24:22,980 INFO MainThread:116027 [wandb_init.py:_log_setup():526] Logging user logs to /home/rnd/Documents/Ameer/gemma/wandb/run-20240223_032422-b657btrg/logs/debug.log
+ 2024-02-23 03:24:22,980 INFO MainThread:116027 [wandb_init.py:_log_setup():527] Logging internal logs to /home/rnd/Documents/Ameer/gemma/wandb/run-20240223_032422-b657btrg/logs/debug-internal.log
+ 2024-02-23 03:24:22,981 INFO MainThread:116027 [wandb_init.py:init():566] calling init triggers
+ 2024-02-23 03:24:22,981 INFO MainThread:116027 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
+ config: {}
+ 2024-02-23 03:24:22,981 INFO MainThread:116027 [wandb_init.py:init():616] starting backend
+ 2024-02-23 03:24:22,981 INFO MainThread:116027 [wandb_init.py:init():620] setting up manager
+ 2024-02-23 03:24:22,982 INFO MainThread:116027 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+ 2024-02-23 03:24:22,984 INFO MainThread:116027 [wandb_init.py:init():628] backend started and connected
+ 2024-02-23 03:24:22,986 INFO MainThread:116027 [wandb_init.py:init():720] updated telemetry
+ 2024-02-23 03:24:22,986 INFO MainThread:116027 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
+ 2024-02-23 03:24:23,690 INFO MainThread:116027 [wandb_run.py:_on_init():2262] communicating current version
+ 2024-02-23 03:24:23,800 INFO MainThread:116027 [wandb_run.py:_on_init():2271] got version response
+ 2024-02-23 03:24:23,800 INFO MainThread:116027 [wandb_init.py:init():804] starting run threads in backend
+ 2024-02-23 03:24:24,871 INFO MainThread:116027 [wandb_run.py:_console_start():2241] atexit reg
+ 2024-02-23 03:24:24,871 INFO MainThread:116027 [wandb_run.py:_redirect():2096] redirect: wrap_raw
+ 2024-02-23 03:24:24,871 INFO MainThread:116027 [wandb_run.py:_redirect():2161] Wrapping output streams.
+ 2024-02-23 03:24:24,871 INFO MainThread:116027 [wandb_run.py:_redirect():2186] Redirects installed.
+ 2024-02-23 03:24:24,871 INFO MainThread:116027 [wandb_init.py:init():847] run started, returning control to user process
+ 2024-02-23 03:24:24,872 INFO MainThread:116027 [wandb_run.py:_config_callback():1343] config_cb None None {'vocab_size': 256000, 'max_position_embeddings': 8192, 'hidden_size': 2048, 'intermediate_size': 16384, 'num_hidden_layers': 18, 'num_attention_heads': 8, 'head_dim': 256, 'num_key_value_heads': 1, 'hidden_act': 'gelu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'rope_theta': 10000.0, 'attention_bias': False, 'attention_dropout': 0.0, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': True, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['GemmaForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 2, 'pad_token_id': 0, 'eos_token_id': 1, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'google/gemma-2b', 'transformers_version': '4.39.0.dev0', 'model_type': 'gemma', 'rope_scaling': None, 'quantization_config': {'quant_method': 'QuantizationMethod.BITS_AND_BYTES', '_load_in_8bit': False, '_load_in_4bit': True, 'llm_int8_threshold': 6.0, 'llm_int8_skip_modules': None, 'llm_int8_enable_fp32_cpu_offload': False, 'llm_int8_has_fp16_weight': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': True, 'bnb_4bit_compute_dtype': 'bfloat16', 'load_in_4bit': True, 'load_in_8bit': False}, 'output_dir': './gemma-jokes-gemma', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 'do_predict': False, 'evaluation_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 2, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'learning_rate': 2.5e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3.0, 'max_steps': 500, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 1, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': './logs', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 25, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 25, 'save_total_limit': None, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 25, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': 'gemma-jokes-gemma-2024-02-23-03-24', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'paged_adamw_8bit', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None}
wandb/run-20240223_032422-b657btrg/run-b657btrg.wandb ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d50af94be5df0d2db6e804280eafdecb87c0ad05f6e33f9783c5396a91cc8b25
+ size 8654393