diff --git a/README.md b/README.md index 54cac61cb063a8bb1bbd46b78f9f0090cfd7a0f7..b3406917f3229edc5165ba222038d9bffe957a2f 100644 --- a/README.md +++ b/README.md @@ -1,21 +1,41 @@ --- pipeline_tag: sentence-similarity -license: apache-2.0 tags: - sentence-transformers - feature-extraction - sentence-similarity -- transformers +language: en +license: apache-2.0 +datasets: +- s2orc +- flax-sentence-embeddings/stackexchange_xml +- ms_marco +- gooaq +- yahoo_answers_topics +- code_search_net +- search_qa +- eli5 +- snli +- multi_nli +- wikihow +- natural_questions +- trivia_qa +- embedding-data/sentence-compression +- embedding-data/flickr30k-captions +- embedding-data/altlex +- embedding-data/simple-wiki +- embedding-data/QQP +- embedding-data/SPECTER +- embedding-data/PAQ_pairs +- embedding-data/WikiAnswers + --- -# sentence-transformers/paraphrase-MiniLM-L6-v2 +# all-MiniLM-L6-v2 This is a [sentence-transformers](https://www.SBERT.net) model: It maps sentences & paragraphs to a 384 dimensional dense vector space and can be used for tasks like clustering or semantic search. - - ## Usage (Sentence-Transformers) - Using this model becomes easy when you have [sentence-transformers](https://www.SBERT.net) installed: ``` @@ -23,25 +43,22 @@ pip install -U sentence-transformers ``` Then you can use the model like this: - ```python from sentence_transformers import SentenceTransformer sentences = ["This is an example sentence", "Each sentence is converted"] -model = SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L6-v2') +model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2') embeddings = model.encode(sentences) print(embeddings) ``` - - ## Usage (HuggingFace Transformers) Without [sentence-transformers](https://www.SBERT.net), you can use the model like this: First, you pass your input through the transformer model, then you have to apply the right pooling-operation on-top of the contextualized word embeddings. ```python from transformers import AutoTokenizer, AutoModel import torch - +import torch.nn.functional as F #Mean Pooling - Take attention mask into account for correct averaging def mean_pooling(model_output, attention_mask): @@ -54,8 +71,8 @@ def mean_pooling(model_output, attention_mask): sentences = ['This is an example sentence', 'Each sentence is converted'] # Load model from HuggingFace Hub -tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-MiniLM-L6-v2') -model = AutoModel.from_pretrained('sentence-transformers/paraphrase-MiniLM-L6-v2') +tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2') +model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2') # Tokenize sentences encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt') @@ -64,44 +81,96 @@ encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tenso with torch.no_grad(): model_output = model(**encoded_input) -# Perform pooling. In this case, max pooling. 
+# Perform pooling
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

+# Normalize embeddings
+sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
+
print("Sentence embeddings:")
print(sentence_embeddings)
```

+## Evaluation Results
+For an automated evaluation of this model, see the *Sentence Embeddings Benchmark*: [https://seb.sbert.net](https://seb.sbert.net?model_name=sentence-transformers/all-MiniLM-L6-v2)

-## Evaluation Results
+------
+## Background
+The project aims to train sentence embedding models on very large sentence-level datasets using a self-supervised
+contrastive learning objective. We used the pretrained [`nreimers/MiniLM-L6-H384-uncased`](https://huggingface.co/nreimers/MiniLM-L6-H384-uncased) model and fine-tuned it on a
+dataset of 1B sentence pairs. We use a contrastive learning objective: given a sentence from a pair, the model should predict which one, out of a set of randomly sampled other sentences, was actually paired with it in our dataset.

-For an automated evaluation of this model, see the *Sentence Embeddings Benchmark*: [https://seb.sbert.net](https://seb.sbert.net?model_name=sentence-transformers/paraphrase-MiniLM-L6-v2)

+We developed this model during the
+[Community week using JAX/Flax for NLP & CV](https://discuss.huggingface.co/t/open-to-the-community-community-week-using-jax-flax-for-nlp-cv/7104),
+organized by Hugging Face, as part of the project
+[Train the Best Sentence Embedding Model Ever with 1B Training Pairs](https://discuss.huggingface.co/t/train-the-best-sentence-embedding-model-ever-with-1b-training-pairs/7354). We benefited from efficient hardware infrastructure to run the project (7 TPU v3-8s), as well as guidance from Google's Flax, JAX, and Cloud team members on efficient deep learning frameworks.

+## Intended uses
+Our model is intended to be used as a sentence and short paragraph encoder. Given an input text, it outputs a vector that captures
+its semantic information. The sentence vector may be used for information retrieval, clustering, or sentence similarity tasks.
+
+By default, input text longer than 256 word pieces is truncated.
+
+
+## Training procedure
+
+### Pre-training
+
+We use the pretrained [`nreimers/MiniLM-L6-H384-uncased`](https://huggingface.co/nreimers/MiniLM-L6-H384-uncased) model. Please refer to its model card for more detailed information about the pre-training procedure.
+
+### Fine-tuning
+
+We fine-tune the model using a contrastive objective. Formally, we compute the cosine similarity between every possible sentence pair in the batch.
+We then apply a cross-entropy loss by comparing against the true pairs. A minimal sketch of this objective is shown below, just before the dataset table.
+
+#### Hyperparameters
+
+We trained our model on a TPU v3-8 for 100k steps with a batch size of 1024 (128 per TPU core).
+We used a learning-rate warm-up of 500 steps, and the sequence length was limited to 128 tokens. We used the AdamW optimizer with
+a 2e-5 learning rate. The full training script is available in this repository: `train_script.py`.
+
+#### Training data
+
+We use a concatenation of multiple datasets to fine-tune our model. The total number of sentence pairs is above 1 billion.
+We sampled each dataset with a weighted probability; the configuration is detailed in the `data_config.json` file.
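+To make the fine-tuning objective above concrete, here is a minimal sketch. It is not the actual `train_script.py`; the function name and the `scale` temperature are illustrative assumptions. Each sentence is scored against every candidate in the batch, and the true pair sits on the diagonal of the similarity matrix:
+
+```python
+import torch
+import torch.nn.functional as F
+
+def in_batch_contrastive_loss(anchor_emb, positive_emb, scale=20.0):
+    # anchor_emb, positive_emb: (batch_size, dim); row i of positive_emb is the
+    # sentence actually paired with row i of anchor_emb in the dataset.
+    # `scale` is an assumed temperature; the real value is set in train_script.py.
+    anchor_emb = F.normalize(anchor_emb, p=2, dim=1)       # cosine similarity becomes a
+    positive_emb = F.normalize(positive_emb, p=2, dim=1)   # dot product after L2 normalization
+    scores = scale * anchor_emb @ positive_emb.T           # (batch_size, batch_size) similarities
+    labels = torch.arange(scores.size(0), device=scores.device)  # true pair is on the diagonal
+    return F.cross_entropy(scores, labels)                 # every other sentence acts as a negative
+```
+
+Because all other sentences in the batch serve as negatives, larger batch sizes (here 1024) give the model a harder ranking problem, which is one reason the batch size matters for this objective.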
-## Full Model Architecture -``` -SentenceTransformer( - (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel - (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False}) -) -``` -## Citing & Authors - -This model was trained by [sentence-transformers](https://www.sbert.net/). - -If you find this model helpful, feel free to cite our publication [Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks](https://arxiv.org/abs/1908.10084): -```bibtex -@inproceedings{reimers-2019-sentence-bert, - title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks", - author = "Reimers, Nils and Gurevych, Iryna", - booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing", - month = "11", - year = "2019", - publisher = "Association for Computational Linguistics", - url = "http://arxiv.org/abs/1908.10084", -} -``` \ No newline at end of file +| Dataset | Paper | Number of training tuples | +|--------------------------------------------------------|:----------------------------------------:|:--------------------------:| +| [Reddit comments (2015-2018)](https://github.com/PolyAI-LDN/conversational-datasets/tree/master/reddit) | [paper](https://arxiv.org/abs/1904.06472) | 726,484,430 | +| [S2ORC](https://github.com/allenai/s2orc) Citation pairs (Abstracts) | [paper](https://aclanthology.org/2020.acl-main.447/) | 116,288,806 | +| [WikiAnswers](https://github.com/afader/oqa#wikianswers-corpus) Duplicate question pairs | [paper](https://doi.org/10.1145/2623330.2623677) | 77,427,422 | +| [PAQ](https://github.com/facebookresearch/PAQ) (Question, Answer) pairs | [paper](https://arxiv.org/abs/2102.07033) | 64,371,441 | +| [S2ORC](https://github.com/allenai/s2orc) Citation pairs (Titles) | [paper](https://aclanthology.org/2020.acl-main.447/) | 52,603,982 | +| [S2ORC](https://github.com/allenai/s2orc) (Title, Abstract) | [paper](https://aclanthology.org/2020.acl-main.447/) | 41,769,185 | +| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title, Body) pairs | - | 25,316,456 | +| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title+Body, Answer) pairs | - | 21,396,559 | +| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title, Answer) pairs | - | 21,396,559 | +| [MS MARCO](https://microsoft.github.io/msmarco/) triplets | [paper](https://doi.org/10.1145/3404835.3462804) | 9,144,553 | +| [GOOAQ: Open Question Answering with Diverse Answer Types](https://github.com/allenai/gooaq) | [paper](https://arxiv.org/pdf/2104.08727.pdf) | 3,012,496 | +| [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Title, Answer) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 1,198,260 | +| [Code Search](https://huggingface.co/datasets/code_search_net) | - | 1,151,414 | +| [COCO](https://cocodataset.org/#home) Image captions | [paper](https://link.springer.com/chapter/10.1007%2F978-3-319-10602-1_48) | 828,395| +| [SPECTER](https://github.com/allenai/specter) citation triplets | [paper](https://doi.org/10.18653/v1/2020.acl-main.207) | 684,100 | +| [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Question, Answer) | 
[paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 681,164 | +| [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Title, Question) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 659,896 | +| [SearchQA](https://huggingface.co/datasets/search_qa) | [paper](https://arxiv.org/abs/1704.05179) | 582,261 | +| [Eli5](https://huggingface.co/datasets/eli5) | [paper](https://doi.org/10.18653/v1/p19-1346) | 325,475 | +| [Flickr 30k](https://shannon.cs.illinois.edu/DenotationGraph/) | [paper](https://transacl.org/ojs/index.php/tacl/article/view/229/33) | 317,695 | +| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (titles) | | 304,525 | +| AllNLI ([SNLI](https://nlp.stanford.edu/projects/snli/) and [MultiNLI](https://cims.nyu.edu/~sbowman/multinli/) | [paper SNLI](https://doi.org/10.18653/v1/d15-1075), [paper MultiNLI](https://doi.org/10.18653/v1/n18-1101) | 277,230 | +| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (bodies) | | 250,519 | +| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (titles+bodies) | | 250,460 | +| [Sentence Compression](https://github.com/google-research-datasets/sentence-compression) | [paper](https://www.aclweb.org/anthology/D13-1155/) | 180,000 | +| [Wikihow](https://github.com/pvl/wikihow_pairs_dataset) | [paper](https://arxiv.org/abs/1810.09305) | 128,542 | +| [Altlex](https://github.com/chridey/altlex/) | [paper](https://aclanthology.org/P16-1135.pdf) | 112,696 | +| [Quora Question Triplets](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) | - | 103,663 | +| [Simple Wikipedia](https://cs.pomona.edu/~dkauchak/simplification/) | [paper](https://www.aclweb.org/anthology/P11-2117/) | 102,225 | +| [Natural Questions (NQ)](https://ai.google.com/research/NaturalQuestions) | [paper](https://transacl.org/ojs/index.php/tacl/article/view/1455) | 100,231 | +| [SQuAD2.0](https://rajpurkar.github.io/SQuAD-explorer/) | [paper](https://aclanthology.org/P18-2124.pdf) | 87,599 | +| [TriviaQA](https://huggingface.co/datasets/trivia_qa) | - | 73,346 | +| **Total** | | **1,170,060,424** | \ No newline at end of file diff --git a/checkpoint-6027/model.safetensors b/checkpoint-6027/model.safetensors index 352e15ab9341fbb2730e87ad1795fcadf79170fe..3c98bf2907ed9a3be3e465603aca9a186a2fb838 100644 --- a/checkpoint-6027/model.safetensors +++ b/checkpoint-6027/model.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4733fd7d4f13b14f19647c36ba4fa4454d7f3de192d83b1afb82511a84823e23 +oid sha256:20f54df4e8c9bd1866a8ac7adca8e778928f22e625414141a00dd280c85a20bf size 90866120 diff --git a/checkpoint-6027/optimizer.pt b/checkpoint-6027/optimizer.pt index 4f268bb55328052e7690414807912c495cda6a0d..390beafdceece651df7055d4f4afe0c504da2a1a 100644 --- a/checkpoint-6027/optimizer.pt +++ b/checkpoint-6027/optimizer.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c886a67f3900eef2655cca574218b61d5a1cd40487a321e085fec5a326ae7540 +oid sha256:5f79ccb443804e2bba2882215d428bb3b09c351e80f0ff1716e374562ceb1a13 size 180607738 diff --git a/checkpoint-6027/rng_state.pth b/checkpoint-6027/rng_state.pth index d25102f6afeebf1f9a47b484afa3f64747a69559..bb1083563bc83d588b4ced0c57a937409b92c5b7 100644 --- 
a/checkpoint-6027/rng_state.pth +++ b/checkpoint-6027/rng_state.pth @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8c8132ebefa53f250b44c875ba8e0ff411e4d800e0e9e5925eb8ae5a49fcd489 +oid sha256:2940e9c82b14997acafbac7306669587243bb5d36e9e8963c06bf27bc987b68a size 14244 diff --git a/checkpoint-6027/trainer_state.json b/checkpoint-6027/trainer_state.json index 82efb61786b2ab0c1d4825483281fb6cfb8734ea..9d60f5bc6ac23ea7e9af443830ec0c88c41575f0 100644 --- a/checkpoint-6027/trainer_state.json +++ b/checkpoint-6027/trainer_state.json @@ -11,127 +11,127 @@ { "epoch": 1.0, "learning_rate": 0.000484375, - "loss": 3.0823, + "loss": 2.8375, "step": 287 }, { "epoch": 2.0, "learning_rate": 0.00046875, - "loss": 2.7242, + "loss": 2.4263, "step": 574 }, { "epoch": 3.0, "learning_rate": 0.000453125, - "loss": 2.5348, + "loss": 2.2043, "step": 861 }, { "epoch": 4.0, "learning_rate": 0.0004375, - "loss": 2.4455, + "loss": 2.0835, "step": 1148 }, { "epoch": 5.0, "learning_rate": 0.000421875, - "loss": 2.3794, + "loss": 2.0225, "step": 1435 }, { "epoch": 6.0, "learning_rate": 0.00040625000000000004, - "loss": 2.3375, + "loss": 1.9901, "step": 1722 }, { "epoch": 7.0, "learning_rate": 0.000390625, - "loss": 2.3262, + "loss": 1.9992, "step": 2009 }, { "epoch": 8.0, "learning_rate": 0.000375, - "loss": 2.3114, + "loss": 1.9665, "step": 2296 }, { "epoch": 9.0, "learning_rate": 0.000359375, - "loss": 2.2921, + "loss": 1.943, "step": 2583 }, { "epoch": 10.0, "learning_rate": 0.00034375, - "loss": 2.2918, + "loss": 1.9327, "step": 2870 }, { "epoch": 11.0, "learning_rate": 0.000328125, - "loss": 2.2578, + "loss": 1.9184, "step": 3157 }, { "epoch": 12.0, "learning_rate": 0.0003125, - "loss": 2.2693, + "loss": 1.9191, "step": 3444 }, { "epoch": 13.0, "learning_rate": 0.000296875, - "loss": 2.2594, + "loss": 1.9074, "step": 3731 }, { "epoch": 14.0, "learning_rate": 0.00028125000000000003, - "loss": 2.2555, + "loss": 1.9066, "step": 4018 }, { "epoch": 15.0, "learning_rate": 0.000265625, - "loss": 2.2481, + "loss": 1.9053, "step": 4305 }, { "epoch": 16.0, "learning_rate": 0.00025, - "loss": 2.2468, + "loss": 1.8906, "step": 4592 }, { "epoch": 17.0, "learning_rate": 0.000234375, - "loss": 2.248, + "loss": 1.8876, "step": 4879 }, { "epoch": 18.0, "learning_rate": 0.00021875, - "loss": 2.2435, + "loss": 1.8837, "step": 5166 }, { "epoch": 19.0, "learning_rate": 0.00020312500000000002, - "loss": 2.2319, + "loss": 1.8766, "step": 5453 }, { "epoch": 20.0, "learning_rate": 0.0001875, - "loss": 2.2303, + "loss": 1.8701, "step": 5740 }, { "epoch": 21.0, "learning_rate": 0.000171875, - "loss": 2.2215, + "loss": 1.8698, "step": 6027 } ], diff --git a/checkpoint-6027/training_args.bin b/checkpoint-6027/training_args.bin index 363673a315b486cbc5df029c76d9fd96ac4823ba..f58a03d4ef7176bd1b794380352baf592d0f65d9 100644 --- a/checkpoint-6027/training_args.bin +++ b/checkpoint-6027/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1e68121a9357c4f016eb6bc0f031c8d8d3f664e26a8b5ed965be82c62d99c0bf +oid sha256:4f41e362c3bb6d45be0b656b1cdad4a1214468db81442967fe04c0d32b3ce8ef size 4792 diff --git a/checkpoint-6314/model.safetensors b/checkpoint-6314/model.safetensors index e0811ba89b3ac9e707743e9b7c8cd11e3108e3cd..da9d509cd969006578446febd9ef556a57d06e17 100644 --- a/checkpoint-6314/model.safetensors +++ b/checkpoint-6314/model.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ffb9d234ce16eb62bfaa4221636dafbdd629621a3d5d6c5fd9ab84b3b0b1b1a6 +oid 
sha256:7603e4168414f34ed2739a8584bcc72a715b6bfe1ba92ddf4b2b0c93c307d714 size 90866120 diff --git a/checkpoint-6314/optimizer.pt b/checkpoint-6314/optimizer.pt index 73c1eba49ab14120ca5ad0c97a7b5737c22baa3b..6f8f6e15547173b99f299c0f945d1f22bffb4b76 100644 --- a/checkpoint-6314/optimizer.pt +++ b/checkpoint-6314/optimizer.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9a87cfb151fe8553d7bf1bc304afb20d87b6535e8731fa1a235ed08ffc7a2fe6 +oid sha256:ce526795569446586023b13e812d7b1123d70054718bfb5f33ac2c43f985ac89 size 180607738 diff --git a/checkpoint-6314/rng_state.pth b/checkpoint-6314/rng_state.pth index ace69420580fc68a8d924fe0e349672936542cf3..b34eb886cf6bc00742ed159268e9aa2ff7eb5003 100644 --- a/checkpoint-6314/rng_state.pth +++ b/checkpoint-6314/rng_state.pth @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:19e881a8dd55719ee21e204acb99a2911d67016f769fb6b3b7466fafbaf0f9cd +oid sha256:0f0b7f5add7ad3f61f7f1a7b12ca6fe6f0f06d7c6730356f74df585b3dfe72cc size 14244 diff --git a/checkpoint-6314/trainer_state.json b/checkpoint-6314/trainer_state.json index 39b7015bd06a8db8fbcef275eaf142098be0a3e8..9d4f48e4895992d8b467fad296b7696d32fa2afc 100644 --- a/checkpoint-6314/trainer_state.json +++ b/checkpoint-6314/trainer_state.json @@ -11,133 +11,133 @@ { "epoch": 1.0, "learning_rate": 0.000484375, - "loss": 3.0823, + "loss": 2.8375, "step": 287 }, { "epoch": 2.0, "learning_rate": 0.00046875, - "loss": 2.7242, + "loss": 2.4263, "step": 574 }, { "epoch": 3.0, "learning_rate": 0.000453125, - "loss": 2.5348, + "loss": 2.2043, "step": 861 }, { "epoch": 4.0, "learning_rate": 0.0004375, - "loss": 2.4455, + "loss": 2.0835, "step": 1148 }, { "epoch": 5.0, "learning_rate": 0.000421875, - "loss": 2.3794, + "loss": 2.0225, "step": 1435 }, { "epoch": 6.0, "learning_rate": 0.00040625000000000004, - "loss": 2.3375, + "loss": 1.9901, "step": 1722 }, { "epoch": 7.0, "learning_rate": 0.000390625, - "loss": 2.3262, + "loss": 1.9992, "step": 2009 }, { "epoch": 8.0, "learning_rate": 0.000375, - "loss": 2.3114, + "loss": 1.9665, "step": 2296 }, { "epoch": 9.0, "learning_rate": 0.000359375, - "loss": 2.2921, + "loss": 1.943, "step": 2583 }, { "epoch": 10.0, "learning_rate": 0.00034375, - "loss": 2.2918, + "loss": 1.9327, "step": 2870 }, { "epoch": 11.0, "learning_rate": 0.000328125, - "loss": 2.2578, + "loss": 1.9184, "step": 3157 }, { "epoch": 12.0, "learning_rate": 0.0003125, - "loss": 2.2693, + "loss": 1.9191, "step": 3444 }, { "epoch": 13.0, "learning_rate": 0.000296875, - "loss": 2.2594, + "loss": 1.9074, "step": 3731 }, { "epoch": 14.0, "learning_rate": 0.00028125000000000003, - "loss": 2.2555, + "loss": 1.9066, "step": 4018 }, { "epoch": 15.0, "learning_rate": 0.000265625, - "loss": 2.2481, + "loss": 1.9053, "step": 4305 }, { "epoch": 16.0, "learning_rate": 0.00025, - "loss": 2.2468, + "loss": 1.8906, "step": 4592 }, { "epoch": 17.0, "learning_rate": 0.000234375, - "loss": 2.248, + "loss": 1.8876, "step": 4879 }, { "epoch": 18.0, "learning_rate": 0.00021875, - "loss": 2.2435, + "loss": 1.8837, "step": 5166 }, { "epoch": 19.0, "learning_rate": 0.00020312500000000002, - "loss": 2.2319, + "loss": 1.8766, "step": 5453 }, { "epoch": 20.0, "learning_rate": 0.0001875, - "loss": 2.2303, + "loss": 1.8701, "step": 5740 }, { "epoch": 21.0, "learning_rate": 0.000171875, - "loss": 2.2215, + "loss": 1.8698, "step": 6027 }, { "epoch": 22.0, "learning_rate": 0.00015625, - "loss": 2.2256, + "loss": 1.8713, "step": 6314 } ], diff --git a/checkpoint-6314/training_args.bin 
b/checkpoint-6314/training_args.bin index 363673a315b486cbc5df029c76d9fd96ac4823ba..f58a03d4ef7176bd1b794380352baf592d0f65d9 100644 --- a/checkpoint-6314/training_args.bin +++ b/checkpoint-6314/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1e68121a9357c4f016eb6bc0f031c8d8d3f664e26a8b5ed965be82c62d99c0bf +oid sha256:4f41e362c3bb6d45be0b656b1cdad4a1214468db81442967fe04c0d32b3ce8ef size 4792 diff --git a/checkpoint-6601/model.safetensors b/checkpoint-6601/model.safetensors index cf94a7e6387bd9b14b122209243f1dc82f94a4cf..e69c37a57e3a35924401070aeccc7325354750cd 100644 --- a/checkpoint-6601/model.safetensors +++ b/checkpoint-6601/model.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f34e4e0514126e422d0077461ee3fdf8da8668c43e97aaf363dc673364948c6f +oid sha256:a88576850a220374dc780a89afe685efa8336d62628e1c336b8039e2f50ae536 size 90866120 diff --git a/checkpoint-6601/optimizer.pt b/checkpoint-6601/optimizer.pt index faa0eaaa70a14ab24cbf23e991b3ff92981f7443..b420e8d35f9e7770808911bb22b0338f68d4afe6 100644 --- a/checkpoint-6601/optimizer.pt +++ b/checkpoint-6601/optimizer.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:63e359bde6424914d379e217b4ba17a6f2d39e97e20cd4836b7d77b4ca84d62a +oid sha256:10134f6ecfcfe9238c08f272279b4c12c95c3a6e63bee9d0231bb1adf6d7fbc5 size 180607738 diff --git a/checkpoint-6601/rng_state.pth b/checkpoint-6601/rng_state.pth index baedc4f875b8a056b98d11997d9bc4079144442e..058c18ef1debdf86d52999088df44405db371e7d 100644 --- a/checkpoint-6601/rng_state.pth +++ b/checkpoint-6601/rng_state.pth @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d798f414d39bf019c39269ef4f56532a529aab04afb3a417a5ff16b7ed0ad786 +oid sha256:fed85929eff002edec8edb87bbeb1690f47728b450557a98d09554980c227b55 size 14244 diff --git a/checkpoint-6601/trainer_state.json b/checkpoint-6601/trainer_state.json index c5ca239710f7a95d0e8083b6b0d03137150b9443..f4ab328f769ed82089d5b10a7c3bc119a360157f 100644 --- a/checkpoint-6601/trainer_state.json +++ b/checkpoint-6601/trainer_state.json @@ -11,139 +11,139 @@ { "epoch": 1.0, "learning_rate": 0.000484375, - "loss": 3.0823, + "loss": 2.8375, "step": 287 }, { "epoch": 2.0, "learning_rate": 0.00046875, - "loss": 2.7242, + "loss": 2.4263, "step": 574 }, { "epoch": 3.0, "learning_rate": 0.000453125, - "loss": 2.5348, + "loss": 2.2043, "step": 861 }, { "epoch": 4.0, "learning_rate": 0.0004375, - "loss": 2.4455, + "loss": 2.0835, "step": 1148 }, { "epoch": 5.0, "learning_rate": 0.000421875, - "loss": 2.3794, + "loss": 2.0225, "step": 1435 }, { "epoch": 6.0, "learning_rate": 0.00040625000000000004, - "loss": 2.3375, + "loss": 1.9901, "step": 1722 }, { "epoch": 7.0, "learning_rate": 0.000390625, - "loss": 2.3262, + "loss": 1.9992, "step": 2009 }, { "epoch": 8.0, "learning_rate": 0.000375, - "loss": 2.3114, + "loss": 1.9665, "step": 2296 }, { "epoch": 9.0, "learning_rate": 0.000359375, - "loss": 2.2921, + "loss": 1.943, "step": 2583 }, { "epoch": 10.0, "learning_rate": 0.00034375, - "loss": 2.2918, + "loss": 1.9327, "step": 2870 }, { "epoch": 11.0, "learning_rate": 0.000328125, - "loss": 2.2578, + "loss": 1.9184, "step": 3157 }, { "epoch": 12.0, "learning_rate": 0.0003125, - "loss": 2.2693, + "loss": 1.9191, "step": 3444 }, { "epoch": 13.0, "learning_rate": 0.000296875, - "loss": 2.2594, + "loss": 1.9074, "step": 3731 }, { "epoch": 14.0, "learning_rate": 0.00028125000000000003, - "loss": 2.2555, + "loss": 1.9066, "step": 4018 }, { "epoch": 15.0, 
"learning_rate": 0.000265625, - "loss": 2.2481, + "loss": 1.9053, "step": 4305 }, { "epoch": 16.0, "learning_rate": 0.00025, - "loss": 2.2468, + "loss": 1.8906, "step": 4592 }, { "epoch": 17.0, "learning_rate": 0.000234375, - "loss": 2.248, + "loss": 1.8876, "step": 4879 }, { "epoch": 18.0, "learning_rate": 0.00021875, - "loss": 2.2435, + "loss": 1.8837, "step": 5166 }, { "epoch": 19.0, "learning_rate": 0.00020312500000000002, - "loss": 2.2319, + "loss": 1.8766, "step": 5453 }, { "epoch": 20.0, "learning_rate": 0.0001875, - "loss": 2.2303, + "loss": 1.8701, "step": 5740 }, { "epoch": 21.0, "learning_rate": 0.000171875, - "loss": 2.2215, + "loss": 1.8698, "step": 6027 }, { "epoch": 22.0, "learning_rate": 0.00015625, - "loss": 2.2256, + "loss": 1.8713, "step": 6314 }, { "epoch": 23.0, "learning_rate": 0.00014062500000000002, - "loss": 2.2257, + "loss": 1.8756, "step": 6601 } ], diff --git a/checkpoint-6601/training_args.bin b/checkpoint-6601/training_args.bin index 363673a315b486cbc5df029c76d9fd96ac4823ba..f58a03d4ef7176bd1b794380352baf592d0f65d9 100644 --- a/checkpoint-6601/training_args.bin +++ b/checkpoint-6601/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1e68121a9357c4f016eb6bc0f031c8d8d3f664e26a8b5ed965be82c62d99c0bf +oid sha256:4f41e362c3bb6d45be0b656b1cdad4a1214468db81442967fe04c0d32b3ce8ef size 4792 diff --git a/checkpoint-6888/model.safetensors b/checkpoint-6888/model.safetensors index f7632e445458182953739b7c1059c7ae54e93722..f2d8648a6a942721880ecd8e5333441dde59c621 100644 --- a/checkpoint-6888/model.safetensors +++ b/checkpoint-6888/model.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4ed9000929251e2f6f2419aac4b88167e4e0264b576d3bc6a2d1bb026eac5bd0 +oid sha256:078d04cb03d68f9e5dff1130f70639157bcb8034213f6aa18462db597752e46a size 90866120 diff --git a/checkpoint-6888/optimizer.pt b/checkpoint-6888/optimizer.pt index 46dc0a99f0a1bb1e7f98604617c19470d0ac7166..805a4ce1ea1a1d14523357b3c74c30993a3b3ca6 100644 --- a/checkpoint-6888/optimizer.pt +++ b/checkpoint-6888/optimizer.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:49e950c4c4252d5d35e86d5eb4237b5d9fe84e76c44f29148682318bcc0019b4 +oid sha256:cb257272b2289651d45c98c388902a212eec396152c408d9bccb997a4e5562ec size 180607738 diff --git a/checkpoint-6888/rng_state.pth b/checkpoint-6888/rng_state.pth index 7625abee4a39a65bcd157245b0e1b833c2b5d0be..5e553387c32be59278cfe8f0db8ebecbfd5680e3 100644 --- a/checkpoint-6888/rng_state.pth +++ b/checkpoint-6888/rng_state.pth @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:02dd0d13161623f5c7e3f1f8f092881967f367c62ffd2bdebb58fb8c30515275 +oid sha256:911230d9c38583f3cb3f09b5e1e903f4caa43946dd9edc12868a8d0c7278e233 size 14244 diff --git a/checkpoint-6888/trainer_state.json b/checkpoint-6888/trainer_state.json index f843f38e0be702f19c694986fb7b10186e0124d4..2813b2451cc167deaaa9a6f7d2e6e75ad52f72a6 100644 --- a/checkpoint-6888/trainer_state.json +++ b/checkpoint-6888/trainer_state.json @@ -11,145 +11,145 @@ { "epoch": 1.0, "learning_rate": 0.000484375, - "loss": 3.0823, + "loss": 2.8375, "step": 287 }, { "epoch": 2.0, "learning_rate": 0.00046875, - "loss": 2.7242, + "loss": 2.4263, "step": 574 }, { "epoch": 3.0, "learning_rate": 0.000453125, - "loss": 2.5348, + "loss": 2.2043, "step": 861 }, { "epoch": 4.0, "learning_rate": 0.0004375, - "loss": 2.4455, + "loss": 2.0835, "step": 1148 }, { "epoch": 5.0, "learning_rate": 0.000421875, - "loss": 2.3794, + "loss": 2.0225, "step": 
1435 }, { "epoch": 6.0, "learning_rate": 0.00040625000000000004, - "loss": 2.3375, + "loss": 1.9901, "step": 1722 }, { "epoch": 7.0, "learning_rate": 0.000390625, - "loss": 2.3262, + "loss": 1.9992, "step": 2009 }, { "epoch": 8.0, "learning_rate": 0.000375, - "loss": 2.3114, + "loss": 1.9665, "step": 2296 }, { "epoch": 9.0, "learning_rate": 0.000359375, - "loss": 2.2921, + "loss": 1.943, "step": 2583 }, { "epoch": 10.0, "learning_rate": 0.00034375, - "loss": 2.2918, + "loss": 1.9327, "step": 2870 }, { "epoch": 11.0, "learning_rate": 0.000328125, - "loss": 2.2578, + "loss": 1.9184, "step": 3157 }, { "epoch": 12.0, "learning_rate": 0.0003125, - "loss": 2.2693, + "loss": 1.9191, "step": 3444 }, { "epoch": 13.0, "learning_rate": 0.000296875, - "loss": 2.2594, + "loss": 1.9074, "step": 3731 }, { "epoch": 14.0, "learning_rate": 0.00028125000000000003, - "loss": 2.2555, + "loss": 1.9066, "step": 4018 }, { "epoch": 15.0, "learning_rate": 0.000265625, - "loss": 2.2481, + "loss": 1.9053, "step": 4305 }, { "epoch": 16.0, "learning_rate": 0.00025, - "loss": 2.2468, + "loss": 1.8906, "step": 4592 }, { "epoch": 17.0, "learning_rate": 0.000234375, - "loss": 2.248, + "loss": 1.8876, "step": 4879 }, { "epoch": 18.0, "learning_rate": 0.00021875, - "loss": 2.2435, + "loss": 1.8837, "step": 5166 }, { "epoch": 19.0, "learning_rate": 0.00020312500000000002, - "loss": 2.2319, + "loss": 1.8766, "step": 5453 }, { "epoch": 20.0, "learning_rate": 0.0001875, - "loss": 2.2303, + "loss": 1.8701, "step": 5740 }, { "epoch": 21.0, "learning_rate": 0.000171875, - "loss": 2.2215, + "loss": 1.8698, "step": 6027 }, { "epoch": 22.0, "learning_rate": 0.00015625, - "loss": 2.2256, + "loss": 1.8713, "step": 6314 }, { "epoch": 23.0, "learning_rate": 0.00014062500000000002, - "loss": 2.2257, + "loss": 1.8756, "step": 6601 }, { "epoch": 24.0, "learning_rate": 0.000125, - "loss": 2.2275, + "loss": 1.8628, "step": 6888 } ], diff --git a/checkpoint-6888/training_args.bin b/checkpoint-6888/training_args.bin index 363673a315b486cbc5df029c76d9fd96ac4823ba..f58a03d4ef7176bd1b794380352baf592d0f65d9 100644 --- a/checkpoint-6888/training_args.bin +++ b/checkpoint-6888/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1e68121a9357c4f016eb6bc0f031c8d8d3f664e26a8b5ed965be82c62d99c0bf +oid sha256:4f41e362c3bb6d45be0b656b1cdad4a1214468db81442967fe04c0d32b3ce8ef size 4792 diff --git a/checkpoint-7175/model.safetensors b/checkpoint-7175/model.safetensors index a235a4cc937de2af0e15daea5cd061c8d59bc015..923104b1e276715c7d663ae4fe021ee733293a79 100644 --- a/checkpoint-7175/model.safetensors +++ b/checkpoint-7175/model.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:70399412ee3a628d0f3b4610a75eb6d34bbbfd5af078a54428cf923428ede87c +oid sha256:b6a6c4bf0efc02a0db5d1472f33a7aaa0f2ff829f9971cf066d3376e2476cb5c size 90866120 diff --git a/checkpoint-7175/optimizer.pt b/checkpoint-7175/optimizer.pt index d6aba8ca430a6e35b898371a287478810079fee0..ae1177a645fd581ac8e96eac92422bb95d65d596 100644 --- a/checkpoint-7175/optimizer.pt +++ b/checkpoint-7175/optimizer.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:35df210873e2c02a975d32ef197ce30b942c815e6ae93698e6ed9111688677e9 +oid sha256:4e6220a11df2196cd429e35fe93dd80caa3cedff8fb116bd2880af16b36decaa size 180607738 diff --git a/checkpoint-7175/rng_state.pth b/checkpoint-7175/rng_state.pth index 81b9c2918fd9681d2209a7d8968e927b0b10c95e..b715d6a575b7a18246e6a4721fe6370a7d18af88 100644 --- a/checkpoint-7175/rng_state.pth 
+++ b/checkpoint-7175/rng_state.pth @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bfad4aa84c0d92f9d5ccb73464e7c346e61b36684ece175fbbc2d6232d4d2ec1 +oid sha256:9a7f17d63e38b858a3d37dac0036826bba2af636fb4fa93fc576c379cfd6fffe size 14244 diff --git a/checkpoint-7175/trainer_state.json b/checkpoint-7175/trainer_state.json index 964f4e7b7ca0526670319cdfd477f0e31534bd29..107edb616b05a001d770861100dddb480d22993c 100644 --- a/checkpoint-7175/trainer_state.json +++ b/checkpoint-7175/trainer_state.json @@ -11,151 +11,151 @@ { "epoch": 1.0, "learning_rate": 0.000484375, - "loss": 3.0823, + "loss": 2.8375, "step": 287 }, { "epoch": 2.0, "learning_rate": 0.00046875, - "loss": 2.7242, + "loss": 2.4263, "step": 574 }, { "epoch": 3.0, "learning_rate": 0.000453125, - "loss": 2.5348, + "loss": 2.2043, "step": 861 }, { "epoch": 4.0, "learning_rate": 0.0004375, - "loss": 2.4455, + "loss": 2.0835, "step": 1148 }, { "epoch": 5.0, "learning_rate": 0.000421875, - "loss": 2.3794, + "loss": 2.0225, "step": 1435 }, { "epoch": 6.0, "learning_rate": 0.00040625000000000004, - "loss": 2.3375, + "loss": 1.9901, "step": 1722 }, { "epoch": 7.0, "learning_rate": 0.000390625, - "loss": 2.3262, + "loss": 1.9992, "step": 2009 }, { "epoch": 8.0, "learning_rate": 0.000375, - "loss": 2.3114, + "loss": 1.9665, "step": 2296 }, { "epoch": 9.0, "learning_rate": 0.000359375, - "loss": 2.2921, + "loss": 1.943, "step": 2583 }, { "epoch": 10.0, "learning_rate": 0.00034375, - "loss": 2.2918, + "loss": 1.9327, "step": 2870 }, { "epoch": 11.0, "learning_rate": 0.000328125, - "loss": 2.2578, + "loss": 1.9184, "step": 3157 }, { "epoch": 12.0, "learning_rate": 0.0003125, - "loss": 2.2693, + "loss": 1.9191, "step": 3444 }, { "epoch": 13.0, "learning_rate": 0.000296875, - "loss": 2.2594, + "loss": 1.9074, "step": 3731 }, { "epoch": 14.0, "learning_rate": 0.00028125000000000003, - "loss": 2.2555, + "loss": 1.9066, "step": 4018 }, { "epoch": 15.0, "learning_rate": 0.000265625, - "loss": 2.2481, + "loss": 1.9053, "step": 4305 }, { "epoch": 16.0, "learning_rate": 0.00025, - "loss": 2.2468, + "loss": 1.8906, "step": 4592 }, { "epoch": 17.0, "learning_rate": 0.000234375, - "loss": 2.248, + "loss": 1.8876, "step": 4879 }, { "epoch": 18.0, "learning_rate": 0.00021875, - "loss": 2.2435, + "loss": 1.8837, "step": 5166 }, { "epoch": 19.0, "learning_rate": 0.00020312500000000002, - "loss": 2.2319, + "loss": 1.8766, "step": 5453 }, { "epoch": 20.0, "learning_rate": 0.0001875, - "loss": 2.2303, + "loss": 1.8701, "step": 5740 }, { "epoch": 21.0, "learning_rate": 0.000171875, - "loss": 2.2215, + "loss": 1.8698, "step": 6027 }, { "epoch": 22.0, "learning_rate": 0.00015625, - "loss": 2.2256, + "loss": 1.8713, "step": 6314 }, { "epoch": 23.0, "learning_rate": 0.00014062500000000002, - "loss": 2.2257, + "loss": 1.8756, "step": 6601 }, { "epoch": 24.0, "learning_rate": 0.000125, - "loss": 2.2275, + "loss": 1.8628, "step": 6888 }, { "epoch": 25.0, "learning_rate": 0.000109375, - "loss": 2.2225, + "loss": 1.8646, "step": 7175 } ], diff --git a/checkpoint-7175/training_args.bin b/checkpoint-7175/training_args.bin index 363673a315b486cbc5df029c76d9fd96ac4823ba..f58a03d4ef7176bd1b794380352baf592d0f65d9 100644 --- a/checkpoint-7175/training_args.bin +++ b/checkpoint-7175/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1e68121a9357c4f016eb6bc0f031c8d8d3f664e26a8b5ed965be82c62d99c0bf +oid sha256:4f41e362c3bb6d45be0b656b1cdad4a1214468db81442967fe04c0d32b3ce8ef size 4792 diff --git 
a/checkpoint-7462/model.safetensors b/checkpoint-7462/model.safetensors index ae65a249a849284ecbd1030c1d01436687f2ae0e..10e4e0468a6946f3210186d21bd22155dcf28535 100644 --- a/checkpoint-7462/model.safetensors +++ b/checkpoint-7462/model.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:93de1f250f3d2b213443d49f9a6bc58b9bc6136b5360cd3981314985ac99869e +oid sha256:b31b5739395450e8d78ba38415ebd32a48cded12a0eab4ef4b1ee8ca80de599d size 90866120 diff --git a/checkpoint-7462/optimizer.pt b/checkpoint-7462/optimizer.pt index 56f186cfc87d43a471bdd9bd41be945c3c76baa7..3ed75adae54530a46e0c0acba753b2dc350d86c7 100644 --- a/checkpoint-7462/optimizer.pt +++ b/checkpoint-7462/optimizer.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:95db1db9cba8df01031b08118df90149273beaa66c8a2f4420b573054b006036 +oid sha256:e6dccce14eb6c0f7e10922e01b64e0fb4825d650f88326d7d075eabb9f174700 size 180607738 diff --git a/checkpoint-7462/rng_state.pth b/checkpoint-7462/rng_state.pth index f9ef11de135bc86a0efe6dbcf051974f547e3a5b..b41ab7f944405441610f9b0624245227a748b75a 100644 --- a/checkpoint-7462/rng_state.pth +++ b/checkpoint-7462/rng_state.pth @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:989c0e225ca35a25d7e972b25c493513a6c249b1701d9a4935578c8df15b4a84 +oid sha256:cb1f2da8cac05205dd34f3f7b221d677f0c357fd01483cd650cd765a772b2153 size 14244 diff --git a/checkpoint-7462/trainer_state.json b/checkpoint-7462/trainer_state.json index 15fff19d170dea092b78882d9720c616b6743d9a..2108e64cc349f701b485c496526161ba9b60e820 100644 --- a/checkpoint-7462/trainer_state.json +++ b/checkpoint-7462/trainer_state.json @@ -11,157 +11,157 @@ { "epoch": 1.0, "learning_rate": 0.000484375, - "loss": 3.0823, + "loss": 2.8375, "step": 287 }, { "epoch": 2.0, "learning_rate": 0.00046875, - "loss": 2.7242, + "loss": 2.4263, "step": 574 }, { "epoch": 3.0, "learning_rate": 0.000453125, - "loss": 2.5348, + "loss": 2.2043, "step": 861 }, { "epoch": 4.0, "learning_rate": 0.0004375, - "loss": 2.4455, + "loss": 2.0835, "step": 1148 }, { "epoch": 5.0, "learning_rate": 0.000421875, - "loss": 2.3794, + "loss": 2.0225, "step": 1435 }, { "epoch": 6.0, "learning_rate": 0.00040625000000000004, - "loss": 2.3375, + "loss": 1.9901, "step": 1722 }, { "epoch": 7.0, "learning_rate": 0.000390625, - "loss": 2.3262, + "loss": 1.9992, "step": 2009 }, { "epoch": 8.0, "learning_rate": 0.000375, - "loss": 2.3114, + "loss": 1.9665, "step": 2296 }, { "epoch": 9.0, "learning_rate": 0.000359375, - "loss": 2.2921, + "loss": 1.943, "step": 2583 }, { "epoch": 10.0, "learning_rate": 0.00034375, - "loss": 2.2918, + "loss": 1.9327, "step": 2870 }, { "epoch": 11.0, "learning_rate": 0.000328125, - "loss": 2.2578, + "loss": 1.9184, "step": 3157 }, { "epoch": 12.0, "learning_rate": 0.0003125, - "loss": 2.2693, + "loss": 1.9191, "step": 3444 }, { "epoch": 13.0, "learning_rate": 0.000296875, - "loss": 2.2594, + "loss": 1.9074, "step": 3731 }, { "epoch": 14.0, "learning_rate": 0.00028125000000000003, - "loss": 2.2555, + "loss": 1.9066, "step": 4018 }, { "epoch": 15.0, "learning_rate": 0.000265625, - "loss": 2.2481, + "loss": 1.9053, "step": 4305 }, { "epoch": 16.0, "learning_rate": 0.00025, - "loss": 2.2468, + "loss": 1.8906, "step": 4592 }, { "epoch": 17.0, "learning_rate": 0.000234375, - "loss": 2.248, + "loss": 1.8876, "step": 4879 }, { "epoch": 18.0, "learning_rate": 0.00021875, - "loss": 2.2435, + "loss": 1.8837, "step": 5166 }, { "epoch": 19.0, "learning_rate": 0.00020312500000000002, - "loss": 2.2319, + 
"loss": 1.8766, "step": 5453 }, { "epoch": 20.0, "learning_rate": 0.0001875, - "loss": 2.2303, + "loss": 1.8701, "step": 5740 }, { "epoch": 21.0, "learning_rate": 0.000171875, - "loss": 2.2215, + "loss": 1.8698, "step": 6027 }, { "epoch": 22.0, "learning_rate": 0.00015625, - "loss": 2.2256, + "loss": 1.8713, "step": 6314 }, { "epoch": 23.0, "learning_rate": 0.00014062500000000002, - "loss": 2.2257, + "loss": 1.8756, "step": 6601 }, { "epoch": 24.0, "learning_rate": 0.000125, - "loss": 2.2275, + "loss": 1.8628, "step": 6888 }, { "epoch": 25.0, "learning_rate": 0.000109375, - "loss": 2.2225, + "loss": 1.8646, "step": 7175 }, { "epoch": 26.0, "learning_rate": 9.375e-05, - "loss": 2.2166, + "loss": 1.8658, "step": 7462 } ], diff --git a/checkpoint-7462/training_args.bin b/checkpoint-7462/training_args.bin index 363673a315b486cbc5df029c76d9fd96ac4823ba..f58a03d4ef7176bd1b794380352baf592d0f65d9 100644 --- a/checkpoint-7462/training_args.bin +++ b/checkpoint-7462/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1e68121a9357c4f016eb6bc0f031c8d8d3f664e26a8b5ed965be82c62d99c0bf +oid sha256:4f41e362c3bb6d45be0b656b1cdad4a1214468db81442967fe04c0d32b3ce8ef size 4792 diff --git a/checkpoint-7749/model.safetensors b/checkpoint-7749/model.safetensors index 8384d911a0acd803c783f3251912d544cf054481..731d271af263b2c30436d613c062e06d678fe72d 100644 --- a/checkpoint-7749/model.safetensors +++ b/checkpoint-7749/model.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:21ce4eca180873fb16cdb6d9bff483078203550177dabd4ed16bf462c7493af4 +oid sha256:b444106f50fcd3652e3461264e24871a21b434c69b6dd7394f4608a08ece9150 size 90866120 diff --git a/checkpoint-7749/optimizer.pt b/checkpoint-7749/optimizer.pt index 938fbd8f32c5c127bc1a9d43b88f07fe652e7e58..18dbacbc4b558509b787957a29db46d9f609b20d 100644 --- a/checkpoint-7749/optimizer.pt +++ b/checkpoint-7749/optimizer.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:254e2ba1d029861e2f7e1b7d917e6ab876636c3c0af898b8979e9336459deaeb +oid sha256:9571e9f629af0baf9a47642bd01642d67b1feedf519f46e4793c0dbe811ec42e size 180607738 diff --git a/checkpoint-7749/rng_state.pth b/checkpoint-7749/rng_state.pth index 4f2cf8652aeb64b2d3d560c14796c11fbde2d019..6f8668cae67bedc4d0232892665c25f3bb1197f3 100644 --- a/checkpoint-7749/rng_state.pth +++ b/checkpoint-7749/rng_state.pth @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eeca4e1dc911e6617de8c0b5c3b6ee333073453ada941b2b075b2c148e12374d +oid sha256:3c72df822af79394695d06839f663699941513a397043882e7a621ffa80367cf size 14244 diff --git a/checkpoint-7749/trainer_state.json b/checkpoint-7749/trainer_state.json index aec11f37a34b137cb3397e24d379d56fb31f1a66..cc13de40721bcd3a5b587ce760082c733717646f 100644 --- a/checkpoint-7749/trainer_state.json +++ b/checkpoint-7749/trainer_state.json @@ -11,163 +11,163 @@ { "epoch": 1.0, "learning_rate": 0.000484375, - "loss": 3.0823, + "loss": 2.8375, "step": 287 }, { "epoch": 2.0, "learning_rate": 0.00046875, - "loss": 2.7242, + "loss": 2.4263, "step": 574 }, { "epoch": 3.0, "learning_rate": 0.000453125, - "loss": 2.5348, + "loss": 2.2043, "step": 861 }, { "epoch": 4.0, "learning_rate": 0.0004375, - "loss": 2.4455, + "loss": 2.0835, "step": 1148 }, { "epoch": 5.0, "learning_rate": 0.000421875, - "loss": 2.3794, + "loss": 2.0225, "step": 1435 }, { "epoch": 6.0, "learning_rate": 0.00040625000000000004, - "loss": 2.3375, + "loss": 1.9901, "step": 1722 }, { "epoch": 7.0, "learning_rate": 
0.000390625, - "loss": 2.3262, + "loss": 1.9992, "step": 2009 }, { "epoch": 8.0, "learning_rate": 0.000375, - "loss": 2.3114, + "loss": 1.9665, "step": 2296 }, { "epoch": 9.0, "learning_rate": 0.000359375, - "loss": 2.2921, + "loss": 1.943, "step": 2583 }, { "epoch": 10.0, "learning_rate": 0.00034375, - "loss": 2.2918, + "loss": 1.9327, "step": 2870 }, { "epoch": 11.0, "learning_rate": 0.000328125, - "loss": 2.2578, + "loss": 1.9184, "step": 3157 }, { "epoch": 12.0, "learning_rate": 0.0003125, - "loss": 2.2693, + "loss": 1.9191, "step": 3444 }, { "epoch": 13.0, "learning_rate": 0.000296875, - "loss": 2.2594, + "loss": 1.9074, "step": 3731 }, { "epoch": 14.0, "learning_rate": 0.00028125000000000003, - "loss": 2.2555, + "loss": 1.9066, "step": 4018 }, { "epoch": 15.0, "learning_rate": 0.000265625, - "loss": 2.2481, + "loss": 1.9053, "step": 4305 }, { "epoch": 16.0, "learning_rate": 0.00025, - "loss": 2.2468, + "loss": 1.8906, "step": 4592 }, { "epoch": 17.0, "learning_rate": 0.000234375, - "loss": 2.248, + "loss": 1.8876, "step": 4879 }, { "epoch": 18.0, "learning_rate": 0.00021875, - "loss": 2.2435, + "loss": 1.8837, "step": 5166 }, { "epoch": 19.0, "learning_rate": 0.00020312500000000002, - "loss": 2.2319, + "loss": 1.8766, "step": 5453 }, { "epoch": 20.0, "learning_rate": 0.0001875, - "loss": 2.2303, + "loss": 1.8701, "step": 5740 }, { "epoch": 21.0, "learning_rate": 0.000171875, - "loss": 2.2215, + "loss": 1.8698, "step": 6027 }, { "epoch": 22.0, "learning_rate": 0.00015625, - "loss": 2.2256, + "loss": 1.8713, "step": 6314 }, { "epoch": 23.0, "learning_rate": 0.00014062500000000002, - "loss": 2.2257, + "loss": 1.8756, "step": 6601 }, { "epoch": 24.0, "learning_rate": 0.000125, - "loss": 2.2275, + "loss": 1.8628, "step": 6888 }, { "epoch": 25.0, "learning_rate": 0.000109375, - "loss": 2.2225, + "loss": 1.8646, "step": 7175 }, { "epoch": 26.0, "learning_rate": 9.375e-05, - "loss": 2.2166, + "loss": 1.8658, "step": 7462 }, { "epoch": 27.0, "learning_rate": 7.8125e-05, - "loss": 2.2174, + "loss": 1.8627, "step": 7749 } ], diff --git a/checkpoint-7749/training_args.bin b/checkpoint-7749/training_args.bin index 363673a315b486cbc5df029c76d9fd96ac4823ba..f58a03d4ef7176bd1b794380352baf592d0f65d9 100644 --- a/checkpoint-7749/training_args.bin +++ b/checkpoint-7749/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1e68121a9357c4f016eb6bc0f031c8d8d3f664e26a8b5ed965be82c62d99c0bf +oid sha256:4f41e362c3bb6d45be0b656b1cdad4a1214468db81442967fe04c0d32b3ce8ef size 4792 diff --git a/checkpoint-8036/model.safetensors b/checkpoint-8036/model.safetensors index 63a742915e72c9efec0e450a79f66d9e1fc62629..38d5d30dc9b7e302390ca2254ec170ed8c8afbd9 100644 --- a/checkpoint-8036/model.safetensors +++ b/checkpoint-8036/model.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:094692f052bc3c1da865dd330ea4d8868877c2c651aa4728271543cff9649ebf +oid sha256:c7fc802cbb799cd15bee19b8be5701dab50a2c838bc5accb64015c9f0975cdee size 90866120 diff --git a/checkpoint-8036/optimizer.pt b/checkpoint-8036/optimizer.pt index 685e6f31eb6dbf6da9f87ebc92de06548d3696ae..54b681cbc9171bc962c8f8848d5748fc54742e8a 100644 --- a/checkpoint-8036/optimizer.pt +++ b/checkpoint-8036/optimizer.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:efd5c0d40f596b5cc141af43a5a42a15ea6711ede357930782af5e5cdd103831 +oid sha256:ec2e64fb9805dc42cdc934799745fa25bd6d13eea4522ef51e97a47689b1371b size 180607738 diff --git a/checkpoint-8036/rng_state.pth 
b/checkpoint-8036/rng_state.pth index bf5d5ed224e12b8f05f4aabdec4cf1903b7bc120..0c9ff5666033c74419a52afbfe9fde278d61b5bb 100644 --- a/checkpoint-8036/rng_state.pth +++ b/checkpoint-8036/rng_state.pth @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:55ad740d18cd84d3d6ab714ade3167df9b6d81cc8473d8c6af14b20d61391900 +oid sha256:987e73ef1fd240b0dcfe43b5b9c90ad71172ec8d30a7204e9e74bdf5384f5ce3 size 14244 diff --git a/checkpoint-8036/trainer_state.json b/checkpoint-8036/trainer_state.json index a040a5c83f5f52e6421c0beba8f972d52a04d63b..46674c478e04645b470711ec7f6231e56be1cffd 100644 --- a/checkpoint-8036/trainer_state.json +++ b/checkpoint-8036/trainer_state.json @@ -11,169 +11,169 @@ { "epoch": 1.0, "learning_rate": 0.000484375, - "loss": 3.0823, + "loss": 2.8375, "step": 287 }, { "epoch": 2.0, "learning_rate": 0.00046875, - "loss": 2.7242, + "loss": 2.4263, "step": 574 }, { "epoch": 3.0, "learning_rate": 0.000453125, - "loss": 2.5348, + "loss": 2.2043, "step": 861 }, { "epoch": 4.0, "learning_rate": 0.0004375, - "loss": 2.4455, + "loss": 2.0835, "step": 1148 }, { "epoch": 5.0, "learning_rate": 0.000421875, - "loss": 2.3794, + "loss": 2.0225, "step": 1435 }, { "epoch": 6.0, "learning_rate": 0.00040625000000000004, - "loss": 2.3375, + "loss": 1.9901, "step": 1722 }, { "epoch": 7.0, "learning_rate": 0.000390625, - "loss": 2.3262, + "loss": 1.9992, "step": 2009 }, { "epoch": 8.0, "learning_rate": 0.000375, - "loss": 2.3114, + "loss": 1.9665, "step": 2296 }, { "epoch": 9.0, "learning_rate": 0.000359375, - "loss": 2.2921, + "loss": 1.943, "step": 2583 }, { "epoch": 10.0, "learning_rate": 0.00034375, - "loss": 2.2918, + "loss": 1.9327, "step": 2870 }, { "epoch": 11.0, "learning_rate": 0.000328125, - "loss": 2.2578, + "loss": 1.9184, "step": 3157 }, { "epoch": 12.0, "learning_rate": 0.0003125, - "loss": 2.2693, + "loss": 1.9191, "step": 3444 }, { "epoch": 13.0, "learning_rate": 0.000296875, - "loss": 2.2594, + "loss": 1.9074, "step": 3731 }, { "epoch": 14.0, "learning_rate": 0.00028125000000000003, - "loss": 2.2555, + "loss": 1.9066, "step": 4018 }, { "epoch": 15.0, "learning_rate": 0.000265625, - "loss": 2.2481, + "loss": 1.9053, "step": 4305 }, { "epoch": 16.0, "learning_rate": 0.00025, - "loss": 2.2468, + "loss": 1.8906, "step": 4592 }, { "epoch": 17.0, "learning_rate": 0.000234375, - "loss": 2.248, + "loss": 1.8876, "step": 4879 }, { "epoch": 18.0, "learning_rate": 0.00021875, - "loss": 2.2435, + "loss": 1.8837, "step": 5166 }, { "epoch": 19.0, "learning_rate": 0.00020312500000000002, - "loss": 2.2319, + "loss": 1.8766, "step": 5453 }, { "epoch": 20.0, "learning_rate": 0.0001875, - "loss": 2.2303, + "loss": 1.8701, "step": 5740 }, { "epoch": 21.0, "learning_rate": 0.000171875, - "loss": 2.2215, + "loss": 1.8698, "step": 6027 }, { "epoch": 22.0, "learning_rate": 0.00015625, - "loss": 2.2256, + "loss": 1.8713, "step": 6314 }, { "epoch": 23.0, "learning_rate": 0.00014062500000000002, - "loss": 2.2257, + "loss": 1.8756, "step": 6601 }, { "epoch": 24.0, "learning_rate": 0.000125, - "loss": 2.2275, + "loss": 1.8628, "step": 6888 }, { "epoch": 25.0, "learning_rate": 0.000109375, - "loss": 2.2225, + "loss": 1.8646, "step": 7175 }, { "epoch": 26.0, "learning_rate": 9.375e-05, - "loss": 2.2166, + "loss": 1.8658, "step": 7462 }, { "epoch": 27.0, "learning_rate": 7.8125e-05, - "loss": 2.2174, + "loss": 1.8627, "step": 7749 }, { "epoch": 28.0, "learning_rate": 6.25e-05, - "loss": 2.2188, + "loss": 1.8646, "step": 8036 } ], diff --git a/checkpoint-8036/training_args.bin 
b/checkpoint-8036/training_args.bin index 363673a315b486cbc5df029c76d9fd96ac4823ba..f58a03d4ef7176bd1b794380352baf592d0f65d9 100644 --- a/checkpoint-8036/training_args.bin +++ b/checkpoint-8036/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1e68121a9357c4f016eb6bc0f031c8d8d3f664e26a8b5ed965be82c62d99c0bf +oid sha256:4f41e362c3bb6d45be0b656b1cdad4a1214468db81442967fe04c0d32b3ce8ef size 4792 diff --git a/checkpoint-8323/model.safetensors b/checkpoint-8323/model.safetensors index 17d54024aec2ff5dcc44f4715776b4eaa1ea02d5..4ce578e85f5981043bfb515978f4129c8ee9bdef 100644 --- a/checkpoint-8323/model.safetensors +++ b/checkpoint-8323/model.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8eb2a5db42fe1825e43b69b178c42992e7d87576776e33758f75592acf8c1f89 +oid sha256:c860bc7aae3f2ed0dde8b2dcad573bb2eb2269628e5ced858bd4df1c975c97df size 90866120 diff --git a/checkpoint-8323/optimizer.pt b/checkpoint-8323/optimizer.pt index 952e8034d42e3eda3b915607900bf3ebd6065ded..11ad135d748d8fda60903822b39abdcb7d9bc0f7 100644 --- a/checkpoint-8323/optimizer.pt +++ b/checkpoint-8323/optimizer.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2b94a49c3e5c9b3b80c1f296a41d62252434fdc86628b024267ec35270194497 +oid sha256:5a0aef967bfddf2936f6a01852f6c6c190dea969640cf4cb109b075be822c59b size 180607738 diff --git a/checkpoint-8323/rng_state.pth b/checkpoint-8323/rng_state.pth index f76a6009d192fe6434e676fe2f69dc45439fc929..989613c21d879521553fb3decc5610a760990580 100644 --- a/checkpoint-8323/rng_state.pth +++ b/checkpoint-8323/rng_state.pth @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:113c7031f546d1e57f4645de606e8624d51751acbde70de8fdcf580b016726fa +oid sha256:6ccd00e8e6c6f674e45a1098da877938742ac9ae3af263c76c7361a9fda370c0 size 14244 diff --git a/checkpoint-8323/trainer_state.json b/checkpoint-8323/trainer_state.json index 25a8621861e70b0f2e00e30af9f9124846c2b781..c75786da35722aa6852d0ea91fa66c5b085de145 100644 --- a/checkpoint-8323/trainer_state.json +++ b/checkpoint-8323/trainer_state.json @@ -11,175 +11,175 @@ { "epoch": 1.0, "learning_rate": 0.000484375, - "loss": 3.0823, + "loss": 2.8375, "step": 287 }, { "epoch": 2.0, "learning_rate": 0.00046875, - "loss": 2.7242, + "loss": 2.4263, "step": 574 }, { "epoch": 3.0, "learning_rate": 0.000453125, - "loss": 2.5348, + "loss": 2.2043, "step": 861 }, { "epoch": 4.0, "learning_rate": 0.0004375, - "loss": 2.4455, + "loss": 2.0835, "step": 1148 }, { "epoch": 5.0, "learning_rate": 0.000421875, - "loss": 2.3794, + "loss": 2.0225, "step": 1435 }, { "epoch": 6.0, "learning_rate": 0.00040625000000000004, - "loss": 2.3375, + "loss": 1.9901, "step": 1722 }, { "epoch": 7.0, "learning_rate": 0.000390625, - "loss": 2.3262, + "loss": 1.9992, "step": 2009 }, { "epoch": 8.0, "learning_rate": 0.000375, - "loss": 2.3114, + "loss": 1.9665, "step": 2296 }, { "epoch": 9.0, "learning_rate": 0.000359375, - "loss": 2.2921, + "loss": 1.943, "step": 2583 }, { "epoch": 10.0, "learning_rate": 0.00034375, - "loss": 2.2918, + "loss": 1.9327, "step": 2870 }, { "epoch": 11.0, "learning_rate": 0.000328125, - "loss": 2.2578, + "loss": 1.9184, "step": 3157 }, { "epoch": 12.0, "learning_rate": 0.0003125, - "loss": 2.2693, + "loss": 1.9191, "step": 3444 }, { "epoch": 13.0, "learning_rate": 0.000296875, - "loss": 2.2594, + "loss": 1.9074, "step": 3731 }, { "epoch": 14.0, "learning_rate": 0.00028125000000000003, - "loss": 2.2555, + "loss": 1.9066, "step": 4018 }, { "epoch": 15.0, 
"learning_rate": 0.000265625, - "loss": 2.2481, + "loss": 1.9053, "step": 4305 }, { "epoch": 16.0, "learning_rate": 0.00025, - "loss": 2.2468, + "loss": 1.8906, "step": 4592 }, { "epoch": 17.0, "learning_rate": 0.000234375, - "loss": 2.248, + "loss": 1.8876, "step": 4879 }, { "epoch": 18.0, "learning_rate": 0.00021875, - "loss": 2.2435, + "loss": 1.8837, "step": 5166 }, { "epoch": 19.0, "learning_rate": 0.00020312500000000002, - "loss": 2.2319, + "loss": 1.8766, "step": 5453 }, { "epoch": 20.0, "learning_rate": 0.0001875, - "loss": 2.2303, + "loss": 1.8701, "step": 5740 }, { "epoch": 21.0, "learning_rate": 0.000171875, - "loss": 2.2215, + "loss": 1.8698, "step": 6027 }, { "epoch": 22.0, "learning_rate": 0.00015625, - "loss": 2.2256, + "loss": 1.8713, "step": 6314 }, { "epoch": 23.0, "learning_rate": 0.00014062500000000002, - "loss": 2.2257, + "loss": 1.8756, "step": 6601 }, { "epoch": 24.0, "learning_rate": 0.000125, - "loss": 2.2275, + "loss": 1.8628, "step": 6888 }, { "epoch": 25.0, "learning_rate": 0.000109375, - "loss": 2.2225, + "loss": 1.8646, "step": 7175 }, { "epoch": 26.0, "learning_rate": 9.375e-05, - "loss": 2.2166, + "loss": 1.8658, "step": 7462 }, { "epoch": 27.0, "learning_rate": 7.8125e-05, - "loss": 2.2174, + "loss": 1.8627, "step": 7749 }, { "epoch": 28.0, "learning_rate": 6.25e-05, - "loss": 2.2188, + "loss": 1.8646, "step": 8036 }, { "epoch": 29.0, "learning_rate": 4.6875e-05, - "loss": 2.2143, + "loss": 1.8585, "step": 8323 } ], diff --git a/checkpoint-8323/training_args.bin b/checkpoint-8323/training_args.bin index 363673a315b486cbc5df029c76d9fd96ac4823ba..f58a03d4ef7176bd1b794380352baf592d0f65d9 100644 --- a/checkpoint-8323/training_args.bin +++ b/checkpoint-8323/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1e68121a9357c4f016eb6bc0f031c8d8d3f664e26a8b5ed965be82c62d99c0bf +oid sha256:4f41e362c3bb6d45be0b656b1cdad4a1214468db81442967fe04c0d32b3ce8ef size 4792 diff --git a/checkpoint-8610/model.safetensors b/checkpoint-8610/model.safetensors index 6cab510075db3f349786a50ff40e5d3559b563d0..3d603295e88e26f9c3ac2a62d855a09b61cd64bc 100644 --- a/checkpoint-8610/model.safetensors +++ b/checkpoint-8610/model.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3e6985a08e870d847d77de4b58e47d5ecb6c35c42629de89616cdee08f45f8aa +oid sha256:8b3290fc504eb0df1daeebf81102783b6385fbc82288d4cabea6e9f5df6ce08e size 90866120 diff --git a/checkpoint-8610/optimizer.pt b/checkpoint-8610/optimizer.pt index b85f560319f942b68c6b4a6e1f703f349f21bb85..cfd9a1c01449e762d386569a3172e9d8d15f3503 100644 --- a/checkpoint-8610/optimizer.pt +++ b/checkpoint-8610/optimizer.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ddd44f10f5bd38f3e0badf87997f22a43865a97b23e0e42d244561c701ea961b +oid sha256:181111d5511842da80729ecc04ad61fe0f751974e95deac2a3fef849bc6d51c7 size 180607738 diff --git a/checkpoint-8610/rng_state.pth b/checkpoint-8610/rng_state.pth index ed35639c665591d0c8b261d6197cd7bfdc00a415..892140ce1dc2ec145ea7c54ee333c9e5f918b9dd 100644 --- a/checkpoint-8610/rng_state.pth +++ b/checkpoint-8610/rng_state.pth @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0e62c9ee3ff1a49669f8d1d2b974abe793294c16ef64241ea6fcf2d811ad551f +oid sha256:b318c456e5e209219c261615e08e186ab019d856e4080479990d7c8b68b49e3d size 14244 diff --git a/checkpoint-8610/trainer_state.json b/checkpoint-8610/trainer_state.json index 1e6303b024313fd953c1a9a22b4656cd9693e038..59d97663e08988b86837dc47233a2a2b02188de9 
100644 --- a/checkpoint-8610/trainer_state.json +++ b/checkpoint-8610/trainer_state.json @@ -11,181 +11,181 @@ { "epoch": 1.0, "learning_rate": 0.000484375, - "loss": 3.0823, + "loss": 2.8375, "step": 287 }, { "epoch": 2.0, "learning_rate": 0.00046875, - "loss": 2.7242, + "loss": 2.4263, "step": 574 }, { "epoch": 3.0, "learning_rate": 0.000453125, - "loss": 2.5348, + "loss": 2.2043, "step": 861 }, { "epoch": 4.0, "learning_rate": 0.0004375, - "loss": 2.4455, + "loss": 2.0835, "step": 1148 }, { "epoch": 5.0, "learning_rate": 0.000421875, - "loss": 2.3794, + "loss": 2.0225, "step": 1435 }, { "epoch": 6.0, "learning_rate": 0.00040625000000000004, - "loss": 2.3375, + "loss": 1.9901, "step": 1722 }, { "epoch": 7.0, "learning_rate": 0.000390625, - "loss": 2.3262, + "loss": 1.9992, "step": 2009 }, { "epoch": 8.0, "learning_rate": 0.000375, - "loss": 2.3114, + "loss": 1.9665, "step": 2296 }, { "epoch": 9.0, "learning_rate": 0.000359375, - "loss": 2.2921, + "loss": 1.943, "step": 2583 }, { "epoch": 10.0, "learning_rate": 0.00034375, - "loss": 2.2918, + "loss": 1.9327, "step": 2870 }, { "epoch": 11.0, "learning_rate": 0.000328125, - "loss": 2.2578, + "loss": 1.9184, "step": 3157 }, { "epoch": 12.0, "learning_rate": 0.0003125, - "loss": 2.2693, + "loss": 1.9191, "step": 3444 }, { "epoch": 13.0, "learning_rate": 0.000296875, - "loss": 2.2594, + "loss": 1.9074, "step": 3731 }, { "epoch": 14.0, "learning_rate": 0.00028125000000000003, - "loss": 2.2555, + "loss": 1.9066, "step": 4018 }, { "epoch": 15.0, "learning_rate": 0.000265625, - "loss": 2.2481, + "loss": 1.9053, "step": 4305 }, { "epoch": 16.0, "learning_rate": 0.00025, - "loss": 2.2468, + "loss": 1.8906, "step": 4592 }, { "epoch": 17.0, "learning_rate": 0.000234375, - "loss": 2.248, + "loss": 1.8876, "step": 4879 }, { "epoch": 18.0, "learning_rate": 0.00021875, - "loss": 2.2435, + "loss": 1.8837, "step": 5166 }, { "epoch": 19.0, "learning_rate": 0.00020312500000000002, - "loss": 2.2319, + "loss": 1.8766, "step": 5453 }, { "epoch": 20.0, "learning_rate": 0.0001875, - "loss": 2.2303, + "loss": 1.8701, "step": 5740 }, { "epoch": 21.0, "learning_rate": 0.000171875, - "loss": 2.2215, + "loss": 1.8698, "step": 6027 }, { "epoch": 22.0, "learning_rate": 0.00015625, - "loss": 2.2256, + "loss": 1.8713, "step": 6314 }, { "epoch": 23.0, "learning_rate": 0.00014062500000000002, - "loss": 2.2257, + "loss": 1.8756, "step": 6601 }, { "epoch": 24.0, "learning_rate": 0.000125, - "loss": 2.2275, + "loss": 1.8628, "step": 6888 }, { "epoch": 25.0, "learning_rate": 0.000109375, - "loss": 2.2225, + "loss": 1.8646, "step": 7175 }, { "epoch": 26.0, "learning_rate": 9.375e-05, - "loss": 2.2166, + "loss": 1.8658, "step": 7462 }, { "epoch": 27.0, "learning_rate": 7.8125e-05, - "loss": 2.2174, + "loss": 1.8627, "step": 7749 }, { "epoch": 28.0, "learning_rate": 6.25e-05, - "loss": 2.2188, + "loss": 1.8646, "step": 8036 }, { "epoch": 29.0, "learning_rate": 4.6875e-05, - "loss": 2.2143, + "loss": 1.8585, "step": 8323 }, { "epoch": 30.0, "learning_rate": 3.125e-05, - "loss": 2.2171, + "loss": 1.8598, "step": 8610 } ], diff --git a/checkpoint-8610/training_args.bin b/checkpoint-8610/training_args.bin index 363673a315b486cbc5df029c76d9fd96ac4823ba..f58a03d4ef7176bd1b794380352baf592d0f65d9 100644 --- a/checkpoint-8610/training_args.bin +++ b/checkpoint-8610/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1e68121a9357c4f016eb6bc0f031c8d8d3f664e26a8b5ed965be82c62d99c0bf +oid 
sha256:4f41e362c3bb6d45be0b656b1cdad4a1214468db81442967fe04c0d32b3ce8ef size 4792 diff --git a/checkpoint-8897/model.safetensors b/checkpoint-8897/model.safetensors index d0f6fc2f898414c7c0f3e0e034b320c275e2434e..fc14b2b5f81c7cf166aa52aac7bd5adf74650cb1 100644 --- a/checkpoint-8897/model.safetensors +++ b/checkpoint-8897/model.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7036635a13bfb22357df1a83fe38c266ce356a00272071f6a30bdbb901002988 +oid sha256:4332f3c590b354865b63f154301c8880e7a74c2b4941318e78eb2249dd95fde3 size 90866120 diff --git a/checkpoint-8897/optimizer.pt b/checkpoint-8897/optimizer.pt index 0e64c40df243df3080c38cb1341aaef72a5a02ff..1432f0f7482742f3aae1b82a64bbd3636b5eece4 100644 --- a/checkpoint-8897/optimizer.pt +++ b/checkpoint-8897/optimizer.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2133784d01bbdbc8bbe83014b2fe5755824972ca6437fcc793a0db5d7f3c9d00 +oid sha256:b594a8f4967802a2c54614a8591aff886a85e320c9bb188e4c4e2d65641c02af size 180607738 diff --git a/checkpoint-8897/rng_state.pth b/checkpoint-8897/rng_state.pth index 470ed4a12ff1034e7b30b92b356f55d205f88a32..b326ab168370723b752bf6983bbb3e77df917210 100644 --- a/checkpoint-8897/rng_state.pth +++ b/checkpoint-8897/rng_state.pth @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:55e05091ed9b77dd4f32cfffb4020d2702a13912916600eccac9bf111143b4b8 +oid sha256:db75ef66c85a14ec032388de158b1d7c506d0326505d747bada6fea5453040ae size 14244 diff --git a/checkpoint-8897/trainer_state.json b/checkpoint-8897/trainer_state.json index 634d7fc4d7322457a91348d9d28945a6966bfe8b..f8035efae94d55094b35a487281030b8dc788433 100644 --- a/checkpoint-8897/trainer_state.json +++ b/checkpoint-8897/trainer_state.json @@ -11,187 +11,187 @@ { "epoch": 1.0, "learning_rate": 0.000484375, - "loss": 3.0823, + "loss": 2.8375, "step": 287 }, { "epoch": 2.0, "learning_rate": 0.00046875, - "loss": 2.7242, + "loss": 2.4263, "step": 574 }, { "epoch": 3.0, "learning_rate": 0.000453125, - "loss": 2.5348, + "loss": 2.2043, "step": 861 }, { "epoch": 4.0, "learning_rate": 0.0004375, - "loss": 2.4455, + "loss": 2.0835, "step": 1148 }, { "epoch": 5.0, "learning_rate": 0.000421875, - "loss": 2.3794, + "loss": 2.0225, "step": 1435 }, { "epoch": 6.0, "learning_rate": 0.00040625000000000004, - "loss": 2.3375, + "loss": 1.9901, "step": 1722 }, { "epoch": 7.0, "learning_rate": 0.000390625, - "loss": 2.3262, + "loss": 1.9992, "step": 2009 }, { "epoch": 8.0, "learning_rate": 0.000375, - "loss": 2.3114, + "loss": 1.9665, "step": 2296 }, { "epoch": 9.0, "learning_rate": 0.000359375, - "loss": 2.2921, + "loss": 1.943, "step": 2583 }, { "epoch": 10.0, "learning_rate": 0.00034375, - "loss": 2.2918, + "loss": 1.9327, "step": 2870 }, { "epoch": 11.0, "learning_rate": 0.000328125, - "loss": 2.2578, + "loss": 1.9184, "step": 3157 }, { "epoch": 12.0, "learning_rate": 0.0003125, - "loss": 2.2693, + "loss": 1.9191, "step": 3444 }, { "epoch": 13.0, "learning_rate": 0.000296875, - "loss": 2.2594, + "loss": 1.9074, "step": 3731 }, { "epoch": 14.0, "learning_rate": 0.00028125000000000003, - "loss": 2.2555, + "loss": 1.9066, "step": 4018 }, { "epoch": 15.0, "learning_rate": 0.000265625, - "loss": 2.2481, + "loss": 1.9053, "step": 4305 }, { "epoch": 16.0, "learning_rate": 0.00025, - "loss": 2.2468, + "loss": 1.8906, "step": 4592 }, { "epoch": 17.0, "learning_rate": 0.000234375, - "loss": 2.248, + "loss": 1.8876, "step": 4879 }, { "epoch": 18.0, "learning_rate": 0.00021875, - "loss": 2.2435, + "loss": 1.8837, "step": 
5166 }, { "epoch": 19.0, "learning_rate": 0.00020312500000000002, - "loss": 2.2319, + "loss": 1.8766, "step": 5453 }, { "epoch": 20.0, "learning_rate": 0.0001875, - "loss": 2.2303, + "loss": 1.8701, "step": 5740 }, { "epoch": 21.0, "learning_rate": 0.000171875, - "loss": 2.2215, + "loss": 1.8698, "step": 6027 }, { "epoch": 22.0, "learning_rate": 0.00015625, - "loss": 2.2256, + "loss": 1.8713, "step": 6314 }, { "epoch": 23.0, "learning_rate": 0.00014062500000000002, - "loss": 2.2257, + "loss": 1.8756, "step": 6601 }, { "epoch": 24.0, "learning_rate": 0.000125, - "loss": 2.2275, + "loss": 1.8628, "step": 6888 }, { "epoch": 25.0, "learning_rate": 0.000109375, - "loss": 2.2225, + "loss": 1.8646, "step": 7175 }, { "epoch": 26.0, "learning_rate": 9.375e-05, - "loss": 2.2166, + "loss": 1.8658, "step": 7462 }, { "epoch": 27.0, "learning_rate": 7.8125e-05, - "loss": 2.2174, + "loss": 1.8627, "step": 7749 }, { "epoch": 28.0, "learning_rate": 6.25e-05, - "loss": 2.2188, + "loss": 1.8646, "step": 8036 }, { "epoch": 29.0, "learning_rate": 4.6875e-05, - "loss": 2.2143, + "loss": 1.8585, "step": 8323 }, { "epoch": 30.0, "learning_rate": 3.125e-05, - "loss": 2.2171, + "loss": 1.8598, "step": 8610 }, { "epoch": 31.0, "learning_rate": 1.5625e-05, - "loss": 2.2168, + "loss": 1.8601, "step": 8897 } ], diff --git a/checkpoint-8897/training_args.bin b/checkpoint-8897/training_args.bin index 363673a315b486cbc5df029c76d9fd96ac4823ba..f58a03d4ef7176bd1b794380352baf592d0f65d9 100644 --- a/checkpoint-8897/training_args.bin +++ b/checkpoint-8897/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1e68121a9357c4f016eb6bc0f031c8d8d3f664e26a8b5ed965be82c62d99c0bf +oid sha256:4f41e362c3bb6d45be0b656b1cdad4a1214468db81442967fe04c0d32b3ce8ef size 4792 diff --git a/checkpoint-9184/model.safetensors b/checkpoint-9184/model.safetensors index 49e4a353ddf18e5b47e0dd5b06022fa45ed79998..ba391f9d77d5614050a134078a8fa0c36709f325 100644 --- a/checkpoint-9184/model.safetensors +++ b/checkpoint-9184/model.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3f022c58e50215799abe15f38c88ec32f276a006a98a140f34267c2abbe22d87 +oid sha256:eb492231d16bc1937fded69b3003e1f4aec2652900fd7c4a4327d08b6dbb2b2f size 90866120 diff --git a/checkpoint-9184/optimizer.pt b/checkpoint-9184/optimizer.pt index 6737ec670351a592d4a14d527f5d41e1a19019d6..a67419d5cb57efe4d230f6ed55f048a4d389aef8 100644 --- a/checkpoint-9184/optimizer.pt +++ b/checkpoint-9184/optimizer.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3bec90c0f4b7f4dc5fafba0e512b969a3429c79ed31e31b903c1ec8f66176cd7 +oid sha256:53c36e50493173f0e4462e51f4391d1e488433204e419cf7d7875c34c682e6a7 size 180607738 diff --git a/checkpoint-9184/rng_state.pth b/checkpoint-9184/rng_state.pth index 81257285154cf03f6f730af40149b16635eab146..e51ed1e83fc9892a0c6de45b0357c8c2c037d54b 100644 --- a/checkpoint-9184/rng_state.pth +++ b/checkpoint-9184/rng_state.pth @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:89aae6ca1b3509247d7f34c8b5499ed4d42e3403173c338a9c5b16afa3425c3a +oid sha256:cac3093bd962d7b61e13a3371d3c672ce0f2bdeb86b51a081e1d16c3bad11dbe size 14244 diff --git a/checkpoint-9184/trainer_state.json b/checkpoint-9184/trainer_state.json index 6493c52c51cc61ef6053e6303b62a3ddac38aefc..1ebda7d48f68e189c123744e71b2c0898ff35496 100644 --- a/checkpoint-9184/trainer_state.json +++ b/checkpoint-9184/trainer_state.json @@ -11,193 +11,193 @@ { "epoch": 1.0, "learning_rate": 0.000484375, - "loss": 3.0823, 
+ "loss": 2.8375, "step": 287 }, { "epoch": 2.0, "learning_rate": 0.00046875, - "loss": 2.7242, + "loss": 2.4263, "step": 574 }, { "epoch": 3.0, "learning_rate": 0.000453125, - "loss": 2.5348, + "loss": 2.2043, "step": 861 }, { "epoch": 4.0, "learning_rate": 0.0004375, - "loss": 2.4455, + "loss": 2.0835, "step": 1148 }, { "epoch": 5.0, "learning_rate": 0.000421875, - "loss": 2.3794, + "loss": 2.0225, "step": 1435 }, { "epoch": 6.0, "learning_rate": 0.00040625000000000004, - "loss": 2.3375, + "loss": 1.9901, "step": 1722 }, { "epoch": 7.0, "learning_rate": 0.000390625, - "loss": 2.3262, + "loss": 1.9992, "step": 2009 }, { "epoch": 8.0, "learning_rate": 0.000375, - "loss": 2.3114, + "loss": 1.9665, "step": 2296 }, { "epoch": 9.0, "learning_rate": 0.000359375, - "loss": 2.2921, + "loss": 1.943, "step": 2583 }, { "epoch": 10.0, "learning_rate": 0.00034375, - "loss": 2.2918, + "loss": 1.9327, "step": 2870 }, { "epoch": 11.0, "learning_rate": 0.000328125, - "loss": 2.2578, + "loss": 1.9184, "step": 3157 }, { "epoch": 12.0, "learning_rate": 0.0003125, - "loss": 2.2693, + "loss": 1.9191, "step": 3444 }, { "epoch": 13.0, "learning_rate": 0.000296875, - "loss": 2.2594, + "loss": 1.9074, "step": 3731 }, { "epoch": 14.0, "learning_rate": 0.00028125000000000003, - "loss": 2.2555, + "loss": 1.9066, "step": 4018 }, { "epoch": 15.0, "learning_rate": 0.000265625, - "loss": 2.2481, + "loss": 1.9053, "step": 4305 }, { "epoch": 16.0, "learning_rate": 0.00025, - "loss": 2.2468, + "loss": 1.8906, "step": 4592 }, { "epoch": 17.0, "learning_rate": 0.000234375, - "loss": 2.248, + "loss": 1.8876, "step": 4879 }, { "epoch": 18.0, "learning_rate": 0.00021875, - "loss": 2.2435, + "loss": 1.8837, "step": 5166 }, { "epoch": 19.0, "learning_rate": 0.00020312500000000002, - "loss": 2.2319, + "loss": 1.8766, "step": 5453 }, { "epoch": 20.0, "learning_rate": 0.0001875, - "loss": 2.2303, + "loss": 1.8701, "step": 5740 }, { "epoch": 21.0, "learning_rate": 0.000171875, - "loss": 2.2215, + "loss": 1.8698, "step": 6027 }, { "epoch": 22.0, "learning_rate": 0.00015625, - "loss": 2.2256, + "loss": 1.8713, "step": 6314 }, { "epoch": 23.0, "learning_rate": 0.00014062500000000002, - "loss": 2.2257, + "loss": 1.8756, "step": 6601 }, { "epoch": 24.0, "learning_rate": 0.000125, - "loss": 2.2275, + "loss": 1.8628, "step": 6888 }, { "epoch": 25.0, "learning_rate": 0.000109375, - "loss": 2.2225, + "loss": 1.8646, "step": 7175 }, { "epoch": 26.0, "learning_rate": 9.375e-05, - "loss": 2.2166, + "loss": 1.8658, "step": 7462 }, { "epoch": 27.0, "learning_rate": 7.8125e-05, - "loss": 2.2174, + "loss": 1.8627, "step": 7749 }, { "epoch": 28.0, "learning_rate": 6.25e-05, - "loss": 2.2188, + "loss": 1.8646, "step": 8036 }, { "epoch": 29.0, "learning_rate": 4.6875e-05, - "loss": 2.2143, + "loss": 1.8585, "step": 8323 }, { "epoch": 30.0, "learning_rate": 3.125e-05, - "loss": 2.2171, + "loss": 1.8598, "step": 8610 }, { "epoch": 31.0, "learning_rate": 1.5625e-05, - "loss": 2.2168, + "loss": 1.8601, "step": 8897 }, { "epoch": 32.0, "learning_rate": 0.0, - "loss": 2.2152, + "loss": 1.8605, "step": 9184 } ], diff --git a/checkpoint-9184/training_args.bin b/checkpoint-9184/training_args.bin index 363673a315b486cbc5df029c76d9fd96ac4823ba..f58a03d4ef7176bd1b794380352baf592d0f65d9 100644 --- a/checkpoint-9184/training_args.bin +++ b/checkpoint-9184/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1e68121a9357c4f016eb6bc0f031c8d8d3f664e26a8b5ed965be82c62d99c0bf +oid 
sha256:4f41e362c3bb6d45be0b656b1cdad4a1214468db81442967fe04c0d32b3ce8ef size 4792
diff --git a/config.json b/config.json
index 004fbd67c68bae61f4c953e17c7b3414f294461e..2ba434575426fcc33423ae740d57d0e1d521f543 100644
--- a/config.json
+++ b/config.json
@@ -1,5 +1,5 @@
 {
-  "_name_or_path": "/root/.cache/torch/sentence_transformers/sentence-transformers_paraphrase-MiniLM-L6-v2/",
+  "_name_or_path": "/root/.cache/torch/sentence_transformers/sentence-transformers_all-MiniLM-L6-v2/",
   "architectures": [
     "BertModel"
   ],
@@ -19,7 +19,7 @@
   "pad_token_id": 0,
   "position_embedding_type": "absolute",
   "torch_dtype": "float32",
-  "transformers_version": "4.36.1",
+  "transformers_version": "4.36.2",
   "type_vocab_size": 2,
   "use_cache": true,
   "vocab_size": 30522
diff --git a/config_sentence_transformers.json b/config_sentence_transformers.json
index b974b349cb2d419ada11181750a733ff82f291ad..fd1b291129c607e5d49799f87cb219b27f98acdf 100644
--- a/config_sentence_transformers.json
+++ b/config_sentence_transformers.json
@@ -1,7 +1,7 @@
 {
   "__version__": {
     "sentence_transformers": "2.0.0",
-    "transformers": "4.7.0",
-    "pytorch": "1.9.0+cu102"
+    "transformers": "4.6.1",
+    "pytorch": "1.8.1"
   }
 }
\ No newline at end of file
diff --git a/model.safetensors b/model.safetensors
index c728bc841524e1fe09bf3b1ddb728e94dd1349b6..a76b607b8125004b0d8d7d3e6759e99ae6a7379e 100644
--- a/model.safetensors
+++ b/model.safetensors
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8535713a62275636e9e1ab3e2e7e2a0f741139be92e80979ad25553a8c28cbee
+oid sha256:e18ee4a9c41996a504ad41a2634ddb52ef2c94b8ae1720a28af626806605311e
 size 90864192
diff --git a/modules.json b/modules.json
index f7640f94e81bb7f4f04daf1668850b38763a13d9..952a9b81c0bfd99800fabf352f69c7ccd46c5e43 100644
--- a/modules.json
+++ b/modules.json
@@ -10,5 +10,11 @@
     "name": "1",
     "path": "1_Pooling",
     "type": "sentence_transformers.models.Pooling"
+  },
+  {
+    "idx": 2,
+    "name": "2",
+    "path": "2_Normalize",
+    "type": "sentence_transformers.models.Normalize"
   }
 ]
\ No newline at end of file
diff --git a/runs/Dec19_20-15-39_dad5d4a17751/events.out.tfevents.1703016939.dad5d4a17751.2654.0 b/runs/Dec19_20-15-39_dad5d4a17751/events.out.tfevents.1703016939.dad5d4a17751.2654.0
new file mode 100644
index 0000000000000000000000000000000000000000..11bbcf3927178c8481015784de06e09d6fba565f
--- /dev/null
+++ b/runs/Dec19_20-15-39_dad5d4a17751/events.out.tfevents.1703016939.dad5d4a17751.2654.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f5a81c2c9dfc6073c936fb2ac8ee45d21e9a775dd68160eae89a0837d7789c46
+size 9846
diff --git a/sentence_bert_config.json b/sentence_bert_config.json
index 5fd10429389515d3e5cccdeda08cae5fea1ae82e..59d594003bf59880a884c574bf88ef7555bb0202 100644
--- a/sentence_bert_config.json
+++ b/sentence_bert_config.json
@@ -1,4 +1,4 @@
 {
-  "max_seq_length": 128,
+  "max_seq_length": 256,
   "do_lower_case": false
 }
\ No newline at end of file
diff --git a/tokenizer.json b/tokenizer.json
index 327401998c676bd2d7c625039113b0577ea49fe7..4eade387f6f3ade6a663a25642ed27843081a772 100644
--- a/tokenizer.json
+++ b/tokenizer.json
@@ -2,7 +2,7 @@
   "version": "1.0",
   "truncation": {
     "direction": "Right",
-    "max_length": 128,
+    "max_length": 256,
     "strategy": "LongestFirst",
     "stride": 0
   },
diff --git a/tokenizer_config.json b/tokenizer_config.json
index 75305659f7795d4549f0e23688b52fa20a32f925..61e23f16c75ff9995b1d2f251d720c6146d21338 100644
--- a/tokenizer_config.json
+++ b/tokenizer_config.json
@@ -46,12 +46,19 @@
   "do_basic_tokenize": true,
   "do_lower_case": true,
   "mask_token": "[MASK]",
+  "max_length": 128,
   "model_max_length": 512,
   "never_split": null,
+  "pad_to_multiple_of": null,
   "pad_token": "[PAD]",
+  "pad_token_type_id": 0,
+  "padding_side": "right",
   "sep_token": "[SEP]",
+  "stride": 0,
   "strip_accents": null,
   "tokenize_chinese_chars": true,
   "tokenizer_class": "BertTokenizer",
+  "truncation_side": "right",
+  "truncation_strategy": "longest_first",
   "unk_token": "[UNK]"
 }