Chessmen committed on
Commit cccb5a0 (1 parent: 0a7dfab)

Upload 9 files

README.md ADDED
@@ -0,0 +1,79 @@
+ ---
+ library_name: transformers
+ license: apache-2.0
+ base_model: distilbert-base-uncased
+ tags:
+ - generated_from_trainer
+ model-index:
+ - name: fine_tune_distilbert-base-uncased
+ results: []
+ ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ # fine_tune_distilbert-base-uncased
+
+ This model is a fine-tuned version of [distilbert-base-uncased](https://huggingface.co/distilbert-base-uncased) on the IMDB dataset.
+ It achieves the following results on the evaluation set:
+ - Loss: 2.1226
+ - Model Preparation Time: 0.0016
+
+ ## Model description
+
+ DistilBERT base (uncased) fine-tuned with a masked language modeling objective on IMDB movie reviews; see `maskedlanguagemodel_pytorch.py` in this commit for the full training code.
+
+ ## Intended uses & limitations
+
+ Intended for fill-mask prediction on English, movie-review-style text; it inherits the limitations and biases of distilbert-base-uncased and is not a sentiment classifier.
+
+ ## Training and evaluation data
+
+ IMDB reviews were tokenized, concatenated, and grouped into 128-token chunks; 20% of the chunks were held out for evaluation (seed 42), and tokens were masked dynamically with a 15% probability by `DataCollatorForLanguageModeling`.
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training:
+ - learning_rate: 2e-05
+ - train_batch_size: 64
+ - eval_batch_size: 64
+ - seed: 42
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+ - lr_scheduler_type: linear
+ - num_epochs: 20
+ - mixed_precision_training: Native AMP
+
+ ### Training results
+
+ | Training Loss | Epoch | Step | Validation Loss | Model Preparation Time |
+ |:-------------:|:-----:|:-----:|:---------------:|:----------------------:|
+ | 2.5551 | 1.0 | 767 | 2.3648 | 0.0016 |
+ | 2.4329 | 2.0 | 1534 | 2.3181 | 0.0016 |
+ | 2.3874 | 3.0 | 2301 | 2.2831 | 0.0016 |
+ | 2.3409 | 4.0 | 3068 | 2.2422 | 0.0016 |
+ | 2.3124 | 5.0 | 3835 | 2.2302 | 0.0016 |
+ | 2.2895 | 6.0 | 4602 | 2.2104 | 0.0016 |
+ | 2.2649 | 7.0 | 5369 | 2.2014 | 0.0016 |
+ | 2.2445 | 8.0 | 6136 | 2.1939 | 0.0016 |
+ | 2.234 | 9.0 | 6903 | 2.1776 | 0.0016 |
+ | 2.2142 | 10.0 | 7670 | 2.1607 | 0.0016 |
+ | 2.208 | 11.0 | 8437 | 2.1682 | 0.0016 |
+ | 2.1933 | 12.0 | 9204 | 2.1530 | 0.0016 |
+ | 2.1808 | 13.0 | 9971 | 2.1493 | 0.0016 |
+ | 2.1689 | 14.0 | 10738 | 2.1422 | 0.0016 |
+ | 2.1598 | 15.0 | 11505 | 2.1347 | 0.0016 |
+ | 2.1567 | 16.0 | 12272 | 2.1373 | 0.0016 |
+ | 2.1458 | 17.0 | 13039 | 2.1270 | 0.0016 |
+ | 2.1475 | 18.0 | 13806 | 2.1200 | 0.0016 |
+ | 2.141 | 19.0 | 14573 | 2.1312 | 0.0016 |
+ | 2.1423 | 20.0 | 15340 | 2.1202 | 0.0016 |
+
+
+ ### Framework versions
+
+ - Transformers 4.44.2
+ - Pytorch 2.2.0+cu121
+ - Datasets 2.21.0
+ - Tokenizers 0.19.1
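
A minimal usage sketch for the checkpoint described above. It mirrors the `call_pipeline` step of the training script in this commit and assumes the Hub repo id `Chessmen/fine_tune_distilbert-base-uncased` that the script pushes to; a local directory saved by `trainer.save_model` would work the same way.

```python
from transformers import pipeline

# Repo id taken from the training script in this commit (an assumption if you
# trained your own copy); replace it with your local output directory if needed.
mask_filler = pipeline(
    "fill-mask",
    model="Chessmen/fine_tune_distilbert-base-uncased",
)

# Each prediction is a dict with "token_str", "score", and the filled "sequence".
for prediction in mask_filler("This is a great [MASK]."):
    print(f"{prediction['token_str']:>12}  {prediction['score']:.4f}")
```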
config.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "_name_or_path": "distilbert-base-uncased",
+   "activation": "gelu",
+   "architectures": [
+     "DistilBertForMaskedLM"
+   ],
+   "attention_dropout": 0.1,
+   "dim": 768,
+   "dropout": 0.1,
+   "hidden_dim": 3072,
+   "initializer_range": 0.02,
+   "max_position_embeddings": 512,
+   "model_type": "distilbert",
+   "n_heads": 12,
+   "n_layers": 6,
+   "pad_token_id": 0,
+   "qa_dropout": 0.1,
+   "seq_classif_dropout": 0.2,
+   "sinusoidal_pos_embds": false,
+   "tie_weights_": true,
+   "torch_dtype": "float32",
+   "transformers_version": "4.44.2",
+   "vocab_size": 30522
+ }
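
The config above declares a `DistilBertForMaskedLM` head on a 6-layer, 768-dimensional DistilBERT. A short sketch for loading and sanity-checking it; the repo id is again assumed from the training script, and any directory containing this `config.json` plus the weights works the same way.

```python
from transformers import AutoConfig, AutoModelForMaskedLM

repo_id = "Chessmen/fine_tune_distilbert-base-uncased"  # assumed repo id

config = AutoConfig.from_pretrained(repo_id)
print(config.model_type, config.n_layers, config.dim)  # distilbert 6 768

model = AutoModelForMaskedLM.from_pretrained(repo_id)
# The training script reports the size the same way, rounded to millions.
print(f"{model.num_parameters() / 1_000_000:.0f}M parameters")
```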
maskedlanguagemodel_pytorch.py ADDED
@@ -0,0 +1,255 @@
+ from datasets import load_dataset
+ from transformers import (
+     AutoTokenizer,
+     DataCollatorForLanguageModeling,
+     AutoModelForMaskedLM,
+     TrainingArguments,
+     Trainer,
+     pipeline,
+ )
+ import evaluate
+ import numpy as np
+ import torch
+ import math
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ print(device)
+
+ class MaskedLM:
+     def __init__(self):
+         self.model = None
+         self.metric = None
+         self.tokenizer = None
+         self.data_collator = None
+         self.raw_data = None
+         self.model_checkpoint = None
+         self.tokenized_dataset = None
+         self.chunk_size = 128
+         self.chunks_dataset = None
+         self.split_dataset = None
+         self.args = None
+
+     def load_dataset(self, name="imdb"):
+         self.raw_data = load_dataset(name)
+         print("Name of dataset: ", name)
+         print(self.raw_data)
+
+     def load_support(self, mlm_probability=0.15):
+         self.model_checkpoint = "distilbert-base-uncased"
+         self.tokenizer = AutoTokenizer.from_pretrained(self.model_checkpoint)
+         self.data_collator = DataCollatorForLanguageModeling(tokenizer=self.tokenizer, mlm_probability=mlm_probability)
+         print("Name of model checkpoint: " + self.model_checkpoint)
+         print("Tokenizer is fast: ", self.tokenizer.is_fast)
+         print("Mask token of tokenizer: ", self.tokenizer.mask_token)
+         print("Model max length of tokenizer: ", self.tokenizer.model_max_length)
+
+     def explore_infoModel(self, k=5):
+         model = AutoModelForMaskedLM.from_pretrained(self.model_checkpoint)
+         model_parameters = model.num_parameters() / 1_000_000
+         print(f">>> Number of parameters of {self.model_checkpoint}: {round(model_parameters)}M")
+
+         example = "This is a great [MASK]."
+         print("\n")
+         print(">>> Example: ", example)
+         inputs = self.tokenizer(example, return_tensors="pt")
+         token_logits = model(**inputs).logits
+         print(f"{'Number of tokens: ':<{30}}{len(inputs.tokens())}")
+         print(f"{'Tokens prepared for model: ':<{30}}{inputs.tokens()}")
+         print(f"{'IDs of tokens: ':<{30}}{inputs.input_ids}")
+         print(f"{'Mask token ID of model: ':<{30}}{self.tokenizer.mask_token_id}")
+         print(f"{'Logits of example: ':<{30}}{token_logits}")
+         print(f"{'Shape of logits: ':<{30}}{token_logits.size()}")
+
+         # Find the position of [MASK] so its logits can be extracted
+         mask_token_index = torch.where(inputs.input_ids == self.tokenizer.mask_token_id)[1]
+         print(f"{'Position of masked token: ':<{30}}{mask_token_index}")
+
+         # Logits over the whole vocabulary at the [MASK] position
+         mask_token_logits = token_logits[0, mask_token_index, :]
+         print(f"{'Logits of vocab for [MASK]: ':<{30}}{mask_token_logits}")
+
+         # Top-k candidates for [MASK]: highest logits and their vocabulary indices
+         top_k_values = torch.topk(mask_token_logits, k, dim=1).values[0].tolist()
+         print(f"{'Top values in vocab: ':<{30}}{top_k_values}")
+         top_k_tokens = torch.topk(mask_token_logits, k, dim=1).indices[0].tolist()
+         print(f"{'Top token IDs in vocab: ':<{30}}{top_k_tokens}")
+
+         # Show the top candidates in context
+         for token in top_k_tokens:
+             print(">>> ", example.replace(self.tokenizer.mask_token, self.tokenizer.decode([token])))
+
+
+     def get_feature_items(self, set="train", index=0, feature="text"):
+         return None if self.raw_data[set][index][feature] is None or self.raw_data[set][index][feature] == 0 else self.raw_data[set][index][feature]
+
+     def get_pair_items(self, set="train", index=0, feature1="text", feature2="label"):
+         feature1 = self.get_feature_items(set, index, feature1)
+         feature2 = self.get_feature_items(set, index, feature2)
+         if feature2 is not None:
+             line1 = ""
+             line2 = ""
+             for word, label in zip(feature1, feature2):
+                 line1 += str(word)
+                 line2 += str(label)
+             return line1, line2
+
+         return feature1, feature1
+
+     def get_tokenizer(self, set="train", index=0, feature="text"):
+         inputs = self.tokenizer(self.get_feature_items(set, index, feature))
+         return inputs.tokens(), inputs.word_ids()
+
+     def tokenizer_dataset(self, example):
+         inputs = self.tokenizer(example["text"])
+         inputs["word_ids"] = [inputs.word_ids(i) for i in range(len(inputs["input_ids"]))]
+         return inputs
+
+     def map_tokenize_dataset(self):
+         print("Start tokenizing dataset")
+         self.tokenized_dataset = self.raw_data.map(self.tokenizer_dataset, batched=True, remove_columns=["text", "label"])
+         print("Done mapping")
+         print("Tokenized dataset: ", self.tokenized_dataset)
+
+     def group_text_chunk(self, example):
+         # Concatenate all texts in the batch
+         concatenate_example = {k: sum(example[k], []) for k in example.keys()}
+
+         # Compute the total length
+         total_length = len(concatenate_example["input_ids"])
+         # Drop the remainder so every chunk has exactly chunk_size tokens
+         total_length = (total_length // self.chunk_size) * self.chunk_size
+
+         # Split into chunks of chunk_size
+         chunks = {
+             k: [t[i: i + self.chunk_size] for i in range(0, total_length, self.chunk_size)]
+             for k, t in concatenate_example.items()
+         }
+
+         # Create a labels column from input_ids (the collator masks the inputs later)
+         chunks["labels"] = chunks["input_ids"].copy()
+         return chunks
+
+     def map_chunk_dataset(self):
+         print("Start chunking dataset")
+         self.chunks_dataset = self.tokenized_dataset.map(self.group_text_chunk, batched=True)
+         print("Done mapping")
+         print("Chunked dataset: ", self.chunks_dataset)
+
+     def dataset_split(self, test_size=0.2):
+         self.split_dataset = self.chunks_dataset["train"].train_test_split(
+             test_size=test_size, seed=42
+         )
+         print("Prepared dataset: ", self.split_dataset)
+
+
+     def create_model(self):
+         print("Start creating model")
+         self.model = AutoModelForMaskedLM.from_pretrained(self.model_checkpoint)
+         print(self.model)
+
+     def create_argumentTrainer(self, output_dir="fine_tuned_", eval_strategy="epoch", logging_strategy="epoch",
+                                learning_rate=2e-5, num_train_epochs=20, weight_decay=0.01, batch_size=64,
+                                save_strategy="epoch", push_to_hub=False, hub_model_id="", fp16=True):
+         logging_steps = len(self.split_dataset["train"]) // batch_size
+         self.args = TrainingArguments(
+             # use_cpu=True,
+             output_dir=f"{output_dir}{self.model_checkpoint}",
+             overwrite_output_dir=True,
+             eval_strategy=eval_strategy,
+             save_strategy=save_strategy,
+             weight_decay=weight_decay,
+             learning_rate=learning_rate,
+             num_train_epochs=num_train_epochs,
+             per_device_train_batch_size=batch_size,
+             per_device_eval_batch_size=batch_size,
+             push_to_hub=push_to_hub,
+             hub_model_id=hub_model_id,
+             fp16=fp16,
+             logging_steps=logging_steps
+         )
+         print("Arguments ready for training")
+         return self.args
+
+     def call_train(self, model_path="pretrained_model_", set_train="train", set_val="test", push_to_hub=False, save_local=False):
+         trainer = Trainer(
+             model=self.model,
+             args=self.args,
+             train_dataset=self.split_dataset[set_train],
+             eval_dataset=self.split_dataset[set_val],
+             data_collator=self.data_collator,
+             tokenizer=self.tokenizer,
+         )
+         eval_result1 = trainer.evaluate()
+         print("Perplexity before training: ", math.exp(eval_result1["eval_loss"]))
+
+         print("Start training")
+         trainer.train()
+         print("Done training")
+
+         eval_result2 = trainer.evaluate()
+         print("Perplexity after training: ", math.exp(eval_result2["eval_loss"]))
+
+         if save_local:
+             trainer.save_model(model_path + self.model_checkpoint)
+             print("Done saving locally")
+
+         if push_to_hub:
+             trainer.push_to_hub(commit_message="Training complete")
+             print("Done pushing to hub")
+
+     def call_pipeline(self, local=False, path="", example=""):
+         if local:
+             model_checkpoint = "pretrained_model_" + self.model_checkpoint
+         else:
+             model_checkpoint = path
+         mask_filler = pipeline(
+             "fill-mask",
+             model=model_checkpoint,
+         )
+         print(mask_filler(example))
+
+ if __name__ == "__main__":
+     '''
+     1_LOADING THE DATASET
+     '''
+     mlm = MaskedLM()
+     mlm.load_dataset()
+     print("-"*50, "Loading tokenizer and data collator", "-"*50)
+     mlm.load_support()
+     print("-"*50, "Loading tokenizer and data collator", "-"*50)
+     '''
+     2_EXPLORING THE DATASET AND MODEL
+     '''
+     print("-"*50, "Exploring some information of the model", "-"*50)
+     mlm.explore_infoModel()
+     print("-"*50, "Exploring some information of the model", "-"*50)
+     print("Example[0] (text) in dataset: ", mlm.get_feature_items(set="train", index=0, feature="text")[:100] + "...")
+     print("Example[0] (label) in dataset: ", mlm.get_feature_items(set="train", index=0, feature="label"))
+     line1, line2 = mlm.get_pair_items(set="train", index=1, feature1="text", feature2="label")
+     print("--> Inp of Example[1]: ", line1[:20] + "...")
+     print("--> Out of Example[1]: ", line2[:20] + "...")
+     '''
+     3_PRE-PROCESSING THE DATASET
+     '''
+     tokens, word_ids = mlm.get_tokenizer(set="train", index=0, feature="text")
+     print("Tokens list of Example 0: ", tokens)
+     print("Word IDs list of Example 0: ", word_ids)
+     mlm.map_tokenize_dataset()
+     mlm.map_chunk_dataset()
+     mlm.dataset_split()
+     '''
+     4_INITIALIZING THE MODEL
+     '''
+     print("-"*50, f"Information of {mlm.model_checkpoint}", "-"*50)
+     mlm.create_model()
+     print("-"*50, f"Information of {mlm.model_checkpoint}", "-"*50)
+     '''
+     5_SELECTING HYPERPARAMETERS AND TRAINING
+     '''
+     mlm.create_argumentTrainer(push_to_hub=True, hub_model_id="Chessmen/" + "fine_tune_" + mlm.model_checkpoint)
+     mlm.call_train(save_local=True, push_to_hub=True)
+     '''
+     6_USING THE FINE-TUNED MODEL
+     '''
+     mlm.call_pipeline(path="Chessmen/fine_tune_distilbert-base-uncased", example="This is a great [MASK].")
+
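
The script reports masked-LM quality as perplexity, i.e. `exp(eval_loss)`, before and after training. As a quick cross-check against the model card above, the final validation loss of 2.1226 corresponds to a perplexity of roughly 8.4:

```python
import math

eval_loss = 2.1226            # final validation loss from the model card
print(math.exp(eval_loss))    # ~8.35
```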
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a6c4f97710c4144ef17d29c0346b8aa3b4e5feaef5f15349da25563bc08d7899
+ size 267954768
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "cls_token": "[CLS]",
+   "mask_token": "[MASK]",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "unk_token": "[UNK]"
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,55 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "[PAD]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100": {
+       "content": "[UNK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "101": {
+       "content": "[CLS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "102": {
+       "content": "[SEP]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "103": {
+       "content": "[MASK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "[CLS]",
+   "do_lower_case": true,
+   "mask_token": "[MASK]",
+   "model_max_length": 512,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "strip_accents": null,
+   "tokenize_chinese_chars": true,
+   "tokenizer_class": "DistilBertTokenizer",
+   "unk_token": "[UNK]"
+ }
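
The tokenizer files above pin the standard BERT special tokens, with `[MASK]` at id 103 and a 512-token maximum length. A small sketch to verify this after loading; the repo id is assumed from the training script, and any directory containing these tokenizer files behaves the same way.

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Chessmen/fine_tune_distilbert-base-uncased")

print(tokenizer.mask_token, tokenizer.mask_token_id)  # [MASK] 103
print(tokenizer.model_max_length)                     # 512
print(tokenizer("This is a great [MASK].").input_ids)
```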
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d300f8cac52844a5f0c8280c2941366a386056d55a3cc40165c5874f1b7cd5b2
+ size 5240
vocab.txt ADDED
The diff for this file is too large to render. See raw diff