migueldeguzmandev committed on
Commit
8c6e690
1 Parent(s): f7b6be6

Upload 11 files

atl.py ADDED
@@ -0,0 +1,118 @@
+ # atl.py — fine-tune GPT-2 XL on a local text file, then answer prompts interactively.
+ import os
+ os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'
+ import sys
+ import torch
+ from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments, get_linear_schedule_with_warmup
+
+ class GPT2Assistant:
+     def __init__(self):
+         self.tokenizer = GPT2Tokenizer.from_pretrained("/Users/migueldeguzman/Desktop/papercliptodd/gpt2xl-stamps/v2")
+
+     def fine_tune(self, answer_file_path, model_output_dir, epochs=1.0):  # previously 1.0
+         self.model = GPT2LMHeadModel.from_pretrained("/Users/migueldeguzman/Desktop/papercliptodd/gpt2xl-stamps/v2")
+         train_dataset = TextDataset(
+             tokenizer=self.tokenizer,
+             file_path=answer_file_path,
+             block_size=128
+         )
+
+         data_collator = DataCollatorForLanguageModeling(
+             tokenizer=self.tokenizer,
+             mlm=False
+         )
+
+         # Rough step estimate; does not divide by batch size or gradient accumulation.
+         total_steps = len(train_dataset) * epochs
+         warmup_steps = 0.1 * total_steps
+
+         optimizer = torch.optim.Adam(self.model.parameters(), lr=42e-6, weight_decay=0.001)
+         # Defined but unused; the linear warmup scheduler below is the one handed to Trainer.
+         scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, epochs)
+         scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)
+
+         training_args = TrainingArguments(
+             output_dir=model_output_dir,
+             overwrite_output_dir=True,
+             num_train_epochs=epochs,
+             per_device_train_batch_size=4,  # previously 16
+             save_steps=10_000,
+             save_total_limit=2,
+             weight_decay=0.001,  # previously 0.010 from gpt_algos build v11.2
+             gradient_accumulation_steps=8,  # previously 32
+             learning_rate=42e-6,  # previously
+             lr_scheduler_type='cosine',  # constant
+             warmup_steps=500
+         )
+
+         trainer = Trainer(
+             model=self.model,
+             args=training_args,
+             data_collator=data_collator,
+             train_dataset=train_dataset,
+             optimizers=(optimizer, scheduler)  # Pass both the optimizer and scheduler as a tuple
+         )
+
+         trainer.train()
+         self.model.save_pretrained(model_output_dir)
+         self.tokenizer.save_pretrained(model_output_dir)
+
+     def generate_answer(self, prompt, max_length=1000):
+         input_ids = self.tokenizer.encode(prompt, return_tensors="pt")
+
+         if self.tokenizer.pad_token_id is None:
+             self.tokenizer.pad_token = self.tokenizer.eos_token
+
+         attention_mask = (input_ids != self.tokenizer.pad_token_id).long()
+
+         output = self.model.generate(
+             input_ids,
+             attention_mask=attention_mask,
+             max_length=max_length,
+             num_return_sequences=1,
+             no_repeat_ngram_size=2,
+             do_sample=True,
+             top_k=50,
+             top_p=0.95,
+             temperature=0.001
+         )
+
+         answer = self.tokenizer.decode(output[0], skip_special_tokens=True)
+         return answer[len(prompt):]
+
+     def query(self, prompt):
+         generated_answer = self.generate_answer(prompt)
+         print(generated_answer)
+         return generated_answer
+
+ def main():
+     text_file_path = "/Users/migueldeguzman/Desktop/papercliptodd/gpt2xl-stamps/v3/manifestoV1.text"
+     model_output_dir = "/Users/migueldeguzman/Desktop/papercliptodd/gpt2xl-stamps/v3/"
+
+     assistant = GPT2Assistant()
+
+     choice = input("Do you want to fine-tune a new model (n) or load an existing one (e)? (n/e): ")
+
+     if choice.lower() == "n":
+         print("Fine-tuning the model...")
+         assistant.fine_tune(text_file_path, model_output_dir)
+         print("Model fine-tuning complete.")
+     elif choice.lower() == "e":
+         print("Loading the existing model...")
+         assistant.model = GPT2LMHeadModel.from_pretrained(model_output_dir)
+         print("Existing model loaded.")
+     else:
+         print("Invalid choice. Exiting the program.")
+         sys.exit()
+
+     while True:
+         prompt = input("Enter your question (or type 'exit' to stop): ")
+         if prompt.lower() == "exit":
+             break
+
+         print("Answering in progress...")
+         generated_answer = assistant.query(prompt)
+
+         print("\n")
+
+ if __name__ == "__main__":
+     main()
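A minimal inference sketch for the checkpoint this script saves to model_output_dir (the directory below is the one hard-coded in main(); the prompt string is only illustrative):

import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

model_dir = "/Users/migueldeguzman/Desktop/papercliptodd/gpt2xl-stamps/v3/"  # output dir from main()
tokenizer = GPT2Tokenizer.from_pretrained(model_dir)
model = GPT2LMHeadModel.from_pretrained(model_dir)
model.eval()

prompt = "Enter your question here"  # hypothetical prompt
input_ids = tokenizer.encode(prompt, return_tensors="pt")
with torch.no_grad():
    output = model.generate(input_ids, max_length=200, do_sample=True, top_k=50, top_p=0.95,
                            pad_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(output[0], skip_special_tokens=True))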
cached_lm_GPT2Tokenizer_128_manifestoV1.text ADDED
Binary file (901 kB).
 
cached_lm_GPT2Tokenizer_128_manifestoV1.text.lock ADDED
File without changes
config.json ADDED
@@ -0,0 +1,40 @@
+ {
+   "_name_or_path": "/Users/migueldeguzman/Desktop/papercliptodd/gpt2xl-stamps/v2",
+   "activation_function": "gelu_new",
+   "architectures": [
+     "GPT2LMHeadModel"
+   ],
+   "attn_pdrop": 0.1,
+   "bos_token_id": 50256,
+   "embd_pdrop": 0.1,
+   "eos_token_id": 50256,
+   "initializer_range": 0.02,
+   "layer_norm_epsilon": 1e-05,
+   "model_type": "gpt2",
+   "n_ctx": 1024,
+   "n_embd": 1600,
+   "n_head": 25,
+   "n_inner": null,
+   "n_layer": 48,
+   "n_positions": 1024,
+   "output_past": true,
+   "reorder_and_upcast_attn": false,
+   "resid_pdrop": 0.1,
+   "scale_attn_by_inverse_layer_idx": false,
+   "scale_attn_weights": true,
+   "summary_activation": null,
+   "summary_first_dropout": 0.1,
+   "summary_proj_to_labels": true,
+   "summary_type": "cls_index",
+   "summary_use_proj": true,
+   "task_specific_params": {
+     "text-generation": {
+       "do_sample": true,
+       "max_length": 50
+     }
+   },
+   "torch_dtype": "float32",
+   "transformers_version": "4.33.3",
+   "use_cache": true,
+   "vocab_size": 50257
+ }
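The dimensions above (48 layers, 1600-dim embeddings, 25 heads) are the GPT-2 XL architecture. A small sketch, assuming config.json is available locally, for loading it with transformers and estimating the parameter count it implies:

from transformers import GPT2Config

config = GPT2Config.from_json_file("config.json")  # local path is an assumption
print(config.n_layer, config.n_embd, config.n_head)  # 48, 1600, 25

# Rough count: token + position embeddings plus ~12*d^2 weights per transformer block.
approx_params = (config.vocab_size + config.n_positions) * config.n_embd \
    + config.n_layer * 12 * config.n_embd ** 2
print(f"~{approx_params / 1e9:.2f}B parameters")  # ~1.56B, consistent with GPT-2 XL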
generation_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 50256,
+   "eos_token_id": 50256,
+   "transformers_version": "4.33.3"
+ }
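These generation defaults can be read independently of the weights via GenerationConfig (a sketch; the local directory path is an assumption):

from transformers import GenerationConfig

gen_config = GenerationConfig.from_pretrained("/Users/migueldeguzman/Desktop/papercliptodd/gpt2xl-stamps/v3/")
print(gen_config.bos_token_id, gen_config.eos_token_id)  # 50256, 50256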
manifestoV1.text ADDED
The diff for this file is too large to render. See raw diff
 
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bdf95d01e24b5dd3f43ca0815bff611ee7dbd883821804a860d09da7049ad16f
+ size 6230624769
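The entry above is a Git LFS pointer, not the weights themselves. A brief sketch, assuming the ~6.2 GB pytorch_model.bin has been downloaded locally, for checking it against the pointer's sha256:

import hashlib

expected = "bdf95d01e24b5dd3f43ca0815bff611ee7dbd883821804a860d09da7049ad16f"  # from the pointer above

sha256 = hashlib.sha256()
with open("pytorch_model.bin", "rb") as f:  # local path is an assumption
    for chunk in iter(lambda: f.read(8 * 1024 * 1024), b""):
        sha256.update(chunk)

print("OK" if sha256.hexdigest() == expected else "checksum mismatch")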
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
+ {
+   "bos_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,33 @@
+ {
+   "add_bos_token": false,
+   "add_prefix_space": false,
+   "bos_token": {
+     "__type": "AddedToken",
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "clean_up_tokenization_spaces": true,
+   "eos_token": {
+     "__type": "AddedToken",
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "errors": "replace",
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": null,
+   "tokenizer_class": "GPT2Tokenizer",
+   "unk_token": {
+     "__type": "AddedToken",
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
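Note that pad_token is null here, which is why atl.py falls back to eos_token before building an attention mask. The same pattern when loading the tokenizer on its own (directory path assumed):

from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("/Users/migueldeguzman/Desktop/papercliptodd/gpt2xl-stamps/v3/")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # GPT-2 ships without a dedicated pad token
print(tokenizer.pad_token_id)  # 50256, the <|endoftext|> id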
vocab.json ADDED
The diff for this file is too large to render. See raw diff