bodhitrii committed
Commit 376e557 · 0 Parent(s)

Initial commit
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,3 @@
+ # CodeParrot
+
+ CodeParrot (large) is a 1.5B-parameter GPT-2 model trained on the [CodeParrot Python code dataset](https://huggingface.co/datasets/transformersbook/codeparrot). The model is trained in Chapter 10, "Training Transformers from Scratch", of the [NLP with Transformers book](https://learning.oreilly.com/library/view/natural-language-processing/9781098103231/). You can find the full code in the accompanying [GitHub repository](https://github.com/nlp-with-transformers/notebooks/blob/main/10_transformers-from-scratch.ipynb).
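Since the weights, tokenizer files, and `config.json` are all part of this commit, a minimal way to try the model is to load it from a local clone of the repository. This is a sketch, assuming the repo was cloned with `git lfs` so that `pytorch_model.bin` is materialized; the sampling settings mirror the `task_specific_params` in `config.json` below.

```python
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Path "./" is an assumption: adjust to wherever this repo was cloned.
tokenizer = AutoTokenizer.from_pretrained("./")
model = AutoModelForCausalLM.from_pretrained("./")
model.eval()

prompt = "def fibonacci(n):"
inputs = tokenizer(prompt, return_tensors="pt")
with torch.no_grad():
    outputs = model.generate(**inputs, do_sample=True, max_length=50,
                             pad_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(outputs[0]))
```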
codeparrot_training.py ADDED
@@ -0,0 +1,197 @@
+ from transformers import GPT2LMHeadModel, AutoTokenizer
+ from transformers import AdamW, get_scheduler, set_seed
+ from datasets import load_dataset
+ from accelerate import Accelerator
+ import datasets, transformers
+ from huggingface_hub import Repository
+
+ from torch.utils.data import IterableDataset
+ from torch.utils.data.dataloader import DataLoader
+ from torch.utils.tensorboard import SummaryWriter
+ from argparse import Namespace
+ import torch
+ import logging
+ import wandb
+
+ class ConstantLengthDataset(IterableDataset):
+
+     def __init__(self, tokenizer, dataset, seq_length=1024,
+                  num_of_sequences=1024, chars_per_token=3.6):
+         self.tokenizer = tokenizer
+         self.concat_token_id = tokenizer.bos_token_id
+         self.dataset = dataset
+         self.seq_length = seq_length
+         self.input_characters = seq_length * chars_per_token * num_of_sequences
+
+     def __iter__(self):
+         iterator = iter(self.dataset)
+         more_examples = True
+         while more_examples:
+             buffer, buffer_len = [], 0
+             while True:
+                 if buffer_len >= self.input_characters:
+                     break
+                 try:
+                     buffer.append(next(iterator)['content'])
+                     buffer_len += len(buffer[-1])
+                 except StopIteration:
+                     more_examples = False
+                     break
+             tokenized_inputs = self.tokenizer(buffer, truncation=False)['input_ids']
+             all_token_ids = []
+             for tokenized_input in tokenized_inputs:
+                 all_token_ids.extend(tokenized_input + [self.concat_token_id])
+             for i in range(0, len(all_token_ids), self.seq_length):
+                 input_ids = all_token_ids[i : i + self.seq_length]
+                 if len(input_ids) == self.seq_length:
+                     yield torch.tensor(input_ids)
+
+ def setup_logging(project_name):
+     logger = logging.getLogger(__name__)
+     logging.basicConfig(
+         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+         datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, handlers=[
+         logging.FileHandler(f"log/debug_{accelerator.process_index}.log"),
+         logging.StreamHandler()])
+     if accelerator.is_main_process: # we only want to set up logging once
+         wandb.init(project=project_name, config=args)
+         run_name = wandb.run.name
+         tb_writer = SummaryWriter()
+         tb_writer.add_hparams(vars(args), {'0': 0})
+         logger.setLevel(logging.INFO)
+         datasets.utils.logging.set_verbosity_info()
+         transformers.utils.logging.set_verbosity_info()
+     else:
+         tb_writer = None
+         run_name = ''
+         logger.setLevel(logging.ERROR)
+         datasets.utils.logging.set_verbosity_error()
+         transformers.utils.logging.set_verbosity_error()
+     return logger, tb_writer, run_name
+
+ def create_dataloaders(dataset_name, args):
+     ds_kwargs = {"streaming":True, "chunksize":40<<20, "error_bad_chunk":False}
+     train_data = load_dataset(dataset_name+'-train', split='train', **ds_kwargs)
+     train_data = train_data.shuffle(buffer_size=args.shuffle_buffer,
+                                     seed=args.seed)
+     valid_data = load_dataset(dataset_name+'-valid', split="train", **ds_kwargs)
+     train_dataset = ConstantLengthDataset(tokenizer, train_data,
+                                           seq_length=args.seq_length)
+     valid_dataset = ConstantLengthDataset(tokenizer, valid_data,
+                                           seq_length=args.seq_length)
+     train_dataloader=DataLoader(train_dataset, batch_size=args.train_batch_size)
+     eval_dataloader=DataLoader(valid_dataset, batch_size=args.valid_batch_size)
+     return train_dataloader, eval_dataloader
+
+ def get_grouped_params(model, args, no_decay=["bias", "LayerNorm.weight"]):
+     params_with_wd, params_without_wd = [], []
+     for n, p in model.named_parameters():
+         if any(nd in n for nd in no_decay): params_without_wd.append(p)
+         else: params_with_wd.append(p)
+     return [{'params': params_with_wd, 'weight_decay': args.weight_decay},
+             {'params': params_without_wd, 'weight_decay': 0.0}]
+
+ def log_metrics(step, metrics):
+     logger.info(f"Step {step}: {metrics}")
+     if accelerator.is_main_process:
+         wandb.log(metrics)
+         [tb_writer.add_scalar(k, v, step) for k, v in metrics.items()]
+
+ def evaluate(args):
+     model.eval()
+     losses = []
+     for step, batch in enumerate(eval_dataloader):
+         with torch.no_grad():
+             outputs = model(batch, labels=batch)
+         loss = outputs.loss.repeat(args.valid_batch_size)
+         losses.append(accelerator.gather(loss))
+         if args.max_eval_steps > 0 and step >= args.max_eval_steps: break
+     loss = torch.mean(torch.cat(losses))
+     try: perplexity = torch.exp(loss)
+     except OverflowError: perplexity = float("inf")
+     return loss.item(), perplexity.item()
+
+ # Accelerator
+ accelerator = Accelerator(dispatch_batches=True)
+ acc_state = {str(k): str(v) for k, v in accelerator.state.__dict__.items()}
+ # Hyperparameters
+ project_name = 'transformersbook/codeparrot'
+ dataset_name = '../llama-from-scratch/codeparrot'
+ config = {"train_batch_size": 2,
+           "valid_batch_size": 2,
+           "weight_decay": 0.1,
+           "shuffle_buffer": 1_000,
+           "learning_rate": 2e-4,
+           "lr_scheduler_type": "cosine",
+           "num_warmup_steps": 750,
+           "gradient_accumulation_steps": 16,
+           "max_train_steps": 50_000,
+           "max_eval_steps": -1,
+           "seq_length": 1024,
+           "seed": 1,
+           "save_checkpoint_steps": 50_000}
+ args = Namespace(**config, **acc_state)
+ samples_per_step = accelerator.state.num_processes * args.train_batch_size
+ set_seed(args.seed)
+
+ # Logging
+ logger, tb_writer, run_name = setup_logging(project_name.split("/")[1])
+ logger.info(accelerator.state)
+
+ # Load model and tokenizer
+ if accelerator.is_main_process:
+     hf_repo = Repository("./", clone_from=project_name, revision=run_name)
+ model = GPT2LMHeadModel.from_pretrained("./", gradient_checkpointing=True)
+ tokenizer = AutoTokenizer.from_pretrained("./")
+
+ # Load dataset and dataloader
+ train_dataloader, eval_dataloader = create_dataloaders(dataset_name, args)
+
+ # Prepare the optimizer and learning rate scheduler
+ optimizer = AdamW(get_grouped_params(model, args), lr=args.learning_rate)
+ lr_scheduler = get_scheduler(name=args.lr_scheduler_type, optimizer=optimizer,
+                              num_warmup_steps=args.num_warmup_steps,
+                              num_training_steps=args.max_train_steps,)
+ def get_lr(): return optimizer.param_groups[0]['lr']
+
+ # Prepare everything with our `accelerator`.
+ model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
+     model, optimizer, train_dataloader, eval_dataloader)
+
+ # Train model
+ model.train()
+ completed_steps = 0
+ for step, batch in enumerate(train_dataloader, start=1):
+     loss = model(batch, labels=batch, use_cache=False).loss
+     log_metrics(step, {'lr': get_lr(), 'samples': step*samples_per_step,
+                        'steps': completed_steps, 'loss/train': loss.item()})
+     loss = loss / args.gradient_accumulation_steps
+     accelerator.backward(loss)
+     if step % args.gradient_accumulation_steps == 0:
+         accelerator.clip_grad_norm_(model.parameters(), 1.0)
+         optimizer.step()
+         lr_scheduler.step()
+         optimizer.zero_grad()
+         completed_steps += 1
+     if step % args.save_checkpoint_steps == 0:
+         logger.info('Evaluating and saving model checkpoint')
+         eval_loss, perplexity = evaluate(args)
+         log_metrics(step, {'loss/eval': eval_loss, 'perplexity': perplexity})
+         accelerator.wait_for_everyone()
+         unwrapped_model = accelerator.unwrap_model(model)
+         if accelerator.is_main_process:
+             unwrapped_model.save_pretrained("./")
+             hf_repo.push_to_hub(commit_message=f'step {step}')
+         model.train()
+     if completed_steps >= args.max_train_steps:
+         break
+
+ # Evaluate and save the last checkpoint
+ logger.info('Evaluating and saving model after training')
+ eval_loss, perplexity = evaluate(args)
+ log_metrics(step, {'loss/eval': eval_loss, 'perplexity': perplexity})
+ accelerator.wait_for_everyone()
+ unwrapped_model = accelerator.unwrap_model(model)
+ if accelerator.is_main_process:
+     unwrapped_model.save_pretrained("./")
+     hf_repo.push_to_hub(commit_message='final model')
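As a rough sanity check on the hyperparameters in `config`, the effective batch size and total token budget can be computed directly. The sketch below assumes 16 processes (suggested by the `leandro-16x-v100` run directory committed further down); the real value comes from the `accelerate` launch configuration.

```python
# Back-of-the-envelope training budget for the hyperparameters in `config`.
num_processes = 16                    # assumption: 16 GPUs (cf. the 16x-V100 run below)
train_batch_size = 2                  # per-process batch size
gradient_accumulation_steps = 16
seq_length = 1024
max_train_steps = 50_000              # optimizer updates before the loop breaks

sequences_per_update = num_processes * train_batch_size * gradient_accumulation_steps
tokens_per_update = sequences_per_update * seq_length
total_tokens = tokens_per_update * max_train_steps

print(f"{sequences_per_update} sequences per optimizer update")   # 512
print(f"{tokens_per_update:,} tokens per optimizer update")       # 524,288
print(f"~{total_tokens / 1e9:.1f}B tokens over training")         # ~26.2B
```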
config.json ADDED
@@ -0,0 +1,41 @@
+ {
+   "_name_or_path": "./",
+   "activation_function": "gelu_new",
+   "architectures": [
+     "GPT2LMHeadModel"
+   ],
+   "attn_pdrop": 0.1,
+   "bos_token_id": 0,
+   "embd_pdrop": 0.1,
+   "eos_token_id": 0,
+   "gradient_checkpointing": false,
+   "initializer_range": 0.02,
+   "layer_norm_epsilon": 1e-05,
+   "model_type": "gpt2",
+   "n_ctx": 1024,
+   "n_embd": 1600,
+   "n_head": 25,
+   "n_inner": null,
+   "n_layer": 48,
+   "n_positions": 1024,
+   "output_past": true,
+   "reorder_and_upcast_attn": true,
+   "resid_pdrop": 0.1,
+   "scale_attn_by_inverse_layer_idx": true,
+   "scale_attn_weights": true,
+   "summary_activation": null,
+   "summary_first_dropout": 0.1,
+   "summary_proj_to_labels": true,
+   "summary_type": "cls_index",
+   "summary_use_proj": true,
+   "task_specific_params": {
+     "text-generation": {
+       "do_sample": true,
+       "max_length": 50
+     }
+   },
+   "torch_dtype": "float32",
+   "transformers_version": "4.11.0.dev0",
+   "use_cache": true,
+   "vocab_size": 32768
+ }
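These hyperparameters are the GPT-2 XL-sized architecture (48 layers, hidden size 1600, 25 heads) with a smaller, code-specific vocabulary of 32,768 tokens. A quick sketch to confirm the advertised ~1.5B parameter count; it rebuilds the architecture from the values above, which should be equivalent to loading this `config.json` with `GPT2Config.from_pretrained("./")`.

```python
from transformers import GPT2Config, GPT2LMHeadModel

# Architecture taken from config.json above.
config = GPT2Config(
    vocab_size=32768,
    n_positions=1024,
    n_embd=1600,
    n_layer=48,
    n_head=25,
    bos_token_id=0,
    eos_token_id=0,
    scale_attn_by_inverse_layer_idx=True,
    reorder_and_upcast_attn=True,
)
model = GPT2LMHeadModel(config)  # randomly initialized, only used for counting
print(f"{model.num_parameters() / 1e9:.2f}B parameters")  # roughly 1.53B
```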
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3d702c81fb5bf32236d7359a16fe6a59f68565cc551405b544bb009c18fc72af
+ size 6169094681
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ # torch==1.9.0
+ wandb
+ tensorboard
+ git+https://github.com/huggingface/huggingface_hub.git
+ git+https://github.com/huggingface/transformers.git
+ git+https://github.com/huggingface/datasets.git@json-dont-raise
+ git+https://github.com/huggingface/accelerate.git
runs/Aug27_13-51-10_desktop2.xfact.net/1724734270.8848336/events.out.tfevents.1724734270.desktop2.xfact.net.283940.1 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4bb5f71c3a698d57525a3cf9e5ec798d5207b5830cf6a239ea3201b6b36d7183
+ size 1702
runs/Aug27_13-51-10_desktop2.xfact.net/events.out.tfevents.1724734270.desktop2.xfact.net.283940.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f51f7148276851dd5d67d3ff3bc0e45b13d6509185aa286ef9ee1c88bdc436ad
+ size 88
runs/Sep20_14-28-12_leandro-16x-v100/1632148092.8874874/events.out.tfevents.1632148092.leandro-16x-v100.8660.1 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:222b05fb22ccb39b7d43f507f7c672d8c741e4281e65c71c12d98b19c1d3ff1f
+ size 1373
runs/Sep20_14-28-12_leandro-16x-v100/events.out.tfevents.1632148092.leandro-16x-v100.8660.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3110a4850e2eba17c258d67eacb63ff2acca8af1b29362e28d3c328621a5391d
+ size 147135683
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"bos_token": "<|endoftext|>", "eos_token": "<|endoftext|>", "unk_token": "<|endoftext|>"}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"unk_token": "<|endoftext|>", "bos_token": "<|endoftext|>", "eos_token": "<|endoftext|>", "add_prefix_space": false, "model_max_length": 1024, "special_tokens_map_file": null, "name_or_path": "thomwolf/codeparrot", "tokenizer_class": "GPT2Tokenizer"}
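The tokenizer is a GPT-2-style byte-level BPE tokenizer (`tokenizer_class: GPT2Tokenizer`) with `<|endoftext|>` serving as BOS, EOS, and UNK. A small sanity check against the model config, as a sketch loading from a local clone of this repo:

```python
from transformers import AutoTokenizer

# Path "./" is an assumption: adjust to wherever this repo was cloned.
tok = AutoTokenizer.from_pretrained("./")

print(len(tok))              # expected to match "vocab_size": 32768 in config.json
print(tok.bos_token_id)      # expected 0, matching config.json; this id is the
                             # separator used by ConstantLengthDataset during training
print(tok.model_max_length)  # 1024, from tokenizer_config.json
```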
vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
wandb/debug-cli.yejin.log ADDED
File without changes
wandb/debug-internal.log ADDED
@@ -0,0 +1 @@
+ run-20240827_135059-uy3qxte5/logs/debug-internal.log
wandb/debug.log ADDED
@@ -0,0 +1 @@
+ run-20240827_135059-uy3qxte5/logs/debug.log
wandb/latest-run ADDED
@@ -0,0 +1 @@
+ run-20240827_135059-uy3qxte5