nroggendorff committed
Commit 5b65926 · verified · 1 Parent(s): e9cc5e8

please work


please please please

Files changed (1):
  1. train.py +24 -289
train.py CHANGED
@@ -1,331 +1,66 @@
- import os
- from sys import exit
  import torch
- import trl
- from transformers import (
-     AutoTokenizer, LlamaConfig, AutoModelForCausalLM, LlamaForCausalLM,
-     TrainingArguments, PreTrainedTokenizerFast, AdamW, get_cosine_schedule_with_warmup
- )
- from datasets import load_dataset, Dataset
- from tokenizers import ByteLevelBPETokenizer
- from huggingface_hub import HfApi
- from trl import SFTConfig, SFTTrainer
- from torch.utils.data import DataLoader
- from itertools import islice
-
- class Config:
-     def __init__(self):
-         # Model and training hyperparameters
-         self.BATCH_SIZE = 16
-         self.EPOCHS = 3
-         self.LEARNING_RATE = 2e-4
-         self.MAX_SEQ_LENGTH = 512
-         self.VOCAB_SIZE = 32000
-         self.FP16 = True
-         self.WEIGHT_DECAY = 1e-3
-         self.GRADIENT_ACCUMULATION_STEPS = self.BATCH_SIZE // 4
-
-         # Dataset configurations
-         self.INPUT_DATASET = "HuggingFaceTB/smollm-corpus"
-         self.INSTRUCT_DATASET = "nroggendorff/elephant"
-         self.SHARD_SIZE = int(2e+5)
-
-         # Output and repo settings
-         self.OUTPUT_REPO = "nroggendorff/smallama"
-         self.PUSH_TO_HUB = True
-         self.INSTRUCT_FINETUNE_BOOL = False
-
-         # Training steps and warmup
-         self.FACTOR = 12 ** 3 // 2
-         self.TOTAL_STEPS = (self.SHARD_SIZE * self.EPOCHS) // (self.BATCH_SIZE * self.GRADIENT_ACCUMULATION_STEPS)
-         self.WARMUP_STEPS = int(self.TOTAL_STEPS * 0.1)
-
-         # Initial state for shard offset
-         self.INIT = 0
-
-         # ignore
-         self.getConfig = lambda: self._args()
-
-     # @staticmethod
-     def _args(self):
-         return SFTConfig(
-             output_dir="model",
-             num_train_epochs=self.EPOCHS,
-             per_device_train_batch_size=self.BATCH_SIZE,
-             learning_rate=self.LEARNING_RATE,
-             warmup_steps=self.WARMUP_STEPS,
-             weight_decay=self.WEIGHT_DECAY,
-             gradient_accumulation_steps=self.GRADIENT_ACCUMULATION_STEPS,
-             fp16=self.FP16,
-             save_steps=int(self.WARMUP_STEPS * 5),
-             logging_steps=int(self.WARMUP_STEPS),
-             save_total_limit=2,
-             report_to="none",
-         )

  config = Config()

- class Space:
-     def __init__(self):
-         self.api = HfApi()
-         self.pause = lambda: self.api.pause_space("nroggendorff/train-llama")
-
  class FineError(Exception):
      def __init__(self, message="Script execution has completed."):
          self.message = message
          super().__init__(self.message)

- def load_data():
-     if not config.INSTRUCT_FINETUNE_BOOL:
-         dataset = load_dataset(config.INPUT_DATASET, "cosmopedia-v2", split="train", streaming=True)
-     else:
-         dataset = load_dataset(config.INSTRUCT_DATASET, split="train", streaming=True)
-
-     start = config.INIT * config.SHARD_SIZE
-     data_list = list(islice(dataset, start, start + config.SHARD_SIZE))
-
-     dataset = Dataset.from_dict({'text': [example['text'] for example in data_list]})
-     return dataset
-
- def encode_decode(texts, tok):
-     if tok.pad_token is None:
-         tok.pad_token = tok.eos_token
-
-     tokenized_texts = tok(
-         texts,
-         padding="max_length",
-         truncation=True,
-         max_length=config.MAX_SEQ_LENGTH,
-         return_tensors="pt"
-     ).input_ids
-
-     if tokenized_texts.dim() >= 1:
-         decoded_texts = tok.batch_decode(tokenized_texts)
-     else:
-         print('Found invalid entry in examples. Returning dummy..')
-         decoded_texts = [tok.pad_token * config.MAX_SEQ_LENGTH]
-
-     islist = not len(decoded_texts) == 1
-
-     return decoded_texts if islist else decoded_texts[0]
-
- def create_tokenizer(training_corpus):
-     tokenizer = ByteLevelBPETokenizer()
-     special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]
-     tokenizer.train_from_iterator(
-         training_corpus,
-         vocab_size=config.VOCAB_SIZE,
-         min_frequency=2,
-         special_tokens=special_tokens
-     )
-     fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer._tokenizer)
-     return fast_tokenizer
-
- def load_tokenizer():
-     return AutoTokenizer.from_pretrained(config.OUTPUT_REPO + '-it' if config.INSTRUCT_FINETUNE_BOOL else config.OUTPUT_REPO)
-
- def get_training_corpus(dataset):
-     for i in range(0, len(dataset['text']), 1000):
-         yield dataset['text'][i : i + 1000]
-
- def format_prompts(examples, tokenizer, isinst):
-     texts = []
-     for text in examples['text']:
-         if text and len(text.strip()) > 0:
-             if isinst:
-                 conversation = []
-                 parts = text.split('<|end|>')
-                 for i in range(0, len(parts) - 1, 2):
-                     prompt = parts[i].replace("<|user|>", "").strip()
-                     response = parts[i + 1].replace("<|bot|>", "").strip()
-                     conversation.append({"role": "user", "content": prompt})
-                     conversation.append({"role": "assistant", "content": response})
-                 formatted_conversation = tokenizer.apply_chat_template(conversation, tokenize=False)
-                 coded_text = tokenizer.code(formatted_conversation)
-                 texts.append(coded_text)
-             else:
-                 texts.append(tokenizer.bos_token + tokenizer.code(text) + tokenizer.eos_token)
-         else:
-             print('Found empty entry in examples. Moving on..')
-             continue
-
-     if len(texts) == 0:
-         raise ValueError("No valid texts found in examples for formatting.")
-
-     coded_texts = tokenizer.code(texts)
-     return {'text': coded_texts}
-
- def create_model(tokenizer):
-     model_config = LlamaConfig(
-         vocab_size=tokenizer.vocab_size,
-         hidden_size=config.FACTOR,
-         intermediate_size=config.FACTOR * 4,
-         num_hidden_layers=config.FACTOR // 2 ** 4,
-         num_attention_heads=config.FACTOR // 2 ** 5,
-         max_position_embeddings=config.MAX_SEQ_LENGTH,
-         rms_norm_eps=1e-5,
-         initializer_range=2e-2,
-         use_cache=True,
-         pad_token_id=tokenizer.pad_token_id,
-         bos_token_id=tokenizer.bos_token_id,
-         eos_token_id=tokenizer.eos_token_id,
-         tie_word_embeddings=False,
-     )
-     return LlamaForCausalLM(model_config)
-
- def load_model():
-     return AutoModelForCausalLM.from_pretrained(config.OUTPUT_REPO + '-it' if config.INSTRUCT_FINETUNE_BOOL else config.OUTPUT_REPO)

- def configure_tokenizer(tokenizer):
-     special_tokens = {
-         "bos_token": "<s>",
-         "eos_token": "</s>",
-         "unk_token": "<unk>",
-         "pad_token": "<pad>",
-         "mask_token": "<mask>",
-         "additional_special_tokens": []
-     }
-     if config.INSTRUCT_FINETUNE_BOOL:
-         special_tokens["additional_special_tokens"] = ["<|user|>", "<|bot|>", "<|end|>"]
-     tokenizer.add_special_tokens(special_tokens)
-
-     if config.INSTRUCT_FINETUNE_BOOL:
-         tokenizer.user_token_id = tokenizer.convert_tokens_to_ids("<|user|>")
-         tokenizer.assistant_token_id = tokenizer.convert_tokens_to_ids("<|bot|>")
-
-         chat_template = "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '<|user|>\n' + message['content'] + '<|end|>\n' }}{% elif message['role'] == 'assistant' %}{{ '<|bot|>\n' + message['content'] + '<|end|>\n' + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
-         tokenizer.chat_template = chat_template
-
-     tokenizer.code = lambda example: encode_decode(example, tokenizer)
-
- def update_tokenizer(tokenizer, dataset, batch_size=1000):
-     existing_vocab = tokenizer.get_vocab()
-     oov_tokens = set()
-
-     for i in range(0, len(dataset['text']), batch_size):
-         batch = dataset['text'][i:i + batch_size]
-
-         for text in batch:
-             token_ids = tokenizer.encode(text, add_special_tokens=False)
-
-             for token_id in token_ids:
-                 token = tokenizer.decode([token_id])
-                 if token.strip() and token not in existing_vocab:
-                     oov_tokens.add(token)
-
-     if oov_tokens:
-         num_added = tokenizer.add_tokens(list(oov_tokens))
-         return num_added
-
-     return 0
-
- def train_model(model, tokenizer, dataset, push, isinst):
      args = config.getConfig()

      optimizer = AdamW(model.parameters(), lr=args.learning_rate, weight_decay=config.WEIGHT_DECAY)
      scheduler = get_cosine_schedule_with_warmup(
          optimizer,
          num_warmup_steps=args.warmup_steps,
-         num_training_steps=total_steps
      )

-     dataset = dataset.map(lambda examples: format_prompts(examples, tokenizer, isinst), batched=True, remove_columns=dataset.column_names)
-
-     if 'text' not in dataset.column_names:
-         raise ValueError("Dataset transformation failed: 'text' column missing after mapping.")
-
-     print("Mapped dataset sample length:", len(dataset[0]['text']))
-
-     try:
-         test_input = tokenizer(
-             ["This is a test input."],
-             return_tensors="pt",
-             padding="max_length",
-             truncation=True,
-             max_length=MAX_SEQ_LENGTH
-         )
-         test_output = model(**test_input)
-         print("Model test output shape:", test_output.logits.shape)
-     except RuntimeError as e:
-         print(f"Error processing test batch: {e}")
-
      trainer = SFTTrainer(
          model=model,
          tokenizer=tokenizer,
          args=args,
          train_dataset=dataset,
-         # dataset_text_field='text',
-         max_seq_length=config.MAX_SEQ_LENGTH,
          optimizers=(optimizer, scheduler)
      )

      train = trainer.train()

-     trained_model = trainer.model
-     trained_tokenizer = trainer.tokenizer
-
      if push:
          repo_id = config.OUTPUT_REPO + "-it" if config.INSTRUCT_FINETUNE_BOOL else config.OUTPUT_REPO
          msg = f"Training loss: {train.training_loss:.4f}"
-         trained_model.push_to_hub(repo_id, commit_message=msg, force=True)
-         trained_tokenizer.push_to_hub(repo_id, commit_message=msg, force=True)
-     else:
-         trained_model.save_pretrained("model")
-         trained_tokenizer.save_pretrained("tokenizer")
-
- def main(push_to_hub=True, is_inst_finetune=config.INSTRUCT_FINETUNE_BOOL):
-     print("Loading Data..")
-     dataset = load_data()
-     print("Loaded data.")
-
-     if is_inst_finetune and config.INIT > 0:
-         print("Loading Tokenizer..")
-         tokenizer = load_tokenizer()
-         print("Loaded Tokenizer.")
      else:
-         print("Making Corpus..")
-         training_corpus = get_training_corpus(dataset)
-         print("Made Corpus.")

-         print("Making Tokenizer..")
-         tokenizer = create_tokenizer(training_corpus)
-         print(f"Made Tokenizer with size {len(tokenizer)}.")

-         # print("Adding Tokens..")
-         # num_new_tokens = update_tokenizer(tokenizer, dataset)
-         # print(f"Added {num_new_tokens} new tokens to the vocabulary")
-
-     if config.INIT == 0:
-         print("Adding Special Tokens..")
-         configure_tokenizer(tokenizer)
-         print("Added Tokens.")
-
-     if is_inst_finetune or config.INIT > 0:
-         print("Loading Model..")
-         model = load_model()
-         print("Loaded Model.")
-     else:
-         print("Creating Model..")
-         model = create_model(tokenizer)
-         print("Created Model.")
-
-     print(f"Tokenizer vocabulary size: {len(tokenizer)}")
-     print(f"Special tokens: {tokenizer.special_tokens_map}")
-
-     print("Resizing Token Embeddings..")
-     try:
-         model.resize_token_embeddings(len(tokenizer))
-     except RuntimeError as e:
-         raise RuntimeError(f"Error resizing token embeddings: {e}")
-     print("Resized Embeddings.")

      print("Training Model..")
-     train_model(model, tokenizer, dataset, push_to_hub, is_inst_finetune)
      raise FineError("Trained Model.")

  if __name__ == "__main__":
      try:
          main()
      except Exception as e:
-         print(f'{type(e).__name__}: {e}')
-         Space().pause()
 
 
 
  import torch
+ from transformers import AutoTokenizer, AutoModelForCausalLM, AdamW, get_cosine_schedule_with_warmup
+ from trl import SFTTrainer
+ from datasets import load_from_disk
+ from config import Config

  config = Config()

  class FineError(Exception):
      def __init__(self, message="Script execution has completed."):
          self.message = message
          super().__init__(self.message)

+ def load_model(tokenizer):
+     model = AutoModelForCausalLM.from_pretrained(config.OUTPUT_REPO + '-it' if config.INSTRUCT_FINETUNE_BOOL else config.OUTPUT_REPO)
+     model.resize_token_embeddings(len(tokenizer))
+     return model

+ def train_model(model, tokenizer, dataset, push):
      args = config.getConfig()

      optimizer = AdamW(model.parameters(), lr=args.learning_rate, weight_decay=config.WEIGHT_DECAY)
      scheduler = get_cosine_schedule_with_warmup(
          optimizer,
          num_warmup_steps=args.warmup_steps,
+         num_training_steps=args.num_training_steps
      )

      trainer = SFTTrainer(
          model=model,
          tokenizer=tokenizer,
          args=args,
          train_dataset=dataset,
          optimizers=(optimizer, scheduler)
      )

      train = trainer.train()

      if push:
          repo_id = config.OUTPUT_REPO + "-it" if config.INSTRUCT_FINETUNE_BOOL else config.OUTPUT_REPO
          msg = f"Training loss: {train.training_loss:.4f}"
+         trainer.model.push_to_hub(repo_id, commit_message=msg, force=True)
+         trainer.tokenizer.push_to_hub(repo_id, commit_message=msg, force=True)
      else:
+         trainer.model.save_pretrained("trained_model")
+         trainer.tokenizer.save_pretrained("trained_tokenizer")

+ def main(push_to_hub=True):
+     print("Loading Prepared Data..")
+     dataset = load_from_disk("prepared_dataset")
+     tokenizer = AutoTokenizer.from_pretrained("prepared_tokenizer")
+     print("Loaded Prepared Data.")

+     print("Loading Model..")
+     model = load_model(tokenizer)
+     print("Loaded Model.")

      print("Training Model..")
+     train_model(model, tokenizer, dataset, push_to_hub)
      raise FineError("Trained Model.")

  if __name__ == "__main__":
      try:
          main()
      except Exception as e:
+         print(f'{type(e).__name__}: {e}')
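
This commit strips train.py down to the training step: the in-file Config class, the Space auto-pause helper, and all data and tokenizer preparation code are removed, and the script now depends on a separate config module plus prepared_dataset and prepared_tokenizer directories, none of which are part of this diff. As a rough sketch only, assuming the removed in-file Config class moved to config.py largely unchanged, the new config module might look like the following; the num_training_steps attribute is an assumption added here because the refactored train_model reads args.num_training_steps, which SFTConfig does not define on its own.

config.py (hypothetical sketch):

# config.py -- hypothetical sketch; the real module is not part of this commit.
# Values mirror the Config class that this commit removes from train.py.
from trl import SFTConfig

class Config:
    def __init__(self):
        # Model and training hyperparameters (as in the removed in-file Config)
        self.BATCH_SIZE = 16
        self.EPOCHS = 3
        self.LEARNING_RATE = 2e-4
        self.MAX_SEQ_LENGTH = 512
        self.VOCAB_SIZE = 32000
        self.FP16 = True
        self.WEIGHT_DECAY = 1e-3
        self.GRADIENT_ACCUMULATION_STEPS = self.BATCH_SIZE // 4

        # Dataset, output, and schedule settings
        self.INPUT_DATASET = "HuggingFaceTB/smollm-corpus"
        self.INSTRUCT_DATASET = "nroggendorff/elephant"
        self.SHARD_SIZE = int(2e+5)
        self.OUTPUT_REPO = "nroggendorff/smallama"
        self.PUSH_TO_HUB = True
        self.INSTRUCT_FINETUNE_BOOL = False
        self.FACTOR = 12 ** 3 // 2
        self.TOTAL_STEPS = (self.SHARD_SIZE * self.EPOCHS) // (self.BATCH_SIZE * self.GRADIENT_ACCUMULATION_STEPS)
        self.WARMUP_STEPS = int(self.TOTAL_STEPS * 0.1)
        self.INIT = 0

        self.getConfig = lambda: self._args()

    def _args(self):
        args = SFTConfig(
            output_dir="model",
            num_train_epochs=self.EPOCHS,
            per_device_train_batch_size=self.BATCH_SIZE,
            learning_rate=self.LEARNING_RATE,
            warmup_steps=self.WARMUP_STEPS,
            weight_decay=self.WEIGHT_DECAY,
            gradient_accumulation_steps=self.GRADIENT_ACCUMULATION_STEPS,
            fp16=self.FP16,
            save_steps=int(self.WARMUP_STEPS * 5),
            logging_steps=int(self.WARMUP_STEPS),
            save_total_limit=2,
            report_to="none",
        )
        # Assumption: the refactored train_model reads args.num_training_steps,
        # which SFTConfig does not provide, so expose the computed total here.
        args.num_training_steps = self.TOTAL_STEPS
        return args

The prepared_dataset and prepared_tokenizer paths likewise point at outputs of a preparation step that no longer lives in train.py; presumably a companion script ends with dataset.save_to_disk("prepared_dataset") and tokenizer.save_pretrained("prepared_tokenizer"), matching the load_from_disk and AutoTokenizer.from_pretrained calls in the new main().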