File size: 2,895 Bytes
2ad9bb3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 |
import os
import sys
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
class GPTAssistant:
    """Load a causal language model with its tokenizer and fine-tune it on a text file.

    The tokenizer and model are loaded once in ``__init__``; ``fine_tune``
    trains the loaded model on a plain-text file and writes the resulting
    model and tokenizer to an output directory.
    """

    def __init__(self, model_name="/Users/migueldeguzman/Desktop/papercliptodd/falcon-1b/base_model/"):
        """Load the tokenizer and the causal LM identified by ``model_name``.

        Args:
            model_name: Path or identifier passed to ``from_pretrained`` for
                both the tokenizer and the model. The default is a
                machine-specific local path; replace it with your own
                checkpoint location.

        Any exception raised while loading is printed and the process exits
        with status 1 (``SystemExit``).
        """
        try:
            # NOTE(review): trust_remote_code is enabled for the tokenizer only,
            # not for the model - confirm this asymmetry is intentional.
            self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
            self.model = AutoModelForCausalLM.from_pretrained(model_name)
        except Exception as e:
            print(f"Error initializing the model or tokenizer: {e}")
            sys.exit(1)

    @staticmethod
    def _warmup_steps(num_examples, epochs, per_device_batch_size,
                      gradient_accumulation_steps, warmup_fraction=0.1):
        """Return the number of optimizer updates to spend on LR warmup.

        The learning-rate scheduler advances once per optimizer update, not
        once per training example, so the warmup length has to be derived
        from the update count: examples are grouped into batches, and one
        update happens every ``gradient_accumulation_steps`` batches.

        This is a single-device estimate; with several devices the real
        update count is smaller, so the warmup share would exceed
        ``warmup_fraction`` - TODO confirm if multi-device training is used.

        Args:
            num_examples: Number of examples in the training dataset.
            epochs: Number of training epochs (may be fractional).
            per_device_batch_size: Examples per forward/backward pass.
            gradient_accumulation_steps: Batches accumulated per update.
            warmup_fraction: Share of all updates used for warmup.

        Returns:
            A non-negative ``int`` number of warmup updates.
        """
        import math  # local import keeps this block self-contained

        batches_per_epoch = math.ceil(num_examples / per_device_batch_size)
        # At least one update per epoch, even for a tiny dataset.
        updates_per_epoch = max(1, math.ceil(batches_per_epoch / gradient_accumulation_steps))
        total_updates = math.ceil(updates_per_epoch * epochs)
        return int(warmup_fraction * total_updates)

    def fine_tune(self, answer_file_path, model_output_dir, epochs=1.0):
        """Fine-tune the loaded model on a text file and save the result.

        Args:
            answer_file_path: Path of the plain-text training file.
            model_output_dir: Directory used for trainer output and for the
                final saved model and tokenizer.
            epochs: Number of training epochs.

        If the dataset cannot be built, the error is printed and the process
        exits with status 1 (``SystemExit``).
        """
        # Load dataset for training
        try:
            train_dataset = TextDataset(
                tokenizer=self.tokenizer,
                file_path=answer_file_path,
                block_size=128
            )
        except Exception as e:
            print(f"Error loading training dataset: {e}")
            sys.exit(1)  # Exit the script if dataset loading fails

        # Prepare data collator for causal (non-masked) language modeling
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer,
            mlm=False
        )

        # Named once so the warmup computation and TrainingArguments agree.
        per_device_batch_size = 4
        gradient_accumulation_steps = 8

        # Warm up over 10% of the optimizer updates. Counting dataset examples
        # instead (as was done before) makes the warmup longer than the whole
        # run, so the learning rate would never reach its configured peak.
        warmup_steps = self._warmup_steps(
            len(train_dataset),
            epochs,
            per_device_batch_size,
            gradient_accumulation_steps,
        )

        # Set training arguments
        training_args = TrainingArguments(
            output_dir=model_output_dir,
            overwrite_output_dir=True,
            num_train_epochs=epochs,
            per_device_train_batch_size=per_device_batch_size,
            save_steps=10_000,
            save_total_limit=2,
            weight_decay=0.001,
            gradient_accumulation_steps=gradient_accumulation_steps,
            learning_rate=3e-6,  # previously 15e-6 then 1e-6 then 7e-6
            lr_scheduler_type='cosine',
            warmup_steps=warmup_steps
        )

        # Initialize Trainer
        trainer = Trainer(
            model=self.model,
            args=training_args,
            data_collator=data_collator,
            train_dataset=train_dataset
        )

        # Train and save the model together with its tokenizer
        trainer.train()
        self.model.save_pretrained(model_output_dir)
        self.tokenizer.save_pretrained(model_output_dir)
def main():
    """Fine-tune the default model on the hard-coded training text.

    Both locations below are machine-specific absolute paths; replace them
    with your own training file and output directory before running.
    """
    # Directory that receives the fine-tuned model and tokenizer.
    output_dir = "/Users/migueldeguzman/Desktop/papercliptodd/falcon-1b/v1/"
    # Plain-text file used as training data.
    training_text = "/Users/migueldeguzman/Desktop/papercliptodd/falcon-1b/v1/awakening.text"

    # GPTAssistant() with no arguments loads from its default model path.
    GPTAssistant().fine_tune(training_text, output_dir)
# Start fine-tuning only when this file is executed as a script, so that
# importing the module does not trigger a training run.
if __name__ == "__main__":
    main()
|