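"""Fine-tune a local causal language model on a plain-text file using the Hugging Face Trainer API."""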
import sys
from transformers import AutoModelForCausalLM, AutoTokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

class GPTAssistant:
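    """Loads a local causal LM checkpoint and fine-tunes it on a plain-text file."""
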
    def __init__(self, model_name="/Users/migueldeguzman/Desktop/papercliptodd/falcon-1b/base_model/"):  # Replace with the path to your local base model
        try:
            # Load the tokenizer and model from the specified path or model name
            self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
            self.model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
        except Exception as e:
            print(f"Error initializing the model or tokenizer: {e}")
            sys.exit(1)

    def fine_tune(self, answer_file_path, model_output_dir, epochs=1.0):
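        """Fine-tune the loaded model on the text in answer_file_path and save the result to model_output_dir."""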
        # Load dataset for training
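        # Note: TextDataset is deprecated in recent transformers releases in favor of the `datasets`
        # library; if your installed version no longer provides it, swap in a datasets-based pipeline.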
        try:
            train_dataset = TextDataset(
                tokenizer=self.tokenizer,
                file_path=answer_file_path,
                block_size=128
            )
        except Exception as e:
            print(f"Error loading training dataset: {e}")
            sys.exit(1)  # Exit the script if dataset loading fails

        # Prepare data collator for causal language modeling (mlm=False disables masked-LM masking)
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer,
            mlm=False
        )

        # Warm up over roughly the first 10% of optimizer steps
        # (effective batch size = per-device batch 4 * gradient accumulation 8).
        total_steps = int((len(train_dataset) // (4 * 8)) * epochs)
        warmup_steps = max(1, int(0.1 * total_steps))

        # Set training arguments
        training_args = TrainingArguments(
            output_dir=model_output_dir,
            overwrite_output_dir=True,
            num_train_epochs=epochs,
            per_device_train_batch_size=4,
            save_steps=10_000,
            save_total_limit=2,
            weight_decay=0.001,
            gradient_accumulation_steps=8,
            learning_rate=3e-6,  # previously 15e-6, then 1e-6, then 7e-6
            lr_scheduler_type='cosine',
            warmup_steps=warmup_steps
        )

        # Initialize Trainer
        trainer = Trainer(
            model=self.model,
            args=training_args,
            data_collator=data_collator,
            train_dataset=train_dataset
        )

        # Train and save the model
        trainer.train()
        self.model.save_pretrained(model_output_dir)
        self.tokenizer.save_pretrained(model_output_dir)

def main():
    # Specify the file path for training data and output directory
    text_file_path = "/Users/migueldeguzman/Desktop/papercliptodd/falcon-1b/v1/awakening.text"  # Replace with your training data file path
    model_output_dir = "/Users/migueldeguzman/Desktop/papercliptodd/falcon-1b/v1/"  # Replace with your desired output directory
    
    # Initialize GPTAssistant and fine-tune the model
    assistant = GPTAssistant()
    assistant.fine_tune(text_file_path, model_output_dir)

if __name__ == "__main__":
    main()