from transformers import Trainer, TrainingArguments, T5ForConditionalGeneration, T5Tokenizer
from datasets import load_dataset
import json
from pathlib import Path

# Load the raw dataset with `datasets` for a quick sanity check.
# The examples actually passed to the Trainer are built manually from the
# JSON files below, so this object is only used to report a record count.
dataset = load_dataset("./files/")
# Load the T5 model and tokenizer from a local directory
model_path = "t5-small-model"
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)

# Define the training arguments
training_args = TrainingArguments(
    output_dir="./output1",  # Specify the output directory for model checkpoints and predictions
    save_steps=100,
    per_device_train_batch_size=4,  # Adjust the batch size based on your GPU memory
    save_total_limit=2,  # Limit the total number of checkpoints to save
    num_train_epochs=3,  # Specify the number of training epochs
    logging_dir="./logs",  # Specify the directory for Tensorboard logs
)
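
# Assumed data layout (illustrative only): each JSON file under ./files/ holds a
# list of records whose "input" and "target" fields are either a single string
# or a list of strings, e.g.
#
#   [
#       {"input": "summarize: The cat sat on the mat.", "target": "A cat sat."},
#       {"input": ["part one", "part two"], "target": "combined summary"}
#   ]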

# Define format_dataset function
def format_dataset(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
        print(f"File content:\n{content}\n")

    try:
        data_list = json.loads(content)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
        return None

    formatted_examples = []

    for data in data_list:
        input_texts = data.get("input")
        targets = data.get("target")

        # Convert to lists if not already
        if not isinstance(input_texts, list):
            input_texts = [input_texts]
        if not isinstance(targets, list):
            targets = [targets]

        # Concatenate the texts in the list
        input_text_concatenated = " ".join(input_texts)
        target_text_concatenated = " ".join(targets)

        # Encode concatenated texts with padding and truncation
        inputs = tokenizer(
            input_text_concatenated,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
            max_length=512
        )
        labels = tokenizer(
            target_text_concatenated,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
            max_length=512
        )


        # Use the target token ids as labels, masking pad positions with -100
        # so padding is ignored by the loss (standard practice for T5 fine-tuning)
        label_ids = labels["input_ids"]
        label_ids[label_ids == tokenizer.pad_token_id] = -100
        inputs["labels"] = label_ids

        formatted_examples.append(inputs)

    return formatted_examples
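
# A sketch of an alternative (not used here): tokenize without fixed-length
# padding and let a collator pad each batch dynamically, masking label padding
# with -100, e.g.
#
#   from transformers import DataCollatorForSeq2Seq
#   data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id=-100)
#   # ...and pass data_collator=data_collator to the Trainer below.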

# Process each JSON file, calling format_dataset once per file
data_files = Path("./files/").rglob("*.json")
formatted_examples = []
for file_path in data_files:
    file_examples = format_dataset(file_path)
    if file_examples is not None:
        formatted_examples.extend(file_examples)

# Create the final dataset, dropping the batch dimension added by return_tensors="pt"
train_dataset = [
    {
        "input_ids": example["input_ids"][0],
        "attention_mask": example["attention_mask"][0],
        "labels": example["labels"][0],
    }
    for example in formatted_examples
]


# Instantiate the Trainer (the tokenizer is passed so it is saved alongside checkpoints)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,  # Pass the tokenizer to the Trainer
    # ... other Trainer configurations ...
)

print(f"Number of examples in the training dataset: {len(dataset['train'])}")

# Print model configuration
print("Model Configuration:")
print(model.config)
# Run training
trainer.train()

# Save the model after training
model.save_pretrained("./output/fine-tuned-model")
tokenizer.save_pretrained("./output/fine-tuned-model")
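
# Minimal post-training sanity check (a sketch): reload the fine-tuned model and
# generate from one prompt. The prompt is purely illustrative and assumes the
# same task format as the training data.
fine_tuned_model = T5ForConditionalGeneration.from_pretrained("./output/fine-tuned-model")
fine_tuned_tokenizer = T5Tokenizer.from_pretrained("./output/fine-tuned-model")

sample_input = "summarize: The cat sat on the mat."  # hypothetical example prompt
sample_ids = fine_tuned_tokenizer(sample_input, return_tensors="pt").input_ids
generated_ids = fine_tuned_model.generate(sample_ids, max_length=64)
print(fine_tuned_tokenizer.decode(generated_ids[0], skip_special_tokens=True))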