Reduced the number of parameters
app.py CHANGED
@@ -13,8 +13,13 @@ dataset = load_dataset("mwitiderrick/swahili")
 # Print dataset columns for verification
 print(f"Dataset columns: {dataset['train'].column_names}")
 
+# Select a subset of the dataset (e.g., first 100,000 rows)
+subset_size = 100000  # Adjust the size as needed
+subset_dataset = dataset["train"].select(range(min(subset_size, len(dataset["train"]))))
+print(f"Using a subset of {len(subset_dataset)} rows for training.")
+
 # Initialize the tokenizer and model
-model_name = "gpt2"  # Use
+model_name = "gpt2"  # Use GPT-2 for text generation
 tokenizer = GPT2Tokenizer.from_pretrained(model_name)
 model = GPT2LMHeadModel.from_pretrained(model_name)
 
@@ -24,21 +29,21 @@ tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
 
 # Preprocess the dataset
 def preprocess_function(examples):
+    # Tokenize and format the dataset
     encodings = tokenizer(
-        examples['text'],
+        examples['text'],  # Use 'text' column from your dataset
         truncation=True,
-        padding='max_length',
+        padding='max_length',  # Ensure consistent length
         max_length=512
     )
-    encodings['labels'] = encodings['input_ids']
+    encodings['labels'] = encodings['input_ids']  # Use input_ids directly as labels
     return encodings
 
 # Tokenize the dataset
 try:
-    tokenized_datasets =
+    tokenized_datasets = subset_dataset.map(
         preprocess_function,
-        batched=True
-        batch_size=1000  # Adjust batch size for efficiency
+        batched=True
     )
 except Exception as e:
     print(f"Error during tokenization: {e}")
@@ -46,21 +51,20 @@ except Exception as e:
 # Define training arguments
 training_args = TrainingArguments(
     output_dir='./results',
-    per_device_train_batch_size=
+    per_device_train_batch_size=4,
     num_train_epochs=1,
     logging_dir='./logs',
-    logging_steps=500,
-    evaluation_strategy="steps",
-    save_steps=
-    save_total_limit=2,
-    gradient_accumulation_steps=8,  # Accumulate gradients to simulate larger batch size
+    logging_steps=500,  # Log every 500 steps
+    evaluation_strategy="steps",  # Use evaluation strategy
+    save_steps=10_000,  # Save checkpoint every 10,000 steps
+    save_total_limit=2,  # Keep only the last 2 checkpoints
 )
 
 # Define Trainer
 trainer = Trainer(
     model=model,
     args=training_args,
-    train_dataset=tokenized_datasets
+    train_dataset=tokenized_datasets,
     tokenizer=tokenizer,
 )
 
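For readers skimming the commit: the core of this change is the subset-then-tokenize pattern (select a slice of the training split, then map the tokenizer over it in batches). The snippet below is a minimal, self-contained sketch of that pattern and is not part of the commit: it swaps in a small in-memory toy dataset so it runs without downloading mwitiderrick/swahili, and the names toy_rows, subset_size, and the short max_length are illustrative assumptions.

from datasets import Dataset
from transformers import GPT2Tokenizer

# Toy stand-in for the real corpus (assumption; the commit uses "mwitiderrick/swahili")
toy_rows = {"text": [f"sample sentence number {i}" for i in range(1000)]}
dataset = Dataset.from_dict(toy_rows)

# Keep at most the first N rows, mirroring the subset selection added above
subset_size = 100  # the commit uses 100000 on the real corpus
subset_dataset = dataset.select(range(min(subset_size, len(dataset))))

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 defines no pad token by default

def preprocess_function(examples):
    encodings = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=32,  # short length to keep the toy run fast
    )
    encodings["labels"] = encodings["input_ids"]  # causal LM: labels mirror inputs
    return encodings

tokenized = subset_dataset.map(preprocess_function, batched=True)
print(tokenized)

One hedged note on the training arguments: evaluation_strategy="steps" normally expects an eval_dataset to be passed to Trainer, and none appears in this diff, so evaluation may need to be disabled or an eval split supplied for training to run cleanly.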