art-manuh committed
Commit 23b1945 · verified · 1 Parent(s): bbdb395

Reduced the number of parameters

Files changed (1):
  app.py  +18 -14
app.py CHANGED
@@ -13,8 +13,13 @@ dataset = load_dataset("mwitiderrick/swahili")
 # Print dataset columns for verification
 print(f"Dataset columns: {dataset['train'].column_names}")
 
+# Select a subset of the dataset (e.g., first 100,000 rows)
+subset_size = 100000 # Adjust the size as needed
+subset_dataset = dataset["train"].select(range(min(subset_size, len(dataset["train"]))))
+print(f"Using a subset of {len(subset_dataset)} rows for training.")
+
 # Initialize the tokenizer and model
-model_name = "gpt2" # Use a smaller variant of GPT-2 for efficiency
+model_name = "gpt2" # Use GPT-2 for text generation
 tokenizer = GPT2Tokenizer.from_pretrained(model_name)
 model = GPT2LMHeadModel.from_pretrained(model_name)
 
@@ -24,21 +29,21 @@ tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
 
 # Preprocess the dataset
 def preprocess_function(examples):
+    # Tokenize and format the dataset
     encodings = tokenizer(
-        examples['text'],
+        examples['text'], # Use 'text' column from your dataset
         truncation=True,
-        padding='max_length',
+        padding='max_length', # Ensure consistent length
         max_length=512
     )
-    encodings['labels'] = encodings['input_ids']
+    encodings['labels'] = encodings['input_ids'] # Use input_ids directly as labels
     return encodings
 
 # Tokenize the dataset
 try:
-    tokenized_datasets = dataset.map(
+    tokenized_datasets = subset_dataset.map(
         preprocess_function,
-        batched=True,
-        batch_size=1000 # Adjust batch size for efficiency
+        batched=True
     )
 except Exception as e:
     print(f"Error during tokenization: {e}")
@@ -46,21 +51,20 @@ except Exception as e:
 # Define training arguments
 training_args = TrainingArguments(
     output_dir='./results',
-    per_device_train_batch_size=2, # Lowered batch size to prevent OOM errors
+    per_device_train_batch_size=4,
     num_train_epochs=1,
     logging_dir='./logs',
-    logging_steps=500,
-    evaluation_strategy="steps",
-    save_steps=5000, # Save checkpoints more frequently
-    save_total_limit=2,
-    gradient_accumulation_steps=8, # Accumulate gradients to simulate larger batch size
+    logging_steps=500, # Log every 500 steps
+    evaluation_strategy="steps", # Use evaluation strategy
+    save_steps=10_000, # Save checkpoint every 10,000 steps
+    save_total_limit=2, # Keep only the last 2 checkpoints
 )
 
 # Define Trainer
 trainer = Trainer(
     model=model,
     args=training_args,
-    train_dataset=tokenized_datasets["train"],
+    train_dataset=tokenized_datasets,
     tokenizer=tokenizer,
 )
 
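
For context, a minimal, self-contained sketch of how the pieces touched by this commit fit together after the change. The imports, dataset/model loading, and pad-token line are inferred from the unchanged context visible in the hunk headers (load_dataset("mwitiderrick/swahili"), tokenizer.pad_token_id = ...); setting pad_token to the EOS token and the final trainer.train() call are assumptions that do not appear in the diff, and evaluation_strategy is left out here (see the note below).

# Sketch only: reconstructed flow around this commit; not the verbatim app.py.
from datasets import load_dataset
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments

dataset = load_dataset("mwitiderrick/swahili")
print(f"Dataset columns: {dataset['train'].column_names}")

# Subset added in this commit: first 100,000 rows (or the whole split if smaller)
subset_size = 100000
subset_dataset = dataset["train"].select(range(min(subset_size, len(dataset["train"]))))

model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# GPT-2 ships without a pad token; assumption: app.py reuses the EOS token,
# which makes the pad_token_id line shown in the hunk context work.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)

def preprocess_function(examples):
    # Pad/truncate every example to 512 tokens and reuse input_ids as labels
    encodings = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    encodings["labels"] = encodings["input_ids"]
    return encodings

tokenized_datasets = subset_dataset.map(preprocess_function, batched=True)

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,
    num_train_epochs=1,
    logging_dir="./logs",
    logging_steps=500,
    save_steps=10_000,
    save_total_limit=2,
    # evaluation_strategy="steps" from the diff is omitted here; see the note below
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    tokenizer=tokenizer,
)

trainer.train()  # assumption: how training is launched is not shown in the diff

Because padding='max_length' fixes every example at 512 tokens and the labels simply mirror input_ids, padded positions contribute to the loss here, just as in the committed code.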
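
One interaction worth noting: the diff keeps evaluation_strategy="steps" in TrainingArguments, but the Trainer is still built with only a train_dataset. Depending on the transformers version, that combination fails either when the Trainer is constructed or at the first evaluation step. A hedged sketch of one way to satisfy it, carving a validation split out of the tokenized subset; the 90/10 split and the eval_steps value are assumptions, not part of the commit:

# Assumption: hold out 10% of the tokenized subset so step-based evaluation has data.
split = tokenized_datasets.train_test_split(test_size=0.1, seed=42)

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,
    num_train_epochs=1,
    logging_dir="./logs",
    logging_steps=500,
    evaluation_strategy="steps",  # newer transformers releases spell this eval_strategy
    eval_steps=500,               # assumption: evaluate as often as we log
    save_steps=10_000,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split["train"],
    eval_dataset=split["test"],
    tokenizer=tokenizer,
)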