smiraldr committed
Commit 547e73d
1 Parent(s): fea93d8

Updated Model Tutorial Code

Files changed (1)
  1. README.md +15 -30
README.md CHANGED
@@ -74,56 +74,41 @@ Use the code below to get started with the model for general finetuning tasks. P
 ```
 import torch
 from datasets import load_dataset, load_metric
-from transformers import AutoTokenizer, AutoModel, TrainingArguments, Trainer
 import evaluate
-metric = evaluate.load("accuracy")
+import numpy as np  # compute_metrics below calls np.argmax
+from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
+
+# Load the tokenizer and model (a sequence-classification head is needed for CoLA)
+tokenizer = AutoTokenizer.from_pretrained("Koodsml/KooBERT")
+model = AutoModelForSequenceClassification.from_pretrained("Koodsml/KooBERT", num_labels=2)
+
+metric = evaluate.load("accuracy")  # referenced by compute_metrics
+
 def compute_metrics(eval_pred):
     logits, labels = eval_pred
     predictions = np.argmax(logits, axis=-1)
     return metric.compute(predictions=predictions, references=labels)
 
-# Load the CoLA dataset
-cola_dataset = load_dataset("glue", "cola")
-
-cola_dataset = cola_dataset.rename_column('label', 'labels')
-cola_dataset = cola_dataset.rename_column('sentence', 'text')
-
-# Load the tokenizer and model
-tokenizer = AutoTokenizer.from_pretrained("Koodsml/KooBERT")
-model = AutoModel.from_pretrained("Koodsml/KooBERT", num_labels=2)
-
 def tokenize_function(examples):
-    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)
+    return tokenizer(examples["text"], padding='max_length', truncation=True, max_length=128)
 
-cola_dataset = cola_dataset.map(tokenize_function, batched=True)
+# Load the CoLA dataset
+dataset = load_dataset("glue", "cola")
+dataset = dataset.rename_column('sentence', 'text')
+
+dataset_tok = dataset.map(tokenize_function, batched=True)
 
 # Set the device
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model.to(device)
 
 # Define the training arguments
-training_args = TrainingArguments(
-    output_dir='./results',
-    evaluation_strategy='epoch',
-    # eval_steps=100,
-    save_total_limit=1,
-    learning_rate=2e-5,
-    per_device_train_batch_size=8,
-    per_device_eval_batch_size=8,
-    num_train_epochs=3,
-    weight_decay=0.01,
-    push_to_hub=False,
-)
+training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")
 
 # Define the trainer
 trainer = Trainer(
     model=model,
     args=training_args,
-    train_dataset=cola_dataset['train'],
-    eval_dataset=cola_dataset['validation'],
-    # tokenizer=tokenizer,
-    compute_metrics=compute_metrics
+    train_dataset=dataset_tok['train'],
+    eval_dataset=dataset_tok['validation'],
+    compute_metrics=compute_metrics,
 )
 
 # Fine-tune on the CoLA dataset
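
The hunk ends at the `# Fine-tune on the CoLA dataset` comment, so the actual training call lives outside the changed region. As a minimal sketch of how the updated snippet would be driven end to end — the `trainer.train()` and `trainer.evaluate()` lines below are an assumed continuation of the README, not part of this commit:

```python
# Assumed continuation (outside the edited hunk): run fine-tuning, then
# score the CoLA validation split; compute_metrics supplies the accuracy.
trainer.train()
eval_results = trainer.evaluate()
print(eval_results)
```

Since `evaluation_strategy="epoch"` is set, accuracy is also computed at the end of every training epoch; the final `evaluate()` call just reports the numbers for the finished model.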
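
For completeness, a hedged inference sketch with the fine-tuned checkpoint — the sample sentence and the 0 = unacceptable / 1 = acceptable reading (the usual GLUE CoLA label convention) are illustrative assumptions rather than lines from the README:

```python
import torch

# Illustrative single-sentence inference with the fine-tuned classifier.
model.eval()
inputs = tokenizer(
    "The book was read by the whole class.",
    return_tensors="pt", padding="max_length", truncation=True, max_length=128,
).to(device)
with torch.no_grad():
    logits = model(**inputs).logits
label = logits.argmax(dim=-1).item()  # 0 = unacceptable, 1 = acceptable (GLUE convention)
print("acceptable" if label == 1 else "unacceptable")
```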