In [1]:
! pip install transformers datasets evaluate



In [2]:
from transformers import AutoTokenizer

# Tokenizer for the same checkpoint name that the model cell loads.
MODEL_CHECKPOINT = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

data = pd.read_csv("ielts_writing_dataset_new.csv")

# Collapse the band scores in `label` into three ordered classes using half-open
# intervals:
#   band < 5.5        -> 0
#   5.5 <= band < 8   -> 1
#   band >= 8         -> 2
# Binning by threshold (rather than replacing each known band value one at a time)
# also classifies bands that were never enumerated, e.g. 2 or 2.5, which would
# otherwise pass through unmapped and be truncated by the int cast into class 2.
BAND_EDGES = [float("-inf"), 5.5, 8, float("inf")]
data["label"] = pd.cut(
    data["label"], bins=BAND_EDGES, right=False, labels=[0, 1, 2]
).astype(int)

# Fixed seed so the split (and therefore the reported metrics) can be reproduced;
# stratify so both splits keep the same class proportions.
train, test = train_test_split(
    data, test_size=0.2, random_state=42, stratify=data["label"]
)


In [4]:
# Preview the first ten rows after the label mapping.
data.head(10)

Unnamed: 0,label,text
0,1,"Between 1995 and 2010, a study was conducted r..."
1,1,Poverty represents a worldwide crisis. It is t...
2,0,The left chart shows the population change hap...
3,1,Human beings are facing many challenges nowada...
4,1,Information about the thousands of visits from...
5,1,Whether countries should only invest facilitie...
6,1,This graph depicts the changes in tourists vis...
7,1,"Sports is an essential part to most of us , so..."
8,2,The line graph illustrates the number of overs...
9,2,International sports events require the most w...


In [5]:
import datasets
from datasets import Dataset, DatasetDict

# Build the DatasetDict straight from the pandas splits without rebinding
# `train`/`test`, so this cell can be re-run without re-running the split cell.
# preserve_index=False stops the shuffled pandas index from being stored as an
# "__index_level_0__" column, so nothing has to be dropped afterwards.
dataset = DatasetDict(
    {
        "train": Dataset.from_pandas(train, preserve_index=False),
        "test": Dataset.from_pandas(test, preserve_index=False),
    }
)
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 1148
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 287
    })
})

In [6]:
# Show one held-out example to sanity-check the label/text fields.
held_out = dataset["test"]
held_out[0]

{'label': 1,
 'text': 'Everything has two sides and the globalization is not exception. Our first thoughts about this topic include the process of global “McDonaldisation” and, generally speaking, spreading across the whole Globe.Firstly, I would try to concentrate on the positive aspects of globalisation. As far as economy is concerned, like the Global Bank or IMF are always focused on developing the ‘Third World’ and helping poor people to combat their life obstacles (through loans and donations). Moreover, the world becomes an area of sharing thoughts (e.g. philosophical or economical doctrines), which become popular due to lack of barriers.However, disadvantages of globalization are also widely known. Some people insist that because of this process, the spirit of countries and nations rapidly disappears. The integrity, established years ago is on the verge of collapsing. Furthermore, there’s a strong lobby of communists who , that the globalization indicates an uncontrolled reign o

In [7]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` entry of a batch of examples.

    Args:
        examples: Mapping with a ``"text"`` entry holding the raw essay text(s).

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts when
        called with ``truncation=True``.
    """
    essay_texts = examples["text"]
    encoding = tokenizer(essay_texts, truncation=True)
    return encoding

In [8]:
# Tokenize every split; batched=True hands preprocess_function batches of examples
# instead of one example at a time.
tokenized_dataset = dataset.map(
    function=preprocess_function,
    batched=True,
)

Map:   0%|          | 0/1148 [00:00<?, ? examples/s]

Map:   0%|          | 0/287 [00:00<?, ? examples/s]

In [9]:
from transformers import DataCollatorWithPadding

# Collator that pads each batch with the tokenizer when batches are assembled.
data_collator = DataCollatorWithPadding(tokenizer)

In [10]:
# Inspect the training split to see which columns the tokenization step added.
tokenized_dataset["train"]

Dataset({
    features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1148
})

In [11]:
import evaluate

# Accuracy metric object used by compute_metrics during evaluation.
accuracy = evaluate.load(path="accuracy")

In [12]:
import numpy as np


def compute_metrics(eval_pred):
    """Score a batch of evaluation predictions with the ``accuracy`` metric.

    Args:
        eval_pred: A pair ``(predictions, labels)``; ``predictions`` holds one row
            of per-class scores per example.

    Returns:
        The result of ``accuracy.compute`` for the arg-max class of each row
        against ``labels``.
    """
    scores, references = eval_pred
    # Highest-scoring column per row is the predicted class id.
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)

In [13]:
# Class ids produced by the label mapping cell, with human-readable names.
# (An earlier variant used one class per individual band score, 14 classes in
# total; the three-class scheme below replaced it.)
id2label = {
    0: "Bad",
    1: "Acceptable",
    2: "Excellent",
}
# Inverse lookup, derived so the two mappings cannot drift apart.
label2id = {name: class_id for class_id, name in id2label.items()}



In [14]:
from transformers import BertForSequenceClassification, TrainingArguments, Trainer

# Load the pretrained encoder with a classification head sized to the label
# mapping (three classes).
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    id2label=id2label,
    label2id=label2id,
    num_labels=len(id2label),
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
! pip install transformers[torch]



In [16]:
from torch import nn

class ClassificationTrainer(Trainer):
    """Trainer variant that computes an explicit cross-entropy loss on the logits."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute cross-entropy between the model's logits and the batch labels.

        Args:
            model: The model being trained; called as ``model(**inputs)``.
            inputs: The collated batch. Labels are read from the ``"labels"`` key.
            return_outputs: When true, return ``(loss, outputs)`` instead of just
                the loss.
            num_items_in_batch: Accepted for compatibility with Trainer versions
                that pass it; unused here.

        Returns:
            The scalar loss, or ``(loss, outputs)`` when ``return_outputs`` is true.

        Raises:
            ValueError: If the batch carries no ``"labels"`` entry.
        """
        # The collated batch exposes the targets as "labels" (plural), not "label".
        labels = inputs.get("labels")
        if labels is None:
            raise ValueError('ClassificationTrainer.compute_loss needs a "labels" entry in the batch')
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss_fct = nn.CrossEntropyLoss()
        # Flatten explicitly to (N, num_classes) / (N,) rather than calling squeeze(),
        # which would also drop the batch dimension when the batch holds one example.
        loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [17]:
# Fine-tuning configuration: evaluate and save a checkpoint once per epoch, and ask
# the Trainer to restore the best checkpoint when training finishes.
training_args = TrainingArguments(
    output_dir="essayl0",
    num_train_epochs=15,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# NOTE(review): the "test" split is also the evaluation set that drives checkpoint
# selection, so the reported accuracy is not measured on untouched data.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.601437,0.752613
2,No log,0.444218,0.860627
3,No log,0.510611,0.815331
4,No log,0.723215,0.766551
5,No log,0.556284,0.850174
6,No log,0.783423,0.794425
7,0.275800,0.735923,0.850174
8,0.275800,0.654791,0.878049
9,0.275800,0.633503,0.888502
10,0.275800,1.105006,0.783972


TrainOutput(global_step=1080, training_loss=0.13700703542541576, metrics={'train_runtime': 1752.9066, 'train_samples_per_second': 9.824, 'train_steps_per_second': 0.616, 'total_flos': 4194210824632584.0, 'train_loss': 0.13700703542541576, 'epoch': 15.0})

In [18]:
!zip -r /content/checkpoint.zip /content/essayl0/checkpoint-1080/

  adding: content/essayl0/checkpoint-1080/ (stored 0%)
  adding: content/essayl0/checkpoint-1080/special_tokens_map.json (deflated 42%)
  adding: content/essayl0/checkpoint-1080/rng_state.pth (deflated 28%)
  adding: content/essayl0/checkpoint-1080/vocab.txt (deflated 53%)
  adding: content/essayl0/checkpoint-1080/tokenizer.json (deflated 71%)
  adding: content/essayl0/checkpoint-1080/config.json (deflated 50%)
  adding: content/essayl0/checkpoint-1080/trainer_state.json (deflated 78%)
  adding: content/essayl0/checkpoint-1080/pytorch_model.bin (deflated 7%)
  adding: content/essayl0/checkpoint-1080/optimizer.pt (deflated 21%)
  adding: content/essayl0/checkpoint-1080/training_args.bin (deflated 48%)
  adding: content/essayl0/checkpoint-1080/tokenizer_config.json (deflated 43%)
  adding: content/essayl0/checkpoint-1080/scheduler.pt (deflated 49%)
