dahongj committed on
Commit
05d66dd
Parents: 54d2e13, 43e755d

Merge pull request #26 from dahongj/milestone-3

Files changed (1)
  1. finetune.py +9 -0
finetune.py CHANGED
@@ -10,11 +10,13 @@ from transformers import DistilBertForSequenceClassification, AdamW
 
 model_name = "distilbert-base-uncased"
 
+# Read the training data
 df = pd.read_csv('train.csv')
 train_texts = df["comment_text"].values
 train_labels = df[df.columns[2:]].values
 train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.2)
 
+# Dataset class to hold the texts and labels and encode them
 class TextDataset(Dataset):
     def __init__(self, texts, labels):
         self.texts = texts
@@ -30,21 +32,26 @@ class TextDataset(Dataset):
     def __len__(self):
         return len(self.labels)
 
+# Tokenizer for the current model
 tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
 
+# Set up the train and validation datasets
 train_dataset = TextDataset(train_texts, train_labels)
 val_dataset = TextDataset(val_texts, val_labels)
 
 device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
 
+# Use a multi-label model because there are 6 labels to finetune for
 model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=6, problem_type="multi_label_classification")
 model.to(device)
 model.train()
 
+# Training parameters: batch size 16, learning rate 5e-5
 train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
 
 optim = AdamW(model.parameters(), lr=5e-5)
 
+# Finetuning loop
 for epoch in range(1):
     for batch in train_loader:
         optim.zero_grad()
@@ -59,6 +66,8 @@ for epoch in range(1):
 
 model.eval()
 
+# Save the trained model to a directory
 model.save_pretrained("sentiment_custom_model")
 
+# Save the tokenizer to a directory
 tokenizer.save_pretrained("sentiment_tokenizer")
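A note on the model setup in the second hunk: problem_type="multi_label_classification" switches the classification head's loss to BCEWithLogitsLoss, computed inside the model whenever labels are passed, and it expects float targets (one 0/1 entry per label) rather than a single class index. A minimal self-contained check of that behavior — the label vector here is made up for illustration:

import torch
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast

model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(
    model_name, num_labels=6, problem_type="multi_label_classification"
)

# Float targets, one entry per label; illustrative values, not real annotations
enc = tokenizer(["an example comment"], truncation=True, return_tensors="pt")
labels = torch.tensor([[1.0, 0.0, 0.0, 0.0, 1.0, 0.0]])

# Passing labels makes the model compute BCEWithLogitsLoss over the 6 logits
out = model(**enc, labels=labels)
print(out.loss, out.logits.shape)  # scalar loss; logits of shape (1, 6)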
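A quick way to smoke-test the saved artifacts is to load the two directories back and push one comment through the model. This is a reviewer's sketch, not part of the commit, and the six label names assume train.csv follows the Jigsaw toxic-comment layout — the script only reads df.columns[2:], so adjust the list to the actual columns:

import torch
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast

# Load the directories written at the end of finetune.py
model = DistilBertForSequenceClassification.from_pretrained("sentiment_custom_model")
tokenizer = DistilBertTokenizerFast.from_pretrained("sentiment_tokenizer")
model.eval()

# Assumed label order (columns 2..7 of train.csv); adjust if the CSV differs
LABELS = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

def classify(text):
    enc = tokenizer(text, truncation=True, return_tensors="pt")
    with torch.no_grad():
        logits = model(**enc).logits.squeeze(0)
    # Multi-label head: an independent sigmoid per label, not a softmax
    return {label: float(p) for label, p in zip(LABELS, torch.sigmoid(logits))}

print(classify("have a nice day"))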