Jainesh212 committed on
Commit
a85c8ad
·
1 Parent(s): 54f24e6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +62 -61
app.py CHANGED
@@ -5,82 +5,84 @@ import torch
5
  import torch.nn as nn
6
  import torch.nn.functional as F
7
  from torch.utils.data import Dataset, DataLoader
8
- from transformers import AutoTokenizer, AutoModel
9
  import random
10
  from bs4 import BeautifulSoup
11
  import re
 
12
  from transformers import AutoModelForSequenceClassification
13
  import pytorch_lightning as pl
14
 
15
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
16
 
17
  train_path = "train.csv"
18
- val_path = "val.csv"
19
  test_path = "test.csv"
20
  test_labels_paths = "test_labels.csv"
21
  test_df = pd.read_csv(test_path)
22
  test_labels_df = pd.read_csv(test_labels_paths)
23
- test_df = pd.concat([test_df.iloc[:, 1], test_labels_df.iloc[:, 1:]], axis=1)
24
  test_df.to_csv("test-dataset.csv")
25
  test_dataset_path = "test-dataset.csv"
26
 
27
- # Lets make a new column labeled "healthy"
28
  def healthy_filter(df):
29
- if (df["toxic"] == 0) and (df["severe_toxic"] == 0) and (df["obscene"] == 0) and (df["threat"] == 0) and (df["insult"] == 0) and (df["identity_hate"] == 0):
30
- return 1
31
- else:
32
- return 0
33
 
34
- attributes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate', 'healthy']
 
35
 
36
  class Comments_Dataset(Dataset):
37
- def __init__(self, data_path, tokenizer, attributes, max_token_len=128, sample=5000):
38
- self.data_path = data_path
39
- self.tokenizer = tokenizer
40
- self.attributes = attributes
41
- self.max_token_len = max_token_len
42
- self.sample = sample
43
- self._prepare_data()
44
-
45
- def _prepare_data(self):
46
- data = pd.read_csv(self.data_path)
47
- data["healthy"] = data.apply(healthy_filter, axis=1)
48
- data["unhealthy"] = np.where(data['healthy'] == 1, 0, 1)
49
- if self.sample is not None:
50
- unhealthy = data.loc[data["healthy"] == 0]
51
- healthy = data.loc[data["healthy"] == 1]
52
- self.data = pd.concat([unhealthy, healthy.sample(self.sample, random_state=42)])
53
- else:
54
- self.data = data
55
-
56
- def __len__(self):
57
- return len(self.data)
58
-
59
- def __getitem__(self, index):
60
- item = self.data.iloc[index]
61
- comment = str(item.comment_text)
62
- attributes = torch.FloatTensor(item[self.attributes])
63
- tokens = self.tokenizer.encode_plus(comment,
64
- add_special_tokens=True,
65
- return_tensors='pt',
66
- truncation=True,
67
- padding='max_length',
68
- max_length=self.max_token_len,
69
- return_attention_mask=True)
70
- return {'input_ids': tokens.input_ids.flatten(), 'attention_mask': tokens.attention_mask.flatten(), 'labels': attributes}
71
 
72
 
73
  class Comments_Data_Module(pl.LightningDataModule):
74
- def __init__(self, train_path, val_path, attributes, batch_size: int = 16, max_token_length: int = 128, model_name='roberta-base'):
75
- super().__init__()
76
- self.train_path = train_path
77
- self.val_path = val_path
78
- self.attributes = attributes
79
- self.batch_size = batch_size
80
- self.max_token_length = max_token_length
81
- self.tokenizer = AutoTokenizer.from_pre
82
-
83
- def setup(self, stage = None):
 
 
84
  if stage in (None, "fit"):
85
  self.train_dataset = Comments_Dataset(self.train_path, attributes=self.attributes, tokenizer=self.tokenizer)
86
  self.val_dataset = Comments_Dataset(self.val_path, attributes=self.attributes, tokenizer=self.tokenizer, sample=None)
@@ -101,7 +103,6 @@ comments_data_module.setup()
101
  comments_data_module.train_dataloader()
102
 
103
  class Comment_Classifier(pl.LightningModule):
104
- #the config dict has the hugginface parameters in it
105
  def __init__(self, config: dict):
106
  super().__init__()
107
  self.config = config
@@ -113,10 +114,8 @@ class Comment_Classifier(pl.LightningModule):
113
  self.dropout = nn.Dropout()
114
 
115
  def forward(self, input_ids, attention_mask, labels=None):
116
- # roberta layer
117
  output = self.pretrained_model(input_ids=input_ids, attention_mask=attention_mask)
118
  pooled_output = torch.mean(output.last_hidden_state, 1)
119
- # final logits / classification layers
120
  pooled_output = self.dropout(pooled_output)
121
  pooled_output = self.hidden(pooled_output)
122
  pooled_output = F.relu(pooled_output)
@@ -148,7 +147,7 @@ class Comment_Classifier(pl.LightningModule):
148
  warmup_steps = math.floor(total_steps * self.config['warmup'])
149
  scheduler = get_cosine_schedule_with_warmup(optimizer, warmup_steps, total_steps)
150
  return [optimizer],[scheduler]
151
-
152
  config = {
153
  'model_name': 'distilroberta-base',
154
  'n_labels': len(attributes),
@@ -160,7 +159,7 @@ config = {
160
  'n_epochs': 100
161
  }
162
 
163
- ##tokenizer
164
  model_name = 'distilroberta-base'
165
  tokenizer = AutoTokenizer.from_pretrained(model_name)
166
 
@@ -196,8 +195,10 @@ def run_inference(encoding):
196
  final_output = torch.softmax(output[1][0],dim=0).cpu()
197
  print(final_output.numpy().tolist())
198
  return final_output.numpy().tolist()
199
-
 
200
  test_tweets = test_df["comment_text"].values
 
201
  models = ["distilroberta-base"]
202
  model_pointers = ["default: distilroberta-base"]
203
 
@@ -207,8 +208,10 @@ with st.form(key="init_form"):
207
  current_random_tweet = test_tweets[random.randint(0,len(test_tweets))]
208
  current_random_tweet = prepare_tokenized_review(current_random_tweet)
209
 
 
210
  choice = st.selectbox("Choose Model", model_pointers)
211
 
 
212
  user_picked_model = models[model_pointers.index(choice)]
213
  with st.spinner("Analyzing..."):
214
  text_encoding = get_encodings(current_random_tweet)
@@ -217,8 +220,6 @@ with st.form(key="init_form"):
217
  df["Highest Toxicity Class"] = attributes[result.index(max(result))]
218
  df["Sentiment Score"] = max(result)
219
  st.table(df)
220
-
221
-
222
  next_tweet = st.form_submit_button("Next Tweet")
223
 
224
  if next_tweet:
 
5
  import torch.nn as nn
6
  import torch.nn.functional as F
7
  from torch.utils.data import Dataset, DataLoader
8
+ from transformers import AutoTokenizer,AutoModel
9
  import random
10
  from bs4 import BeautifulSoup
11
  import re
12
+
13
  from transformers import AutoModelForSequenceClassification
14
  import pytorch_lightning as pl
15
 
16
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
17
 
18
  train_path = "train.csv"
 
19
  test_path = "test.csv"
20
  test_labels_paths = "test_labels.csv"
21
  test_df = pd.read_csv(test_path)
22
  test_labels_df = pd.read_csv(test_labels_paths)
23
+ test_df = pd.concat([test_df.iloc[:, 1], test_labels_df.iloc[:, 1:]], axis = 1)
24
  test_df.to_csv("test-dataset.csv")
25
  test_dataset_path = "test-dataset.csv"
26
 
 
27
  def healthy_filter(df):
28
+ if (df["toxic"]==0) and (df["severe_toxic"]==0) and (df["obscene"]==0) and (df["threat"]==0) and (df["insult"]==0) and (df["identity_hate"]==0):
29
+ return 1
30
+ else:
31
+ return 0
32
 
33
+ attributes = ['toxic', 'severe_toxic', 'obscene', 'threat',
34
+ 'insult', 'identity_hate', 'healthy']
35
 
36
  class Comments_Dataset(Dataset):
37
+ def __init__(self, data_path, tokenizer, attributes, max_token_len = 128, sample=5000):
38
+ self.data_path = data_path
39
+ self.tokenizer = tokenizer
40
+ self.attributes = attributes
41
+ self.max_token_len = max_token_len
42
+ self.sample = sample
43
+ self._prepare_data()
44
+
45
+ def _prepare_data(self):
46
+ data = pd.read_csv(self.data_path)
47
+ data["healthy"] = data.apply(healthy_filter,axis=1)
48
+ data["unhealthy"] = np.where(data['healthy']==1, 0, 1)
49
+ if self.sample is not None:
50
+ unhealthy = data.loc[data["healthy"] == 0]
51
+ healthy = data.loc[data["healthy"] ==1]
52
+ self.data = pd.concat([unhealthy, healthy.sample(self.sample, random_state=42)])
53
+ else:
54
+ self.data = data
55
+
56
+ def __len__(self):
57
+ return len(self.data)
58
+
59
+ def __getitem__(self,index):
60
+ item = self.data.iloc[index]
61
+ comment = str(item.comment_text)
62
+ attributes = torch.FloatTensor(item[self.attributes])
63
+ tokens = self.tokenizer.encode_plus(comment,
64
+ add_special_tokens=True,
65
+ return_tensors='pt',
66
+ truncation=True,
67
+ padding='max_length',
68
+ max_length=self.max_token_len,
69
+ return_attention_mask = True)
70
+ return {'input_ids': tokens.input_ids.flatten(), 'attention_mask': tokens.attention_mask.flatten(), 'labels': attributes}
71
 
72
 
73
  class Comments_Data_Module(pl.LightningDataModule):
74
+
75
+ def __init__(self, train_path, val_path, attributes, batch_size: int = 16, max_token_length: int = 128, model_name='roberta-base'):
76
+ super().__init__()
77
+ self.train_path = train_path
78
+ self.val_path = val_path
79
+ self.attributes = attributes
80
+ self.batch_size = batch_size
81
+ self.max_token_length = max_token_length
82
+ self.model_name = model_name
83
+ self.tokenizer = AutoTokenizer.from_pretrained(model_name)
84
+
85
+ def setup(self, stage = None):
86
  if stage in (None, "fit"):
87
  self.train_dataset = Comments_Dataset(self.train_path, attributes=self.attributes, tokenizer=self.tokenizer)
88
  self.val_dataset = Comments_Dataset(self.val_path, attributes=self.attributes, tokenizer=self.tokenizer, sample=None)
 
103
  comments_data_module.train_dataloader()
104
 
105
  class Comment_Classifier(pl.LightningModule):
 
106
  def __init__(self, config: dict):
107
  super().__init__()
108
  self.config = config
 
114
  self.dropout = nn.Dropout()
115
 
116
  def forward(self, input_ids, attention_mask, labels=None):
 
117
  output = self.pretrained_model(input_ids=input_ids, attention_mask=attention_mask)
118
  pooled_output = torch.mean(output.last_hidden_state, 1)
 
119
  pooled_output = self.dropout(pooled_output)
120
  pooled_output = self.hidden(pooled_output)
121
  pooled_output = F.relu(pooled_output)
 
147
  warmup_steps = math.floor(total_steps * self.config['warmup'])
148
  scheduler = get_cosine_schedule_with_warmup(optimizer, warmup_steps, total_steps)
149
  return [optimizer],[scheduler]
150
+
151
  config = {
152
  'model_name': 'distilroberta-base',
153
  'n_labels': len(attributes),
 
159
  'n_epochs': 100
160
  }
161
 
162
+
163
  model_name = 'distilroberta-base'
164
  tokenizer = AutoTokenizer.from_pretrained(model_name)
165
 
 
195
  final_output = torch.softmax(output[1][0],dim=0).cpu()
196
  print(final_output.numpy().tolist())
197
  return final_output.numpy().tolist()
198
+
199
+
200
  test_tweets = test_df["comment_text"].values
201
+ #streamlit section
202
  models = ["distilroberta-base"]
203
  model_pointers = ["default: distilroberta-base"]
204
 
 
208
  current_random_tweet = test_tweets[random.randint(0,len(test_tweets))]
209
  current_random_tweet = prepare_tokenized_review(current_random_tweet)
210
 
211
+
212
  choice = st.selectbox("Choose Model", model_pointers)
213
 
214
+
215
  user_picked_model = models[model_pointers.index(choice)]
216
  with st.spinner("Analyzing..."):
217
  text_encoding = get_encodings(current_random_tweet)
 
220
  df["Highest Toxicity Class"] = attributes[result.index(max(result))]
221
  df["Sentiment Score"] = max(result)
222
  st.table(df)
 
 
223
  next_tweet = st.form_submit_button("Next Tweet")
224
 
225
  if next_tweet: