Spaces:

Jainesh212
/

Milestone3

Runtime error

App Files Community

Jainesh212 committed on Apr 24, 2023

Commit

a85c8ad

1 Parent(s): 54f24e6

Update app.py

Browse files

Files changed (1) hide show

app.py +62 -61

app.py CHANGED Viewed

@@ -5,82 +5,84 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from torch.utils.data import Dataset, DataLoader
-from transformers import AutoTokenizer, AutoModel
 import random
 from bs4 import BeautifulSoup
 import re
 from transformers import AutoModelForSequenceClassification
 import pytorch_lightning as pl
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 train_path = "train.csv"
-val_path = "val.csv"
 test_path = "test.csv"
 test_labels_paths = "test_labels.csv"
 test_df = pd.read_csv(test_path)
 test_labels_df = pd.read_csv(test_labels_paths)
-test_df = pd.concat([test_df.iloc[:, 1], test_labels_df.iloc[:, 1:]], axis=1)
 test_df.to_csv("test-dataset.csv")
 test_dataset_path = "test-dataset.csv"
-# Lets make a new column labeled "healthy"
 def healthy_filter(df):
-    if (df["toxic"] == 0) and (df["severe_toxic"] == 0) and (df["obscene"] == 0) and (df["threat"] == 0) and (df["insult"] == 0) and (df["identity_hate"] == 0):
-        return 1
-    else:
-        return 0
-attributes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate', 'healthy']
 class Comments_Dataset(Dataset):
-    def __init__(self, data_path, tokenizer, attributes, max_token_len=128, sample=5000):
-        self.data_path = data_path
-        self.tokenizer = tokenizer
-        self.attributes = attributes
-        self.max_token_len = max_token_len
-        self.sample = sample
-        self._prepare_data()
-    def _prepare_data(self):
-        data = pd.read_csv(self.data_path)
-        data["healthy"] = data.apply(healthy_filter, axis=1)
-        data["unhealthy"] = np.where(data['healthy'] == 1, 0, 1)
-        if self.sample is not None:
-            unhealthy = data.loc[data["healthy"] == 0]
-            healthy = data.loc[data["healthy"] == 1]
-            self.data = pd.concat([unhealthy, healthy.sample(self.sample, random_state=42)])
-        else:
-            self.data = data
-    def __len__(self):
-        return len(self.data)
-    def __getitem__(self, index):
-        item = self.data.iloc[index]
-        comment = str(item.comment_text)
-        attributes = torch.FloatTensor(item[self.attributes])
-        tokens = self.tokenizer.encode_plus(comment,
-                                            add_special_tokens=True,
-                                            return_tensors='pt',
-                                            truncation=True,
-                                            padding='max_length',
-                                            max_length=self.max_token_len,
-                                            return_attention_mask=True)
-        return {'input_ids': tokens.input_ids.flatten(), 'attention_mask': tokens.attention_mask.flatten(), 'labels': attributes}
 class Comments_Data_Module(pl.LightningDataModule):
-    def __init__(self, train_path, val_path, attributes, batch_size: int = 16, max_token_length: int = 128, model_name='roberta-base'):
-        super().__init__()
-        self.train_path = train_path
-        self.val_path = val_path
-        self.attributes = attributes
-        self.batch_size = batch_size
-        self.max_token_length = max_token_length
-        self.tokenizer = AutoTokenizer.from_pre
-    def setup(self, stage = None):
     if stage in (None, "fit"):
       self.train_dataset = Comments_Dataset(self.train_path, attributes=self.attributes, tokenizer=self.tokenizer)
       self.val_dataset = Comments_Dataset(self.val_path, attributes=self.attributes, tokenizer=self.tokenizer, sample=None)
@@ -101,7 +103,6 @@ comments_data_module.setup()
 comments_data_module.train_dataloader()
 class Comment_Classifier(pl.LightningModule):
-#the config dict has the hugginface parameters in it
   def __init__(self, config: dict):
     super().__init__()
     self.config = config
@@ -113,10 +114,8 @@ class Comment_Classifier(pl.LightningModule):
     self.dropout = nn.Dropout()
   def forward(self, input_ids, attention_mask, labels=None):
-    # roberta layer
     output = self.pretrained_model(input_ids=input_ids, attention_mask=attention_mask)
     pooled_output = torch.mean(output.last_hidden_state, 1)
-    # final logits / classification layers
     pooled_output = self.dropout(pooled_output)
     pooled_output = self.hidden(pooled_output)
     pooled_output = F.relu(pooled_output)
@@ -148,7 +147,7 @@ class Comment_Classifier(pl.LightningModule):
     warmup_steps = math.floor(total_steps * self.config['warmup'])
     scheduler = get_cosine_schedule_with_warmup(optimizer, warmup_steps, total_steps)
     return [optimizer],[scheduler]
 config = {
     'model_name': 'distilroberta-base',
     'n_labels': len(attributes),
@@ -160,7 +159,7 @@ config = {
     'n_epochs': 100
 }
-##tokenizer
 model_name = 'distilroberta-base'
 tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -196,8 +195,10 @@ def run_inference(encoding):
     final_output = torch.softmax(output[1][0],dim=0).cpu()
     print(final_output.numpy().tolist())
     return final_output.numpy().tolist()
 test_tweets = test_df["comment_text"].values
 models = ["distilroberta-base"]
 model_pointers = ["default: distilroberta-base"]
@@ -207,8 +208,10 @@ with st.form(key="init_form"):
     current_random_tweet = test_tweets[random.randint(0,len(test_tweets))]
     current_random_tweet = prepare_tokenized_review(current_random_tweet)
     choice = st.selectbox("Choose Model", model_pointers)
     user_picked_model = models[model_pointers.index(choice)]
     with st.spinner("Analyzing..."):
         text_encoding = get_encodings(current_random_tweet)
@@ -217,8 +220,6 @@ with st.form(key="init_form"):
         df["Highest Toxicity Class"] = attributes[result.index(max(result))]
         df["Sentiment Score"] = max(result)
         st.table(df)
     next_tweet = st.form_submit_button("Next Tweet")
 if next_tweet:

 import torch.nn as nn
 import torch.nn.functional as F
 from torch.utils.data import Dataset, DataLoader
+from transformers import AutoTokenizer,AutoModel
 import random
 from bs4 import BeautifulSoup
 import re
 from transformers import AutoModelForSequenceClassification
 import pytorch_lightning as pl
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 train_path = "train.csv"
 test_path = "test.csv"
 test_labels_paths = "test_labels.csv"
 test_df = pd.read_csv(test_path)
 test_labels_df = pd.read_csv(test_labels_paths)
+test_df = pd.concat([test_df.iloc[:, 1], test_labels_df.iloc[:, 1:]], axis = 1)
 test_df.to_csv("test-dataset.csv")
 test_dataset_path = "test-dataset.csv"
 def healthy_filter(df):
+  if (df["toxic"]==0) and (df["severe_toxic"]==0) and (df["obscene"]==0) and (df["threat"]==0) and (df["insult"]==0) and (df["identity_hate"]==0):
+    return 1
+  else:
+    return 0
+attributes = ['toxic', 'severe_toxic', 'obscene', 'threat',
+       'insult', 'identity_hate', 'healthy']
 class Comments_Dataset(Dataset):
+  def __init__(self, data_path, tokenizer, attributes, max_token_len = 128, sample=5000):
+    self.data_path = data_path
+    self.tokenizer = tokenizer
+    self.attributes = attributes
+    self.max_token_len = max_token_len
+    self.sample = sample
+    self._prepare_data()
+  def _prepare_data(self):
+    data = pd.read_csv(self.data_path)
+    data["healthy"] = data.apply(healthy_filter,axis=1)
+    data["unhealthy"] = np.where(data['healthy']==1, 0, 1)
+    if self.sample is not None:
+      unhealthy = data.loc[data["healthy"] == 0]
+      healthy = data.loc[data["healthy"] ==1]
+      self.data = pd.concat([unhealthy, healthy.sample(self.sample, random_state=42)])
+    else:
+      self.data = data
+  def __len__(self):
+    return len(self.data)
+  def __getitem__(self,index):
+    item = self.data.iloc[index]
+    comment = str(item.comment_text)
+    attributes = torch.FloatTensor(item[self.attributes])
+    tokens = self.tokenizer.encode_plus(comment,
+                                      add_special_tokens=True,
+                                      return_tensors='pt',
+                                      truncation=True,
+                                      padding='max_length',
+                                      max_length=self.max_token_len,
+                                      return_attention_mask = True)
+    return {'input_ids': tokens.input_ids.flatten(), 'attention_mask': tokens.attention_mask.flatten(), 'labels': attributes}
 class Comments_Data_Module(pl.LightningDataModule):
+  def __init__(self, train_path, val_path, attributes, batch_size: int = 16, max_token_length: int = 128,  model_name='roberta-base'):
+    super().__init__()
+    self.train_path = train_path
+    self.val_path = val_path
+    self.attributes = attributes
+    self.batch_size = batch_size
+    self.max_token_length = max_token_length
+    self.model_name = model_name
+    self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+  def setup(self, stage = None):
     if stage in (None, "fit"):
       self.train_dataset = Comments_Dataset(self.train_path, attributes=self.attributes, tokenizer=self.tokenizer)
       self.val_dataset = Comments_Dataset(self.val_path, attributes=self.attributes, tokenizer=self.tokenizer, sample=None)
 comments_data_module.train_dataloader()
 class Comment_Classifier(pl.LightningModule):
   def __init__(self, config: dict):
     super().__init__()
     self.config = config
     self.dropout = nn.Dropout()
   def forward(self, input_ids, attention_mask, labels=None):
     output = self.pretrained_model(input_ids=input_ids, attention_mask=attention_mask)
     pooled_output = torch.mean(output.last_hidden_state, 1)
     pooled_output = self.dropout(pooled_output)
     pooled_output = self.hidden(pooled_output)
     pooled_output = F.relu(pooled_output)
     warmup_steps = math.floor(total_steps * self.config['warmup'])
     scheduler = get_cosine_schedule_with_warmup(optimizer, warmup_steps, total_steps)
     return [optimizer],[scheduler]
 config = {
     'model_name': 'distilroberta-base',
     'n_labels': len(attributes),
     'n_epochs': 100
 }
 model_name = 'distilroberta-base'
 tokenizer = AutoTokenizer.from_pretrained(model_name)
     final_output = torch.softmax(output[1][0],dim=0).cpu()
     print(final_output.numpy().tolist())
     return final_output.numpy().tolist()
 test_tweets = test_df["comment_text"].values
+#streamlit section
 models = ["distilroberta-base"]
 model_pointers = ["default: distilroberta-base"]
     current_random_tweet = test_tweets[random.randint(0,len(test_tweets))]
     current_random_tweet = prepare_tokenized_review(current_random_tweet)
     choice = st.selectbox("Choose Model", model_pointers)
     user_picked_model = models[model_pointers.index(choice)]
     with st.spinner("Analyzing..."):
         text_encoding = get_encodings(current_random_tweet)
         df["Highest Toxicity Class"] = attributes[result.index(max(result))]
         df["Sentiment Score"] = max(result)
         st.table(df)
     next_tweet = st.form_submit_button("Next Tweet")
 if next_tweet: