andyqin18 committed on
Commit
668f6af
·
1 Parent(s): f96a9e8

Test Table

Browse files
Files changed (3) hide show
  1. app.py +45 -7
  2. milestone3/milestone3.py +16 -73
  3. requirements.txt +2 -1
app.py CHANGED
@@ -1,14 +1,20 @@
1
  import streamlit as st
 
 
2
  from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
3
 
 
 
 
 
4
  # Define analyze function
5
- def analyze(model_name: str, text: str) -> dict:
6
  '''
7
  Output result of sentiment analysis of a text through a defined model
8
  '''
9
  model = AutoModelForSequenceClassification.from_pretrained(model_name)
10
  tokenizer = AutoTokenizer.from_pretrained(model_name)
11
- classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
12
  return classifier(text)
13
 
14
  # App title
@@ -18,7 +24,7 @@ st.write("Currently it uses pre-trained models without fine-tuning.")
18
 
19
  # Model hub
20
  model_descrip = {
21
- "andyqin18/test-finetuned": "This is a customized BERT-base finetuned model that detects multiple toxicity for a text. \
22
  Labels: toxic, severe_toxic, obscene, threat, insult, identity_hate",
23
  "distilbert-base-uncased-finetuned-sst-2-english": "This model is a fine-tune checkpoint of DistilBERT-base-uncased, fine-tuned on SST-2. \
24
  Labels: POSITIVE; NEGATIVE ",
@@ -28,6 +34,27 @@ model_descrip = {
28
  Labels: POS; NEU; NEG"
29
  }
30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  user_input = st.text_input("Enter your text:", value="NYU is the better than Columbia.")
32
  user_model = st.selectbox("Please select a model:", model_descrip)
33
 
@@ -35,16 +62,27 @@ user_model = st.selectbox("Please select a model:", model_descrip)
35
  st.write("### Model Description:")
36
  st.write(model_descrip[user_model])
37
 
 
 
 
38
  # Perform analysis and print result
39
  if st.button("Analyze"):
40
  if not user_input:
41
  st.write("Please enter a text.")
42
  else:
43
  with st.spinner("Hang on.... Analyzing..."):
44
- result = analyze(user_model, user_input)
45
- st.write("Result:")
46
- st.write(f"Label: **{result[0]['label']}**")
47
- st.write(f"Confidence Score: **{result[0]['score']}**")
 
 
 
 
 
 
 
 
48
 
49
  else:
50
  st.write("Go on! Try the app!")
 
1
  import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
  from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
5
 
6
+
7
+ fine_tuned_model = "andyqin18/test-finetuned"
8
+ sample_text_num = 10
9
+
10
  # Define analyze function
11
+ def analyze(model_name: str, text: str, top_k=1) -> dict:
12
  '''
13
  Output result of sentiment analysis of a text through a defined model
14
  '''
15
  model = AutoModelForSequenceClassification.from_pretrained(model_name)
16
  tokenizer = AutoTokenizer.from_pretrained(model_name)
17
+ classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, top_k=top_k)
18
  return classifier(text)
19
 
20
  # App title
 
24
 
25
  # Model hub
26
  model_descrip = {
27
+ fine_tuned_model: "This is a customized BERT-base finetuned model that detects multiple toxicity for a text. \
28
  Labels: toxic, severe_toxic, obscene, threat, insult, identity_hate",
29
  "distilbert-base-uncased-finetuned-sst-2-english": "This model is a fine-tune checkpoint of DistilBERT-base-uncased, fine-tuned on SST-2. \
30
  Labels: POSITIVE; NEGATIVE ",
 
34
  Labels: POS; NEU; NEG"
35
  }
36
 
37
+ df = pd.read_csv("/milestone3/comp/test_comment.csv")
38
+ test_texts = df["comment_text"].values
39
+ sample_texts = np.random.choice(test_texts, size=sample_text_num, replace=False)
40
+
41
+ init_table_dict = {
42
+ "Text": [],
43
+ "Highest Toxicity Class": [],
44
+ "Highest Score": [],
45
+ "Second Highest Toxicity Class": [],
46
+ "Second Highest Score": []
47
+ }
48
+
49
+ for text in sample_texts:
50
+ result = analyze(fine_tuned_model, text, top_k=2)
51
+ init_table_dict["Text"].append(text[:50])
52
+ init_table_dict["Highest Toxicity Class"].append(result[0][0]['label'])
53
+ init_table_dict["Highest Score"].append(result[0][0]['score'])
54
+ init_table_dict["Second Highest Toxicity Class"].append(result[0][1]['label'])
55
+ init_table_dict["Second Highest Score"].append(result[0][1]['score'])
56
+
57
+
58
  user_input = st.text_input("Enter your text:", value="NYU is the better than Columbia.")
59
  user_model = st.selectbox("Please select a model:", model_descrip)
60
 
 
62
  st.write("### Model Description:")
63
  st.write(model_descrip[user_model])
64
 
65
+
66
+
67
+
68
  # Perform analysis and print result
69
  if st.button("Analyze"):
70
  if not user_input:
71
  st.write("Please enter a text.")
72
  else:
73
  with st.spinner("Hang on.... Analyzing..."):
74
+ if user_model == fine_tuned_model:
75
+ result = analyze(user_model, user_input, top_k=2)
76
+
77
+
78
+ df = pd.DataFrame(init_table_dict)
79
+ st.dataframe(df)
80
+
81
+ else:
82
+ result = analyze(user_model, user_input)
83
+ st.write("Result:")
84
+ st.write(f"Label: **{result[0]['label']}**")
85
+ st.write(f"Confidence Score: **{result[0]['score']}**")
86
 
87
  else:
88
  st.write("Go on! Try the app!")
milestone3/milestone3.py CHANGED
@@ -1,82 +1,25 @@
1
  # from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
2
 
3
- # import torch
4
- # import torch.nn.functional as F
 
 
 
 
 
 
5
 
6
- # model_name = "andyqin18/test-finetuned"
7
 
8
- # model = AutoModelForSequenceClassification.from_pretrained(model_name)
9
- # tokenizer = AutoTokenizer.from_pretrained(model_name)
10
 
11
- # classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
12
 
13
- # res = classifier(["Fuck your mom",
14
- # "Hope you don't hate it"])
15
 
16
- # for result in res:
17
- # print(result)
18
  import pandas as pd
19
- from sklearn.model_selection import train_test_split
20
- import torch
21
- from torch.utils.data import Dataset
22
- from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
23
- device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
24
  import numpy as np
25
-
26
- df = pd.read_csv("comp/train.csv")
27
-
28
- train_texts = df["comment_text"].values
29
- train_labels = df[df.columns[2:]].values
30
- # print(train_labels[0])
31
-
32
- # np.random.seed(123)
33
- # small_train_texts = np.random.choice(train_texts, size=1000, replace=False)
34
- # small_train_labels_idx = np.random.choice(train_labels.shape[0], size=1000, replace=False)
35
- # small_train_labels = train_labels[small_train_labels_idx, :]
36
-
37
-
38
- # train_texts, val_texts, train_labels, val_labels = train_test_split(small_train_texts, small_train_labels, test_size=.2)
39
- train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.2)
40
-
41
- class TextDataset(Dataset):
42
- def __init__(self,texts,labels):
43
- self.texts = texts
44
- self.labels = labels
45
-
46
- def __getitem__(self,idx):
47
- encodings = tokenizer(self.texts[idx], truncation=True, padding="max_length")
48
- item = {key: torch.tensor(val) for key, val in encodings.items()}
49
- item['labels'] = torch.tensor(self.labels[idx],dtype=torch.float32)
50
- del encodings
51
- return item
52
-
53
- def __len__(self):
54
- return len(self.labels)
55
-
56
- tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
57
- train_dataset = TextDataset(train_texts,train_labels)
58
- val_dataset = TextDataset(val_texts, val_labels)
59
- # small_train_dataset = train_dataset.shuffle(seed=42).select(range(1000))
60
- # small_val_dataset = val_dataset.shuffle(seed=42).select(range(1000))
61
-
62
-
63
-
64
- model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=6, problem_type="multi_label_classification")
65
- model.to(device)
66
- training_args = TrainingArguments(
67
- output_dir="finetuned-bert-uncased",
68
- per_device_train_batch_size=16,
69
- per_device_eval_batch_size=64,
70
- learning_rate=5e-4,
71
- weight_decay=0.01,
72
- evaluation_strategy="epoch",
73
- push_to_hub=True)
74
-
75
- trainer = Trainer(
76
- model=model,
77
- args=training_args,
78
- train_dataset=train_dataset,
79
- eval_dataset=val_dataset,
80
- )
81
-
82
- trainer.train()
 
1
  # from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
2
 
3
+ # def analyze(model_name: str, text: str, top_k=1) -> dict:
4
+ # '''
5
+ # Output result of sentiment analysis of a text through a defined model
6
+ # '''
7
+ # model = AutoModelForSequenceClassification.from_pretrained(model_name)
8
+ # tokenizer = AutoTokenizer.from_pretrained(model_name)
9
+ # classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, top_k=top_k)
10
+ # return classifier(text)
11
 
 
12
 
13
+ # user_input = "Go fuck yourself"
14
+ # user_model = "andyqin18/test-finetuned"
15
 
16
+ # result = analyze(user_model, user_input, top_k=4)
17
 
18
+ # print(result[0][0]['label'])
 
19
 
 
 
20
  import pandas as pd
 
 
 
 
 
21
  import numpy as np
22
+ df = pd.read_csv("milestone3/comp/test_comment.csv")
23
+ test_texts = df["comment_text"].values
24
+ sample_texts = np.random.choice(test_texts, size=10, replace=False)
25
+ print(sample_texts)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -1,2 +1,3 @@
1
  transformers
2
- torch
 
 
1
  transformers
2
+ torch
3
+ pandas