LiuYunhui committed on
Commit
08db26d
·
1 Parent(s): 5f1632a

Add application file

Browse files
README.md CHANGED
@@ -10,4 +10,6 @@ pinned: false
10
  license: openrail
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
10
  license: openrail
11
  ---
12
 
13
+ # Sentiment Analysis on Software Engineer Texts
14
+
15
+ This is a demo for our fine-tuned model [stackoverflow-roberta-base-sentiment](https://huggingface.co/Cloudy1225/stackoverflow-roberta-base-sentiment).
SOF4423.csv ADDED
The diff for this file is too large to render. See raw diff
 
__pycache__/sentiment_analyser.cpython-310.pyc ADDED
Binary file (3.52 kB). View file
 
app.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ from sentiment_analyser import RandomAnalyser, RoBERTaAnalyser, ChatGPTAnalyser
4
+ import matplotlib.pyplot as plt
5
+ from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
6
+
7
+
8
def plot_bar(value_counts):
    """Draw a horizontal bar chart of prediction frequencies.

    Args:
        value_counts: Object exposing ``.plot.barh(ax=...)``; the caller in
            this file passes the result of ``Series.value_counts()``.

    Returns:
        The matplotlib figure holding the chart.
    """
    fig, ax = plt.subplots(figsize=(6, 6))
    value_counts.plot.barh(ax=ax)
    # Annotate each bar with its count; skip when nothing was drawn so an
    # empty input does not fail on ``containers[0]``.
    if ax.containers:
        ax.bar_label(ax.containers[0])
    # Title the axes we own rather than pyplot's global "current axes",
    # which another callback may have changed in the meantime.
    ax.set_title('Frequency of Predictions')
    return fig
14
+
15
+
16
def plot_confusion_matrix(y_pred, y_true):
    """Plot a row-normalised confusion matrix for the three sentiment classes.

    Args:
        y_pred: Predicted labels.
        y_true: Ground-truth labels, aligned with ``y_pred``.

    Returns:
        The matplotlib figure holding the matrix.
    """
    labels = ['negative', 'neutral', 'positive']
    # Pin the label set: without it, a small sample that lacks one class
    # yields a 2x2 matrix that no longer matches the three display labels.
    cm = confusion_matrix(y_true, y_pred, labels=labels, normalize='true')
    fig, ax = plt.subplots(figsize=(6, 6))
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
    disp.plot(cmap="Blues", values_format=".2f", ax=ax, colorbar=False)
    # Title the axes we own instead of relying on pyplot's global state.
    ax.set_title("Normalized Confusion Matrix")
    return fig
24
+
25
+
26
def classify(num: int):
    """Sample ``num`` rows from the dataset and evaluate our model on them.

    Args:
        num: Number of rows to draw from the module-level ``df``.

    Returns:
        A tuple ``(samples, bar_figure, confusion_matrix_figure)`` where
        ``samples`` is the sampled frame with an added ``'Predict'`` column.
    """
    sampled = df.sample(num)
    texts = sampled['Text'].tolist()
    truth = sampled['Label']

    model = MODEL_MAPPING[OUR_MODEL]
    # Re-use the sampled index so predictions align row-by-row with the frame.
    predictions = pd.Series(model.predict(texts), index=sampled.index)
    sampled['Predict'] = predictions

    frequency_fig = plot_bar(predictions.value_counts())
    matrix_fig = plot_confusion_matrix(predictions, truth)
    return sampled, frequency_fig, matrix_fig
36
+
37
+
38
def analysis(Text):
    """Classify one sentence with every registered model.

    Args:
        Text: The sentence to analyse.

    Returns:
        A single-row DataFrame with one column per entry of
        ``MODEL_MAPPING``. Each cell holds the emoji for the predicted
        label, or the raw label when it has no emoji mapping.
    """
    keys = []
    values = []
    for name, model in MODEL_MAPPING.items():
        label = model.predict([Text])[0]
        keys.append(name)
        # A model may return text outside the three known labels; show it
        # verbatim instead of failing the whole request with a KeyError.
        values.append(SENTI_MAPPING.get(label, label))
    return pd.DataFrame([values], columns=keys)
45
+
46
+
47
# Analysers shown in the inference table; insertion order is the column order.
MODEL_MAPPING = dict(
    Random=RandomAnalyser(),
    RoBERTa=RoBERTaAnalyser(),
    ChatGPT=ChatGPTAnalyser(),
)

# Key into MODEL_MAPPING of the model used by the evaluation section.
OUR_MODEL = 'RoBERTa'

# Emoji displayed for each sentiment label.
SENTI_MAPPING = dict(negative='😭', neutral='😶', positive='🥰')

# Page title and introductory Markdown.
TITLE = "Sentiment Analysis on Software Engineer Texts"
DESCRIPTION = "".join([
    "这里是第16组“睿王和他的五个小跟班”软工三迭代三模型演示页面。",
    "模型链接:[Cloudy1225/stackoverflow-roberta-base-sentiment]",
    "(https://huggingface.co/Cloudy1225/stackoverflow-roberta-base-sentiment) ",
])

# Upper bound of the sample-size slider.
MAX_SAMPLES = 64

# Dataset sampled by ``classify``; it reads the 'Text' and 'Label' columns.
df = pd.read_csv('./SOF4423.csv')
71
+
72
with gr.Blocks(title=TITLE) as demo:
    # Page header.
    gr.HTML(f"<H1>{TITLE}</H1>")
    gr.Markdown(DESCRIPTION)

    # Section 1: classify a single sentence with every model.
    gr.HTML("<H2>Model Inference</H2>")
    gr.Markdown(
        "在左侧文本框中输入文本并按回车键,右侧将输出情感分析结果。"
        "这里我们展示了三种结果,分别是随机结果、模型结果和 ChatGPT 结果。"
    )
    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label='Input',
                placeholder="Enter a positive or negative sentence here...",
            )
        with gr.Column():
            # Placeholder row, replaced after the first submission.
            senti_output = gr.Dataframe(
                type="pandas",
                value=[['😋', '😋', '😋']],
                headers=list(MODEL_MAPPING.keys()),
                interactive=False,
            )
    # Pressing Enter in the textbox runs all models on the sentence.
    text_input.submit(
        analysis,
        inputs=text_input,
        outputs=senti_output,
        show_progress=True,
    )

    # Section 2: evaluate our model on a random sample of the dataset.
    gr.HTML("<H2>Model Evaluation</H2>")
    gr.Markdown(
        "这里是在 StackOverflow4423 数据集上评估我们的模型。"
        "滑动 Slider,将会从 StackOverflow4423 数据集中抽样出指定数量的样本,预测其情感标签。"
        "并根据预测结果绘制标签分布图和混淆矩阵。"
    )
    # Not referenced again in the visible code; kept as a module-level name.
    input_models = list(MODEL_MAPPING)
    input_n_samples = gr.Slider(
        label='Number of samples',
        minimum=4,
        maximum=MAX_SAMPLES,
        step=4,
        value=8,
    )

    with gr.Row():
        with gr.Column():
            bar_plot = gr.Plot(label='Predictions Frequency')
        with gr.Column():
            cm_plot = gr.Plot(label='Confusion Matrix')

    with gr.Row():
        dataframe = gr.Dataframe(type="pandas", wrap=True)

    # Changing the slider value re-samples and redraws table and plots.
    input_n_samples.change(
        fn=classify,
        inputs=input_n_samples,
        outputs=[dataframe, bar_plot, cm_plot],
    )

demo.launch()
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ pandas
2
+ gradio
3
+ openai
4
+ matplotlib
5
+ transformers
6
+ scikit-learn
sentiment_analyser.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import openai
3
+ import random
4
+ from transformers import pipeline
5
+
6
+
7
class RandomAnalyser:
    """Baseline analyser that answers with a randomly chosen label."""

    def __init__(self):
        # Labels the analyser picks from.
        self.LABELS = ['negative', 'neutral', 'positive']

    def predict(self, X: list):
        """Return one random label for each element of ``X``."""
        picks = []
        for _ in X:
            picks.append(random.choice(self.LABELS))
        return picks
13
+
14
+
15
class RoBERTaAnalyser:
    """Sentiment analyser backed by a Hugging Face text-classification pipeline."""

    def __init__(self):
        # Loads the fine-tuned checkpoint named below via ``transformers.pipeline``.
        self.analyser = pipeline(task="sentiment-analysis", model="Cloudy1225/stackoverflow-roberta-base-sentiment")

    def predict(self, X: list):
        """Return the predicted label for each text in ``X``.

        Each text is preprocessed and classified on its own. Over-long
        inputs are truncated by the tokenizer instead of crashing the model.
        """
        sentiments = []
        for text in X:
            cleaned = RoBERTaAnalyser.preprocess(text)
            # NOTE(review): 512 assumes the usual roberta-base position
            # limit — confirm against the checkpoint's tokenizer config.
            prediction = self.analyser(cleaned, truncation=True, max_length=512)
            sentiments.append(prediction[0]['label'])
        return sentiments

    @staticmethod
    def preprocess(text):
        """Replace user mentions and links with placeholders.

        Space-separated tokens starting with ``@`` (and longer than one
        character) become ``@user``; tokens starting with ``http`` become
        ``http``. Leading and trailing whitespace is stripped. Line breaks
        inside the text are left as they are.
        """
        new_text = []
        for t in text.split(' '):
            t = '@user' if t.startswith('@') and len(t) > 1 else t
            t = 'http' if t.startswith('http') else t
            new_text.append(t)
        return ' '.join(new_text).strip()
36
+
37
+
38
class ChatGPTAnalyser:
    """Sentiment analyser that asks an OpenAI chat model for a label.

    API keys are never stored in source. They are read from the environment:
    ``OPENAI_API_KEYS`` holds a comma-separated list of keys, and
    ``OPENAI_API_KEY`` is used as a single-key fallback. When neither is
    set, ``openai.api_key`` is left as the library configured it.
    """

    def __init__(self):
        import os  # local import: keeps this fix self-contained in the class

        self.MODEL = "gpt-3.5-turbo"
        raw_keys = os.environ.get("OPENAI_API_KEYS") or os.environ.get("OPENAI_API_KEY", "")
        # Keys to rotate between; empty when nothing is configured.
        self.KEYs = [key.strip() for key in raw_keys.split(",") if key.strip()]
        # Retry policy for rate-limit / API errors: bounded so that a
        # persistent failure surfaces instead of blocking forever.
        self.MAX_RETRIES = 5
        self.RETRY_DELAY_SECONDS = 60
        self.TASK_NAME = 'Sentiment Classification'
        self.TASK_DEFINITION = 'Given the sentence, assign a sentiment label from [negative, neutral, positive].'
        self.OUT_FORMAT = 'Return label only without any other text.'
        self.PROMPT_PREFIX = f"Please perform {self.TASK_NAME} task.{self.TASK_DEFINITION}{self.OUT_FORMAT}\nSentence:\n{{}}\nLabel:"

    @staticmethod
    def _clean_label(raw):
        """Normalise a model reply: lower-case, trim whitespace, quotes and punctuation."""
        return raw.strip().lower().strip(" \t\r\n.!\"'`")

    def _complete(self, messages):
        """Send one chat request, retrying on rate-limit and API errors.

        Raises:
            The last ``openai.error.RateLimitError`` / ``APIError`` once
            ``MAX_RETRIES`` attempts have failed.
        """
        attempts_left = max(1, self.MAX_RETRIES)
        while True:
            try:
                response = openai.ChatCompletion.create(
                    model=self.MODEL,
                    messages=messages,
                    temperature=0,
                    n=1,
                    stop=None
                )
            except (openai.error.RateLimitError, openai.error.APIError):
                attempts_left -= 1
                if attempts_left <= 0:
                    raise
                time.sleep(self.RETRY_DELAY_SECONDS)
            else:
                return response.choices[0].message.content

    def predict(self, X: list):
        """Return one normalised label string per text in ``X``."""
        sentiments = []
        for text in X:
            prompt = self.PROMPT_PREFIX.format(text)
            messages = [{"role": "user", "content": prompt}]
            # Spread requests across the configured keys, if any.
            if self.KEYs:
                openai.api_key = random.choice(self.KEYs)
            sentiments.append(self._clean_label(self._complete(messages)))
        return sentiments