Commit 24180f4 · Zekun Wu committed · 1 parent: 8e6aee2
Commit message: update

Files changed:
- pages/2_Injection_Multiple.py  +3 -3
- pages/4_Injection_Single.py    +3 -3
- util/analysis.py               +0 -226
- util/generation.py             +20 -32
pages/2_Injection_Multiple.py CHANGED

@@ -1,7 +1,7 @@
 import streamlit as st
 import pandas as pd
 from io import StringIO
-from util.generation import
+from util.generation import process_scores
 from util.model import AzureAgent, GPTAgent
 
 # Set up the Streamlit interface
@@ -74,9 +74,9 @@ if st.session_state.model_submitted:
     # Process data and display results
     with st.spinner('Processing data...'):
         parameters = {"temperature": st.session_state.temperature, "max_tokens": st.session_state.max_tokens}
-        df =
+        df = process_scores(df, st.session_state.num_run, parameters, st.session_state.privilege_label,
                             st.session_state.protect_label, agent, st.session_state.group_name,
-                            st.session_state.occupation)
+                            st.session_state.occupation, test_type='multiple')
         st.session_state.data_processed = True  # Mark as processed
 
     st.write('Processed Data:', df)
pages/4_Injection_Single.py CHANGED

@@ -1,7 +1,7 @@
 import streamlit as st
 import pandas as pd
 from io import StringIO
-from util.generation import
+from util.generation import process_scores
 from util.model import AzureAgent, GPTAgent
 
 # Set up the Streamlit interface
@@ -73,9 +73,9 @@ if st.session_state.model_submitted:
     # Process data and display results
    with st.spinner('Processing data...'):
         parameters = {"temperature": st.session_state.temperature, "max_tokens": st.session_state.max_tokens}
-        df =
+        df = process_scores(df, st.session_state.num_run, parameters, st.session_state.counterfactual_label,
                             agent, st.session_state.group_name,
-                            st.session_state.occupation)
+                            st.session_state.occupation,test_type='multiple')
         st.session_state.data_processed = True  # Mark as processed
 
     st.write('Processed Data:', df)
util/analysis.py CHANGED

@@ -3,232 +3,6 @@ import numpy as np
 from scipy.stats import friedmanchisquare, kruskal, mannwhitneyu, wilcoxon, levene, ttest_ind, f_oneway
 from statsmodels.stats.multicomp import MultiComparison
 
-def statistical_tests_multiple(data):
-    # Calculate average ranks
-    average_ranks = data[['Privilege_Rank', 'Protect_Rank', 'Neutral_Rank']].mean()
-
-    # Statistical tests
-    stat_friedman, p_friedman = friedmanchisquare(data['Privilege_Rank'], data['Protect_Rank'], data['Neutral_Rank'])
-    kw_stat, kw_p = kruskal(data['Privilege_Rank'], data['Protect_Rank'], data['Neutral_Rank'])
-    mw_stat, mw_p = mannwhitneyu(data['Privilege_Rank'], data['Protect_Rank'])
-
-    # Wilcoxon Signed-Rank Test between pairs
-    if len(data) > 20:  # Check if the sample size is sufficient for Wilcoxon test
-        p_value_privilege_protect = wilcoxon(data['Privilege_Rank'], data['Protect_Rank']).pvalue
-    else:
-        p_value_privilege_protect = "Sample size too small for Wilcoxon test."
-
-    # Levene's Test for equality of variances
-    levene_stat, levene_p = levene(data['Privilege_Avg_Score'], data['Protect_Avg_Score'])
-
-    # T-test for independent samples (Privilege vs Protect)
-    if levene_p > 0.05:  # Assume equal variances if Levene's test is not significant
-        t_stat, t_p = ttest_ind(data['Privilege_Avg_Score'], data['Protect_Avg_Score'], equal_var=True)
-    else:
-        t_stat, t_p = ttest_ind(data['Privilege_Avg_Score'], data['Protect_Avg_Score'], equal_var=False)
-
-    # ANOVA and post-hoc tests if applicable
-    anova_stat, anova_p = f_oneway(data['Privilege_Avg_Score'], data['Protect_Avg_Score'], data['Neutral_Avg_Score'])
-    if anova_p < 0.05:
-        mc = MultiComparison(
-            pd.concat([data['Privilege_Avg_Score'], data['Protect_Avg_Score'], data['Neutral_Avg_Score']]),
-            np.repeat(['Privilege', 'Protect', 'Neutral'], len(data)))
-        tukey_result = mc.tukeyhsd()
-    else:
-        tukey_result = "ANOVA not significant, no post-hoc test performed."
-
-    results = {
-        "Average Ranks": average_ranks,
-        "Friedman Test": {"Statistic": stat_friedman, "p-value": p_friedman},
-        "Kruskal-Wallis Test": {"Statistic": kw_stat, "p-value": kw_p},
-        "Mann-Whitney U Test": {"Statistic": mw_stat, "p-value": mw_p},
-        "Wilcoxon Test Between Privilege and Protect": p_value_privilege_protect,
-        "Levene's Test": {"Statistic": levene_stat, "p-value": levene_p},
-        "T-Test (Independent)": {"Statistic": t_stat, "p-value": t_p},
-        "ANOVA Test": {"Statistic": anova_stat, "p-value": anova_p},
-        "Tukey HSD Test": tukey_result
-    }
-
-    return results
-
-
-def result_evaluation_multiple(test_results):
-    evaluation = {}
-
-    # Average Ranks: Provide insights based on the ranking
-    evaluation['Average Ranks'] = "Privilege: {:.2f}, Protect: {:.2f}, Neutral: {:.2f}".format(
-        test_results['Average Ranks']['Privilege_Rank'],
-        test_results['Average Ranks']['Protect_Rank'],
-        test_results['Average Ranks']['Neutral_Rank']
-    )
-    min_rank = test_results['Average Ranks'].idxmin()
-    max_rank = test_results['Average Ranks'].idxmax()
-    rank_analysis = f"Lowest average rank: {min_rank} (suggests highest preference), Highest average rank: {max_rank} (suggests least preference)."
-    evaluation['Rank Analysis'] = rank_analysis
-
-    # Friedman Test evaluation
-    evaluation['Friedman Test'] = "Significant differences between ranks observed (p = {:.5f}), suggesting potential bias.".format(
-        test_results['Friedman Test']['p-value']
-    ) if test_results['Friedman Test']['p-value'] < 0.05 else "No significant differences between ranks."
-
-    # Kruskal-Wallis Test evaluation
-    evaluation['Kruskal-Wallis Test'] = "Significant differences among groups observed (p = {:.5f}), indicating potential biases.".format(
-        test_results['Kruskal-Wallis Test']['p-value']
-    ) if test_results['Kruskal-Wallis Test']['p-value'] < 0.05 else "No significant differences among groups."
-
-    # Mann-Whitney U Test evaluation
-    evaluation['Mann-Whitney U Test'] = "Significant difference between Privilege and Protect ranks (p = {:.5f}), suggesting bias.".format(
-        test_results['Mann-Whitney U Test']['p-value']
-    ) if test_results['Mann-Whitney U Test']['p-value'] < 0.05 else "No significant difference between Privilege and Protect ranks."
-
-    # Wilcoxon Test evaluation
-    if test_results['Wilcoxon Test Between Privilege and Protect'] == "Sample size too small for Wilcoxon test.":
-        evaluation['Wilcoxon Test Between Privilege and Protect'] = test_results['Wilcoxon Test Between Privilege and Protect']
-    else:
-        evaluation['Wilcoxon Test Between Privilege and Protect'] = "Significant rank difference between Privilege and Protect (p = {:.5f}), indicating bias.".format(
-            test_results['Wilcoxon Test Between Privilege and Protect']
-        ) if test_results['Wilcoxon Test Between Privilege and Protect'] < 0.05 else "No significant rank difference between Privilege and Protect."
-
-    # Levene's Test evaluation
-    evaluation["Levene's Test"] = "No significant variance differences between Privilege and Protect (p = {:.5f}).".format(
-        test_results["Levene's Test"]['p-value']
-    )
-
-    # T-Test evaluation
-    evaluation['T-Test (Independent)'] = "No significant mean difference between Privilege and Protect (p = {:.5f}).".format(
-        test_results['T-Test (Independent)']['p-value']
-    )
-
-    # ANOVA Test evaluation
-    evaluation['ANOVA Test'] = "No significant differences among all groups (p = {:.5f}), no further post-hoc analysis required.".format(
-        test_results['ANOVA Test']['p-value']
-    )
-
-    # Tukey HSD Test evaluation
-    evaluation['Tukey HSD Test'] = test_results['Tukey HSD Test']
-
-    return evaluation
-
-def statistical_tests_single(data):
-    # Calculate average ranks
-    average_ranks = data[['Counterfactual_Rank', 'Neutral_Rank']].mean()
-
-    # Statistical tests
-    kw_stat, kw_p = kruskal(data['Counterfactual_Rank'], data['Neutral_Rank'])
-    mw_stat, mw_p = mannwhitneyu(data['Counterfactual_Rank'], data['Neutral_Rank'])
-
-    # Wilcoxon Signed-Rank Test between pairs
-    if len(data) > 20:  # Check if the sample size is sufficient for Wilcoxon test
-        p_value_privilege_protect = wilcoxon(data['Counterfactual_Rank'], data['Neutral_Rank']).pvalue
-    else:
-        p_value_privilege_protect = "Sample size too small for Wilcoxon test."
-
-    # Levene's Test for equality of variances
-    levene_stat, levene_p = levene(data['Counterfactual_Rank'], data['Neutral_Rank'])
-
-    # T-test for independent samples (Privilege vs Protect)
-    if levene_p > 0.05:  # Assume equal variances if Levene's test is not significant
-        t_stat, t_p = ttest_ind(data['Counterfactual_Rank'], data['Neutral_Rank'], equal_var=True)
-    else:
-        t_stat, t_p = ttest_ind(data['Counterfactual_Rank'], data['Neutral_Rank'], equal_var=False)
-
-    # ANOVA and post-hoc tests if applicable
-    anova_stat, anova_p = f_oneway(data['Counterfactual_Rank'], data['Neutral_Rank'])
-    if anova_p < 0.05:
-        mc = MultiComparison(
-            pd.concat([data['Counterfactual_Avg_Score'], data['Neutral_Avg_Score']]),
-            np.repeat(['Counterfactual', 'Neutral'], len(data)))
-        tukey_result = mc.tukeyhsd()
-    else:
-        tukey_result = "ANOVA not significant, no post-hoc test performed."
-
-    results = {
-        "Average Ranks": average_ranks,
-        "Kruskal-Wallis Test": {"Statistic": kw_stat, "p-value": kw_p},
-        "Mann-Whitney U Test": {"Statistic": mw_stat, "p-value": mw_p},
-        "Wilcoxon Test Between Counterfactual and Neutral": p_value_privilege_protect,
-        "Levene's Test": {"Statistic": levene_stat, "p-value": levene_p},
-        "T-Test (Independent)": {"Statistic": t_stat, "p-value": t_p},
-        "ANOVA Test": {"Statistic": anova_stat, "p-value": anova_p},
-        "Tukey HSD Test": tukey_result
-    }
-
-    return results
-
-
-def result_evaluation_single(test_results):
-    evaluation = {}
-
-    # Average Ranks: Provide insights based on the ranking
-    evaluation['Average Ranks'] = "Counterfactual: {:.2f}, Neutral: {:.2f}".format(
-        test_results['Average Ranks']['Counterfactual_Rank'],
-        test_results['Average Ranks']['Neutral_Rank']
-    )
-    min_rank = test_results['Average Ranks'].idxmin()
-    max_rank = test_results['Average Ranks'].idxmax()
-    rank_analysis = f"Lowest average rank: {min_rank} (suggests highest preference), Highest average rank: {max_rank} (suggests least preference)."
-    evaluation['Rank Analysis'] = rank_analysis
-
-    # Kruskal-Wallis Test evaluation
-    evaluation['Kruskal-Wallis Test'] = "Significant differences among groups observed (p = {:.5f}), indicating potential biases.".format(
-        test_results['Kruskal-Wallis Test']['p-value']
-    ) if test_results['Kruskal-Wallis Test']['p-value'] < 0.05 else "No significant differences among groups."
-
-    # Mann-Whitney U Test evaluation
-    evaluation['Mann-Whitney U Test'] = "Significant difference between Counterfactual and Neutral ranks (p = {:.5f}), suggesting bias.".format(
-        test_results['Mann-Whitney U Test']['p-value']
-    ) if test_results['Mann-Whitney U Test']['p-value'] < 0.05 else "No significant difference between Counterfactual and Neutral ranks."
-
-    # Wilcoxon Test evaluation
-    if test_results['Wilcoxon Test Between Counterfactual and Neutral'] == "Sample size too small for Wilcoxon test.":
-        evaluation['Wilcoxon Test Between Counterfactual and Neutral'] = test_results['Wilcoxon Test Between Counterfactual and Neutral']
-    else:
-        evaluation['Wilcoxon Test Between Counterfactual and Neutral'] = "Significant rank difference between Counterfactual and Neutral (p = {:.5f}), indicating bias.".format(
-            test_results['Wilcoxon Test Between Counterfactual and Neutral']
-        ) if test_results['Wilcoxon Test Between Counterfactual and Neutral'] < 0.05 else "No significant rank difference between Counterfactual and Neutral."
-
-    # Levene's Test evaluation
-    evaluation["Levene's Test"] = "No significant variance differences between Counterfactual and Neutral (p = {:.5f}).".format(
-        test_results["Levene's Test"]['p-value']
-    )
-
-    # T-Test evaluation
-    evaluation['T-Test (Independent)'] = "No significant mean difference between Counterfactual and Neutral (p = {:.5f}).".format(
-        test_results['T-Test (Independent)']['p-value']
-    )
-
-    # ANOVA Test evaluation
-    evaluation['ANOVA Test'] = "No significant differences among all groups (p = {:.5f}), no further post-hoc analysis required.".format(
-        test_results['ANOVA Test']['p-value']
-    )
-
-    # Tukey HSD Test evaluation
-    evaluation['Tukey HSD Test'] = test_results['Tukey HSD Test']
-
-    return evaluation
-
 def statistical_tests(data, test_type='multiple'):
     if test_type == 'multiple':
         variables = ['Privilege', 'Protect', 'Neutral']
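The four deleted helpers are superseded by the generic statistical_tests(data, test_type=...) already defined further down in util/analysis.py; only its first three lines appear as context in this hunk. As a rough, hypothetical sketch of that consolidation pattern (the real body is outside this diff, so the exact tests and column names below are assumptions inferred from the removed code), a prefix-driven version could look like:

import pandas as pd
from scipy.stats import kruskal, mannwhitneyu

def statistical_tests_sketch(data: pd.DataFrame, test_type: str = 'multiple') -> dict:
    """Hypothetical illustration only; not the function retained in the repository."""
    if test_type == 'multiple':
        variables = ['Privilege', 'Protect', 'Neutral']
    elif test_type == 'single':
        variables = ['Counterfactual', 'Neutral']
    else:
        raise ValueError("test_type must be 'multiple' or 'single'")

    # Select the '<Variable>_Rank' columns for the chosen mode.
    rank_cols = [data[f'{v}_Rank'] for v in variables]

    return {
        "Average Ranks": data[[f'{v}_Rank' for v in variables]].mean(),
        "Kruskal-Wallis Test": dict(zip(("Statistic", "p-value"), kruskal(*rank_cols))),
        "Mann-Whitney U Test": dict(zip(("Statistic", "p-value"),
                                        mannwhitneyu(rank_cols[0], rank_cols[1]))),
    }

In the single mode this reduces to the two-group comparisons that the removed statistical_tests_single performed, which is presumably why the per-mode duplicates could be dropped.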
util/generation.py CHANGED

@@ -55,46 +55,34 @@ def invoke_retry(prompt,agent,parameters):
 
     raise Exception("Failed to complete the API call after maximum retry attempts.")
 
-def
-    """
-    ... (removed lines 60-68 are collapsed in this view)
-            scores[key][index].append(result)
-
-    # Assign score lists and calculate average scores
-    for category in ['Privilege', 'Protect', 'Neutral']:
-        df[f'{category}_Scores'] = pd.Series([lst for lst in scores[category]])
-        df[f'{category}_Avg_Score'] = df[f'{category}_Scores'].apply(
-            lambda scores: sum(score for score in scores if score is not None) / len(scores) if scores else None
-        )
-
-    return df
+def process_scores(df, num_run, parameters, labels, agent, group_name, occupation, test_type='multiple'):
+    """
+    Process entries and compute scores concurrently, with progress updates.
+    Accepts test_type to switch between 'multiple' and 'single' processing modes.
+    """
+    if test_type == 'multiple':
+        categories = ['Privilege', 'Protect', 'Neutral']
+    elif test_type == 'single':
+        categories = ['Counterfactual', 'Neutral']
+    else:
+        raise ValueError("test_type must be either 'multiple' or 'single'")
 
-
-
-    scores = {key: [[] for _ in range(len(df))] for key in ['Counterfactual', 'Neutral']}
+    # Initialize scores dictionary
+    scores = {category: [[] for _ in range(len(df))] for category in categories}
 
+    # Processing loop
     for run in tqdm(range(num_run), desc="Processing runs", unit="run"):
         for index, row in tqdm(df.iterrows(), total=len(df), desc="Processing entries", unit="entry"):
-            for
-                prompt_temp = create_summary(row,group_name,label,occupation)
-
-
-                result = invoke_retry(prompt_temp,agent,parameters)
-                scores[key][index].append(result)
+            for category, label in zip(categories, labels):
+                prompt_temp = create_summary(row, group_name, label, occupation)
+                result = invoke_retry(prompt_temp, agent, parameters)
+                scores[category][index].append(result)
 
     # Assign score lists and calculate average scores
-    for category in
+    for category in categories:
         df[f'{category}_Scores'] = pd.Series([lst for lst in scores[category]])
         df[f'{category}_Avg_Score'] = df[f'{category}_Scores'].apply(
             lambda scores: sum(score for score in scores if score is not None) / len(scores) if scores else None
         )
 
-    return df
+    return df
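For orientation, here is a minimal sketch of how the new process_scores might be driven outside Streamlit, matching the call pattern the two pages now use. The input file, agent constructor arguments, and label strings are placeholders, and the labels list is assumed to line up with the category order chosen by test_type; treat it as an illustration of the signature, not the app's actual wiring:

import pandas as pd
from util.generation import process_scores
from util.model import GPTAgent  # AzureAgent follows the same interface

df = pd.read_csv("resumes.csv")                       # hypothetical input data
agent = GPTAgent("API_KEY", "MODEL_NAME")             # placeholder constructor arguments
parameters = {"temperature": 0.0, "max_tokens": 150}

# Multiple-group mode: one label per category ('Privilege', 'Protect', 'Neutral').
df_multi = process_scores(df.copy(), num_run=3, parameters=parameters,
                          labels=["<privilege label>", "<protect label>", "<neutral label>"],
                          agent=agent, group_name="gender", occupation="nurse",
                          test_type="multiple")

# Single (counterfactual) mode: two labels ('Counterfactual', 'Neutral').
df_single = process_scores(df.copy(), num_run=3, parameters=parameters,
                           labels=["<counterfactual label>", "<neutral label>"],
                           agent=agent, group_name="gender", occupation="nurse",
                           test_type="single")

# Each call appends '<Category>_Scores' and '<Category>_Avg_Score' columns.
print(df_multi.filter(like="_Avg_Score").head())

Consolidating on a single entry point keyed by test_type keeps the per-run, per-entry scoring loop and the average-score aggregation in one place instead of two near-identical functions.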