Zekun Wu committed
Commit: b7275fb
Parent(s): 53c350f
Commit message: update
pages/{2_Injection.py → 2_Injection_Multiple.py}
RENAMED
@@ -1,7 +1,7 @@
 import streamlit as st
 import pandas as pd
 from io import StringIO
-from util.generation import process_scores
+from util.generation import process_scores_multiple
 from util.model import AzureAgent, GPTAgent
 
 # Set up the Streamlit interface
@@ -74,7 +74,7 @@ if st.session_state.model_submitted:
             # Process data and display results
             with st.spinner('Processing data...'):
                 parameters = {"temperature": st.session_state.temperature, "max_tokens": st.session_state.max_tokens}
-                df = process_scores(df, st.session_state.num_run, parameters, st.session_state.privilege_label,
+                df = process_scores_multiple(df, st.session_state.num_run, parameters, st.session_state.privilege_label,
                                     st.session_state.protect_label, agent, st.session_state.group_name,
                                     st.session_state.occupation)
                 st.session_state.data_processed = True  # Mark as processed
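For orientation, the renamed helper can also be driven outside Streamlit. The sketch below mirrors the call made by this page; the credentials, endpoint, and group labels are placeholders (not values from any real run), and the util imports assume this Space's own modules.

    import pandas as pd
    from util.generation import process_scores_multiple
    from util.model import GPTAgent

    # Placeholder credentials and deployment; replace with real values before use.
    agent = GPTAgent("API_KEY", "https://example.openai.azure.com/", "gpt35-1106",
                     "2024-02-15-preview")
    parameters = {"temperature": 0.5, "max_tokens": 150}

    df = pd.read_csv("prompt_test.csv")  # example input CSV referenced by the pages
    df = process_scores_multiple(df, 1, parameters,
                                 "Male",    # privilege_label (placeholder)
                                 "Female",  # protect_label (placeholder)
                                 agent, "Gender", "Programmer")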
pages/3_Evaluation_Multiple.py
ADDED
@@ -0,0 +1,42 @@
+import streamlit as st
+import pandas as pd
+from io import StringIO
+from util.analysis import statistical_tests_multiple, result_evaluation_multiple
+
+def app():
+    st.title('Result Evaluation')
+
+    # Allow users to upload a CSV file with processed results
+    uploaded_file = st.file_uploader("Upload your processed CSV file", type="csv")
+    if uploaded_file is not None:
+        data = StringIO(uploaded_file.getvalue().decode('utf-8'))
+        df = pd.read_csv(data)
+
+        # Add ranks for each score within each row
+        ranks = df[['Privilege_Avg_Score', 'Protect_Avg_Score', 'Neutral_Avg_Score']].rank(axis=1, ascending=False)
+
+        df['Privilege_Rank'] = ranks['Privilege_Avg_Score']
+        df['Protect_Rank'] = ranks['Protect_Avg_Score']
+        df['Neutral_Rank'] = ranks['Neutral_Avg_Score']
+
+        st.write('Uploaded Data:', df)
+
+        # Display button to perform evaluation if data is uploaded
+        if st.button('Evaluate Data'):
+            with st.spinner('Evaluating data...'):
+                test_results = statistical_tests_multiple(df)
+                st.write('Test Results:', test_results)
+                evaluation_results = result_evaluation_multiple(test_results)
+                st.write('Evaluation Results:', evaluation_results)
+
+                # Allow downloading of the evaluation results
+                results_df = pd.DataFrame.from_dict(evaluation_results, orient='index', columns=['Value'])
+                st.download_button(
+                    label="Download Evaluation Results",
+                    data=results_df.to_csv().encode('utf-8'),
+                    file_name='evaluation_results.csv',
+                    mime='text/csv',
+                )
+
+if __name__ == "__main__":
+    app()
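The rank step above is easier to read with a tiny standalone example: rank(axis=1, ascending=False) gives rank 1.0 to the highest average score in each row. The scores below are made up; only the column names mirror the file above.

    import pandas as pd

    # Invented average scores for two rows.
    df = pd.DataFrame({
        "Privilege_Avg_Score": [0.9, 0.4],
        "Protect_Avg_Score":   [0.5, 0.8],
        "Neutral_Avg_Score":   [0.7, 0.6],
    })

    # Rank within each row; the highest score gets rank 1.0, ties share the mean rank.
    ranks = df.rank(axis=1, ascending=False)
    print(ranks)
    #    Privilege_Avg_Score  Protect_Avg_Score  Neutral_Avg_Score
    # 0                  1.0                3.0                2.0
    # 1                  3.0                1.0                2.0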
pages/4_Injection_Single.py
ADDED
@@ -0,0 +1,97 @@
+import streamlit as st
+import pandas as pd
+from io import StringIO
+from util.generation import process_scores_single
+from util.model import AzureAgent, GPTAgent
+
+# Set up the Streamlit interface
+st.title('Result Generation')
+st.sidebar.title('Model Settings')
+
+
+# Define a function to manage state initialization
+def initialize_state():
+    keys = ["model_submitted", "api_key", "endpoint_url", "deployment_name", "temperature", "max_tokens",
+            "data_processed", "group_name", "occupation", "counterfactual_label", "num_run",
+            "uploaded_file"]
+    defaults = [False, "", "https://safeguard-monitor.openai.azure.com/", "gpt35-1106", 0.5, 150, False, "Gender",
+                "Programmer", "Male", 1, None]
+    for key, default in zip(keys, defaults):
+        if key not in st.session_state:
+            st.session_state[key] = default
+
+
+initialize_state()
+
+# Model selection and configuration
+model_type = st.sidebar.radio("Select the type of agent", ('GPTAgent', 'AzureAgent'))
+st.session_state.api_key = st.sidebar.text_input("API Key", type="password", value=st.session_state.api_key)
+st.session_state.endpoint_url = st.sidebar.text_input("Endpoint URL", value=st.session_state.endpoint_url)
+st.session_state.deployment_name = st.sidebar.text_input("Model Name", value=st.session_state.deployment_name)
+api_version = '2024-02-15-preview' if model_type == 'GPTAgent' else ''
+st.session_state.temperature = st.sidebar.slider("Temperature", 0.0, 1.0, st.session_state.temperature, 0.01)
+st.session_state.max_tokens = st.sidebar.number_input("Max Tokens", 1, 1000, st.session_state.max_tokens)
+
+if st.sidebar.button("Reset Model Info"):
+    initialize_state()  # Reset all state to defaults
+    st.experimental_rerun()
+
+if st.sidebar.button("Submit Model Info"):
+    st.session_state.model_submitted = True
+
+# Ensure experiment settings are only shown if model info is submitted
+if st.session_state.model_submitted:
+    df = None
+    file_options = st.radio("Choose file source:", ["Upload", "Example"])
+    if file_options == "Example":
+        df = pd.read_csv("prompt_test.csv")
+    else:
+        st.session_state.uploaded_file = st.file_uploader("Choose a file")
+        if st.session_state.uploaded_file is not None:
+            data = StringIO(st.session_state.uploaded_file.getvalue().decode("utf-8"))
+            df = pd.read_csv(data)
+    if df is not None:
+
+        st.write('Data:', df)
+
+        # Button to add a new row
+
+        st.session_state.occupation = st.text_input("Occupation", value=st.session_state.occupation)
+        st.session_state.group_name = st.text_input("Group Name", value=st.session_state.group_name)
+        st.session_state.counterfactual_label = st.text_input("Counterfactual Label", value=st.session_state.counterfactual_label)
+        st.session_state.num_run = st.number_input("Number of Runs", 1, 10, st.session_state.num_run)
+
+        if st.button('Process Data') and not st.session_state.data_processed:
+            # Initialize the correct agent based on model type
+            if model_type == 'AzureAgent':
+                agent = AzureAgent(st.session_state.api_key, st.session_state.endpoint_url,
+                                   st.session_state.deployment_name)
+            else:
+                agent = GPTAgent(st.session_state.api_key, st.session_state.endpoint_url,
+                                 st.session_state.deployment_name, api_version)
+
+            # Process data and display results
+            with st.spinner('Processing data...'):
+                parameters = {"temperature": st.session_state.temperature, "max_tokens": st.session_state.max_tokens}
+                df = process_scores_single(df, st.session_state.num_run, parameters, st.session_state.counterfactual_label,
+                                           agent, st.session_state.group_name,
+                                           st.session_state.occupation)
+                st.session_state.data_processed = True  # Mark as processed
+
+            st.write('Processed Data:', df)
+
+            # Allow downloading of the evaluation results
+            st.download_button(
+                label="Download Generation Results",
+                data=df.to_csv().encode('utf-8'),
+                file_name='generation_results.csv',
+                mime='text/csv',
+            )
+
+        if st.button("Reset Experiment Settings"):
+            st.session_state.occupation = "Programmer"
+            st.session_state.group_name = "Gender"
+            st.session_state.counterfactual_label = "Male"
+            st.session_state.num_run = 1
+            st.session_state.data_processed = False
+            st.session_state.uploaded_file = None
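One Streamlit detail worth keeping in mind for the reset button: initialize_state() only fills keys that are missing from st.session_state, so on reruns it leaves existing values untouched. A common pattern for an explicit reset is to drop the keys first; the sketch below is not part of this commit, and the key list is just an illustrative subset of the one in initialize_state.

    import streamlit as st

    def reset_state(keys):
        """Remove the given keys so a following initialize_state() call restores defaults."""
        for key in keys:
            if key in st.session_state:
                del st.session_state[key]

    # Hypothetical usage inside the reset handler:
    # if st.sidebar.button("Reset Model Info"):
    #     reset_state(["api_key", "endpoint_url", "deployment_name", "temperature", "max_tokens"])
    #     initialize_state()
    #     st.experimental_rerun()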
pages/{3_Evaluation.py → 5_Evaluation_Single.py}
RENAMED
@@ -1,7 +1,7 @@
 import streamlit as st
 import pandas as pd
 from io import StringIO
-from util.analysis import statistical_tests, result_evaluation
+from util.analysis import statistical_tests_single, result_evaluation_single
 
 def app():
     st.title('Result Evaluation')
@@ -24,9 +24,9 @@ def app():
         # Display button to perform evaluation if data is uploaded
         if st.button('Evaluate Data'):
             with st.spinner('Evaluating data...'):
-                test_results = statistical_tests(df)
+                test_results = statistical_tests_single(df)
                 st.write('Test Results:', test_results)
-                evaluation_results = result_evaluation(test_results)
+                evaluation_results = result_evaluation_single(test_results)
                 st.write('Evaluation Results:', evaluation_results)
 
                 # Allow downloading of the evaluation results
util/analysis.py
CHANGED
@@ -5,7 +5,7 @@ from scipy.stats import (friedmanchisquare, wilcoxon, kruskal, mannwhitneyu, f_o
 from statsmodels.stats.multicomp import pairwise_tukeyhsd, MultiComparison
 
 
-def statistical_tests(data):
+def statistical_tests_multiple(data):
     # Calculate average ranks
     average_ranks = data[['Privilege_Rank', 'Protect_Rank', 'Neutral_Rank']].mean()
 
@@ -54,7 +54,7 @@ def statistical_tests(data):
     return results
 
 
-def result_evaluation(test_results):
+def result_evaluation_multiple(test_results):
     evaluation = {}
 
     # Average Ranks: Provide insights based on the ranking
@@ -119,3 +119,117 @@ def result_evaluation(test_results):
     evaluation['Tukey HSD Test'] = test_results['Tukey HSD Test']
 
     return evaluation
+
+def statistical_tests_single(data):
+    # Calculate average ranks
+    average_ranks = data[['Counterfactual_Rank']].mean()
+
+    # Statistical tests
+    stat_friedman, p_friedman = friedmanchisquare(data['Counterfactual_Rank'], data['Neutral_Rank'])
+    kw_stat, kw_p = kruskal(data['Counterfactual_Rank'],data['Neutral_Rank'])
+    mw_stat, mw_p = mannwhitneyu(data['Counterfactual_Rank'], data['Neutral_Rank'])
+
+    # Wilcoxon Signed-Rank Test between pairs
+    if len(data) > 20:  # Check if the sample size is sufficient for Wilcoxon test
+        p_value_privilege_protect = wilcoxon(data['Counterfactual_Rank'], data['Neutral_Rank']).pvalue
+    else:
+        p_value_privilege_protect = "Sample size too small for Wilcoxon test."
+
+    # Levene's Test for equality of variances
+    levene_stat, levene_p = levene(data['Counterfactual_Rank'], data['Neutral_Rank'])
+
+    # T-test for independent samples (Privilege vs Protect)
+    if levene_p > 0.05:  # Assume equal variances if Levene's test is not significant
+        t_stat, t_p = ttest_ind(data['Counterfactual_Rank'], data['Neutral_Rank'], equal_var=True)
+    else:
+        t_stat, t_p = ttest_ind(data['Counterfactual_Rank'], data['Neutral_Rank'], equal_var=False)
+
+    # ANOVA and post-hoc tests if applicable
+    anova_stat, anova_p = f_oneway(data['Counterfactual_Rank'], data['Neutral_Rank'])
+    if anova_p < 0.05:
+        mc = MultiComparison(
+            data['Counterfactual_Avg_Score'].append(data['Neutral_Avg_Score']),
+            np.repeat(['Counterfactual', 'Neutral'], len(data)))
+        tukey_result = mc.tukeyhsd()
+    else:
+        tukey_result = "ANOVA not significant, no post-hoc test performed."
+
+    results = {
+        "Average Ranks": average_ranks,
+        "Friedman Test": {"Statistic": stat_friedman, "p-value": p_friedman},
+        "Kruskal-Wallis Test": {"Statistic": kw_stat, "p-value": kw_p},
+        "Mann-Whitney U Test": {"Statistic": mw_stat, "p-value": mw_p},
+        "Wilcoxon Test Between Privilege and Protect": p_value_privilege_protect,
+        "Levene's Test": {"Statistic": levene_stat, "p-value": levene_p},
+        "T-Test (Independent)": {"Statistic": t_stat, "p-value": t_p},
+        "ANOVA Test": {"Statistic": anova_stat, "p-value": anova_p},
+        "Tukey HSD Test": tukey_result
+    }
+
+    return results
+
+
+def result_evaluation_single(test_results):
+    evaluation = {}
+
+    # Average Ranks: Provide insights based on the ranking
+    evaluation['Average Ranks'] = "Counterfactual: {:.2f}, Neutral: {:.2f}".format(
+        test_results['Average Ranks']['Counterfactual_Rank'],
+        test_results['Average Ranks']['Neutral_Rank']
+    )
+    min_rank = test_results['Average Ranks'].idxmin()
+    max_rank = test_results['Average Ranks'].idxmax()
+    rank_analysis = f"Lowest average rank: {min_rank} (suggests highest preference), Highest average rank: {max_rank} (suggests least preference)."
+    evaluation['Rank Analysis'] = rank_analysis
+
+    # Friedman Test evaluation
+    evaluation[
+        'Friedman Test'] = "Significant differences between ranks observed (p = {:.5f}), suggesting potential bias.".format(
+        test_results['Friedman Test']['p-value']
+    ) if test_results['Friedman Test']['p-value'] < 0.05 else "No significant differences between ranks."
+
+    # Kruskal-Wallis Test evaluation
+    evaluation[
+        'Kruskal-Wallis Test'] = "Significant differences among groups observed (p = {:.5f}), indicating potential biases.".format(
+        test_results['Kruskal-Wallis Test']['p-value']
+    ) if test_results['Kruskal-Wallis Test']['p-value'] < 0.05 else "No significant differences among groups."
+
+    # Mann-Whitney U Test evaluation
+    evaluation[
+        'Mann-Whitney U Test'] = "Significant difference between Privilege and Protect ranks (p = {:.5f}), suggesting bias.".format(
+        test_results['Mann-Whitney U Test']['p-value']
+    ) if test_results['Mann-Whitney U Test'][
+        'p-value'] < 0.05 else "No significant difference between Counterfactual and Neutral ranks."
+
+    # Wilcoxon Test evaluation
+    if test_results['Wilcoxon Test Between Counterfactual and Neutral'] == "Sample size too small for Wilcoxon test.":
+        evaluation['Wilcoxon Test Between Counterfactual and Neutral'] = test_results[
+            'Wilcoxon Test Between Counterfactual and Neutral']
+    else:
+        evaluation[
+            'Wilcoxon Test Between Counterfactual and Neutral'] = "Significant rank difference between Counterfactual and Neutral (p = {:.5f}), indicating bias.".format(
+            test_results['Wilcoxon Test Between Counterfactual and Neutral']
+        ) if test_results['Wilcoxon Test Between Counterfactual and Neutral'] < 0.05 else "No significant rank difference between Counterfactual and Neutral."
+
+    # Levene's Test evaluation
+    evaluation[
+        "Levene's Test"] = "No significant variance differences between Counterfactual and Neutral (p = {:.5f}).".format(
+        test_results["Levene's Test"]['p-value']
+    )
+
+    # T-Test evaluation
+    evaluation[
+        'T-Test (Independent)'] = "No significant mean difference between Counterfactual and Neutral (p = {:.5f}).".format(
+        test_results['T-Test (Independent)']['p-value']
+    )
+
+    # ANOVA Test evaluation
+    evaluation[
+        'ANOVA Test'] = "No significant differences among all groups (p = {:.5f}), no further post-hoc analysis required.".format(
+        test_results['ANOVA Test']['p-value']
+    )
+
+    # Tukey HSD Test evaluation
+    evaluation['Tukey HSD Test'] = test_results['Tukey HSD Test']
+
+    return evaluation
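The two-group comparisons used by statistical_tests_single come straight from scipy.stats. A standalone toy example of those calls is shown below; the rank values are invented and not tied to any real run.

    import numpy as np
    from scipy.stats import mannwhitneyu, wilcoxon, levene, ttest_ind

    # Made-up per-row ranks for a counterfactual and a neutral prompt variant.
    counterfactual = np.array([1.0, 1.0, 2.0, 1.0, 2.0, 1.0, 1.0, 2.0])
    neutral        = np.array([2.0, 2.0, 1.0, 2.0, 1.0, 2.0, 2.0, 1.0])

    mw_stat, mw_p = mannwhitneyu(counterfactual, neutral)   # unpaired rank comparison
    w_res = wilcoxon(counterfactual, neutral)               # paired signed-rank test
    lev_stat, lev_p = levene(counterfactual, neutral)       # equality of variances
    t_stat, t_p = ttest_ind(counterfactual, neutral, equal_var=lev_p > 0.05)

    print(f"Mann-Whitney p={mw_p:.3f}, Wilcoxon p={w_res.pvalue:.3f}, t-test p={t_p:.3f}")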
util/generation.py
CHANGED
@@ -47,7 +47,7 @@ def invoke_retry(prompt,agent,parameters):
 
     raise Exception("Failed to complete the API call after maximum retry attempts.")
 
-def process_scores(df, num_run,parameters,privilege_label,protect_label,agent,group_name,occupation):
+def process_scores_multiple(df, num_run,parameters,privilege_label,protect_label,agent,group_name,occupation):
     """ Process entries and compute scores concurrently, with progress updates. """
     scores = {key: [[] for _ in range(len(df))] for key in ['Privilege', 'Protect', 'Neutral']}
 
@@ -67,4 +67,26 @@ def process_scores(df, num_run,parameters,privilege_label,protect_label,agent,gr
             lambda scores: sum(score for score in scores if score is not None) / len(scores) if scores else None
         )
 
+    return df
+
+def process_scores_single(df, num_run,parameters,counterfactual_label,agent,group_name,occupation):
+    """ Process entries and compute scores concurrently, with progress updates. """
+    scores = {key: [[] for _ in range(len(df))] for key in ['Counterfactual', 'Neutral']}
+
+    for run in tqdm(range(num_run), desc="Processing runs", unit="run"):
+        for index, row in tqdm(df.iterrows(), total=len(df), desc="Processing entries", unit="entry"):
+            for key, label in zip(['Counterfactual', 'Neutral'], [counterfactual_label, None]):
+                prompt_temp = create_summary(row,group_name,label,occupation)
+                # print(f"Run {run + 1} - Entry {index + 1} - {key}:\n{prompt_temp}")
+                # print("=============================================================")
+                result = invoke_retry(prompt_temp,agent,parameters)
+                scores[key][index].append(result)
+
+    # Assign score lists and calculate average scores
+    for category in ['Counterfactual', 'Neutral']:
+        df[f'{category}_Scores'] = pd.Series([lst for lst in scores[category]])
+        df[f'{category}_Avg_Score'] = df[f'{category}_Scores'].apply(
+            lambda scores: sum(score for score in scores if score is not None) / len(scores) if scores else None
+        )
+
     return df
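A small note on the scores data structure built at the top of both helpers: the per-row buckets are created with a list comprehension rather than [[]] * len(df), because the comprehension produces an independent list per row. The toy example below, unrelated to any real scores, shows why that matters.

    # Shared vs independent per-row buckets.
    n_rows = 3
    shared = [[]] * n_rows                      # three references to the SAME list
    independent = [[] for _ in range(n_rows)]   # three distinct lists

    shared[0].append(0.9)
    independent[0].append(0.9)

    print(shared)       # [[0.9], [0.9], [0.9]]  -- every "row" sees the appended value
    print(independent)  # [[0.9], [], []]        -- only row 0 is updated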