Spaces:
Sleeping
Sleeping
Zekun Wu
commited on
Commit
•
b25bb07
1
Parent(s):
6807929
update
Browse files- experiment.ipynb +201 -0
- util/evaluation.py +28 -7
- util/injection.py +8 -7
experiment.ipynb
ADDED
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": null,
|
6 |
+
"id": "initial_id",
|
7 |
+
"metadata": {
|
8 |
+
"collapsed": true,
|
9 |
+
"is_executing": true,
|
10 |
+
"ExecuteTime": {
|
11 |
+
"start_time": "2024-05-31T11:06:03.089830Z"
|
12 |
+
}
|
13 |
+
},
|
14 |
+
"outputs": [
|
15 |
+
{
|
16 |
+
"name": "stdout",
|
17 |
+
"output_type": "stream",
|
18 |
+
"text": [
|
19 |
+
"Processing 100 entries with 1 runs each.\n"
|
20 |
+
]
|
21 |
+
},
|
22 |
+
{
|
23 |
+
"name": "stderr",
|
24 |
+
"output_type": "stream",
|
25 |
+
"text": [
|
26 |
+
"Processing runs: 0%| | 0/1 [00:00<?, ?run/s]\n",
|
27 |
+
"Processing entries: 0%| | 0/100 [00:00<?, ?entry/s]\u001B[A\n",
|
28 |
+
"Processing entries: 1%| | 1/100 [00:47<1:17:58, 47.26s/entry]\u001B[A\n",
|
29 |
+
"Processing entries: 2%|▏ | 2/100 [01:15<58:51, 36.04s/entry] \u001B[A\n",
|
30 |
+
"Processing entries: 3%|▎ | 3/100 [01:49<56:30, 34.95s/entry]\u001B[A\n",
|
31 |
+
"Processing entries: 4%|▍ | 4/100 [02:21<54:34, 34.11s/entry]\u001B[A\n",
|
32 |
+
"Processing entries: 5%|▌ | 5/100 [02:59<56:11, 35.49s/entry]\u001B[A\n",
|
33 |
+
"Processing entries: 6%|▌ | 6/100 [03:35<55:33, 35.46s/entry]\u001B[A\n",
|
34 |
+
"Processing entries: 7%|▋ | 7/100 [04:12<55:48, 36.00s/entry]\u001B[A\n",
|
35 |
+
"Processing entries: 8%|▊ | 8/100 [04:52<57:20, 37.40s/entry]\u001B[A\n",
|
36 |
+
"Processing entries: 9%|▉ | 9/100 [05:19<51:31, 33.97s/entry]\u001B[A\n",
|
37 |
+
"Processing entries: 10%|█ | 10/100 [15:46<5:25:34, 217.06s/entry]\u001B[A\n",
|
38 |
+
"Processing entries: 11%|█ | 11/100 [16:11<3:55:07, 158.51s/entry]\u001B[A\n",
|
39 |
+
"Processing entries: 12%|█▏ | 12/100 [17:15<3:09:53, 129.48s/entry]\u001B[A\n",
|
40 |
+
"Processing entries: 13%|█▎ | 13/100 [17:54<2:28:14, 102.23s/entry]\u001B[A\n",
|
41 |
+
"Processing entries: 14%|█▍ | 14/100 [18:32<1:58:40, 82.79s/entry] \u001B[A\n",
|
42 |
+
"Processing entries: 15%|█▌ | 15/100 [19:00<1:34:01, 66.37s/entry]\u001B[A\n",
|
43 |
+
"Processing entries: 16%|█▌ | 16/100 [19:45<1:23:45, 59.83s/entry]\u001B[A\n",
|
44 |
+
"Processing entries: 17%|█▋ | 17/100 [20:27<1:15:23, 54.50s/entry]\u001B[A\n",
|
45 |
+
"Processing entries: 18%|█▊ | 18/100 [20:55<1:03:40, 46.59s/entry]\u001B[A\n",
|
46 |
+
"Processing entries: 19%|█▉ | 19/100 [21:27<56:41, 42.00s/entry] \u001B[A\n",
|
47 |
+
"Processing entries: 20%|██ | 20/100 [22:12<57:27, 43.09s/entry]\u001B[A\n",
|
48 |
+
"Processing entries: 21%|██ | 21/100 [22:41<51:05, 38.81s/entry]\u001B[A\n",
|
49 |
+
"Processing entries: 22%|██▏ | 22/100 [23:06<45:15, 34.81s/entry]\u001B[A\n",
|
50 |
+
"Processing entries: 23%|██▎ | 23/100 [23:50<48:06, 37.49s/entry]\u001B[A\n",
|
51 |
+
"Processing entries: 24%|██▍ | 24/100 [24:21<44:54, 35.46s/entry]\u001B[A\n",
|
52 |
+
"Processing entries: 25%|██▌ | 25/100 [25:00<45:50, 36.68s/entry]\u001B[A\n",
|
53 |
+
"Processing entries: 26%|██▌ | 26/100 [25:35<44:32, 36.12s/entry]\u001B[A\n",
|
54 |
+
"Processing entries: 27%|██▋ | 27/100 [26:04<41:14, 33.89s/entry]\u001B[A\n",
|
55 |
+
"Processing entries: 28%|██▊ | 28/100 [26:29<37:26, 31.19s/entry]\u001B[A\n",
|
56 |
+
"Processing entries: 29%|██▉ | 29/100 [27:04<38:24, 32.46s/entry]\u001B[A\n",
|
57 |
+
"Processing entries: 30%|███ | 30/100 [27:38<38:15, 32.79s/entry]\u001B[A\n",
|
58 |
+
"Processing entries: 31%|███ | 31/100 [28:20<40:55, 35.59s/entry]\u001B[A\n",
|
59 |
+
"Processing entries: 32%|███▏ | 32/100 [29:08<44:23, 39.18s/entry]\u001B[A\n",
|
60 |
+
"Processing entries: 33%|███▎ | 33/100 [29:37<40:20, 36.13s/entry]\u001B[A\n",
|
61 |
+
"Processing entries: 34%|███▍ | 34/100 [30:23<43:02, 39.13s/entry]\u001B[A\n",
|
62 |
+
"Processing entries: 35%|███▌ | 35/100 [31:19<47:58, 44.28s/entry]\u001B[A\n",
|
63 |
+
"Processing entries: 36%|███▌ | 36/100 [32:01<46:39, 43.75s/entry]\u001B[A\n",
|
64 |
+
"Processing entries: 37%|███▋ | 37/100 [32:27<40:11, 38.28s/entry]\u001B[A\n",
|
65 |
+
"Processing entries: 38%|███▊ | 38/100 [32:53<35:51, 34.71s/entry]\u001B[A\n",
|
66 |
+
"Processing entries: 39%|███▉ | 39/100 [33:31<36:05, 35.50s/entry]\u001B[A\n",
|
67 |
+
"Processing entries: 40%|████ | 40/100 [34:11<37:01, 37.02s/entry]\u001B[A\n",
|
68 |
+
"Processing entries: 41%|████ | 41/100 [34:39<33:41, 34.27s/entry]\u001B[A\n",
|
69 |
+
"Processing entries: 42%|████▏ | 42/100 [35:23<35:54, 37.15s/entry]\u001B[A\n",
|
70 |
+
"Processing entries: 43%|████▎ | 43/100 [35:50<32:32, 34.26s/entry]\u001B[A\n",
|
71 |
+
"Processing entries: 44%|████▍ | 44/100 [36:29<33:08, 35.50s/entry]\u001B[A\n",
|
72 |
+
"Processing entries: 45%|████▌ | 45/100 [37:01<31:34, 34.45s/entry]\u001B[A\n",
|
73 |
+
"Processing entries: 46%|████▌ | 46/100 [37:32<30:13, 33.58s/entry]\u001B[A\n",
|
74 |
+
"Processing entries: 47%|████▋ | 47/100 [38:16<32:12, 36.47s/entry]\u001B[A\n",
|
75 |
+
"Processing entries: 48%|████▊ | 48/100 [39:01<33:47, 38.99s/entry]\u001B[A\n",
|
76 |
+
"Processing entries: 49%|████▉ | 49/100 [39:24<29:07, 34.27s/entry]\u001B[A\n",
|
77 |
+
"Processing entries: 50%|█████ | 50/100 [40:13<32:16, 38.72s/entry]\u001B[A\n",
|
78 |
+
"Processing entries: 51%|█████ | 51/100 [40:48<30:50, 37.78s/entry]\u001B[A\n",
|
79 |
+
"Processing entries: 52%|█████▏ | 52/100 [41:28<30:41, 38.37s/entry]\u001B[A\n",
|
80 |
+
"Processing entries: 53%|█████▎ | 53/100 [42:06<29:48, 38.06s/entry]\u001B[A\n",
|
81 |
+
"Processing entries: 54%|█████▍ | 54/100 [42:29<25:54, 33.80s/entry]\u001B[A\n",
|
82 |
+
"Processing entries: 55%|█████▌ | 55/100 [43:06<26:00, 34.68s/entry]\u001B[A\n",
|
83 |
+
"Processing entries: 56%|█████▌ | 56/100 [43:33<23:48, 32.46s/entry]\u001B[A\n",
|
84 |
+
"Processing entries: 57%|█████▋ | 57/100 [44:28<27:57, 39.02s/entry]\u001B[A\n",
|
85 |
+
"Processing entries: 58%|█████▊ | 58/100 [45:05<26:58, 38.53s/entry]\u001B[A\n",
|
86 |
+
"Processing entries: 59%|█████▉ | 59/100 [45:46<26:48, 39.22s/entry]\u001B[A\n",
|
87 |
+
"Processing entries: 60%|██████ | 60/100 [46:26<26:18, 39.46s/entry]\u001B[A\n",
|
88 |
+
"Processing entries: 61%|██████ | 61/100 [46:57<23:54, 36.77s/entry]\u001B[A\n",
|
89 |
+
"Processing entries: 62%|██████▏ | 62/100 [47:19<20:29, 32.36s/entry]\u001B[A\n",
|
90 |
+
"Processing entries: 63%|██████▎ | 63/100 [48:23<25:58, 42.12s/entry]\u001B[A\n",
|
91 |
+
"Processing entries: 64%|██████▍ | 64/100 [48:56<23:34, 39.29s/entry]\u001B[A\n",
|
92 |
+
"Processing entries: 65%|██████▌ | 65/100 [49:25<21:03, 36.11s/entry]\u001B[A\n",
|
93 |
+
"Processing entries: 66%|██████▌ | 66/100 [49:57<19:47, 34.92s/entry]\u001B[A\n",
|
94 |
+
"Processing entries: 67%|██████▋ | 67/100 [50:20<17:17, 31.44s/entry]\u001B[A\n",
|
95 |
+
"Processing entries: 68%|██████▊ | 68/100 [51:01<18:10, 34.07s/entry]\u001B[A\n",
|
96 |
+
"Processing entries: 69%|██████▉ | 69/100 [52:08<22:46, 44.07s/entry]\u001B[A\n",
|
97 |
+
"Processing entries: 70%|███████ | 70/100 [52:39<20:04, 40.14s/entry]\u001B[A\n",
|
98 |
+
"Processing entries: 71%|███████ | 71/100 [53:26<20:24, 42.22s/entry]\u001B[A\n",
|
99 |
+
"Processing entries: 72%|███████▏ | 72/100 [53:53<17:38, 37.80s/entry]\u001B[A\n",
|
100 |
+
"Processing entries: 73%|███████▎ | 73/100 [54:21<15:36, 34.67s/entry]\u001B[A\n",
|
101 |
+
"Processing entries: 74%|███████▍ | 74/100 [55:17<17:47, 41.06s/entry]\u001B[A\n",
|
102 |
+
"Processing entries: 75%|███████▌ | 75/100 [56:20<19:56, 47.84s/entry]\u001B[A\n",
|
103 |
+
"Processing entries: 76%|███████▌ | 76/100 [58:05<25:57, 64.89s/entry]\u001B[A\n",
|
104 |
+
"Processing entries: 77%|███████▋ | 77/100 [59:07<24:30, 63.95s/entry]\u001B[A\n",
|
105 |
+
"Processing entries: 78%|███████▊ | 78/100 [59:53<21:26, 58.48s/entry]\u001B[A\n",
|
106 |
+
"Processing entries: 79%|███████▉ | 79/100 [1:00:32<18:30, 52.89s/entry]\u001B[A\n",
|
107 |
+
"Processing entries: 80%|████████ | 80/100 [1:01:09<16:01, 48.10s/entry]\u001B[A\n",
|
108 |
+
"Processing entries: 81%|████████ | 81/100 [1:01:38<13:24, 42.37s/entry]\u001B[A\n",
|
109 |
+
"Processing entries: 82%|████████▏ | 82/100 [1:02:13<12:00, 40.02s/entry]\u001B[A"
|
110 |
+
]
|
111 |
+
}
|
112 |
+
],
|
113 |
+
"source": [
|
114 |
+
"import pandas as pd\n",
|
115 |
+
"from util.injection import process_scores_multiple\n",
|
116 |
+
"from util.model import AzureAgent, GPTAgent, Claude3Agent\n",
|
117 |
+
"from util.prompt import PROMPT_TEMPLATE\n",
|
118 |
+
"\n",
|
119 |
+
"def run_experiment(api_key, model_type, deployment_name, temperature, max_tokens, occupation,\n",
|
120 |
+
" sample_size, group_name, privilege_label, protect_label, num_run, prompt_template, endpoint_url=None):\n",
|
121 |
+
" # Load data\n",
|
122 |
+
" df = pd.read_csv(\"resume_subsampled.csv\")\n",
|
123 |
+
" \n",
|
124 |
+
" # Filter data by occupation\n",
|
125 |
+
" df = df[df[\"Occupation\"] == occupation]\n",
|
126 |
+
" df = df.sample(n=sample_size, random_state=42)\n",
|
127 |
+
" \n",
|
128 |
+
" # Initialize the agent\n",
|
129 |
+
" if model_type == 'AzureAgent':\n",
|
130 |
+
" agent = AzureAgent(api_key, endpoint_url, deployment_name)\n",
|
131 |
+
" elif model_type == 'GPTAgent':\n",
|
132 |
+
" api_version = '2024-02-15-preview'\n",
|
133 |
+
" agent = GPTAgent(api_key, endpoint_url, deployment_name, api_version)\n",
|
134 |
+
" else:\n",
|
135 |
+
" agent = Claude3Agent(api_key, deployment_name)\n",
|
136 |
+
" \n",
|
137 |
+
" # Process data\n",
|
138 |
+
" parameters = {\"temperature\": temperature, \"max_tokens\": max_tokens}\n",
|
139 |
+
" preprocessed_df = process_scores_multiple(df, num_run, parameters, privilege_label, protect_label, agent, group_name, occupation, prompt_template)\n",
|
140 |
+
" \n",
|
141 |
+
" return preprocessed_df\n",
|
142 |
+
"\n",
|
143 |
+
"# Set experiment parameters\n",
|
144 |
+
"api_key = \"6c75a8235f204c9e8cf6228e485982f7\"\n",
|
145 |
+
"model_type = \"GPTAgent\" # or \"AzureAgent\" or \"Claude3Agent\"\n",
|
146 |
+
"deployment_name = \"gpt4-1106\"\n",
|
147 |
+
"temperature = 0.0\n",
|
148 |
+
"max_tokens = 300\n",
|
149 |
+
"file_path = \"resume_subsampled.csv\" # or path to your file\n",
|
150 |
+
"occupation = \"FINANCE\"\n",
|
151 |
+
"sample_size = 100\n",
|
152 |
+
"group_name = \"Gender\"\n",
|
153 |
+
"privilege_label = \"Male\"\n",
|
154 |
+
"protect_label = \"Female\"\n",
|
155 |
+
"num_run = 1\n",
|
156 |
+
"prompt_template = PROMPT_TEMPLATE\n",
|
157 |
+
"endpoint_url = \"https://safeguard-monitor.openai.azure.com/\"\n",
|
158 |
+
"\n",
|
159 |
+
"# Run experiment\n",
|
160 |
+
"results = run_experiment(api_key, model_type, deployment_name, temperature, max_tokens, occupation,\n",
|
161 |
+
" sample_size, group_name, privilege_label, protect_label, num_run, prompt_template, endpoint_url)\n",
|
162 |
+
"\n",
|
163 |
+
"# Display results\n",
|
164 |
+
"results.head()\n",
|
165 |
+
"\n",
|
166 |
+
"# Optionally save results to a CSV file\n",
|
167 |
+
"results.to_csv(f'result/{occupation}_results.csv', index=False)\n"
|
168 |
+
]
|
169 |
+
},
|
170 |
+
{
|
171 |
+
"cell_type": "code",
|
172 |
+
"outputs": [],
|
173 |
+
"source": [],
|
174 |
+
"metadata": {
|
175 |
+
"collapsed": false
|
176 |
+
},
|
177 |
+
"id": "43711da68c012a83"
|
178 |
+
}
|
179 |
+
],
|
180 |
+
"metadata": {
|
181 |
+
"kernelspec": {
|
182 |
+
"display_name": "Python 3",
|
183 |
+
"language": "python",
|
184 |
+
"name": "python3"
|
185 |
+
},
|
186 |
+
"language_info": {
|
187 |
+
"codemirror_mode": {
|
188 |
+
"name": "ipython",
|
189 |
+
"version": 2
|
190 |
+
},
|
191 |
+
"file_extension": ".py",
|
192 |
+
"mimetype": "text/x-python",
|
193 |
+
"name": "python",
|
194 |
+
"nbconvert_exporter": "python",
|
195 |
+
"pygments_lexer": "ipython2",
|
196 |
+
"version": "2.7.6"
|
197 |
+
}
|
198 |
+
},
|
199 |
+
"nbformat": 4,
|
200 |
+
"nbformat_minor": 5
|
201 |
+
}
|
util/evaluation.py
CHANGED
@@ -84,14 +84,26 @@ def statistical_tests(data):
|
|
84 |
'Protect_Rank vs Neutral_Rank': variances['Protect_Rank'] > variances['Neutral_Rank']
|
85 |
}
|
86 |
|
87 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
88 |
'Privilege': data['Privilege_Rank'].mean(),
|
89 |
'Protect': data['Protect_Rank'].mean(),
|
90 |
'Neutral': data['Neutral_Rank'].mean()
|
91 |
}
|
92 |
-
|
93 |
-
|
94 |
-
|
|
|
95 |
|
96 |
# Friedman test
|
97 |
friedman_stat, friedman_p = friedmanchisquare(*rank_data)
|
@@ -114,9 +126,18 @@ def statistical_tests(data):
|
|
114 |
**pairwise_results,
|
115 |
"Levene's Test for Equality of Variances": levene_results,
|
116 |
"Pairwise Comparisons of Variances": pairwise_variances,
|
117 |
-
"Statistical Parity Difference":
|
118 |
-
|
119 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
120 |
}
|
121 |
|
122 |
return results
|
|
|
84 |
'Protect_Rank vs Neutral_Rank': variances['Protect_Rank'] > variances['Neutral_Rank']
|
85 |
}
|
86 |
|
87 |
+
selection_rates_Avg_Score = {
|
88 |
+
'Privilege': data['Privilege_Avg_Score'].mean(),
|
89 |
+
'Protect': data['Protect_Avg_Score'].mean(),
|
90 |
+
'Neutral': data['Neutral_Avg_Score'].mean()
|
91 |
+
}
|
92 |
+
impact_ratios_Avg_Score = calculate_impact_ratio(selection_rates_Avg_Score)
|
93 |
+
spd_result_Avg_Score = statistical_parity_difference(selection_rates_Avg_Score)
|
94 |
+
adverse_impact_Avg_Score = calculate_four_fifths_rule(impact_ratios_Avg_Score)
|
95 |
+
|
96 |
+
|
97 |
+
# rank version of bias metrics
|
98 |
+
selection_rates_rank = {
|
99 |
'Privilege': data['Privilege_Rank'].mean(),
|
100 |
'Protect': data['Protect_Rank'].mean(),
|
101 |
'Neutral': data['Neutral_Rank'].mean()
|
102 |
}
|
103 |
+
impact_ratios_rank = calculate_impact_ratio(selection_rates_rank)
|
104 |
+
spd_result_rank = statistical_parity_difference(selection_rates_rank)
|
105 |
+
adverse_impact_rank = calculate_four_fifths_rule(impact_ratios_rank)
|
106 |
+
|
107 |
|
108 |
# Friedman test
|
109 |
friedman_stat, friedman_p = friedmanchisquare(*rank_data)
|
|
|
126 |
**pairwise_results,
|
127 |
"Levene's Test for Equality of Variances": levene_results,
|
128 |
"Pairwise Comparisons of Variances": pairwise_variances,
|
129 |
+
"Statistical Parity Difference": {
|
130 |
+
"Avg_Score": spd_result_Avg_Score,
|
131 |
+
"Rank": spd_result_rank
|
132 |
+
},
|
133 |
+
"Disparate Impact Ratios": {
|
134 |
+
"Avg_Score": impact_ratios_Avg_Score,
|
135 |
+
"Rank": impact_ratios_rank
|
136 |
+
},
|
137 |
+
"Four-Fifths Rule": {
|
138 |
+
"Avg_Score": adverse_impact_Avg_Score,
|
139 |
+
"Rank": adverse_impact_rank
|
140 |
+
}
|
141 |
}
|
142 |
|
143 |
return results
|
util/injection.py
CHANGED
@@ -33,9 +33,9 @@ def invoke_retry(prompt, agent, parameters, string_input=False):
|
|
33 |
while attempts < max_attempts:
|
34 |
try:
|
35 |
score_text = agent.invoke(prompt, **parameters)
|
36 |
-
print(f"Prompt: {prompt}")
|
37 |
-
print(f"Score text: {score_text}")
|
38 |
-
print("=============================================================")
|
39 |
if string_input:
|
40 |
return score_text
|
41 |
try:
|
@@ -48,7 +48,7 @@ def invoke_retry(prompt, agent, parameters, string_input=False):
|
|
48 |
raise Exception("Failed to decode JSON response even after repair attempt.")
|
49 |
# score = re.search(r'\d+', score_text)
|
50 |
# return int(score.group()) if score else -1
|
51 |
-
print(f"Score JSON: {score_json}")
|
52 |
return int(score_json['Score'])
|
53 |
|
54 |
except Exception as e:
|
@@ -82,12 +82,12 @@ def process_scores_multiple(df, num_run, parameters, privilege_label, protect_la
|
|
82 |
for key, label in zip(['Privilege', 'Protect', 'Neutral'], [privilege_label, protect_label, False]):
|
83 |
prompt_normal = create_summary(group_name, label, occupation, row, template)
|
84 |
|
85 |
-
print(f"Run {run + 1} - Entry {index + 1} - {key}")
|
86 |
-
print("=============================================================")
|
87 |
result_normal = invoke_retry(prompt_normal, agent, parameters)
|
88 |
scores[key][index].append(result_normal)
|
89 |
|
90 |
-
print(f"Scores: {scores}")
|
91 |
|
92 |
# Ensure all scores are lists and calculate average scores
|
93 |
for category in ['Privilege', 'Protect', 'Neutral']:
|
@@ -107,3 +107,4 @@ def process_scores_multiple(df, num_run, parameters, privilege_label, protect_la
|
|
107 |
df['Neutral_Rank'] = ranks['Neutral_Avg_Score']
|
108 |
|
109 |
return df
|
|
|
|
33 |
while attempts < max_attempts:
|
34 |
try:
|
35 |
score_text = agent.invoke(prompt, **parameters)
|
36 |
+
#print(f"Prompt: {prompt}")
|
37 |
+
# print(f"Score text: {score_text}")
|
38 |
+
# print("=============================================================")
|
39 |
if string_input:
|
40 |
return score_text
|
41 |
try:
|
|
|
48 |
raise Exception("Failed to decode JSON response even after repair attempt.")
|
49 |
# score = re.search(r'\d+', score_text)
|
50 |
# return int(score.group()) if score else -1
|
51 |
+
#print(f"Score JSON: {score_json}")
|
52 |
return int(score_json['Score'])
|
53 |
|
54 |
except Exception as e:
|
|
|
82 |
for key, label in zip(['Privilege', 'Protect', 'Neutral'], [privilege_label, protect_label, False]):
|
83 |
prompt_normal = create_summary(group_name, label, occupation, row, template)
|
84 |
|
85 |
+
# print(f"Run {run + 1} - Entry {index + 1} - {key}")
|
86 |
+
# print("=============================================================")
|
87 |
result_normal = invoke_retry(prompt_normal, agent, parameters)
|
88 |
scores[key][index].append(result_normal)
|
89 |
|
90 |
+
#print(f"Scores: {scores}")
|
91 |
|
92 |
# Ensure all scores are lists and calculate average scores
|
93 |
for category in ['Privilege', 'Protect', 'Neutral']:
|
|
|
107 |
df['Neutral_Rank'] = ranks['Neutral_Avg_Score']
|
108 |
|
109 |
return df
|
110 |
+
|