Spaces:
Build error
Build error
fixed bugs in metrics
Browse files- eval_modules/calc_repetitions_v2e.py +30 -7
- llm_toolkit/translation_utils_v2.py +11 -1
- notebooks/00e_Data Analysis_Fine_Tuned_RPP_MNT_2048.ipynb +2 -2
- notebooks/00f_Data Analysis_Fine_Tuned_RPP_Generic_Prompt.ipynb +2 -2
- results/mac-results_rpp_with_mnt_2048_generic_prompt_metrics.csv +32 -27
- results/mac-results_rpp_with_mnt_2048_metrics.csv +43 -31
eval_modules/calc_repetitions_v2e.py
CHANGED
@@ -172,9 +172,18 @@ def load_for_repetition_penalty(
|
|
172 |
)
|
173 |
|
174 |
|
175 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
176 |
n = 1 - r / l if l > 0 else 0
|
177 |
-
return f * n
|
178 |
|
179 |
|
180 |
def calculate_adjusted_performance(row):
|
@@ -1138,7 +1147,9 @@ webqsp_csv_result_files = [
|
|
1138 |
]
|
1139 |
|
1140 |
|
1141 |
-
def calc_rap_scores(
|
|
|
|
|
1142 |
newline_score = [
|
1143 |
df["newline_score"].mean() for df in result["df_list_repetition_penalty"]
|
1144 |
]
|
@@ -1165,7 +1176,10 @@ def calc_rap_scores(result, precision="precision", recall="recall"):
|
|
1165 |
)
|
1166 |
]
|
1167 |
|
1168 |
-
rap = [
|
|
|
|
|
|
|
1169 |
|
1170 |
return newline_score, repetition_score, f1, rap, nrr
|
1171 |
|
@@ -1177,7 +1191,9 @@ def get_model_name(csv_result_file):
|
|
1177 |
return model_name
|
1178 |
|
1179 |
|
1180 |
-
def load_webqsp_result(
|
|
|
|
|
1181 |
result = {}
|
1182 |
for i, csv_result_file in enumerate(csv_result_files):
|
1183 |
try:
|
@@ -1205,7 +1221,7 @@ def load_webqsp_result(csv_result_files, force_recalculate=False, save=False):
|
|
1205 |
"file": csv_result_file,
|
1206 |
}
|
1207 |
newline_score, repetition_score, perf, rap, nrr = calc_rap_scores(
|
1208 |
-
result[model_name]
|
1209 |
)
|
1210 |
df["newline_score"] = newline_score
|
1211 |
df["repetition_score"] = repetition_score
|
@@ -1214,6 +1230,7 @@ def load_webqsp_result(csv_result_files, force_recalculate=False, save=False):
|
|
1214 |
df["nrr"] = nrr
|
1215 |
df["rap"] = rap
|
1216 |
df["rr"] = df["nrr"].apply(lambda x: 1 - x)
|
|
|
1217 |
if save:
|
1218 |
df.to_csv(csv_result_file, index=False)
|
1219 |
except Exception as e:
|
@@ -1224,7 +1241,11 @@ def load_webqsp_result(csv_result_files, force_recalculate=False, save=False):
|
|
1224 |
|
1225 |
|
1226 |
def load_ms_marco_result(
|
1227 |
-
csv_result_files,
|
|
|
|
|
|
|
|
|
1228 |
):
|
1229 |
result = {}
|
1230 |
for csv_result_file in csv_result_files:
|
@@ -1291,6 +1312,7 @@ def load_ms_marco_result(
|
|
1291 |
result[model_name],
|
1292 |
precision=col,
|
1293 |
recall=col,
|
|
|
1294 |
)
|
1295 |
df["newline_score"] = newline_score
|
1296 |
df["repetition_score"] = repetition_score
|
@@ -1299,6 +1321,7 @@ def load_ms_marco_result(
|
|
1299 |
df["nrr"] = nrr
|
1300 |
df["rap"] = rap
|
1301 |
df["rr"] = df["nrr"].apply(lambda x: 1 - x)
|
|
|
1302 |
|
1303 |
if save:
|
1304 |
df.to_csv(csv_result_file, index=False)
|
|
|
172 |
)
|
173 |
|
174 |
|
175 |
+
rap_penalty_functions = {
|
176 |
+
"linear": lambda x: x,
|
177 |
+
"quadratic": lambda x: x * x,
|
178 |
+
"cubic": lambda x: x * x * x,
|
179 |
+
"logarithmic": lambda x: math.log(x + 1, 2),
|
180 |
+
"exponential": lambda x: math.exp(x - 1),
|
181 |
+
}
|
182 |
+
|
183 |
+
|
184 |
+
def calc_adjusted_performance(f, r, l=1, penalty_function="cubic"):
|
185 |
n = 1 - r / l if l > 0 else 0
|
186 |
+
return f * rap_penalty_functions[penalty_function](n)
|
187 |
|
188 |
|
189 |
def calculate_adjusted_performance(row):
|
|
|
1147 |
]
|
1148 |
|
1149 |
|
1150 |
+
def calc_rap_scores(
|
1151 |
+
result, precision="precision", recall="recall", penalty_function="cubic"
|
1152 |
+
):
|
1153 |
newline_score = [
|
1154 |
df["newline_score"].mean() for df in result["df_list_repetition_penalty"]
|
1155 |
]
|
|
|
1176 |
)
|
1177 |
]
|
1178 |
|
1179 |
+
rap = [
|
1180 |
+
calc_adjusted_performance(f, 1 - n, penalty_function=penalty_function)
|
1181 |
+
for f, n in zip(f1, nrr)
|
1182 |
+
]
|
1183 |
|
1184 |
return newline_score, repetition_score, f1, rap, nrr
|
1185 |
|
|
|
1191 |
return model_name
|
1192 |
|
1193 |
|
1194 |
+
def load_webqsp_result(
|
1195 |
+
csv_result_files, force_recalculate=False, save=False, penalty_function="cubic"
|
1196 |
+
):
|
1197 |
result = {}
|
1198 |
for i, csv_result_file in enumerate(csv_result_files):
|
1199 |
try:
|
|
|
1221 |
"file": csv_result_file,
|
1222 |
}
|
1223 |
newline_score, repetition_score, perf, rap, nrr = calc_rap_scores(
|
1224 |
+
result[model_name], penalty_function=penalty_function
|
1225 |
)
|
1226 |
df["newline_score"] = newline_score
|
1227 |
df["repetition_score"] = repetition_score
|
|
|
1230 |
df["nrr"] = nrr
|
1231 |
df["rap"] = rap
|
1232 |
df["rr"] = df["nrr"].apply(lambda x: 1 - x)
|
1233 |
+
df["rrp"] = df["rr"].apply(lambda x: x * 100)
|
1234 |
if save:
|
1235 |
df.to_csv(csv_result_file, index=False)
|
1236 |
except Exception as e:
|
|
|
1241 |
|
1242 |
|
1243 |
def load_ms_marco_result(
|
1244 |
+
csv_result_files,
|
1245 |
+
force_recalculate=False,
|
1246 |
+
calc_bertscore=True,
|
1247 |
+
save=False,
|
1248 |
+
penalty_function="cubic",
|
1249 |
):
|
1250 |
result = {}
|
1251 |
for csv_result_file in csv_result_files:
|
|
|
1312 |
result[model_name],
|
1313 |
precision=col,
|
1314 |
recall=col,
|
1315 |
+
penalty_function=penalty_function,
|
1316 |
)
|
1317 |
df["newline_score"] = newline_score
|
1318 |
df["repetition_score"] = repetition_score
|
|
|
1321 |
df["nrr"] = nrr
|
1322 |
df["rap"] = rap
|
1323 |
df["rr"] = df["nrr"].apply(lambda x: 1 - x)
|
1324 |
+
df["rrp"] = df["rr"].apply(lambda x: x * 100)
|
1325 |
|
1326 |
if save:
|
1327 |
df.to_csv(csv_result_file, index=False)
|
llm_toolkit/translation_utils_v2.py
CHANGED
@@ -294,6 +294,7 @@ def get_metrics(df, max_output_tokens=2048, variant="rpp", existing_metrics_df=N
|
|
294 |
rr = []
|
295 |
num_max_output_tokens = []
|
296 |
translation_completeness = []
|
|
|
297 |
columns = df.columns[2:]
|
298 |
|
299 |
new_col = f"count_chinese_characters-ground_truth"
|
@@ -349,6 +350,9 @@ def get_metrics(df, max_output_tokens=2048, variant="rpp", existing_metrics_df=N
|
|
349 |
|
350 |
rr.append(df["total_repetitions"].mean() / df["answer_len"].mean())
|
351 |
|
|
|
|
|
|
|
352 |
model = col.split(f"/{variant}")[0].split("/checkpoint")[0]
|
353 |
|
354 |
new_col = f"ground_truth_tokens-{model}"
|
@@ -372,22 +376,28 @@ def get_metrics(df, max_output_tokens=2048, variant="rpp", existing_metrics_df=N
|
|
372 |
num_max_output_tokens.append(
|
373 |
count_entries_with_max_tokens(df[new_col], max_output_tokens)
|
374 |
)
|
375 |
-
|
376 |
metrics_df["comet"] = comet
|
377 |
metrics_df["meteor"] = meteor
|
378 |
metrics_df["spbleu"] = spbleu
|
379 |
metrics_df["bleu_1"] = bleu_1
|
380 |
metrics_df["rouge_l"] = rouge_l
|
381 |
metrics_df["ews_score"] = ews_score
|
|
|
382 |
metrics_df["repetition_score"] = repetition_score
|
383 |
metrics_df["total_repetitions"] = total_repetitions
|
384 |
metrics_df["rr"] = rr
|
|
|
|
|
|
|
|
|
385 |
metrics_df["rap"] = metrics_df.apply(
|
386 |
lambda x: calc_adjusted_performance(x["comet"], x["rr"]), axis=1
|
387 |
)
|
388 |
|
389 |
metrics_df["translation_completeness"] = translation_completeness
|
390 |
metrics_df["num_max_output_tokens"] = num_max_output_tokens
|
|
|
391 |
|
392 |
if variant != "rpp":
|
393 |
metrics_df[variant] = metrics_df[variant].astype(int)
|
|
|
294 |
rr = []
|
295 |
num_max_output_tokens = []
|
296 |
translation_completeness = []
|
297 |
+
percentage_of_repeated_entries = []
|
298 |
columns = df.columns[2:]
|
299 |
|
300 |
new_col = f"count_chinese_characters-ground_truth"
|
|
|
350 |
|
351 |
rr.append(df["total_repetitions"].mean() / df["answer_len"].mean())
|
352 |
|
353 |
+
r, t = df[df["total_repetitions"] > 0].shape[0], df.shape[0]
|
354 |
+
percentage_of_repeated_entries.append(100 * r / t)
|
355 |
+
|
356 |
model = col.split(f"/{variant}")[0].split("/checkpoint")[0]
|
357 |
|
358 |
new_col = f"ground_truth_tokens-{model}"
|
|
|
376 |
num_max_output_tokens.append(
|
377 |
count_entries_with_max_tokens(df[new_col], max_output_tokens)
|
378 |
)
|
379 |
+
|
380 |
metrics_df["comet"] = comet
|
381 |
metrics_df["meteor"] = meteor
|
382 |
metrics_df["spbleu"] = spbleu
|
383 |
metrics_df["bleu_1"] = bleu_1
|
384 |
metrics_df["rouge_l"] = rouge_l
|
385 |
metrics_df["ews_score"] = ews_score
|
386 |
+
metrics_df["newline_score"] = ews_score
|
387 |
metrics_df["repetition_score"] = repetition_score
|
388 |
metrics_df["total_repetitions"] = total_repetitions
|
389 |
metrics_df["rr"] = rr
|
390 |
+
metrics_df["rrp"] = metrics_df["rr"].apply(
|
391 |
+
lambda x: x * 100
|
392 |
+
)
|
393 |
+
metrics_df["perf"] = metrics_df["comet"]
|
394 |
metrics_df["rap"] = metrics_df.apply(
|
395 |
lambda x: calc_adjusted_performance(x["comet"], x["rr"]), axis=1
|
396 |
)
|
397 |
|
398 |
metrics_df["translation_completeness"] = translation_completeness
|
399 |
metrics_df["num_max_output_tokens"] = num_max_output_tokens
|
400 |
+
metrics_df["percentage_of_repeated_entries"] = percentage_of_repeated_entries
|
401 |
|
402 |
if variant != "rpp":
|
403 |
metrics_df[variant] = metrics_df[variant].astype(int)
|
notebooks/00e_Data Analysis_Fine_Tuned_RPP_MNT_2048.ipynb
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:39e2555b6c94d217415635bcfede10150bbccf4b1ab1df0baeea9687303963c8
|
3 |
+
size 2683424
|
notebooks/00f_Data Analysis_Fine_Tuned_RPP_Generic_Prompt.ipynb
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ba968d834c1dcff62626ce158426ae25a8b9ed6f1749b827ff4f8c16815e632e
|
3 |
+
size 65147929
|
results/mac-results_rpp_with_mnt_2048_generic_prompt_metrics.csv
CHANGED
@@ -1,27 +1,32 @@
|
|
1 |
-
model,rpp,comet,meteor,spbleu,bleu_1,rouge_l,ews_score,repetition_score,total_repetitions,rr,rap,translation_completeness,num_max_output_tokens
|
2 |
-
|
3 |
-
|
4 |
-
internlm/internlm2_5-7b-chat,1.
|
5 |
-
internlm/internlm2_5-7b-chat,1.
|
6 |
-
internlm/internlm2_5-7b-chat,1.
|
7 |
-
internlm/internlm2_5-7b-chat,1.
|
8 |
-
|
9 |
-
|
10 |
-
microsoft/Phi-3.5-mini-instruct,1.
|
11 |
-
microsoft/Phi-3.5-mini-instruct,1.
|
12 |
-
microsoft/Phi-3.5-mini-instruct,1.
|
13 |
-
microsoft/Phi-3.5-mini-instruct,1.
|
14 |
-
|
15 |
-
|
16 |
-
shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.
|
17 |
-
shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.
|
18 |
-
shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.
|
19 |
-
shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.
|
20 |
-
shenzhi-wang/Llama3.1-
|
21 |
-
shenzhi-wang/Llama3.1-
|
22 |
-
shenzhi-wang/
|
23 |
-
shenzhi-wang/
|
24 |
-
shenzhi-wang/
|
25 |
-
shenzhi-wang/
|
26 |
-
shenzhi-wang/
|
27 |
-
shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat,1.
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
model,rpp,comet,meteor,spbleu,bleu_1,rouge_l,ews_score,newline_score,repetition_score,total_repetitions,rr,rrp,perf,rap,translation_completeness,num_max_output_tokens,percentage_of_repeated_entries
|
2 |
+
Qwen/Qwen2-72B-Instruct,1.00,0.7457879471336964,0.4655033970145371,11.09789843749336,0.1109789843749336,0.4305484344779789,20.488967343336274,20.488967343336274,27.70079435127979,48.18976169461607,0.27753124825269027,27.753124825269026,0.7457879471336964,0.2812370692049126,0.9902912621359223,10,4.148278905560459
|
3 |
+
Qwen/Qwen2-72B-Instruct,1.02,0.7498899340261324,0.4659242900672017,15.078064414763237,0.1507806441476323,0.4322160855974918,2.8102383053839364,2.8102383053839364,15.191526919682259,18.001765225066197,0.1255625257792252,12.55625257792252,0.7498899340261324,0.5013993970649349,0.9947043248014121,3,3.795233892321271
|
4 |
+
internlm/internlm2_5-7b-chat,1.00,0.7357995069773978,0.4297612514398102,15.060226683930628,0.1506022668393063,0.4097577795330234,0.04942630185348632,0.04942630185348632,9.235657546337158,9.285083848190645,0.07525035765379114,7.525035765379114,0.7357995069773978,0.581878095297299,1.0,2,1.853486319505737
|
5 |
+
internlm/internlm2_5-7b-chat,1.02,0.7377187550620283,0.4246676977198055,14.728605282752795,0.147286052827528,0.4063246630867048,0.06972639011473963,0.06972639011473963,5.35657546337158,5.426301853486319,0.04625547346404442,4.625547346404442,0.7377187550620283,0.6400103546749837,1.0,1,2.294792586054722
|
6 |
+
internlm/internlm2_5-7b-chat,1.04,0.7371160490183523,0.4173352728374962,13.846403511622256,0.1384640351162226,0.3988121301027288,0.06884377758164166,0.06884377758164166,5.315092674315975,5.383936451897617,0.04501878242643857,4.5018782426438575,0.7371160490183523,0.6419783129560218,1.0,1,2.118270079435128
|
7 |
+
internlm/internlm2_5-7b-chat,1.06,0.7338597697698218,0.3997609847704189,12.213374588416173,0.1221337458841617,0.3841365748920261,0.05825242718446602,0.05825242718446602,5.275375110326567,5.333627537511033,0.043830827367611756,4.3830827367611755,0.7338597697698218,0.6415304775228277,1.0,1,1.853486319505737
|
8 |
+
internlm/internlm2_5-7b-chat,1.08,0.7318234702626478,0.3881614120395272,11.369735763522288,0.1136973576352228,0.372963223209074,0.06707855251544571,0.06707855251544571,5.283318623124448,5.350397175639894,0.04300663332269164,4.300663332269164,0.7318234702626478,0.6414061446416202,1.0,1,2.030008826125331
|
9 |
+
internlm/internlm2_5-7b-chat,1.10,0.7288648442604431,0.3784182249483568,10.377989030628608,0.103779890306286,0.3618424457502351,0.05207413945278023,0.05207413945278023,5.288614298323036,5.340688437775817,0.042176064682512025,4.217606468251202,0.7288648442604431,0.6404777687452995,1.0,1,1.941747572815534
|
10 |
+
microsoft/Phi-3.5-mini-instruct,1.00,0.710605339281136,0.3788926591792472,9.70032874202361,0.097003287420236,0.3556134739443916,5.390997352162401,5.390997352162401,12.997352162400706,18.388349514563107,0.13770903562694164,13.770903562694164,0.710605339281136,0.4556065638568846,1.0,4,2.118270079435128
|
11 |
+
microsoft/Phi-3.5-mini-instruct,1.02,0.7150978385770836,0.3741049510326346,9.910633597905436,0.0991063359790543,0.3453160556383774,3.586054721977052,3.586054721977052,7.001765225066196,10.587819947043249,0.08180522500528503,8.180522500528504,0.7150978385770836,0.5535666483700645,1.0,2,1.147396293027361
|
12 |
+
microsoft/Phi-3.5-mini-instruct,1.04,0.7074641684778791,0.3538698731015666,9.19721270538052,0.0919721270538052,0.3225824135517728,0.05119152691968226,0.05119152691968226,0.05560458958517211,0.10679611650485436,0.000859149229250836,0.0859149229250836,0.7074641684778791,0.7056422827612971,1.0,0,1.147396293027361
|
13 |
+
microsoft/Phi-3.5-mini-instruct,1.06,0.6962301708225224,0.3252854575717334,6.967166383106307,0.069671663831063,0.2948764736589108,0.0353045013239188,0.0353045013239188,0.06796116504854369,0.10326566637246248,0.0007865281839265906,0.07865281839265906,0.6962301708225224,0.6945886486476809,1.0,0,1.235657546337158
|
14 |
+
microsoft/Phi-3.5-mini-instruct,1.08,0.6823413657174107,0.301599095293242,5.452744292893752,0.0545274429289375,0.2726387617958179,0.07678729037952339,0.07678729037952339,0.04766107678729038,0.12444836716681378,0.0009016671249608319,0.0901667124960832,0.6823413657174107,0.6804972951227785,1.0,0,1.500441306266549
|
15 |
+
microsoft/Phi-3.5-mini-instruct,1.10,0.6717851540206916,0.2885734336603344,4.751039447225815,0.0475103944722581,0.2604284999048123,0.08031774051191527,0.08031774051191527,0.02383053839364519,0.10414827890556046,0.0007188284314919954,0.07188284314919954,0.6717851540206916,0.6703375003284932,1.0,0,1.676963812886143
|
16 |
+
shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.00,0.739080294072365,0.4490104515425626,6.7013404492782405,0.0670134044927823,0.4196181637680596,0.36716681376875554,0.36716681376875554,139.80935569285083,140.1765225066196,0.48362195756964893,48.362195756964894,0.739080294072365,0.10176417668651536,0.999117387466902,15,4.766107678729038
|
17 |
+
shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.02,0.743018615750854,0.4514907128972251,8.545954556237808,0.085459545562378,0.4214940415288087,1.0035304501323918,1.0035304501323918,67.00353045013239,68.00706090026479,0.2929644725635723,29.296447256357226,0.743018615750854,0.2626173445806161,1.0,6,3.353927625772286
|
18 |
+
shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.04,0.7432195577780335,0.4517500968367987,10.080425294411064,0.1008042529441106,0.4200973007348334,0.01059135039717564,0.01059135039717564,35.19770520741395,35.208296557811124,0.17564306911947306,17.564306911947305,0.7432195577780335,0.4163542580947422,1.0,6,2.912621359223301
|
19 |
+
shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.06,0.7430821573139815,0.4484154407825542,10.37470506193322,0.1037470506193321,0.4160289393328045,1.8005295675198587,1.8005295675198587,26.880847308031775,28.68137687555163,0.1522966823356282,15.22966823356282,0.7430821573139815,0.45265620897504616,1.0,3,2.5595763459841128
|
20 |
+
shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.08,0.7435937259684909,0.4407733547418294,10.930453247368872,0.1093045324736887,0.4113063412348818,0.09267431597528684,0.09267431597528684,12.007943512797882,12.100617828773169,0.06721477842655646,6.721477842655646,0.7435937259684909,0.6035047423944578,1.0,3,2.471315092674316
|
21 |
+
shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.10,0.7427059700687901,0.4358940590119784,11.381344076286156,0.1138134407628615,0.4062980635945339,0.03971756398940865,0.03971756398940865,0.6681376875551632,0.707855251544572,0.003961824217515018,0.3961824217515018,0.7427059700687901,0.7339134850401315,1.0,1,2.64783759929391
|
22 |
+
shenzhi-wang/Llama3.1-8B-Chinese-Chat,1.00,0.3888604919913587,0.2055875758168277,0.2434587181959752,0.0024345871819597,0.1844552188025856,638.2797881729921,638.2797881729921,3889.9232127096207,4528.203000882612,0.9210262088655917,92.10262088655917,0.3888604919913587,0.00019153263422509847,0.9240953221535746,570,89.40864960282435
|
23 |
+
shenzhi-wang/Llama3.1-8B-Chinese-Chat,1.02,0.401959364669889,0.2020993340187826,0.2473696547531083,0.002473696547531,0.1795542969510355,611.315975286849,611.315975286849,3759.7599293909975,4371.075904677847,0.8883366762655638,88.83366762655638,0.401959364669889,0.0005596465146821489,0.912621359223301,562,88.87908208296558
|
24 |
+
shenzhi-wang/Llama3.1-8B-Chinese-Chat,1.04,0.4185898168652013,0.1966595434176072,0.2555659721821757,0.0025556597218217,0.1694446848890552,565.2135922330098,565.2135922330098,3662.4289496910856,4227.642541924095,0.8709795612734481,87.09795612734482,0.4185898168652013,0.0008990093364229676,0.8702559576345984,524,87.99646954986761
|
25 |
+
shenzhi-wang/Llama3.1-8B-Chinese-Chat,1.06,0.4381090624082014,0.1859205661699343,0.2641725452341745,0.0026417254523417,0.1557244147977393,515.5233892321271,515.5233892321271,3521.3830538393645,4036.9064430714916,0.8049867903962987,80.49867903962988,0.4381090624082014,0.0032491841590587176,0.8402471315092674,514,86.31950573698147
|
26 |
+
shenzhi-wang/Llama3.1-8B-Chinese-Chat,1.08,0.4538654954503935,0.1738923185096792,0.2607684997123511,0.0026076849971235,0.1432980302984092,480.180052956752,480.180052956752,3378.7625772285965,3858.9426301853487,0.7472985052942835,74.72985052942835,0.4538654954503935,0.007324038178749702,0.8331862312444837,477,85.3486319505737
|
27 |
+
shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat,1.00,0.7222260562908512,0.4039898602650971,13.461179673541356,0.1346117967354136,0.3819960428004565,0.05736981465136805,0.05736981465136805,5.87378640776699,5.931156222418358,0.05150372482295595,5.1503724822955945,0.7222260562908512,0.6162827926700337,1.0,1,2.471315092674316
|
28 |
+
shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat,1.02,0.723643534970515,0.4051102919608809,13.18537912294539,0.1318537912294539,0.3824621732976229,0.06266548984995587,0.06266548984995587,5.840247131509267,5.902912621359223,0.05148734372113075,5.148734372113075,0.723643534970515,0.617524335498735,1.0,1,2.471315092674316
|
29 |
+
shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat,1.04,0.7238812581796301,0.4039456988919502,13.314773371306682,0.1331477337130668,0.3813737464821349,0.05736981465136805,0.05736981465136805,5.845542806707855,5.902912621359223,0.05127418810757766,5.127418810757766,0.7238812581796301,0.6181437496179476,1.0,1,2.206531332744925
|
30 |
+
shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat,1.06,0.7252625281686607,0.4012797167602334,13.19924345265053,0.1319924345265053,0.3798291332004637,0.06266548984995587,0.06266548984995587,5.847308031774051,5.909973521624007,0.05081388730791121,5.081388730791121,0.7252625281686607,0.6202251404786316,1.0,1,2.383053839364519
|
31 |
+
shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat,1.08,0.7261167238322592,0.3987395126194482,12.656486100206328,0.1265648610020633,0.376975448872996,0.05648720211827008,0.05648720211827008,5.820829655781112,5.877316857899382,0.05012721880128273,5.0127218801282725,0.7261167238322592,0.6223042523816,1.0,1,2.118270079435128
|
32 |
+
shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat,1.10,0.7264630642225547,0.3964859769229444,12.284961706379857,0.1228496170637985,0.3744555065346823,0.04942630185348632,0.04942630185348632,0.09267431597528684,0.14210061782877317,0.001266948385624464,0.1266948385624464,0.7264630642225547,0.7237053873903428,1.0,0,1.7652250661959399
|
results/mac-results_rpp_with_mnt_2048_metrics.csv
CHANGED
@@ -1,31 +1,43 @@
|
|
1 |
-
model,rpp,comet,meteor,spbleu,bleu_1,rouge_l,ews_score,repetition_score,total_repetitions,rr,rap,translation_completeness,num_max_output_tokens
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
shenzhi-wang/
|
27 |
-
shenzhi-wang/
|
28 |
-
shenzhi-wang/
|
29 |
-
shenzhi-wang/
|
30 |
-
shenzhi-wang/
|
31 |
-
shenzhi-wang/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
model,rpp,comet,meteor,spbleu,bleu_1,rouge_l,ews_score,newline_score,repetition_score,total_repetitions,rr,rrp,perf,rap,translation_completeness,num_max_output_tokens,percentage_of_repeated_entries
|
2 |
+
Qwen/Qwen2-72B-Instruct,1.00,0.7570170721487987,0.4789208192601603,18.953782447211417,0.1895378244721141,0.4522514493620528,0.07855251544571933,0.07855251544571933,0.06266548984995587,0.1412180052956752,0.0012165820128349403,0.12165820128349403,0.7570170721487987,0.7542575120443845,1.0,0,2.206531332744925
|
3 |
+
Qwen/Qwen2-72B-Instruct,1.02,0.7573583127909388,0.4796836675632054,19.0137490702917,0.190137490702917,0.4526922175986438,0.0997352162400706,0.0997352162400706,0.07413945278022947,0.17387466902030008,0.0014956534942869072,0.1495653494286907,0.7573583127909388,0.7539651560209267,1.0,0,2.64783759929391
|
4 |
+
Qwen/Qwen2-72B-Instruct,1.04,0.7574985287458835,0.476362681282195,18.52063321160408,0.1852063321160408,0.4491791286896135,0.08384819064430715,0.08384819064430715,0.08649602824360106,0.1703442188879082,0.0014611139290337722,0.1461113929033772,0.7574985287458835,0.7541830028744213,1.0,0,2.471315092674316
|
5 |
+
Qwen/Qwen2-72B-Instruct,1.06,0.7563852697268905,0.4679636383950649,17.982473951038504,0.1798247395103851,0.4414302384776113,0.07325684024713151,0.07325684024713151,0.05472197705207414,0.12797881729920565,0.0010829866531231096,0.10829866531231096,0.7563852697268905,0.7539304647132251,1.0,0,2.118270079435128
|
6 |
+
Qwen/Qwen2-72B-Instruct,1.08,0.7554471646908207,0.4597578197711645,17.067954025424825,0.1706795402542483,0.4319052863160809,0.06354810238305383,0.06354810238305383,0.03795233892321271,0.10150044130626655,0.0008500258703525761,0.0850025870352576,0.7554471646908207,0.7535223528572281,1.0,0,1.853486319505737
|
7 |
+
Qwen/Qwen2-72B-Instruct,1.10,0.75500979801247,0.4515778511124262,16.22452191616505,0.1622452191616504,0.4244761354058755,0.05736981465136805,0.05736981465136805,0.02824360105913504,0.08561341571050309,0.0007121356728580868,0.07121356728580867,0.75500979801247,0.7533979381889341,1.0,0,1.676963812886143
|
8 |
+
Qwen/Qwen2-7B-Instruct,1.00,0.7457723188010971,0.442240791493943,14.38814929350883,0.1438814929350883,0.4162759057739747,0.05825242718446602,0.05825242718446602,10.774933804060018,10.833186231244484,0.08396037978493448,8.396037978493448,0.7457723188010971,0.5732565603116205,0.9947043248014121,2,2.294792586054722
|
9 |
+
Qwen/Qwen2-7B-Instruct,1.02,0.7474082109944966,0.4400998640836595,15.16172261831792,0.1516172261831792,0.4165035906118025,0.05648720211827008,0.05648720211827008,7.036187113857017,7.0926743159752865,0.057428303949803826,5.742830394980382,0.7474082109944966,0.6258943742595823,0.9947043248014121,1,2.294792586054722
|
10 |
+
Qwen/Qwen2-7B-Instruct,1.04,0.7484377450576842,0.4390136558190875,14.958631815014014,0.1495863181501401,0.4142970757686492,0.0529567519858782,0.0529567519858782,0.10856134157105031,0.1615180935569285,0.001379851157039126,0.1379851157039126,0.7484377450576842,0.745343820078089,0.999117387466902,0,1.853486319505737
|
11 |
+
Qwen/Qwen2-7B-Instruct,1.06,0.7471614604436078,0.4328321576515084,14.28087386760537,0.1428087386760537,0.4073319764254861,0.06707855251544571,0.06707855251544571,0.11032656663724624,0.17740511915269197,0.0015113918339724791,0.1511391833972479,0.7471614604436078,0.743778816908725,0.9982347749338041,0,2.206531332744925
|
12 |
+
Qwen/Qwen2-7B-Instruct,1.08,0.7451942060389355,0.423560805217557,13.659683698817108,0.1365968369881711,0.3968597388257214,0.06443071491615181,0.06443071491615181,0.13062665489849956,0.19505736981465135,0.0016449082275183469,0.1644908227518347,0.7451942060389355,0.7415229233479677,1.0,0,2.206531332744925
|
13 |
+
Qwen/Qwen2-7B-Instruct,1.10,0.7432072967790653,0.4135053136541433,12.922649874705083,0.1292264987470507,0.3878186285063445,0.06707855251544571,0.06707855251544571,0.0997352162400706,0.16681376875551632,0.0013848487290898832,0.1384848729089883,0.7432072967790653,0.7401238817462625,1.0,0,2.030008826125331
|
14 |
+
internlm/internlm2_5-7b-chat,1.00,0.739699612254078,0.4289996929258777,14.734881589173108,0.1473488158917311,0.4096466800937898,0.05383936451897617,0.05383936451897617,12.606354810238305,12.660194174757281,0.10371655820679682,10.371655820679681,0.739699612254078,0.5325881640465967,1.0,2,2.030008826125331
|
15 |
+
internlm/internlm2_5-7b-chat,1.02,0.740223803961056,0.4266246904302194,14.583816688798017,0.1458381668879802,0.4071727106228415,0.06266548984995587,0.06266548984995587,9.849073256840247,9.911738746690203,0.0832234063051179,8.32234063051179,0.740223803961056,0.5703659582906754,1.0,1,2.118270079435128
|
16 |
+
internlm/internlm2_5-7b-chat,1.04,0.7398856264610577,0.4154585167056314,13.534659133050225,0.1353465913305021,0.3968657713589718,0.07237422771403354,0.07237422771403354,6.529567519858782,6.601941747572815,0.05613508442776736,5.613508442776736,0.7398856264610577,0.6221485884888651,1.0,1,2.294792586054722
|
17 |
+
internlm/internlm2_5-7b-chat,1.06,0.7379362287241489,0.4039588647855378,12.346740971499404,0.1234674097149939,0.3872447044295494,0.06796116504854369,0.06796116504854369,6.533980582524272,6.601941747572815,0.05513987689359035,5.513987689359035,0.7379362287241489,0.6224742543197805,0.999117387466902,1,2.383053839364519
|
18 |
+
internlm/internlm2_5-7b-chat,1.08,0.7319988705684732,0.3873176839854818,11.075674965706344,0.1107567496570634,0.3724352909668609,0.05207413945278023,0.05207413945278023,9.83495145631068,9.88702559576346,0.07906717392378437,7.906717392378438,0.7319988705684732,0.5717343310562308,0.999117387466902,1,1.941747572815534
|
19 |
+
internlm/internlm2_5-7b-chat,1.10,0.7295350462119345,0.3769306874386757,10.305163787094209,0.1030516378709421,0.3634496155759507,0.07855251544571933,0.07855251544571933,6.527802294792586,6.606354810238305,0.053004659594657756,5.300465959465775,0.7295350462119345,0.6195690090849183,0.999117387466902,1,2.383053839364519
|
20 |
+
microsoft/Phi-3.5-mini-instruct,1.00,0.7107840433177544,0.3796831545348129,8.71296896471494,0.0871296896471493,0.3589874395901284,10.670785525154457,10.670785525154457,17.93821712268314,28.6090026478376,0.20225504327262062,20.225504327262062,0.7107840433177544,0.3608526271635592,1.0,6,2.030008826125331
|
21 |
+
microsoft/Phi-3.5-mini-instruct,1.02,0.7164765837070485,0.3780585837553919,10.291240080163629,0.1029124008016362,0.3546952732427276,3.585172109443954,3.585172109443954,7.1403353927625774,10.725507502206531,0.08530053839296368,8.530053839296368,0.7164765837070485,0.5483240204881398,1.0,2,1.323918799646955
|
22 |
+
microsoft/Phi-3.5-mini-instruct,1.04,0.7111233387336411,0.3547161333845742,8.966881655527896,0.0896688165552789,0.3300979657678754,3.6125330979699912,3.6125330979699912,0.07325684024713151,3.685789938217123,0.02973427131098516,2.973427131098516,0.7111233387336411,0.6495566110355127,1.0,1,1.412180052956752
|
23 |
+
microsoft/Phi-3.5-mini-instruct,1.06,0.7024363270136286,0.3298733737040869,7.076233088011138,0.0707623308801113,0.3019513312669543,0.04589585172109444,0.04589585172109444,0.05207413945278023,0.09796999117387467,0.0007571675113745661,0.0757167511374566,0.7024363270136286,0.7008419489376413,1.0,0,1.412180052956752
|
24 |
+
microsoft/Phi-3.5-mini-instruct,1.08,0.6882111219210848,0.3054541022592767,5.105510599247868,0.0510551059924786,0.2736030007297014,3.3609885260370698,3.3609885260370698,0.06443071491615181,3.4254192409532216,0.023581380370521147,2.3581380370521146,0.6882111219210848,0.6406632969877,1.0,1,3.089143865842895
|
25 |
+
microsoft/Phi-3.5-mini-instruct,1.10,0.6712992989638161,0.2903831801547132,4.091958857999118,0.0409195885799911,0.251653275009876,0.32215357458075905,0.32215357458075905,0.06531332744924978,0.38746690203000883,0.0023407216247487324,0.23407216247487322,0.6712992989638161,0.6665963500989894,1.0,0,6.443071491615181
|
26 |
+
shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.00,0.7501818982248062,0.4611110508507017,17.87914973742753,0.1787914973742752,0.4340662057009564,0.00706090026478376,0.00706090026478376,0.1262135922330097,0.13327449249779347,0.0011265209898463904,0.11265209898463904,0.7501818982248062,0.7476494662426587,1.0,0,1.853486319505737
|
27 |
+
shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.02,0.7485114382045625,0.4571517219079576,17.436884594979905,0.174368845949799,0.4311385932640979,0.00706090026478376,0.00706090026478376,0.11562224183583407,0.12268314210061783,0.0010382199383043404,0.10382199383043404,0.7485114382045625,0.7461824993322019,1.0,0,1.588702559576346
|
28 |
+
shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.04,0.7500591586357918,0.4560467960364254,17.440173470996626,0.1744017347099662,0.4302844557731285,0.00706090026478376,0.00706090026478376,0.13062665489849956,0.13768755516328332,0.0011593944393659004,0.11593944393659004,0.7500591586357918,0.7474533388920676,1.0,0,1.7652250661959399
|
29 |
+
shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.06,0.748812871571673,0.4520416361219855,16.89523258317781,0.168952325831778,0.4260026774745837,0.00706090026478376,0.00706090026478376,0.0997352162400706,0.10679611650485436,0.0008902491962006224,0.08902491962006225,0.748812871571673,0.7468147612728927,1.0,0,1.500441306266549
|
30 |
+
shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.08,0.7473851635144647,0.4442106511292453,16.16623784482793,0.1616623784482792,0.4195129470585874,0.01059135039717564,0.01059135039717564,0.13062665489849956,0.1412180052956752,0.001176591707969938,0.1176591707969938,0.7473851635144647,0.7447501647073623,1.0,0,1.676963812886143
|
31 |
+
shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.10,0.7465709781131172,0.4379837926138161,15.60172257624066,0.1560172257624066,0.4132562932940978,0.01059135039717564,0.01059135039717564,0.07855251544571933,0.08914386584289496,0.000734476013176936,0.0734476013176936,0.7465709781131172,0.7449271706150111,1.0,0,1.412180052956752
|
32 |
+
shenzhi-wang/Llama3.1-8B-Chinese-Chat,1.00,0.7426396049131678,0.433632501662176,15.209540658023398,0.1520954065802339,0.4089208235151474,0.00353045013239188,0.00353045013239188,3.901147396293027,3.904677846425419,0.03237065275450547,3.237065275450547,0.7426396049131678,0.6728297734832243,1.0,1,1.853486319505737
|
33 |
+
shenzhi-wang/Llama3.1-8B-Chinese-Chat,1.02,0.7436477056353469,0.4329054166518245,15.19102241646024,0.1519102241646024,0.4068967964789407,0.0,0.0,3.8905560458958517,3.8905560458958517,0.03219656852361788,3.219656852361788,0.7436477056353469,0.6741068111074712,1.0,1,1.676963812886143
|
34 |
+
shenzhi-wang/Llama3.1-8B-Chinese-Chat,1.04,0.7440943776351209,0.4320478700956207,15.05135166158296,0.1505135166158296,0.4062008380201262,0.00353045013239188,0.00353045013239188,0.1526919682259488,0.1562224183583407,0.001352332200022921,0.1352332200022921,0.7440943776351209,0.7410796698393737,1.0,0,1.588702559576346
|
35 |
+
shenzhi-wang/Llama3.1-8B-Chinese-Chat,1.06,0.7426502735395928,0.4275429314912545,14.449130821290163,0.1444913082129016,0.4001409979222783,0.00706090026478376,0.00706090026478376,0.13768755516328332,0.14474845542806708,0.0012399256044637321,0.12399256044637322,0.7426502735395928,0.7398912041420567,1.0,0,1.588702559576346
|
36 |
+
shenzhi-wang/Llama3.1-8B-Chinese-Chat,1.08,0.7408098006080129,0.4206626658729054,13.933703757385222,0.1393370375738522,0.3964824268676203,0.00353045013239188,0.00353045013239188,0.1297440423654016,0.13327449249779347,0.001134996993385448,0.1134996993385448,0.7408098006080129,0.7382902118097237,1.0,0,1.235657546337158
|
37 |
+
shenzhi-wang/Llama3.1-8B-Chinese-Chat,1.10,0.7392685912871718,0.4111211240399151,13.303738403756984,0.1330373840375698,0.3870959581563503,0.00353045013239188,0.00353045013239188,0.12180052956751986,0.12533097969991175,0.0010529672171262895,0.10529672171262895,0.7392685912871718,0.7369357726201563,1.0,0,1.500441306266549
|
38 |
+
shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat,1.00,0.7240239171358935,0.4068335357738006,13.565136550617618,0.1356513655061761,0.3866395067055498,0.0529567519858782,0.0529567519858782,0.1209179170344219,0.17387466902030008,0.001578993772192076,0.1578993772192076,0.7240239171358935,0.7205996419729696,1.0,0,2.471315092674316
|
39 |
+
shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat,1.02,0.7263097057327799,0.4064914781094827,13.42987641622816,0.1342987641622816,0.3863697821025159,0.06001765225066196,0.06001765225066196,6.236540158870256,6.296557811120918,0.0541899611084103,5.4189961108410305,0.7263097057327799,0.6145165811709306,1.0,1,2.471315092674316
|
40 |
+
shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat,1.04,0.7276128307708258,0.4054859896994975,13.295092218891954,0.1329509221889195,0.3851203729935697,0.05207413945278023,0.05207413945278023,0.1297440423654016,0.18181818181818182,0.0016533037985858635,0.16533037985858634,0.7276128307708258,0.7240098989116803,1.0,0,2.294792586054722
|
41 |
+
shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat,1.06,0.7276865132383193,0.4014727027723293,13.10860799057166,0.1310860799057166,0.3804952786306688,0.05207413945278023,0.05207413945278023,0.13415710503089143,0.18623124448367168,0.001691057431836761,0.1691057431836761,0.7276865132383193,0.7240010735018495,1.0,0,2.383053839364519
|
42 |
+
shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat,1.08,0.726393195584298,0.3987018836449559,12.850537785783194,0.1285053778578319,0.3788945955746495,0.05648720211827008,0.05648720211827008,0.15357458075904679,0.21006178287731686,0.0018871365478087807,0.18871365478087807,0.726393195584298,0.7222885419382362,1.0,0,2.206531332744925
|
43 |
+
shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat,1.10,0.7244012304511832,0.3932239948456176,12.361161644811926,0.1236116164481192,0.3733413807007665,0.05030891438658429,0.05030891438658429,0.08561341571050309,0.13592233009708737,0.0012217374057913526,0.12217374057913527,0.7244012304511832,0.721749388705754,1.0,0,1.853486319505737
|