dh-mc committed
Commit 83aa28d
1 Parent(s): 34070ca

v2e results

app.py CHANGED
@@ -2,8 +2,9 @@ import os
 import sys
 import evaluate
 import gradio as gr
- from huggingface_hub import InferenceClient
+ from huggingface_hub import InferenceClient, login
 from dotenv import find_dotenv, load_dotenv
+ from huggingface_hub import login

 found_dotenv = find_dotenv(".env")

@@ -18,11 +19,14 @@ sys.path.append(path)

 from llm_toolkit.llm_utils import *
 from llm_toolkit.translation_utils import *
- from eval_modules.calc_repetitions_v2d import detect_repetitions
+ from eval_modules.calc_repetitions_v2e import detect_repetitions

 model_name = os.getenv("MODEL_NAME") or "microsoft/Phi-3.5-mini-instruct"
 num_shots = int(os.getenv("NUM_SHOTS", 10))
 data_path = os.getenv("DATA_PATH")
+ hf_token = os.getenv("HF_TOKEN")
+
+ login(token=hf_token, add_to_git_credential=True)

 comet = evaluate.load("comet", config_name="Unbabel/wmt22-cometkiwi-da", gpus=1)
 meteor = evaluate.load("meteor")
@@ -59,7 +63,7 @@ def calc_perf_scores(prediction, source, reference, debug=False):
 For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
 """
 # client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
- client = InferenceClient(model_name)
+ client = InferenceClient(model_name, token=hf_token)

 datasets = load_translation_dataset(data_path)
 print_row_details(datasets["test"].to_pandas())
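
Note: the change above wires Hugging Face authentication through the whole app: the token is read from the environment, registered once via login(), and passed to InferenceClient so gated models can be called. A minimal standalone sketch of the same pattern (not part of this commit; the .env file and model name are illustrative):

import os

from dotenv import load_dotenv
from huggingface_hub import InferenceClient, login

# Assumes HF_TOKEN is provided via a .env file or the environment,
# mirroring what app.py reads above.
load_dotenv()
hf_token = os.getenv("HF_TOKEN")

# Authenticate the process once; add_to_git_credential=True also stores
# the token for git operations against huggingface.co.
login(token=hf_token, add_to_git_credential=True)

# Passing the token explicitly lets the client query gated or private models.
client = InferenceClient("microsoft/Phi-3.5-mini-instruct", token=hf_token)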

eval_modules/calc_repetitions_v2d.py DELETED
@@ -1,1281 +0,0 @@
- import os
- import re
- import math
- import pandas as pd
- import numpy as np
- import matplotlib.pyplot as plt
- import matplotlib.ticker as mtick
- import seaborn as sns
- import nltk
- import evaluate
- import traceback
-
- bert_score = evaluate.load("bertscore")
- meteor = evaluate.load("meteor")
-
- print(f"loading: {__file__}")
-
- # pattern_non_word_char_repetition = re.compile(r"\s{5,}")
- # pattern_text_repetitions = re.compile(r"(.{5}.*)\s*((\1)\s*)+", re.M | re.DOTALL)
-
- # final version
- pattern_non_word_char_repetition = re.compile(r"[\s\W]{5,}")
- pattern_text_repetitions = re.compile(
-     r"(?P<repeat>.{5}.*?)(?:[\s\W]*(?P=repeat))+", re.M | re.DOTALL | re.IGNORECASE
- )
- # Explanation of the Regex Pattern:
- # (?P<repeat>.{5}.*?): Captures any sequence of characters with minimal length of 5 and names this group repeat.
- # .*?: Matches zero or more characters, non-greedily (as few as possible).
- # (?:[\s\W]+(?P=repeat))+: A non-capturing group that matches one or more repetitions of:
- # [\s\W]+: One or more whitespace or non-word characters (spaces, punctuation, etc.).
- # (?P=repeat): A backreference to the named group repeat.
-
-
- def del_non_word_char_repetition(text, debug=False):
-     count = 0
-
-     if isinstance(text, str):
-         if debug:
-             print("----detect non-word characters repetition----")
-         count = len(text)
-         text = pattern_non_word_char_repetition.sub("\t", text)
-         count -= len(text)
-         if debug and count:
-             print(f"removed non-word characters repetition: {count}")
-     return text, count
-
-
- # final version for repetition detection
- def detect_text_repetitions(text, debug=False):
-     count = 0
-
-     if isinstance(text, str):
-         if debug:
-             print("----detect text repetitions----")
-         matches = pattern_text_repetitions.finditer(text)
-         for match in matches:
-             if debug:
-                 print(match)
-                 for groupNum in range(0, len(match.groups())):
-                     groupNum = groupNum + 1
-                     print(
-                         "Group {groupNum} found at {start}-{end}: `{group}`".format(
-                             groupNum=groupNum,
-                             start=match.start(groupNum),
-                             end=match.end(groupNum),
-                             group=match.group(groupNum),
-                         )
-                     )
-
-             start, end = match.span()
-             count += end - start - len(match.group(1))
-
-     return count
-
-
- def detect_repetitions(text, debug=False):
-     text, count_non_word_char_repetition = del_non_word_char_repetition(
-         text, debug=debug
-     )
-     count_text_repetitions = detect_text_repetitions(text, debug=debug)
-     total_repetitions = count_non_word_char_repetition + count_text_repetitions
-
-     result = (count_non_word_char_repetition, count_text_repetitions, total_repetitions)
-
-     if debug:
-         print(result)
-     return result
-
-
- def detect_scores(text, debug=False):
-     newline_score, repetition_score, total_repetitions = detect_repetitions(
-         text, debug=debug
-     )
-     return pd.Series([newline_score, repetition_score, total_repetitions])
-
-
- def load_with_newline_and_repetition_scores(result_file, force_recalculate=False):
-     print(f"loading result file: {result_file}")
-     df = pd.read_csv(result_file, comment="#", on_bad_lines="warn")
-
-     if (
-         force_recalculate
-         or "newline_score" not in df.columns
-         or "repetition_score" not in df.columns
-         or "total_repetitions" not in df.columns
-         or "nrr" not in df.columns
-         or "rr" not in df.columns
-     ):
-         if (
-             force_recalculate
-             or "newline_score" not in df.columns
-             or "repetition_score" not in df.columns
-             or "total_repetitions" not in df.columns
-         ):
-             df[["newline_score", "repetition_score", "total_repetitions"]] = df[
-                 "answer"
-             ].apply(detect_scores)
-
-         df["answer_len"] = df["answer"].apply(
-             lambda x: len(x) if isinstance(x, str) else 0
-         )
-
-         df["nrr"] = df.apply(
-             lambda x: (
-                 1
-                 if x["answer_len"] == 0
-                 else 1 - (x["newline_score"] + x["repetition_score"]) / x["answer_len"]
-             ),
-             axis=1,
-         )
-
-         df["rr"] = df["nrr"].apply(lambda x: 1 - x)
-
-         df.to_csv(result_file, index=False)
-
-     return df
-
-
- def replace_last(source_string, old_string, new_string):
-     head, _sep, tail = source_string.rpartition(old_string)
-     return head + new_string + tail
-
-
- def load_for_repetition_penalty(
-     csv_result_file, repetition_penalty, force_recalculate=False
- ):
-     result_file = replace_last(
-         csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}.csv"
-     )
-     return load_with_newline_and_repetition_scores(
-         result_file, force_recalculate=force_recalculate
-     )
-
-
- def calc_adjusted_performance(f, r):
-     return f / math.log10(10 + r)
-
-
- def calculate_adjusted_performance(row):
-     r = row["total_repetitions"]
-     adjusted_precision = calc_adjusted_performance(row["precision"], r)
-     adjusted_recall = calc_adjusted_performance(row["recall"], r)
-     return pd.Series([adjusted_precision, adjusted_recall])
-
-
- def load_performance_df(csv_result_file, repetition_penalty):
-     result_file = replace_last(
-         csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}-t2_evaluated.json"
-     )
-     result_file = result_file.replace("/results/", "/eval/")
-     print(f"loading json file: {result_file}")
-     df = pd.read_json(result_file)
-
-     return df
-
-
- def calculate_performance_score(
-     csv_result_file, repetition_penalty, force_recalculate=False
- ):
-     result_file = replace_last(
-         csv_result_file, ".csv", f"_rpp_{repetition_penalty:.2f}.csv"
-     )
-
-     if os.path.exists(result_file):
-         print(f"loading result file: {result_file}")
-         df = load_with_newline_and_repetition_scores(
-             result_file, force_recalculate=force_recalculate
-         )
-     else:
-         print(f"re-creating result file: {result_file}")
-         df = pd.DataFrame()
-         force_recalculate = True
-
-     if force_recalculate or "f2" in df.columns or "f1" not in df.columns:
-         try:
-             perf_df = load_performance_df(csv_result_file, repetition_penalty)
-             df.drop(
-                 columns=[
-                     "precision",
-                     "recall",
-                     "f1",
-                     "f2",
-                     "entities_in_answer",
-                     "entities_in_question",
-                     "word_count",
-                 ],
-                 errors="ignore",
-                 inplace=True,
-             )
-
-             df["id"] = perf_df["id"]
-             df["question"] = perf_df["question"]
-             df["answer"] = perf_df["pred_answer"]
-             df["word_count"] = df["answer"].apply(
-                 lambda x: len(nltk.word_tokenize(x)) if isinstance(x, str) else 0
-             )
-             df["ground_truth"] = perf_df["ground_truth"]
-
-             df["eval_gemini_1.0_pro"] = perf_df["eval_gemini_1.0_pro"]
-             df["precision"] = perf_df["score"].apply(lambda x: x[0])
-             df["recall"] = perf_df["score"].apply(lambda x: x[1])
-             df["f1"] = perf_df["score"].apply(lambda x: x[2])
-         except Exception as e:
-             print(f"\tignored error: {e}")
-             # traceback.print_exc()
-
-         df[["newline_score", "repetition_score", "total_repetitions"]] = df[
-             "answer"
-         ].apply(detect_scores)
-
-         df[["adjusted_precision", "adjusted_recall"]] = df.apply(
-             calculate_adjusted_performance, axis=1
-         )
-
-         df.to_csv(result_file, index=False)
-         print(f"performance scores saved to result file: {result_file}")
-
-     # print(f"df len: {len(df)}")
-
-     return df
-
-
- def adjust_perf_scores_with_repetition_penalty(result, precision, recall):
-     newline_score = [
-         df["newline_score"].mean() for df in result["df_list_repetition_penalty"]
-     ]
-
-     repetition_score = [
-         df["repetition_score"].mean() for df in result["df_list_repetition_penalty"]
-     ]
-
-     precision = [
-         f / math.log10(10 + n + r)
-         for f, n, r in zip(precision, newline_score, repetition_score)
-     ]
-     recall = [
-         f / math.log10(10 + n + r)
-         for f, n, r in zip(recall, newline_score, repetition_score)
-     ]
-
-     return precision, recall
-
-
- def plot_performance_scores(
-     result,
-     models=None,
-     title="Performance",
- ):
-     if models is None:
-         models = result.keys()
-     for model in models:
-         print(f"model: {model}")
-         df = result[model]["df_overall"]
-
-         # Calculate the statistics
-         precision = [
-             df["precision"].mean() for df in result[model]["df_list_repetition_penalty"]
-         ]
-         recall = [
-             df["recall"].mean() for df in result[model]["df_list_repetition_penalty"]
-         ]
-         f1 = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
-         best_f1 = max(f1)
-         best_f1_index = f1.index(best_f1)
-
-         precision, recall = adjust_perf_scores_with_repetition_penalty(
-             result[model], precision, recall
-         )
-         afrp = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
-
-         # f1 = [df["f1"].mean() for df in result[model]["df_list_repetition_penalty"]]
-         best_afrp = max(afrp)
-         best_afrp_index = afrp.index(best_afrp)
-
-         adjusted_precision = [
-             df["adjusted_precision"].mean()
-             for df in result[model]["df_list_repetition_penalty"]
-         ]
-         adjusted_recall = [
-             df["adjusted_recall"].mean()
-             for df in result[model]["df_list_repetition_penalty"]
-         ]
-         afrp2 = [
-             2 * (p * r) / (p + r) for p, r in zip(adjusted_precision, adjusted_recall)
-         ]
-         best_afrp2 = max(afrp2)
-         best_afrp2_index = afrp2.index(best_afrp2)
-
-         repetition_penalties = list(df["repetition_penalty"])
-
-         # line plot for precision, recall, f1
-         plt.figure(figsize=(10, 6))
-
-         plt.axvspan(
-             repetition_penalties[best_f1_index] - 0.01,
-             repetition_penalties[best_f1_index] + 0.01,
-             alpha=0.5,
-             edgecolor="none",
-             facecolor="blue",
-         )
-
-         # plt.axvspan(
-         #     repetition_penalties[best_afrp2_index] - 0.01,
-         #     repetition_penalties[best_afrp2_index] + 0.01,
-         #     alpha=0.5,
-         #     edgecolor="none",
-         #     facecolor="green",
-         # )
-
-         plt.axvspan(
-             repetition_penalties[best_afrp_index] - 0.01,
-             repetition_penalties[best_afrp_index] + 0.01,
-             alpha=0.5,
-             edgecolor="none",
-             facecolor="orange",
-         )
-
-         plt.plot(repetition_penalties, f1, label="F1", marker="D", color="blue")
-         # plt.plot(
-         #     repetition_penalties,
-         #     afrp2,
-         #     label="Per-question RAP - F1",
-         #     marker="s",
-         #     color="green",
-         # )
-         plt.plot(
-             repetition_penalties,
-             afrp,
-             label="RAP - F1",
-             marker="o",
-             color="orange",
-         )
-         plt.xlabel("Repetition Penalties")
-         plt.ylabel("Score")
-         # plt.xlim(0.99, 1.31)
-         # y in percentage
-         plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
-         plt.title(f"{model} {title}")
-         plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
-
-         plt.show()
-
-
- def plot_best_afrp(
-     result,
-     models=None,
-     title="Models with Best RAP - F1",
-     ref_result=None,
- ):
-     # Initialize lists to store the statistics
-     model_names = []
-     best_f1 = []
-     best_afrp = []
-     best_repetition_penalty = []
-     best_mtr = []
-
-     if models is None:
-         models = result.keys()
-     for model in models:
-         print(f"model: {model}")
-         df = result[model]["df_overall"]
-
-         # Calculate the statistics
-         precision = [
-             df["precision"].mean() for df in result[model]["df_list_repetition_penalty"]
-         ]
-         recall = [
-             df["recall"].mean() for df in result[model]["df_list_repetition_penalty"]
-         ]
-         # f1 = [df["f1"].mean() for df in result[model]["df_list_repetition_penalty"]]
-         f1 = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
-
-         newline_score = [
-             df["newline_score"].mean()
-             for df in result[model]["df_list_repetition_penalty"]
-         ]
-         # print(f"newline_score: {newline_score}")
-
-         repetition_score = [
-             df["repetition_score"].mean()
-             for df in result[model]["df_list_repetition_penalty"]
-         ]
-         # print(f"repetition_score: {repetition_score}")
-
-         afrp = [
-             f / math.log10(10 + n + r)
-             for f, n, r in zip(f1, newline_score, repetition_score)
-         ]
-
-         best_afrp.append(max(afrp))
-         best_afrp_index = afrp.index(best_afrp[-1])
-         best_repetition_penalty.append(df["repetition_penalty"][best_afrp_index])
-
-         best_f1.append(f1[best_afrp_index])
-         best_mtr.append(
-             newline_score[best_afrp_index] + repetition_score[best_afrp_index]
-         )
-
-         # print(
-         #     f"best repetition penalty: {best_repetition_penalty[-1]}, best afrp: {best_afrp[-1]}, f1: {best_f1[-1]}"
-         # )
-
-         df = result[model]["df_list_repetition_penalty"][best_afrp_index]
-
-         model_names.append(
-             f"{model} (RP={best_repetition_penalty[-1]})"
-         )  # Add the model name to the list
-
-     if ref_result is not None:
-         print("ref_result:", ref_result)
-         for model in ref_result.keys():
-             model_names.append(model)
-             df = pd.read_csv(ref_result[model])
-             # df = df[df["id"].isin(wikidata_df["id"])]
-
-             p = df["precision"].mean()
-             r = df["recall"].mean()
-
-             f1 = 2 * p * r / (p + r) if p + r > 0 else 0
-             best_f1.append(f1)
-             best_afrp.append(f1)
-             best_mtr.append(0)
-
-     print("model_names:", model_names)
-     # print("best_f1:", best_f1)
-     # print("best_afrp:", best_afrp)
-
-     # Create a DataFrame with the statistics
-     data = pd.DataFrame(
-         {
-             "Model": model_names,
-             "RAP - F1": best_afrp,
-             "F1": best_f1,
-         }
-     )
-
-     # Melt the DataFrame to a long format
-     data_melted = data.melt(id_vars="Model", var_name="Metric", value_name="Score")
-
-     # Pivot the DataFrame to a wide format
-     data_pivoted = data_melted.pivot(index="Metric", columns="Model", values="Score")
-
-     # make sure the columns are following the order of the models
-     data_pivoted = data_pivoted[model_names]
-
-     # make sure three groups in the order of precision, recall, f1
-     data_pivoted = data_pivoted.reindex(["RAP - F1", "F1"])
-
-     # Plot the statistics
-     plt.figure(figsize=(15, 6))
-     ax = data_pivoted.plot(kind="bar", ax=plt.gca(), width=0.9)
-     plt.title(title)
-     plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
-
-     # Set the rotation of the x-axis labels to 0 degrees
-     plt.xticks(rotation=0)
-
-     # Format the y-axis to display as percentage
-     ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
-
-     # get the max value of the y-axis
-     a1 = max(best_afrp)
-     a2 = max(best_f1)
-
-     max_value = max([a1, a2]) * 1.12
-     print("max_value:", max_value)
-
-     # Set the y-axis limit up to 70%
-     ax.set_ylim(0, max_value)
-
-     # Add the values above each bar
-     for p in ax.patches:
-         ax.annotate(
-             f"{p.get_height() * 100:.1f}",
-             (p.get_x() + p.get_width() / 2.0, p.get_height()),
-             ha="center",
-             va="bottom",
-             xytext=(0, 10),
-             textcoords="offset points",
-             rotation=90,
-         )
-
-     plt.show()
-     return data_pivoted, best_mtr
-
-
- def plot_best_performance(
-     result,
-     models=None,
-     title="Models with Best F1 Score",
-     adjusted_f1=False,
-     ref_result=None,
- ):
-     # Initialize lists to store the statistics
-     model_names = []
-     best_precision = []
-     best_recall = []
-     best_f1 = []
-     best_repetition_penalty = []
-     best_mtr = []
-
-     if models is None:
-         models = result.keys()
-     for model in models:
-         print(f"model: {model}")
-         df = result[model]["df_overall"]
-
-         # Calculate the statistics
-         precision = [
-             df["precision"].mean() for df in result[model]["df_list_repetition_penalty"]
-         ]
-         recall = [
-             df["recall"].mean() for df in result[model]["df_list_repetition_penalty"]
-         ]
-         newline_score = [
-             df["newline_score"].mean()
-             for df in result[model]["df_list_repetition_penalty"]
-         ]
-
-         repetition_score = [
-             df["repetition_score"].mean()
-             for df in result[model]["df_list_repetition_penalty"]
-         ]
-
-         if adjusted_f1:
-             precision, recall = adjust_perf_scores_with_repetition_penalty(
-                 result[model], precision, recall
-             )
-
-         # f1 = [df["f1"].mean() for df in result[model]["df_list_repetition_penalty"]]
-         f1 = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
-
-         best_f1.append(max(f1))
-         best_f1_index = f1.index(best_f1[-1])
-         best_repetition_penalty.append(df["repetition_penalty"][best_f1_index])
-
-         best_precision.append(precision[best_f1_index])
-         best_recall.append(recall[best_f1_index])
-         best_mtr.append(newline_score[best_f1_index] + repetition_score[best_f1_index])
-
-         print(
-             f"best repetition penalty: {best_repetition_penalty[-1]}, best f1: {best_f1[-1]}, precision: {best_precision[-1]}, recall: {best_recall[-1]}"
-         )
-
-         df = result[model]["df_list_repetition_penalty"][best_f1_index]
-
-         model_names.append(
-             f"{model} (RP={best_repetition_penalty[-1]})"
-         )  # Add the model name to the list
-
-         # print sum for columns: newline_score, repetition_score
-         print(
-             f"newline_score: {df['newline_score'].sum()}, repetition_score: {df['repetition_score'].sum()}"
-         )
-
-     if ref_result is not None:
-         print("ref_result:", ref_result)
-         for model in ref_result.keys():
-             model_names.append(model)
-             df = pd.read_csv(ref_result[model])
-             # df = df[df["id"].isin(wikidata_df["id"])]
-
-             best_precision.append(df["precision"].mean())
-             best_recall.append(df["recall"].mean())
-             f1 = (
-                 2
-                 * (best_precision[-1] * best_recall[-1])
-                 / (best_precision[-1] + best_recall[-1])
-             )
-             # best_f1.append(df["f1"].mean())
-             best_f1.append(f1)
-             best_mtr.append(0)
-
-     # Create a DataFrame with the statistics
-     data = (
-         pd.DataFrame(
-             {
-                 "Model": model_names,
-                 "Adjusted Precision with RP": best_precision,
-                 "Adjusted Recall with RP": best_recall,
-                 "Adjusted F1 with RP": best_f1,
-             }
-         )
-         if adjusted_f1
-         else pd.DataFrame(
-             {
-                 "Model": model_names,
-                 "Precision": best_precision,
-                 "Recall": best_recall,
-                 "F1": best_f1,
-             }
-         )
-     )
-     columns = list(data.columns)
-
-     # Melt the DataFrame to a long format
-     data_melted = data.melt(id_vars="Model", var_name="Metric", value_name="Score")
-
-     # Pivot the DataFrame to a wide format
-     data_pivoted = data_melted.pivot(index="Metric", columns="Model", values="Score")
-
-     # make sure the columns are following the order of the models
-     data_pivoted = data_pivoted[model_names]
-
-     # make sure three groups in the order of precision, recall, f1
-     data_pivoted = data_pivoted.reindex(columns[1:])
-
-     # Plot the statistics
-     plt.figure(figsize=(10, 6))
-     ax = data_pivoted.plot(kind="bar", ax=plt.gca(), width=0.9)
-     plt.title(title)
-     plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
-
-     # Set the rotation of the x-axis labels to 0 degrees
-     plt.xticks(rotation=0)
-
-     # Format the y-axis to display as percentage
-     ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
-
-     # get the max value of the y-axis
-     a1 = max(best_precision)
-     a2 = max(best_recall)
-     a3 = max(best_f1)
-
-     max_value = max([a1, a2, a3]) * 1.12
-     print("max_value:", max_value)
-
-     # Set the y-axis limit up to 70%
-     ax.set_ylim(0, max_value)
-
-     # Add the values above each bar
-     for p in ax.patches:
-         ax.annotate(
-             f"{p.get_height() * 100:.1f}",
-             (p.get_x() + p.get_width() / 2.0, p.get_height()),
-             ha="center",
-             va="bottom",
-             xytext=(0, 10),
-             textcoords="offset points",
-             rotation=90,
-         )
-
-     plt.show()
-     return data_pivoted, best_mtr
-
-
- def plot_best_performance_ms_macro(
-     result,
-     models=None,
-     title="Models with Best RAP - Performance",
-     ref_result=None,
-     skip_generic_prompt=False,
-     include_adjusted_performance=True,
- ):
-     # Initialize lists to store the statistics
-     model_names = []
-     best_f1 = []
-     best_afrp = []
-     best_repetition_penalty = []
-     best_bleu1 = []
-     best_rougeL = []
-     best_mtr = []
-
-     if models is None:
-         models = result.keys()
-     for model in models:
-         if skip_generic_prompt and "generic prompt" in model:
-             continue
-         print(f"model: {model}")
-         df = result[model]["df_overall"]
-
-         # Calculate the statistics
-         bleu1 = [x for x in df["bleu1"]]
-         rougeL = [x for x in df["rougeL"]]
-         f1 = [2 * (p * r) / (p + r) for p, r in zip(bleu1, rougeL)]
-
-         newline_score = [
-             df["newline_score"].mean()
-             for df in result[model]["df_list_repetition_penalty"]
-         ]
-         # print(f"newline_score: {newline_score}")
-
-         repetition_score = [
-             df["repetition_score"].mean()
-             for df in result[model]["df_list_repetition_penalty"]
-         ]
-         # print(f"repetition_score: {repetition_score}")
-
-         afrp = [
-             f / math.log10(10 + n + r)
-             for f, n, r in zip(f1, newline_score, repetition_score)
-         ]
-
-         best_afrp.append(max(afrp if include_adjusted_performance else f1))
-         best_afrp_index = (
-             afrp.index(best_afrp[-1])
-             if include_adjusted_performance
-             else f1.index(best_afrp[-1])
-         )
-         best_repetition_penalty.append(df["repetition_penalty"][best_afrp_index])
-
-         best_f1.append(f1[best_afrp_index])
-         best_bleu1.append(bleu1[best_afrp_index])
-         best_rougeL.append(rougeL[best_afrp_index])
-         best_mtr.append(
-             newline_score[best_afrp_index] + repetition_score[best_afrp_index]
-         )
-
-         # print(
-         #     f"best repetition penalty: {best_repetition_penalty[-1]}, best afrp: {best_afrp[-1]}, f1: {best_f1[-1]}"
-         # )
-
-         df = result[model]["df_list_repetition_penalty"][best_afrp_index]
-
-         model_names.append(
-             f"{model} (RP={best_repetition_penalty[-1]})"
-         )  # Add the model name to the list
-
-     if ref_result is not None:
-         print("ref_result:", ref_result)
-         for model in ref_result.keys():
-             model_names.append(model)
-             df = pd.read_csv(ref_result[model], comment="#", on_bad_lines="warn")
-             # df = df[df["id"].isin(wikidata_df["id"])]
-
-             p = df["bleu1"][0]
-             best_bleu1.append(p)
-
-             r = df["rougeL"][0]
-             best_rougeL.append(r)
-
-             f1 = 2 * p * r / (p + r) if p + r > 0 else 0
-             best_f1.append(f1)
-             best_afrp.append(f1)
-             best_mtr.append(0)
-
-     # print("model_names:", model_names)
-     # print("best_f1:", best_f1)
-     # print("best_afrp:", best_afrp)
-
-     # Create a DataFrame with the statistics
-     data = (
-         pd.DataFrame(
-             {
-                 "Model": model_names,
-                 "RAP - Perf Score": best_afrp,
-                 "Overall Perf Score": best_f1,
-             }
-         )
-         if include_adjusted_performance
-         else pd.DataFrame(
-             {
-                 "Model": model_names,
-                 "Bleu-1": best_bleu1,
-                 "Rouge-L": best_rougeL,
-                 "Overall Perf Score": best_f1,
-             }
-         )
-     )
-
-     # Melt the DataFrame to a long format
-     data_melted = data.melt(id_vars="Model", var_name="Metric", value_name="Score")
-
-     # Pivot the DataFrame to a wide format
-     data_pivoted = data_melted.pivot(index="Metric", columns="Model", values="Score")
-
-     # make sure the columns are following the order of the models
-     data_pivoted = data_pivoted[model_names]
-
-     columns = list(data.columns)
-     data_pivoted = data_pivoted.reindex(columns[1:])
-
-     # Plot the statistics
-     plt.figure(figsize=(10, 6))
-     ax = data_pivoted.plot(kind="bar", ax=plt.gca(), width=0.9)
-     plt.title(title)
-     plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
-
-     # Set the rotation of the x-axis labels to 0 degrees
-     plt.xticks(rotation=0)
-
-     # Format the y-axis to display as percentage
-     ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
-
-     # get the max value of the y-axis
-     a1 = max(best_afrp)
-     a2 = max(best_f1)
-     a3 = max(best_bleu1)
-     a4 = max(best_rougeL)
-
-     max_value = (
-         max([a1, a2] if include_adjusted_performance else [a1, a2, a3, a4]) * 1.12
-     )
-     print("max_value:", max_value)
-
-     # Set the y-axis limit up to 70%
-     ax.set_ylim(0, max_value)
-
-     # Add the values above each bar
-     for p in ax.patches:
-         ax.annotate(
-             f"{p.get_height() * 100:.1f}",
-             (p.get_x() + p.get_width() / 2.0, p.get_height()),
-             ha="center",
-             va="bottom",
-             xytext=(0, 10),
-             textcoords="offset points",
-             rotation=90,
-         )
-
-     plt.show()
-     return data_pivoted, best_mtr
-
-
- all_open_source_models = [
-     "gemma-1.1-2b-it",
-     "Phi-3-mini-128k-instruct",
-     "gemma-1.1-7b-it",
-     "Llama-2-7b-chat-hf",
-     "Mistral-7B-Instruct-v0.2",
-     "Meta-Llama-3-8B-Instruct",
-     "Llama-2-13b-chat-hf",
-     "Llama-2-70b-chat-hf",
-     "Meta-Llama-3-70B-Instruct",
- ]
-
-
- def load_for_repetition_penalty_ms_macro(
-     csv_result_file, repetition_penalty, force_recalculate=False
- ):
-     result_file = replace_last(
-         csv_result_file, ".csv", f"_rpp_{repetition_penalty:.2f}.csv"
-     )
-     df = load_with_newline_and_repetition_scores(
-         result_file, force_recalculate=force_recalculate
-     )
-
-     return df
-
-
- # MS MACRO
- def plot_performance_scores_ms_macro(
-     result,
-     models=None,
-     title="Performance",
- ):
-     if models is None:
-         models = result.keys()
-     for model in models:
-         print(f"model: {model}")
-         df = result[model]["df_overall"]
-         # print(result[model]["df_list_repetition_penalty"][0].describe())
-
-         # Calculate the statistics
-         bleu1 = list(df["bleu1"])
-         rougeL = list(df["rougeL"])
-         f1 = [2 * (p * r) / (p + r) for p, r in zip(bleu1, rougeL)]
-         best_f1 = max(f1)
-         best_f1_index = f1.index(best_f1)
-
-         bleu1, rougeL = adjust_perf_scores_with_repetition_penalty(
-             result[model], bleu1, rougeL
-         )
-         afrp = [2 * (p * r) / (p + r) for p, r in zip(bleu1, rougeL)]
-
-         # f1 = [df["f1"].mean() for df in result[model]["df_list_repetition_penalty"]]
-         best_afrp = max(afrp)
-         best_afrp_index = afrp.index(best_afrp)
-
-         repetition_penalties = list(df["repetition_penalty"])
-
-         # line plot for precision, recall, f1
-         plt.figure(figsize=(10, 6))
-
-         plt.axvspan(
-             repetition_penalties[best_f1_index] - 0.01,
-             repetition_penalties[best_f1_index] + 0.01,
-             alpha=0.5,
-             edgecolor="none",
-             facecolor="blue",
-         )
-
-         plt.axvspan(
-             repetition_penalties[best_afrp_index] - 0.01,
-             repetition_penalties[best_afrp_index] + 0.01,
-             alpha=0.5,
-             edgecolor="none",
-             facecolor="orange",
-         )
-
-         plt.plot(
-             repetition_penalties,
-             f1,
-             label="Overall Perf Score",
-             marker="D",
-             color="blue",
-         )
-         plt.plot(
-             repetition_penalties,
-             afrp,
-             label="RAP - Perf Score",
-             marker="o",
-             color="orange",
-         )
-
-         plt.xlabel("Repetition Penalties")
-         plt.ylabel("Score")
-         # plt.xlim(0.99, 1.31)
-         # y in percentage
-         plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
-         plt.title(f"{model} {title}")
-         plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
-
-         plt.show()
-
-
- def plot_repetition_factors(result, groups):
-     for group in groups:
-         # Plot the statistics
-         plt.figure(figsize=(10, 6))
-
-         max_value = 0
-         for model in result.keys():
-             if not group in model.lower():
-                 continue
-             print(f"model: {model}")
-             df = result[model]["df_overall"]
-             repetition_panelties = [
-                 repetition_penalty for repetition_penalty in df["repetition_penalty"]
-             ]
-
-             mean_score = [
-                 # math.log10(10 + df["total_repetitions"].mean())
-                 df["total_repetitions"].mean()
-                 for df in result[model]["df_list_repetition_penalty"]
-             ]
-
-             sns.lineplot(x=repetition_panelties, y=mean_score, label=model)
-
-             new_max = max(mean_score)
-             if new_max > max_value:
-                 max_value = new_max
-
-         max_value = max_value * 1.05
-         # if max_value < 1.5:
-         #     max_value = 1.5
-         # set ylimit
-         plt.ylim(0, max_value)
-
-         # show grid
-         plt.grid(True)
-         plt.xlabel("Repetition Penalties")
-         plt.ylabel("Mean Total Repetitions")
-         plt.title("Mean Total Repetitions vs Repetition Penalties")
-         plt.legend()
-
-         plt.show()
-
-
- def plot_repetition_factors_by_group(result, group_filter=None):
-     markers = ["D", "o", "s", "x"]
-     colors = ["blue", "orange", "green", "red"]
-
-     # Plot the statistics
-     plt.figure(figsize=(10, 6))
-     index = 0
-     max_value = 0
-
-     for model in result.keys():
-         if group_filter is not None and group_filter not in model:
-             continue
-
-         print(f"model: {model}")
-
-         df = result[model]["df_overall"]
-         repetition_panelties = [
-             repetition_penalty for repetition_penalty in df["repetition_penalty"]
-         ]
-
-         # Calculate the statistics
-         mean_score = [
-             # math.log10(10 + df["total_repetitions"].mean())
-             df["total_repetitions"].mean()
-             for df in result[model]["df_list_repetition_penalty"]
-         ]
-         if len(mean_score) != len(repetition_panelties):
-             print(
-                 f"model: {model} has different length of repetition penalties and mean score"
-             )
-             print("repetition_panelties:", len(repetition_panelties))
-             print("mean_score:", len(mean_score))
-             continue
-
-         new_max = max(mean_score)
-         if new_max > max_value:
-             max_value = new_max
-
-         sns.lineplot(
-             x=repetition_panelties,
-             y=mean_score,
-             label=model,
-             marker=markers[index],
-             color=colors[index],
-         )
-
-         index += 1
-
-     max_value = max_value * 1.05
-     # if max_value < 1.5:
-     #     max_value = 1.5
-     # set ylimit
-     plt.ylim(0, max_value)
-     max_value = 0
-
-     plt.xlabel("Repetition Penalties")
-     plt.ylabel("Mean Total Repetitions")
-     plt.title("Mean Total Repetitions vs Repetition Penalties")
-     plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
-
-     plt.show()
-
-
- ms_marco_csv_result_files = [
-     "data/results_v2/gemma-1.1-2b-it(RAG - Generic Prompt)_mm.csv",
-     "data/results_v2/gemma-1.1-2b-it(RAG - Chat Template)_mm.csv",
-     "data/results_v2/gemma-1.1-2b-it(Non-RAG)_mm.csv",
-     "data/results_v2/Phi-3-mini-128k-instruct(RAG - Generic Prompt)_mm.csv",
-     "data/results_v2/Phi-3-mini-128k-instruct(RAG - Chat Template)_mm.csv",
-     "data/results_v2/Phi-3-mini-128k-instruct(Non-RAG)_mm.csv",
-     "data/results_v2/gemma-1.1-7b-it(RAG - Generic Prompt)_mm.csv",
-     "data/results_v2/gemma-1.1-7b-it(RAG - Chat Template)_mm.csv",
-     "data/results_v2/gemma-1.1-7b-it(Non-RAG)_mm.csv",
-     "data/results_v2/Llama-2-7b-chat-hf(RAG - Generic Prompt)_mm.csv",
-     "data/results_v2/Llama-2-7b-chat-hf(RAG - Chat Template)_mm.csv",
-     "data/results_v2/Llama-2-7b-chat-hf(Non-RAG)_mm.csv",
-     "data/results_v2/Mistral-7B-Instruct-v0.2(RAG - Generic Prompt)_mm.csv",
-     "data/results_v2/Mistral-7B-Instruct-v0.2(RAG - Chat Template)_mm.csv",
-     "data/results_v2/Mistral-7B-Instruct-v0.2(Non-RAG)_mm.csv",
-     "data/results_v2/Meta-Llama-3-8B-Instruct(RAG - Generic Prompt)_mm.csv",
-     "data/results_v2/Meta-Llama-3-8B-Instruct(RAG - Chat Template)_mm.csv",
-     "data/results_v2/Meta-Llama-3-8B-Instruct(Non-RAG)_mm.csv",
-     "data/results_v2/Llama-2-13b-chat-hf(RAG - Generic Prompt)_mm.csv",
-     "data/results_v2/Llama-2-13b-chat-hf(RAG - Chat Template)_mm.csv",
-     "data/results_v2/Llama-2-13b-chat-hf(Non-RAG)_mm.csv",
-     "data/results_v2/Llama-2-70b-chat-hf(RAG - Generic Prompt)_mm.csv",
-     "data/results_v2/Llama-2-70b-chat-hf(RAG - Chat Template)_mm.csv",
-     "data/results_v2/Llama-2-70b-chat-hf(Non-RAG)_mm.csv",
-     "data/results_v2/Meta-Llama-3-70B-Instruct(RAG - Generic Prompt)_mm.csv",
-     "data/results_v2/Meta-Llama-3-70B-Instruct(RAG - Chat Template)_mm.csv",
-     "data/results_v2/Meta-Llama-3-70B-Instruct(Non-RAG)_mm.csv",
- ]
-
- webqsp_csv_result_files = [
-     "data/results_v2/gemma-1.1-2b-it(RAG - Generic Prompt)_wd.csv",
-     "data/results_v2/gemma-1.1-2b-it(RAG - Chat Template)_wd.csv",
-     "data/results_v2/gemma-1.1-2b-it(Non-RAG)_wd.csv",
-     "data/results_v2/Phi-3-mini-128k-instruct(RAG - Generic Prompt)_wd.csv",
-     "data/results_v2/Phi-3-mini-128k-instruct(RAG - Chat Template)_wd.csv",
-     "data/results_v2/Phi-3-mini-128k-instruct(Non-RAG)_wd.csv",
-     "data/results_v2/gemma-1.1-7b-it(RAG - Generic Prompt)_wd.csv",
-     "data/results_v2/gemma-1.1-7b-it(RAG - Chat Template)_wd.csv",
-     "data/results_v2/gemma-1.1-7b-it(Non-RAG)_wd.csv",
-     "data/results_v2/Llama-2-7b-chat-hf(RAG - Generic Prompt)_wd.csv",
-     "data/results_v2/Llama-2-7b-chat-hf(RAG - Chat Template)_wd.csv",
-     "data/results_v2/Llama-2-7b-chat-hf(Non-RAG)_wd.csv",
-     "data/results_v2/Mistral-7B-Instruct-v0.2(RAG - Generic Prompt)_wd.csv",
-     "data/results_v2/Mistral-7B-Instruct-v0.2(RAG - Chat Template)_wd.csv",
-     "data/results_v2/Mistral-7B-Instruct-v0.2(Non-RAG)_wd.csv",
-     "data/results_v2/Meta-Llama-3-8B-Instruct(RAG - Generic Prompt)_wd.csv",
-     "data/results_v2/Meta-Llama-3-8B-Instruct(RAG - Chat Template)_wd.csv",
-     "data/results_v2/Meta-Llama-3-8B-Instruct(Non-RAG)_wd.csv",
-     "data/results_v2/Llama-2-13b-chat-hf(RAG - Generic Prompt)_wd.csv",
-     "data/results_v2/Llama-2-13b-chat-hf(RAG - Chat Template)_wd.csv",
-     "data/results_v2/Llama-2-13b-chat-hf(Non-RAG)_wd.csv",
-     "data/results_v2/Llama-2-70b-chat-hf(RAG - Generic Prompt)_wd.csv",
-     "data/results_v2/Llama-2-70b-chat-hf(RAG - Chat Template)_wd.csv",
-     "data/results_v2/Llama-2-70b-chat-hf(Non-RAG)_wd.csv",
-     "data/results_v2/Meta-Llama-3-70B-Instruct(RAG - Generic Prompt)_wd.csv",
-     "data/results_v2/Meta-Llama-3-70B-Instruct(RAG - Chat Template)_wd.csv",
-     "data/results_v2/Meta-Llama-3-70B-Instruct(Non-RAG)_wd.csv",
- ]
-
-
- def calc_rap_scores(result, precision="precision", recall="recall"):
-     newline_score = [
-         df["newline_score"].mean() for df in result["df_list_repetition_penalty"]
-     ]
-
-     repetition_score = [
-         df["repetition_score"].mean() for df in result["df_list_repetition_penalty"]
-     ]
-
-     if precision in result["df_list_repetition_penalty"][0].columns:
-         precision = [
-             df[precision].mean() for df in result["df_list_repetition_penalty"]
-         ]
-         recall = [df[recall].mean() for df in result["df_list_repetition_penalty"]]
-     else:
-         precision = result["df_overall"][precision]
-         recall = result["df_overall"][recall]
-
-     f1 = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
-
-     # rap = [
-     #     f / math.log10(10 + n + r)
-     #     for f, n, r in zip(f1, newline_score, repetition_score)
-     # ]
-
-     nrr = [
-         1 - (n + r) / s
-         for f, n, r, s in zip(
-             f1, newline_score, repetition_score, result["df_overall"]["answer_len"]
-         )
-     ]
-
-     rap = [f * n * n * n for f, n in zip(f1, nrr)]
-
-     return newline_score, repetition_score, f1, rap, nrr
-
-
- def get_model_name(csv_result_file):
-     parts = re.split(r"[_/]", csv_result_file)
-     print(f"parts: {parts}")
-     model_name = parts[3]
-     return model_name
-
-
- def load_webqsp_result(csv_result_files, force_recalculate=False, save=False):
-     result = {}
-     for i, csv_result_file in enumerate(csv_result_files):
-         try:
-             df = pd.read_csv(csv_result_file)
-             model_name = get_model_name(csv_result_file)
-             print(f"\tmodel_name: {model_name}")
-
-             dfs = [
-                 calculate_performance_score(
-                     csv_result_file,
-                     repetition_penalty,
-                     force_recalculate=force_recalculate,
-                 )
-                 for repetition_penalty in df["repetition_penalty"]
-             ]
-
-             answer_lens = []
-             for df_rpp in dfs:
-                 df_rpp["answer_len"] = df_rpp["answer"].apply(
-                     lambda x: len(x) if isinstance(x, str) else 0
-                 )
-                 answer_lens.append(df_rpp["answer_len"].mean())
-             df["answer_len"] = answer_lens
-
-             result[model_name] = {
-                 "df_overall": df,
-                 "df_list_repetition_penalty": dfs,
-                 "file": csv_result_file,
-             }
-             newline_score, repetition_score, perf, rap, nrr = calc_rap_scores(
-                 result[model_name]
-             )
-             df["newline_score"] = newline_score
-             df["repetition_score"] = repetition_score
-             df["total_repetitions"] = df["newline_score"] + df["repetition_score"]
-             df["perf"] = perf
-             df["nrr"] = nrr
-             df["rap"] = rap
-             df["rr"] = df["nrr"].apply(lambda x: 1 - x)
-             if save:
-                 df.to_csv(csv_result_file, index=False)
-         except Exception as e:
-             print(f"Error: {e}")
-             traceback.print_exc()
-
-     return result
-
-
- def load_ms_marco_result(
-     csv_result_files, force_recalculate=False, calc_bertscore=False, save=False
- ):
-     result = {}
-     for csv_result_file in csv_result_files:
-         try:
-             df = pd.read_csv(csv_result_file)
-             model_name = get_model_name(csv_result_file)
-             print(f"\tmodel_name: {model_name}")
-
-             dfs = [
-                 load_for_repetition_penalty_ms_macro(
-                     csv_result_file,
-                     repetition_penalty,
-                     force_recalculate=force_recalculate,
-                 )
-                 for repetition_penalty in df["repetition_penalty"]
-             ]
-
-             answer_lens = []
-             for df_rpp in dfs:
-                 answer_lens.append(df_rpp["answer_len"].mean())
-             df["answer_len"] = answer_lens
-
-             col = "bert_score" if calc_bertscore else "meteor"
-             score_unavailable = col not in df.columns
-
-             if score_unavailable:
-                 save = True
-                 bert_meteor_scores = []
-                 bert_score_references = None
-                 for df_rpp in dfs:
-                     if calc_bertscore:
-                         bert_meteor_score = 0
-
-                         for i, row in df_rpp.iterrows():
-                             answer = row["answer"]
-                             if not isinstance(answer, str):
-                                 answer = ""
-                             bert_meteor_score += bert_score.compute(
-                                 predictions=[answer],
-                                 references=[row["ground_truth"][0]],
-                                 lang="en",
-                                 model_type="microsoft/deberta-large-mnli",
-                             )["f1"][0]
-                         # get average of bertscore
-                         bert_meteor_score = bert_meteor_score / len(df_rpp)
-
-                         print(f"bert_score: {bert_meteor_score}")
-                     else:
-                         bert_meteor_score = meteor.compute(
-                             predictions=df_rpp["answer"],
-                             references=df_rpp["ground_truth"],
-                         )["meteor"]
-
-                     bert_meteor_scores.append(bert_meteor_score)
-
-                 df[col] = bert_meteor_scores
-
-             result[model_name] = {
-                 "df_overall": df,
-                 "df_list_repetition_penalty": dfs,
-                 "file": csv_result_file,
-             }
-             newline_score, repetition_score, perf, rap, nrr = calc_rap_scores(
-                 result[model_name],
-                 precision=col,
-                 recall=col,
-             )
-             df["newline_score"] = newline_score
-             df["repetition_score"] = repetition_score
-             df["total_repetitions"] = df["newline_score"] + df["repetition_score"]
-             df["perf"] = perf
-             df["nrr"] = nrr
-             df["rap"] = rap
-             df["rr"] = df["nrr"].apply(lambda x: 1 - x)
-
-             if save:
-                 df.to_csv(csv_result_file, index=False)
-         except Exception as e:
-             print("An error occurred:", e)
-             traceback.print_exc()
-             print(f"csv_result_file: {csv_result_file}")
-
-     return result
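
Note: the substantive difference between the deleted v2d module and the v2e module added below is the repetition-adjusted performance (RAP) formula. v2d damped a score f by the absolute repetition count r, f / log10(10 + r); v2e instead normalizes repetitions by the answer length l and cubes the non-repetition ratio, f * (1 - r/l)^3. A small sketch contrasting the two (the input numbers are invented for illustration):

import math

def rap_v2d(f, r):
    # old formula: damp by absolute repetition count, length-blind
    return f / math.log10(10 + r)

def rap_v2e(f, r, l=1):
    # new formula: damp by the repetition ratio relative to answer length
    n = 1 - r / l if l > 0 else 0
    return f * n * n * n

# Hypothetical case: score 0.8 with 50 repeated characters in a 500-character answer.
print(rap_v2d(0.8, 50))       # ~0.45: same penalty for short and long answers
print(rap_v2e(0.8, 50, 500))  # ~0.58: penalty scales with the repetition ratio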

eval_modules/calc_repetitions_v2e.py DELETED
@@ -1 +0,0 @@
- /Users/inflaton/code/engd/papers/rapget-v2/eval_modules/calc_repetitions_v2e.py
 
eval_modules/calc_repetitions_v2e.py ADDED
@@ -0,0 +1,1310 @@
1
+ import os
2
+ import re
3
+ import math
4
+ import pandas as pd
5
+ import numpy as np
6
+ import matplotlib.pyplot as plt
7
+ import matplotlib.ticker as mtick
8
+ import seaborn as sns
9
+ import nltk
10
+ import evaluate
11
+ import traceback
12
+
13
+ bert_score = evaluate.load("bertscore")
14
+ meteor = evaluate.load("meteor")
15
+
16
+ print(f"loading: {__file__}")
17
+
18
+ # pattern_non_word_char_repetition = re.compile(r"\s{5,}")
19
+ # pattern_text_repetitions = re.compile(r"(.{5}.*)\s*((\1)\s*)+", re.M | re.DOTALL)
20
+
21
+ # final version
22
+ pattern_non_word_char_repetition = re.compile(r"[\s\W]{5,}")
23
+ pattern_text_repetitions = re.compile(
24
+ r"(?P<repeat>.{5}.*?)(?:[\s\W]*(?P=repeat))+", re.M | re.DOTALL | re.IGNORECASE
25
+ )
26
+ # Explanation of the Regex Pattern:
27
+ # (?P<repeat>.{5}.*?): Captures any sequence of characters with minimal length of 5 and names this group repeat.
28
+ # .*?: Matches zero or more characters, non-greedily (as few as possible).
29
+ # (?:[\s\W]+(?P=repeat))+: A non-capturing group that matches one or more repetitions of:
30
+ # [\s\W]+: One or more whitespace or non-word characters (spaces, punctuation, etc.).
31
+ # (?P=repeat): A backreference to the named group repeat.
32
+
33
+
34
+ def del_non_word_char_repetition(text, debug=False):
35
+ count = 0
36
+
37
+ if isinstance(text, str):
38
+ if debug:
39
+ print("----detect non-word characters repetition----")
40
+ count = len(text)
41
+ text = pattern_non_word_char_repetition.sub("\t", text)
42
+ count -= len(text)
43
+ if debug and count:
44
+ print(f"removed non-word characters repetition: {count}")
45
+ return text, count
46
+
47
+
48
+ # final version for repetition detection
49
+ def detect_text_repetitions(text, debug=False):
50
+ count = 0
51
+
52
+ if isinstance(text, str):
53
+ if debug:
54
+ print("----detect text repetitions----")
55
+ matches = pattern_text_repetitions.finditer(text)
56
+ for match in matches:
57
+ if debug:
58
+ print(match)
59
+ for groupNum in range(0, len(match.groups())):
60
+ groupNum = groupNum + 1
61
+ print(
62
+ "Group {groupNum} found at {start}-{end}: `{group}`".format(
63
+ groupNum=groupNum,
64
+ start=match.start(groupNum),
65
+ end=match.end(groupNum),
66
+ group=match.group(groupNum),
67
+ )
68
+ )
69
+
70
+ start, end = match.span()
71
+ count += end - start - len(match.group(1))
72
+
73
+ return count
74
+
75
+
76
+ def detect_repetitions(text, debug=False):
77
+ if isinstance(text, str) is False:
78
+ return 0, 0, 0
79
+ text, count_non_word_char_repetition = del_non_word_char_repetition(
80
+ text, debug=debug
81
+ )
82
+ count_text_repetitions = detect_text_repetitions(text, debug=debug)
83
+ total_repetitions = count_non_word_char_repetition + count_text_repetitions
84
+
85
+ result = (count_non_word_char_repetition, count_text_repetitions, total_repetitions)
86
+
87
+ if debug:
88
+ print(result)
89
+ return result
90
+
91
+
92
+ def detect_scores(
93
+ row, debug=False, answer_col="answer", ground_truth_col="ground_truth"
94
+ ):
95
+ newline_score, repetition_score, total_repetitions = detect_repetitions(
96
+ row[answer_col], debug=debug
97
+ )
98
+
99
+ if ground_truth_col:
100
+ ground_truth_newline_score, ground_truth_repetition_score, _ = (
101
+ detect_repetitions(row[ground_truth_col], debug=debug)
102
+ )
103
+
104
+ newline_score -= ground_truth_newline_score
105
+ if newline_score < 0:
106
+ newline_score = 0
107
+
108
+ repetition_score -= ground_truth_repetition_score
109
+ if repetition_score < 0:
110
+ repetition_score = 0
111
+
112
+ total_repetitions = newline_score + repetition_score
113
+
114
+ return pd.Series([newline_score, repetition_score, total_repetitions])
115
+
116
+
117
+ def load_with_newline_and_repetition_scores(result_file, force_recalculate=False):
118
+ print(f"loading result file: {result_file}")
119
+ df = pd.read_csv(result_file, comment="#", on_bad_lines="warn")
120
+
121
+ if (
122
+ force_recalculate
123
+ or "newline_score" not in df.columns
124
+ or "repetition_score" not in df.columns
125
+ or "total_repetitions" not in df.columns
126
+ or "nrr" not in df.columns
127
+ or "rr" not in df.columns
128
+ ):
129
+ if (
130
+ force_recalculate
131
+ or "newline_score" not in df.columns
132
+ or "repetition_score" not in df.columns
133
+ or "total_repetitions" not in df.columns
134
+ ):
135
+ df[["newline_score", "repetition_score", "total_repetitions"]] = df.apply(
136
+ detect_scores, axis=1
137
+ )
138
+
139
+ df["answer_len"] = df["answer"].apply(
140
+ lambda x: len(x) if isinstance(x, str) else 0
141
+ )
142
+
143
+ df["nrr"] = df.apply(
144
+ lambda x: (
145
+ 1
146
+ if x["answer_len"] == 0
147
+ else 1 - (x["newline_score"] + x["repetition_score"]) / x["answer_len"]
148
+ ),
149
+ axis=1,
150
+ )
151
+
152
+ df["rr"] = df["nrr"].apply(lambda x: 1 - x)
153
+
154
+ df.to_csv(result_file, index=False)
155
+
156
+ return df
157
+
158
+
159
+ def replace_last(source_string, old_string, new_string):
160
+ head, _sep, tail = source_string.rpartition(old_string)
161
+ return head + new_string + tail
162
+
163
+
164
+ def load_for_repetition_penalty(
165
+ csv_result_file, repetition_penalty, force_recalculate=False
166
+ ):
167
+ result_file = replace_last(
168
+ csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}.csv"
169
+ )
170
+ return load_with_newline_and_repetition_scores(
171
+ result_file, force_recalculate=force_recalculate
172
+ )
173
+
174
+
175
+ def calc_adjusted_performance(f, r, l=1):
176
+ n = 1 - r / l if l > 0 else 0
177
+ return f * n * n * n
178
+
179
+
180
+ def calculate_adjusted_performance(row):
181
+ r = row["total_repetitions"]
182
+ l = row["answer_len"]
183
+ adjusted_precision = calc_adjusted_performance(row["precision"], r, l)
184
+ adjusted_recall = calc_adjusted_performance(row["recall"], r, l)
185
+ return pd.Series([adjusted_precision, adjusted_recall])
186
+
187
+
188
+ def load_performance_df(csv_result_file, repetition_penalty):
189
+ result_file = replace_last(
190
+ csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}-t2_evaluated.json"
191
+ )
192
+ result_file = result_file.replace("/results/", "/eval/")
193
+ print(f"loading json file: {result_file}")
194
+ df = pd.read_json(result_file)
195
+
196
+ return df
197
+
198
+
199
+ def calculate_performance_score(
200
+ csv_result_file, repetition_penalty, force_recalculate=False
201
+ ):
202
+ result_file = replace_last(
203
+ csv_result_file, ".csv", f"_rpp_{repetition_penalty:.2f}.csv"
204
+ )
205
+
206
+ if os.path.exists(result_file):
207
+ print(f"loading result file: {result_file}")
208
+ df = load_with_newline_and_repetition_scores(
209
+ result_file, force_recalculate=force_recalculate
210
+ )
211
+ else:
212
+ print(f"re-creating result file: {result_file}")
213
+ df = pd.DataFrame()
214
+ force_recalculate = True
215
+
216
+ if force_recalculate or "f2" in df.columns or "f1" not in df.columns:
217
+ try:
218
+ perf_df = load_performance_df(csv_result_file, repetition_penalty)
219
+ df.drop(
220
+ columns=[
221
+ "precision",
222
+ "recall",
223
+ "f1",
224
+ "f2",
225
+ "entities_in_answer",
226
+ "entities_in_question",
227
+ "word_count",
228
+ ],
229
+ errors="ignore",
230
+ inplace=True,
231
+ )
232
+
233
+ df["id"] = perf_df["id"]
234
+ df["question"] = perf_df["question"]
235
+ df["answer"] = perf_df["pred_answer"]
236
+ df["word_count"] = df["answer"].apply(
237
+ lambda x: len(nltk.word_tokenize(x)) if isinstance(x, str) else 0
238
+ )
239
+ df["ground_truth"] = perf_df["ground_truth"]
240
+
241
+ df["eval_gemini_1.0_pro"] = perf_df["eval_gemini_1.0_pro"]
242
+ df["precision"] = perf_df["score"].apply(lambda x: x[0])
243
+ df["recall"] = perf_df["score"].apply(lambda x: x[1])
244
+ df["f1"] = perf_df["score"].apply(lambda x: x[2])
245
+ except Exception as e:
246
+ print(f"\tignored error: {e}")
247
+ # traceback.print_exc()
248
+
249
+ df[["newline_score", "repetition_score", "total_repetitions"]] = df.apply(
250
+ detect_scores, axis=1
251
+ )
252
+ df["answer_len"] = df["answer"].apply(
253
+ lambda x: len(x) if isinstance(x, str) else 0
254
+ )
255
+
256
+ df[["adjusted_precision", "adjusted_recall"]] = df.apply(
257
+ calculate_adjusted_performance, axis=1
258
+ )
259
+
260
+ df.to_csv(result_file, index=False)
261
+ print(f"performance scores saved to result file: {result_file}")
262
+
263
+ # print(f"df len: {len(df)}")
264
+
265
+ return df
266
+
267
+
268
+ def adjust_perf_scores_with_repetition_penalty(result, precision, recall):
269
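+ # Apply the RAP adjustment to the per-penalty mean precision/recall, using the
+ # mean repetition counts and mean answer length of each run.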
+ newline_score = [
270
+ df["newline_score"].mean() for df in result["df_list_repetition_penalty"]
271
+ ]
272
+
273
+ repetition_score = [
274
+ df["repetition_score"].mean() for df in result["df_list_repetition_penalty"]
275
+ ]
276
+
277
+ answer_len = [
278
+ df["answer_len"].mean() for df in result["df_list_repetition_penalty"]
279
+ ]
280
+
281
+ precision = [
282
+ calc_adjusted_performance(f, n + r, l)
283
+ for f, n, r, l in zip(precision, newline_score, repetition_score, answer_len)
284
+ ]
285
+ recall = [
286
+ calc_adjusted_performance(f, n + r, l)
287
+ for f, n, r, l in zip(recall, newline_score, repetition_score, answer_len)
288
+ ]
289
+
290
+ return precision, recall
291
+
292
+
293
+ def plot_performance_scores(
294
+ result,
295
+ models=None,
296
+ title="Performance",
297
+ ):
298
+ if models is None:
299
+ models = result.keys()
300
+ for model in models:
301
+ print(f"model: {model}")
302
+ df = result[model]["df_overall"]
303
+
304
+ # Calculate the statistics
305
+ precision = [
306
+ df["precision"].mean() for df in result[model]["df_list_repetition_penalty"]
307
+ ]
308
+ recall = [
309
+ df["recall"].mean() for df in result[model]["df_list_repetition_penalty"]
310
+ ]
311
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
312
+ best_f1 = max(f1)
313
+ best_f1_index = f1.index(best_f1)
314
+
315
+ precision, recall = adjust_perf_scores_with_repetition_penalty(
316
+ result[model], precision, recall
317
+ )
318
+ afrp = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
319
+
320
+ # f1 = [df["f1"].mean() for df in result[model]["df_list_repetition_penalty"]]
321
+ best_afrp = max(afrp)
322
+ best_afrp_index = afrp.index(best_afrp)
323
+
324
+ adjusted_precision = [
325
+ df["adjusted_precision"].mean()
326
+ for df in result[model]["df_list_repetition_penalty"]
327
+ ]
328
+ adjusted_recall = [
329
+ df["adjusted_recall"].mean()
330
+ for df in result[model]["df_list_repetition_penalty"]
331
+ ]
332
+ afrp2 = [
333
+ 2 * (p * r) / (p + r) for p, r in zip(adjusted_precision, adjusted_recall)
334
+ ]
335
+ best_afrp2 = max(afrp2)
336
+ best_afrp2_index = afrp2.index(best_afrp2)
337
+
338
+ repetition_penalties = list(df["repetition_penalty"])
339
+
340
+ # line plot for F1 and RAP - F1 across repetition penalties
341
+ plt.figure(figsize=(10, 6))
342
+
343
+ plt.axvspan(
344
+ repetition_penalties[best_f1_index] - 0.01,
345
+ repetition_penalties[best_f1_index] + 0.01,
346
+ alpha=0.5,
347
+ edgecolor="none",
348
+ facecolor="blue",
349
+ )
350
+
351
+ # plt.axvspan(
352
+ # repetition_penalties[best_afrp2_index] - 0.01,
353
+ # repetition_penalties[best_afrp2_index] + 0.01,
354
+ # alpha=0.5,
355
+ # edgecolor="none",
356
+ # facecolor="green",
357
+ # )
358
+
359
+ plt.axvspan(
360
+ repetition_penalties[best_afrp_index] - 0.01,
361
+ repetition_penalties[best_afrp_index] + 0.01,
362
+ alpha=0.5,
363
+ edgecolor="none",
364
+ facecolor="orange",
365
+ )
366
+
367
+ plt.plot(repetition_penalties, f1, label="F1", marker="D", color="blue")
368
+ # plt.plot(
369
+ # repetition_penalties,
370
+ # afrp2,
371
+ # label="Per-question RAP - F1",
372
+ # marker="s",
373
+ # color="green",
374
+ # )
375
+ plt.plot(
376
+ repetition_penalties,
377
+ afrp,
378
+ label="RAP - F1",
379
+ marker="o",
380
+ color="orange",
381
+ )
382
+ plt.xlabel("Repetition Penalties")
383
+ plt.ylabel("Score")
384
+ # plt.xlim(0.99, 1.31)
385
+ # y in percentage
386
+ plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
387
+ plt.title(f"{model} {title}")
388
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
389
+
390
+ plt.show()
391
+
392
+
393
+ def plot_best_afrp(
394
+ result,
395
+ models=None,
396
+ title="Models with Best RAP - F1",
397
+ ref_result=None,
398
+ ):
399
+ # Initialize lists to store the statistics
400
+ model_names = []
401
+ best_f1 = []
402
+ best_afrp = []
403
+ best_repetition_penalty = []
404
+ best_mtr = []
405
+
406
+ if models is None:
407
+ models = result.keys()
408
+ for model in models:
409
+ print(f"model: {model}")
410
+ df = result[model]["df_overall"]
411
+
412
+ # Calculate the statistics
413
+ precision = [
414
+ df["precision"].mean() for df in result[model]["df_list_repetition_penalty"]
415
+ ]
416
+ recall = [
417
+ df["recall"].mean() for df in result[model]["df_list_repetition_penalty"]
418
+ ]
419
+ # f1 = [df["f1"].mean() for df in result[model]["df_list_repetition_penalty"]]
420
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
421
+
422
+ newline_score = [
423
+ df["newline_score"].mean()
424
+ for df in result[model]["df_list_repetition_penalty"]
425
+ ]
426
+ # print(f"newline_score: {newline_score}")
427
+
428
+ repetition_score = [
429
+ df["repetition_score"].mean()
430
+ for df in result[model]["df_list_repetition_penalty"]
431
+ ]
432
+ # print(f"repetition_score: {repetition_score}")
433
+
434
+ answer_len = [
435
+ df["answer_len"].mean()
436
+ for df in result[model]["df_list_repetition_penalty"]
437
+ ]
438
+
439
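+ # RAP-adjusted F1 per repetition penalty; n + r is the mean total repetitions.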
+ afrp = [
440
+ calc_adjusted_performance(f, n + r, l)
441
+ for f, n, r, l in zip(f1, newline_score, repetition_score, answer_len)
442
+ ]
443
+
444
+ best_afrp.append(max(afrp))
445
+ best_afrp_index = afrp.index(best_afrp[-1])
446
+ best_repetition_penalty.append(df["repetition_penalty"][best_afrp_index])
447
+
448
+ best_f1.append(f1[best_afrp_index])
449
+ best_mtr.append(
450
+ newline_score[best_afrp_index] + repetition_score[best_afrp_index]
451
+ )
452
+
453
+ # print(
454
+ # f"best repetition penalty: {best_repetition_penalty[-1]}, best afrp: {best_afrp[-1]}, f1: {best_f1[-1]}"
455
+ # )
456
+
457
+ df = result[model]["df_list_repetition_penalty"][best_afrp_index]
458
+
459
+ model_names.append(
460
+ f"{model} (RP={best_repetition_penalty[-1]})"
461
+ ) # Add the model name to the list
462
+
463
+ if ref_result is not None:
464
+ print("ref_result:", ref_result)
465
+ for model in ref_result.keys():
466
+ model_names.append(model)
467
+ df = pd.read_csv(ref_result[model])
468
+ # df = df[df["id"].isin(wikidata_df["id"])]
469
+
470
+ p = df["precision"].mean()
471
+ r = df["recall"].mean()
472
+
473
+ f1 = 2 * p * r / (p + r) if p + r > 0 else 0
474
+ best_f1.append(f1)
475
+ best_afrp.append(f1)
476
+ best_mtr.append(0)
477
+
478
+ print("model_names:", model_names)
479
+ # print("best_f1:", best_f1)
480
+ # print("best_afrp:", best_afrp)
481
+
482
+ # Create a DataFrame with the statistics
483
+ data = pd.DataFrame(
484
+ {
485
+ "Model": model_names,
486
+ "RAP - F1": best_afrp,
487
+ "F1": best_f1,
488
+ }
489
+ )
490
+
491
+ # Melt the DataFrame to a long format
492
+ data_melted = data.melt(id_vars="Model", var_name="Metric", value_name="Score")
493
+
494
+ # Pivot the DataFrame to a wide format
495
+ data_pivoted = data_melted.pivot(index="Metric", columns="Model", values="Score")
496
+
497
+ # make sure the columns follow the order of the models
498
+ data_pivoted = data_pivoted[model_names]
499
+
500
+ # keep the metric rows in the order: RAP - F1, F1
501
+ data_pivoted = data_pivoted.reindex(["RAP - F1", "F1"])
502
+
503
+ # Plot the statistics
504
+ plt.figure(figsize=(15, 6))
505
+ ax = data_pivoted.plot(kind="bar", ax=plt.gca(), width=0.9)
506
+ plt.title(title)
507
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
508
+
509
+ # Set the rotation of the x-axis labels to 0 degrees
510
+ plt.xticks(rotation=0)
511
+
512
+ # Format the y-axis to display as percentage
513
+ ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
514
+
515
+ # get the max value of the y-axis
516
+ a1 = max(best_afrp)
517
+ a2 = max(best_f1)
518
+
519
+ max_value = max([a1, a2]) * 1.12
520
+ print("max_value:", max_value)
521
+
522
+ # Set the y-axis limit with ~12% headroom above the tallest bar
523
+ ax.set_ylim(0, max_value)
524
+
525
+ # Add the values above each bar
526
+ for p in ax.patches:
527
+ ax.annotate(
528
+ f"{p.get_height() * 100:.1f}",
529
+ (p.get_x() + p.get_width() / 2.0, p.get_height()),
530
+ ha="center",
531
+ va="bottom",
532
+ xytext=(0, 10),
533
+ textcoords="offset points",
534
+ rotation=90,
535
+ )
536
+
537
+ plt.show()
538
+ return data_pivoted, best_mtr
539
+
540
+
541
+ def plot_best_performance(
542
+ result,
543
+ models=None,
544
+ title="Models with Best F1 Score",
545
+ adjusted_f1=False,
546
+ ref_result=None,
547
+ ):
548
+ # Initialize lists to store the statistics
549
+ model_names = []
550
+ best_precision = []
551
+ best_recall = []
552
+ best_f1 = []
553
+ best_repetition_penalty = []
554
+ best_mtr = []
555
+
556
+ if models is None:
557
+ models = result.keys()
558
+ for model in models:
559
+ print(f"model: {model}")
560
+ df = result[model]["df_overall"]
561
+
562
+ # Calculate the statistics
563
+ precision = [
564
+ df["precision"].mean() for df in result[model]["df_list_repetition_penalty"]
565
+ ]
566
+ recall = [
567
+ df["recall"].mean() for df in result[model]["df_list_repetition_penalty"]
568
+ ]
569
+ newline_score = [
570
+ df["newline_score"].mean()
571
+ for df in result[model]["df_list_repetition_penalty"]
572
+ ]
573
+
574
+ repetition_score = [
575
+ df["repetition_score"].mean()
576
+ for df in result[model]["df_list_repetition_penalty"]
577
+ ]
578
+
579
+ if adjusted_f1:
580
+ precision, recall = adjust_perf_scores_with_repetition_penalty(
581
+ result[model], precision, recall
582
+ )
583
+
584
+ # f1 = [df["f1"].mean() for df in result[model]["df_list_repetition_penalty"]]
585
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
586
+
587
+ best_f1.append(max(f1))
588
+ best_f1_index = f1.index(best_f1[-1])
589
+ best_repetition_penalty.append(df["repetition_penalty"][best_f1_index])
590
+
591
+ best_precision.append(precision[best_f1_index])
592
+ best_recall.append(recall[best_f1_index])
593
+ best_mtr.append(newline_score[best_f1_index] + repetition_score[best_f1_index])
594
+
595
+ print(
596
+ f"best repetition penalty: {best_repetition_penalty[-1]}, best f1: {best_f1[-1]}, precision: {best_precision[-1]}, recall: {best_recall[-1]}"
597
+ )
598
+
599
+ df = result[model]["df_list_repetition_penalty"][best_f1_index]
600
+
601
+ model_names.append(
602
+ f"{model} (RP={best_repetition_penalty[-1]})"
603
+ ) # Add the model name to the list
604
+
605
+ # print sum for columns: newline_score, repetition_score
606
+ print(
607
+ f"newline_score: {df['newline_score'].sum()}, repetition_score: {df['repetition_score'].sum()}"
608
+ )
609
+
610
+ if ref_result is not None:
611
+ print("ref_result:", ref_result)
612
+ for model in ref_result.keys():
613
+ model_names.append(model)
614
+ df = pd.read_csv(ref_result[model])
615
+ # df = df[df["id"].isin(wikidata_df["id"])]
616
+
617
+ best_precision.append(df["precision"].mean())
618
+ best_recall.append(df["recall"].mean())
619
+ p, r = best_precision[-1], best_recall[-1]
620
+ f1 = 2 * p * r / (p + r) if p + r > 0 else 0
624
+ # best_f1.append(df["f1"].mean())
625
+ best_f1.append(f1)
626
+ best_mtr.append(0)
627
+
628
+ # Create a DataFrame with the statistics
629
+ data = (
630
+ pd.DataFrame(
631
+ {
632
+ "Model": model_names,
633
+ "Adjusted Precision with RP": best_precision,
634
+ "Adjusted Recall with RP": best_recall,
635
+ "Adjusted F1 with RP": best_f1,
636
+ }
637
+ )
638
+ if adjusted_f1
639
+ else pd.DataFrame(
640
+ {
641
+ "Model": model_names,
642
+ "Precision": best_precision,
643
+ "Recall": best_recall,
644
+ "F1": best_f1,
645
+ }
646
+ )
647
+ )
648
+ columns = list(data.columns)
649
+
650
+ # Melt the DataFrame to a long format
651
+ data_melted = data.melt(id_vars="Model", var_name="Metric", value_name="Score")
652
+
653
+ # Pivot the DataFrame to a wide format
654
+ data_pivoted = data_melted.pivot(index="Metric", columns="Model", values="Score")
655
+
656
+ # make sure the columns follow the order of the models
657
+ data_pivoted = data_pivoted[model_names]
658
+
659
+ # keep the three metric rows in the order: precision, recall, F1
660
+ data_pivoted = data_pivoted.reindex(columns[1:])
661
+
662
+ # Plot the statistics
663
+ plt.figure(figsize=(10, 6))
664
+ ax = data_pivoted.plot(kind="bar", ax=plt.gca(), width=0.9)
665
+ plt.title(title)
666
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
667
+
668
+ # Set the rotation of the x-axis labels to 0 degrees
669
+ plt.xticks(rotation=0)
670
+
671
+ # Format the y-axis to display as percentage
672
+ ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
673
+
674
+ # get the max value of the y-axis
675
+ a1 = max(best_precision)
676
+ a2 = max(best_recall)
677
+ a3 = max(best_f1)
678
+
679
+ max_value = max([a1, a2, a3]) * 1.12
680
+ print("max_value:", max_value)
681
+
682
+ # Set the y-axis limit with ~12% headroom above the tallest bar
683
+ ax.set_ylim(0, max_value)
684
+
685
+ # Add the values above each bar
686
+ for p in ax.patches:
687
+ ax.annotate(
688
+ f"{p.get_height() * 100:.1f}",
689
+ (p.get_x() + p.get_width() / 2.0, p.get_height()),
690
+ ha="center",
691
+ va="bottom",
692
+ xytext=(0, 10),
693
+ textcoords="offset points",
694
+ rotation=90,
695
+ )
696
+
697
+ plt.show()
698
+ return data_pivoted, best_mtr
699
+
700
+
701
+ def plot_best_performance_ms_macro(
702
+ result,
703
+ models=None,
704
+ title="Models with Best RAP - Performance",
705
+ ref_result=None,
706
+ skip_generic_prompt=False,
707
+ include_adjusted_performance=True,
708
+ ):
709
+ # Initialize lists to store the statistics
710
+ model_names = []
711
+ best_f1 = []
712
+ best_afrp = []
713
+ best_repetition_penalty = []
714
+ best_bleu1 = []
715
+ best_rougeL = []
716
+ best_mtr = []
717
+
718
+ if models is None:
719
+ models = result.keys()
720
+ for model in models:
721
+ if skip_generic_prompt and "generic prompt" in model:
722
+ continue
723
+ print(f"model: {model}")
724
+ df = result[model]["df_overall"]
725
+
726
+ # Calculate the statistics
727
+ bleu1 = list(df["bleu1"])
728
+ rougeL = list(df["rougeL"])
729
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(bleu1, rougeL)]
730
+
731
+ newline_score = [
732
+ df["newline_score"].mean()
733
+ for df in result[model]["df_list_repetition_penalty"]
734
+ ]
735
+ # print(f"newline_score: {newline_score}")
736
+
737
+ repetition_score = [
738
+ df["repetition_score"].mean()
739
+ for df in result[model]["df_list_repetition_penalty"]
740
+ ]
741
+ # print(f"repetition_score: {repetition_score}")
742
+
743
+ answer_len = [
744
+ df["answer_len"].mean()
745
+ for df in result[model]["df_list_repetition_penalty"]
746
+ ]
747
+
748
+ afrp = [
749
+ calc_adjusted_performance(f, n + r, l)
750
+ for f, n, r, l in zip(f1, newline_score, repetition_score, answer_len)
751
+ ]
752
+
753
+ best_afrp.append(max(afrp if include_adjusted_performance else f1))
754
+ best_afrp_index = (
755
+ afrp.index(best_afrp[-1])
756
+ if include_adjusted_performance
757
+ else f1.index(best_afrp[-1])
758
+ )
759
+ best_repetition_penalty.append(df["repetition_penalty"][best_afrp_index])
760
+
761
+ best_f1.append(f1[best_afrp_index])
762
+ best_bleu1.append(bleu1[best_afrp_index])
763
+ best_rougeL.append(rougeL[best_afrp_index])
764
+ best_mtr.append(
765
+ newline_score[best_afrp_index] + repetition_score[best_afrp_index]
766
+ )
767
+
768
+ # print(
769
+ # f"best repetition penalty: {best_repetition_penalty[-1]}, best afrp: {best_afrp[-1]}, f1: {best_f1[-1]}"
770
+ # )
771
+
772
+ df = result[model]["df_list_repetition_penalty"][best_afrp_index]
773
+
774
+ model_names.append(
775
+ f"{model} (RP={best_repetition_penalty[-1]})"
776
+ ) # Add the model name to the list
777
+
778
+ if ref_result is not None:
779
+ print("ref_result:", ref_result)
780
+ for model in ref_result.keys():
781
+ model_names.append(model)
782
+ df = pd.read_csv(ref_result[model], comment="#", on_bad_lines="warn")
783
+ # df = df[df["id"].isin(wikidata_df["id"])]
784
+
785
+ p = df["bleu1"][0]
786
+ best_bleu1.append(p)
787
+
788
+ r = df["rougeL"][0]
789
+ best_rougeL.append(r)
790
+
791
+ f1 = 2 * p * r / (p + r) if p + r > 0 else 0
792
+ best_f1.append(f1)
793
+ best_afrp.append(f1)
794
+ best_mtr.append(0)
795
+
796
+ # print("model_names:", model_names)
797
+ # print("best_f1:", best_f1)
798
+ # print("best_afrp:", best_afrp)
799
+
800
+ # Create a DataFrame with the statistics
801
+ data = (
802
+ pd.DataFrame(
803
+ {
804
+ "Model": model_names,
805
+ "RAP - Perf Score": best_afrp,
806
+ "Overall Perf Score": best_f1,
807
+ }
808
+ )
809
+ if include_adjusted_performance
810
+ else pd.DataFrame(
811
+ {
812
+ "Model": model_names,
813
+ "Bleu-1": best_bleu1,
814
+ "Rouge-L": best_rougeL,
815
+ "Overall Perf Score": best_f1,
816
+ }
817
+ )
818
+ )
819
+
820
+ # Melt the DataFrame to a long format
821
+ data_melted = data.melt(id_vars="Model", var_name="Metric", value_name="Score")
822
+
823
+ # Pivot the DataFrame to a wide format
824
+ data_pivoted = data_melted.pivot(index="Metric", columns="Model", values="Score")
825
+
826
+ # make sure the columns follow the order of the models
827
+ data_pivoted = data_pivoted[model_names]
828
+
829
+ columns = list(data.columns)
830
+ data_pivoted = data_pivoted.reindex(columns[1:])
831
+
832
+ # Plot the statistics
833
+ plt.figure(figsize=(10, 6))
834
+ ax = data_pivoted.plot(kind="bar", ax=plt.gca(), width=0.9)
835
+ plt.title(title)
836
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
837
+
838
+ # Set the rotation of the x-axis labels to 0 degrees
839
+ plt.xticks(rotation=0)
840
+
841
+ # Format the y-axis to display as percentage
842
+ ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
843
+
844
+ # get the max value of the y-axis
845
+ a1 = max(best_afrp)
846
+ a2 = max(best_f1)
847
+ a3 = max(best_bleu1)
848
+ a4 = max(best_rougeL)
849
+
850
+ max_value = (
851
+ max([a1, a2] if include_adjusted_performance else [a1, a2, a3, a4]) * 1.12
852
+ )
853
+ print("max_value:", max_value)
854
+
855
+ # Set the y-axis limit with ~12% headroom above the tallest bar
856
+ ax.set_ylim(0, max_value)
857
+
858
+ # Add the values above each bar
859
+ for p in ax.patches:
860
+ ax.annotate(
861
+ f"{p.get_height() * 100:.1f}",
862
+ (p.get_x() + p.get_width() / 2.0, p.get_height()),
863
+ ha="center",
864
+ va="bottom",
865
+ xytext=(0, 10),
866
+ textcoords="offset points",
867
+ rotation=90,
868
+ )
869
+
870
+ plt.show()
871
+ return data_pivoted, best_mtr
872
+
873
+
874
+ all_open_source_models = [
875
+ "gemma-1.1-2b-it",
876
+ "Phi-3-mini-128k-instruct",
877
+ "gemma-1.1-7b-it",
878
+ "Llama-2-7b-chat-hf",
879
+ "Mistral-7B-Instruct-v0.2",
880
+ "Meta-Llama-3-8B-Instruct",
881
+ "Llama-2-13b-chat-hf",
882
+ "Llama-2-70b-chat-hf",
883
+ "Meta-Llama-3-70B-Instruct",
884
+ ]
885
+
886
+
887
+ def load_for_repetition_penalty_ms_macro(
888
+ csv_result_file, repetition_penalty, force_recalculate=False
889
+ ):
890
+ result_file = replace_last(
891
+ csv_result_file, ".csv", f"_rpp_{repetition_penalty:.2f}.csv"
892
+ )
893
+ df = load_with_newline_and_repetition_scores(
894
+ result_file, force_recalculate=force_recalculate
895
+ )
896
+
897
+ return df
898
+
899
+
900
+ # MS MARCO
901
+ def plot_performance_scores_ms_macro(
902
+ result,
903
+ models=None,
904
+ title="Performance",
905
+ ):
906
+ if models is None:
907
+ models = result.keys()
908
+ for model in models:
909
+ print(f"model: {model}")
910
+ df = result[model]["df_overall"]
911
+ # print(result[model]["df_list_repetition_penalty"][0].describe())
912
+
913
+ # Calculate the statistics
914
+ bleu1 = list(df["bleu1"])
915
+ rougeL = list(df["rougeL"])
916
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(bleu1, rougeL)]
917
+ best_f1 = max(f1)
918
+ best_f1_index = f1.index(best_f1)
919
+
920
+ bleu1, rougeL = adjust_perf_scores_with_repetition_penalty(
921
+ result[model], bleu1, rougeL
922
+ )
923
+ afrp = [2 * (p * r) / (p + r) for p, r in zip(bleu1, rougeL)]
924
+
925
+ # f1 = [df["f1"].mean() for df in result[model]["df_list_repetition_penalty"]]
926
+ best_afrp = max(afrp)
927
+ best_afrp_index = afrp.index(best_afrp)
928
+
929
+ repetition_penalties = list(df["repetition_penalty"])
930
+
931
+ # line plot for Overall Perf Score and RAP - Perf Score across repetition penalties
932
+ plt.figure(figsize=(10, 6))
933
+
934
+ plt.axvspan(
935
+ repetition_penalties[best_f1_index] - 0.01,
936
+ repetition_penalties[best_f1_index] + 0.01,
937
+ alpha=0.5,
938
+ edgecolor="none",
939
+ facecolor="blue",
940
+ )
941
+
942
+ plt.axvspan(
943
+ repetition_penalties[best_afrp_index] - 0.01,
944
+ repetition_penalties[best_afrp_index] + 0.01,
945
+ alpha=0.5,
946
+ edgecolor="none",
947
+ facecolor="orange",
948
+ )
949
+
950
+ plt.plot(
951
+ repetition_penalties,
952
+ f1,
953
+ label="Overall Perf Score",
954
+ marker="D",
955
+ color="blue",
956
+ )
957
+ plt.plot(
958
+ repetition_penalties,
959
+ afrp,
960
+ label="RAP - Perf Score",
961
+ marker="o",
962
+ color="orange",
963
+ )
964
+
965
+ plt.xlabel("Repetition Penalties")
966
+ plt.ylabel("Score")
967
+ # plt.xlim(0.99, 1.31)
968
+ # y in percentage
969
+ plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
970
+ plt.title(f"{model} {title}")
971
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
972
+
973
+ plt.show()
974
+
975
+
976
+ def plot_repetition_factors(result, groups):
977
+ for group in groups:
978
+ # Plot the statistics
979
+ plt.figure(figsize=(10, 6))
980
+
981
+ max_value = 0
982
+ for model in result.keys():
983
+ if group not in model.lower():
984
+ continue
985
+ print(f"model: {model}")
986
+ df = result[model]["df_overall"]
987
+ repetition_penalties = list(df["repetition_penalty"])
990
+
991
+ mean_score = [
992
+ df["total_repetitions"].mean()
993
+ for df in result[model]["df_list_repetition_penalty"]
994
+ ]
995
+
996
+ sns.lineplot(x=repetition_penalties, y=mean_score, label=model)
997
+
998
+ new_max = max(mean_score)
999
+ if new_max > max_value:
1000
+ max_value = new_max
1001
+
1002
+ max_value = max_value * 1.05
1003
+ # if max_value < 1.5:
1004
+ # max_value = 1.5
1005
+ # set ylimit
1006
+ plt.ylim(0, max_value)
1007
+
1008
+ # show grid
1009
+ plt.grid(True)
1010
+ plt.xlabel("Repetition Penalties")
1011
+ plt.ylabel("Mean Total Repetitions")
1012
+ plt.title("Mean Total Repetitions vs Repetition Penalties")
1013
+ plt.legend()
1014
+
1015
+ plt.show()
1016
+
1017
+
1018
+ def plot_repetition_factors_by_group(result, group_filter=None):
1019
+ markers = ["D", "o", "s", "x"]
1020
+ colors = ["blue", "orange", "green", "red"]
1021
+
1022
+ # Plot the statistics
1023
+ plt.figure(figsize=(10, 6))
1024
+ index = 0
1025
+ max_value = 0
1026
+
1027
+ for model in result.keys():
1028
+ if group_filter is not None and group_filter not in model:
1029
+ continue
1030
+
1031
+ print(f"model: {model}")
1032
+
1033
+ df = result[model]["df_overall"]
1034
+ repetition_penalties = list(df["repetition_penalty"])
1037
+
1038
+ # Calculate the statistics
1039
+ mean_score = [
1040
+ df["total_repetitions"].mean()
1041
+ for df in result[model]["df_list_repetition_penalty"]
1042
+ ]
1043
+ if len(mean_score) != len(repetition_penalties):
1044
+ print(
1045
+ f"model: {model} has different length of repetition penalties and mean score"
1046
+ )
1047
+ print("repetition_panelties:", len(repetition_panelties))
1048
+ print("mean_score:", len(mean_score))
1049
+ continue
1050
+
1051
+ new_max = max(mean_score)
1052
+ if new_max > max_value:
1053
+ max_value = new_max
1054
+
1055
+ sns.lineplot(
1056
+ x=repetition_penalties,
1057
+ y=mean_score,
1058
+ label=model,
1059
+ marker=markers[index % len(markers)],
1060
+ color=colors[index % len(colors)],
1061
+ )
1062
+
1063
+ index += 1
1064
+
1065
+ max_value = max_value * 1.05
1066
+ # if max_value < 1.5:
1067
+ # max_value = 1.5
1068
+ # set ylimit
1069
+ plt.ylim(0, max_value)
1070
+
1072
+ plt.xlabel("Repetition Penalties")
1073
+ plt.ylabel("Mean Total Repetitions")
1074
+ plt.title("Mean Total Repetitions vs Repetition Penalties")
1075
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
1076
+
1077
+ plt.show()
1078
+
1079
+
1080
+ ms_marco_csv_result_files = [
1081
+ "data/results_v2/gemma-1.1-2b-it(RAG - Generic Prompt)_mm.csv",
1082
+ "data/results_v2/gemma-1.1-2b-it(RAG - Chat Template)_mm.csv",
1083
+ "data/results_v2/gemma-1.1-2b-it(Non-RAG)_mm.csv",
1084
+ "data/results_v2/Phi-3-mini-128k-instruct(RAG - Generic Prompt)_mm.csv",
1085
+ "data/results_v2/Phi-3-mini-128k-instruct(RAG - Chat Template)_mm.csv",
1086
+ "data/results_v2/Phi-3-mini-128k-instruct(Non-RAG)_mm.csv",
1087
+ "data/results_v2/gemma-1.1-7b-it(RAG - Generic Prompt)_mm.csv",
1088
+ "data/results_v2/gemma-1.1-7b-it(RAG - Chat Template)_mm.csv",
1089
+ "data/results_v2/gemma-1.1-7b-it(Non-RAG)_mm.csv",
1090
+ "data/results_v2/Llama-2-7b-chat-hf(RAG - Generic Prompt)_mm.csv",
1091
+ "data/results_v2/Llama-2-7b-chat-hf(RAG - Chat Template)_mm.csv",
1092
+ "data/results_v2/Llama-2-7b-chat-hf(Non-RAG)_mm.csv",
1093
+ "data/results_v2/Mistral-7B-Instruct-v0.2(RAG - Generic Prompt)_mm.csv",
1094
+ "data/results_v2/Mistral-7B-Instruct-v0.2(RAG - Chat Template)_mm.csv",
1095
+ "data/results_v2/Mistral-7B-Instruct-v0.2(Non-RAG)_mm.csv",
1096
+ "data/results_v2/Meta-Llama-3-8B-Instruct(RAG - Generic Prompt)_mm.csv",
1097
+ "data/results_v2/Meta-Llama-3-8B-Instruct(RAG - Chat Template)_mm.csv",
1098
+ "data/results_v2/Meta-Llama-3-8B-Instruct(Non-RAG)_mm.csv",
1099
+ "data/results_v2/Llama-2-13b-chat-hf(RAG - Generic Prompt)_mm.csv",
1100
+ "data/results_v2/Llama-2-13b-chat-hf(RAG - Chat Template)_mm.csv",
1101
+ "data/results_v2/Llama-2-13b-chat-hf(Non-RAG)_mm.csv",
1102
+ "data/results_v2/Llama-2-70b-chat-hf(RAG - Generic Prompt)_mm.csv",
1103
+ "data/results_v2/Llama-2-70b-chat-hf(RAG - Chat Template)_mm.csv",
1104
+ "data/results_v2/Llama-2-70b-chat-hf(Non-RAG)_mm.csv",
1105
+ "data/results_v2/Meta-Llama-3-70B-Instruct(RAG - Generic Prompt)_mm.csv",
1106
+ "data/results_v2/Meta-Llama-3-70B-Instruct(RAG - Chat Template)_mm.csv",
1107
+ "data/results_v2/Meta-Llama-3-70B-Instruct(Non-RAG)_mm.csv",
1108
+ ]
1109
+
1110
+ webqsp_csv_result_files = [
1111
+ "data/results_v2/gemma-1.1-2b-it(RAG - Generic Prompt)_wd.csv",
1112
+ "data/results_v2/gemma-1.1-2b-it(RAG - Chat Template)_wd.csv",
1113
+ "data/results_v2/gemma-1.1-2b-it(Non-RAG)_wd.csv",
1114
+ "data/results_v2/Phi-3-mini-128k-instruct(RAG - Generic Prompt)_wd.csv",
1115
+ "data/results_v2/Phi-3-mini-128k-instruct(RAG - Chat Template)_wd.csv",
1116
+ "data/results_v2/Phi-3-mini-128k-instruct(Non-RAG)_wd.csv",
1117
+ "data/results_v2/gemma-1.1-7b-it(RAG - Generic Prompt)_wd.csv",
1118
+ "data/results_v2/gemma-1.1-7b-it(RAG - Chat Template)_wd.csv",
1119
+ "data/results_v2/gemma-1.1-7b-it(Non-RAG)_wd.csv",
1120
+ "data/results_v2/Llama-2-7b-chat-hf(RAG - Generic Prompt)_wd.csv",
1121
+ "data/results_v2/Llama-2-7b-chat-hf(RAG - Chat Template)_wd.csv",
1122
+ "data/results_v2/Llama-2-7b-chat-hf(Non-RAG)_wd.csv",
1123
+ "data/results_v2/Mistral-7B-Instruct-v0.2(RAG - Generic Prompt)_wd.csv",
1124
+ "data/results_v2/Mistral-7B-Instruct-v0.2(RAG - Chat Template)_wd.csv",
1125
+ "data/results_v2/Mistral-7B-Instruct-v0.2(Non-RAG)_wd.csv",
1126
+ "data/results_v2/Meta-Llama-3-8B-Instruct(RAG - Generic Prompt)_wd.csv",
1127
+ "data/results_v2/Meta-Llama-3-8B-Instruct(RAG - Chat Template)_wd.csv",
1128
+ "data/results_v2/Meta-Llama-3-8B-Instruct(Non-RAG)_wd.csv",
1129
+ "data/results_v2/Llama-2-13b-chat-hf(RAG - Generic Prompt)_wd.csv",
1130
+ "data/results_v2/Llama-2-13b-chat-hf(RAG - Chat Template)_wd.csv",
1131
+ "data/results_v2/Llama-2-13b-chat-hf(Non-RAG)_wd.csv",
1132
+ "data/results_v2/Llama-2-70b-chat-hf(RAG - Generic Prompt)_wd.csv",
1133
+ "data/results_v2/Llama-2-70b-chat-hf(RAG - Chat Template)_wd.csv",
1134
+ "data/results_v2/Llama-2-70b-chat-hf(Non-RAG)_wd.csv",
1135
+ "data/results_v2/Meta-Llama-3-70B-Instruct(RAG - Generic Prompt)_wd.csv",
1136
+ "data/results_v2/Meta-Llama-3-70B-Instruct(RAG - Chat Template)_wd.csv",
1137
+ "data/results_v2/Meta-Llama-3-70B-Instruct(Non-RAG)_wd.csv",
1138
+ ]
1139
+
1140
+
1141
+ def calc_rap_scores(result, precision="precision", recall="recall"):
1142
+ newline_score = [
1143
+ df["newline_score"].mean() for df in result["df_list_repetition_penalty"]
1144
+ ]
1145
+
1146
+ repetition_score = [
1147
+ df["repetition_score"].mean() for df in result["df_list_repetition_penalty"]
1148
+ ]
1149
+
1150
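+ # Per-question metrics are averaged from the per-penalty DataFrames;
+ # dataset-level metrics (e.g. bleu1, meteor) are read from df_overall.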
+ if precision in result["df_list_repetition_penalty"][0].columns:
1151
+ precision = [
1152
+ df[precision].mean() for df in result["df_list_repetition_penalty"]
1153
+ ]
1154
+ recall = [df[recall].mean() for df in result["df_list_repetition_penalty"]]
1155
+ else:
1156
+ precision = result["df_overall"][precision]
1157
+ recall = result["df_overall"][recall]
1158
+
1159
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
1160
+
1161
+ nrr = [
1162
+ 1 - (n + r) / s
1163
+ for f, n, r, s in zip(
1164
+ f1, newline_score, repetition_score, result["df_overall"]["answer_len"]
1165
+ )
1166
+ ]
1167
+
1168
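+ # With the default l=1, calc_adjusted_performance(f, 1 - nrr) is f * nrr ** 3.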
+ rap = [calc_adjusted_performance(f, 1 - n) for f, n in zip(f1, nrr)]
1169
+
1170
+ return newline_score, repetition_score, f1, rap, nrr
1171
+
1172
+
1173
+ def get_model_name(csv_result_file):
1174
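+ # For paths like "data/results_v2/<model>(<prompt>)_mm.csv" (as in the lists
+ # above), splitting on "_" and "/" leaves the model name at index 3.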
+ parts = re.split(r"[_/]", csv_result_file)
1175
+ print(f"parts: {parts}")
1176
+ model_name = parts[3]
1177
+ return model_name
1178
+
1179
+
1180
+ def load_webqsp_result(csv_result_files, force_recalculate=False, save=False):
1181
+ result = {}
1182
+ for i, csv_result_file in enumerate(csv_result_files):
1183
+ try:
1184
+ df = pd.read_csv(csv_result_file)
1185
+ model_name = get_model_name(csv_result_file)
1186
+ print(f"\tmodel_name: {model_name}")
1187
+
1188
+ dfs = [
1189
+ calculate_performance_score(
1190
+ csv_result_file,
1191
+ repetition_penalty,
1192
+ force_recalculate=force_recalculate,
1193
+ )
1194
+ for repetition_penalty in df["repetition_penalty"]
1195
+ ]
1196
+
1197
+ answer_lens = []
1198
+ for df_rpp in dfs:
1199
+ answer_lens.append(df_rpp["answer_len"].mean())
1200
+ df["answer_len"] = answer_lens
1201
+
1202
+ result[model_name] = {
1203
+ "df_overall": df,
1204
+ "df_list_repetition_penalty": dfs,
1205
+ "file": csv_result_file,
1206
+ }
1207
+ newline_score, repetition_score, perf, rap, nrr = calc_rap_scores(
1208
+ result[model_name]
1209
+ )
1210
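+ # Attach the per-penalty aggregates and RAP metrics to the overall DataFrame.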
+ df["newline_score"] = newline_score
1211
+ df["repetition_score"] = repetition_score
1212
+ df["total_repetitions"] = df["newline_score"] + df["repetition_score"]
1213
+ df["perf"] = perf
1214
+ df["nrr"] = nrr
1215
+ df["rap"] = rap
1216
+ df["rr"] = df["nrr"].apply(lambda x: 1 - x)
1217
+ if save:
1218
+ df.to_csv(csv_result_file, index=False)
1219
+ except Exception as e:
1220
+ print(f"Error: {e}")
1221
+ traceback.print_exc()
1222
+
1223
+ return result
1224
+
1225
+
1226
+ def load_ms_marco_result(
1227
+ csv_result_files, force_recalculate=False, calc_bertscore=False, save=False
1228
+ ):
1229
+ result = {}
1230
+ for csv_result_file in csv_result_files:
1231
+ try:
1232
+ df = pd.read_csv(csv_result_file)
1233
+ model_name = get_model_name(csv_result_file)
1234
+ print(f"\tmodel_name: {model_name}")
1235
+
1236
+ dfs = [
1237
+ load_for_repetition_penalty_ms_macro(
1238
+ csv_result_file,
1239
+ repetition_penalty,
1240
+ force_recalculate=force_recalculate,
1241
+ )
1242
+ for repetition_penalty in df["repetition_penalty"]
1243
+ ]
1244
+
1245
+ answer_lens = []
1246
+ for df_rpp in dfs:
1247
+ answer_lens.append(df_rpp["answer_len"].mean())
1248
+ df["answer_len"] = answer_lens
1249
+
1250
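+ # Pick the quality metric: per-row BERTScore F1 (averaged over rows) when
+ # calc_bertscore is set, otherwise corpus-level METEOR.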
+ col = "bert_score" if calc_bertscore else "meteor"
1251
+ score_unavailable = col not in df.columns
1252
+
1253
+ if score_unavailable:
1254
+ save = True
1255
+ bert_meteor_scores = []
1256
+ bert_score_references = None
1257
+ for df_rpp in dfs:
1258
+ if calc_bertscore:
1259
+ bert_meteor_score = 0
1260
+
1261
+ for i, row in df_rpp.iterrows():
1262
+ answer = row["answer"]
1263
+ if not isinstance(answer, str):
1264
+ answer = ""
1265
+ bert_meteor_score += bert_score.compute(
1266
+ predictions=[answer],
1267
+ references=[row["ground_truth"][0]],
1268
+ lang="en",
1269
+ model_type="microsoft/deberta-large-mnli",
1270
+ )["f1"][0]
1271
+ # get average of bertscore
1272
+ bert_meteor_score = bert_meteor_score / len(df_rpp)
1273
+
1274
+ print(f"bert_score: {bert_meteor_score}")
1275
+ else:
1276
+ bert_meteor_score = meteor.compute(
1277
+ predictions=df_rpp["answer"],
1278
+ references=df_rpp["ground_truth"],
1279
+ )["meteor"]
1280
+
1281
+ bert_meteor_scores.append(bert_meteor_score)
1282
+
1283
+ df[col] = bert_meteor_scores
1284
+
1285
+ result[model_name] = {
1286
+ "df_overall": df,
1287
+ "df_list_repetition_penalty": dfs,
1288
+ "file": csv_result_file,
1289
+ }
1290
+ newline_score, repetition_score, perf, rap, nrr = calc_rap_scores(
1291
+ result[model_name],
1292
+ precision=col,
1293
+ recall=col,
1294
+ )
1295
+ df["newline_score"] = newline_score
1296
+ df["repetition_score"] = repetition_score
1297
+ df["total_repetitions"] = df["newline_score"] + df["repetition_score"]
1298
+ df["perf"] = perf
1299
+ df["nrr"] = nrr
1300
+ df["rap"] = rap
1301
+ df["rr"] = df["nrr"].apply(lambda x: 1 - x)
1302
+
1303
+ if save:
1304
+ df.to_csv(csv_result_file, index=False)
1305
+ except Exception as e:
1306
+ print("An error occurred:", e)
1307
+ traceback.print_exc()
1308
+ print(f"csv_result_file: {csv_result_file}")
1309
+
1310
+ return result
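+ 
+ 
+ # Usage sketch (illustrative; assumes the result CSVs listed above exist
+ # locally and that plots are rendered in a notebook):
+ #
+ #     result = load_ms_marco_result(ms_marco_csv_result_files)
+ #     plot_performance_scores_ms_macro(result)
+ #     data_pivoted, best_mtr = plot_best_performance_ms_macro(result)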
notebooks/03a_RAPGeT_v2_Data Analysis_Chat_Template.ipynb CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6273fc3413aa0c507438f5061c2047948cee5516d16244aacbd2f6f72b19dfff
3
- size 1562071
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:205197936482de4ebc17e7cad622a0e699303d062112cc45df85477e7f1f8328
3
+ size 1557858
notebooks/03b_RAPGeT_v2_Data Analysis_Generic_Prompt.ipynb CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b948f85d9b7ba464ddadbd63be90d285641d64d366140a55a2999f69b21b2021
3
- size 14255738
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e3885c9aa8fd8a1f83f6693df9c68a278575b6b1caf9e087c00eb6264d3e886
3
+ size 26471820
notebooks/03c_RAPGeT_v2_Data Analysis.ipynb CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4f31409b55a833748353233d379d184488373cab3e54697002a503877914a2af
3
- size 1954627
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:26dcf7f7d287ca5c6135b26533495ec4d56ed3b0a7eac7d0e10f12aec4d95257
3
+ size 1714764
results/mac-results_rpp_with_mnt_2048_generic_prompt_metrics.csv CHANGED
@@ -1,26 +1,27 @@
1
  model,rpp,comet,meteor,spbleu,bleu_1,rouge_l,ews_score,repetition_score,total_repetitions,rr,rap,translation_completeness,num_max_output_tokens
2
- internlm/internlm2_5-7b-chat,1.00,0.7357995069773978,0.4297612514398102,15.060226683930628,0.1506022668393063,0.4097577795330234,0.04942630185348632,9.235657546337158,9.285083848190645,0.07525035765379114,0.6824623187873116,1.0,2
3
- internlm/internlm2_5-7b-chat,1.02,0.7377187550620283,0.4246676977198055,14.728605282752795,0.147286052827528,0.4063246630867048,0.06972639011473963,5.35657546337158,5.426301853486319,0.04625547346404442,0.7043723959353786,1.0,1
4
- internlm/internlm2_5-7b-chat,1.04,0.7371160490183523,0.4173352728374962,13.846403511622256,0.1384640351162226,0.3988121301027288,0.06884377758164166,5.315092674315975,5.383936451897617,0.04501878242643857,0.704667851183933,1.0,1
5
- internlm/internlm2_5-7b-chat,1.06,0.7338597697698218,0.3997609847704189,12.213374588416173,0.1221337458841617,0.3841365748920261,0.05825242718446602,5.275375110326567,5.333627537511033,0.043830827367611756,0.7023888258277158,1.0,1
6
- internlm/internlm2_5-7b-chat,1.08,0.7318234702626478,0.3881614120395272,11.369735763522288,0.1136973576352228,0.372963223209074,0.06707855251544571,5.283318623124448,5.350397175639894,0.04300663332269164,0.7010173875930041,1.0,1
7
- internlm/internlm2_5-7b-chat,1.10,0.7288648442604431,0.3784182249483568,10.377989030628608,0.103779890306286,0.3618424457502351,0.05207413945278023,5.288614298323036,5.340688437775817,0.042176064682512025,0.6987634348896543,1.0,1
8
- microsoft/Phi-3.5-mini-instruct,1.00,0.710605339281136,0.3788926591792472,9.70032874202361,0.097003287420236,0.3556134739443916,5.390997352162401,12.997352162400706,18.388349514563107,0.13770903562694164,0.6191875166952294,1.0,4
9
- microsoft/Phi-3.5-mini-instruct,1.02,0.7150978385770836,0.3741049510326346,9.910633597905436,0.0991063359790543,0.3453160556383774,3.586054721977052,7.001765225066196,10.587819947043249,0.08180522500528503,0.6589279165887452,1.0,2
10
- microsoft/Phi-3.5-mini-instruct,1.04,0.7074641684778791,0.3538698731015666,9.19721270538052,0.0919721270538052,0.3225824135517728,0.05119152691968226,0.05560458958517211,0.10679611650485436,0.000859149229250836,0.7068566122109297,1.0,0
11
- microsoft/Phi-3.5-mini-instruct,1.06,0.6962301708225224,0.3252854575717334,6.967166383106307,0.069671663831063,0.2948764736589108,0.0353045013239188,0.06796116504854369,0.10326566637246248,0.0007865281839265906,0.6956827814674672,1.0,0
12
- microsoft/Phi-3.5-mini-instruct,1.08,0.6823413657174107,0.301599095293242,5.452744292893752,0.0545274429289375,0.2726387617958179,0.07678729037952339,0.04766107678729038,0.12444836716681378,0.0009016671249608319,0.68172639822959,1.0,0
13
- microsoft/Phi-3.5-mini-instruct,1.10,0.6717851540206916,0.2885734336603344,4.751039447225815,0.0475103944722581,0.2604284999048123,0.08031774051191527,0.02383053839364519,0.10414827890556046,0.0007188284314919954,0.6713024292710504,1.0,0
14
- shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.00,0.739080294072365,0.4490104515425626,6.7013404492782405,0.0670134044927823,0.4196181637680596,0.36716681376875554,139.80935569285083,140.1765225066196,0.48362195756964893,0.45567717525911156,0.999117387466902,15
15
- shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.02,0.743018615750854,0.4514907128972251,8.545954556237808,0.085459545562378,0.4214940415288087,1.0035304501323918,67.00353045013239,68.00706090026479,0.2929644725635723,0.5543280318044165,1.0,6
16
- shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.04,0.7432195577780335,0.4517500968367987,10.080425294411064,0.1008042529441106,0.4200973007348334,0.01059135039717564,35.19770520741395,35.208296557811124,0.17564306911947306,0.6234997933386273,1.0,6
17
- shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.06,0.7430821573139815,0.4484154407825542,10.37470506193322,0.1037470506193321,0.4160289393328045,1.8005295675198587,26.880847308031775,28.68137687555163,0.1522966823356282,0.6381095208514015,1.0,3
18
- shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.08,0.7435937259684909,0.4407733547418294,10.930453247368872,0.1093045324736887,0.4113063412348818,0.09267431597528684,12.007943512797882,12.100617828773169,0.06721477842655646,0.6952559422553878,1.0,3
19
- shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.10,0.7427059700687901,0.4358940590119784,11.381344076286156,0.1138134407628615,0.4062980635945339,0.03971756398940865,0.6681376875551632,0.707855251544572,0.003961824217515018,0.7397693206556093,1.0,1
20
- shenzhi-wang/Llama3.1-8B-Chinese-Chat,1.00,0.3888604919913587,0.2055875758168277,0.2434587181959752,0.0024345871819597,0.1844552188025856,638.2797881729921,3889.9232127096207,4528.203000882612,0.9210262088655917,0.15480936210106447,0.9240953221535746,570
21
- shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat,1.00,0.7222260562908512,0.4039898602650971,13.461179673541356,0.1346117967354136,0.3819960428004565,0.05736981465136805,5.87378640776699,5.931156222418358,0.05150372482295595,0.6859703892398439,1.0,1
22
- shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat,1.02,0.723643534970515,0.4051102919608809,13.18537912294539,0.1318537912294539,0.3824621732976229,0.06266548984995587,5.840247131509267,5.902912621359223,0.05148734372113075,0.6873279697459551,1.0,1
23
- shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat,1.04,0.7238812581796301,0.4039456988919502,13.314773371306682,0.1331477337130668,0.3813737464821349,0.05736981465136805,5.845542806707855,5.902912621359223,0.05127418810757766,0.6877003345402561,1.0,1
24
- shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat,1.06,0.7252625281686607,0.4012797167602334,13.19924345265053,0.1319924345265053,0.3798291332004637,0.06266548984995587,5.847308031774051,5.909973521624007,0.05081388730791121,0.6893297921407147,1.0,1
25
- shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat,1.08,0.7261167238322592,0.3987395126194482,12.656486100206328,0.1265648610020633,0.376975448872996,0.05648720211827008,5.820829655781112,5.877316857899382,0.05012721880128273,0.6906157284372703,1.0,1
26
- shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat,1.10,0.7264630642225547,0.3964859769229444,12.284961706379857,0.1228496170637985,0.3744555065346823,0.04942630185348632,0.09267431597528684,0.14210061782877317,0.001266948385624464,0.7255432558140477,1.0,0
 
 
1
  model,rpp,comet,meteor,spbleu,bleu_1,rouge_l,ews_score,repetition_score,total_repetitions,rr,rap,translation_completeness,num_max_output_tokens
2
+ internlm/internlm2_5-7b-chat,1.00,0.7357995069773978,0.4297612514398102,15.060226683930628,0.1506022668393063,0.4097577795330234,0.04942630185348632,9.235657546337158,9.285083848190645,0.07525035765379114,0.581878095297299,1.0,2
3
+ internlm/internlm2_5-7b-chat,1.02,0.7377187550620283,0.4246676977198055,14.728605282752795,0.147286052827528,0.4063246630867048,0.06972639011473963,5.35657546337158,5.426301853486319,0.04625547346404442,0.6400103546749837,1.0,1
4
+ internlm/internlm2_5-7b-chat,1.04,0.7371160490183523,0.4173352728374962,13.846403511622256,0.1384640351162226,0.3988121301027288,0.06884377758164166,5.315092674315975,5.383936451897617,0.04501878242643857,0.6419783129560218,1.0,1
5
+ internlm/internlm2_5-7b-chat,1.06,0.7338597697698218,0.3997609847704189,12.213374588416173,0.1221337458841617,0.3841365748920261,0.05825242718446602,5.275375110326567,5.333627537511033,0.043830827367611756,0.6415304775228277,1.0,1
6
+ internlm/internlm2_5-7b-chat,1.08,0.7318234702626478,0.3881614120395272,11.369735763522288,0.1136973576352228,0.372963223209074,0.06707855251544571,5.283318623124448,5.350397175639894,0.04300663332269164,0.6414061446416202,1.0,1
7
+ internlm/internlm2_5-7b-chat,1.10,0.7288648442604431,0.3784182249483568,10.377989030628608,0.103779890306286,0.3618424457502351,0.05207413945278023,5.288614298323036,5.340688437775817,0.042176064682512025,0.6404777687452995,1.0,1
8
+ microsoft/Phi-3.5-mini-instruct,1.00,0.710605339281136,0.3788926591792472,9.70032874202361,0.097003287420236,0.3556134739443916,5.390997352162401,12.997352162400706,18.388349514563107,0.13770903562694164,0.4556065638568846,1.0,4
9
+ microsoft/Phi-3.5-mini-instruct,1.02,0.7150978385770836,0.3741049510326346,9.910633597905436,0.0991063359790543,0.3453160556383774,3.586054721977052,7.001765225066196,10.587819947043249,0.08180522500528503,0.5535666483700645,1.0,2
10
+ microsoft/Phi-3.5-mini-instruct,1.04,0.7074641684778791,0.3538698731015666,9.19721270538052,0.0919721270538052,0.3225824135517728,0.05119152691968226,0.05560458958517211,0.10679611650485436,0.000859149229250836,0.7056422827612971,1.0,0
11
+ microsoft/Phi-3.5-mini-instruct,1.06,0.6962301708225224,0.3252854575717334,6.967166383106307,0.069671663831063,0.2948764736589108,0.0353045013239188,0.06796116504854369,0.10326566637246248,0.0007865281839265906,0.6945886486476809,1.0,0
12
+ microsoft/Phi-3.5-mini-instruct,1.08,0.6823413657174107,0.301599095293242,5.452744292893752,0.0545274429289375,0.2726387617958179,0.07678729037952339,0.04766107678729038,0.12444836716681378,0.0009016671249608319,0.6804972951227785,1.0,0
13
+ microsoft/Phi-3.5-mini-instruct,1.10,0.6717851540206916,0.2885734336603344,4.751039447225815,0.0475103944722581,0.2604284999048123,0.08031774051191527,0.02383053839364519,0.10414827890556046,0.0007188284314919954,0.6703375003284932,1.0,0
14
+ shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.00,0.739080294072365,0.4490104515425626,6.7013404492782405,0.0670134044927823,0.4196181637680596,0.36716681376875554,139.80935569285083,140.1765225066196,0.48362195756964893,0.10176417668651536,0.999117387466902,15
15
+ shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.02,0.743018615750854,0.4514907128972251,8.545954556237808,0.085459545562378,0.4214940415288087,1.0035304501323918,67.00353045013239,68.00706090026479,0.2929644725635723,0.2626173445806161,1.0,6
16
+ shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.04,0.7432195577780335,0.4517500968367987,10.080425294411064,0.1008042529441106,0.4200973007348334,0.01059135039717564,35.19770520741395,35.208296557811124,0.17564306911947306,0.4163542580947422,1.0,6
17
+ shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.06,0.7430821573139815,0.4484154407825542,10.37470506193322,0.1037470506193321,0.4160289393328045,1.8005295675198587,26.880847308031775,28.68137687555163,0.1522966823356282,0.45265620897504616,1.0,3
18
+ shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.08,0.7435937259684909,0.4407733547418294,10.930453247368872,0.1093045324736887,0.4113063412348818,0.09267431597528684,12.007943512797882,12.100617828773169,0.06721477842655646,0.6035047423944578,1.0,3
19
+ shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.10,0.7427059700687901,0.4358940590119784,11.381344076286156,0.1138134407628615,0.4062980635945339,0.03971756398940865,0.6681376875551632,0.707855251544572,0.003961824217515018,0.7339134850401315,1.0,1
20
+ shenzhi-wang/Llama3.1-8B-Chinese-Chat,1.00,0.3888604919913587,0.2055875758168277,0.2434587181959752,0.0024345871819597,0.1844552188025856,638.2797881729921,3889.9232127096207,4528.203000882612,0.9210262088655917,0.00019153263422509847,0.9240953221535746,570
21
+ shenzhi-wang/Llama3.1-8B-Chinese-Chat,1.02,0.401959364669889,0.2020993340187826,0.2473696547531083,0.002473696547531,0.1795542969510355,611.315975286849,3759.7599293909975,4371.075904677847,0.8883366762655638,0.0005596465146821489,0.912621359223301,562
22
+ shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat,1.00,0.7222260562908512,0.4039898602650971,13.461179673541356,0.1346117967354136,0.3819960428004565,0.05736981465136805,5.87378640776699,5.931156222418358,0.05150372482295595,0.6162827926700337,1.0,1
23
+ shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat,1.02,0.723643534970515,0.4051102919608809,13.18537912294539,0.1318537912294539,0.3824621732976229,0.06266548984995587,5.840247131509267,5.902912621359223,0.05148734372113075,0.617524335498735,1.0,1
24
+ shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat,1.04,0.7238812581796301,0.4039456988919502,13.314773371306682,0.1331477337130668,0.3813737464821349,0.05736981465136805,5.845542806707855,5.902912621359223,0.05127418810757766,0.6181437496179476,1.0,1
25
+ shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat,1.06,0.7252625281686607,0.4012797167602334,13.19924345265053,0.1319924345265053,0.3798291332004637,0.06266548984995587,5.847308031774051,5.909973521624007,0.05081388730791121,0.6202251404786316,1.0,1
26
+ shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat,1.08,0.7261167238322592,0.3987395126194482,12.656486100206328,0.1265648610020633,0.376975448872996,0.05648720211827008,5.820829655781112,5.877316857899382,0.05012721880128273,0.6223042523816,1.0,1
27
+ shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat,1.10,0.7264630642225547,0.3964859769229444,12.284961706379857,0.1228496170637985,0.3744555065346823,0.04942630185348632,0.09267431597528684,0.14210061782877317,0.001266948385624464,0.7237053873903428,1.0,0
results/mac-results_rpp_with_mnt_2048_metrics.csv CHANGED
@@ -1,31 +1,31 @@
1
  model,rpp,comet,meteor,spbleu,bleu_1,rouge_l,ews_score,repetition_score,total_repetitions,rr,rap,translation_completeness,num_max_output_tokens
2
- internlm/internlm2_5-7b-chat,1.00,0.739699612254078,0.4289996929258777,14.734881589173108,0.1473488158917311,0.4096466800937898,0.05383936451897617,12.606354810238305,12.660194174757281,0.10371655820679682,0.6668249823411998,1.0,2
3
- internlm/internlm2_5-7b-chat,1.02,0.740223803961056,0.4266246904302194,14.583816688798017,0.1458381668879802,0.4071727106228415,0.06266548984995587,9.849073256840247,9.911738746690203,0.0832234063051179,0.6811136450938015,1.0,1
4
- internlm/internlm2_5-7b-chat,1.04,0.7398856264610577,0.4154585167056314,13.534659133050225,0.1353465913305021,0.3968657713589718,0.07237422771403354,6.529567519858782,6.601941747572815,0.05613508442776736,0.6994963184593858,1.0,1
5
- internlm/internlm2_5-7b-chat,1.06,0.7379362287241489,0.4039588647855378,12.346740971499404,0.1234674097149939,0.3872447044295494,0.06796116504854369,6.533980582524272,6.601941747572815,0.05513987689359035,0.6983479910444814,0.999117387466902,1
6
- internlm/internlm2_5-7b-chat,1.08,0.7319988705684732,0.3873176839854818,11.075674965706344,0.1107567496570634,0.3724352909668609,0.05207413945278023,9.83495145631068,9.88702559576346,0.07906717392378437,0.676350746394496,0.999117387466902,1
7
- internlm/internlm2_5-7b-chat,1.10,0.7295350462119345,0.3769306874386757,10.305163787094209,0.1030516378709421,0.3634496155759507,0.07855251544571933,6.527802294792586,6.606354810238305,0.053004659594657756,0.6918732323764201,0.999117387466902,1
8
- microsoft/Phi-3.5-mini-instruct,1.00,0.7107840433177544,0.3796831545348129,8.71296896471494,0.0871296896471493,0.3589874395901284,10.670785525154457,17.93821712268314,28.6090026478376,0.20225504327262062,0.5806299320134263,1.0,6
9
- microsoft/Phi-3.5-mini-instruct,1.02,0.7164765837070485,0.3780585837553919,10.291240080163629,0.1029124008016362,0.3546952732427276,3.585172109443954,7.1403353927625774,10.725507502206531,0.08530053839296368,0.6578947912630083,1.0,2
10
- microsoft/Phi-3.5-mini-instruct,1.04,0.7111233387336411,0.3547161333845742,8.966881655527896,0.0896688165552789,0.3300979657678754,3.6125330979699912,0.07325684024713151,3.685789938217123,0.02973427131098516,0.6902898733301204,1.0,1
11
- microsoft/Phi-3.5-mini-instruct,1.06,0.7024363270136286,0.3298733737040869,7.076233088011138,0.0707623308801113,0.3019513312669543,0.04589585172109444,0.05207413945278023,0.09796999117387467,0.0007571675113745661,0.701904666351495,1.0,0
12
- microsoft/Phi-3.5-mini-instruct,1.08,0.6882111219210848,0.3054541022592767,5.105510599247868,0.0510551059924786,0.2736030007297014,3.3609885260370698,0.06443071491615181,3.4254192409532216,0.023581380370521147,0.6721720091371258,1.0,1
13
- microsoft/Phi-3.5-mini-instruct,1.10,0.6712992989638161,0.2903831801547132,4.091958857999118,0.0409195885799911,0.251653275009876,0.32215357458075905,0.06531332744924978,0.38746690203000883,0.0023407216247487324,0.6697298117609694,1.0,0
14
- shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.00,0.7501818982248062,0.4611110508507017,17.87914973742753,0.1787914973742752,0.4340662057009564,0.00706090026478376,0.1262135922330097,0.13327449249779347,0.0011265209898463904,0.7493372784005554,1.0,0
15
- shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.02,0.7485114382045625,0.4571517219079576,17.436884594979905,0.174368845949799,0.4311385932640979,0.00706090026478376,0.11562224183583407,0.12268314210061783,0.0010382199383043404,0.7477347219762754,1.0,0
16
- shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.04,0.7500591586357918,0.4560467960364254,17.440173470996626,0.1744017347099662,0.4302844557731285,0.00706090026478376,0.13062665489849956,0.13768755516328332,0.0011593944393659004,0.7491900481363686,1.0,0
17
- shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.06,0.748812871571673,0.4520416361219855,16.89523258317781,0.168952325831778,0.4260026774745837,0.00706090026478376,0.0997352162400706,0.10679611650485436,0.0008902491962006224,0.7481465381600518,1.0,0
18
- shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.08,0.7473851635144647,0.4442106511292453,16.16623784482793,0.1616623784482792,0.4195129470585874,0.01059135039717564,0.13062665489849956,0.1412180052956752,0.001176591707969938,0.7465063134536485,1.0,0
19
- shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.10,0.7465709781131172,0.4379837926138161,15.60172257624066,0.1560172257624066,0.4132562932940978,0.01059135039717564,0.07855251544571933,0.08914386584289496,0.000734476013176936,0.7460228409589962,1.0,0
20
- shenzhi-wang/Llama3.1-8B-Chinese-Chat,1.00,0.7426396049131678,0.433632501662176,15.209540658023398,0.1520954065802339,0.4089208235151474,0.00353045013239188,3.901147396293027,3.904677846425419,0.03237065275450547,0.7189848023792343,1.0,1
21
- shenzhi-wang/Llama3.1-8B-Chinese-Chat,1.02,0.7436477056353469,0.4329054166518245,15.19102241646024,0.1519102241646024,0.4068967964789407,0.0,3.8905560458958517,3.8905560458958517,0.03219656852361788,0.7200861374743239,1.0,1
22
- shenzhi-wang/Llama3.1-8B-Chinese-Chat,1.04,0.7440943776351209,0.4320478700956207,15.05135166158296,0.1505135166158296,0.4062008380201262,0.00353045013239188,0.1526919682259488,0.1562224183583407,0.001352332200022921,0.7430887949425674,1.0,0
23
- shenzhi-wang/Llama3.1-8B-Chinese-Chat,1.06,0.7426502735395928,0.4275429314912545,14.449130821290163,0.1444913082129016,0.4001409979222783,0.00706090026478376,0.13768755516328332,0.14474845542806708,0.0012399256044637321,0.7417300130954148,1.0,0
24
- shenzhi-wang/Llama3.1-8B-Chinese-Chat,1.08,0.7408098006080129,0.4206626658729054,13.933703757385222,0.1393370375738522,0.3964824268676203,0.00353045013239188,0.1297440423654016,0.13327449249779347,0.001134996993385448,0.7399694606935023,1.0,0
25
- shenzhi-wang/Llama3.1-8B-Chinese-Chat,1.10,0.7392685912871718,0.4111211240399151,13.303738403756984,0.1330373840375698,0.3870959581563503,0.00353045013239188,0.12180052956751986,0.12533097969991175,0.0010529672171262895,0.7384905753804022,1.0,0
26
- shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat,1.00,0.7240239171358935,0.4068335357738006,13.565136550617618,0.1356513655061761,0.3866395067055498,0.0529567519858782,0.1209179170344219,0.17387466902030008,0.001578993772192076,0.7228815899808891,1.0,0
27
- shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat,1.02,0.7263097057327799,0.4064914781094827,13.42987641622816,0.1342987641622816,0.3863697821025159,0.06001765225066196,6.236540158870256,6.296557811120918,0.0541899611084103,0.6879984291168549,1.0,1
28
- shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat,1.04,0.7276128307708258,0.4054859896994975,13.295092218891954,0.1329509221889195,0.3851203729935697,0.05207413945278023,0.1297440423654016,0.18181818181818182,0.0016533037985858635,0.7264108595993547,1.0,0
29
- shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat,1.06,0.7276865132383193,0.4014727027723293,13.10860799057166,0.1310860799057166,0.3804952786306688,0.05207413945278023,0.13415710503089143,0.18623124448367168,0.001691057431836761,0.7264569934393594,1.0,0
30
- shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat,1.08,0.726393195584298,0.3987018836449559,12.850537785783194,0.1285053778578319,0.3788945955746495,0.05648720211827008,0.15357458075904679,0.21006178287731686,0.0018871365478087807,0.7250236850699382,1.0,0
31
- shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat,1.10,0.7244012304511832,0.3932239948456176,12.361161644811926,0.1236116164481192,0.3733413807007665,0.05030891438658429,0.08561341571050309,0.13592233009708737,0.0012217374057913526,0.7235167427869905,1.0,0
 
1
  model,rpp,comet,meteor,spbleu,bleu_1,rouge_l,ews_score,repetition_score,total_repetitions,rr,rap,translation_completeness,num_max_output_tokens
2
+ internlm/internlm2_5-7b-chat,1.00,0.739699612254078,0.4289996929258777,14.734881589173108,0.1473488158917311,0.4096466800937898,0.05383936451897617,12.606354810238305,12.660194174757281,0.10371655820679682,0.5325881640465967,1.0,2
3
+ internlm/internlm2_5-7b-chat,1.02,0.740223803961056,0.4266246904302194,14.583816688798017,0.1458381668879802,0.4071727106228415,0.06266548984995587,9.849073256840247,9.911738746690203,0.0832234063051179,0.5703659582906754,1.0,1
4
+ internlm/internlm2_5-7b-chat,1.04,0.7398856264610577,0.4154585167056314,13.534659133050225,0.1353465913305021,0.3968657713589718,0.07237422771403354,6.529567519858782,6.601941747572815,0.05613508442776736,0.6221485884888651,1.0,1
5
+ internlm/internlm2_5-7b-chat,1.06,0.7379362287241489,0.4039588647855378,12.346740971499404,0.1234674097149939,0.3872447044295494,0.06796116504854369,6.533980582524272,6.601941747572815,0.05513987689359035,0.6224742543197805,0.999117387466902,1
6
+ internlm/internlm2_5-7b-chat,1.08,0.7319988705684732,0.3873176839854818,11.075674965706344,0.1107567496570634,0.3724352909668609,0.05207413945278023,9.83495145631068,9.88702559576346,0.07906717392378437,0.5717343310562308,0.999117387466902,1
7
+ internlm/internlm2_5-7b-chat,1.10,0.7295350462119345,0.3769306874386757,10.305163787094209,0.1030516378709421,0.3634496155759507,0.07855251544571933,6.527802294792586,6.606354810238305,0.053004659594657756,0.6195690090849183,0.999117387466902,1
8
+ microsoft/Phi-3.5-mini-instruct,1.00,0.7107840433177544,0.3796831545348129,8.71296896471494,0.0871296896471493,0.3589874395901284,10.670785525154457,17.93821712268314,28.6090026478376,0.20225504327262062,0.3608526271635592,1.0,6
9
+ microsoft/Phi-3.5-mini-instruct,1.02,0.7164765837070485,0.3780585837553919,10.291240080163629,0.1029124008016362,0.3546952732427276,3.585172109443954,7.1403353927625774,10.725507502206531,0.08530053839296368,0.5483240204881398,1.0,2
10
+ microsoft/Phi-3.5-mini-instruct,1.04,0.7111233387336411,0.3547161333845742,8.966881655527896,0.0896688165552789,0.3300979657678754,3.6125330979699912,0.07325684024713151,3.685789938217123,0.02973427131098516,0.6495566110355127,1.0,1
11
+ microsoft/Phi-3.5-mini-instruct,1.06,0.7024363270136286,0.3298733737040869,7.076233088011138,0.0707623308801113,0.3019513312669543,0.04589585172109444,0.05207413945278023,0.09796999117387467,0.0007571675113745661,0.7008419489376413,1.0,0
12
+ microsoft/Phi-3.5-mini-instruct,1.08,0.6882111219210848,0.3054541022592767,5.105510599247868,0.0510551059924786,0.2736030007297014,3.3609885260370698,0.06443071491615181,3.4254192409532216,0.023581380370521147,0.6406632969877,1.0,1
13
+ microsoft/Phi-3.5-mini-instruct,1.10,0.6712992989638161,0.2903831801547132,4.091958857999118,0.0409195885799911,0.251653275009876,0.32215357458075905,0.06531332744924978,0.38746690203000883,0.0023407216247487324,0.6665963500989894,1.0,0
14
+ shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.00,0.7501818982248062,0.4611110508507017,17.87914973742753,0.1787914973742752,0.4340662057009564,0.00706090026478376,0.1262135922330097,0.13327449249779347,0.0011265209898463904,0.7476494662426587,1.0,0
15
+ shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.02,0.7485114382045625,0.4571517219079576,17.436884594979905,0.174368845949799,0.4311385932640979,0.00706090026478376,0.11562224183583407,0.12268314210061783,0.0010382199383043404,0.7461824993322019,1.0,0
16
+ shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.04,0.7500591586357918,0.4560467960364254,17.440173470996626,0.1744017347099662,0.4302844557731285,0.00706090026478376,0.13062665489849956,0.13768755516328332,0.0011593944393659004,0.7474533388920676,1.0,0
17
+ shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.06,0.748812871571673,0.4520416361219855,16.89523258317781,0.168952325831778,0.4260026774745837,0.00706090026478376,0.0997352162400706,0.10679611650485436,0.0008902491962006224,0.7468147612728927,1.0,0
18
+ shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.08,0.7473851635144647,0.4442106511292453,16.16623784482793,0.1616623784482792,0.4195129470585874,0.01059135039717564,0.13062665489849956,0.1412180052956752,0.001176591707969938,0.7447501647073623,1.0,0
19
+ shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.10,0.7465709781131172,0.4379837926138161,15.60172257624066,0.1560172257624066,0.4132562932940978,0.01059135039717564,0.07855251544571933,0.08914386584289496,0.000734476013176936,0.7449271706150111,1.0,0
20
+ shenzhi-wang/Llama3.1-8B-Chinese-Chat,1.00,0.7426396049131678,0.433632501662176,15.209540658023398,0.1520954065802339,0.4089208235151474,0.00353045013239188,3.901147396293027,3.904677846425419,0.03237065275450547,0.6728297734832243,1.0,1
21
+ shenzhi-wang/Llama3.1-8B-Chinese-Chat,1.02,0.7436477056353469,0.4329054166518245,15.19102241646024,0.1519102241646024,0.4068967964789407,0.0,3.8905560458958517,3.8905560458958517,0.03219656852361788,0.6741068111074712,1.0,1
22
+ shenzhi-wang/Llama3.1-8B-Chinese-Chat,1.04,0.7440943776351209,0.4320478700956207,15.05135166158296,0.1505135166158296,0.4062008380201262,0.00353045013239188,0.1526919682259488,0.1562224183583407,0.001352332200022921,0.7410796698393737,1.0,0
23
+ shenzhi-wang/Llama3.1-8B-Chinese-Chat,1.06,0.7426502735395928,0.4275429314912545,14.449130821290163,0.1444913082129016,0.4001409979222783,0.00706090026478376,0.13768755516328332,0.14474845542806708,0.0012399256044637321,0.7398912041420567,1.0,0
24
+ shenzhi-wang/Llama3.1-8B-Chinese-Chat,1.08,0.7408098006080129,0.4206626658729054,13.933703757385222,0.1393370375738522,0.3964824268676203,0.00353045013239188,0.1297440423654016,0.13327449249779347,0.001134996993385448,0.7382902118097237,1.0,0
25
+ shenzhi-wang/Llama3.1-8B-Chinese-Chat,1.10,0.7392685912871718,0.4111211240399151,13.303738403756984,0.1330373840375698,0.3870959581563503,0.00353045013239188,0.12180052956751986,0.12533097969991175,0.0010529672171262895,0.7369357726201563,1.0,0
26
+ shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat,1.00,0.7240239171358935,0.4068335357738006,13.565136550617618,0.1356513655061761,0.3866395067055498,0.0529567519858782,0.1209179170344219,0.17387466902030008,0.001578993772192076,0.7205996419729696,1.0,0
27
+ shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat,1.02,0.7263097057327799,0.4064914781094827,13.42987641622816,0.1342987641622816,0.3863697821025159,0.06001765225066196,6.236540158870256,6.296557811120918,0.0541899611084103,0.6145165811709306,1.0,1
28
+ shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat,1.04,0.7276128307708258,0.4054859896994975,13.295092218891954,0.1329509221889195,0.3851203729935697,0.05207413945278023,0.1297440423654016,0.18181818181818182,0.0016533037985858635,0.7240098989116803,1.0,0
29
+ shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat,1.06,0.7276865132383193,0.4014727027723293,13.10860799057166,0.1310860799057166,0.3804952786306688,0.05207413945278023,0.13415710503089143,0.18623124448367168,0.001691057431836761,0.7240010735018495,1.0,0
30
+ shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat,1.08,0.726393195584298,0.3987018836449559,12.850537785783194,0.1285053778578319,0.3788945955746495,0.05648720211827008,0.15357458075904679,0.21006178287731686,0.0018871365478087807,0.7222885419382362,1.0,0
31
+ shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat,1.10,0.7244012304511832,0.3932239948456176,12.361161644811926,0.1236116164481192,0.3733413807007665,0.05030891438658429,0.08561341571050309,0.13592233009708737,0.0012217374057913526,0.721749388705754,1.0,0
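
Note on the rap column above (inferred from the numbers themselves, not from source code): the removed (-) rows are consistent with rap = comet * exp(-rr), while the added (+) v2e rows are consistent with rap = comet * (1 - rr)**3, i.e. the recalculated results appear to penalize the repetition rate (rr) more heavily. A minimal Python sketch to check that relationship against the new rows, assuming pandas/numpy are available and using a hypothetical path for the metrics CSV (the actual filename changed by this commit is not shown here):

import numpy as np
import pandas as pd

# Hypothetical path -- substitute the metrics CSV touched by this commit.
df = pd.read_csv("results/metrics.csv")

# Inferred relationships (assumptions based on the diff rows, not confirmed by source code):
rap_v2d = df["comet"] * np.exp(-df["rr"])        # matches the removed (-) rows' rap values
rap_v2e = df["comet"] * (1.0 - df["rr"]) ** 3    # matches the added (+) rows' rap values

# Compare against the stored rap column; small floating-point drift is expected.
print((df["rap"] - rap_v2e).abs().max())

For example, the internlm2_5-7b-chat row at rpp 1.00 gives 0.7397 * (1 - 0.1037)**3 ≈ 0.5326, matching its stored rap of 0.5325881640465967, whereas the removed shenzhi-wang/Llama3.1-8B-Chinese-Chat row at rpp 1.08 gives 0.7408 * exp(-0.001135) ≈ 0.739969, matching its stored rap of 0.7399694606935023.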