Petr Tsvetkov
committed on
Commit
•
aef1dbe
1
Parent(s):
9e1ff19
Visualizer bugs fixed; added normalized editdist
Browse files
- analysis_util.py +10 -10
- change_visualizer.py +2 -2
- generate_annotated_diffs.py +7 -2
- generation_steps/metrics_analysis.py +42 -4
analysis_util.py
CHANGED
@@ -55,21 +55,21 @@ def get_correlations_df(df, right_side):
|
|
55 |
|
56 |
|
57 |
def get_correlations_for_groups(df, right_side):
|
58 |
-
|
59 |
|
60 |
for e2s in (False, True):
|
61 |
for s2e in (False, True):
|
62 |
-
|
63 |
if e2s:
|
64 |
-
|
65 |
if s2e:
|
66 |
-
|
67 |
-
if
|
68 |
-
|
69 |
|
70 |
subdf = df[(df["end_to_start"] == e2s) & (df["start_to_end"] == s2e)]
|
71 |
-
|
72 |
-
|
73 |
|
74 |
-
|
75 |
-
return
|
|
|
55 |
|
56 |
|
57 |
def get_correlations_for_groups(df, right_side):
    """Build a side-by-side table of correlations for the whole frame and each flag group.

    The frame is partitioned by the boolean columns ``end_to_start`` and
    ``start_to_end``. For the full frame (key ``"all"``) and for each of the
    four flag combinations, ``get_correlations_df`` is called with the same
    ``right_side`` value. The per-group results are concatenated along
    ``axis=1``, keyed by group name.

    Group names: ``"golden"`` when both flags are False, otherwise the
    concatenation of ``"+e2s"`` and/or ``"+s2e"`` for the flags that are True.

    :param df: frame containing ``end_to_start`` and ``start_to_end`` columns
    :param right_side: forwarded unchanged to ``get_correlations_df``
    :return: the result of ``pd.concat`` over the per-group correlation objects
    """
    per_group = {"all": get_correlations_df(df, right_side=right_side)}

    for e2s in (False, True):
        for s2e in (False, True):
            # Assemble the label from the active flags; no flag set means "golden".
            tags = []
            if e2s:
                tags.append("+e2s")
            if s2e:
                tags.append("+s2e")
            label = "".join(tags) or "golden"

            mask = (df["end_to_start"] == e2s) & (df["start_to_end"] == s2e)
            per_group[label] = get_correlations_df(df[mask], right_side=right_side)

    return pd.concat(per_group, axis=1)
|
change_visualizer.py
CHANGED
@@ -110,8 +110,8 @@ if __name__ == '__main__':
|
|
110 |
gr.Markdown(f"### Reference-only correlations")
|
111 |
gr.Markdown(value=analysis_util.get_correlations_for_groups(df_synthetic, right_side="ind").to_markdown())
|
112 |
|
113 |
-
gr.Markdown(f"### Aggregated correlations")
|
114 |
-
gr.Markdown(value=analysis_util.get_correlations_for_groups(df_synthetic, right_side="aggr").to_markdown())
|
115 |
|
116 |
application.load(update_dataset_view_manual, inputs=slider_manual,
|
117 |
outputs=view_manual)
|
|
|
110 |
gr.Markdown(f"### Reference-only correlations")
|
111 |
gr.Markdown(value=analysis_util.get_correlations_for_groups(df_synthetic, right_side="ind").to_markdown())
|
112 |
|
113 |
+
# gr.Markdown(f"### Aggregated correlations")
|
114 |
+
# gr.Markdown(value=analysis_util.get_correlations_for_groups(df_synthetic, right_side="aggr").to_markdown())
|
115 |
|
116 |
application.load(update_dataset_view_manual, inputs=slider_manual,
|
117 |
outputs=view_manual)
|
generate_annotated_diffs.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
import diff_match_patch as dmp_module
|
|
|
2 |
|
3 |
from api_wrappers import hf_data_loader
|
4 |
|
@@ -26,14 +27,18 @@ def annotated_diff_for_row(row):
|
|
26 |
|
27 |
|
28 |
def manual_data_with_annotated_diffs():
|
|
|
|
|
29 |
df = hf_data_loader.load_raw_rewriting_as_pandas()
|
30 |
-
annotated = df.
|
31 |
df['annotated_diff'] = annotated
|
32 |
return df
|
33 |
|
34 |
|
35 |
def synthetic_data_with_annotated_diffs():
|
|
|
|
|
36 |
df = hf_data_loader.load_synthetic_as_pandas()
|
37 |
-
annotated = df.
|
38 |
df['annotated_diff'] = annotated
|
39 |
return df
|
|
|
1 |
import diff_match_patch as dmp_module
|
2 |
+
from tqdm import tqdm
|
3 |
|
4 |
from api_wrappers import hf_data_loader
|
5 |
|
|
|
27 |
|
28 |
|
29 |
def manual_data_with_annotated_diffs():
    """Load the raw rewriting dataset and attach an ``annotated_diff`` column.

    Each row is passed to ``annotated_diff_for_row`` through tqdm's
    ``progress_apply`` so that a progress bar is shown while diffs are built.

    :return: the loaded frame with the extra ``annotated_diff`` column
    """
    # Registers ``progress_apply`` on pandas objects.
    tqdm.pandas()

    frame = hf_data_loader.load_raw_rewriting_as_pandas()
    frame['annotated_diff'] = frame.progress_apply(annotated_diff_for_row, axis=1)
    return frame
|
36 |
|
37 |
|
38 |
def synthetic_data_with_annotated_diffs():
    """Load the synthetic dataset and attach an ``annotated_diff`` column.

    Each row is passed to ``annotated_diff_for_row`` through tqdm's
    ``progress_apply`` so that a progress bar is shown while diffs are built.

    :return: the loaded frame with the extra ``annotated_diff`` column
    """
    # Registers ``progress_apply`` on pandas objects.
    tqdm.pandas()

    frame = hf_data_loader.load_synthetic_as_pandas()
    frame['annotated_diff'] = frame.progress_apply(annotated_diff_for_row, axis=1)
    return frame
|
generation_steps/metrics_analysis.py
CHANGED
@@ -15,6 +15,8 @@ BLEU = evaluate.load('bleu', cache_dir=config.CACHE_DIR)
|
|
15 |
|
16 |
|
17 |
def bleu_fn(pred, ref, **kwargs):
|
|
|
|
|
18 |
return BLEU.compute(predictions=[pred], references=[ref])["bleu"]
|
19 |
|
20 |
|
@@ -22,6 +24,8 @@ METEOR = evaluate.load('meteor', cache_dir=config.CACHE_DIR)
|
|
22 |
|
23 |
|
24 |
def meteor_fn(pred, ref, **kwargs):
|
|
|
|
|
25 |
return METEOR.compute(predictions=[pred], references=[ref])["meteor"]
|
26 |
|
27 |
|
@@ -29,14 +33,20 @@ ROUGE = evaluate.load('rouge', cache_dir=config.CACHE_DIR)
|
|
29 |
|
30 |
|
31 |
def rouge1_fn(pred, ref, **kwargs):
|
|
|
|
|
32 |
return ROUGE.compute(predictions=[pred], references=[ref])["rouge1"]
|
33 |
|
34 |
|
35 |
def rouge2_fn(pred, ref, **kwargs):
|
|
|
|
|
36 |
return ROUGE.compute(predictions=[pred], references=[ref])["rouge2"]
|
37 |
|
38 |
|
39 |
def rougeL_fn(pred, ref, **kwargs):
|
|
|
|
|
40 |
return ROUGE.compute(predictions=[pred], references=[ref])["rougeL"]
|
41 |
|
42 |
|
@@ -44,6 +54,10 @@ BERTSCORE = evaluate.load('bertscore', cache_dir=config.CACHE_DIR)
|
|
44 |
|
45 |
|
46 |
def bertscore_fn(pred, ref, **kwargs):
|
|
|
|
|
|
|
|
|
47 |
return BERTSCORE.compute(predictions=[pred], references=[ref], model_type="distilbert-base-uncased")["f1"][0]
|
48 |
|
49 |
|
@@ -51,6 +65,8 @@ CHRF = evaluate.load("chrf")
|
|
51 |
|
52 |
|
53 |
def chrf_fn(pred, ref, **kwargs):
|
|
|
|
|
54 |
return CHRF.compute(predictions=[pred], references=[[ref]])["score"]
|
55 |
|
56 |
|
@@ -58,26 +74,46 @@ TER = evaluate.load("ter")
|
|
58 |
|
59 |
|
60 |
def ter_fn(pred, ref, **kwargs):
|
|
|
|
|
61 |
return TER.compute(predictions=[pred], references=[[ref]])["score"]
|
62 |
|
63 |
|
64 |
def edit_distance_fn(pred, ref, **kwargs):
|
|
|
|
|
|
|
65 |
return Levenshtein.distance(pred, ref)
|
66 |
|
|
|
|
|
|
|
|
|
|
|
|
|
67 |
|
68 |
def edit_time_fn(pred, ref, **kwargs):
|
69 |
return kwargs["edittime"]
|
70 |
|
71 |
|
72 |
def gptscore_ref_1_fn(pred, ref, **kwargs):
|
|
|
|
|
|
|
73 |
return gpt_eval.compute_ref(prediction=pred, reference=ref, n_requests=1)
|
74 |
|
75 |
|
76 |
def gptscore_ref_3_fn(pred, ref, **kwargs):
|
|
|
|
|
|
|
77 |
return gpt_eval.compute_ref(prediction=pred, reference=ref, n_requests=3)
|
78 |
|
79 |
|
80 |
def gptscore_ref_5_fn(pred, ref, **kwargs):
|
|
|
|
|
|
|
81 |
return gpt_eval.compute_ref(prediction=pred, reference=ref, n_requests=5)
|
82 |
|
83 |
|
@@ -94,13 +130,14 @@ def gptscore_noref_5_fn(pred, ref, **kwargs):
|
|
94 |
|
95 |
|
96 |
IND_METRICS = {
|
|
|
|
|
97 |
"gptscore-ref-1-req": gptscore_ref_1_fn,
|
98 |
# "gptscore-ref-3-req": gptscore_ref_3_fn,
|
99 |
# "gptscore-ref-5-req": gptscore_ref_5_fn,
|
100 |
"gptscore-noref-1-req": gptscore_noref_1_fn,
|
101 |
# "gptscore-noref-3-req": gptscore_noref_3_fn,
|
102 |
# "gptscore-noref-5-req": gptscore_noref_5_fn,
|
103 |
-
"editdist": edit_distance_fn,
|
104 |
"bleu": bleu_fn,
|
105 |
"meteor": meteor_fn,
|
106 |
"rouge1": rouge1_fn,
|
@@ -112,11 +149,12 @@ IND_METRICS = {
|
|
112 |
}
|
113 |
|
114 |
AGGR_METRICS = IND_METRICS.copy()
|
115 |
-
del AGGR_METRICS["gptscore-ref-1-req"]
|
116 |
-
del AGGR_METRICS["gptscore-noref-1-req"]
|
117 |
|
118 |
REL_METRICS = {
|
119 |
"editdist": edit_distance_fn,
|
|
|
120 |
"edittime": edit_time_fn,
|
121 |
}
|
122 |
|
@@ -183,7 +221,7 @@ def compute_metrics(df):
|
|
183 |
df[f"rel_{rel_metric}_aggr_{aggr_metric}_pearson"] = (
|
184 |
df[f"{rel_metric}_related"].corr(df[f"{aggr_metric}_aggr"], method="pearson"))
|
185 |
|
186 |
-
df[f"rel_{rel_metric}
|
187 |
df[f"{rel_metric}_related"].corr(df[f"{aggr_metric}_aggr"], method="spearman"))
|
188 |
|
189 |
return df
|
|
|
15 |
|
16 |
|
17 |
def bleu_fn(pred, ref, **kwargs):
    """Return the ``"bleu"`` value computed by the module-level ``BLEU`` metric.

    If a ``refs`` keyword is supplied, ``pred`` is repeated once per entry of
    ``refs`` and the two lists are scored in a single ``BLEU.compute`` call;
    otherwise ``pred`` is scored against the single ``ref``.
    """
    if "refs" in kwargs:
        references = kwargs["refs"]
        predictions = [pred] * len(references)
    else:
        references = [ref]
        predictions = [pred]
    return BLEU.compute(predictions=predictions, references=references)["bleu"]
|
21 |
|
22 |
|
|
|
24 |
|
25 |
|
26 |
def meteor_fn(pred, ref, **kwargs):
    """Return the ``"meteor"`` value computed by the module-level ``METEOR`` metric.

    If a ``refs`` keyword is supplied, ``pred`` is repeated once per entry of
    ``refs`` and the two lists are scored in a single ``METEOR.compute`` call;
    otherwise ``pred`` is scored against the single ``ref``.
    """
    if "refs" in kwargs:
        references = kwargs["refs"]
        predictions = [pred] * len(references)
    else:
        references = [ref]
        predictions = [pred]
    return METEOR.compute(predictions=predictions, references=references)["meteor"]
|
30 |
|
31 |
|
|
|
33 |
|
34 |
|
35 |
def rouge1_fn(pred, ref, **kwargs):
    """Return the ``"rouge1"`` value computed by the module-level ``ROUGE`` metric.

    If a ``refs`` keyword is supplied, ``pred`` is repeated once per entry of
    ``refs`` and the two lists are scored in a single ``ROUGE.compute`` call;
    otherwise ``pred`` is scored against the single ``ref``.
    """
    if "refs" in kwargs:
        references = kwargs["refs"]
        predictions = [pred] * len(references)
    else:
        references = [ref]
        predictions = [pred]
    return ROUGE.compute(predictions=predictions, references=references)["rouge1"]
|
39 |
|
40 |
|
41 |
def rouge2_fn(pred, ref, **kwargs):
    """Return the ``"rouge2"`` value computed by the module-level ``ROUGE`` metric.

    If a ``refs`` keyword is supplied, ``pred`` is repeated once per entry of
    ``refs`` and the two lists are scored in a single ``ROUGE.compute`` call;
    otherwise ``pred`` is scored against the single ``ref``.
    """
    if "refs" in kwargs:
        references = kwargs["refs"]
        predictions = [pred] * len(references)
    else:
        references = [ref]
        predictions = [pred]
    return ROUGE.compute(predictions=predictions, references=references)["rouge2"]
|
45 |
|
46 |
|
47 |
def rougeL_fn(pred, ref, **kwargs):
    """Return the ``"rougeL"`` value computed by the module-level ``ROUGE`` metric.

    If a ``refs`` keyword is supplied, ``pred`` is repeated once per entry of
    ``refs`` and the two lists are scored in a single ``ROUGE.compute`` call;
    otherwise ``pred`` is scored against the single ``ref``.
    """
    if "refs" in kwargs:
        references = kwargs["refs"]
        predictions = [pred] * len(references)
    else:
        references = [ref]
        predictions = [pred]
    return ROUGE.compute(predictions=predictions, references=references)["rougeL"]
|
51 |
|
52 |
|
|
|
54 |
|
55 |
|
56 |
def bertscore_fn(pred, ref, **kwargs):
    """Return the first ``"f1"`` value from the module-level ``BERTSCORE`` metric.

    ``pred`` is always the only prediction. Its reference entry is the value
    of the ``refs`` keyword when one is supplied, and ``ref`` otherwise. The
    ``distilbert-base-uncased`` model type is used in both cases.
    """
    reference_entry = kwargs["refs"] if "refs" in kwargs else ref
    result = BERTSCORE.compute(
        predictions=[pred],
        references=[reference_entry],
        model_type="distilbert-base-uncased",
    )
    return result["f1"][0]
|
62 |
|
63 |
|
|
|
65 |
|
66 |
|
67 |
def chrf_fn(pred, ref, **kwargs):
    """Return the ``"score"`` value computed by the module-level ``CHRF`` metric.

    ``pred`` is the only prediction. Its reference list is the ``refs``
    keyword when one is supplied, and ``[ref]`` otherwise.
    """
    candidate_refs = kwargs["refs"] if "refs" in kwargs else [ref]
    outcome = CHRF.compute(predictions=[pred], references=[candidate_refs])
    return outcome["score"]
|
71 |
|
72 |
|
|
|
74 |
|
75 |
|
76 |
def ter_fn(pred, ref, **kwargs):
    """Return the ``"score"`` value computed by the module-level ``TER`` metric.

    ``pred`` is the only prediction. Its reference list is the ``refs``
    keyword when one is supplied, and ``[ref]`` otherwise.
    """
    candidate_refs = kwargs["refs"] if "refs" in kwargs else [ref]
    outcome = TER.compute(predictions=[pred], references=[candidate_refs])
    return outcome["score"]
|
80 |
|
81 |
|
82 |
def edit_distance_fn(pred, ref, **kwargs):
    """Return the Levenshtein distance between ``pred`` and its reference(s).

    Without a ``refs`` keyword the distance to ``ref`` is returned directly.
    With ``refs``, the distance to every entry is computed and their
    arithmetic mean is returned; an empty ``refs`` raises ``ZeroDivisionError``.
    """
    if "refs" not in kwargs:
        return Levenshtein.distance(pred, ref)

    distances = [Levenshtein.distance(pred, candidate) for candidate in kwargs["refs"]]
    return sum(distances) / len(distances)
|
87 |
|
88 |
+
def edit_distance_norm_fn(pred, ref, **kwargs):
    """Return the Levenshtein distance to the reference(s), normalised by ``len(pred)``.

    Without a ``refs`` keyword the normalised distance to ``ref`` is returned.
    With ``refs``, the normalised distance to every entry is computed and
    their arithmetic mean is returned; an empty ``refs`` raises
    ``ZeroDivisionError``.

    An empty ``pred`` is normalised by 1 instead of 0, so the result equals
    the raw distance rather than raising ``ZeroDivisionError``. Results for
    non-empty predictions are unchanged.
    """
    # Guard the denominator: len("") == 0 would otherwise crash the whole metrics run.
    denominator = max(len(pred), 1)

    if "refs" not in kwargs:
        return Levenshtein.distance(pred, ref) / denominator

    normalised = [
        Levenshtein.distance(pred, candidate) / denominator
        for candidate in kwargs["refs"]
    ]
    return sum(normalised) / len(normalised)
|
93 |
+
|
94 |
|
95 |
def edit_time_fn(pred, ref, **kwargs):
    """Return the value passed as the ``edittime`` keyword argument.

    ``pred`` and ``ref`` are accepted for signature compatibility with the
    other metric functions but are not used. Raises ``KeyError`` when no
    ``edittime`` keyword is supplied.
    """
    edit_time = kwargs["edittime"]
    return edit_time
|
97 |
|
98 |
|
99 |
def gptscore_ref_1_fn(pred, ref, **kwargs):
    """Return ``gpt_eval.compute_ref`` for ``pred`` using ``n_requests=1``.

    Without a ``refs`` keyword the score against ``ref`` is returned directly.
    With ``refs``, one score per entry is computed and their arithmetic mean
    is returned; an empty ``refs`` raises ``ZeroDivisionError``.
    """
    if "refs" not in kwargs:
        return gpt_eval.compute_ref(prediction=pred, reference=ref, n_requests=1)

    per_reference = [
        gpt_eval.compute_ref(prediction=pred, reference=candidate, n_requests=1)
        for candidate in kwargs["refs"]
    ]
    return sum(per_reference) / len(per_reference)
|
104 |
|
105 |
|
106 |
def gptscore_ref_3_fn(pred, ref, **kwargs):
    """Return ``gpt_eval.compute_ref`` for ``pred`` using ``n_requests=3``.

    Without a ``refs`` keyword the score against ``ref`` is returned directly.
    With ``refs``, one score per entry is computed and their arithmetic mean
    is returned; an empty ``refs`` raises ``ZeroDivisionError``.
    """
    if "refs" not in kwargs:
        return gpt_eval.compute_ref(prediction=pred, reference=ref, n_requests=3)

    per_reference = [
        gpt_eval.compute_ref(prediction=pred, reference=candidate, n_requests=3)
        for candidate in kwargs["refs"]
    ]
    return sum(per_reference) / len(per_reference)
|
111 |
|
112 |
|
113 |
def gptscore_ref_5_fn(pred, ref, **kwargs):
    """Return ``gpt_eval.compute_ref`` for ``pred`` using ``n_requests=5``.

    Without a ``refs`` keyword the score against ``ref`` is returned directly.
    With ``refs``, one score per entry is computed and their arithmetic mean
    is returned; an empty ``refs`` raises ``ZeroDivisionError``.
    """
    if "refs" not in kwargs:
        return gpt_eval.compute_ref(prediction=pred, reference=ref, n_requests=5)

    per_reference = [
        gpt_eval.compute_ref(prediction=pred, reference=candidate, n_requests=5)
        for candidate in kwargs["refs"]
    ]
    return sum(per_reference) / len(per_reference)
|
118 |
|
119 |
|
|
|
130 |
|
131 |
|
132 |
IND_METRICS = {
|
133 |
+
"editdist": edit_distance_fn,
|
134 |
+
"editdist-norm": edit_distance_norm_fn,
|
135 |
"gptscore-ref-1-req": gptscore_ref_1_fn,
|
136 |
# "gptscore-ref-3-req": gptscore_ref_3_fn,
|
137 |
# "gptscore-ref-5-req": gptscore_ref_5_fn,
|
138 |
"gptscore-noref-1-req": gptscore_noref_1_fn,
|
139 |
# "gptscore-noref-3-req": gptscore_noref_3_fn,
|
140 |
# "gptscore-noref-5-req": gptscore_noref_5_fn,
|
|
|
141 |
"bleu": bleu_fn,
|
142 |
"meteor": meteor_fn,
|
143 |
"rouge1": rouge1_fn,
|
|
|
149 |
}
|
150 |
|
151 |
AGGR_METRICS = IND_METRICS.copy()
|
152 |
+
# del AGGR_METRICS["gptscore-ref-1-req"]
|
153 |
+
# del AGGR_METRICS["gptscore-noref-1-req"]
|
154 |
|
155 |
REL_METRICS = {
|
156 |
"editdist": edit_distance_fn,
|
157 |
+
"editdist-norm": edit_distance_norm_fn,
|
158 |
"edittime": edit_time_fn,
|
159 |
}
|
160 |
|
|
|
221 |
df[f"rel_{rel_metric}_aggr_{aggr_metric}_pearson"] = (
|
222 |
df[f"{rel_metric}_related"].corr(df[f"{aggr_metric}_aggr"], method="pearson"))
|
223 |
|
224 |
+
df[f"rel_{rel_metric}_aggr_{aggr_metric}_spearman"] = (
|
225 |
df[f"{rel_metric}_related"].corr(df[f"{aggr_metric}_aggr"], method="spearman"))
|
226 |
|
227 |
return df
|