Petr Tsvetkov committed
Commit 073db2c
1 parent: f5faae7

Add noref gpt-eval to the pipeline

custom_metrics/gpt_eval.py CHANGED
@@ -20,6 +20,24 @@ lowest quality and 10 is the highest quality. Do not include any other text or e
 """
 
 
+def build_prompt_noref(prediction, diff):
+    return f"""Evaluate the following commit message based on clarity, specificity, context, and conciseness without
+providing any additional feedback or commentary:
+
+START OF THE COMMIT MESSAGE YOU HAVE TO EVALUATE
+{prediction}
+END OF THE COMMIT MESSAGE YOU HAVE TO EVALUATE
+
+These are the code changes included in the commit:
+START OF THE CODE CHANGES
+{diff}
+END OF THE CODE CHANGES
+
+YOUR TASK: Provide a single number as a response, representing the rating on a scale from 1 to 10, where 1 is the
+lowest quality and 10 is the highest quality. Do not include any other text or explanation in your response.
+"""
+
+
 N_RETRIES = 3
 
 
@@ -51,3 +69,13 @@ def compute_ref(prediction, reference, n_requests):
     ]
 
     return sum(results) / len(results)
+
+
+def compute_noref(prediction, diff, n_requests):
+    prompt = build_prompt_noref(prediction, diff)
+    results = [
+        get_number_for_prompt(prompt)
+        for _ in range(n_requests)
+    ]
+
+    return sum(results) / len(results)
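
The new compute_noref mirrors the existing compute_ref: build the reference-free prompt from the prediction and the diff, query the model n_requests times through get_number_for_prompt, and average the returned scores. get_number_for_prompt itself is not part of this diff; the sketch below is only a guess at its shape, inferred from its name and the N_RETRIES constant (the OpenAI client usage and model name are assumptions, not the repo's actual code).

# Illustrative sketch only -- the real get_number_for_prompt lives elsewhere in
# gpt_eval.py and is not shown in this diff. Client setup and model name are assumed.
import re
from openai import OpenAI

N_RETRIES = 3
client = OpenAI()  # expects OPENAI_API_KEY in the environment

def get_number_for_prompt(prompt):
    for _ in range(N_RETRIES):
        response = client.chat.completions.create(
            model="gpt-4",  # hypothetical model choice
            messages=[{"role": "user", "content": prompt}],
        )
        text = response.choices[0].message.content.strip()
        match = re.search(r"\d+", text)  # the prompt asks for a single 1-10 number
        if match:
            return int(match.group())
    raise ValueError("Model did not return a numeric rating")

# With the functions added above, a reference-free score for one commit message is:
#   compute_noref(prediction=generated_message, diff=commit_diff, n_requests=3)
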
generation_steps/metrics_analysis.py CHANGED
@@ -80,10 +80,25 @@ def gptscore_ref_5_fn(pred, ref, **kwargs):
     return gpt_eval.compute_ref(prediction=pred, reference=ref, n_requests=5)
 
 
+def gptscore_noref_1_fn(pred, ref, **kwargs):
+    return gpt_eval.compute_noref(prediction=pred, diff=kwargs['diff'], n_requests=1)
+
+
+def gptscore_noref_3_fn(pred, ref, **kwargs):
+    return gpt_eval.compute_noref(prediction=pred, diff=kwargs['diff'], n_requests=3)
+
+
+def gptscore_noref_5_fn(pred, ref, **kwargs):
+    return gpt_eval.compute_noref(prediction=pred, diff=kwargs['diff'], n_requests=5)
+
+
 IND_METRICS = {
     "gptscore-ref-1-req": gptscore_ref_1_fn,
-    "gptscore-ref-3-req": gptscore_ref_3_fn,
+    # "gptscore-ref-3-req": gptscore_ref_3_fn,
     # "gptscore-ref-5-req": gptscore_ref_5_fn,
+    "gptscore-noref-1-req": gptscore_noref_1_fn,
+    # "gptscore-noref-3-req": gptscore_noref_3_fn,
+    # "gptscore-noref-5-req": gptscore_noref_5_fn,
     "editdist": edit_distance_fn,
     "bleu": bleu_fn,
     "meteor": meteor_fn,
@@ -111,7 +126,7 @@ def compute_metrics(df):
     tqdm.pandas()
 
     def apply_metric_fn_to_row(row, fn, col_pred, col_ref):
-        return fn(row[col_pred], row[col_ref], edittime=row['edit_time'])
+        return fn(row[col_pred], row[col_ref], edittime=row['edit_time'], diff=str(row['mods']))
 
     for metric in REL_METRICS:
        print(f"Computing {metric} for the related pairs")
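
Because every metric function receives the same keyword arguments, the reference-free GPT metrics can read diff from kwargs while reference-based metrics simply ignore it. A rough sketch of how an IND_METRICS entry ends up being applied per row follows; the "pred" and "ref" column names are hypothetical, only "mods" and "edit_time" appear in this diff, and the real compute_metrics loop is only partially visible here.

# Rough sketch, not the repo's actual compute_metrics loop. Assumes IND_METRICS
# from the file above; the "pred"/"ref" column names are made up for illustration.
import pandas as pd

df = pd.DataFrame({
    "pred": ["Fix typo in README"],
    "ref": ["Correct spelling mistake in README"],
    "mods": [[{"change_type": "MODIFY", "diff": "-teh\n+the"}]],
    "edit_time": [12.0],
})

def apply_metric_fn_to_row(row, fn, col_pred, col_ref):
    # All metrics get the same kwargs; each function picks out what it needs.
    return fn(row[col_pred], row[col_ref], edittime=row["edit_time"], diff=str(row["mods"]))

for name, fn in IND_METRICS.items():
    df[name] = df.apply(apply_metric_fn_to_row, axis=1, args=(fn, "pred", "ref"))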