Petr Tsvetkov committed
Commit 073db2c
1 parent: f5faae7

Add noref gpt-eval to the pipeline

custom_metrics/gpt_eval.py CHANGED
@@ -20,6 +20,24 @@ lowest quality and 10 is the highest quality. Do not include any other text or e
 """
 
 
+def build_prompt_noref(prediction, diff):
+    return f"""Evaluate the following commit message based on clarity, specificity, context, and conciseness without
+providing any additional feedback or commentary:
+
+START OF THE COMMIT MESSAGE YOU HAVE TO EVALUATE
+{prediction}
+END OF THE COMMIT MESSAGE YOU HAVE TO EVALUATE
+
+These are the code changes included in the commit:
+START OF THE CODE CHANGES
+{diff}
+END OF THE CODE CHANGES
+
+YOUR TASK: Provide a single number as a response, representing the rating on a scale from 1 to 10, where 1 is the
+lowest quality and 10 is the highest quality. Do not include any other text or explanation in your response.
+"""
+
+
 N_RETRIES = 3
 
 
@@ -51,3 +69,13 @@ def compute_ref(prediction, reference, n_requests):
     ]
 
     return sum(results) / len(results)
+
+
+def compute_noref(prediction, diff, n_requests):
+    prompt = build_prompt_noref(prediction, diff)
+    results = [
+        get_number_for_prompt(prompt)
+        for _ in range(n_requests)
+    ]
+
+    return sum(results) / len(results)
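
The new compute_noref mirrors the existing compute_ref: build the reference-free prompt from the prediction and the diff, query the model n_requests times through get_number_for_prompt, and average the returned scores. get_number_for_prompt itself is not part of this diff; the sketch below is only a guess at its shape, inferred from its name and the N_RETRIES constant (the OpenAI client usage and model name are assumptions, not the repo's actual code).

# Illustrative sketch only -- the real get_number_for_prompt lives elsewhere in
# gpt_eval.py and is not shown in this diff. Client setup and model name are assumed.
import re
from openai import OpenAI

N_RETRIES = 3
client = OpenAI()  # expects OPENAI_API_KEY in the environment

def get_number_for_prompt(prompt):
    for _ in range(N_RETRIES):
        response = client.chat.completions.create(
            model="gpt-4",  # hypothetical model choice
            messages=[{"role": "user", "content": prompt}],
        )
        text = response.choices[0].message.content.strip()
        match = re.search(r"\d+", text)  # the prompt asks for a single 1-10 number
        if match:
            return int(match.group())
    raise ValueError("Model did not return a numeric rating")

# With the functions added above, a reference-free score for one commit message is:
#   compute_noref(prediction=generated_message, diff=commit_diff, n_requests=3)
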
generation_steps/metrics_analysis.py CHANGED
@@ -80,10 +80,25 @@ def gptscore_ref_5_fn(pred, ref, **kwargs):
     return gpt_eval.compute_ref(prediction=pred, reference=ref, n_requests=5)
 
 
+def gptscore_noref_1_fn(pred, ref, **kwargs):
+    return gpt_eval.compute_noref(prediction=pred, diff=kwargs['diff'], n_requests=1)
+
+
+def gptscore_noref_3_fn(pred, ref, **kwargs):
+    return gpt_eval.compute_noref(prediction=pred, diff=kwargs['diff'], n_requests=3)
+
+
+def gptscore_noref_5_fn(pred, ref, **kwargs):
+    return gpt_eval.compute_noref(prediction=pred, diff=kwargs['diff'], n_requests=5)
+
+
 IND_METRICS = {
     "gptscore-ref-1-req": gptscore_ref_1_fn,
-    "gptscore-ref-3-req": gptscore_ref_3_fn,
+    # "gptscore-ref-3-req": gptscore_ref_3_fn,
     # "gptscore-ref-5-req": gptscore_ref_5_fn,
+    "gptscore-noref-1-req": gptscore_noref_1_fn,
+    # "gptscore-noref-3-req": gptscore_noref_3_fn,
+    # "gptscore-noref-5-req": gptscore_noref_5_fn,
     "editdist": edit_distance_fn,
     "bleu": bleu_fn,
     "meteor": meteor_fn,
@@ -111,7 +126,7 @@ def compute_metrics(df):
     tqdm.pandas()
 
     def apply_metric_fn_to_row(row, fn, col_pred, col_ref):
-        return fn(row[col_pred], row[col_ref], edittime=row['edit_time'])
+        return fn(row[col_pred], row[col_ref], edittime=row['edit_time'], diff=str(row['mods']))
 
     for metric in REL_METRICS:
        print(f"Computing {metric} for the related pairs")
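
Because every metric function receives the same keyword arguments, the reference-free GPT metrics can read diff from kwargs while reference-based metrics simply ignore it. A rough sketch of how an IND_METRICS entry ends up being applied per row follows; the "pred" and "ref" column names are hypothetical, only "mods" and "edit_time" appear in this diff, and the real compute_metrics loop is only partially visible here.

# Rough sketch, not the repo's actual compute_metrics loop. Assumes IND_METRICS
# from the file above; the "pred"/"ref" column names are made up for illustration.
import pandas as pd

df = pd.DataFrame({
    "pred": ["Fix typo in README"],
    "ref": ["Correct spelling mistake in README"],
    "mods": [[{"change_type": "MODIFY", "diff": "-teh\n+the"}]],
    "edit_time": [12.0],
})

def apply_metric_fn_to_row(row, fn, col_pred, col_ref):
    # All metrics get the same kwargs; each function picks out what it needs.
    return fn(row[col_pred], row[col_ref], edittime=row["edit_time"], diff=str(row["mods"]))

for name, fn in IND_METRICS.items():
    df[name] = df.apply(apply_metric_fn_to_row, axis=1, args=(fn, "pred", "ref"))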