saridormi committed
Commit cdf268e • 1 Parent(s): 1323fe2

Add metrics computation for CMG task

app.py CHANGED
@@ -1,6 +1,6 @@
 import os
 
-import gradio as gr
+import gradio as gr  # type: ignore[import]
 
 from src.content import (INTRODUCTION_TEXT, INTRODUCTION_TITLE,
                          LEADERBOARD_TEXT, LEADERBOARD_TITLE,
requirements.txt CHANGED
@@ -1 +1,5 @@
-huggingface_hub
+huggingface_hub
+jsonlines
+pandas
+tqdm
+evaluate
src/__init__.py ADDED
File without changes
src/evaluation/__init__.py ADDED
@@ -0,0 +1,3 @@
+from .metrics import METRICS
+
+__all__ = ["METRICS"]
src/evaluation/base_task_metrics.py ADDED
@@ -0,0 +1,17 @@
+from abc import ABC, abstractmethod
+from typing import Dict, List
+
+
+class BaseTaskMetrics(ABC):
+    def reset(self):
+        pass
+
+    @abstractmethod
+    def add_batch(
+        self, predictions: List[str], references: List[str], *args, **kwargs
+    ) -> None:
+        pass
+
+    @abstractmethod
+    def compute(self, *args, **kwargs) -> Dict[str, float]:
+        pass
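A concrete task metric is expected to subclass `BaseTaskMetrics` and implement `add_batch` and `compute`. A minimal sketch of such a subclass (the `DummyExactMatch` class and its behaviour are hypothetical, not part of this commit):

```python
from typing import Dict, List

from src.evaluation.base_task_metrics import BaseTaskMetrics


class DummyExactMatch(BaseTaskMetrics):
    """Hypothetical metric: percentage of predictions that match the reference exactly."""

    def __init__(self):
        self.correct = 0
        self.total = 0

    def reset(self):
        self.correct = 0
        self.total = 0

    def add_batch(self, predictions: List[str], references: List[str], *args, **kwargs) -> None:
        # Accumulate statistics over one batch of predictions.
        for pred, ref in zip(predictions, references):
            self.correct += int(pred.strip() == ref.strip())
            self.total += 1

    def compute(self, *args, **kwargs) -> Dict[str, float]:
        # Return the aggregated score on a 0-100 scale, like the other metrics in this commit.
        return {"exact_match": 100 * self.correct / max(self.total, 1)}
```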
src/evaluation/commit_message_generation/__init__.py ADDED
@@ -0,0 +1,3 @@
+from .cmg_metrics import CMGMetrics
+
+__all__ = ["CMGMetrics"]
src/evaluation/commit_message_generation/cmg_metrics.py ADDED
@@ -0,0 +1,53 @@
+from typing import Dict, List
+
+import evaluate  # type: ignore[import]
+
+from ..base_task_metrics import BaseTaskMetrics
+from .b_norm import BNorm
+
+
+class CMGMetrics(BaseTaskMetrics):
+    def __init__(self):
+        self.bnorm = BNorm()
+        self.bleu = evaluate.load("sacrebleu")
+        self.chrf = evaluate.load("chrf")
+        self.rouge = evaluate.load("rouge")
+        self.bertscore = evaluate.load("bertscore")
+        self.bertscore_normalized = evaluate.load("bertscore")
+
+    def reset(self):
+        self.bnorm.reset()
+
+    def add_batch(
+        self, predictions: List[str], references: List[str], *args, **kwargs
+    ) -> None:
+        self.bnorm.update(predictions=predictions, references=references)
+        self.bleu.add_batch(
+            predictions=predictions, references=[[ref] for ref in references]
+        )
+        self.chrf.add_batch(
+            predictions=predictions, references=[[ref] for ref in references]
+        )
+        self.rouge.add_batch(predictions=predictions, references=references)
+        self.bertscore.add_batch(predictions=predictions, references=references)
+        self.bertscore_normalized.add_batch(
+            predictions=predictions, references=references
+        )
+
+    def compute(self, *args, **kwargs) -> Dict[str, float]:
+        rouge = self.rouge.compute()
+        bertscore = self.bertscore.compute(lang="en")
+        bertscore_normalized = self.bertscore_normalized.compute(
+            lang="en", rescale_with_baseline=True
+        )
+        return {
+            "bnorm": self.bnorm.compute(),
+            "bleu": self.bleu.compute(tokenize="13a")["score"],
+            "chrf": self.chrf.compute()["score"],
+            "rouge1": rouge["rouge1"] * 100,
+            "rouge2": rouge["rouge2"] * 100,
+            "rougeL": rouge["rougeL"] * 100,
+            "bertscore": sum(bertscore["f1"]) / len(bertscore["f1"]),
+            "bertscore_normalized": sum(bertscore_normalized["f1"])
+            / len(bertscore_normalized["f1"]),
+        }
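Assuming `b_norm.BNorm` (not shown in this commit) exposes the `reset`/`update`/`compute` interface used above, `CMGMetrics` would be driven roughly like this. A sketch only: the example strings are invented, and the `evaluate` metrics download their resources from the Hub on first use:

```python
from src.evaluation.commit_message_generation import CMGMetrics

metrics = CMGMetrics()
metrics.add_batch(
    predictions=["Fix off-by-one error in pagination"],
    references=["Fix off-by-one bug in pagination logic"],
)
scores = metrics.compute()
# e.g. {'bnorm': ..., 'bleu': ..., 'chrf': ..., 'rouge1': ..., 'bertscore': ..., ...}
print(scores)
```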
src/evaluation/metrics.py ADDED
@@ -0,0 +1,13 @@
+from typing import Dict, Optional
+
+from .base_task_metrics import BaseTaskMetrics
+from .commit_message_generation import CMGMetrics
+
+METRICS: Dict[str, Optional[BaseTaskMetrics]] = {
+    "commit_message_generation": CMGMetrics(),
+    "bug_localization": None,
+    "module_to_text": None,
+    "library_usage": None,
+    "project_code_completion": None,
+    "bug_localization_build_logs": None,
+}
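This registry is what `SubmissionUploader` consults later in this commit: tasks whose entry is `None` are rejected before any metric computation. The lookup mirrors `_compute_metrics_for_predictions`:

```python
from src.evaluation import METRICS

task_id = "commit_message_generation"
metrics_module = METRICS[task_id]
# Tasks without a metrics module are not supported yet.
assert metrics_module is not None, f"Computing metrics for {task_id} is not supported."
metrics_module.reset()
```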
src/formatting.py ADDED
@@ -0,0 +1,12 @@
+def styled_error(error):
+    return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
+
+
+def styled_warning(warn):
+    return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
+
+
+def styled_message(message):
+    return (
+        f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
+    )
src/get_results_for_task.py CHANGED
@@ -1,6 +1,4 @@
-import pandas as pd
-
-RESULTS_DATASET = "JetBrains-Research/lca-results"
+import pandas as pd  # type: ignore[import]
 
 
 def get_results_for_task_stub(task: str) -> pd.DataFrame:
src/submission_uploader.py CHANGED
@@ -1,9 +1,16 @@
 import json
+import logging
 import os
-from typing import List, Optional
+from tempfile import TemporaryDirectory
+from typing import Dict, List, Optional
 
-from huggingface_hub import CommitOperationAdd, Discussion, HfApi, HfFileSystem
+import jsonlines
+from huggingface_hub import CommitOperationAdd  # type: ignore[import]
+from huggingface_hub import Discussion, HfApi, HfFileSystem
+from tqdm import tqdm
 
+from .evaluation import METRICS
+from .formatting import styled_error, styled_message, styled_warning
 from .tasks import TASKS_PRETTY_REVERSE
 
 
@@ -39,19 +46,30 @@ class SubmissionUploader:
                 and discussion.title == pr_title
             ):
                 return discussion
+        return None
 
-    def _upload_files(
+    def _get_metadata(
         self,
-        task_id: str,
-        model_folder: str,
         model_name_pretty: str,
         model_availability: str,
         urls: str,
         context_size: str,
         submitted_by: str,
-        filenames: Optional[List[str]],
+    ) -> Dict[str, str]:
+        return {
+            "model_name": model_name_pretty,
+            "model_availability": model_availability,
+            "urls": urls,
+            "context_size": context_size,
+            "submitted_by": submitted_by,
+        }
+
+    def _upload_predictions(
+        self,
+        task_id: str,
+        model_folder: str,
+        filenames: List[str],
     ) -> List[CommitOperationAdd]:
-        # add predictions files
         commit_operations = [
             CommitOperationAdd(
                 path_in_repo=f"{task_id}/predictions/{model_folder}/{os.path.basename(filename)}",
@@ -59,25 +77,114 @@
             )
             for filename in filenames
         ]
+        return commit_operations
 
-        # add metadata file
-        metadata_dict = {
-            "model_name": model_name_pretty,
-            "model_availability": model_availability,
-            "urls": urls,
-            "context_size": context_size,
-            "submitted_by": submitted_by,
+    def _compute_metrics_for_predictions(
+        self, task_id: str, filenames: Optional[List[str]], temp_directory: str
+    ) -> None:
+        metrics_module = METRICS[task_id]
+        assert (
+            metrics_module is not None
+        ), f"Computing metrics for {task_id} is not supported."
+        metrics_module.reset()
+        open(os.path.join(temp_directory, "metrics.jsonl"), "w").close()
+
+        # compute the metrics for each submitted file
+        for filename in filenames:
+            with jsonlines.open(filename, "r") as reader:
+                for example in tqdm(
+                    reader, desc=f"Computing metrics for {os.path.basename(filename)}"
+                ):
+                    metrics_module.add_batch(
+                        predictions=[example["prediction"]],
+                        references=[example["reference"]],
+                    )
+            computed_metrics = metrics_module.compute()
+            metrics_module.reset()
+            with jsonlines.open(
+                os.path.join(temp_directory, "metrics.jsonl"), "a"
+            ) as writer:
+                writer.write(computed_metrics)
+
+        # aggregate the metrics over submitted files
+        with jsonlines.open(
+            os.path.join(temp_directory, "metrics.jsonl"), "r"
+        ) as reader:
+            metrics_results = [line for line in reader]
+        final_metrics_results = {
+            key: sum(entry[key] for entry in metrics_results) / len(metrics_results)
+            for key in metrics_results[0]
         }
-        with open("metadata.json", "w") as f:
-            json.dump(metadata_dict, f)
-        commit_operations.append(
+        with open(os.path.join(temp_directory, "final_metrics.json"), "w") as f:
+            json.dump(final_metrics_results, f)
+
+    def _upload_results(
+        self,
+        task_id: str,
+        model_folder: str,
+        model_name_pretty: str,
+        model_availability: str,
+        urls: str,
+        context_size: str,
+        submitted_by: str,
+        temp_directory: str,
+    ) -> List[CommitOperationAdd]:
+        final_results = {}
+        with open(os.path.join(temp_directory, "final_metrics.json"), "r") as f:
+            metrics = json.load(f)
+            final_results.update(metrics)
+        metadata_dict = self._get_metadata(
+            model_name_pretty=model_name_pretty,
+            model_availability=model_availability,
+            urls=urls,
+            context_size=context_size,
+            submitted_by=submitted_by,
+        )
+        final_results.update(metadata_dict)
+
+        with jsonlines.open(
+            os.path.join(temp_directory, "final_results.jsonl"), "w"
+        ) as writer:
+            writer.write(final_results)
+
+        return [
             CommitOperationAdd(
-                path_in_repo=f"{task_id}/predictions/{model_folder}/metadata.json",
-                path_or_fileobj="metadata.json",
+                path_in_repo=f"{task_id}/results/{model_folder}.jsonl",
+                path_or_fileobj=os.path.join(temp_directory, "final_results.jsonl"),
             )
-        )
+        ]
 
-        return commit_operations
+    def _verify_arguments(
+        self,
+        model_folder: str,
+        model_name_pretty: str,
+        model_availability: str,
+        urls: str,
+        context_size: str,
+        submitted_by: str,
+        filenames: Optional[List[str]],
+    ):
+        assert (
+            model_folder
+        ), "Please, specify non-empty name for a directory with a model's results."
+        assert model_name_pretty, "Please, specify non-empty name for a model."
+        assert (
+            model_availability
+        ), "Please, specify non-empty information about a model's availability."
+        assert (
+            context_size
+        ), "Please, specify non-empty information about a model's context size."
+        try:
+            _ = int(context_size)
+        except:
+            raise ValueError(
+                "Please, specify a model's context size as an integer (e.g., 16000)."
+            )
+
+        assert (
+            submitted_by
+        ), "Please, specify non-empty information about a submission's author(s)."
+        assert filenames, "Please, attach at least one file with predictions."
 
     def upload_files(
         self,
@@ -92,10 +199,21 @@
         force: bool = False,
     ) -> str:
         try:
+            self._verify_arguments(
+                model_folder=model_folder,
+                model_name_pretty=model_name_pretty,
+                model_availability=model_availability,
+                urls=urls,
+                context_size=context_size,
+                submitted_by=submitted_by,
+                filenames=filenames,
+            )
+
             pr_title = f"🚀 New submission to {task_pretty} task: {model_name_pretty} with {context_size} context size from {submitted_by}"
 
             task_id = TASKS_PRETTY_REVERSE[task_pretty]
 
+            logging.info("Checking if this request is already submitted...")
             if not force:
                 if model_name_pretty in self._fs.ls(
                     f"datasets/{self._dataset_id}/{task_id}/predictions"
@@ -106,29 +224,46 @@
                     )
                     for filename in filenames + ["metadata.json"]
                 ):
-                    return (
+                    return styled_warning(
                         f"{model_name_pretty} is already present in {self._dataset_id}."
                     )
 
            prev_pr = self._get_previous_pr(pr_title)
            if prev_pr is not None:
                url = f"https://huggingface.co/datasets/{self._dataset_id}/discussions/{prev_pr.num}"
-                return f"{self._dataset_id} already has an open PR for this submission: {url}."
+                return styled_warning(
+                    f"{self._dataset_id} already has an open PR for this submission: {url}."
+                )
 
-            commit_operations = self._upload_files(
+            logging.info("Processing predictions...")
+            predictions_commit_operations = self._upload_predictions(
                 task_id=task_id,
                 model_folder=model_folder,
-                model_name_pretty=model_name_pretty,
-                model_availability=model_availability,
-                urls=urls,
-                context_size=context_size,
-                submitted_by=submitted_by,
                 filenames=filenames,
             )
 
+            with TemporaryDirectory() as d:
+                logging.info("Computing metrics...")
+                self._compute_metrics_for_predictions(
+                    task_id=task_id, filenames=filenames, temp_directory=str(d)
+                )
+
+                logging.info("Processing results...")
+                results_commit_operations = self._upload_results(
+                    task_id=task_id,
+                    model_folder=model_folder,
+                    model_name_pretty=model_name_pretty,
+                    model_availability=model_availability,
+                    urls=urls,
+                    context_size=context_size,
+                    submitted_by=submitted_by,
+                    temp_directory=str(d),
+                )
+
+            logging.info("Creating commit...")
             new_pr = self._api.create_commit(
                 repo_id=self._dataset_id,
-                operations=commit_operations,
+                operations=predictions_commit_operations + results_commit_operations,
                 commit_message=pr_title,
                 commit_description=f"""New submission to {task_pretty} task in 🏟️ Long Code Arena benchmark!
 
@@ -141,7 +276,10 @@
                 create_pr=True,
                 repo_type="dataset",
             )
-            return f"🎉 PR created at {new_pr.pr_url}."
+            return styled_message(f"🎉 PR created at {new_pr.pr_url}.")
 
-        except Exception:
-            return "An exception occured."
+        except Exception as e:
+            logging.exception(e)
+            if str(e):
+                return styled_error(str(e))
+            return styled_error("An exception occurred.")
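`_compute_metrics_for_predictions` reads each attached predictions file with `jsonlines` and expects one JSON object per line with `prediction` and `reference` fields. A minimal sketch of a file that the new pipeline could consume (the commit messages below are invented for illustration):

```python
import jsonlines

# Hypothetical predictions file for the commit_message_generation task:
# one record per test example, holding the model output and the ground-truth message.
with jsonlines.open("predictions.jsonl", "w") as writer:
    writer.write(
        {
            "prediction": "Fix NPE in config loader",
            "reference": "Fix null pointer exception in config loader",
        }
    )
    writer.write(
        {
            "prediction": "Add retry logic to HTTP client",
            "reference": "Add retries to HTTP client",
        }
    )
```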