saridormi committed
Commit 6c92442 • 1 Parent(s): 2770288

Make files submissions instructions task-specific & other small changes

app.py CHANGED
@@ -2,17 +2,27 @@ import logging
 import os
 
 import gradio as gr  # type: ignore[import]
-from apscheduler.schedulers.background import BackgroundScheduler
-from huggingface_hub import HfApi
 
-from src.content import (INTRODUCTION_TEXT, INTRODUCTION_TITLE,
-                         LEADERBOARD_TEXT, LEADERBOARD_TITLE,
-                         SUBMISSION_TEXT_FILES, SUBMISSION_TEXT_INTRO,
-                         SUBMISSION_TEXT_METADATA, SUBMISSION_TEXT_SUBMIT,
-                         SUBMISSION_TEXT_TASK, SUBMISSION_TITLE)
+from src.content import (
+    INTRODUCTION_TEXT,
+    INTRODUCTION_TITLE,
+    LEADERBOARD_TEXT,
+    LEADERBOARD_TITLE,
+    SUBMISSION_TEXT_FILES,
+    SUBMISSION_TEXT_INTRO,
+    SUBMISSION_TEXT_METADATA,
+    SUBMISSION_TEXT_SUBMIT,
+    SUBMISSION_TEXT_TASK,
+    SUBMISSION_TITLE,
+)
 from src.get_results_for_task import get_results_for_task
 from src.submission_uploader import SubmissionUploader
-from src.tasks import TASKS_DESCRIPTIONS, TASKS_PRETTY, TASKS_PRETTY_REVERSE
+from src.tasks_content import (
+    TASKS_DESCRIPTIONS,
+    TASKS_PRETTY,
+    TASKS_PRETTY_REVERSE,
+    get_submission_text_files_for_task,
+)
 
 logging.basicConfig(
     level=logging.INFO,
@@ -23,35 +33,28 @@ logging.basicConfig(
 submission_uploader = SubmissionUploader(os.environ["DATASET_ID"])
 
 
-def restart_space():
-    HfApi(token=os.environ["HF_TOKEN"]).restart_space(
-        repo_id="JetBrains-Research/long-code-arena", token=os.environ["HF_TOKEN"]
-    )
-
-
 with gr.Blocks() as demo:
+    # intro
     gr.HTML(INTRODUCTION_TITLE)
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
+    # leaderboard
     gr.HTML(LEADERBOARD_TITLE)
     gr.Markdown(LEADERBOARD_TEXT, elem_classes="markdown-text")
-
     with gr.Tabs():
-        for task in TASKS_PRETTY_REVERSE:
-            with gr.TabItem(task):
+        for task_pretty in TASKS_PRETTY_REVERSE:
+            with gr.TabItem(task_pretty):
                 with gr.Row():
-                    gr.Markdown(TASKS_DESCRIPTIONS[task])
+                    gr.Markdown(TASKS_DESCRIPTIONS[TASKS_PRETTY_REVERSE[task_pretty]])
 
-                leaderboard_table = gr.components.Dataframe(
-                    value=get_results_for_task(task), interactive=False
-                )
+                leaderboard_table = gr.components.Dataframe(value=get_results_for_task(task_pretty), interactive=False)
 
+    # submission
     gr.HTML(SUBMISSION_TITLE)
     gr.Markdown(SUBMISSION_TEXT_INTRO, elem_classes="markdown-text")
-
     with gr.Accordion("🚀 Submit new results"):
         gr.Markdown(SUBMISSION_TEXT_TASK, elem_classes="markdown-text")
-        task = gr.Radio(TASKS_PRETTY_REVERSE.keys(), label="Task")
+        task_selection = gr.Radio(TASKS_PRETTY_REVERSE.keys(), label="Task")
 
         gr.Markdown(SUBMISSION_TEXT_METADATA, elem_classes="markdown-text")
         with gr.Row():
@@ -91,6 +94,8 @@ with gr.Blocks() as demo:
         )
 
         gr.Markdown(SUBMISSION_TEXT_FILES, elem_classes="markdown-text")
+        task_specific_instructions = gr.Markdown(get_submission_text_files_for_task(None))
+        task_selection.select(get_submission_text_files_for_task, [task_selection], task_specific_instructions)
         file_output = gr.File(file_count="multiple")
 
         gr.Markdown(SUBMISSION_TEXT_SUBMIT, elem_classes="markdown-text")
@@ -99,7 +104,7 @@ with gr.Blocks() as demo:
         submit_button.click(
             submission_uploader.upload_files,
             [
-                task,
+                task_selection,
                 model_folder_textbox,
                 model_name_textbox,
                 model_availability_textbox,
@@ -112,7 +117,4 @@ with gr.Blocks() as demo:
         )
 
 if __name__ == "__main__":
-    scheduler = BackgroundScheduler()
-    scheduler.add_job(restart_space, "interval", seconds=30 * 60)
-    scheduler.start()
     demo.launch()
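Note: the core of this change is wiring the task Radio to a Markdown block through a Gradio event listener, so the file-submission instructions update when a task is selected. A minimal standalone sketch of that pattern (not part of this commit; the helper and choices below are illustrative stand-ins):

from typing import Optional

import gradio as gr


def instructions_for(task: Optional[str]) -> str:
    # Stand-in for get_submission_text_files_for_task from this commit.
    if not task:
        return "Please, select a task to see file submission instructions."
    return f"Instructions for {task}."


with gr.Blocks() as demo:
    task_selection = gr.Radio(["Commit Message Generation", "Bug Localization on Issue"], label="Task")
    task_specific_instructions = gr.Markdown(instructions_for(None))
    # .select fires when a radio option is picked; the callback's return value replaces the Markdown text.
    task_selection.select(instructions_for, [task_selection], task_specific_instructions)

if __name__ == "__main__":
    demo.launch()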
requirements.txt CHANGED
@@ -1,8 +1,9 @@
 huggingface_hub
 jsonlines
 pandas
+gradio
+datasets
 tqdm
-apscheduler
 # CMG metrics
 evaluate
 bert-score
src/content.py CHANGED
@@ -1,3 +1,5 @@
+from .formatting import styled_warning
+
 # ================================
 # = ABOUT =
 # ================================
@@ -25,9 +27,9 @@ SUBMISSION_TEXT_TASK = """1. Select a task you want to submit results for."""
 SUBMISSION_TEXT_METADATA = """2. Fill in some metadata about your submission."""
 
 SUBMISSION_TEXT_FILES = """3. Attach one or more files with your model's predictions.
 * If several files are attached, they will be treated as separate runs of the submitted model (e.g., with different seeds), and the metrics will be averaged across runs. For baselines provided by 🏟️ Long Code Arena Team, the results are averaged across 3 runs.
-* Please, attach files in [JSONLines format](https://jsonlines.org/). For an example, check the predictions provided by 🏟️ Long Code Arena Team in 🤗 [JetBrains-Research/lca-results](https://huggingface.co/datasets/JetBrains-Research/lca-results). Make sure to include `"prediction"` and `"reference"` fields for each example, the rest are optional.
 """
+
 SUBMISSION_TEXT_SUBMIT = """All set! A new PR to 🤗 [JetBrains-Research/lca-results](https://huggingface.co/datasets/JetBrains-Research/lca-results) should be opened when you press "Submit" button. 🏟️ Long Code Arena Team will review it shortly, and the results will appear in the leaderboard.
 
 ⏳ **Note:** It might take some time (up to 40 minutes) for PR to get created, since it involves computing metrics for your submission."""
src/evaluation/base_task_metrics.py CHANGED
@@ -7,9 +7,7 @@ class BaseTaskMetrics(ABC):
         pass
 
     @abstractmethod
-    def add_batch(
-        self, predictions: List[str], references: List[str], *args, **kwargs
-    ) -> None:
+    def add_batch(self, predictions: List[str], references: List[str], *args, **kwargs) -> None:
         pass
 
     @abstractmethod
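Note: this hunk only collapses a multi-line signature; the interface is unchanged. For orientation, a task-specific metric is expected to subclass this base class. A rough sketch, assuming the base also declares reset() and compute() as abstract (only add_batch is visible here; the other two are used elsewhere in this commit) and that the hypothetical metric below is purely illustrative:

from typing import Dict, List

from src.evaluation.base_task_metrics import BaseTaskMetrics


class ExactMatchMetrics(BaseTaskMetrics):
    # Hypothetical metric: share of predictions that match their references exactly.
    def __init__(self) -> None:
        self._correct = 0
        self._total = 0

    def reset(self) -> None:
        self._correct = 0
        self._total = 0

    def add_batch(self, predictions: List[str], references: List[str], *args, **kwargs) -> None:
        self._correct += sum(p == r for p, r in zip(predictions, references))
        self._total += len(predictions)

    def compute(self, *args, **kwargs) -> Dict[str, float]:
        return {"exact_match": self._correct / self._total if self._total else 0.0}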
src/evaluation/commit_message_generation/cmg_metrics.py CHANGED
@@ -13,27 +13,17 @@ class CMGMetrics(BaseTaskMetrics):
         self.bertscore = evaluate.load("bertscore")
         self.bertscore_normalized = evaluate.load("bertscore")
 
-    def add_batch(
-        self, predictions: List[str], references: List[str], *args, **kwargs
-    ) -> None:
-        self.bleu.add_batch(
-            predictions=predictions, references=[[ref] for ref in references]
-        )
-        self.chrf.add_batch(
-            predictions=predictions, references=[[ref] for ref in references]
-        )
+    def add_batch(self, predictions: List[str], references: List[str], *args, **kwargs) -> None:
+        self.bleu.add_batch(predictions=predictions, references=[[ref] for ref in references])
+        self.chrf.add_batch(predictions=predictions, references=[[ref] for ref in references])
         self.rouge.add_batch(predictions=predictions, references=references)
         self.bertscore.add_batch(predictions=predictions, references=references)
-        self.bertscore_normalized.add_batch(
-            predictions=predictions, references=references
-        )
+        self.bertscore_normalized.add_batch(predictions=predictions, references=references)
 
     def compute(self, *args, **kwargs) -> Dict[str, float]:
         rouge = self.rouge.compute()
         bertscore = self.bertscore.compute(lang="en")
-        bertscore_normalized = self.bertscore_normalized.compute(
-            lang="en", rescale_with_baseline=True
-        )
+        bertscore_normalized = self.bertscore_normalized.compute(lang="en", rescale_with_baseline=True)
         return {
             "bleu": self.bleu.compute(tokenize="13a")["score"],
             "chrf": self.chrf.compute()["score"],
@@ -41,6 +31,5 @@ class CMGMetrics(BaseTaskMetrics):
             "rouge2": rouge["rouge2"] * 100,
             "rougeL": rouge["rougeL"] * 100,
             "bertscore": sum(bertscore["f1"]) / len(bertscore["f1"]),
-            "bertscore_normalized": sum(bertscore_normalized["f1"])
-            / len(bertscore_normalized["f1"]),
+            "bertscore_normalized": sum(bertscore_normalized["f1"]) / len(bertscore_normalized["f1"]),
         }
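Note: only the line wrapping changes here. As a usage sketch (assuming a no-argument constructor, which the __init__ body above suggests; the underlying 🤗 evaluate metrics are downloaded on first use):

from src.evaluation.commit_message_generation.cmg_metrics import CMGMetrics

metrics = CMGMetrics()
# Accumulate one or more batches of predictions/references, then aggregate.
metrics.add_batch(
    predictions=["Fix off-by-one error in pagination"],
    references=["Fix pagination off-by-one bug"],
)
scores = metrics.compute()
# Expected keys per the return dict above:
# bleu, chrf, rouge1, rouge2, rougeL, bertscore, bertscore_normalized
print(scores)
metrics.reset()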
src/formatting.py CHANGED
@@ -7,6 +7,4 @@ def styled_warning(warn):
 
 
 def styled_message(message):
-    return (
-        f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
-    )
+    return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
src/get_results_for_task.py CHANGED
@@ -2,13 +2,15 @@ import logging
 import os
 
 import pandas as pd  # type: ignore[import]
-from datasets import (get_dataset_config_names,  # type: ignore[import]
-                      load_dataset)
+from datasets import get_dataset_config_names, load_dataset  # type: ignore[import]
 
-from .leaderboard_formatting import (COLUMNS_PRETTY, METRICS_PER_TASK,
-                                     SORT_COLUMN_PER_TASK,
-                                     get_columns_per_task)
-from .tasks import TASKS_PRETTY_REVERSE
+from .leaderboard_formatting import (
+    COLUMNS_PRETTY,
+    METRICS_PER_TASK,
+    SORT_COLUMN_PER_TASK,
+    get_columns_per_task,
+)
+from .tasks_content import TASKS_PRETTY_REVERSE
 
 AVAILABLE_TASKS = get_dataset_config_names(os.environ["DATASET_ID"])
 
@@ -44,27 +46,17 @@ def _get_results_stub() -> pd.DataFrame:
 
 
 def _get_results_dataset(task_id: str) -> pd.DataFrame:
-    results_df = load_dataset(
-        os.environ["DATASET_ID"], task_id, split="test"
-    ).to_pandas()
+    results_df = load_dataset(os.environ["DATASET_ID"], task_id, split="test").to_pandas()
     results_df = results_df.rename(columns=COLUMNS_PRETTY, errors="ignore")
-    results_df["Context Size"] = results_df["Context Size"].map(
-        lambda x: f"{int(x) // 1000}k" if int(x) >= 1000 else x
-    )
+    results_df["Context Size"] = results_df["Context Size"].map(lambda x: f"{int(x) // 1000}k" if int(x) >= 1000 else x)
 
-    results_df = results_df.sort_values(
-        by=SORT_COLUMN_PER_TASK[task_id], ascending=False
-    )
+    results_df = results_df.sort_values(by=SORT_COLUMN_PER_TASK[task_id], ascending=False)
 
     for metric_column in METRICS_PER_TASK[task_id]:
         if "BERTScore" in metric_column:
-            results_df[metric_column] = results_df[metric_column].map(
-                lambda x: f"{x:.5f}"
-            )
+            results_df[metric_column] = results_df[metric_column].map(lambda x: f"{x:.5f}")
         else:
-            results_df[metric_column] = results_df[metric_column].map(
-                lambda x: f"{x:.2f}"
-            )
+            results_df[metric_column] = results_df[metric_column].map(lambda x: f"{x:.2f}")
 
     results_df = results_df[get_columns_per_task(task_id)]
     return results_df
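Note: the wrapped lambdas above only change layout; their behavior is what turns raw leaderboard values into display strings. A small self-contained illustration of the two transforms, with invented sample values:

import pandas as pd

df = pd.DataFrame({"Context Size": [512, 16000], "ROUGE-1": [12.3456, 23.4567]})

# 16000 -> "16k"; values under 1000 are left as-is.
df["Context Size"] = df["Context Size"].map(lambda x: f"{int(x) // 1000}k" if int(x) >= 1000 else x)
# Non-BERTScore metric columns are rendered with two decimal places.
df["ROUGE-1"] = df["ROUGE-1"].map(lambda x: f"{x:.2f}")

print(df)
#   Context Size ROUGE-1
# 0          512   12.35
# 1          16k   23.46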
src/leaderboard_formatting.py CHANGED
@@ -35,8 +35,4 @@ SORT_COLUMN_PER_TASK = {"commit_message_generation": "ROUGE-1"}
 def get_columns_per_task(task_id: str) -> List[str]:
     metrics_per_task = METRICS_PER_TASK[task_id]
 
-    return (
-        ["Model Name", "Availability", "Context Size"]
-        + metrics_per_task
-        + ["Submitted By"]
-    )
+    return ["Model Name", "Availability", "Context Size"] + metrics_per_task + ["Submitted By"]
src/submission_uploader.py CHANGED
@@ -11,7 +11,7 @@ from tqdm import tqdm
 
 from .evaluation import METRICS
 from .formatting import styled_error, styled_message, styled_warning
-from .tasks import TASKS_PRETTY_REVERSE
+from .tasks_content import TASKS_PRETTY_REVERSE
 
 
 class AlreadyExists(Exception):
@@ -34,17 +34,11 @@ class SubmissionUploader:
     def _get_previous_pr(self, pr_title: str) -> Optional[Discussion]:
         """Searches among discussions of dataset repo for a PR with the given title."""
         try:
-            discussions = self._api.get_repo_discussions(
-                repo_id=self._dataset_id, repo_type="dataset"
-            )
+            discussions = self._api.get_repo_discussions(repo_id=self._dataset_id, repo_type="dataset")
         except Exception:
            return None
         for discussion in discussions:
-            if (
-                discussion.status == "open"
-                and discussion.is_pull_request
-                and discussion.title == pr_title
-            ):
+            if discussion.status == "open" and discussion.is_pull_request and discussion.title == pr_title:
                 return discussion
         return None
 
@@ -79,41 +73,30 @@ class SubmissionUploader:
         ]
         return commit_operations
 
-    def _compute_metrics_for_predictions(
-        self, task_id: str, filenames: Optional[List[str]], temp_directory: str
-    ) -> None:
+    def _compute_metrics_for_predictions(self, task_id: str, filenames: List[str], temp_directory: str) -> None:
         metrics_module = METRICS[task_id]
-        assert (
-            metrics_module is not None
-        ), f"Computing metrics for {task_id} is not supported."
+        assert metrics_module is not None, f"Computing metrics for {task_id} is not supported."
         metrics_module.reset()
         open(os.path.join(temp_directory, "metrics.jsonl"), "w").close()
 
         # compute the metrics for each submitted file
         for filename in filenames:
             with jsonlines.open(filename, "r") as reader:
-                for example in tqdm(
-                    reader, desc=f"Computing metrics for {os.path.basename(filename)}"
-                ):
+                for example in tqdm(reader, desc=f"Computing metrics for {os.path.basename(filename)}"):
                     metrics_module.add_batch(
                         predictions=[example["prediction"]],
                         references=[example["reference"]],
                     )
             computed_metrics = metrics_module.compute()
             metrics_module.reset()
-            with jsonlines.open(
-                os.path.join(temp_directory, "metrics.jsonl"), "a"
-            ) as writer:
+            with jsonlines.open(os.path.join(temp_directory, "metrics.jsonl"), "a") as writer:
                 writer.write(computed_metrics)
 
         # aggregate the metrics over submitted files
-        with jsonlines.open(
-            os.path.join(temp_directory, "metrics.jsonl"), "r"
-        ) as reader:
+        with jsonlines.open(os.path.join(temp_directory, "metrics.jsonl"), "r") as reader:
             metrics_results = [line for line in reader]
         final_metrics_results = {
-            key: sum(entry[key] for entry in metrics_results) / len(metrics_results)
-            for key in metrics_results[0]
+            key: sum(entry[key] for entry in metrics_results) / len(metrics_results) for key in metrics_results[0]
         }
         with open(os.path.join(temp_directory, "final_metrics.json"), "w") as f:
            json.dump(final_metrics_results, f)
@@ -142,9 +125,7 @@ class SubmissionUploader:
         )
         final_results.update(metadata_dict)
 
-        with jsonlines.open(
-            os.path.join(temp_directory, "final_results.jsonl"), "w"
-        ) as writer:
+        with jsonlines.open(os.path.join(temp_directory, "final_results.jsonl"), "w") as writer:
            writer.write(final_results)
 
        return [
@@ -165,29 +146,17 @@ class SubmissionUploader:
         submitted_by: str,
         filenames: Optional[List[str]],
     ):
-        assert (
-            task_pretty and task_pretty in TASKS_PRETTY_REVERSE
-        ), "Please, select one of the supported tasks."
-        assert (
-            model_folder
-        ), "Please, specify non-empty name for a directory with a model's results."
+        assert task_pretty and task_pretty in TASKS_PRETTY_REVERSE, "Please, select one of the supported tasks."
+        assert model_folder, "Please, specify non-empty name for a directory with a model's results."
         assert model_name_pretty, "Please, specify non-empty name for a model."
-        assert (
-            model_availability
-        ), "Please, specify non-empty information about a model's availability."
-        assert (
-            context_size
-        ), "Please, specify non-empty information about a model's context size."
+        assert model_availability, "Please, specify non-empty information about a model's availability."
+        assert context_size, "Please, specify non-empty information about a model's context size."
         try:
             _ = int(context_size)
         except:
-            raise ValueError(
-                "Please, specify a model's context size as an integer (e.g., 16000)."
-            )
+            raise ValueError("Please, specify a model's context size as an integer (e.g., 16000).")
 
-        assert (
-            submitted_by
-        ), "Please, specify non-empty information about a submission's author(s)."
+        assert submitted_by, "Please, specify non-empty information about a submission's author(s)."
         assert filenames, "Please, attach at least one file with predictions."
 
     def upload_files(
@@ -221,25 +190,16 @@ class SubmissionUploader:
 
         logging.info("Checking if this request has already been submitted...")
         if not force:
-            if model_name_pretty in self._fs.ls(
-                f"datasets/{self._dataset_id}/{task_id}/predictions"
-            ) and all(
-                filename
-                in self._fs.ls(
-                    f"datasets/{self._dataset_id}/{task_id}/predictions/{model_name_pretty}"
-                )
-                for filename in filenames + ["metadata.json"]
+            if model_name_pretty in self._fs.ls(f"datasets/{self._dataset_id}/{task_id}/predictions") and all(
+                filename in self._fs.ls(f"datasets/{self._dataset_id}/{task_id}/predictions/{model_name_pretty}")
+                for filename in filenames
             ):
-                return styled_warning(
-                    f"{model_name_pretty} is already present in {self._dataset_id}."
-                )
+                return styled_warning(f"{model_name_pretty} is already present in {self._dataset_id}.")
 
             prev_pr = self._get_previous_pr(pr_title)
             if prev_pr is not None:
                 url = f"https://huggingface.co/datasets/{self._dataset_id}/discussions/{prev_pr.num}"
-                return styled_warning(
-                    f"{self._dataset_id} already has an open PR for this submission: {url}."
-                )
+                return styled_warning(f"{self._dataset_id} already has an open PR for this submission: {url}.")
 
         logging.info("Processing predictions...")
         predictions_commit_operations = self._upload_predictions(
@@ -250,9 +210,7 @@ class SubmissionUploader:
 
         with TemporaryDirectory() as d:
             logging.info("Computing metrics...")
-            self._compute_metrics_for_predictions(
-                task_id=task_id, filenames=filenames, temp_directory=str(d)
-            )
+            self._compute_metrics_for_predictions(task_id=task_id, filenames=filenames, temp_directory=str(d))
 
             logging.info("Processing results...")
             results_commit_operations = self._upload_results(
@@ -269,8 +227,7 @@ class SubmissionUploader:
             logging.info("Creating commit...")
             new_pr = self._api.create_commit(
                 repo_id=self._dataset_id,
-                operations=predictions_commit_operations
-                + results_commit_operations,
+                operations=predictions_commit_operations + results_commit_operations,
                 commit_message=pr_title,
                 commit_description=f"""New submission to {task_pretty} task in 🏟️ Long Code Arena benchmark!\n* Model name: {model_name_pretty}\n* Model availability: {model_availability}\n* Context Size: {context_size}\n* Relevant URLs: {urls}\n* Submitted By: {submitted_by}""",
                 create_pr=True,
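Note: the reflowed dict comprehension near the end of _compute_metrics_for_predictions is the per-run averaging step (one metrics.jsonl line per submitted predictions file). A standalone sketch of that aggregation with made-up numbers:

# Each dict corresponds to the metrics computed for one submitted predictions file (one run).
metrics_results = [
    {"bleu": 10.0, "rouge1": 20.0},
    {"bleu": 12.0, "rouge1": 24.0},
]

final_metrics_results = {
    key: sum(entry[key] for entry in metrics_results) / len(metrics_results) for key in metrics_results[0]
}

print(final_metrics_results)  # {'bleu': 11.0, 'rouge1': 22.0}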
src/{tasks.py → tasks_content.py} RENAMED
@@ -1,3 +1,5 @@
+from typing import Optional
+
 TASKS_PRETTY = {
     "commit_message_generation": "Commit Message Generation",
     "bug_localization": "Bug Localization on Issue",
@@ -9,7 +11,7 @@ TASKS_PRETTY = {
 TASKS_PRETTY_REVERSE = {value: key for key, value in TASKS_PRETTY.items()}
 
 TASKS_DESCRIPTIONS = {
-    "Commit Message Generation": """# Commit Message Generation\n
+    "commit_message_generation": """# Commit Message Generation\n
 
 Our Commit Message Generation benchmark 🤗 [JetBrains-Research/lca-cmg](https://huggingface.co/datasets/JetBrains-Research/lca-cmg) includes 163 manually curated commits from Python projects.
 
@@ -21,9 +23,21 @@ TASKS_DESCRIPTIONS = {
 
 For further details on the dataset and the baselines from 🏟️ Long Code Arena Team, refer to `commit_message_generation` folder in [our baselines repository](https://github.com/JetBrains-Research/lca-baselines) or to our preprint (TODO).
 """,
-    "Bug Localization on Issue": "cool description for Bug Localization on Issue task",
-    "Module-to-Text": "cool description for Module-to-Text task",
-    "Library Usage Examples Generation": "cool description for Library Usage Examples Generation task",
-    "Project-level Code Completion": "cool description for Project-level Code Completion task",
-    "Bug Localization on Build Logs": "cool description for Bug Localization on Build Logs task",
+    "bug_localization": "cool description for Bug Localization on Issue task",
+    "module_to_text": "cool description for Module-to-Text task",
+    "library_usage": "cool description for Library Usage Examples Generation task",
+    "project_code_completion": "cool description for Project-level Code Completion task",
+    "bug_localization_build_logs": "cool description for Bug Localization on Build Logs task",
 }
+
+
+def get_submission_text_files_for_task(task_pretty: Optional[str]) -> str:
+    if not task_pretty:
+        return "Please, select a specific task to see more detailed instructions regarding submitting files."
+
+    task_id = TASKS_PRETTY_REVERSE[task_pretty]
+
+    if task_id == "commit_message_generation":
+        return f"""**{task_pretty} Instructions:**\n\n* Please, attach files in [JSONLines format](https://jsonlines.org/). For an example, check the predictions provided by 🏟️ Long Code Arena Team in 🤗 [JetBrains-Research/lca-results](https://huggingface.co/datasets/JetBrains-Research/lca-results/tree/main/commit_message_generation/predictions). Make sure to include `"prediction"` and `"reference"` fields for each example, the rest are optional."""
+
+    return f"**{task_pretty} Instructions:**\n\n* 🚧 There are no instructions for the current task yet."
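Note: a quick usage sketch for the new helper (module path taken from the rename above; outputs shortened, run from the repository root so the import resolves):

from src.tasks_content import TASKS_PRETTY, get_submission_text_files_for_task

# No task selected yet -> generic prompt to pick a task.
print(get_submission_text_files_for_task(None))

# A known task -> task-specific JSONLines instructions.
print(get_submission_text_files_for_task(TASKS_PRETTY["commit_message_generation"]))

# Any other task currently falls back to the "no instructions yet" message.
print(get_submission_text_files_for_task("Module-to-Text"))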