Petr Tsvetkov committed on
Commit
13e3243
•
1 Parent(s): d7e2287

Start-to-end generation

Browse files
change_visualizer.py CHANGED
@@ -21,6 +21,7 @@ def update_dataset_view(diff_idx, df):
21
  df.iloc[diff_idx]['commit_msg_end'],
22
  df.iloc[diff_idx]['session'],
23
  str(df.iloc[diff_idx]['end_to_start']),
 
24
  f"https://github.com/{df.iloc[diff_idx]['repo']}/commit/{df.iloc[diff_idx]['hash']}",)
25
 
26
 
@@ -56,6 +57,9 @@ if __name__ == '__main__':
56
  is_end_to_start_view = gr.Textbox(interactive=False,
57
  label="Is generated on the 'end-to-start' synthesis step?",
58
  container=True)
 
 
 
59
  link_view = gr.Markdown()
60
 
61
  view = [
@@ -64,6 +68,7 @@ if __name__ == '__main__':
64
  end_view,
65
  session_view,
66
  is_end_to_start_view,
 
67
  link_view
68
  ]
69
 
 
21
  df.iloc[diff_idx]['commit_msg_end'],
22
  df.iloc[diff_idx]['session'],
23
  str(df.iloc[diff_idx]['end_to_start']),
24
+ str(df.iloc[diff_idx]['start_to_end']),
25
  f"https://github.com/{df.iloc[diff_idx]['repo']}/commit/{df.iloc[diff_idx]['hash']}",)
26
 
27
 
 
57
  is_end_to_start_view = gr.Textbox(interactive=False,
58
  label="Is generated on the 'end-to-start' synthesis step?",
59
  container=True)
60
+ is_start_to_end_view = gr.Textbox(interactive=False,
61
+ label="Is generated on the 'start-to-end' synthesis step?",
62
+ container=True)
63
  link_view = gr.Markdown()
64
 
65
  view = [
 
68
  end_view,
69
  session_view,
70
  is_end_to_start_view,
71
+ is_start_to_end_view,
72
  link_view
73
  ]
74
 
generate_synthetic_dataset.py CHANGED
@@ -1,10 +1,12 @@
1
  import config
2
  from api_wrappers import hf_data_loader
3
- from generation_steps import synthetic_end_to_start
4
 
5
 
6
  def run():
7
  df = hf_data_loader.load_processed_rewriting_dataset_as_pandas()
 
 
8
 
9
  print(f"End -> start synthesis:")
10
  print(f"GENERATION_MULTIPLIER = {synthetic_end_to_start.GENERATION_MULTIPLIER}")
@@ -13,6 +15,13 @@ def run():
13
  df = synthetic_end_to_start.transform(df)
14
  print("Done")
15
 
 
 
 
 
 
 
 
16
  df.to_csv(config.SYNTHETIC_DATASET_ARTIFACT)
17
 
18
 
 
1
  import config
2
  from api_wrappers import hf_data_loader
3
+ from generation_steps import synthetic_end_to_start, examples, synthetic_start_to_end
4
 
5
 
6
  def run():
7
  df = hf_data_loader.load_processed_rewriting_dataset_as_pandas()
8
+ print(f"NUMBER OF EXAMPLES PER PROMPT = {examples.N_EXAMPLES}")
9
+ print()
10
 
11
  print(f"End -> start synthesis:")
12
  print(f"GENERATION_MULTIPLIER = {synthetic_end_to_start.GENERATION_MULTIPLIER}")
 
15
  df = synthetic_end_to_start.transform(df)
16
  print("Done")
17
 
18
+ print(f"Start -> send synthesis:")
19
+ print(f"GENERATION_MULTIPLIER = {synthetic_start_to_end.GENERATION_MULTIPLIER}")
20
+ print(f"REL_DELETIONS_THRESHOLD = {synthetic_start_to_end.REL_DELETIONS_THRESHOLD}")
21
+ print(f"GENERATION_ATTEMPTS = {synthetic_start_to_end.GENERATION_ATTEMPTS}")
22
+ df = synthetic_start_to_end.transform(df)
23
+ print("Done")
24
+
25
  df.to_csv(config.SYNTHETIC_DATASET_ARTIFACT)
26
 
27
 
generation_steps/examples.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import config
2
+ from api_wrappers import hf_data_loader
3
+
4
+ N_EXAMPLES = 5
5
+
6
+
7
def get_example_prompt_end_to_start(start_msg, end_msg):
    """Render one few-shot example for the end-to-start direction.

    The example shows the edited commit message first and the initial
    commit message as the expected output.

    :param start_msg: the initial commit message (shown as the output).
    :param end_msg: the edited commit message (shown as the input).
    :return: the example as a single string, with no trailing newline.
    """
    # The literal's lines sit at column 0 on purpose: any indentation here
    # would end up inside the prompt text.
    return f"""START OF THE EXAMPLE

For the following edited commit message:
START OF THE EDITED COMMIT MESSAGE
{end_msg}
END OF THE EDITED COMMIT MESSAGE

You would output the following initial commit message:
START OF THE INITIAL COMMIT MESSAGE
{start_msg}
END OF THE INITIAL COMMIT MESSAGE

END OF THE EXAMPLE"""
21
+
22
+
23
def get_example_prompt_start_to_end(start_msg, end_msg):
    """Render one few-shot example for the start-to-end direction.

    The example shows the LLM-generated commit message first and the
    improved commit message as the expected output.

    :param start_msg: the LLM-generated commit message (shown as the input).
    :param end_msg: the improved commit message (shown as the output).
    :return: the example as a single string, with no trailing newline.
    """
    # The literal's lines sit at column 0 on purpose: any indentation here
    # would end up inside the prompt text.
    return f"""START OF THE EXAMPLE

For the following LLM-generated commit message:
START OF THE GENERATED COMMIT MESSAGE
{start_msg}
END OF THE GENERATED COMMIT MESSAGE

You would output the following improved commit message:
START OF THE IMPROVED COMMIT MESSAGE
{end_msg}
END OF THE IMPROVED COMMIT MESSAGE

END OF THE EXAMPLE"""
37
+
38
+
39
+ manual_df = hf_data_loader.load_raw_rewriting_dataset_as_pandas()[['commit_msg_start', 'commit_msg_end']]
40
+ manual_df = manual_df.sample(n=N_EXAMPLES, random_state=config.RANDOM_STATE)
41
+
42
+
43
def generate_examples(end_to_start):
    """Build the few-shot examples section of a prompt.

    Each row of the module-level ``manual_df`` sample is rendered with the
    example template that matches the requested direction.

    :param end_to_start: if truthy, render with
        ``get_example_prompt_end_to_start``; otherwise render with
        ``get_example_prompt_start_to_end``.
    :return: the rendered examples joined with a single newline.
    """
    prompt_fn = get_example_prompt_end_to_start if end_to_start else get_example_prompt_start_to_end

    # Only two columns are read, so walk them directly instead of building
    # a Series per row with iterrows().
    message_pairs = zip(manual_df['commit_msg_start'], manual_df['commit_msg_end'])

    return "\n".join(prompt_fn(start_msg, end_msg) for start_msg, end_msg in message_pairs)
51
+
52
+
53
+ EXAMPLES_END_TO_START = generate_examples(end_to_start=True)
54
+ EXAMPLES_START_TO_END = generate_examples(end_to_start=False)
generation_steps/synthetic_end_to_start.py CHANGED
@@ -1,47 +1,16 @@
1
  import pandas as pd
2
  from tqdm import tqdm
3
 
4
- import config
5
  import generate_annotated_diffs
6
  import statistics
7
- from api_wrappers import grazie_wrapper, hf_data_loader
 
8
 
9
- N_EXAMPLES = 5
10
- GENERATION_MULTIPLIER = 2
11
  REL_INSERTIONS_THRESHOLD = 0.6
12
  GENERATION_ATTEMPTS = 5
13
 
14
 
15
- def get_example_prompt(start_msg, end_msg):
16
- return f"""START OF THE EXAMPLE
17
-
18
- For following the edited message:
19
- START OF THE EDITED COMMIT MESSAGE
20
- {end_msg}
21
- END OF THE EDITED COMMIT MESSAGE
22
-
23
- You would output the following initial commit message:
24
- START OF THE INITIAL COMMIT MESSAGE
25
- {start_msg}
26
- END OF THE INITIAL COMMIT MESSAGE
27
-
28
- END OF THE EXAMPLE"""
29
-
30
-
31
- def generate_examples():
32
- manual_df = hf_data_loader.load_raw_rewriting_dataset_as_pandas()[['commit_msg_start', 'commit_msg_end']]
33
- manual_df = manual_df.sample(n=N_EXAMPLES, random_state=config.RANDOM_STATE)
34
- examples = [
35
- get_example_prompt(row['commit_msg_start'], row['commit_msg_end'])
36
- for _, row in manual_df.iterrows()
37
- ]
38
-
39
- return "\n".join(examples)
40
-
41
-
42
- EXAMPLES = generate_examples()
43
-
44
-
45
  def build_prompt(reference, diff):
46
  return f"""A software developer uses a LLM to generate commit messages.
47
 
@@ -60,7 +29,7 @@ Your task is to print the initial, LLM-generated commit message.
60
  The message you print must share some fragments with the edited message.
61
  Here are some examples of what you should output:
62
  START OF THE EXAMPLES LIST
63
- {EXAMPLES}
64
  END OF THE EXAMPLES LIST
65
 
66
 
 
1
  import pandas as pd
2
  from tqdm import tqdm
3
 
 
4
  import generate_annotated_diffs
5
  import statistics
6
+ from api_wrappers import grazie_wrapper
7
+ from generation_steps import examples
8
 
9
+ GENERATION_MULTIPLIER = 1
 
10
  REL_INSERTIONS_THRESHOLD = 0.6
11
  GENERATION_ATTEMPTS = 5
12
 
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  def build_prompt(reference, diff):
15
  return f"""A software developer uses a LLM to generate commit messages.
16
 
 
29
  The message you print must share some fragments with the edited message.
30
  Here are some examples of what you should output:
31
  START OF THE EXAMPLES LIST
32
+ {examples.EXAMPLES_END_TO_START}
33
  END OF THE EXAMPLES LIST
34
 
35
 
generation_steps/synthetic_start_to_end.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from tqdm import tqdm
3
+
4
+ import generate_annotated_diffs
5
+ import statistics
6
+ from api_wrappers import grazie_wrapper
7
+ from generation_steps import examples
8
+
9
+ GENERATION_MULTIPLIER = 1
10
+ REL_DELETIONS_THRESHOLD = 0.75
11
+ GENERATION_ATTEMPTS = 5
12
+
13
+
14
def build_prompt(reference, diff):
    """Build the prompt asking the model to improve a generated commit message.

    :param reference: the LLM-generated commit message to be rewritten.
    :param diff: the source code changes the commit message describes.
    :return: the full prompt text, ending with the ``OUTPUT`` token after
        which the model is asked to print the improved message.
    """
    # The literal's lines sit at column 0 on purpose: any indentation here
    # would end up inside the prompt text.
    return f"""A LLM generated a commit message for the following source code changes:
START OF THE SOURCE CODE CHANGES
{diff}
END OF THE SOURCE CODE CHANGES

Here is the message the LLM generated:
START OF THE COMMIT MESSAGE
{reference}
END OF THE COMMIT MESSAGE

This generated message is not perfect. Your task is to rewrite and improve it.
You have to simulate a human software developer who manually rewrites the LLM-generated commit message,
so the message you print must share some fragments with the generated message.
Your message should be concise.
Here are some examples of what you should output:
START OF THE EXAMPLES LIST
{examples.EXAMPLES_START_TO_END}
END OF THE EXAMPLES LIST


Print only the improved commit message's text after the
token "OUTPUT".

OUTPUT"""
39
+
40
+
41
def generate_start_msg(end_msg, diff):
    """Generate a rewritten commit message for ``end_msg``, with retries.

    NOTE: the names are misleading in this module. ``end_msg`` is passed to
    ``build_prompt`` as the LLM-generated reference, and the value returned
    here is the model's *improved* message. The name and signature are left
    unchanged so existing callers keep working.

    The model is queried up to ``GENERATION_ATTEMPTS`` times. The first
    candidate whose ``"deletions"`` statistic is below
    ``REL_DELETIONS_THRESHOLD`` is returned immediately. If no candidate
    passes, the one with the smallest ``"deletions"`` value is returned.

    :param end_msg: the reference message the model is asked to rewrite.
    :param diff: the source code changes the message describes.
    :return: the selected model output.
    """
    prompt = build_prompt(reference=end_msg, diff=diff)
    rejected = []

    for _ in range(GENERATION_ATTEMPTS):
        candidate = grazie_wrapper.generate_for_prompt(prompt)

        # NOTE(review): the candidate is passed as ``start_msg`` and the
        # reference as ``end_msg``, which looks reversed for the
        # start-to-end direction. Confirm that "deletions" measured this way
        # is the intended metric.
        stats = statistics.get_statistics(
            start_msg=candidate,
            end_msg=end_msg,
            annotated_msg=generate_annotated_diffs.get_annotated_diff(candidate, end_msg),
        )
        if stats["deletions"] < REL_DELETIONS_THRESHOLD:
            return candidate
        rejected.append((stats["deletions"], candidate))

    # Tuples compare by deletions first, then by message text, so min() picks
    # the same element a full sort would place first.
    return min(rejected)[1]
58
+
59
+
60
+ COLS_TO_KEEP = ["hash", "repo", "commit_msg_start", "mods", "session", "end_to_start"]
61
+
62
+
63
def transform(df):
    """Append synthetic start-to-end rows to the dataset.

    For every input row, ``GENERATION_MULTIPLIER`` rewritten messages are
    generated from the row's ``commit_msg_start`` and ``mods``. Each one is
    stored as ``commit_msg_end`` in a new row that copies the
    ``COLS_TO_KEEP`` columns from the source row.

    Side effect: a ``start_to_end`` column set to ``False`` is added to the
    passed-in ``df`` in place. Generated rows carry ``start_to_end = True``.

    :param df: input frame; must contain the ``COLS_TO_KEEP`` columns.
    :return: a new frame with the original rows followed by the generated
        rows, re-indexed from zero.
    """
    df['start_to_end'] = False

    # "commit_msg_end" comes first so the column order matches the original.
    generated_data = {"commit_msg_end": [], **{col: [] for col in COLS_TO_KEEP}}

    for _, row in tqdm(df.iterrows(), total=len(df)):
        for _ in range(GENERATION_MULTIPLIER):
            # Despite the helper's name, the row's start message is the
            # reference and the result is used as the end message.
            commit_msg_end_pred = generate_start_msg(end_msg=row["commit_msg_start"],
                                                     diff=row["mods"])

            generated_data["commit_msg_end"].append(commit_msg_end_pred)
            for col in COLS_TO_KEEP:
                generated_data[col].append(row[col])

    generated_df = pd.DataFrame.from_dict(generated_data)
    generated_df['start_to_end'] = True

    return pd.concat([df, generated_df], ignore_index=True)