Spaces:

JetBrains-Research
/

commit-message-editing-visualization

Sleeping

App Files Files Community

Petr Tsvetkov committed on Apr 13

Commit

13e3243

•

1 Parent(s): d7e2287

Start-to-end generation

Browse files

Files changed (5) hide show

change_visualizer.py +5 -0
generate_synthetic_dataset.py +10 -1
generation_steps/examples.py +54 -0
generation_steps/synthetic_end_to_start.py +4 -35
generation_steps/synthetic_start_to_end.py +85 -0

change_visualizer.py CHANGED Viewed

@@ -21,6 +21,7 @@ def update_dataset_view(diff_idx, df):
             df.iloc[diff_idx]['commit_msg_end'],
             df.iloc[diff_idx]['session'],
             str(df.iloc[diff_idx]['end_to_start']),
             f"https://github.com/{df.iloc[diff_idx]['repo']}/commit/{df.iloc[diff_idx]['hash']}",)
@@ -56,6 +57,9 @@ if __name__ == '__main__':
             is_end_to_start_view = gr.Textbox(interactive=False,
                                               label="Is generated on the 'end-to-start' synthesis step?",
                                               container=True)
             link_view = gr.Markdown()
             view = [
@@ -64,6 +68,7 @@ if __name__ == '__main__':
                 end_view,
                 session_view,
                 is_end_to_start_view,
                 link_view
             ]

             df.iloc[diff_idx]['commit_msg_end'],
             df.iloc[diff_idx]['session'],
             str(df.iloc[diff_idx]['end_to_start']),
+            str(df.iloc[diff_idx]['start_to_end']),
             f"https://github.com/{df.iloc[diff_idx]['repo']}/commit/{df.iloc[diff_idx]['hash']}",)
             is_end_to_start_view = gr.Textbox(interactive=False,
                                               label="Is generated on the 'end-to-start' synthesis step?",
                                               container=True)
+            is_start_to_end_view = gr.Textbox(interactive=False,
+                                              label="Is generated on the 'start-to-end' synthesis step?",
+                                              container=True)
             link_view = gr.Markdown()
             view = [
                 end_view,
                 session_view,
                 is_end_to_start_view,
+                is_start_to_end_view,
                 link_view
             ]

generate_synthetic_dataset.py CHANGED Viewed

@@ -1,10 +1,12 @@
 import config
 from api_wrappers import hf_data_loader
-from generation_steps import synthetic_end_to_start
 def run():
     df = hf_data_loader.load_processed_rewriting_dataset_as_pandas()
     print(f"End -> start synthesis:")
     print(f"GENERATION_MULTIPLIER = {synthetic_end_to_start.GENERATION_MULTIPLIER}")
@@ -13,6 +15,13 @@ def run():
     df = synthetic_end_to_start.transform(df)
     print("Done")
     df.to_csv(config.SYNTHETIC_DATASET_ARTIFACT)

 import config
 from api_wrappers import hf_data_loader
+from generation_steps import synthetic_end_to_start, examples, synthetic_start_to_end
 def run():
+    """Load the processed rewriting dataset, run the end-to-start and then the
+    start-to-end synthesis step on it, and write the result to
+    ``config.SYNTHETIC_DATASET_ARTIFACT`` as CSV. The settings of each step are
+    printed before the step runs."""
     df = hf_data_loader.load_processed_rewriting_dataset_as_pandas()
+    print(f"NUMBER OF EXAMPLES PER PROMPT = {examples.N_EXAMPLES}")
+    print()
     print(f"End -> start synthesis:")
     print(f"GENERATION_MULTIPLIER = {synthetic_end_to_start.GENERATION_MULTIPLIER}")
     df = synthetic_end_to_start.transform(df)
     print("Done")
+    print(f"Start -> end synthesis:")
+    print(f"GENERATION_MULTIPLIER = {synthetic_start_to_end.GENERATION_MULTIPLIER}")
+    print(f"REL_DELETIONS_THRESHOLD = {synthetic_start_to_end.REL_DELETIONS_THRESHOLD}")
+    print(f"GENERATION_ATTEMPTS = {synthetic_start_to_end.GENERATION_ATTEMPTS}")
+    df = synthetic_start_to_end.transform(df)
+    print("Done")
     df.to_csv(config.SYNTHETIC_DATASET_ARTIFACT)

generation_steps/examples.py ADDED Viewed

	@@ -0,0 +1,54 @@

+import config
+from api_wrappers import hf_data_loader
+# Number of dataset rows sampled below to serve as few-shot examples in the prompts.
+N_EXAMPLES = 5
+def get_example_prompt_end_to_start(start_msg, end_msg):
+    """Render one few-shot example for the end-to-start direction: the edited
+    message (``end_msg``) is shown as the input and the initial message
+    (``start_msg``) as the expected output. Returns the example as one string."""
+    example_lines = [
+        "START OF THE EXAMPLE",
+        "For the following edited commit message:",
+        "START OF THE EDITED COMMIT MESSAGE",
+        f"{end_msg}",
+        "END OF THE EDITED COMMIT MESSAGE",
+        "You would output the following initial commit message:",
+        "START OF THE INITIAL COMMIT MESSAGE",
+        f"{start_msg}",
+        "END OF THE INITIAL COMMIT MESSAGE",
+        "END OF THE EXAMPLE",
+    ]
+    return "\n".join(example_lines)
+def get_example_prompt_start_to_end(start_msg, end_msg):
+    """Render one few-shot example for the start-to-end direction: the
+    LLM-generated message (``start_msg``) is shown as the input and the improved
+    message (``end_msg``) as the expected output. Returns the example as one string."""
+    example_lines = [
+        "START OF THE EXAMPLE",
+        "For the following LLM-generated commit message:",
+        "START OF THE GENERATED COMMIT MESSAGE",
+        f"{start_msg}",
+        "END OF THE GENERATED COMMIT MESSAGE",
+        "You would output the following improved commit message:",
+        "START OF THE IMPROVED COMMIT MESSAGE",
+        f"{end_msg}",
+        "END OF THE IMPROVED COMMIT MESSAGE",
+        "END OF THE EXAMPLE",
+    ]
+    return "\n".join(example_lines)
+# Few-shot pool: keep only the start/end message columns of the raw rewriting
+# dataset, then sample N_EXAMPLES rows with random_state=config.RANDOM_STATE.
+# These are module-level statements, so the load happens when the module is imported.
+manual_df = hf_data_loader.load_raw_rewriting_dataset_as_pandas()[['commit_msg_start', 'commit_msg_end']]
+manual_df = manual_df.sample(n=N_EXAMPLES, random_state=config.RANDOM_STATE)
+def generate_examples(end_to_start):
+    """Format every row of ``manual_df`` as a few-shot example and join the
+    examples with newlines. A truthy ``end_to_start`` selects the end-to-start
+    example template; otherwise the start-to-end template is used."""
+    if end_to_start:
+        prompt_fn = get_example_prompt_end_to_start
+    else:
+        prompt_fn = get_example_prompt_start_to_end
+    return "\n".join(
+        prompt_fn(row['commit_msg_start'], row['commit_msg_end'])
+        for _, row in manual_df.iterrows()
+    )
+# Example blocks for both directions, built once from the sampled rows at module level.
+EXAMPLES_END_TO_START = generate_examples(end_to_start=True)
+EXAMPLES_START_TO_END = generate_examples(end_to_start=False)

generation_steps/synthetic_end_to_start.py CHANGED Viewed

@@ -1,47 +1,16 @@
 import pandas as pd
 from tqdm import tqdm
-import config
 import generate_annotated_diffs
 import statistics
-from api_wrappers import grazie_wrapper, hf_data_loader
-N_EXAMPLES = 5
-GENERATION_MULTIPLIER = 2
 REL_INSERTIONS_THRESHOLD = 0.6
 GENERATION_ATTEMPTS = 5
-def get_example_prompt(start_msg, end_msg):
-    return f"""START OF THE EXAMPLE
-For following the edited message:
-START OF THE EDITED COMMIT MESSAGE
-{end_msg}
-END OF THE EDITED COMMIT MESSAGE
-You would output the following initial commit message:
-START OF THE INITIAL COMMIT MESSAGE
-{start_msg}
-END OF THE INITIAL COMMIT MESSAGE
-END OF THE EXAMPLE"""
-def generate_examples():
-    manual_df = hf_data_loader.load_raw_rewriting_dataset_as_pandas()[['commit_msg_start', 'commit_msg_end']]
-    manual_df = manual_df.sample(n=N_EXAMPLES, random_state=config.RANDOM_STATE)
-    examples = [
-        get_example_prompt(row['commit_msg_start'], row['commit_msg_end'])
-        for _, row in manual_df.iterrows()
-    ]
-    return "\n".join(examples)
-EXAMPLES = generate_examples()
 def build_prompt(reference, diff):
     return f"""A software developer uses a LLM to generate commit messages.
@@ -60,7 +29,7 @@ Your task is to print the initial, LLM-generated commit message.
 The message you print must share some fragments with the edited message.
 Here are some examples of what you should output:
 START OF THE EXAMPLES LIST
-{EXAMPLES}
 END OF THE EXAMPLES LIST

 import pandas as pd
 from tqdm import tqdm
 import generate_annotated_diffs
 import statistics
+from api_wrappers import grazie_wrapper
+from generation_steps import examples
+GENERATION_MULTIPLIER = 1
 REL_INSERTIONS_THRESHOLD = 0.6
 GENERATION_ATTEMPTS = 5
 def build_prompt(reference, diff):
     return f"""A software developer uses a LLM to generate commit messages.
 The message you print must share some fragments with the edited message.
 Here are some examples of what you should output:
 START OF THE EXAMPLES LIST
+{examples.EXAMPLES_END_TO_START}
 END OF THE EXAMPLES LIST

generation_steps/synthetic_start_to_end.py ADDED Viewed

	@@ -0,0 +1,85 @@

+import pandas as pd
+from tqdm import tqdm
+import generate_annotated_diffs
+import statistics
+from api_wrappers import grazie_wrapper
+from generation_steps import examples
+# Number of synthetic rows that transform() generates per input row.
+GENERATION_MULTIPLIER = 1
+# A candidate is accepted as soon as its "deletions" statistic is below this value.
+REL_DELETIONS_THRESHOLD = 0.75
+# Maximum number of LLM calls made for one message before falling back to the best candidate.
+GENERATION_ATTEMPTS = 5
+def build_prompt(reference, diff):
+    """Build the prompt that asks the LLM to rewrite and improve a generated
+    commit message. ``diff`` is inserted as the source code changes and
+    ``reference`` as the LLM-generated message; the few-shot block comes from
+    ``examples.EXAMPLES_START_TO_END``. Returns the prompt string, which ends
+    with the "OUTPUT" token."""
+    return f"""A LLM generated a commit message for the following source code changes:
+START OF THE SOURCE CODE CHANGES
+{diff}
+END OF THE SOURCE CODE CHANGES
+Here is the message the LLM generated:
+START OF THE COMMIT MESSAGE
+{reference}
+END OF THE COMMIT MESSAGE
+This generated message is not perfect. Your task is to rewrite and improve it.
+You have to simulate a human software developer who manually rewrites the LLM-generated commit message,
+so the message you print must share some fragments with the generated message.
+Your message should be concise.
+Here are some examples of what you should output:
+START OF THE EXAMPLES LIST
+{examples.EXAMPLES_START_TO_END}
+END OF THE EXAMPLES LIST
+Print only the improved commit message's text after the
+token "OUTPUT".
+OUTPUT"""
+def generate_start_msg(end_msg, diff):
+    """Generate a rewritten commit message for ``end_msg`` given ``diff``.
+    The LLM is queried up to GENERATION_ATTEMPTS times with the same prompt; the
+    first candidate whose "deletions" statistic is below REL_DELETIONS_THRESHOLD
+    is returned immediately. If none qualifies, the candidate with the smallest
+    "deletions" value is returned.
+    Note: transform() passes the row's ``commit_msg_start`` as ``end_msg`` and
+    stores the result as ``commit_msg_end``, so the parameter and function names
+    do not reflect the direction used in this module.
+    NOTE(review): the candidate is passed to the statistics as ``start_msg`` and
+    the input as ``end_msg`` - confirm this is the intended direction."""
+    prompt = build_prompt(reference=end_msg, diff=diff)
+    rejected = []
+    for _ in range(GENERATION_ATTEMPTS):
+        candidate = grazie_wrapper.generate_for_prompt(prompt)
+        annotated = generate_annotated_diffs.get_annotated_diff(candidate, end_msg)
+        stats = statistics.get_statistics(start_msg=candidate, end_msg=end_msg,
+                                          annotated_msg=annotated)
+        deletions = stats["deletions"]
+        if deletions < REL_DELETIONS_THRESHOLD:
+            return candidate
+        rejected.append((deletions, candidate))
+    # No candidate passed the threshold: fall back to the one with the lowest score.
+    return sorted(rejected)[0][1]
+# Columns copied unchanged from each source row into the generated rows in transform().
+COLS_TO_KEEP = ["hash", "repo", "commit_msg_start", "mods", "session", "end_to_start"]
+def transform(df):
+    """Append synthetic start-to-end rows to ``df`` and return the combined frame.
+    For every input row, GENERATION_MULTIPLIER new rows are created whose
+    ``commit_msg_end`` is generated from the row's ``commit_msg_start`` and
+    ``mods``; the COLS_TO_KEEP columns are copied from the source row. Existing
+    rows get ``start_to_end = False`` (the column is added to ``df`` in place),
+    generated rows get ``start_to_end = True``."""
+    df['start_to_end'] = False
+    # One list per output column; "commit_msg_end" first, then the copied columns.
+    generated_data = {col: [] for col in ["commit_msg_end", *COLS_TO_KEEP]}
+    for _, row in tqdm(df.iterrows(), total=len(df)):
+        for _attempt in range(GENERATION_MULTIPLIER):
+            generated_data["commit_msg_end"].append(
+                generate_start_msg(end_msg=row["commit_msg_start"], diff=row["mods"])
+            )
+            for col in COLS_TO_KEEP:
+                generated_data[col].append(row[col])
+    generated_df = pd.DataFrame(generated_data)
+    generated_df['start_to_end'] = True
+    return pd.concat([df, generated_df], ignore_index=True)