Petr Tsvetkov
committed on
Commit
•
13e3243
1
Parent(s):
d7e2287
Start-to-end generation
Browse files
change_visualizer.py
CHANGED
@@ -21,6 +21,7 @@ def update_dataset_view(diff_idx, df):
|
|
21 |
df.iloc[diff_idx]['commit_msg_end'],
|
22 |
df.iloc[diff_idx]['session'],
|
23 |
str(df.iloc[diff_idx]['end_to_start']),
|
|
|
24 |
f"https://github.com/{df.iloc[diff_idx]['repo']}/commit/{df.iloc[diff_idx]['hash']}",)
|
25 |
|
26 |
|
@@ -56,6 +57,9 @@ if __name__ == '__main__':
|
|
56 |
is_end_to_start_view = gr.Textbox(interactive=False,
|
57 |
label="Is generated on the 'end-to-start' synthesis step?",
|
58 |
container=True)
|
|
|
|
|
|
|
59 |
link_view = gr.Markdown()
|
60 |
|
61 |
view = [
|
@@ -64,6 +68,7 @@ if __name__ == '__main__':
|
|
64 |
end_view,
|
65 |
session_view,
|
66 |
is_end_to_start_view,
|
|
|
67 |
link_view
|
68 |
]
|
69 |
|
|
|
21 |
df.iloc[diff_idx]['commit_msg_end'],
|
22 |
df.iloc[diff_idx]['session'],
|
23 |
str(df.iloc[diff_idx]['end_to_start']),
|
24 |
+
str(df.iloc[diff_idx]['start_to_end']),
|
25 |
f"https://github.com/{df.iloc[diff_idx]['repo']}/commit/{df.iloc[diff_idx]['hash']}",)
|
26 |
|
27 |
|
|
|
57 |
is_end_to_start_view = gr.Textbox(interactive=False,
|
58 |
label="Is generated on the 'end-to-start' synthesis step?",
|
59 |
container=True)
|
60 |
+
is_start_to_end_view = gr.Textbox(interactive=False,
|
61 |
+
label="Is generated on the 'start-to-end' synthesis step?",
|
62 |
+
container=True)
|
63 |
link_view = gr.Markdown()
|
64 |
|
65 |
view = [
|
|
|
68 |
end_view,
|
69 |
session_view,
|
70 |
is_end_to_start_view,
|
71 |
+
is_start_to_end_view,
|
72 |
link_view
|
73 |
]
|
74 |
|
generate_synthetic_dataset.py
CHANGED
@@ -1,10 +1,12 @@
|
|
1 |
import config
|
2 |
from api_wrappers import hf_data_loader
|
3 |
-
from generation_steps import synthetic_end_to_start
|
4 |
|
5 |
|
6 |
def run():
|
7 |
df = hf_data_loader.load_processed_rewriting_dataset_as_pandas()
|
|
|
|
|
8 |
|
9 |
print(f"End -> start synthesis:")
|
10 |
print(f"GENERATION_MULTIPLIER = {synthetic_end_to_start.GENERATION_MULTIPLIER}")
|
@@ -13,6 +15,13 @@ def run():
|
|
13 |
df = synthetic_end_to_start.transform(df)
|
14 |
print("Done")
|
15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
df.to_csv(config.SYNTHETIC_DATASET_ARTIFACT)
|
17 |
|
18 |
|
|
|
1 |
import config
|
2 |
from api_wrappers import hf_data_loader
|
3 |
+
from generation_steps import synthetic_end_to_start, examples, synthetic_start_to_end
|
4 |
|
5 |
|
6 |
def run():
|
7 |
df = hf_data_loader.load_processed_rewriting_dataset_as_pandas()
|
8 |
+
print(f"NUMBER OF EXAMPLES PER PROMPT = {examples.N_EXAMPLES}")
|
9 |
+
print()
|
10 |
|
11 |
print(f"End -> start synthesis:")
|
12 |
print(f"GENERATION_MULTIPLIER = {synthetic_end_to_start.GENERATION_MULTIPLIER}")
|
|
|
15 |
df = synthetic_end_to_start.transform(df)
|
16 |
print("Done")
|
17 |
|
18 |
+
# Progress label for the second synthesis stage; mirrors the "End -> start synthesis:" label above.
print(f"Start -> end synthesis:")
|
19 |
+
print(f"GENERATION_MULTIPLIER = {synthetic_start_to_end.GENERATION_MULTIPLIER}")
|
20 |
+
print(f"REL_DELETIONS_THRESHOLD = {synthetic_start_to_end.REL_DELETIONS_THRESHOLD}")
|
21 |
+
print(f"GENERATION_ATTEMPTS = {synthetic_start_to_end.GENERATION_ATTEMPTS}")
|
22 |
+
df = synthetic_start_to_end.transform(df)
|
23 |
+
print("Done")
|
24 |
+
|
25 |
df.to_csv(config.SYNTHETIC_DATASET_ARTIFACT)
|
26 |
|
27 |
|
generation_steps/examples.py
ADDED
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import config
|
2 |
+
from api_wrappers import hf_data_loader
|
3 |
+
|
4 |
+
# Number of rows sampled from the raw rewriting dataset to render as few-shot examples.
N_EXAMPLES = 5
|
5 |
+
|
6 |
+
|
7 |
+
def get_example_prompt_end_to_start(start_msg, end_msg):
    """Render one few-shot example for the end-to-start direction.

    The edited message is presented first as the input and the initial
    message second as the expected output.

    :param start_msg: text placed in the INITIAL COMMIT MESSAGE section
    :param end_msg: text placed in the EDITED COMMIT MESSAGE section
    :return: the rendered example as one multi-line string
    """
    example = f"""START OF THE EXAMPLE

For the following edited commit message:
START OF THE EDITED COMMIT MESSAGE
{end_msg}
END OF THE EDITED COMMIT MESSAGE

You would output the following initial commit message:
START OF THE INITIAL COMMIT MESSAGE
{start_msg}
END OF THE INITIAL COMMIT MESSAGE

END OF THE EXAMPLE"""
    return example
|
21 |
+
|
22 |
+
|
23 |
+
def get_example_prompt_start_to_end(start_msg, end_msg):
    """Render one few-shot example for the start-to-end direction.

    The LLM-generated message is presented first as the input and the
    improved message second as the expected output.

    :param start_msg: text placed in the GENERATED COMMIT MESSAGE section
    :param end_msg: text placed in the IMPROVED COMMIT MESSAGE section
    :return: the rendered example as one multi-line string
    """
    example = f"""START OF THE EXAMPLE

For the following LLM-generated commit message:
START OF THE GENERATED COMMIT MESSAGE
{start_msg}
END OF THE GENERATED COMMIT MESSAGE

You would output the following improved commit message:
START OF THE IMPROVED COMMIT MESSAGE
{end_msg}
END OF THE IMPROVED COMMIT MESSAGE

END OF THE EXAMPLE"""
    return example
|
37 |
+
|
38 |
+
|
39 |
+
# Few-shot example pool: N_EXAMPLES (start, end) message pairs drawn from the raw
# rewriting dataset, seeded with config.RANDOM_STATE. Runs once, at import time.
manual_df = (
    hf_data_loader.load_raw_rewriting_dataset_as_pandas()[['commit_msg_start', 'commit_msg_end']]
    .sample(n=N_EXAMPLES, random_state=config.RANDOM_STATE)
)
|
41 |
+
|
42 |
+
|
43 |
+
def generate_examples(end_to_start):
    """Render every row of ``manual_df`` as a few-shot example and join them.

    :param end_to_start: truthy to use the end-to-start example layout,
        falsy to use the start-to-end layout
    :return: all rendered examples joined with a single newline
    """
    if end_to_start:
        render = get_example_prompt_end_to_start
    else:
        render = get_example_prompt_start_to_end

    return "\n".join(
        render(pair['commit_msg_start'], pair['commit_msg_end'])
        for _, pair in manual_df.iterrows()
    )
|
51 |
+
|
52 |
+
|
53 |
+
# Examples text in the "edited message -> initial message" layout, rendered once at import time.
EXAMPLES_END_TO_START = generate_examples(end_to_start=True)
|
54 |
+
# Examples text in the "generated message -> improved message" layout, rendered once at import time.
EXAMPLES_START_TO_END = generate_examples(end_to_start=False)
|
generation_steps/synthetic_end_to_start.py
CHANGED
@@ -1,47 +1,16 @@
|
|
1 |
import pandas as pd
|
2 |
from tqdm import tqdm
|
3 |
|
4 |
-
import config
|
5 |
import generate_annotated_diffs
|
6 |
import statistics
|
7 |
-
from api_wrappers import grazie_wrapper
|
|
|
8 |
|
9 |
-
|
10 |
-
GENERATION_MULTIPLIER = 2
|
11 |
REL_INSERTIONS_THRESHOLD = 0.6
|
12 |
GENERATION_ATTEMPTS = 5
|
13 |
|
14 |
|
15 |
-
def get_example_prompt(start_msg, end_msg):
|
16 |
-
return f"""START OF THE EXAMPLE
|
17 |
-
|
18 |
-
For following the edited message:
|
19 |
-
START OF THE EDITED COMMIT MESSAGE
|
20 |
-
{end_msg}
|
21 |
-
END OF THE EDITED COMMIT MESSAGE
|
22 |
-
|
23 |
-
You would output the following initial commit message:
|
24 |
-
START OF THE INITIAL COMMIT MESSAGE
|
25 |
-
{start_msg}
|
26 |
-
END OF THE INITIAL COMMIT MESSAGE
|
27 |
-
|
28 |
-
END OF THE EXAMPLE"""
|
29 |
-
|
30 |
-
|
31 |
-
def generate_examples():
|
32 |
-
manual_df = hf_data_loader.load_raw_rewriting_dataset_as_pandas()[['commit_msg_start', 'commit_msg_end']]
|
33 |
-
manual_df = manual_df.sample(n=N_EXAMPLES, random_state=config.RANDOM_STATE)
|
34 |
-
examples = [
|
35 |
-
get_example_prompt(row['commit_msg_start'], row['commit_msg_end'])
|
36 |
-
for _, row in manual_df.iterrows()
|
37 |
-
]
|
38 |
-
|
39 |
-
return "\n".join(examples)
|
40 |
-
|
41 |
-
|
42 |
-
EXAMPLES = generate_examples()
|
43 |
-
|
44 |
-
|
45 |
def build_prompt(reference, diff):
|
46 |
return f"""A software developer uses a LLM to generate commit messages.
|
47 |
|
@@ -60,7 +29,7 @@ Your task is to print the initial, LLM-generated commit message.
|
|
60 |
The message you print must share some fragments with the edited message.
|
61 |
Here are some examples of what you should output:
|
62 |
START OF THE EXAMPLES LIST
|
63 |
-
{
|
64 |
END OF THE EXAMPLES LIST
|
65 |
|
66 |
|
|
|
1 |
import pandas as pd
|
2 |
from tqdm import tqdm
|
3 |
|
|
|
4 |
import generate_annotated_diffs
|
5 |
import statistics
|
6 |
+
from api_wrappers import grazie_wrapper
|
7 |
+
from generation_steps import examples
|
8 |
|
9 |
+
GENERATION_MULTIPLIER = 1
|
|
|
10 |
REL_INSERTIONS_THRESHOLD = 0.6
|
11 |
GENERATION_ATTEMPTS = 5
|
12 |
|
13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
def build_prompt(reference, diff):
|
15 |
return f"""A software developer uses a LLM to generate commit messages.
|
16 |
|
|
|
29 |
The message you print must share some fragments with the edited message.
|
30 |
Here are some examples of what you should output:
|
31 |
START OF THE EXAMPLES LIST
|
32 |
+
{examples.EXAMPLES_END_TO_START}
|
33 |
END OF THE EXAMPLES LIST
|
34 |
|
35 |
|
generation_steps/synthetic_start_to_end.py
ADDED
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
from tqdm import tqdm
|
3 |
+
|
4 |
+
import generate_annotated_diffs
|
5 |
+
import statistics
|
6 |
+
from api_wrappers import grazie_wrapper
|
7 |
+
from generation_steps import examples
|
8 |
+
|
9 |
+
# How many synthetic messages transform() generates for each input row.
GENERATION_MULTIPLIER = 1
|
10 |
+
# A generated message is accepted immediately when its stats["deletions"] value is strictly
# below this threshold. NOTE(review): presumably a relative (0..1) share — confirm in statistics.
REL_DELETIONS_THRESHOLD = 0.75
|
11 |
+
# Maximum number of generation calls per message before falling back to the best rejected candidate.
GENERATION_ATTEMPTS = 5
|
12 |
+
|
13 |
+
|
14 |
+
def build_prompt(reference, diff):
    """Build the LLM prompt that asks for an improved version of a commit message.

    The prompt embeds the code changes, the message to improve, and the
    pre-rendered start-to-end examples, and ends with the literal token
    ``OUTPUT`` after which the model is told to print its answer.

    :param reference: the LLM-generated commit message to be rewritten
    :param diff: the source code changes the message describes
    :return: the full prompt text
    """
    prompt = f"""A LLM generated a commit message for the following source code changes:
START OF THE SOURCE CODE CHANGES
{diff}
END OF THE SOURCE CODE CHANGES

Here is the message the LLM generated:
START OF THE COMMIT MESSAGE
{reference}
END OF THE COMMIT MESSAGE

This generated message is not perfect. Your task is to rewrite and improve it.
You have to simulate a human software developer who manually rewrites the LLM-generated commit message,
so the message you print must share some fragments with the generated message.
Your message should be concise.
Here are some examples of what you should output:
START OF THE EXAMPLES LIST
{examples.EXAMPLES_START_TO_END}
END OF THE EXAMPLES LIST


Print only the improved commit message's text after the
token "OUTPUT".

OUTPUT"""
    return prompt
|
39 |
+
|
40 |
+
|
41 |
+
def generate_start_msg(end_msg, diff):
    """Generate a rewritten commit message, retrying until it is close enough.

    Up to GENERATION_ATTEMPTS candidates are requested for the same prompt.
    The first candidate whose ``stats["deletions"]`` is strictly below
    REL_DELETIONS_THRESHOLD is returned at once. If none qualifies, the
    candidate that sorts first among ``(deletions, text)`` tuples is returned,
    i.e. the lowest deletions value with ties broken by message text.

    NOTE(review): the names look inherited from the end-to-start step. In this
    module transform() passes the row's ``commit_msg_start`` as ``end_msg`` and
    stores the result as ``commit_msg_end`` — confirm that the start/end roles
    handed to the statistics helpers are the intended ones.

    :param end_msg: reference message embedded in the prompt and compared
        against each candidate
    :param diff: source code changes embedded in the prompt
    :return: the selected generated message
    """
    prompt = build_prompt(reference=end_msg, diff=diff)
    rejected = []

    for _ in range(GENERATION_ATTEMPTS):
        candidate = grazie_wrapper.generate_for_prompt(prompt)
        annotated = generate_annotated_diffs.get_annotated_diff(candidate, end_msg)
        # `statistics` is whatever module this file imports under that name; it must provide get_statistics.
        stats = statistics.get_statistics(start_msg=candidate, end_msg=end_msg, annotated_msg=annotated)
        deletions = stats["deletions"]

        if deletions < REL_DELETIONS_THRESHOLD:
            return candidate
        rejected.append((deletions, candidate))

    # No candidate passed the threshold: fall back to the smallest (deletions, text) pair.
    return sorted(rejected)[0][1]
|
58 |
+
|
59 |
+
|
60 |
+
# Columns copied unchanged from each source row onto the synthetic rows built by transform().
COLS_TO_KEEP = ["hash", "repo", "commit_msg_start", "mods", "session", "end_to_start"]
|
61 |
+
|
62 |
+
|
63 |
+
def transform(df):
    """Append start-to-end synthetic rows to the dataset.

    For every row of ``df``, GENERATION_MULTIPLIER messages are generated from
    the row's ``commit_msg_start`` and ``mods`` and stored as ``commit_msg_end``
    on new rows that otherwise copy the COLS_TO_KEEP columns. Existing rows are
    flagged ``start_to_end = False``, new rows ``start_to_end = True``.

    Note that the ``start_to_end`` column is written onto the passed-in frame
    itself before the concatenation.

    :param df: DataFrame containing at least the COLS_TO_KEEP columns
    :return: a new DataFrame with the original rows followed by the generated
        rows, re-indexed from zero
    """
    df['start_to_end'] = False

    # One list per output column; "commit_msg_end" comes first, then the copied columns.
    columns = {name: [] for name in ("commit_msg_end", *COLS_TO_KEEP)}

    for _, source_row in tqdm(df.iterrows(), total=len(df)):
        for _copy in range(GENERATION_MULTIPLIER):
            improved_msg = generate_start_msg(end_msg=source_row["commit_msg_start"], diff=source_row["mods"])

            columns["commit_msg_end"].append(improved_msg)
            for name in COLS_TO_KEEP:
                columns[name].append(source_row[name])

    synthetic_df = pd.DataFrame.from_dict(columns)
    synthetic_df['start_to_end'] = True

    return pd.concat([df, synthetic_df], ignore_index=True)
|