Spaces:

JetBrains-Research
/

commit-rewriting-visualization

Sleeping

App Files Files Community

Petr Tsvetkov committed on Apr 13

Commit

a8a595d

•

1 Parent(s): e2a35c0

- New version of the end->start synthetics samples generation

Browse files

Files changed (9) hide show

api_wrappers/__init__.py +0 -0
api_wrappers/grazie_wrapper.py +34 -0
hf_data_loader.py → api_wrappers/hf_data_loader.py +10 -0
config.py +3 -0
generate_annotated_diffs.py +4 -10
generate_synthetic_dataset.py +11 -105
generation_steps/__init__.py +0 -0
generation_steps/synthetic_end_to_start.py +116 -0
statistics.py +23 -24

api_wrappers/__init__.py ADDED Viewed

File without changes

api_wrappers/grazie_wrapper.py ADDED Viewed

	@@ -0,0 +1,34 @@

+import time
+from grazie.api.client.chat.prompt import ChatPrompt
+from grazie.api.client.endpoints import GrazieApiGatewayUrls
+from grazie.api.client.gateway import GrazieApiGatewayClient, GrazieAgent, AuthType
+from grazie.api.client.profiles import LLMProfile
+import config
+# Module-level gateway client, created once at import time and used by generate_for_prompt.
+# It points at the STAGING gateway URL, uses service authentication and reads the JWT from config.
+client = GrazieApiGatewayClient(
+    grazie_agent=GrazieAgent(name="commit-rewriting-synthetic-end-to-start", version="dev"),
+    url=GrazieApiGatewayUrls.STAGING,
+    auth_type=AuthType.SERVICE,
+    grazie_jwt_token=config.GRAZIE_API_JWT_TOKEN
+)
+def generate_for_prompt(prompt):
+    """Send ``prompt`` to the chat model and return the text of its reply.
+
+    The request is retried without limit: after every failed call the function
+    sleeps for ``config.GRAZIE_TIMEOUT_SEC`` seconds and tries again, so it only
+    returns once a reply whose content is not ``None`` has been received.
+
+    :param prompt: user message sent after the fixed system message.
+    :return: the ``content`` of the chat response.
+    """
+    output = None
+    while output is None:
+        try:
+            output = client.chat(
+                chat=ChatPrompt()
+                .add_system("You are a helpful assistant.")
+                .add_user(prompt),
+                profile=LLMProfile("gpt-4-1106-preview")
+            ).content
+        except Exception:
+            # Catch Exception instead of using a bare ``except`` so that
+            # KeyboardInterrupt (Ctrl+C) and SystemExit can still stop this
+            # otherwise endless retry loop.
+            time.sleep(config.GRAZIE_TIMEOUT_SEC)
+    return output

hf_data_loader.py → api_wrappers/hf_data_loader.py RENAMED Viewed

@@ -18,6 +18,16 @@ def load_full_commit_dataset_as_pandas():
         columns={'message': 'reference'})
 def load_synthetic_dataset_as_pandas():
     return load_dataset(config.HF_SYNTHETIC_DATASET_NAME,
                         split=config.HF_SYNTHETIC_DATASET_SPLIT,

         columns={'message': 'reference'})
+def load_processed_rewriting_dataset_as_pandas():
+    """Return the manually rewritten commits together with their code changes.
+
+    The start/end messages of the raw rewriting dataset are left-joined with
+    the ``mods`` column of the full commit dataset on ``(hash, repo)``, so every
+    rewriting row is kept even when no matching commit is found.
+    """
+    key_columns = ["hash", "repo"]
+    rewriting = load_raw_rewriting_dataset_as_pandas()
+    rewriting = rewriting[["hash", "repo", "commit_msg_start", "commit_msg_end"]].set_index(key_columns)
+    commits = load_full_commit_dataset_as_pandas()
+    commits = commits[["hash", "repo", "mods"]].set_index(key_columns)
+    return rewriting.join(other=commits, how='left').reset_index()
 def load_synthetic_dataset_as_pandas():
     return load_dataset(config.HF_SYNTHETIC_DATASET_NAME,
                         split=config.HF_SYNTHETIC_DATASET_SPLIT,

config.py CHANGED Viewed

@@ -1,7 +1,10 @@
 import os
 from pathlib import Path
 GRAZIE_API_JWT_TOKEN = os.environ.get("GRAZIE_API_JWT_TOKEN")
 HF_TOKEN = os.environ.get('HF_TOKEN')

 import os
 from pathlib import Path
+RANDOM_STATE = 42  # seed passed as random_state when sampling the prompt examples
 GRAZIE_API_JWT_TOKEN = os.environ.get("GRAZIE_API_JWT_TOKEN")  # None when the variable is not set
+GRAZIE_TIMEOUT_SEC = 1.0  # pause, in seconds, before retrying a failed Grazie request
 HF_TOKEN = os.environ.get('HF_TOKEN')  # None when the variable is not set

generate_annotated_diffs.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import diff_match_patch as dmp_module
-import hf_data_loader
 def get_annotated_diff(start_text, end_text):
@@ -19,27 +19,21 @@ def get_annotated_diff(start_text, end_text):
     return result
-def annotated_diff_for_row_manual_df(row):
     start = row['commit_msg_start']
     end = row['commit_msg_end']
     return get_annotated_diff(start, end)
-def annotated_diff_for_row_synthetic_df(row):
-    start = row['initial_msg_pred']
-    end = row['reference']
-    return get_annotated_diff(start, end)
 def manual_data_with_annotated_diffs():
     df = hf_data_loader.load_raw_rewriting_dataset_as_pandas()
-    annotated = df.apply(annotated_diff_for_row_manual_df, axis=1)
     df['annotated_diff'] = annotated
     return df
 def synthetic_data_with_annotated_diffs():
     df = hf_data_loader.load_synthetic_dataset_as_pandas()
-    annotated = df.apply(annotated_diff_for_row_synthetic_df, axis=1)
     df['annotated_diff'] = annotated
     return df

 import diff_match_patch as dmp_module
+from api_wrappers import hf_data_loader
 def get_annotated_diff(start_text, end_text):
     return result
+def annotated_diff_for_row(row):
+    """Return the annotated diff between the row's ``commit_msg_start`` and ``commit_msg_end``."""
     start = row['commit_msg_start']
     end = row['commit_msg_end']
     return get_annotated_diff(start, end)
 def manual_data_with_annotated_diffs():
+    """Load the raw rewriting dataset and add an ``annotated_diff`` column computed per row."""
     df = hf_data_loader.load_raw_rewriting_dataset_as_pandas()
+    annotated = df.apply(annotated_diff_for_row, axis=1)
     df['annotated_diff'] = annotated
     return df
 def synthetic_data_with_annotated_diffs():
+    """Load the synthetic dataset and add an ``annotated_diff`` column computed per row.
+
+    The rows are passed to ``annotated_diff_for_row``, which reads the
+    ``commit_msg_start`` and ``commit_msg_end`` columns.
+    """
     df = hf_data_loader.load_synthetic_dataset_as_pandas()
+    annotated = df.apply(annotated_diff_for_row, axis=1)
     df['annotated_diff'] = annotated
     return df

generate_synthetic_dataset.py CHANGED Viewed

@@ -1,114 +1,20 @@
-import time
-from grazie.api.client.chat.prompt import ChatPrompt
-from grazie.api.client.endpoints import GrazieApiGatewayUrls
-from grazie.api.client.gateway import GrazieApiGatewayClient, GrazieAgent, AuthType
-from grazie.api.client.profiles import LLMProfile
-from tqdm import tqdm
 import config
-import hf_data_loader
-client = GrazieApiGatewayClient(
-    grazie_agent=GrazieAgent(name="commit-rewriting-summary-generation", version="dev"),
-    url=GrazieApiGatewayUrls.STAGING,
-    auth_type=AuthType.SERVICE,
-    grazie_jwt_token=config.GRAZIE_API_JWT_TOKEN
-)
-def get_example_prompt(start_msg, end_msg):
-    return f"""START OF THE EXAMPLE
-For following the edited message:
-START OF THE EDITED COMMIT MESSAGE
-{end_msg}
-END OF THE EDITED COMMIT MESSAGE
-You would output the following initial commit message:
-START OF THE INITIAL COMMIT MESSAGE
-{start_msg}
-END OF THE INITIAL COMMIT MESSAGE
-END OF THE EXAMPLE"""
-def generate_examples():
-    manual_df = hf_data_loader.load_raw_rewriting_dataset_as_pandas()[['commit_msg_start', 'commit_msg_end']]
-    examples = [
-        get_example_prompt(row['commit_msg_start'], row['commit_msg_end'])
-        for _, row in manual_df.iterrows()
-    ]
-    return "\n".join(examples)
-EXAMPLES = generate_examples()
-def build_prompt(reference, diff):
-    return f"""A software developer uses a LLM to generate commit messages.
-They generated a commit message for the following source code changes:
-START OF THE SOURCE CODE CHANGES
-{diff}
-END OF THE SOURCE CODE CHANGES
-After generating the commit message the developer understands that it is not perfect. After making dome changes,
-they come up with an edited version of the message. Here is this edited message:
-START OF THE COMMIT MESSAGE
-{reference}
-END OF THE COMMIT MESSAGE
-Your task is to print the initial, LLM-generated commit message. Here are some examples of what you should output:
-START OF THE EXAMPLES LIST
-{EXAMPLES}
-END OF THE EXAMPLES LIST
-Print only the initial commit message's text after the
-token "OUTPUT".
-OUTPUT"""
-def generate_prompt_for_row(row):
-    reference = row['reference']
-    diff = row['mods']
-    return build_prompt(reference, diff)
-def generate_initial_msg(prompt):
-    commit_msg = client.chat(
-        chat=ChatPrompt()
-        .add_system("You are a helpful assistant.")
-        .add_user(prompt),
-        profile=LLMProfile("gpt-4-1106-preview")
-    ).content
-    return commit_msg
-def generate_synthetic_dataset():
-    df = hf_data_loader.load_full_commit_dataset_as_pandas()
-    df['initial_msg_prompt'] = df.apply(generate_prompt_for_row, axis=1)
-    initial_messages_pred = []
-    for i, prompt in enumerate(tqdm(df['initial_msg_ prompt'])):
-        output = None
-        while output is None:
-            try:
-                output = generate_initial_msg(prompt)
-            except:
-                time.sleep(0.5)
-        assert output is not None
-        initial_messages_pred.append(output)
-    df['initial_msg_pred'] = initial_messages_pred
     df.to_csv(config.SYNTHETIC_DATASET_ARTIFACT)
 if __name__ == '__main__':
-    generate_synthetic_dataset()

 import config
+from api_wrappers import hf_data_loader
+from generation_steps import synthetic_end_to_start
+def run():
+    """Generate the end -> start synthetic dataset and write it to the CSV artifact."""
+    df = hf_data_loader.load_processed_rewriting_dataset_as_pandas()
+    # Print the generation settings before the generation step starts.
+    print(f"End -> start synthesis:")
+    print(f"GENERATION_MULTIPLIER = {synthetic_end_to_start.GENERATION_MULTIPLIER}")
+    print(f"REL_INSERTIONS_THRESHOLD = {synthetic_end_to_start.REL_INSERTIONS_THRESHOLD}")
+    print(f"GENERATION_ATTEMPTS = {synthetic_end_to_start.GENERATION_ATTEMPTS}")
+    df = synthetic_end_to_start.transform(df)
+    print("Done")
     df.to_csv(config.SYNTHETIC_DATASET_ARTIFACT)
 if __name__ == '__main__':
+    run()

generation_steps/__init__.py ADDED Viewed

File without changes

generation_steps/synthetic_end_to_start.py ADDED Viewed

	@@ -0,0 +1,116 @@

+import pandas as pd
+from tqdm import tqdm
+import config
+import generate_annotated_diffs
+from api_wrappers import grazie_wrapper, hf_data_loader
+import statistics
+N_EXAMPLES = 5  # number of manual rewriting examples sampled into the prompt
+GENERATION_MULTIPLIER = 2  # synthetic start messages generated for every input row
+REL_INSERTIONS_THRESHOLD = 0.6  # a candidate is accepted when its "insertions" statistic is below this
+GENERATION_ATTEMPTS = 5  # maximum number of model calls per generated start message
+def get_example_prompt(start_msg, end_msg):
+    """Format one few-shot example that pairs an edited message with its initial version.
+
+    :param start_msg: the initial commit message, shown as the expected output.
+    :param end_msg: the edited commit message, shown as the input.
+    :return: the example as a single newline-separated string.
+    """
+    example_lines = [
+        "START OF THE EXAMPLE",
+        "For following the edited message:",
+        "START OF THE EDITED COMMIT MESSAGE",
+        f"{end_msg}",
+        "END OF THE EDITED COMMIT MESSAGE",
+        "You would output the following initial commit message:",
+        "START OF THE INITIAL COMMIT MESSAGE",
+        f"{start_msg}",
+        "END OF THE INITIAL COMMIT MESSAGE",
+        "END OF THE EXAMPLE",
+    ]
+    return "\n".join(example_lines)
+def generate_examples():
+    """Build the few-shot examples text from a seeded random sample of the manual dataset.
+
+    Samples ``N_EXAMPLES`` rows with ``config.RANDOM_STATE`` as the seed, formats
+    each one with ``get_example_prompt`` and joins the results with newlines.
+    """
+    message_columns = ['commit_msg_start', 'commit_msg_end']
+    sampled = hf_data_loader.load_raw_rewriting_dataset_as_pandas()[message_columns]
+    sampled = sampled.sample(n=N_EXAMPLES, random_state=config.RANDOM_STATE)
+    return "\n".join(
+        get_example_prompt(start_msg, end_msg)
+        for start_msg, end_msg in zip(sampled['commit_msg_start'], sampled['commit_msg_end'])
+    )
+# Evaluated once when the module is imported; the same examples are embedded in every prompt.
+EXAMPLES = generate_examples()
+def build_prompt(reference, diff):
+    """Build the prompt asking the model to reconstruct the initial commit message.
+
+    :param reference: the edited commit message embedded in the prompt.
+    :param diff: the source code changes embedded in the prompt.
+    :return: the prompt text; it also embeds the module-level ``EXAMPLES`` and
+        ends with the ``OUTPUT`` token.
+    """
+    # NOTE(review): "dome changes" in the text below looks like a typo for
+    # "some changes"; it is left as-is here because the literal is sent to the model.
+    return f"""A software developer uses a LLM to generate commit messages.
+They generated a commit message for the following source code changes:
+START OF THE SOURCE CODE CHANGES
+{diff}
+END OF THE SOURCE CODE CHANGES
+After generating the commit message the developer understands that it is not perfect. After making dome changes,
+they come up with an edited version of the message. Here is this edited message:
+START OF THE COMMIT MESSAGE
+{reference}
+END OF THE COMMIT MESSAGE
+Your task is to print the initial, LLM-generated commit message.
+The message you print must share some fragments with the edited message.
+Here are some examples of what you should output:
+START OF THE EXAMPLES LIST
+{EXAMPLES}
+END OF THE EXAMPLES LIST
+Print only the initial commit message's text after the
+token "OUTPUT".
+OUTPUT"""
+def generate_start_msg(end_msg, diff):
+    """Generate an initial commit message that stays close to ``end_msg``.
+
+    The model is queried up to ``GENERATION_ATTEMPTS`` times. The first
+    candidate whose ``insertions`` statistic is below
+    ``REL_INSERTIONS_THRESHOLD`` is returned immediately. If no candidate
+    qualifies, the one with the lowest ``insertions`` value is returned.
+
+    Note that ``statistics`` here is the project's ``statistics.py`` module
+    (the one defining ``get_statistics``), not the standard-library module.
+
+    :param end_msg: the edited commit message to work backwards from.
+    :param diff: the source code changes of the commit.
+    :return: the generated initial commit message.
+    """
+    prompt = build_prompt(reference=end_msg, diff=diff)
+    results = []
+    for _ in range(GENERATION_ATTEMPTS):
+        start_msg_pred = grazie_wrapper.generate_for_prompt(prompt)
+        if not start_msg_pred:
+            # The statistics divide by the length of the start message, so an
+            # empty reply would raise ZeroDivisionError and abort the whole
+            # run; keep it only as the worst possible fallback candidate.
+            results.append((float("inf"), start_msg_pred))
+            continue
+        annotated_msg = generate_annotated_diffs.get_annotated_diff(start_msg_pred, end_msg)
+        stats = statistics.get_statistics(start_msg=start_msg_pred, end_msg=end_msg,
+                                          annotated_msg=annotated_msg)
+        if stats["insertions"] < REL_INSERTIONS_THRESHOLD:
+            return start_msg_pred
+        results.append((stats["insertions"], start_msg_pred))
+    # No candidate met the threshold: fall back to the closest one.
+    results.sort()
+    return results[0][1]
+def transform(df):
+    """Extend ``df`` with synthetically generated start messages.
+
+    For every input row, ``GENERATION_MULTIPLIER`` start messages are generated
+    from the row's ``commit_msg_end`` and ``mods``. Original rows are flagged
+    with ``end_to_start=False`` and generated rows with ``end_to_start=True``.
+
+    :param df: DataFrame whose rows provide ``hash``, ``repo``,
+        ``commit_msg_end`` and ``mods``. It is not modified.
+    :return: a new DataFrame with the original rows followed by the generated ones.
+    """
+    # Work on a copy so the flag column is not added to the caller's DataFrame.
+    df = df.copy()
+    df['end_to_start'] = False
+    generated_data = {
+        "hash": [],
+        "repo": [],
+        "commit_msg_start": [],
+        "commit_msg_end": [],
+        "mods": []
+    }
+    for _, row in tqdm(df.iterrows(), total=len(df)):
+        for _ in range(GENERATION_MULTIPLIER):
+            commit_msg_start_pred = generate_start_msg(end_msg=row["commit_msg_end"],
+                                                       diff=row["mods"])
+            generated_data["hash"].append(row["hash"])
+            generated_data["repo"].append(row["repo"])
+            generated_data["commit_msg_start"].append(commit_msg_start_pred)
+            generated_data["commit_msg_end"].append(row["commit_msg_end"])
+            generated_data["mods"].append(row["mods"])
+    generated_df = pd.DataFrame.from_dict(generated_data)
+    generated_df['end_to_start'] = True
+    return pd.concat([df, generated_df], ignore_index=True)

statistics.py CHANGED Viewed

@@ -2,35 +2,34 @@ import numpy as np
 import pandas as pd
-def get_statistics_for_df(df: pd.DataFrame, start_col, end_col, annotated_col):
-    relative_deletions = []
-    relative_insertions = []
-    relative_changes = []
-    for _, row in df.iterrows():
-        sum_deletions = 0
-        sum_insertions = 0
-        for text, change_type in row[annotated_col]:
-            if change_type == '-':
-                sum_deletions += len(text)
-            elif change_type == '+':
-                sum_insertions += len(text)
-        sum_changes = sum_deletions + sum_insertions
-        end_length = len(row[end_col])
-        start_length = len(row[start_col])
-        relative_deletions.append(sum_deletions / start_length)
-        relative_insertions.append(sum_insertions / end_length)
-        relative_changes.append(sum_changes / end_length)
     return {
-        "deletions": np.asarray(relative_deletions),
-        "insertions": np.asarray(relative_insertions),
-        "changes": np.asarray(relative_changes)
     }
 def get_statistics_for_manual_df(df):
     return get_statistics_for_df(df, start_col="commit_msg_start", end_col='commit_msg_end',
                                  annotated_col='annotated_diff')

 import pandas as pd
+def get_statistics(start_msg, end_msg, annotated_msg):
+    """Compute relative edit statistics for one start/end message pair.
+
+    :param start_msg: the initial message; its length normalises ``deletions``.
+    :param end_msg: the edited message; its length normalises ``insertions``
+        and ``changes``.
+    :param annotated_msg: iterable of ``(text, change_type)`` pairs, where
+        ``'-'`` marks deleted text and ``'+'`` marks inserted text; any other
+        change type is ignored.
+    :return: dict with the ``deletions``, ``insertions`` and ``changes`` ratios.
+        When the normalising message is empty the ratio is ``0.0`` if nothing
+        was counted and ``inf`` otherwise, instead of raising ZeroDivisionError.
+    """
+    def relative(amount, length):
+        # Guard against empty messages, which would otherwise divide by zero.
+        if length == 0:
+            return 0.0 if amount == 0 else float("inf")
+        return amount / length
+    sum_deletions = 0
+    sum_insertions = 0
+    for text, change_type in annotated_msg:
+        if change_type == '-':
+            sum_deletions += len(text)
+        elif change_type == '+':
+            sum_insertions += len(text)
+    sum_changes = sum_deletions + sum_insertions
+    end_length = len(end_msg)
+    start_length = len(start_msg)
     return {
+        "deletions": relative(sum_deletions, start_length),
+        "insertions": relative(sum_insertions, end_length),
+        "changes": relative(sum_changes, end_length)
     }
+def get_statistics_for_df(df: pd.DataFrame, start_col, end_col, annotated_col):
+    """Compute the statistics of every row of ``df`` and return them as arrays.
+
+    :param df: DataFrame holding the messages and their annotated diffs.
+    :param start_col: name of the column with the start messages.
+    :param end_col: name of the column with the end messages.
+    :param annotated_col: name of the column with the annotated diffs.
+    :return: dict mapping each statistic name to a NumPy array with one value
+        per row; an empty DataFrame yields empty arrays.
+    """
+    stats = [get_statistics(row[start_col], row[end_col], row[annotated_col]) for _, row in df.iterrows()]
+    # With no rows there is no result to read the statistic names from, so fall
+    # back to the known names and return empty arrays instead of failing.
+    stat_names = list(stats[0]) if stats else ["deletions", "insertions", "changes"]
+    return {stat_name: np.asarray([e[stat_name] for e in stats]) for stat_name in stat_names}
 def get_statistics_for_manual_df(df):
     return get_statistics_for_df(df, start_col="commit_msg_start", end_col='commit_msg_end',
                                  annotated_col='annotated_diff')