Petr Tsvetkov committed on
Commit
02ebb6e
•
1 Parent(s): 5ae823f

Keep the session column

Browse files
api_wrappers/hf_data_loader.py CHANGED
@@ -19,7 +19,8 @@ def load_full_commit_dataset_as_pandas():
19
 
20
 
21
  def load_processed_rewriting_dataset_as_pandas():
22
- manual_rewriting = load_raw_rewriting_dataset_as_pandas()[["hash", "repo", "commit_msg_start", "commit_msg_end"]]
 
23
  manual_rewriting.set_index(["hash", "repo"], inplace=True)
24
 
25
  mods_dataset = load_full_commit_dataset_as_pandas()[["hash", "repo", "mods"]]
 
19
 
20
 
21
  def load_processed_rewriting_dataset_as_pandas():
22
+ manual_rewriting = load_raw_rewriting_dataset_as_pandas()[
23
+ ["hash", "repo", "commit_msg_start", "commit_msg_end", "session"]]
24
  manual_rewriting.set_index(["hash", "repo"], inplace=True)
25
 
26
  mods_dataset = load_full_commit_dataset_as_pandas()[["hash", "repo", "mods"]]
change_visualizer.py CHANGED
@@ -15,10 +15,11 @@ STATISTICS = {"manual": statistics.get_statistics_for_df(df_manual),
15
 
16
  def update_dataset_view(diff_idx, df):
17
  diff_idx -= 1
18
- return df.iloc[diff_idx]['annotated_diff'], df.iloc[diff_idx]['commit_msg_start'], \
19
- df.iloc[diff_idx][
20
- 'commit_msg_end'], df.iloc[diff_idx][
21
- 'session'], f"https://github.com/{df.iloc[diff_idx]['repo']}/commit/{df.iloc[diff_idx]['hash']}"
 
22
 
23
 
24
  def update_dataset_view_manual(diff_idx):
 
15
 
16
  def update_dataset_view(diff_idx, df):
17
  diff_idx -= 1
18
+ return (df.iloc[diff_idx]['annotated_diff'],
19
+ df.iloc[diff_idx]['commit_msg_start'],
20
+ df.iloc[diff_idx]['commit_msg_end'],
21
+ df.iloc[diff_idx]['session'],
22
+ f"https://github.com/{df.iloc[diff_idx]['repo']}/commit/{df.iloc[diff_idx]['hash']}")
23
 
24
 
25
  def update_dataset_view_manual(diff_idx):
generation_steps/synthetic_end_to_start.py CHANGED
@@ -3,8 +3,8 @@ from tqdm import tqdm
3
 
4
  import config
5
  import generate_annotated_diffs
6
- from api_wrappers import grazie_wrapper, hf_data_loader
7
  import statistics
 
8
 
9
  N_EXAMPLES = 5
10
  GENERATION_MULTIPLIER = 2
@@ -89,26 +89,27 @@ def generate_start_msg(end_msg, diff):
89
  return results[0][1]
90
 
91
 
 
 
 
92
  def transform(df):
93
  df['end_to_start'] = False
94
 
95
  generated_data = {
96
- "hash": [],
97
- "repo": [],
98
- "commit_msg_start": [],
99
- "commit_msg_end": [],
100
- "mods": []
101
  }
102
 
 
 
 
103
  for _, row in tqdm(df.iterrows(), total=len(df)):
104
  for i in range(GENERATION_MULTIPLIER):
105
  commit_msg_start_pred = generate_start_msg(end_msg=row["commit_msg_end"],
106
  diff=row["mods"])
107
- generated_data["hash"].append(row["hash"])
108
- generated_data["repo"].append(row["repo"])
109
  generated_data["commit_msg_start"].append(commit_msg_start_pred)
110
- generated_data["commit_msg_end"].append(row["commit_msg_end"])
111
- generated_data["mods"].append(row["mods"])
112
 
113
  generated_df = pd.DataFrame.from_dict(generated_data)
114
  generated_df['end_to_start'] = True
 
3
 
4
  import config
5
  import generate_annotated_diffs
 
6
  import statistics
7
+ from api_wrappers import grazie_wrapper, hf_data_loader
8
 
9
  N_EXAMPLES = 5
10
  GENERATION_MULTIPLIER = 2
 
89
  return results[0][1]
90
 
91
 
92
+ COLS_TO_KEEP = ["hash", "repo", "commit_msg_end", "mods", "session"]
93
+
94
+
95
  def transform(df):
96
  df['end_to_start'] = False
97
 
98
  generated_data = {
99
+ "commit_msg_start": []
 
 
 
 
100
  }
101
 
102
+ for col in COLS_TO_KEEP:
103
+ generated_data[col] = []
104
+
105
  for _, row in tqdm(df.iterrows(), total=len(df)):
106
  for i in range(GENERATION_MULTIPLIER):
107
  commit_msg_start_pred = generate_start_msg(end_msg=row["commit_msg_end"],
108
  diff=row["mods"])
109
+
 
110
  generated_data["commit_msg_start"].append(commit_msg_start_pred)
111
+ for col in COLS_TO_KEEP:
112
+ generated_data[col].append(row[col])
113
 
114
  generated_df = pd.DataFrame.from_dict(generated_data)
115
  generated_df['end_to_start'] = True