Petr Tsvetkov committed on
Commit
30e165f
•
1 Parent(s): 5f3a4af

Synthetic dataset generation for the first 5 samples; visualization fixed

Browse files
change_visualizer.py CHANGED
@@ -20,8 +20,7 @@ def update_manual_view(diff_idx):
20
  def update_synthetic_view(diff_idx):
21
  diff_idx -= 1
22
  return (df_synthetic.iloc[diff_idx]['annotated_diff'], df_synthetic.iloc[diff_idx]['initial_msg_pred'],
23
- df_synthetic.iloc[diff_idx][
24
- 'get_annotated_diff'],
25
  f"https://github.com/{df_synthetic.iloc[diff_idx]['repo']}/commit/{df_synthetic.iloc[diff_idx]['hash']}")
26
 
27
 
 
20
  def update_synthetic_view(diff_idx):
21
  diff_idx -= 1
22
  return (df_synthetic.iloc[diff_idx]['annotated_diff'], df_synthetic.iloc[diff_idx]['initial_msg_pred'],
23
+ df_synthetic.iloc[diff_idx]['reference'],
 
24
  f"https://github.com/{df_synthetic.iloc[diff_idx]['repo']}/commit/{df_synthetic.iloc[diff_idx]['hash']}")
25
 
26
 
generate_synthetic_dataset.py CHANGED
@@ -57,9 +57,16 @@ def generate_synthetic_dataset():
57
  df['initial_msg_prompt'] = df.apply(generate_prompt_for_row, axis=1)
58
  initial_messages_pred = []
59
 
60
- for prompt in tqdm(df['initial_msg_prompt']):
61
- output = generate_initial_msg(prompt)
62
- initial_messages_pred.append(output)
 
 
 
 
 
 
 
63
 
64
  df['initial_msg_pred'] = initial_messages_pred
65
 
 
57
  df['initial_msg_prompt'] = df.apply(generate_prompt_for_row, axis=1)
58
  initial_messages_pred = []
59
 
60
+ for i, prompt in enumerate(tqdm(df['initial_msg_prompt'])):
61
+ output = None
62
+
63
+ if i < 5:
64
+ while output is None:
65
+ try:
66
+ output = generate_initial_msg(prompt)
67
+ except:
68
+ pass
69
+ initial_messages_pred.append(output if output is not None else "TBA")
70
 
71
  df['initial_msg_pred'] = initial_messages_pred
72
 
hf_data_loader.py CHANGED
@@ -19,7 +19,7 @@ def load_full_commit_dataset_as_pandas():
19
 
20
 
21
  def load_synthetic_dataset_as_pandas():
22
- load_dataset(config.HF_SYNTHETIC_DATASET_NAME,
23
- split=config.HF_SYNTHETIC_DATASET_SPLIT,
24
- token=config.HF_TOKEN,
25
- cache_dir=config.CACHE_DIR).to_pandas()
 
19
 
20
 
21
  def load_synthetic_dataset_as_pandas():
22
+ return load_dataset(config.HF_SYNTHETIC_DATASET_NAME,
23
+ split=config.HF_SYNTHETIC_DATASET_SPLIT,
24
+ token=config.HF_TOKEN,
25
+ cache_dir=config.CACHE_DIR).to_pandas()