Petr Tsvetkov committed on
Commit
5bd86a2
•
1 Parent(s): 3907263

Use FUS logs (not uploaded to repo) to compare length difference and edit distance distributions in FUS and in our dataset (resulting charts are not included).

Browse files
.gitignore CHANGED
@@ -278,4 +278,5 @@ pip-selfcheck.json
278
  .idea
279
 
280
  cache
281
- output
 
 
278
  .idea
279
 
280
  cache
281
+ output
282
+ data
change_visualizer.py CHANGED
@@ -14,7 +14,7 @@ n_diffs_synthetic = len(df_synthetic)
14
 
15
 
16
  def golden():
17
- return df_manual
18
 
19
 
20
  def e2s():
@@ -33,7 +33,7 @@ def synthetic():
33
  return df_synthetic[(df_synthetic['end_to_start'] == True) | (df_synthetic['start_to_end'] == True)]
34
 
35
 
36
- STATISTICS = {"manual": dataset_statistics.get_statistics_for_df(df_manual),
37
  "e2s": dataset_statistics.get_statistics_for_df(e2s()),
38
  "s2e": dataset_statistics.get_statistics_for_df(s2e()),
39
  "e2s_s2e": dataset_statistics.get_statistics_for_df(e2s_s2e()),
 
14
 
15
 
16
  def golden():
17
+ return df_synthetic[(df_synthetic['end_to_start'] == False) & (df_synthetic['start_to_end'] == False)]
18
 
19
 
20
  def e2s():
 
33
  return df_synthetic[(df_synthetic['end_to_start'] == True) | (df_synthetic['start_to_end'] == True)]
34
 
35
 
36
+ STATISTICS = {"manual": dataset_statistics.get_statistics_for_df(golden()),
37
  "e2s": dataset_statistics.get_statistics_for_df(e2s()),
38
  "s2e": dataset_statistics.get_statistics_for_df(s2e()),
39
  "e2s_s2e": dataset_statistics.get_statistics_for_df(e2s_s2e()),
chart_processing.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
dataset_statistics.py CHANGED
@@ -9,7 +9,10 @@ from scipy.stats import stats
9
  import config
10
 
11
 
12
- def get_statistics(start_msg, end_msg, annotated_msg):
 
 
 
13
  edit_ops = Levenshtein.editops(start_msg, end_msg)
14
  n_deletes = sum([1 if op == 'delete' else 0 for op, _, _ in edit_ops])
15
  n_inserts = sum([1 if op == 'insert' else 0 for op, _, _ in edit_ops])
@@ -27,11 +30,14 @@ def get_statistics(start_msg, end_msg, annotated_msg):
27
  "deletions_norm": n_deletes / len(start_msg),
28
  "insertions_norm": n_inserts / len(end_msg),
29
  "changes_norm": n_changes / len(end_msg),
 
 
 
30
  }
31
 
32
 
33
  def get_statistics_for_df(df: pd.DataFrame):
34
- stats = [get_statistics(row["commit_msg_start"], row["commit_msg_end"], row["annotated_diff"]) for _, row in
35
  df.iterrows()]
36
 
37
  assert len(stats) > 0
 
9
  import config
10
 
11
 
12
+ def get_statistics(row):
13
+ start_msg = row["commit_msg_start"]
14
+ end_msg = row["commit_msg_end"]
15
+
16
  edit_ops = Levenshtein.editops(start_msg, end_msg)
17
  n_deletes = sum([1 if op == 'delete' else 0 for op, _, _ in edit_ops])
18
  n_inserts = sum([1 if op == 'insert' else 0 for op, _, _ in edit_ops])
 
30
  "deletions_norm": n_deletes / len(start_msg),
31
  "insertions_norm": n_inserts / len(end_msg),
32
  "changes_norm": n_changes / len(end_msg),
33
+
34
+ "lendiff": abs(len(start_msg) - len(end_msg)),
35
+ "editdist": row["editdist_related"]
36
  }
37
 
38
 
39
  def get_statistics_for_df(df: pd.DataFrame):
40
+ stats = [get_statistics(row) for _, row in
41
  df.iterrows()]
42
 
43
  assert len(stats) > 0