In [None]:
import pickle

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

import config

In [None]:
# Display names of the compared subsets. load_data() looks a label up by the
# position of a group in the pickled data, so the order here is significant.
GROUP_LABELS = [
    'Golden',
    'Synthetic: Backward',
    'Synthetic: Forward',
    'Synthetic: Backward → Forward',
    'Synthetic: All',
]

In [None]:
def load_data(s):
    """Load the pickled groups of chart ``s`` as a long-format DataFrame.

    Reads ``config.OUTPUT_CHARTS_DIR / f"{s}_data.pkl"`` and iterates over the
    unpickled object as a sequence of groups, each group being an iterable of
    values. The i-th group is labelled with ``GROUP_LABELS[i]``.

    Returns:
        A DataFrame built from one record per value, with the keys
        ``'Subset'`` (the group label) and ``'value'``.

    NOTE: ``pickle.load`` can execute arbitrary code while loading; only
    point this at files produced by trusted code.
    """
    pickle_path = config.OUTPUT_CHARTS_DIR / f"{s}_data.pkl"
    with open(pickle_path, 'rb') as f:
        groups = pickle.load(f)

    # Flatten group -> values into one record per value.
    records = [
        {'Subset': GROUP_LABELS[i], 'value': e}
        for i, group in enumerate(groups)
        for e in group
    ]
    return pd.DataFrame.from_records(records)

In [None]:
name = 'deletions_norm'

sns.set_theme(palette="colorblind")

# KDE settings: one curve per subset, each normalized on its own
# (common_norm=False), with the density only evaluated inside `clip`.
kde_options = dict(
    x='value',
    hue='Subset',
    kind='kde',
    aspect=1.5,
    common_norm=False,
    clip=(0, 10000),
)
# displot is figure-level: despite the name, `ax` holds the returned grid object.
ax = sns.displot(data=load_data(name), **kde_options)

# Pull the legend inside the plotting area and thicken its line samples.
sns.move_legend(ax, "center right", bbox_to_anchor=(.70, .75))
for legend_line in ax.legend.get_lines():
    legend_line.set_linewidth(5.0)

plt.title('Number of "delete" operations: distribution')
plt.xlabel('Number of "delete" operations')

# Save a raster and a vector version of the same figure.
for ext in ("png", "svg"):
    plt.savefig(config.OUTPUT_CHARTS_DIR / f"{name}_dist.{ext}", bbox_inches='tight')
plt.show()

In [None]:
name = 'insertions_norm'

sns.set_theme(palette="colorblind")

# KDE settings: one curve per subset, each normalized on its own
# (common_norm=False), with the density only evaluated inside `clip`.
kde_options = dict(
    x='value',
    hue='Subset',
    kind='kde',
    aspect=1.5,
    common_norm=False,
    clip=(0, 10000),
)
# displot is figure-level: despite the name, `ax` holds the returned grid object.
ax = sns.displot(data=load_data(name), **kde_options)

# Pull the legend inside the plotting area and thicken its line samples.
sns.move_legend(ax, "center right", bbox_to_anchor=(.70, .75))
for legend_line in ax.legend.get_lines():
    legend_line.set_linewidth(5.0)

plt.title('Number of "insert" operations: distribution')
plt.xlabel('Number of "insert" operations')

# Save a raster and a vector version of the same figure.
for ext in ("png", "svg"):
    plt.savefig(config.OUTPUT_CHARTS_DIR / f"{name}_dist.{ext}", bbox_inches='tight')
plt.show()

In [None]:
import numpy as np

def get_bins_for_report(intervals):
    """Print comma-separated integer bin edges built from ``intervals``.

    Each interval is read by index as ``(start, stop, num)`` and expanded
    with ``np.linspace`` into ``num`` evenly spaced points from ``start`` to
    ``stop`` inclusive. Every point is truncated with ``int``; the points of
    all intervals are concatenated in order and printed on a single line.

    Returns:
        None. The result is only printed.
    """
    bins = []
    for interval in intervals:
        points = np.linspace(start=interval[0], stop=interval[1], num=interval[2])
        bins.extend(int(point) for point in points)
    print(",".join(str(edge) for edge in bins))


get_bins_for_report([
    (0, 400, 30),
    (401, 1200, 20),
])

## FUS logs

In [None]:
def to_hist(data):
    """Return a copy of ``data`` whose ``'y'`` values are rescaled to densities.

    Each value is divided by the sum of all ``'y'`` values and then by the
    matching entry of ``'bin_size'``. The input mapping is not modified: the
    copy made with ``data.copy()`` gets a newly built ``'y'`` list.

    Raises:
        ZeroDivisionError: if ``'y'`` is non-empty and sums to zero, or if a
            bin size is zero.
    """
    result = data.copy()
    total = sum(result['y'])

    densities = []
    for count, width in zip(result['y'], result['bin_size']):
        densities.append(count / total / width)

    result['y'] = densities
    return result

In [None]:
# Report variant to read; it is also embedded in chart titles and file names.
FUS_REPORT_TYPE = "all_ide"

RAW_FUS_REPORT = f"data/fus_raw_report_{FUS_REPORT_TYPE}.csv"

FUS_METRICS = ["CommitMessageEditDistance", "CommitMessageLengthDiff"]

# Per metric: bin midpoints ("x"), parsed bin values ("y") and bin widths.
FUS_DATA = {metric: {"x": [], "y": [], "bin_size": []} for metric in FUS_METRICS}

with open(RAW_FUS_REPORT, "r") as f:
    current_metric = None
    for line in f:
        # A whitespace-only line closes the current metric section.
        if line.isspace():
            current_metric = None
            continue

        # A line mentioning a tracked metric opens its section and holds no data.
        section_metric = next((m for m in FUS_METRICS if m in line), None)
        if section_metric is not None:
            current_metric = section_metric
            continue

        # Rows outside any tracked metric section are ignored.
        if current_metric is None:
            continue

        tokens = line.strip().split(",")
        # The first column is parsed as "<left> - <right>"; a leading "> " is
        # dropped, which leaves a single bound used as both left and right.
        bounds = tokens[0].replace("> ", "").split(" - ")
        x_left = float(bounds[0])
        x_right = float(bounds[-1])

        x_mid = (x_left + x_right) / 2
        # The value is taken from the second-to-last column.
        y_value = float(tokens[-2])
        # NOTE(review): "+ 1" presumably treats both bounds as inclusive
        # integers; a single-bound ("> N") row therefore gets width 1 - confirm.
        width = x_right - x_left + 1

        for key, value in (("x", x_mid), ("y", y_value), ("bin_size", width)):
            FUS_DATA[current_metric][key].append(value)
 
 
# Number of leading bins dropped from every metric before plotting.
SKIP_FIRST = 2

# Same layout as FUS_DATA, with the first SKIP_FIRST entries of each column removed.
FUS_DATA_NO_HEAD = {}
for metric in FUS_METRICS:
    columns = FUS_DATA[metric]
    FUS_DATA_NO_HEAD[metric] = {
        key: columns[key][SKIP_FIRST:] for key in ('x', 'y', 'bin_size')
    }

In [None]:
name = 'editdist'

sns.set_theme(palette="colorblind")

# One KDE curve per subset, each normalized independently (common_norm=False).
# displot is figure-level: despite the name, `ax` holds the returned grid object.
ax = sns.displot(data=load_data(name),
                 x='value',
                 hue='Subset',
                 kind='kde',
                 aspect=1.5,
                 common_norm=False,
                 clip=(0, 10000))

# Overlay the FUS histogram (rescaled by to_hist) on the same axes.
# style=True with a single dash pattern is presumably a way to force a dashed line.
sns.lineplot(data=to_hist(FUS_DATA_NO_HEAD["CommitMessageEditDistance"]), x='x', y='y', style=True, dashes=[(2,2)], legend=False)

# Pull the legend inside the plotting area and thicken its line samples.
sns.move_legend(ax, "center right", bbox_to_anchor=(.70, .75))
for line in ax.legend.get_lines():
    line.set_linewidth(5.0)

plt.title(f"Edit distance ({FUS_REPORT_TYPE}): distribution")
# Fixed the misspelled axis label ("disatnce").
plt.xlabel("Edit distance")

plt.savefig(config.OUTPUT_CHARTS_DIR / f"{name}_fus_{FUS_REPORT_TYPE}_skip_{SKIP_FIRST}_dist.png",bbox_inches='tight')
plt.savefig(config.OUTPUT_CHARTS_DIR / f"{name}_fus_{FUS_REPORT_TYPE}_skip_{SKIP_FIRST}_dist.svg",bbox_inches='tight')
plt.show()

In [None]:
name = 'lendiff'

sns.set_theme(palette="colorblind")

# KDE settings: one curve per subset, each normalized on its own
# (common_norm=False), with the density only evaluated inside `clip`.
kde_options = dict(
    x='value',
    hue='Subset',
    kind='kde',
    aspect=1.5,
    common_norm=False,
    clip=(0, 10000),
)
# displot is figure-level: despite the name, `ax` holds the returned grid object.
ax = sns.displot(data=load_data(name), **kde_options)

# Overlay the FUS histogram (rescaled by to_hist) on the same axes.
# style=True with a single dash pattern is presumably a way to force a dashed line.
fus_density = to_hist(FUS_DATA_NO_HEAD["CommitMessageLengthDiff"])
sns.lineplot(data=fus_density, x='x', y='y', style=True, dashes=[(2,2)], legend=False)

# Pull the legend inside the plotting area and thicken its line samples.
sns.move_legend(ax, "center right", bbox_to_anchor=(.70, .75))
for legend_line in ax.legend.get_lines():
    legend_line.set_linewidth(5.0)

plt.title(f"Length difference ({FUS_REPORT_TYPE}): distribution")
plt.xlabel("Length difference")

# Save a raster and a vector version of the same figure.
for ext in ("png", "svg"):
    plt.savefig(
        config.OUTPUT_CHARTS_DIR / f"{name}_fus_{FUS_REPORT_TYPE}_skip_{SKIP_FIRST}_dist.{ext}",
        bbox_inches='tight',
    )
plt.show()