Spaces:

JetBrains-Research
/

commit-message-editing-visualization

Sleeping

App Files Files Community

commit-message-editing-visualization / analysis_util.py

Petr Tsvetkov

Latest version of the code; config updated to JetBrains-Research

a7bba68 2 months ago

raw

history blame

No virus

2.86 kB

	import functools
	import operator

	import pandas as pd


	def correlations_for_group(group):
	    """Correlate every ``*_related`` column with the other metric columns.

	    Metric names are the part of a column name before its first underscore.
	    For each "related" metric, Pearson and Spearman correlations are computed
	    against every ``*_independent`` column and then every ``*_aggr`` column.

	    :param group: DataFrame whose columns follow the ``<metric>_related`` /
	        ``<metric>_independent`` / ``<metric>_aggr`` naming scheme.
	    :return: ``pd.Series`` keyed by
	        ``rel_<rel>_ind_<ind>_<method>`` and ``rel_<rel>_aggr_<aggr>_<method>``,
	        where ``<method>`` is ``pearson`` or ``spearman``.
	    """

	    def metric_names(suffix):
	        # Only the token before the first underscore is kept as the name.
	        return [name.split("_")[0] for name in group.columns if name.endswith(suffix)]

	    related = metric_names("_related")
	    independent = metric_names("_independent")
	    aggregated = metric_names("_aggr")

	    # A single dict is filled in place; insertion order matches the order in
	    # which the pairs are visited (independent first, then aggregated).
	    merged = {}
	    for rel in related:
	        for ind in independent:
	            left, right = group[f"{rel}_related"], group[f"{ind}_independent"]
	            merged[f"rel_{rel}_ind_{ind}_pearson"] = left.corr(right, method="pearson")
	            merged[f"rel_{rel}_ind_{ind}_spearman"] = left.corr(right, method="spearman")
	        for aggr in aggregated:
	            left, right = group[f"{rel}_related"], group[f"{aggr}_aggr"]
	            merged[f"rel_{rel}_aggr_{aggr}_pearson"] = left.corr(right, method="pearson")
	            merged[f"rel_{rel}_aggr_{aggr}_spearman"] = left.corr(right, method="spearman")
	    return pd.Series(merged)


	def split_metrics_string(s):
	    """Pull the two metric names out of an underscore-separated key.

	    The key is split on ``"_"`` and the tokens at positions 1 and 3 are
	    returned, e.g. ``"rel_bleu_ind_edit"`` -> ``("bleu", "edit")``.

	    :param s: underscore-separated string with at least four tokens.
	    :return: tuple ``(token[1], token[3])``.
	    :raises IndexError: if the string has fewer than four tokens.
	    """
	    pieces = s.split("_")
	    left_metric = pieces[1]
	    right_metric = pieces[3]
	    return left_metric, right_metric


	def get_correlations_df(df, right_side):
	    """Arrange the correlations of ``df`` into a two-level indexed table.

	    Correlations come from ``correlations_for_group``. Only keys containing
	    the substring ``right_side`` are kept. Each remaining metric pair becomes
	    one row with a ``spearman`` and a ``pearson`` column.

	    :param df: DataFrame passed on to ``correlations_for_group``.
	    :param right_side: substring selecting which correlation keys to keep.
	    :return: DataFrame sorted by pair key, indexed by a MultiIndex with
	        levels named ``relative`` and ``independent``.
	    """
	    raw = correlations_for_group(df)

	    # Strip the trailing "_<method>" token so both methods share one pair key.
	    pair_keys = {key.rpartition("_")[0] for key in raw.index if right_side in key}

	    records = [
	        {
	            "metrics": pair,
	            "spearman": raw[f"{pair}_spearman"],
	            "pearson": raw[f"{pair}_pearson"],
	        }
	        for pair in pair_keys
	    ]

	    table = pd.DataFrame.from_records(data=records, index="metrics").sort_index()
	    # Swap the flat string key for the (left metric, right metric) pair.
	    table.index = pd.MultiIndex.from_tuples(
	        [split_metrics_string(pair) for pair in table.index],
	        names=["relative", "independent"],
	    )

	    return table


	def get_correlations_for_groups(df, right_side):
	    """Build correlation tables for the full data and four row subsets.

	    The ``"all"`` table uses every row. Each other table uses the rows where
	    both ``end_to_start`` and ``start_to_end`` are ``False`` (labelled
	    ``"golden"``), plus the rows matching one flag combination. The label
	    gains ``"+e2s"`` and/or ``"+s2e"`` for each flag that is ``True``.

	    :param df: DataFrame with ``end_to_start`` and ``start_to_end`` columns,
	        plus the metric columns used by ``get_correlations_df``.
	    :param right_side: forwarded unchanged to ``get_correlations_df``.
	    :return: the per-group tables concatenated along the columns, keyed by
	        group label.
	    """
	    tables = {"all": get_correlations_df(df, right_side=right_side)}

	    # Rows with both flags off belong to every subset below.
	    golden_rows = df["end_to_start"].eq(False) & df["start_to_end"].eq(False)

	    for e2s in (False, True):
	        for s2e in (False, True):
	            label = "golden" + ("+e2s" if e2s else "") + ("+s2e" if s2e else "")

	            flag_rows = df["end_to_start"].eq(e2s) & df["start_to_end"].eq(s2e)
	            tables[label] = get_correlations_df(df[flag_rows | golden_rows], right_side=right_side)

	    return pd.concat(tables, axis=1)