|
import functools |
|
import operator |
|
|
|
import pandas as pd |
|
|
|
|
|
def correlations_for_group(group):
    """Correlate every ``*_related`` column with the other metric columns.

    Columns are picked by suffix: ``_related``, ``_independent`` and ``_aggr``.
    The metric name is the text before the first underscore of the column
    name, and the column is then looked up again as ``<metric>_<suffix>``.

    Args:
        group: DataFrame holding the metric columns.

    Returns:
        A ``pd.Series`` of Pearson and Spearman coefficients, indexed by
        ``rel_<rel>_ind_<ind>_<method>`` and ``rel_<rel>_aggr_<aggr>_<method>``.
        For each related metric the independent pairs come first, then the
        aggregated ones.
    """

    def metric_names(suffix):
        # Only the first underscore-separated token is kept as the metric name.
        return [col.split("_")[0] for col in group.columns if col.endswith(suffix)]

    related_metrics = metric_names("_related")
    independent_metrics = metric_names("_independent")
    aggregated_metrics = metric_names("_aggr")

    # One flat mapping filled in iteration order; a repeated key keeps its
    # first position and takes the latest value.
    coefficients = {}
    for rel_metric in related_metrics:
        for ind_metric in independent_metrics:
            left = group[f"{rel_metric}_related"]
            right = group[f"{ind_metric}_independent"]
            coefficients[f"rel_{rel_metric}_ind_{ind_metric}_pearson"] = left.corr(
                right, method="pearson")
            coefficients[f"rel_{rel_metric}_ind_{ind_metric}_spearman"] = left.corr(
                right, method="spearman")

        for aggr_metric in aggregated_metrics:
            left = group[f"{rel_metric}_related"]
            right = group[f"{aggr_metric}_aggr"]
            coefficients[f"rel_{rel_metric}_aggr_{aggr_metric}_pearson"] = left.corr(
                right, method="pearson")
            coefficients[f"rel_{rel_metric}_aggr_{aggr_metric}_spearman"] = left.corr(
                right, method="spearman")

    return pd.Series(coefficients)
|
|
|
|
|
def split_metrics_string(s):
    """Return the second and fourth underscore-separated tokens of ``s``.

    For a string shaped like ``rel_<a>_ind_<b>`` this yields ``(<a>, <b>)``.
    Raises ``IndexError`` when ``s`` has fewer than four tokens.
    """
    parts = s.split("_")
    left_metric = parts[1]
    right_metric = parts[3]
    return left_metric, right_metric
|
|
|
|
|
def get_correlations_df(df, right_side):
    """Tabulate Spearman and Pearson correlations for one kind of metric pair.

    Args:
        df: DataFrame passed unchanged to ``correlations_for_group``.
        right_side: Substring that a correlation key must contain to be kept
            (plain ``in`` containment test on the whole key).

    Returns:
        A DataFrame with ``spearman`` and ``pearson`` columns and a two-level
        index named ``("relative", "independent")``. The second level carries
        that name whichever ``right_side`` was selected.
    """
    correlations_raw = correlations_for_group(df)

    # Drop the trailing "_<method>" token; the set collapses the pearson and
    # spearman keys of one metric pair into a single prefix.
    pair_prefixes = {
        "_".join(key.split("_")[:-1])
        for key in correlations_raw.index
        if right_side in key
    }

    records = [
        {
            "metrics": metrics,
            "spearman": correlations_raw[f"{metrics}_spearman"],
            "pearson": correlations_raw[f"{metrics}_pearson"],
        }
        for metrics in pair_prefixes
    ]

    # Sorting by the prefix string makes the row order independent of the
    # set's iteration order.
    table = pd.DataFrame.from_records(data=records, index="metrics").sort_index()

    metric_pairs = table.index.map(split_metrics_string).tolist()
    table.index = pd.MultiIndex.from_tuples(
        metric_pairs, names=["relative", "independent"])
    return table
|
|
|
|
|
def get_correlations_for_groups(df, right_side):
    """Build correlation tables for the full data and four flag-based subsets.

    The subsets are selected by the ``end_to_start`` and ``start_to_end``
    columns of ``df``. Every subset holds the rows where both flags are False
    (label ``"golden"``) plus the rows matching one specific flag combination,
    which appends ``"+e2s"`` and/or ``"+s2e"`` to the label.

    Args:
        df: DataFrame with the metric columns and the two flag columns.
        right_side: Forwarded to ``get_correlations_df``.

    Returns:
        The per-group tables concatenated along the columns and keyed, in
        order, by ``"all"``, ``"golden"``, ``"golden+s2e"``, ``"golden+e2s"``
        and ``"golden+e2s+s2e"``.
    """
    tables = {"all": get_correlations_df(df, right_side=right_side)}

    # Rows with neither flag set belong to every subset; compute the mask once.
    neither_flag = (df["end_to_start"] == False) & (df["start_to_end"] == False)  # noqa: E712

    flag_combinations = ((False, False), (False, True), (True, False), (True, True))
    for e2s, s2e in flag_combinations:
        label = "golden" + ("+e2s" if e2s else "") + ("+s2e" if s2e else "")
        this_combination = (df["end_to_start"] == e2s) & (df["start_to_end"] == s2e)
        subset = df[this_combination | neither_flag]
        tables[label] = get_correlations_df(subset, right_side=right_side)

    return pd.concat(tables, axis=1)
|
|