File size: 2,858 Bytes
39950c9 a01d3ba 39950c9 9e1ff19 39950c9 a01d3ba 39950c9 a01d3ba 39950c9 a01d3ba 39950c9 aef1dbe a01d3ba a7bba68 a01d3ba aef1dbe a01d3ba aef1dbe a01d3ba a7bba68 aef1dbe a01d3ba aef1dbe |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 |
import functools
import operator
import pandas as pd
def correlations_for_group(group):
    """Correlate each ``*_related`` column with the ``*_independent`` and ``*_aggr`` columns.

    A metric name is the text before the first underscore of a column whose
    name ends in ``_related``, ``_independent`` or ``_aggr``; the column that
    is actually read is ``<metric>_<suffix>``.  For every related metric, the
    Pearson and the Spearman coefficient (via ``Series.corr``) are computed
    against every independent metric first and every aggregated metric second.

    Args:
        group: DataFrame containing the ``<metric>_related``,
            ``<metric>_independent`` and ``<metric>_aggr`` columns.

    Returns:
        A ``pd.Series`` whose labels are ``rel_<r>_ind_<i>_pearson``,
        ``rel_<r>_ind_<i>_spearman``, ``rel_<r>_aggr_<a>_pearson`` and
        ``rel_<r>_aggr_<a>_spearman``.  It is empty when no column pair exists.
    """
    rel_metrics = [col.split("_")[0] for col in group.columns if col.endswith("_related")]
    ind_metrics = [col.split("_")[0] for col in group.columns if col.endswith("_independent")]
    aggr_metrics = [col.split("_")[0] for col in group.columns if col.endswith("_aggr")]

    # (token used in the result label, column suffix, metric names); the
    # independent block precedes the aggregated block for each related metric.
    right_sides = (
        ("ind", "independent", ind_metrics),
        ("aggr", "aggr", aggr_metrics),
    )

    # Fill one mapping directly instead of creating a small dict per pair and
    # merging them afterwards; keys and their insertion order are unchanged.
    correlations = {}
    for rel_metric in rel_metrics:
        for label, suffix, metrics in right_sides:
            for metric in metrics:
                related = group[f"{rel_metric}_related"]
                other = group[f"{metric}_{suffix}"]
                for method in ("pearson", "spearman"):
                    key = f"rel_{rel_metric}_{label}_{metric}_{method}"
                    correlations[key] = related.corr(other, method=method)
    return pd.Series(correlations)
def split_metrics_string(s):
    """Return the two metric names embedded in an underscore-separated label.

    The label is cut at every underscore and the pieces at positions 1 and 3
    are returned as a tuple, e.g. ``"rel_x_ind_y"`` gives ``("x", "y")``.
    """
    pick_metric_names = operator.itemgetter(1, 3)
    return pick_metric_names(s.split("_"))
def get_correlations_df(df, right_side):
    """Tabulate the correlations of ``df`` whose label contains ``right_side``.

    The flat Series produced by ``correlations_for_group`` is filtered to the
    labels that contain ``right_side`` as a substring and reshaped into one
    row per metric pair.

    Args:
        df: DataFrame handed unchanged to ``correlations_for_group``.
        right_side: Substring a correlation label must contain to be kept.

    Returns:
        A DataFrame with the columns ``spearman`` and ``pearson``, sorted by
        pair label and indexed by a two-level MultiIndex named
        ``("relative", "independent")``.
    """
    raw = correlations_for_group(df)

    # Cut the trailing method token off each selected label; the set merges
    # the "_pearson" and "_spearman" labels of one pair into a single entry.
    pair_labels = {
        label.rpartition("_")[0] for label in raw.index if right_side in label
    }

    records = [
        {
            "metrics": pair,
            "spearman": raw[f"{pair}_spearman"],
            "pearson": raw[f"{pair}_pearson"],
        }
        for pair in pair_labels
    ]
    table = pd.DataFrame.from_records(data=records, index="metrics").sort_index()

    # Swap the flat pair labels for (first metric, second metric) tuples.
    table.index = pd.MultiIndex.from_tuples(
        [split_metrics_string(pair) for pair in table.index],
        names=["relative", "independent"],
    )
    return table
def get_correlations_for_groups(df, right_side):
    """Build the correlation tables for the full frame and four row subsets.

    The subsets are driven by the ``end_to_start`` and ``start_to_end``
    columns of ``df``.  Rows where both are False form the "golden" subset,
    and those rows are part of every other subset too; the remaining subsets
    add the rows whose two flags equal the combination named in the label.

    Args:
        df: DataFrame with ``end_to_start`` and ``start_to_end`` columns plus
            whatever ``get_correlations_df`` reads.
        right_side: Passed through to ``get_correlations_df``.

    Returns:
        The per-group tables concatenated side by side under the keys
        ``all``, ``golden``, ``golden+s2e``, ``golden+e2s`` and
        ``golden+e2s+s2e``.
    """
    per_group = {"all": get_correlations_df(df, right_side=right_side)}

    flag_combinations = ((False, False), (False, True), (True, False), (True, True))
    for e2s, s2e in flag_combinations:
        label = "golden" + ("+e2s" if e2s else "") + ("+s2e" if s2e else "")

        has_flags = (df["end_to_start"] == e2s) & (df["start_to_end"] == s2e)
        # Element-wise comparison with False is intentional here.
        is_golden = (df["end_to_start"] == False) & (df["start_to_end"] == False)  # noqa: E712

        selected_rows = df[has_flags | is_golden]
        per_group[label] = get_correlations_df(selected_rows, right_side=right_side)

    return pd.concat(per_group, axis=1)
|