# NOTE: functools/operator are not needed by the functions below any more;
# the imports are kept in case code outside this view relies on them.
import functools
import operator

import pandas as pd


def _pair_correlations(group, rel_metric, other_metric, column_suffix, key_tag):
    """Return Pearson and Spearman correlations for one pair of columns.

    Correlates ``{rel_metric}_related`` with ``{other_metric}_{column_suffix}``
    and returns a dict keyed
    ``rel_{rel_metric}_{key_tag}_{other_metric}_{pearson|spearman}``.
    """
    left = group[f"{rel_metric}_related"]
    right = group[f"{other_metric}_{column_suffix}"]
    prefix = f"rel_{rel_metric}_{key_tag}_{other_metric}"
    return {
        f"{prefix}_pearson": left.corr(right, method="pearson"),
        f"{prefix}_spearman": left.corr(right, method="spearman"),
    }


def correlations_for_group(group):
    """Correlate every ``*_related`` column with the other metric columns.

    Metric names are taken as the text before the first underscore of each
    column ending in ``_related``, ``_independent`` or ``_aggr``.

    Args:
        group: DataFrame holding the metric columns.

    Returns:
        A ``pd.Series`` whose index holds keys of the form
        ``rel_<metric>_ind_<metric>_<method>`` and
        ``rel_<metric>_aggr_<metric>_<method>`` (method is ``pearson`` or
        ``spearman``) and whose values are the correlation coefficients.
    """
    rel_metrics = [col.split("_")[0] for col in group.columns if col.endswith("_related")]
    ind_metrics = [col.split("_")[0] for col in group.columns if col.endswith("_independent")]
    aggr_metrics = [col.split("_")[0] for col in group.columns if col.endswith("_aggr")]

    correlations = {}
    for rel_metric in rel_metrics:
        for ind_metric in ind_metrics:
            correlations.update(
                _pair_correlations(group, rel_metric, ind_metric, "independent", "ind")
            )
        for aggr_metric in aggr_metrics:
            correlations.update(
                _pair_correlations(group, rel_metric, aggr_metric, "aggr", "aggr")
            )
    return pd.Series(correlations)


def split_metrics_string(s):
    """Extract the two metric names from a key such as ``rel_<a>_ind_<b>``.

    Returns:
        A tuple of the second and fourth underscore-separated tokens.
    """
    tokens = s.split("_")
    return tokens[1], tokens[3]


def get_correlations_df(df, right_side):
    """Build a table of Spearman/Pearson correlations for one kind of pairing.

    Args:
        df: DataFrame passed on to ``correlations_for_group``.
        right_side: Substring selecting which correlation keys to keep
            (the keys produced above contain ``ind`` or ``aggr``).

    Returns:
        A DataFrame with columns ``spearman`` and ``pearson``, indexed by a
        two-level MultiIndex named ``("relative", "independent")``.
    """
    correlations_raw = correlations_for_group(df)

    # Strip the trailing "_pearson"/"_spearman" so each metric pair appears once.
    pair_keys = {
        key.rsplit("_", 1)[0] for key in correlations_raw.index if right_side in key
    }
    records = [
        {
            "metrics": pair_key,
            "spearman": correlations_raw[f"{pair_key}_spearman"],
            "pearson": correlations_raw[f"{pair_key}_pearson"],
        }
        for pair_key in pair_keys
    ]

    result = pd.DataFrame.from_records(data=records, index="metrics").sort_index()
    result.index = pd.MultiIndex.from_tuples(
        result.index.map(split_metrics_string).tolist()
    )
    result.index.set_names(["relative", "independent"], inplace=True)
    return result


def get_correlations_for_groups(df, right_side):
    """Compute correlation tables for the whole frame and for four subsets.

    The subsets are the combinations of the boolean-compared columns
    ``end_to_start`` and ``start_to_end``; they are labelled ``golden``
    (both False), ``+s2e``, ``+e2s`` and ``+e2s+s2e``. The full frame is
    labelled ``all``.

    Args:
        df: DataFrame with the metric columns plus ``end_to_start`` and
            ``start_to_end``.
        right_side: Forwarded to ``get_correlations_df``.

    Returns:
        The per-group tables concatenated side by side (``axis=1``), with the
        group label as the outer column level.
    """
    per_group = {"all": get_correlations_df(df, right_side=right_side)}

    for e2s in (False, True):
        for s2e in (False, True):
            suffix = (("+e2s" if e2s else "") + ("+s2e" if s2e else "")) or "golden"
            subdf = df[(df["end_to_start"] == e2s) & (df["start_to_end"] == s2e)]
            # Must call get_correlations_df here: calling this function on the
            # subset would recurse forever, since the subset filters to itself.
            per_group[suffix] = get_correlations_df(subdf, right_side=right_side)

    return pd.concat(per_group, axis=1)