import gc

import numpy as np
import pandas as pd
import pandas.api.types
import sklearn.metrics


class ParticipantVisibleError(Exception):
    pass


def apk(actual, predicted, k=20):
    """
    Compute the average precision at k.

    This function computes the average precision at k between two lists of
    items.

    Parameters
    ----------
    actual : list
        A list of elements that are to be predicted (order doesn't matter)
    predicted : list
        A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
        The average precision at k over the input lists
    """
    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        # Count each correct prediction only once, at its first occurrence.
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)


def mapk(actual, predicted, k=20):
    """
    Compute the mean average precision at k.

    This function computes the mean average precision at k between two lists
    of lists of items.

    Parameters
    ----------
    actual : list
        A list of lists of elements that are to be predicted
        (order doesn't matter in the lists)
    predicted : list
        A list of lists of predicted elements (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
        The mean average precision at k over the input lists
    """
    return np.mean([apk(a, p, k) for a, p in zip(actual, predicted)])


def prepare(df, out_of_sample_column_name):
    # Split the space-delimited category string into a list of tokens and
    # coerce the out-of-sample indicator to float for the AUC computation.
    df['categories'] = df['categories'].str.split(' ')
    df[out_of_sample_column_name] = df[out_of_sample_column_name].astype(float)
    return df


def score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str, out_of_sample_column_name: str = 'osd', k: int = 20) -> float:
    """Metric for the FathomNet 2023 FGVC competition (46149).

    Computes the average of a MAP@k and a normalized AUC on an
    "out-of-sample" indicator.

    Parameters
    ----------
    solution : DataFrame with columns having for each instance:
        - categories: a list of integer categories
        - osd: a binary out-of-sample indicator
    submission : array-like of float, shape = (n_samples, n_classes + 1)
    out_of_sample_column_name : str
        The name of the out-of-sample indicator column
    k : int
        The maximum number of predicted categories
    """
    if row_id_column_name not in submission.columns:
        raise ParticipantVisibleError('Submission file missing expected column ' + row_id_column_name)
    if row_id_column_name not in solution.columns:
        raise ParticipantVisibleError('Solution file missing expected column ' + row_id_column_name)

    # Align the two frames on the shared row id, then drop it.
    solution = solution.sort_values(by=[row_id_column_name])
    submission = submission.sort_values(by=[row_id_column_name])
    if not (solution[row_id_column_name].values == submission[row_id_column_name].values).all():
        raise ParticipantVisibleError('The solution and submission row IDs are not identical')
    del solution[row_id_column_name]
    del submission[row_id_column_name]
    gc.collect()

    if out_of_sample_column_name is None:
        raise ParticipantVisibleError('out_of_sample_column_name cannot be None')
    missing_cols = solution.columns.difference(submission.columns)
    if len(missing_cols) > 0:
        raise ParticipantVisibleError('Submission file missing expected columns ' + ', '.join(missing_cols))

    solution, submission = prepare(solution, out_of_sample_column_name), prepare(submission, out_of_sample_column_name)

    # AUC on the out-of-sample indicator, rescaled so random guessing scores 0.
    oos_true = solution.pop(out_of_sample_column_name).to_numpy()
    oos_pred = submission.pop(out_of_sample_column_name).to_numpy()
    oos_score = sklearn.metrics.roc_auc_score(oos_true, oos_pred)
    normalized_oos_score = 2 * (oos_score - 0.5)  # random AUC is 0.5

    # MAP@k on the predicted category lists.
    solution = solution.squeeze().to_list()
    submission = submission.squeeze().to_list()
    cat_score = mapk(solution, submission, k=k)

    results = 0.5 * (normalized_oos_score + cat_score)
    return results
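

if __name__ == '__main__':
    # Minimal usage sketch, not part of the competition harness. The 'id'
    # row-id column name and the toy values below are hypothetical; only the
    # default 'osd' column name comes from the metric above. Each 'categories'
    # entry is a space-delimited string, as prepare() expects.
    example_solution = pd.DataFrame({
        'id': [0, 1, 2, 3],
        'categories': ['1 2', '3', '4 5', '6'],
        'osd': [0, 1, 0, 1],
    })
    example_submission = pd.DataFrame({
        'id': [0, 1, 2, 3],
        'categories': ['1 2 7', '3 8', '5 4', '9'],
        'osd': [0.1, 0.9, 0.2, 0.7],
    })
    # Prints the combined metric: 0.5 * (normalized AUC + MAP@k).
    print(score(example_solution, example_submission, row_id_column_name='id', k=20))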