import gc
import numpy as np
import pandas as pd
import pandas.api.types
import sklearn.metrics


class ParticipantVisibleError(Exception):
    pass


def apk(actual, predicted, k=20):
    """
    Compute the average precision at k.

    This function computes the average precision at k between two lists of
    items.

    Parameters
    ----------
    actual : list
        A list of elements that are to be predicted (order doesn't matter)
    predicted : list
        A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
        The average precision at k over the input lists
    """
    if not actual:
        return 0.0
    if len(predicted) > k:
        predicted = predicted[:k]
    score = 0.0
    num_hits = 0.0
    for i, p in enumerate(predicted):
        # Count p as a hit only on its first occurrence in the prediction.
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)
    return score / min(len(actual), k)
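

# Worked example (a hedged sketch, not part of the official metric file):
# with actual=[1, 2, 3] and predicted=[1, 4, 2], hits land at ranks 1 and 3,
# giving precisions 1/1 and 2/3, so AP@20 = (1 + 2/3) / min(3, 20) ≈ 0.556.
# >>> apk([1, 2, 3], [1, 4, 2])
# 0.5555555555555555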


def mapk(actual, predicted, k=20):
    """
    Compute the mean average precision at k.

    This function computes the mean average precision at k between two lists
    of lists of items.

    Parameters
    ----------
    actual : list
        A list of lists of elements that are to be predicted
        (order doesn't matter in the lists)
    predicted : list
        A list of lists of predicted elements
        (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
        The mean average precision at k over the input lists
    """
    return np.mean([apk(a, p, k) for a, p in zip(actual, predicted)])
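

# Worked example (hypothetical): mapk([[1], [2]], [[1], [3]]) averages the
# per-row AP@k values 1.0 and 0.0, so the mean average precision is 0.5.
# >>> mapk([[1], [2]], [[1], [3]])
# 0.5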


def prepare(df, out_of_sample_column_name):
    # Split the space-delimited category string into a list of category IDs
    # and coerce the out-of-sample indicator to float.
    df['categories'] = df['categories'].str.split(' ')
    df[out_of_sample_column_name] = df[out_of_sample_column_name].astype(float)
    return df
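

# Example (hypothetical input): prepare turns a row like
# {'categories': '1 2 3', 'osd': 1} into categories == ['1', '2', '3'] and
# osd == 1.0. Note the category IDs stay strings after the split; since both
# solution and submission pass through prepare, the comparison is consistent.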


def score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str,
          out_of_sample_column_name: str = 'osd', k: int = 20) -> float:
    """Metric for the FathomNet 2023 FGVC competition (46149).

    Computes the average of a MAP@k and a normalized AUC on an
    "out-of-sample" indicator.

    Parameters
    ----------
    solution : DataFrame with columns having for each instance:
        - categories: a space-delimited string of integer categories
          (split into a list by `prepare`)
        - osd: a binary out-of-sample indicator
    submission : DataFrame with the same columns as the solution, holding
        the predicted categories and out-of-sample scores
    row_id_column_name : str, the name of the column used to align rows
    out_of_sample_column_name : str, the name of the out-of-sample indicator
    k : int, the maximum number of predicted categories scored per row
    """
    if row_id_column_name not in submission.columns:
        raise ParticipantVisibleError('Submission file missing expected column ' + row_id_column_name)
    if row_id_column_name not in solution.columns:
        raise ParticipantVisibleError('Solution file missing expected column ' + row_id_column_name)
    # Align the two frames on the row ID before comparing them row-wise.
    solution = solution.sort_values(by=[row_id_column_name])
    submission = submission.sort_values(by=[row_id_column_name])
    if not (solution[row_id_column_name].values == submission[row_id_column_name].values).all():
        raise ParticipantVisibleError('The solution and submission row IDs are not identical')
    del solution[row_id_column_name]
    del submission[row_id_column_name]
    gc.collect()

    if out_of_sample_column_name is None:
        raise ParticipantVisibleError('out_of_sample_column_name cannot be None')
    missing_cols = solution.columns.difference(submission.columns)
    if len(missing_cols) > 0:
        raise ParticipantVisibleError('Submission file missing expected columns ' + ', '.join(missing_cols))

    solution, submission = prepare(solution, out_of_sample_column_name), prepare(submission, out_of_sample_column_name)

    # AUC on the out-of-sample indicator, rescaled from [0.5, 1] to [0, 1]
    # so that a random classifier (AUC 0.5) scores 0.
    oos_true = solution.pop(out_of_sample_column_name).to_numpy()
    oos_pred = submission.pop(out_of_sample_column_name).to_numpy()
    oos_score = sklearn.metrics.roc_auc_score(oos_true, oos_pred)
    normalized_oos_score = 2 * (oos_score - 0.5)

    # MAP@k on the category lists; after popping the indicator each frame
    # holds a single 'categories' column, so squeeze() yields a Series.
    solution = solution.squeeze().to_list()
    submission = submission.squeeze().to_list()
    cat_score = mapk(solution, submission, k=k)

    # Final score: equal-weight average of the two components.
    return 0.5 * (normalized_oos_score + cat_score)
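

if __name__ == '__main__':
    # Minimal smoke test (a hypothetical sketch with made-up data, not part
    # of the official harness): two rows keyed by 'id', space-delimited
    # category strings, and an 'osd' out-of-sample indicator/score.
    solution = pd.DataFrame({
        'id': [0, 1],
        'categories': ['1 2 3', '4'],
        'osd': [0, 1],
    })
    submission = pd.DataFrame({
        'id': [0, 1],
        'categories': ['1 4 2', '4 5'],
        'osd': [0.1, 0.9],
    })
    # The AUC component is 1.0 here; MAP@20 averages ~0.556 and 1.0, so the
    # final score is 0.5 * (1.0 + 0.7777...) ≈ 0.889.
    print(score(solution, submission, row_id_column_name='id'))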