Spaces:

willsh1997
/

reranker-v1

Runtime error

App Files Files Community

Aaron Snoswell committed on Apr 15, 2024

Commit

e57bd03

1 Parent(s): ed5173f

Tidy up code-path switching between neutralise and diversify ranker algorithm modes

Browse files

Files changed (2) hide show

my_web_app.py +6 -3
reranker.py +39 -16

my_web_app.py CHANGED Viewed

@@ -1,9 +1,10 @@
 from flask import Flask, jsonify, request
 from flask_cors import CORS
-from reranker import rankingfunc
-VERSION = 1.2
 app = Flask(__name__)
 CORS(app)
@@ -22,6 +23,8 @@ def version():
 @app.route('/rank', methods=['POST'])
 def perform_ranking():
     post_data = request.json
     debug = request.args.get('debug', type=bool, default=False)
     # Get session details
@@ -33,7 +36,7 @@ def perform_ranking():
     current_time: The current time according to the user's browser, in UTC, in YYYY-MM-DD hh:mm:ss format.
     """
-    results = rankingfunc(post_data, debug=debug)
     return jsonify(results)

 from flask import Flask, jsonify, request
 from flask_cors import CORS
+from reranker import rankingfunc, RankingModes
+VERSION = 1.3
 app = Flask(__name__)
 CORS(app)
 @app.route('/rank', methods=['POST'])
 def perform_ranking():
     post_data = request.json
+    mode = request.args.get('mode', type=str, default=RankingModes.DIVERSIFY)
+    k = request.args.get('k', type=int, default=10)
     debug = request.args.get('debug', type=bool, default=False)
     # Get session details
     current_time: The current time according to the user's browser, in UTC, in YYYY-MM-DD hh:mm:ss format.
     """
+    results = rankingfunc(post_data, k=k, mode=mode, debug=debug)
     return jsonify(results)

reranker.py CHANGED Viewed

@@ -1,24 +1,27 @@
 import os
-import requests
 import numpy as np
-from utils import *
-from numpy.linalg import norm
-from scipy.stats import rankdata
 from sklearn.utils.extmath import softmax
 from sentence_transformers import SentenceTransformer
-from copy import deepcopy
-import torch
-import torch.nn.functional as F
-#environment setup for HF docker image
 try:
     os.mkdir('./cache')
 except FileExistsError:
     # Use existing cache dir
     pass
 # Create embeddings from example texts
 # Guessing which environ var is correct
 os.environ['SENTENCE_TRANSFORMERS_HOME'] = './cache'
@@ -44,13 +47,17 @@ TARGET_DISTRIBUTION = [0.5, 0.5]
 # Controls the weight of the initial relevance score (0: ignore initial score, 1: only uses initial score)
 LAMBDA = 0.5
-#True to run diversification
-DIVERSIFY = True
 def fairScore(prob_scores:list, target:list) -> float:
     similarity = 1 - get_jsd_distance(prob_scores, target)
     return similarity
 def diversify(candidates: list, candidates_representation: dict, target: list) -> dict:
     accumulator = np.zeros(len(target))
     remaining = candidates.copy()
@@ -78,13 +85,16 @@ def diversify(candidates: list, candidates_representation: dict, target: list) -
     return diversified
-def rankingfunc(inputJSON: dict, debug: bool = False, k: int = 10) -> dict:
     '''
     Rank a set of social media posts using our ranking algorithm
     Inputs:
         inputJSON (dict): JSON dict from the web browser plugin, following the
             provided competition spec at https://github.com/HumanCompatibleAI/ranking-challenge
         debug (bool): If set, will also return extra debugging info in the return struct
     Returns:
@@ -92,9 +102,16 @@ def rankingfunc(inputJSON: dict, debug: bool = False, k: int = 10) -> dict:
             provided competition spec at https://github.com/HumanCompatibleAI/ranking-challenge
     '''
     # Extract text documents and get embeddings
     candidates = inputJSON['items']
-    k = min(k, len(candidates))
     if (debug):
        print("Reranking top ", k)
@@ -128,7 +145,7 @@ def rankingfunc(inputJSON: dict, debug: bool = False, k: int = 10) -> dict:
         print(initial_scores)
     diversity_scores = []
-    if (DIVERSIFY):
         diversity_scores = diversify(candidates, candidates_representation, TARGET_DISTRIBUTION)
     for index in range(len(candidates)):
@@ -138,12 +155,16 @@ def rankingfunc(inputJSON: dict, debug: bool = False, k: int = 10) -> dict:
         source = [(lw_cs[index] + 1.0) * 0.5, (rw_cs[index] + 1.0) * 0.5]
         source = F.softmax(torch.stack(source), dim=0)
         fairness = 0
-        if (DIVERSIFY):
             # Diversification:
             fairness =  diversity_scores[candidates[index]['id']]
-        else:
             # Neutralization:
             fairness = fairScore(source, TARGET_DISTRIBUTION)
         new_score = linearCombination(relevance, fairness, LAMBDA)
         candidates[index]['score'] = new_score
@@ -161,6 +182,8 @@ def rankingfunc(inputJSON: dict, debug: bool = False, k: int = 10) -> dict:
     final_ranking = reranked_ids
     output_results = {
         "ranked_ids": final_ranking,
         "new_items": []

 import os
+import torch
+import warnings
 import numpy as np
+import torch.nn.functional as F
+from enum import Enum
+from copy import deepcopy
 from sklearn.utils.extmath import softmax
 from sentence_transformers import SentenceTransformer
+from utils import *
+# Environment setup for HF docker image
 try:
     os.mkdir('./cache')
 except FileExistsError:
     # Use existing cache dir
     pass
 # Create embeddings from example texts
 # Guessing which environ var is correct
 os.environ['SENTENCE_TRANSFORMERS_HOME'] = './cache'
 # Controls the weight of the initial relevance score (0: ignore initial score, 1: only uses initial score)
 LAMBDA = 0.5
+# The different modes our ranking algorithm can run in
+class RankingModes(Enum):
+    DIVERSIFY = "diversify"
+    NEUTRALISE = "neutralise"
 def fairScore(prob_scores:list, target:list) -> float:
     similarity = 1 - get_jsd_distance(prob_scores, target)
     return similarity
 def diversify(candidates: list, candidates_representation: dict, target: list) -> dict:
     accumulator = np.zeros(len(target))
     remaining = candidates.copy()
     return diversified
+def rankingfunc(inputJSON: dict, k: int = 10, mode: str = RankingModes.DIVERSIFY, debug: bool = False) -> dict:
     '''
     Rank a set of social media posts using our ranking algorithm
     Inputs:
         inputJSON (dict): JSON dict from the web browser plugin, following the
             provided competition spec at https://github.com/HumanCompatibleAI/ranking-challenge
+        k (int): We only mess with the ranking of the first k items in the feed, to avoid
+            unduly reducing engagement.
+        mode (str): The ranker algorithm mode. Options include 'diversify' or 'neutralise'.
         debug (bool): If set, will also return extra debugging info in the return struct
     Returns:
             provided competition spec at https://github.com/HumanCompatibleAI/ranking-challenge
     '''
+    assert k > 0, "k must be a positive integer greater than 0, but was {k}"
+    assert mode in RankingModes, f"mode must be in {RankingModes}, but was {mode}"
     # Extract text documents and get embeddings
     candidates = inputJSON['items']
+    if len(candidates) < k:
+        warnings.warn(f"k truncated from {k} to {len(candidates)} due to only that many posts being passed")
+        k = min(k, len(candidates))
     if (debug):
        print("Reranking top ", k)
         print(initial_scores)
     diversity_scores = []
+    if mode == RankingModes.DIVERSIFY:
         diversity_scores = diversify(candidates, candidates_representation, TARGET_DISTRIBUTION)
     for index in range(len(candidates)):
         source = [(lw_cs[index] + 1.0) * 0.5, (rw_cs[index] + 1.0) * 0.5]
         source = F.softmax(torch.stack(source), dim=0)
         fairness = 0
+        if mode == RankingModes.DIVERSIFY:
             # Diversification:
             fairness =  diversity_scores[candidates[index]['id']]
+        elif mode == RankingModes.NEUTRALISE:
             # Neutralization:
             fairness = fairScore(source, TARGET_DISTRIBUTION)
+        else:
+            raise ValueError(f"Unknown ranking algorithm mode: {mode}")
         new_score = linearCombination(relevance, fairness, LAMBDA)
         candidates[index]['score'] = new_score
     final_ranking = reranked_ids
+    # TODO ajs 15/Apr/2024 Find a way to source high-quality out-of-feed posts, then incorporate them into the fusion algorithm
     output_results = {
         "ranked_ids": final_ranking,
         "new_items": []