In [1]:
import os
import requests
import numpy as np
from numpy.linalg import norm
from scipy.stats import rankdata
from sentence_transformers import SentenceTransformer
from copy import deepcopy

#sample data
from sample_data import BASIC_EXAMPLE

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Shared sentence-embedding model (the 'all-mpnet-base-v2' checkpoint); every
# encodingModel.encode(...) call in this notebook goes through this instance.
encodingModel = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')


In [3]:
#create embeddings from example texts

# Directory holding the two manifesto text files. Defaults to the original
# location; set the MANIFESTO_DIR environment variable to run this notebook
# on another machine without editing the cell.
MANIFESTO_DIR = os.environ.get(
    'MANIFESTO_DIR',
    '/mnt/c/Users/hew7/Documents/Git/ChaiProsocialRankingChallenge/flask-test',
)

# Encoding is given explicitly so the result does not depend on the platform
# default. Assumes the manifesto files are UTF-8 — TODO confirm.

#left wing
with open(os.path.join(MANIFESTO_DIR, 'manifesto-left.txt'), 'r', encoding='utf-8') as f:
    LeftWingStr=f.read()

#right wing
with open(os.path.join(MANIFESTO_DIR, 'manifesto-right.txt'), 'r', encoding='utf-8') as f:
    RightWingStr=f.read()

In [4]:
# Left-wing reference as a two-element list: [raw manifesto text, its embedding].
# NOTE(review): the whole manifesto is encoded in a single call; sentence-transformers
# models presumably truncate input beyond their maximum sequence length — confirm
# the embedding represents the full text (or chunk the text and pool the results).
LWPair=[LeftWingStr, encodingModel.encode(LeftWingStr)]

In [5]:
# Right-wing reference as a two-element list: [raw manifesto text, its embedding].
# NOTE(review): the whole manifesto is encoded in a single call; sentence-transformers
# models presumably truncate input beyond their maximum sequence length — confirm
# the embedding represents the full text (or chunk the text and pool the results).
RWPair=[RightWingStr, encodingModel.encode(RightWingStr)]

In [6]:
# Pull the 'text' field out of every entry in BASIC_EXAMPLE['items'] (sample payload
# imported from sample_data) to get a plain list of strings to embed.
example_texts = [x['text'] for x in BASIC_EXAMPLE['items']]

In [7]:
example_texts

['this is the worst thing I have ever seen!',
 'this is amazing!',
 'this thing is ok.']

In [8]:
# Encode all sample texts in one call. Presumably yields one embedding per text,
# in the same order as example_texts — the ranking cells below iterate over it that way.
embeddings = encodingModel.encode(example_texts)

In [9]:
#cosine similarity 

def cosineSim(x, y) -> float:
    '''
    Cosine similarity between two vectors.

    x, y: array-likes of equal length (converted with np.asarray).

    Returns dot(x, y) / (||x|| * ||y||). When either vector has zero norm the
    ratio is undefined; 0.0 is returned in that case instead of the nan (and
    RuntimeWarning) that the bare division would produce, so downstream
    ranking never receives nan scores.
    '''
    xArray = np.asarray(x)
    yArray = np.asarray(y)
    denominator = norm(xArray) * norm(yArray)
    if denominator == 0:
        # At least one all-zero vector: direction is undefined, treat as "no similarity".
        return 0.0
    return np.dot(xArray, yArray) / denominator

In [10]:
#ranking func, purely cosine similarity ----- KINDA JANKY
def cosineRank(lhs: list, rhs: list) -> list:
    '''
    Rank every candidate embedding in rhs by its cosine similarity to lhs.

    lhs: the reference embedding.
    rhs: iterable of candidate embeddings, each compared against lhs.

    Returns the zero-based ranks produced by scipy's rankdata, one per
    candidate and in the same order as rhs. The candidate with the lowest
    similarity gets rank 0; tied similarities share an averaged (possibly
    fractional) rank. The result is whatever rankdata returns minus one
    (a numpy array), not a plain Python list.
    '''
    scores = [cosineSim(lhs, vector) for vector in rhs]
    # rankdata ranks start at 1; shift so they start at 0.
    return rankdata(scores) - 1

In [11]:
def sort_text_cosine(LHSEmbedding, RHSEmbeddingList, RHSTextList) -> list:
    '''
    Return the texts reordered by the rank of their embeddings against LHSEmbedding.

    LHSEmbedding: reference embedding.
    RHSEmbeddingList: candidate embeddings, aligned index-for-index with RHSTextList.
    RHSTextList: the texts belonging to those embeddings.

    The ranks from cosineRank are printed (kept from the original cell, where it
    serves as a quick visual check) and the texts are returned in ascending rank
    order, i.e. the text whose embedding has rank 0 comes first.
    '''
    result_order = cosineRank(LHSEmbedding, RHSEmbeddingList)
    print(result_order)
    # result_order[i] is the RANK of item i, not the index of the item that belongs
    # at position i. Indexing the texts with the ranks directly yields the inverse
    # permutation and breaks on tied (fractional) ranks, so invert the ranking with
    # a stable argsort instead; ties keep their original relative order.
    sorted_indices = np.argsort(result_order, kind='stable')
    return [RHSTextList[int(position)] for position in sorted_indices]


In [12]:
# Demo: run the text sorter with the left-wing manifesto embedding (LWPair[1]) as the reference.
sort_text_cosine(LWPair[1],embeddings, example_texts)

[1. 0. 2.]


['this is amazing!',
 'this is the worst thing I have ever seen!',
 'this thing is ok.']

In [13]:
# Demo: same call with the right-wing manifesto embedding (RWPair[1]) as the reference.
sort_text_cosine(RWPair[1],embeddings, example_texts)

[0. 1. 2.]


['this is the worst thing I have ever seen!',
 'this is amazing!',
 'this thing is ok.']

In [14]:
#trying to write a function that inputs and outputs dicts (start to end for API)
def rankingfunc(inputJSON: dict, left_wing_user_ids=('193a9e01-8849-4e1f-a42a-a859fa7f2ad3',)) -> dict:
    '''
    Rerank the items of a request payload against the left- or right-wing reference text.

    inputJSON: request dict. Reads inputJSON['session']['user_id'] and
        inputJSON['items'], where every item is a dict with a 'text' key.
    left_wing_user_ids: user ids that are ranked against the left-wing
        reference (LWPair); every other user is ranked against the right-wing
        reference (RWPair). Defaults to the single id previously hard-coded here.

    Returns a deep copy of inputJSON whose 'items' list is reordered by
    ascending cosineRank rank (items with equal rank keep their original
    relative order). The input dict and its items are never modified, and no
    temporary keys are written to the items, so an existing 'rank' field on
    an item survives untouched.

    Raises KeyError if the expected keys are missing from inputJSON.
    '''
    #change LHS based on userID:
    if inputJSON['session']['user_id'] in left_wing_user_ids:
        LHS = LWPair
    else:
        LHS = RWPair

    #prepare data and get embeddings
    texts = [item['text'] for item in inputJSON['items']]
    embeddings = encodingModel.encode(texts)

    #rerank: sort item positions by rank rather than tagging the items themselves;
    #sorted() is stable, so ties keep the incoming order
    item_rank = cosineRank(LHS[1], embeddings)
    order = sorted(range(len(texts)), key=lambda index: item_rank[index])

    #prep data for export: reorder the *copied* items so the result shares no
    #objects with the caller's payload
    output_dict = deepcopy(inputJSON)
    copied_items = output_dict['items']
    output_dict['items'] = [copied_items[index] for index in order]

    return output_dict

        