File size: 3,450 Bytes
51245ea
 
 
 
 
4da8e4d
1ab13ba
d457c9f
 
 
 
 
 
 
 
 
51245ea
 
 
 
 
 
 
 
 
 
 
 
1ab13ba
d457c9f
 
51245ea
 
 
 
 
 
 
 
 
 
 
 
47eae45
51245ea
 
 
 
 
 
 
47eae45
 
51245ea
 
 
 
 
 
 
 
47eae45
51245ea
 
47eae45
51245ea
 
 
 
d457c9f
1ab13ba
 
 
 
d457c9f
51245ea
 
 
 
 
 
 
 
d457c9f
1ab13ba
d457c9f
 
 
 
 
 
 
1ab13ba
d457c9f
 
 
51245ea
 
 
 
 
 
 
 
 
 
 
47eae45
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import pickle
import sklearn.preprocessing as pp
from scipy.sparse import csr_matrix
import numpy as np
import pandas as pd
from scipy.sparse import vstack

def add_row_train(df, list_tid):
    """Append one playlist row to ``df`` in place and return the same frame.

    The new row is labelled with the last index label of ``df`` plus one.
    Its ``tid`` entry is ``list_tid`` and its ``pos`` entry is the list
    ``[0, 1, ..., len(list_tid) - 1]``.

    Args:
        df: Non-empty DataFrame whose last index label supports ``+ 1``.
            Presumably it has ``tid`` and ``pos`` columns -- verify against
            the stored training frames.
        list_tid: Sized sequence of track ids for the new row.

    Returns:
        The same ``df`` object that was passed in, now one row longer.

    Raises:
        IndexError: If ``df`` has no rows (there is no last label to extend).
    """
    next_pid = df.index[-1] + 1
    positions = [*range(len(list_tid))]
    # Assigning to a label that does not exist yet enlarges the frame.
    df.loc[next_pid] = {'tid': list_tid, 'pos': positions}
    return df


def inference_row(list_tid, ps_matrix):
    """Score one playlist against every row of ``ps_matrix``.

    A ``1 x n_songs`` sparse row is built with ones at the columns listed in
    ``list_tid``. Both that row and the rows of ``ps_matrix`` are normalised
    along axis 1 with ``pp.normalize`` (sklearn's default norm is L2, so the
    product below is a cosine similarity per playlist row).

    Args:
        list_tid: Column indices (track ids) of the query playlist; each must
            be smaller than ``ps_matrix.shape[1]``.
        ps_matrix: Playlist-by-song matrix; presumably a scipy sparse matrix,
            since callers call ``.toarray()`` on the returned product.

    Returns:
        A tuple ``(similarity, sparse_row)``: the product of the normalised
        query row with the transposed normalised matrix (shape
        ``1 x n_playlists``), and the un-normalised query row.
    """
    playlists_unit = pp.normalize(ps_matrix, axis=1)

    track_count = len(list_tid)
    song_count = ps_matrix.shape[1]
    ones = np.ones(track_count)
    row_index = np.zeros(track_count)  # every entry lands in row 0
    sparse_row = csr_matrix(
        (ones, (row_index, list_tid)),
        shape=(1, song_count),
    )

    query_unit = pp.normalize(sparse_row, axis=1)
    similarity = query_unit * playlists_unit.T
    return similarity, sparse_row


def get_best_tid(current_list, ps_matrix_row, K=50, MAX_tid=10):
    """Pick up to ``MAX_tid`` new track ids from the playlists most similar to ``current_list``.

    Steps:
      1. Load the base and "extra" playlist frames from HDF and concatenate
         them; rows are looked up by label with ``.loc``.
      2. Score ``current_list`` against every row of ``ps_matrix_row`` via
         ``inference_row`` and rank row positions by descending score.
      3. Walk the ranked candidates, collecting their ``tid`` entries in
         order while skipping tracks already in ``current_list`` and tracks
         already collected, until ``MAX_tid`` tracks have been gathered or
         the candidates run out.
      4. Append ``current_list`` as a new row of the extra frame and write
         that frame back to ``data_train/df_ps_train_extra.hdf``.

    Args:
        current_list: Track ids of the query playlist (must be hashable).
        ps_matrix_row: Playlist-by-song sparse matrix whose row positions are
            used as labels into the concatenated frames.
        K: Maximum number of similar playlists to consult.
        MAX_tid: Maximum number of track ids to return.

    Returns:
        A tuple ``(new_list, sparse_row)``: the recommended track ids and the
        un-normalised sparse row built for ``current_list``.

    Raises:
        KeyError: If a candidate row position has no matching label in the
            concatenated frames.
    """
    df_ps_train = pd.read_hdf('model/df_ps_train_new.hdf')
    df_ps_train_extra = pd.read_hdf('data_train/df_ps_train_extra.hdf')
    df_ps_train = pd.concat([df_ps_train, df_ps_train_extra])

    sim_vector, sparse_row = inference_row(current_list, ps_matrix_row)
    sim_vector = sim_vector.toarray()[0].tolist()

    # Rank row positions by similarity; sorted() is stable, so ties keep
    # their original row order.
    ranked = sorted(enumerate(sim_vector), key=lambda pair: pair[1], reverse=True)

    # NOTE(review): the best-scoring row is skipped (slice starts at 1),
    # presumably on the assumption that it is the query playlist itself.
    # inference_from_tid only appends the query row to the extra matrix
    # AFTER this call, so that assumption may not hold -- confirm.
    # The slice may yield fewer than K candidates when the matrix is small.
    topK_pid = [pid for pid, _ in ranked[1:K + 1]]

    # Iterate over the candidates that actually exist instead of indexing
    # topK_pid[n] up to K, which raised IndexError for short candidate lists
    # (fewer than K + 1 rows in the matrix, or K == 0).
    already_taken = set(current_list)
    new_list = []
    for top_pid in topK_pid:
        for tid in df_ps_train.loc[top_pid].tid:
            if tid not in already_taken:
                already_taken.add(tid)
                new_list.append(tid)

        # Stopping once the quota is reached yields the same result as
        # continuing: later playlists could only append after these tracks.
        if len(new_list) >= MAX_tid:
            new_list = new_list[:MAX_tid]
            break

    # Persist the query playlist so later calls can match against it.
    df_ps_train_extra = add_row_train(df_ps_train_extra, current_list)
    df_ps_train_extra.to_hdf('data_train/df_ps_train_extra.hdf', key='abc')

    return new_list, sparse_row


def inference_from_tid(list_tid, K=50, MAX_tid=10):
    """Recommend track ids for ``list_tid`` and record the query in the extra matrix.

    Loads the pickled base matrix and the pickled "extra" matrix, stacks them
    vertically, and hands the CSR form to ``get_best_tid``. The sparse row
    returned for the query is then stacked under the extra matrix, and the
    result is pickled back to ``data_mat/giantMatrix_extra.pickle``.

    Args:
        list_tid: Track ids of the query playlist.
        K: Forwarded to ``get_best_tid``.
        MAX_tid: Forwarded to ``get_best_tid``.

    Returns:
        The list of recommended track ids produced by ``get_best_tid``.
    """
    # Alternative base matrix used previously: 'data/giantMatrix_truth_new.pickle'
    with open('model/giantMatrix_new.pickle', 'rb') as base_file:
        base_matrix = pickle.load(base_file)

    extra_path = "data_mat/giantMatrix_extra.pickle"
    with open(extra_path, 'rb') as extra_file:
        extra_matrix = pickle.load(extra_file)

    combined = vstack((base_matrix, extra_matrix)).tocsr()
    result, sparse_row = get_best_tid(list_tid, combined, K, MAX_tid)

    # Grow the extra matrix by the query row so it is part of future lookups.
    grown_extra = vstack((extra_matrix, sparse_row.todok()))
    with open(extra_path, 'wb') as extra_file:
        pickle.dump(grown_extra, extra_file)

    return result


def inference_from_uri(list_uri, K=50, MAX_tid=10):
    """Recommend track URIs for a playlist given as a list of track URIs.

    URIs are translated to track ids with the pickled ``dict_uri2tid``
    mapping; URIs absent from that mapping are silently dropped. The ids are
    passed to ``inference_from_tid`` and the recommended ids are translated
    back with the pickled ``dict_tid2uri`` mapping.

    Args:
        list_uri: Track URIs of the query playlist.
        K: Forwarded to ``inference_from_tid``.
        MAX_tid: Forwarded to ``inference_from_tid``.

    Returns:
        List of recommended track URIs, in the order the ids were returned.

    Raises:
        KeyError: If a recommended track id is missing from ``dict_tid2uri``.
    """
    with open('model/dict_uri2tid.pkl', 'rb') as handle:
        uri_to_tid = pickle.load(handle)

    known_tids = []
    for uri in list_uri:
        if uri in uri_to_tid:
            known_tids.append(uri_to_tid[uri])

    recommended_tids = inference_from_tid(known_tids, K, MAX_tid)

    # The reverse mapping is loaded only after inference has completed.
    with open('model/dict_tid2uri.pkl', 'rb') as handle:
        tid_to_uri = pickle.load(handle)

    return [tid_to_uri[tid] for tid in recommended_tids]