File size: 3,316 Bytes
51245ea
 
 
 
 
4da8e4d
fa7f40e
9cb5f62
d457c9f
 
 
 
 
 
 
 
 
51245ea
 
 
 
 
 
 
 
 
 
 
fa7f40e
 
d457c9f
51245ea
 
 
 
 
 
 
 
 
 
 
 
47eae45
51245ea
 
 
 
 
 
 
47eae45
 
51245ea
 
 
 
 
 
 
 
47eae45
51245ea
 
47eae45
51245ea
 
 
 
d457c9f
1ab13ba
 
 
 
d457c9f
51245ea
 
 
 
9cb5f62
d457c9f
1ab13ba
d457c9f
 
fa7f40e
d457c9f
 
 
 
1ab13ba
d457c9f
 
 
51245ea
 
 
 
 
 
 
 
 
 
 
47eae45
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import pickle
import sklearn.preprocessing as pp
from scipy.sparse import csr_matrix
import numpy as np
import pandas as pd
from scipy.sparse import vstack
import global_var

def add_row_train(df, list_tid):
    """Append *list_tid* as a new playlist row to *df*, in place.

    The new row's pid is one past the current last index label (0 when
    *df* is empty — the previous version raised IndexError there via
    ``df.iloc[-1]``). The 'tid' cell gets a copy of the track-id list
    and 'pos' gets the matching positions [0, 1, ..., len-1].

    Returns the same (mutated) DataFrame so callers can rebind it.
    """
    # df.index[-1] is the same label as df.iloc[-1].name; guard empty frame.
    next_pid = df.index[-1] + 1 if len(df) else 0
    # Copy so later caller-side mutation of list_tid can't alias the stored row.
    tid_list = list(list_tid)
    df.loc[next_pid] = {'tid': tid_list, 'pos': list(range(len(tid_list)))}
    return df


def inference_row(list_tid, ps_matrix):
    """Score every playlist row of *ps_matrix* against a one-hot query.

    Builds a 1 x n_songs indicator row with a 1 in each column listed in
    *list_tid*, then returns:
      - the 1 x n_playlists cosine-similarity matrix (both sides are
        L2-normalised before the dot product), and
      - the raw (un-normalised) indicator row itself.
    """
    n_songs = ps_matrix.shape[1]
    n_hits = len(list_tid)
    # Sparse one-hot row: every hit lands in row 0, column = track id.
    query_row = csr_matrix(
        (np.ones(n_hits), (np.zeros(n_hits), list_tid)),
        shape=(1, n_songs),
    )
    # Row-normalising both operands makes the product a cosine similarity.
    similarity = pp.normalize(query_row, axis=1) * pp.normalize(ps_matrix, axis=1).T
    return similarity, query_row


def get_best_tid(current_list, ps_matrix_row, K=50, MAX_tid=10):
    """Greedily collect up to MAX_tid recommended track ids for *current_list*.

    Ranks every playlist row of *ps_matrix_row* by cosine similarity to the
    query (via inference_row), then walks the top-K playlists in order,
    pulling their tracks until MAX_tid new tracks are gathered. Tracks
    already in *current_list* and duplicates are dropped.

    Side effect: appends *current_list* as a new row to the on-disk extras
    frame 'data_train/df_ps_train_extra.hdf' and rewrites that file.

    Returns (new_list, sparse_row): the recommended track ids and the raw
    one-hot query row from inference_row.
    """
    # Training playlists = persisted originals (global_var) + on-disk extras.
    df_ps_train_extra = pd.read_hdf('data_train/df_ps_train_extra.hdf')
    df_ps_train = pd.concat([global_var.df_ps_train_ori,df_ps_train_extra])

    sim_vector, sparse_row = inference_row(current_list, ps_matrix_row)
    # Dense similarity scores, one per playlist row.
    sim_vector = sim_vector.toarray()[0].tolist()

    # Enumerate index and rating
    counter_list = list(enumerate(sim_vector, 0))

    # Sort by rating
    sortedList = sorted(counter_list, key=lambda x: x[1], reverse=True)

    # NOTE(review): [1:K+1] skips the single best match — presumably to
    # exclude a self-match, though the query row is only appended to the
    # matrix after this call (see inference_from_tid). TODO confirm intent.
    topK_pid = [i for i, _ in sortedList[1:K + 1]]

    n = 0
    new_list = []
    # Walk the top-K playlists in similarity order; stop as soon as more
    # than MAX_tid candidate tracks are accumulated, or after K playlists.
    while (1):

        top_pid = topK_pid[n]

        # .tid is expected to hold a list of track ids (it is concatenated
        # with new_list below) — see add_row_train's row format.
        add_tid_list = df_ps_train.loc[top_pid].tid

        # Form new list
        new_tid_list = new_list + add_tid_list
        # Drop tracks the user already has, then dedupe preserving order.
        new_tid_list = [x for x in new_tid_list if x not in current_list]
        new_tid_list = list(dict.fromkeys(new_tid_list))

        # Check number of songs and Add to data for prediction
        total_song = len(new_tid_list)
        #            print("n: {}\t total_song: {}".format(n,total_song))
        if (total_song > MAX_tid):
            new_tid_list = new_tid_list[:MAX_tid]
            # Add
            new_list = new_tid_list
            break
        else:
            new_list = new_tid_list
        n += 1
        if (n == K):
            break

    # Persist the query playlist as a new training row for future calls.
    df_ps_train_extra = add_row_train(df_ps_train_extra, current_list)


    df_ps_train_extra.to_hdf('data_train/df_ps_train_extra.hdf', key='abc')

    return new_list, sparse_row


def inference_from_tid(list_tid, K=50, MAX_tid=10):
    """Recommend track ids for the playlist *list_tid*.

    Loads the pickled extra playlist rows, stacks them under the original
    playlist-song matrix from global_var, and delegates the ranking to
    get_best_tid. The query's one-hot row is then appended to the extras
    matrix and the pickle is rewritten, so later queries see this one.
    """
    with open("data_mat/giantMatrix_extra.pickle",'rb') as f:
        extra_rows = pickle.load(f)

    # Full matrix = persisted originals + rows accumulated from past queries.
    full_matrix = vstack((global_var.ps_matrix_ori,extra_rows))

    best_tids, query_row = get_best_tid(list_tid, full_matrix.tocsr(), K, MAX_tid)

    # Remember this query for future similarity searches.
    extra_rows = vstack((extra_rows,query_row.todok()))
    with open("data_mat/giantMatrix_extra.pickle", 'wb') as f:
        pickle.dump(extra_rows, f)

    return best_tids


def inference_from_uri(list_uri, K=50, MAX_tid=10):
    """Recommend track URIs for a playlist given as a list of URIs.

    Maps URIs to internal track ids (silently dropping URIs absent from
    the pickled mapping), runs inference_from_tid, then maps the
    recommended ids back to URIs.
    """
    with open('model/dict_uri2tid.pkl', 'rb') as f:
        uri2tid = pickle.load(f)
    # Unknown URIs are skipped rather than raising KeyError.
    known_tids = [uri2tid[uri] for uri in list_uri if uri in uri2tid]

    recommended = inference_from_tid(known_tids, K, MAX_tid)

    with open('model/dict_tid2uri.pkl', 'rb') as f:
        tid2uri = pickle.load(f)
    return [tid2uri[tid] for tid in recommended]