# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np

from fuzzywuzzy import fuzz
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

import gradio as gr
from datasets import load_dataset

# Load the cleaned Steam 2019 dataset from the Hugging Face Hub and convert its
# (default) train split to a pandas DataFrame
dataset = load_dataset("seyia92coding/steam-clean-games-2019")
df = dataset["train"].to_pandas()

# Extract the release year from the first four characters of release_date
def extract_year(date):
    year = date[:4]
    if year.isnumeric():
        return int(year)
    return np.nan

df['year'] = df['release_date'].apply(extract_year)

# Hyphenate multi-word tags so each stays a single token, then turn the
# ';'-separated steamspy_tags into a space-separated 'genres' string for TF-IDF
df['steamspy_tags'] = df['steamspy_tags'].str.replace(' ', '-')
df['genres'] = df['steamspy_tags'].str.replace(';', ' ')
# Tally how often each genre tag appears across the catalogue (exploratory;
# the recommender itself does not use this dictionary)
counts = dict()
for i in df.index:
    for g in df.loc[i, 'genres'].split(' '):
        if g not in counts:
            counts[g] = 1
        else:
            counts[g] = counts[g] + 1
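# For a quick look at the most common tags you could, for example, run:
#   sorted(counts.items(), key=lambda kv: kv[1], reverse=True)[:10]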

# Fraction of ratings that are positive, rounded to two decimals
def create_score(row):
    pos_count = row['positive_ratings']
    neg_count = row['negative_ratings']
    total_count = pos_count + neg_count
    if total_count == 0:
        # Guard against games with no ratings at all
        return np.nan
    return round(pos_count / total_count, 2)

# Total number of ratings (positive + negative)
def total_ratings(row):
    return row['positive_ratings'] + row['negative_ratings']

df['total_ratings'] = df.apply(total_ratings, axis=1)
df['score'] = df.apply(create_score, axis=1)

# C: the mean score across all games; m: the minimum number of ratings required
# to be ranked (the 90th percentile of total_ratings), per the IMDB formula
C = df['score'].mean()
m = df['total_ratings'].quantile(0.90)

# Function that computes the weighted rating of each game
def weighted_rating(x, m=m, C=C):
    v = x['total_ratings']
    R = x['score']
    # Calculation based on the IMDB formula
    return round((v/(v+m) * R) + (m/(m+v) * C), 2)

# Define a new feature 'weighted_score' and calculate its value with `weighted_rating()`
df['weighted_score'] = df.apply(weighted_rating, axis=1)
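# Worked example with illustrative numbers (the real m and C are computed from
# the data above): a game with v = 1000 total ratings and score R = 0.90, with
# m = 500 and C = 0.72, gets (1000/1500)*0.90 + (500/1500)*0.72 = 0.60 + 0.24 = 0.84,
# so games with few ratings are pulled toward the catalogue mean C.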

# create an object for TfidfVectorizer
tfidf_vector = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vector.fit_transform(df['genres'])

# create the cosine similarity matrix
sim_matrix = linear_kernel(tfidf_matrix,tfidf_matrix)
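# TfidfVectorizer L2-normalises each row by default, so the dot products from
# linear_kernel are already cosine similarities; sklearn's cosine_similarity
# would produce the same matrix here, just a little more slowly.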

# Helper used below to score how well two title strings match
def matching_score(a, b):
    #fuzz.ratio(a, b) returns a Levenshtein-based similarity score from 0 (no match) to 100 (identical)
    return fuzz.ratio(a, b)
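# For example, fuzz.ratio("Portal", "Portal 2") scores roughly 86, while two
# unrelated titles score far lower, so the highest-scoring row is a reasonable
# guess at the title the user meant.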

"""# Make our Recommendation Engine

We need combine our formatted dataset with the similarity logic to return recommendations. This is also where we can fine-tune it if we do not like the results.
"""

##Helper functions that map a dataframe index (or title) to attributes of a game

#Convert index to release year
def get_title_year_from_index(index):
    return df[df.index == index]['year'].values[0]
#Convert index to title
def get_title_from_index(index):
    return df[df.index == index]['name'].values[0]
#Convert title to index
def get_index_from_title(title):
    return df[df.name == title].index.values[0]
#Convert index to score
def get_score_from_index(index):
    return df[df.index == index]['score'].values[0]
#Convert index to weighted score
def get_weighted_score_from_index(index):
    return df[df.index == index]['weighted_score'].values[0]
#Convert index to total_ratings
def get_total_ratings_from_index(index):
    return df[df.index == index]['total_ratings'].values[0]
#Convert index to platforms
def get_platform_from_index(index):
    return df[df.index == index]['platforms'].values[0]
   
# A function to return the stored title most similar to the text a user types
def find_closest_title(title):
    #matching_score(a, b) > a is the current row's name, b is the title we're trying to match
    leven_scores = list(enumerate(df['name'].apply(matching_score, b=title))) #[(0, 30), (1, 95), (2, 19), ...] one (index, score) tuple per title
    sorted_leven_scores = sorted(leven_scores, key=lambda x: x[1], reverse=True) #Sort the tuples by score, best match first: [(1, 95), (3, 49), (0, 30), ...]
    closest_title = get_title_from_index(sorted_leven_scores[0][0])
    distance_score = sorted_leven_scores[0][1]
    return closest_title, distance_score

def gradio_contents_based_recommender_v2(game, how_many, sort_option, min_year, platform, min_score):
  #Return closest game title match
  closest_title, distance_score = find_closest_title(game)
  #Create a Dataframe with these column headers
  recomm_df = pd.DataFrame(columns=['Game Title', 'Year', 'Score', 'Weighted Score', 'Total Ratings'])
  #find the corresponding index of the game title
  games_index = get_index_from_title(closest_title)
  #return a list of the most similar game indexes as a list
  games_list = list(enumerate(sim_matrix[int(games_index)]))
  #Sort list of similar games from top to bottom
  similar_games = list(filter(lambda x:x[0] != int(games_index), sorted(games_list,key=lambda x:x[1], reverse=True)))
  #Print the game title the similarity matrix is based on
  print('Here\'s the list of games similar to '+'\033[1m'+str(closest_title)+'\033[0m'+'.\n')
  #Only return the games that are on selected platform
  n_games = []
  for i,s in similar_games:
    if platform in get_platform_from_index(i):
      n_games.append((i,s))
  #Only return the games that are above the minimum score
  high_scores = []
  for i,s in n_games:
    if get_score_from_index(i) > min_score:
      high_scores.append((i,s))
    
  #Return the game tuple (game index, similarity score) and store it in a dataframe,
  #drawing from high_scores so the minimum-score filter above actually applies
  for i,s in high_scores[:how_many]:
    #Dataframe will contain attributes based on game index
    row = {'Game Title': get_title_from_index(i), 'Year': get_title_year_from_index(i), 'Score': get_score_from_index(i), 
           'Weighted Score': get_weighted_score_from_index(i), 
           'Total Ratings': get_total_ratings_from_index(i),}
    #Append each row to this dataframe (DataFrame.append was removed in pandas 2.x,
    #so use pd.concat instead)
    recomm_df = pd.concat([recomm_df, pd.DataFrame([row])], ignore_index=True)
  #Sort dataframe by Sort_Option provided by user
  recomm_df = recomm_df.sort_values(sort_option, ascending=False)
  #Only include games released same or after minimum year selected
  recomm_df = recomm_df[recomm_df['Year'] >= min_year]

  return recomm_df
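# A direct call, outside Gradio, would look something like (illustrative
# arguments, assuming the catalogue contains a close match for "Portal"):
#   gradio_contents_based_recommender_v2("Portal", 10, "Weighted Score", 2010, "windows", 0.7)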

#Create a sorted list of unique release years, dropping rows with no parseable
#year so the int() calls on the slider bounds below cannot hit NaN
years_sorted = sorted(df['year'].dropna().unique())

#The interface exposes one input per function parameter, with a dataframe output.
#gr.inputs.* was removed in recent Gradio releases, so the top-level gr.Slider /
#gr.Radio components are used. Since 'score' is a 0-1 ratio of positive ratings,
#the minimum-score slider runs from 0 to 1, and the Steam platforms column only
#lists windows, mac and linux.
recommender = gr.Interface(gradio_contents_based_recommender_v2,
                           ["text",
                            gr.Slider(1, 20, step=1),
                            gr.Radio(['Year', 'Score', 'Weighted Score', 'Total Ratings'], value='Weighted Score'),
                            gr.Slider(int(years_sorted[0]), int(years_sorted[-1]), step=1),
                            gr.Radio(['windows', 'mac', 'linux'], value='windows'),
                            gr.Slider(0, 1, step=0.05)],
                           "dataframe")

recommender.launch(debug=True)