seyia92coding committed on
Commit
8cc04c9
1 Parent(s): 415c027

Upload hugging_face_demo_v1.py

Browse files
Files changed (1) hide show
  1. hugging_face_demo_v1.py +176 -0
hugging_face_demo_v1.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """Hugging Face Demo V1.ipynb
3
+
4
+ Automatically generated by Colaboratory.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1UPgdrPyLAvEWpJifn7Y6eblkiM2yc0_3
8
+ """
9
+
10
+ import pandas as pd
11
+ import numpy as np
12
+ import re
13
+ import itertools
14
+ import matplotlib.pyplot as plt
15
+ from sklearn.feature_extraction.text import TfidfVectorizer
16
+ from sklearn.metrics.pairwise import linear_kernel
17
+ !pip install fuzzywuzzy
18
+ from fuzzywuzzy import fuzz
19
+ from sklearn.feature_extraction.text import TfidfVectorizer
20
+ !pip install gradio
21
+ import gradio as gr
22
+
23
# Location of the cleaned Steam games CSV that feeds the recommender.
clean_ratings_tags = "/content/steam-clean-games.csv"

# Skip malformed rows instead of aborting the load. `on_bad_lines='skip'`
# replaces `error_bad_lines=False`, which pandas deprecated in 1.3 and
# removed in 2.0.
df = pd.read_csv(clean_ratings_tags, on_bad_lines='skip', encoding='utf-8')
26
+
27
+ # the function to extract years
28
def extract_year(date):
    """Return the release year held in the first four characters of *date*.

    Args:
        date: Date text whose leading characters are expected to be the
            year (e.g. ``"2015-07-01"``).

    Returns:
        The year as an ``int``, or ``np.nan`` when *date* is not a string or
        its leading characters are not decimal digits.
    """
    # Non-string values (e.g. a float NaN for a missing cell) cannot be sliced.
    if not isinstance(date, str):
        return np.nan
    year = date[:4]
    # isdecimal() only accepts characters int() can parse; isnumeric() also
    # accepts e.g. '²' or '½', which would make int() raise ValueError.
    return int(year) if year.isdecimal() else np.nan
34
# Derive each game's release year from its release date.
df['year'] = df['release_date'].apply(extract_year)

# Hyphenate multi-word tags, then turn the ';' separators into spaces so that
# every 'genres' value is a space-separated list of tags.
df['steamspy_tags'] = df['steamspy_tags'].str.replace(' ', '-')
df['genres'] = df['steamspy_tags'].str.replace(';', ' ')

# Tally how many games carry each tag.
counts = {}
for tag_text in df['genres']:
    for tag in tag_text.split(' '):
        counts[tag] = counts.get(tag, 0) + 1
45
+
46
def create_score(row):
    """Return the share of positive ratings for one game, rounded to 2 decimals.

    Args:
        row: Mapping (e.g. a DataFrame row) providing the
            'positive_ratings' and 'negative_ratings' counts.

    Returns:
        ``positive / (positive + negative)`` rounded to two decimals, or
        ``np.nan`` when the game has no ratings at all.
    """
    pos_count = row['positive_ratings']
    total_count = pos_count + row['negative_ratings']
    # No ratings: the ratio is undefined. Returning NaN explicitly avoids a
    # ZeroDivisionError with plain ints and a runtime warning with NumPy ints.
    if total_count == 0:
        return np.nan
    return round(pos_count / total_count, 2)
52
+
53
def total_ratings(row):
    """Return the number of ratings (positive plus negative) for one game.

    Args:
        row: Mapping (e.g. a DataFrame row) providing the
            'positive_ratings' and 'negative_ratings' counts.
    """
    return row['positive_ratings'] + row['negative_ratings']
58
+
59
# Add the per-game rating volume and positive-rating share as columns.
df['total_ratings'] = df.apply(total_ratings, axis=1)
df['score'] = df.apply(create_score, axis=1)

# Inputs of the weighted rating: C is the mean score over all games and
# m is the 90th-percentile number of ratings.
C = df['score'].mean()
m = df['total_ratings'].quantile(0.90)
65
+
66
+ # Function that computes the weighted rating of each game
67
def weighted_rating(x, m=m, C=C):
    """Return the weighted rating of one game, rounded to 2 decimals.

    Blends the game's own score with the value C, weighting them by
    v / (v + m) and m / (m + v) respectively, where v is the game's
    'total_ratings' (the formula the original comment attributes to IMDB).

    Args:
        x: Mapping (e.g. a DataFrame row) with 'total_ratings' and 'score'.
        m: Rating-count constant of the formula; defaults to the module-level m.
        C: Score the result is pulled towards; defaults to the module-level C.
    """
    votes = x['total_ratings']
    rating = x['score']
    own_weight = votes / (votes + m)
    prior_weight = m / (m + votes)
    return round((own_weight * rating) + (prior_weight * C), 2)
72
+
73
# Add the 'weighted_score' feature, computed per row by `weighted_rating()`.
df['weighted_score'] = df.apply(weighted_rating, axis=1)

# Vectorise the space-separated tag lists with TF-IDF.
# NOTE(review): the vectorizer's default token pattern presumably splits on
# '-', so the hyphenated multi-word tags may be tokenised as separate words —
# confirm that this is intended.
tfidf_vector = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vector.fit_transform(df['genres'])

# Pairwise similarity between all games (row i / column j follow df's row order).
sim_matrix = linear_kernel(tfidf_matrix, tfidf_matrix)
82
+
83
+ # create a function to find the closest title
84
def matching_score(a, b):
    """Return the `fuzz.ratio` similarity score between strings *a* and *b*.

    The score is derived from the Levenshtein distance between the two
    strings; a higher value means a closer match.
    """
    similarity = fuzz.ratio(a, b)
    return similarity
87
+
88
+ """# Make our Recommendation Engine
89
+
90
+ We need to combine our formatted dataset with the similarity logic to return recommendations. This is also where we can fine-tune it if we do not like the results.
91
+ """
92
+
93
+ ## These functions are needed to return different attributes of the recommended game titles
94
+
95
+ #Convert index to title_year
96
def get_title_year_from_index(index):
    """Return the 'year' value of the first row of df whose index equals *index*."""
    matches = df.loc[df.index == index, 'year']
    return matches.values[0]
98
+ #Convert index to title
99
def get_title_from_index(index):
    """Return the 'name' value of the first row of df whose index equals *index*."""
    matches = df.loc[df.index == index, 'name']
    return matches.values[0]
101
+ #Convert title to index
102
def get_index_from_title(title):
    """Return the index label of the first row of df whose 'name' equals *title*."""
    matches = df.loc[df['name'] == title]
    return matches.index.values[0]
104
+ #Convert index to score
105
def get_score_from_index(index):
    """Return the 'score' value of the first row of df whose index equals *index*."""
    matches = df.loc[df.index == index, 'score']
    return matches.values[0]
107
+ #Convert index to weighted score
108
def get_weighted_score_from_index(index):
    """Return the 'weighted_score' value of the first row of df whose index equals *index*."""
    matches = df.loc[df.index == index, 'weighted_score']
    return matches.values[0]
110
+ #Convert index to total_ratings
111
def get_total_ratings_from_index(index):
    """Return the 'total_ratings' value of the first row of df whose index equals *index*."""
    matches = df.loc[df.index == index, 'total_ratings']
    return matches.values[0]
113
+ #Convert index to platform
114
def get_platform_from_index(index):
    """Return the 'platforms' value of the first row of df whose index equals *index*."""
    matches = df.loc[df.index == index, 'platforms']
    return matches.values[0]
116
+
117
+ # A function to return the title most similar to the words a user types
118
def find_closest_title(title):
    """Return the stored game name that best matches *title*.

    Args:
        title: Free text typed by the user.

    Returns:
        Tuple ``(closest_title, distance_score)``: the 'name' value of df with
        the highest `matching_score` against *title* (the first such row on
        ties) and that score.
    """
    # matching_score(a, b): a is the name in the current row, b is the text
    # we are trying to match.
    leven_scores = df['name'].apply(matching_score, b=title)
    # Take the maximum directly instead of sorting every score, and stay
    # positional throughout so the lookup does not depend on df's index labels.
    best_pos = int(leven_scores.to_numpy().argmax())
    closest_title = df['name'].iloc[best_pos]
    distance_score = int(leven_scores.iloc[best_pos])
    return closest_title, distance_score
125
+
126
def gradio_contents_based_recommender_v2(game, how_many, sort_option, min_year, platform, min_score):
    """Recommend the games whose tags are most similar to the closest match of *game*.

    All filters (platform, minimum score, minimum year) are applied before
    the list is cut to *how_many*, so up to *how_many* games that satisfy
    every filter are returned.

    Args:
        game: Free-text title; the closest stored name (see
            `find_closest_title`) is used as the reference game.
        how_many: Maximum number of recommendations (converted with ``int``).
        sort_option: Output column to sort by in descending order ('Year',
            'Score', 'Weighted Score' or 'Total Ratings'); a falsy value
            keeps the similarity order.
        min_year: Earliest release year to include (inclusive).
        platform: Text that must occur in the game's 'platforms' value; a
            falsy value disables the platform filter.
        min_score: Only games whose 'score' is strictly greater are kept.

    Returns:
        DataFrame with the columns 'Game Title', 'Year', 'Score',
        'Weighted Score' and 'Total Ratings'.
    """
    #Return closest game title match
    closest_title, _ = find_closest_title(game)
    #find the corresponding index of the game title
    # NOTE(review): as in the original, this label is used as a row position
    # in sim_matrix, which presumes df has its default 0..n-1 index.
    games_index = int(get_index_from_title(closest_title))
    #Print the game title the similarity matrix is based on
    print('Here\'s the list of games similar to '+'\033[1m'+str(closest_title)+'\033[0m'+'.\n')

    similarity = np.asarray(sim_matrix[games_index])

    # Boolean row mask: start with every game except the reference game itself.
    keep = np.arange(len(df)) != games_index
    #Only return the games that are on selected platform
    if platform:
        keep &= df['platforms'].str.contains(platform, regex=False, na=False).to_numpy(dtype=bool)
    #Only return the games that are above the minimum score
    if min_score is not None:
        keep &= (df['score'] > min_score).to_numpy()
    #Only include games released same or after minimum year selected
    if min_year is not None:
        keep &= (df['year'] >= min_year).to_numpy()

    # Rank the surviving rows by similarity, highest first; the stable sort
    # keeps ties in their original row order.
    positions = np.flatnonzero(keep)
    ranked = positions[np.argsort(-similarity[positions], kind='stable')]
    top = df.iloc[ranked[:int(how_many)]]

    # Build the result in one go (DataFrame.append was removed in pandas 2.0).
    recomm_df = pd.DataFrame({
        'Game Title': top['name'].to_numpy(),
        'Year': top['year'].to_numpy(),
        'Score': top['score'].to_numpy(),
        'Weighted Score': top['weighted_score'].to_numpy(),
        'Total Ratings': top['total_ratings'].to_numpy(),
    })
    #Sort dataframe by Sort_Option provided by user
    if sort_option:
        recomm_df = recomm_df.sort_values(sort_option, ascending=False)

    return recomm_df
164
+
165
+ #Create list of unique calendar years based on main df column
166
# Unique release years in ascending order. NaN years (dates that could not be
# parsed) are dropped: NaN breaks the sort order and int(NaN) raises
# ValueError when the slider bounds are computed below.
years_sorted = sorted(df['year'].dropna().unique())

#Interface will include these buttons based on parameters in the function with a dataframe output
recommender = gr.Interface(
    gradio_contents_based_recommender_v2,
    [
        "text",
        gr.inputs.Slider(1, 20, step=int(1)),
        gr.inputs.Radio(['Year', 'Score', 'Weighted Score', 'Total Ratings']),
        gr.inputs.Slider(int(years_sorted[0]), int(years_sorted[-1]), step=int(1)),
        gr.inputs.Radio(['windows', 'xbox', 'playstation', 'linux', 'mac']),
        # 'score' is a positive-rating ratio between 0 and 1 rounded to two
        # decimals, so the minimum-score slider uses that same scale.
        gr.inputs.Slider(0, 1, step=0.01),
    ],
    "dataframe",
)

recommender.launch(debug=True)