Wintersmith committed on
Commit
3b528be
1 Parent(s): dd9a11d

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +44 -0
  2. recommender_system.py +80 -0
  3. requirements.txt +0 -0
app.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import boto3
4
+ import dotenv
5
+ import os
6
+
7
+ from recommender_system import match_books, recommend_books
8
+
9
+ dotenv.load_dotenv()
10
+
11
# Build the S3 client from credentials found in the process environment
# (os.environ.get yields None for a variable that is not set).
aws_credentials = {
    'aws_access_key_id': os.environ.get('AWS_ACCESS_KEY_ID'),
    'aws_secret_access_key': os.environ.get('AWS_SECRET_ACCESS_KEY'),
}
s3 = boto3.client('s3', **aws_credentials)

# Fetch the ratings CSV once at import time; `dataframe` is the module-level
# table that the Gradio callback passes to the recommender functions.
bucket_name = 'martinbucket1'
obj_data = s3.get_object(Bucket=bucket_name, Key="Processed_data.csv")
dataframe = pd.read_csv(
    obj_data["Body"],
    encoding='cp1251',
    sep=',',
    low_memory=False,
)
18
+
19
+
20
def recommend_books_interface(selected_book) -> tuple:
    """Gradio callback: resolve the typed title and fetch its recommendations.

    The input is matched against the titles in the module-level ``dataframe``
    via ``match_books``. When a title is found, ``recommend_books`` is run for
    it.

    Returns:
        A ``(table, message)`` pair. On success the table is the DataFrame
        returned by ``recommend_books`` and the message names the matched
        title; otherwise the table is a one-row error frame and the message
        is "No books found".
    """
    matched_title = match_books(selected_book, dataframe)

    # Guard clause: nothing matched, so report the failure in both outputs.
    if not matched_title:
        no_match_table = pd.DataFrame({"Error": ["No matching book found"]})
        return no_match_table, "No books found"

    recommendations = recommend_books(dataframe, matched_title)
    return recommendations, f"Recommending these books based on your interest in: {matched_title}"
28
+
29
+
30
# Gradio interface: one text input, a results table and a status message.
inputs = gr.Textbox(lines=1, placeholder="Type a book title here...")
message_output = gr.Markdown()
outputs = gr.Dataframe()

# The callback returns (table, message), so the output components are listed
# in that same order.
interface_options = {
    'fn': recommend_books_interface,
    'inputs': inputs,
    'outputs': [outputs, message_output],
    'title': "Book Recommender System",
    'description': "Enter a book title to get recommendations based on similarity.",
    'fill_width': True,
    'flagging_mode': 'never',
    'theme': gr.themes.Soft(),
}
demo = gr.Interface(**interface_options)


if __name__ == "__main__":
    # Only start the server when this file is run as a script.
    demo.launch(share=True)
recommender_system.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from fuzzywuzzy import process
4
+
5
+
6
def match_books(user_input: str, df: pd.DataFrame, min_score: float = 0.8):
    """Return the title in ``df['Book-Title']`` that best matches ``user_input``.

    Args:
        user_input: Free-text title typed by the user.
        df: Frame with a ``'Book-Title'`` column to search.
        min_score: Minimum acceptable similarity. fuzzywuzzy reports scores as
            integers from 0 to 100, so a value in ``[0, 1]`` is treated as a
            fraction and scaled (the default ``0.8`` means a score of 80).
            Values above 1 are used as-is on the 0-100 scale.

    Returns:
        The best-matching title, or ``None`` when the input is blank, there
        are no titles to search, or the best score is below the cutoff.
    """
    # Blank input cannot match anything meaningful; skip the fuzzy search.
    if not user_input or not user_input.strip():
        return None

    # Drop missing titles so only real strings are handed to the matcher.
    book_titles = df['Book-Title'].dropna().unique()
    if len(book_titles) == 0:
        return None

    # Put the threshold on the same 0-100 scale as the fuzzywuzzy score;
    # comparing a 0-100 score against 0.8 would accept almost any match.
    cutoff = min_score * 100 if 0 <= min_score <= 1 else min_score

    best_match = process.extractOne(user_input, book_titles)
    if best_match is None or best_match[1] < cutoff:
        return None
    return best_match[0]
16
+
17
+
18
def recommend_books(df: pd.DataFrame, book_to_be_recommended: str) -> pd.DataFrame:
    """Recommend books whose ratings correlate with those of a given book.

    The function finds every user who rated ``book_to_be_recommended``,
    collects all books those users rated, keeps the most frequently rated
    candidates, and computes the Pearson correlation between the target
    book's ratings and each candidate's ratings.

    Args:
        df: Ratings table with ``'User-ID'``, ``'Book-Title'`` and
            ``'Book-Rating'`` columns.
        book_to_be_recommended: Exact title of the book to base the
            recommendations on.

    Returns:
        A DataFrame with columns ``'Book-Title'``, ``'Correlation'`` and
        ``'Average ratings'``, sorted by correlation (descending), excluding
        the target book, and limited to 10 rows. The frame is empty when the
        target book has no usable ratings in ``df``.
    """
    result_columns = ['Book-Title', 'Correlation', 'Average ratings']

    # Users who rated the target book.
    is_target = df['Book-Title'] == book_to_be_recommended
    book_readers = df.loc[is_target, 'User-ID'].unique()
    if len(book_readers) == 0:
        return pd.DataFrame(columns=result_columns)

    # Every rating made by those users, and how many ratings each title got.
    books_of_book_readers = df[df['User-ID'].isin(book_readers)]
    ratings_per_book = books_of_book_readers.groupby('Book-Title')['User-ID'].count()

    # Use the highest rating-count cutoff that still leaves at least 12
    # candidate titles: that cutoff is the 12th-largest count. With fewer
    # than 12 titles, every title is kept.
    min_candidates = 12
    if len(ratings_per_book) >= min_candidates:
        cutoff = ratings_per_book.nlargest(min_candidates).iloc[-1]
    else:
        cutoff = 0

    # Always keep the target itself: duplicate rating rows can push other
    # titles above it, and it must survive to serve as the reference column.
    keep = (ratings_per_book >= cutoff) | (ratings_per_book.index == book_to_be_recommended)
    books_to_compare = ratings_per_book[keep].index

    ratings_data_raw = books_of_book_readers.loc[
        books_of_book_readers['Book-Title'].isin(books_to_compare),
        ['User-ID', 'Book-Rating', 'Book-Title'],
    ]

    # Collapse repeated (user, book) ratings to their mean so the pivot below
    # has exactly one value per cell.
    ratings_data_raw_nodup = (
        ratings_data_raw
        .groupby(['User-ID', 'Book-Title'])['Book-Rating']
        .mean()
        .reset_index()
    )

    # Rows are users, columns are titles, cells are ratings (NaN if unrated).
    dataset_for_corr = ratings_data_raw_nodup.pivot(
        index='User-ID', columns='Book-Title', values='Book-Rating')
    if book_to_be_recommended not in dataset_for_corr.columns:
        return pd.DataFrame(columns=result_columns)

    # Pearson correlation of every title's column against the target's column.
    correlations = dataset_for_corr.corrwith(
        dataset_for_corr[book_to_be_recommended], method='pearson')

    # Average rating of each candidate among the target book's readers.
    average_ratings = (
        ratings_data_raw_nodup.groupby('Book-Title')['Book-Rating'].mean().reset_index()
    )

    correlations_df = pd.DataFrame({
        'Book-Title': correlations.index,
        'Correlation': correlations.values,
    })
    correlations_df = pd.merge(correlations_df, average_ratings, on='Book-Title')
    correlations_df = correlations_df.rename(columns={'Book-Rating': 'Average ratings'})

    # Best correlations first; the target book itself is not a recommendation.
    correlations_df = correlations_df.sort_values('Correlation', ascending=False)
    correlations_df = correlations_df[correlations_df['Book-Title'] != book_to_be_recommended]
    return correlations_df.head(10)
requirements.txt ADDED
Binary file (2.33 kB). View file