# book-recommender/data_preprocessing.py
import pandas as pd
import numpy as np
# Function Definitions
# Load a CSV file into a DataFrame
def load_data(file):
    return pd.read_csv(file, index_col=False)
# Handle duplicate rows
def remove_duplicate_rows(df):
    n_before = len(df)  # count before dropping so the report is accurate
    df = df.drop_duplicates()
    print("Number of removed duplicated rows:", n_before - len(df))
    return df
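# Minimal usage sketch (hypothetical toy frame; not called by the pipeline):
def _demo_remove_duplicate_rows():
    toy = pd.DataFrame({'book_id': [1, 1, 2], 'title': ['A', 'A', 'B']})
    deduped = remove_duplicate_rows(toy)  # prints: Number of removed duplicated rows: 1
    return deduped  # two rows remain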
# One-hot encode categorical columns
def onehot_encoder(df, cols):
    encoded_cols = []
    for col in cols:
        # Prefix the dummy columns with the source column name to avoid collisions
        dummies = pd.get_dummies(df[col], prefix=col)
        encoded_cols += list(dummies.columns)
        df = df.join(dummies)
        df = df.drop(columns=[col])
    return df, encoded_cols
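# Minimal usage sketch (hypothetical values for 'language_code'):
def _demo_onehot_encoder():
    toy = pd.DataFrame({'language_code': ['eng', 'fre', 'eng']})
    encoded, cols = onehot_encoder(toy, ['language_code'])
    # cols == ['language_code_eng', 'language_code_fre']; 'language_code' itself is dropped
    return encoded, cols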
# Fill NaN values in the specified columns
def fillna_values(df, cols, strategy='mean'):
    for col in cols:
        if strategy == 'median':
            df[col] = df[col].fillna(df[col].median())
        elif strategy == 'mean':
            df[col] = df[col].fillna(df[col].mean())
        else:
            raise ValueError('Invalid filling strategy')
    return df
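# Minimal usage sketch (hypothetical ratings column with a gap):
def _demo_fillna_values():
    toy = pd.DataFrame({'average_rating': [4.0, np.nan, 5.0]})
    filled = fillna_values(toy, ['average_rating'], strategy='mean')
    # The NaN becomes the column mean: (4.0 + 5.0) / 2 == 4.5
    return filled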
# Preprocess books dataset
def preprocess_books(books):
    # Drop duplicates
    books = remove_duplicate_rows(books)
    # Get categorical columns
    cat_cols = ['language_code']
    # One-hot encode categoricals
    books, _ = onehot_encoder(books, cat_cols)
    # Fill NAs
    fillna_cols = ['average_rating', 'ratings_count', 'work_ratings_count', 'work_text_reviews_count']
    books = fillna_values(books, fillna_cols, strategy='mean')
    return books
# Preprocess tags dataset
def preprocess_tags(tags):
    return tags
# Preprocess book_tags dataset
def preprocess_book_tags(book_tags, tags):
    # Map tag_id to tag_name instead of dropping the column; tag names live
    # in the separate tags table, so it is passed in explicitly
    tag_mapping = dict(zip(tags["tag_id"], tags["tag_name"]))
    book_tags["tag_name"] = book_tags["tag_id"].map(tag_mapping)
    # Group by book and aggregate the tag counts
    agg_funcs = {'count': 'sum'}  # sum, or other functions according to requirement
    book_tags = book_tags.groupby(['goodreads_book_id'], as_index=False).agg(agg_funcs)
    return book_tags
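# Minimal usage sketch (assumes the standard goodbooks-10k schema:
# book_tags has goodreads_book_id/tag_id/count, tags has tag_id/tag_name):
def _demo_preprocess_book_tags():
    toy_book_tags = pd.DataFrame({'goodreads_book_id': [1, 1, 2],
                                  'tag_id': [10, 11, 10],
                                  'count': [5, 3, 7]})
    toy_tags = pd.DataFrame({'tag_id': [10, 11], 'tag_name': ['fiction', 'fantasy']})
    # Book 1 aggregates to a total tag count of 8, book 2 to 7
    return preprocess_book_tags(toy_book_tags, toy_tags)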
# Preprocess goodbooks-10k dataset
def preprocess_goodbooks(goodbooks):
    # Soften extreme ratings: compress ratings above the threshold with a 0.5
    # slope so the mapping stays continuous at the threshold and within [0, 5]
    scaling_threshold = 4.5
    goodbooks['scaled_rating'] = np.where(
        goodbooks['rating'] > scaling_threshold,
        scaling_threshold + 0.5 * (goodbooks['rating'] - scaling_threshold),
        goodbooks['rating']
    )
    return goodbooks
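# Worked example of the compression above (hypothetical ratings):
def _demo_preprocess_goodbooks():
    toy = pd.DataFrame({'rating': [3.0, 4.5, 4.8, 5.0]})
    scaled = preprocess_goodbooks(toy)['scaled_rating']
    # 3.0 and 4.5 pass through unchanged; 4.8 -> 4.65; 5.0 -> 4.75
    return scaled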
# Merge and save dataset
def merge_and_save_dataset():
    # Read files
    files = {
        'books': '../data/books.csv',
        'book_tags': '../data/book_tags.csv',
        'goodbooks': '../data/goodbooks-10k.csv',
        'ratings': '../data/ratings.csv',
        'tags': '../data/tags.csv',
        'to_read': '../data/to_read.csv'
    }
    # Load and preprocess the tables used in the merge
    preprocessed_books = preprocess_books(load_data(files['books']))
    preprocessed_tags = preprocess_tags(load_data(files['tags']))
    preprocessed_book_tags = preprocess_book_tags(load_data(files['book_tags']), preprocessed_tags)
    preprocessed_goodbooks = preprocess_goodbooks(load_data(files['goodbooks']))
    # Join books to their aggregated tags, then to the goodbooks ratings
    merged_dataset = pd.merge(
        left=pd.merge(preprocessed_books, preprocessed_book_tags,
                      left_index=True, right_on="goodreads_book_id"),
        right=preprocessed_goodbooks,
        left_index=True, right_index=True
    )
    # Drop any duplicated columns introduced by the merges
    merged_dataset = merged_dataset.loc[:, ~merged_dataset.columns.duplicated()]
    # Save the final dataset
    merged_dataset.to_csv("../data/final_dataset.csv", index=False)
if __name__ == "__main__":
    merge_and_save_dataset()