import pandas as pd
import numpy as np


def load_data(file):
    """Load a CSV file into a DataFrame without creating an extra index column."""
    return pd.read_csv(file, index_col=False)
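
# Example usage (illustrative; the path is taken from merge_and_save_dataset below):
#   books = load_data('../data/books.csv')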


def remove_duplicate_rows(df):
    # Count duplicates before dropping them; counting afterwards always yields 0.
    n_duplicates = len(df) - len(df.drop_duplicates())
    df = df.drop_duplicates()
    print("Number of removed duplicated rows:", n_duplicates)
    return df


def onehot_encoder(df, cols):
    encoded_cols = []
    for col in cols:
        # Prefix dummy columns with the source column name so categories
        # from different columns cannot collide on join.
        encoder = pd.get_dummies(df[col], prefix=col)
        encoded_cols += list(encoder.columns)
        df = df.join(encoder)
        del df[col]

    return df, encoded_cols
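
# Illustrative sketch (toy data, not part of the pipeline): given a frame with
#   language_code = ["eng", "fre", "eng"]
# onehot_encoder(df, ["language_code"]) replaces the column with indicator
# columns language_code_eng and language_code_fre.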


def fillna_values(df, cols, strategy='mean'):
    for col in cols:
        # Assign the filled column back instead of using inplace=True, which
        # is unreliable on a column selection in recent pandas versions.
        if strategy == 'median':
            df[col] = df[col].fillna(df[col].median())
        elif strategy == 'mean':
            df[col] = df[col].fillna(df[col].mean())
        else:
            raise ValueError('Invalid filling strategy')

    return df
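
# Example usage (column name taken from preprocess_books below):
#   books = fillna_values(books, ['average_rating'], strategy='median')
# fills missing average_rating values with the column median.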


def preprocess_books(books):
    books = remove_duplicate_rows(books)

    # One-hot encode the categorical columns.
    cat_cols = ['language_code']
    books, _ = onehot_encoder(books, cat_cols)

    # Impute missing numeric values with the column mean.
    fillna_cols = ['average_rating', 'ratings_count', 'work_ratings_count', 'work_text_reviews_count']
    books = fillna_values(books, fillna_cols, strategy='mean')

    return books


def preprocess_tags(tags):
    # The tags table is used as-is; no cleaning is applied.
    return tags


def preprocess_book_tags(book_tags, tags):
    # book_tags itself has no tag_name column, so the id-to-name mapping must
    # come from the tags table.
    tag_mapping = dict(zip(tags["tag_id"], tags["tag_name"]))
    book_tags["tag_name"] = book_tags["tag_id"].apply(lambda x: tag_mapping.get(x, None))

    # Aggregate tag counts per book; only goodreads_book_id and the summed
    # count survive the groupby.
    agg_funcs = {'count': 'sum'}
    book_tags = book_tags.groupby(['goodreads_book_id'], as_index=False).agg(agg_funcs)

    return book_tags
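
# Illustrative toy example: rows (goodreads_book_id=1, tag_id=5, count=10) and
# (goodreads_book_id=1, tag_id=7, count=3) collapse into a single row
# (goodreads_book_id=1, count=13) after the groupby/sum above.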


def preprocess_goodbooks(goodbooks):
    # Rescale ratings above the threshold with a piecewise-linear transform;
    # ratings at or below the threshold pass through unchanged.
    scaling_threshold = 4.5
    goodbooks['scaled_rating'] = np.where(
        goodbooks['rating'] > scaling_threshold,
        scaling_threshold - 0.5
        + ((scaling_threshold - 0.5) / (5 - scaling_threshold))
        * (goodbooks['rating'] - scaling_threshold),
        goodbooks['rating'],
    )

    return goodbooks
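
# Worked example of the transform above: with scaling_threshold = 4.5, a
# rating of 4.75 maps to 4.0 + (4.0 / 0.5) * 0.25 = 6.0, while a rating of
# 4.5 or below is kept as-is.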


def merge_and_save_dataset():
    files = {
        'books': '../data/books.csv',
        'book_tags': '../data/book_tags.csv',
        'goodbooks': '../data/goodbooks-10k.csv',
        'ratings': '../data/ratings.csv',
        'tags': '../data/tags.csv',
        'to_read': '../data/to_read.csv'
    }

    # Load and preprocess the tables that take part in the merge.
    preprocessed_books = preprocess_books(load_data(files['books']))
    tags = preprocess_tags(load_data(files['tags']))
    preprocessed_book_tags = preprocess_book_tags(load_data(files['book_tags']), tags)
    preprocessed_goodbooks = preprocess_goodbooks(load_data(files['goodbooks']))

    # Join books with their aggregated tag counts, then with the goodbooks table.
    merged_dataset = pd.merge(
        left=pd.merge(preprocessed_books, preprocessed_book_tags,
                      left_index=True, right_on="goodreads_book_id"),
        right=preprocessed_goodbooks,
        left_index=True,
        right_index=True,
    )

    # Drop any columns duplicated by the joins.
    merged_dataset = merged_dataset.loc[:, ~merged_dataset.columns.duplicated()]

    merged_dataset.to_csv("../data/final_dataset.csv", index=False)


if __name__ == "__main__":
    merge_and_save_dataset()