# Overview data

**Note: In this notebook, I assume the dataset is cleaned and ignore EDA.**

In [1]:
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings("ignore")

In [2]:
path = "./dataset"

# user dataset
user_df = pd.read_csv(f"{path}/users.csv", delimiter=';', encoding='ISO-8859-1')
user_df.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [3]:
# book dataset
book_df = pd.read_csv(f"{path}/books.csv", delimiter=';', encoding='ISO-8859-1', error_bad_lines=False)
book_df.head()

b'Skipping line 6452: expected 8 fields, saw 9\nSkipping line 43667: expected 8 fields, saw 10\nSkipping line 51751: expected 8 fields, saw 9\n'
b'Skipping line 92038: expected 8 fields, saw 9\nSkipping line 104319: expected 8 fields, saw 9\nSkipping line 121768: expected 8 fields, saw 9\n'
b'Skipping line 144058: expected 8 fields, saw 9\nSkipping line 150789: expected 8 fields, saw 9\nSkipping line 157128: expected 8 fields, saw 9\nSkipping line 180189: expected 8 fields, saw 9\nSkipping line 185738: expected 8 fields, saw 9\n'
b'Skipping line 209388: expected 8 fields, saw 9\nSkipping line 220626: expected 8 fields, saw 9\nSkipping line 227933: expected 8 fields, saw 11\nSkipping line 228957: expected 8 fields, saw 10\nSkipping line 245933: expected 8 fields, saw 9\nSkipping line 251296: expected 8 fields, saw 9\nSkipping line 259941: expected 8 fields, saw 9\nSkipping line 261529: expected 8 fields, saw 9\n'


Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [4]:
# rating dataset
rating_df = pd.read_csv(f"{path}/ratings.csv", delimiter=';', encoding='ISO-8859-1')
rating_df.head(10)

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6
5,276733,2080674722,0
6,276736,3257224281,8
7,276737,0600570967,6
8,276744,038550120X,7
9,276745,342310538,10


In [5]:
rating_df.columns

Index(['User-ID', 'ISBN', 'Book-Rating'], dtype='object')

In [6]:
function = {
    "Book-Rating": "mean",
    "User-ID": "count"
}

summary_rating = rating_df.groupby("ISBN").agg(function, axis=0)
summary_rating = summary_rating.rename(columns={"Book-Rating": "Mean-Rating", "User-ID": "Num-Rating"})
summary_rating.head()

Unnamed: 0_level_0,Mean-Rating,Num-Rating
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1
330299891,3.0,2
375404120,1.5,2
586045007,0.0,1
9022906116,3.5,2
9032803328,0.0,1


**Note:** In this repo, I only consider `book_df` and `rating_df`.

In [7]:
df = book_df.merge(summary_rating, how="left", left_on="ISBN", right_on="ISBN")
df.drop(columns=["Image-URL-S", "Image-URL-M", "Image-URL-L"], inplace=True)
df.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Mean-Rating,Num-Rating
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,0.0,1.0
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,4.928571,14.0
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,5.0,3.0
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,4.272727,11.0
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,0.0,1.0


In [9]:
df.isnull().sum()

ISBN                      0
Book-Title                0
Book-Author               1
Year-Of-Publication       0
Publisher                 2
Mean-Rating            1209
Num-Rating             1209
dtype: int64

In [10]:
# Save
df.to_csv(f"{path}/summary_book.csv", index=False)