|
import streamlit as st |
|
import pandas as pd |
|
|
|
|
|
# NOTE(review): newer Streamlit releases deprecate ``st.cache`` in favour of
# ``st.cache_data`` -- confirm the installed version before migrating.
@st.cache
def load_and_preprocess_data():
    """Load the retail CSV, clean it, and attach integer user/product codes.

    Steps, in order:

    1. Read ``Data/OnlineRetail.csv`` using the latin-1 encoding.
    2. Drop every row that contains a missing value.
    3. Keep only rows whose ``Quantity`` is strictly positive.
    4. Truncate ``InvoiceDate`` to day resolution and shift it ten years
       forward.
    5. Cast ``CustomerID`` to ``int`` and add ``Price`` as
       ``Quantity * UnitPrice``.
    6. Encode ``CustomerID`` and ``StockCode`` as integer codes whose
       numbering follows the sorted order of each column's unique values,
       stored as ``CustomerIndex`` and ``ProductIndex``.

    Returns:
        A ``(df, user_idx, product_idx)`` tuple: the processed frame plus the
        two code series that were also written into ``CustomerIndex`` and
        ``ProductIndex``.
    """
    frame = pd.read_csv("Data/OnlineRetail.csv", encoding="latin-1")

    # The same local name is rebound on purpose at every filtering step so
    # that no intermediate parent frame is kept alive alongside the result.
    frame = frame.dropna()
    has_positive_quantity = frame["Quantity"] > 0
    frame = frame[has_positive_quantity]

    # Day-level timestamps, moved forward by ten calendar years.
    invoice_day = pd.to_datetime(frame["InvoiceDate"]).dt.floor("d")
    ten_years = pd.offsets.DateOffset(years=10)
    frame["InvoiceDate"] = invoice_day + ten_years

    frame["CustomerID"] = frame["CustomerID"].astype(int)
    frame["Price"] = frame["Quantity"] * frame["UnitPrice"]

    # Ordered categoricals built from the sorted unique values give each
    # customer / product a stable integer code (0 .. n_unique - 1).
    customer_dtype = pd.CategoricalDtype(
        categories=sorted(frame["CustomerID"].unique()),
        ordered=True,
    )
    customer_codes = frame["CustomerID"].astype(customer_dtype).cat.codes

    stock_dtype = pd.CategoricalDtype(
        categories=sorted(frame["StockCode"].unique()),
        ordered=True,
    )
    stock_codes = frame["StockCode"].astype(stock_dtype).cat.codes

    frame["CustomerIndex"] = customer_codes
    frame["ProductIndex"] = stock_codes

    return frame, customer_codes, stock_codes
|
|