|
import streamlit as st
|
|
import pandas as pd
|
|
|
|
|
|
# NOTE(review): `st.cache` is deprecated in recent Streamlit releases in favour
# of `st.cache_data` -- confirm against the pinned Streamlit version before
# switching, since the two differ in how the cached value is handed back.
@st.cache
def load_and_preprocess_data():
    """Load the online-retail CSV and derive the columns used downstream.

    Steps, in order:

    1. Read ``Data/OnlineRetail.csv`` using the ``latin-1`` encoding.
    2. Drop every row that has a missing value in any column.
    3. Keep only rows whose ``Quantity`` is strictly positive.
    4. Truncate ``InvoiceDate`` to day resolution and shift it forward by
       ten calendar years.
    5. Cast ``CustomerID`` to ``int``.
    6. Add ``Price`` as ``Quantity * UnitPrice``.
    7. Add ``CustomerIndex`` / ``ProductIndex``: the position of each row's
       ``CustomerID`` / ``StockCode`` within the sorted distinct values of
       that column.

    Returns:
        A 3-tuple ``(df, user_idx, product_idx)`` where ``df`` is the
        preprocessed frame and the two index Series are the same values that
        were stored in ``df["CustomerIndex"]`` and ``df["ProductIndex"]``.
    """
    raw = pd.read_csv("Data/OnlineRetail.csv", encoding="latin-1")

    # Remove incomplete rows first, then rows with non-positive quantities.
    complete = raw.dropna()
    # Explicit copy: the columns assigned below must land on an independent
    # frame, not on a filtered view-or-copy of `complete` (which would raise
    # SettingWithCopyWarning under classic pandas semantics).
    df = complete[complete["Quantity"] > 0].copy()

    # Day-resolution timestamps, moved ten years forward.
    invoice_day = pd.to_datetime(df["InvoiceDate"]).dt.floor("d")
    df["InvoiceDate"] = invoice_day + pd.offsets.DateOffset(years=10)

    df["CustomerID"] = df["CustomerID"].astype(int)
    df["Price"] = df["Quantity"] * df["UnitPrice"]

    # Ordered categorical dtypes built from the sorted distinct ids; the
    # category codes are then each row's rank within that sorted list.
    user_cat = pd.CategoricalDtype(
        categories=sorted(df["CustomerID"].unique()),
        ordered=True,
    )
    product_cat = pd.CategoricalDtype(
        categories=sorted(df["StockCode"].unique()),
        ordered=True,
    )

    user_idx = df["CustomerID"].astype(user_cat).cat.codes
    product_idx = df["StockCode"].astype(product_cat).cat.codes

    df["CustomerIndex"] = user_idx
    df["ProductIndex"] = product_idx

    return df, user_idx, product_idx
|
|
|