Spaces:

digitiamosrl
/

recsys-and-customer-segmentation

Runtime error

App Files Files Community

recsys-and-customer-segmentation / pages /clustering.py

tave-st

remove leadining decimals

7d2b157 about 2 years ago

raw

history blame

12.4 kB

	from collections import defaultdict
	import streamlit as st
	from utils import load_and_preprocess_data
	import pandas as pd
	import numpy as np
	import altair as alt
	from sklearn.mixture import GaussianMixture
	import plotly.express as px
	import itertools
	from typing import Dict, List


	# Markdown text rendered in the sidebar of this page by main().
	# lstrip() removes the leading whitespace/newline that follows the opening quotes.
	SIDEBAR_DESCRIPTION = """
	# Client clustering

	To cluster a client, we adopt the RFM metrics. They stand for:

	- R = recency, that is the number of days since the last purchase
	in the store
	- F = frequency, that is the number of times a customer has ordered something
	- M = monetary value, that is how much a customer has spent buying
	from your business.

	Given these 3 metrics, we can cluster the customers and find a suitable
	"definition" based on the clusters they belong to. Since the dataset
	we're using right now has about 5000 distinct customers, we identify
	3 clusters for each metric.

	## How we compute the clusters

	We resort to a GaussianMixture algorithm. We can think of GaussianMixture
	as generalized k-means clustering that incorporates information about
	the covariance structure of the data as well as the centers of the clusters.
	""".lstrip()

	# Markdown templates explaining the clusters of each RFM metric.
	#
	# Every template takes six positional ``str.format`` arguments: the
	# (min, max) range of cluster 1, then of cluster 2, then of cluster 3.
	# Clusters are numbered from 1 so that the text matches the labels produced
	# by ``_order_cluster`` (which start from 1) and the scores used by
	# ``categorize_user`` ("111" ... "333").
	FREQUENCY_CLUSTERS_EXPLAIN = """
	The frequency denotes how frequently a customer has ordered.

	There 3 available clusters for this metric:

	- cluster 1: denotes a customer that purchases one or few times (range [{}, {}])
	- cluster 2: these customer have a discrete amount of orders (range [{}, {}])
	- cluster 3: these customer purchases lots of times (range [{}, {}])

	-------
	""".lstrip()

	RECENCY_CLUSTERS_EXPLAIN = """
	The recency refers to how recently a customer has bought;

	There 3 available clusters for this metric:

	- cluster 1: the last order of these client is long time ago (range [{}, {}])
	- cluster 2: these are clients that purchases something not very recently (range [{}, {}])
	- cluster 3: the last order of these client is a few days/weeks ago (range [{}, {}])

	-------
	""".lstrip()

	MONETARY_CLUSTERS_EXPLAIN = """
	The revenue refers to how much a customer has spent buying
	from your business.

	There 3 available clusters for this metric:

	- cluster 1: these clients spent little money (range [{}, {}])
	- cluster 2: these clients spent a considerable amount of money (range [{}, {}])
	- cluster 3: these clients spent lots of money (range [{}, {}])

	-------
	""".lstrip()

	# Maps a cluster column name of the RFM dataframe to its explanation template.
	EXPLANATION_DICT = {
	    "Frequency_cluster": FREQUENCY_CLUSTERS_EXPLAIN,
	    "Recency_cluster": RECENCY_CLUSTERS_EXPLAIN,
	    "Revenue_cluster": MONETARY_CLUSTERS_EXPLAIN,
	}


	def create_features(df: pd.DataFrame):
	    """Build one row of RFM features per customer.

	    The returned dataframe has the columns ``CustomerID``, ``Frequency``
	    (number of distinct invoice dates), ``Revenue`` (sum of ``Price``) and
	    ``Recency`` (days between the customer's last purchase and the most
	    recent last purchase among all the customers).
	    """
	    # A single pass over the customers computes every per-client aggregate.
	    rfm = (
	        df.groupby("CustomerID")
	        .agg(
	            Frequency=("InvoiceDate", "nunique"),
	            Revenue=("Price", "sum"),
	            LastPurchaseDate=("InvoiceDate", "max"),
	        )
	        .reset_index()
	    )

	    # The last purchase date is only a stepping stone towards the recency.
	    last_purchase = rfm.pop("LastPurchaseDate")
	    rfm["Recency"] = (last_purchase.max() - last_purchase).dt.days

	    return rfm


	# ``st.cache`` is deprecated in favour of ``st.cache_data``. Use the newer
	# decorator when the installed Streamlit provides it and fall back to the
	# legacy one otherwise, so the page works on both old and new releases.
	_cache_clusters = getattr(st, "cache_data", None) or st.cache


	@_cache_clusters
	def cluster_clients(df: pd.DataFrame):
	    """Computes the RFM features and clusters for each user based on the RFM metrics.

	    Each metric is clustered on its own with a 3-component GaussianMixture
	    (fixed ``random_state`` for reproducible labels). The raw labels are then
	    renumbered by ``_order_cluster``: ascending for Revenue and Frequency,
	    descending for Recency.

	    Args:
	        df: dataframe accepted by ``create_features``.

	    Returns:
	        The RFM dataframe with the additional columns ``Revenue_cluster``,
	        ``Frequency_cluster`` and ``Recency_cluster``.
	    """
	    df_rfm = create_features(df)

	    metric_orders = (
	        ("Revenue", "ascending"),
	        ("Frequency", "ascending"),
	        ("Recency", "descending"),
	    )
	    for to_cluster, order in metric_orders:
	        mixture = GaussianMixture(n_components=3, random_state=42)
	        labels = mixture.fit_predict(df_rfm[[to_cluster]])
	        df_rfm[f"{to_cluster}_cluster"] = _order_cluster(mixture, labels, order)

	    return df_rfm


	def _order_cluster(cluster_model: GaussianMixture, clusters, order="ascending"):
	"""Orders the cluster by order."""
	centroids = cluster_model.means_.sum(axis=1)

	if order.lower() == "descending":
	centroids *= -1

	ascending_order = np.argsort(centroids)
	lookup_table = np.zeros_like(ascending_order)
	# Cluster will start from 1
	lookup_table[ascending_order] = np.arange(cluster_model.n_components) + 1
	return lookup_table[clusters]


	def show_purhcase_history(user: int, df: pd.DataFrame):
	    """Draw a line chart with the money spent by ``user`` on each invoice date."""
	    is_user = df.CustomerID == user
	    purchases = df.loc[is_user, ["Price", "InvoiceDate"]]

	    # One row per invoice date with the total spent on that date.
	    expenses = (
	        purchases.groupby("InvoiceDate")["Price"].sum().reset_index(name="Expenses")
	    )

	    date_axis = alt.X("InvoiceDate", timeUnit="yearmonthdate", title="Date")
	    chart = alt.Chart(expenses).mark_line(point=True)
	    chart = chart.encode(x=date_axis, y="Expenses")
	    chart = chart.properties(title="User expenses")

	    st.altair_chart(chart)


	def show_user_info(user: int, df_rfm: pd.DataFrame):
	    """Prints some information about the user.

	    The main information are the total expenses, how
	    many times he purchases in the store, and the clusters
	    he belongs to.

	    Args:
	        user: value looked up in the ``CustomerID`` column.
	        df_rfm: dataframe with the RFM features and the ``*_cluster`` columns.

	    Returns:
	        The tuple ``(recency_cluster, frequency_cluster, revenue_cluster)``
	        of the user, or ``(None, None, None)`` when no row matches ``user``.
	    """
	    user_row = df_rfm[df_rfm["CustomerID"] == user]
	    if user_row.empty:
	        st.write(f"No user with id {user}")
	        # Stop here: there is nothing to describe. A 3-tuple is still
	        # returned so that callers unpacking the result keep working.
	        return None, None, None

	    output = [
	        f"The user purchased {user_row['Frequency'].squeeze()} times.\n",
	        f"She/he spent {user_row['Revenue'].squeeze()} dollars in total.\n",
	        f"The last time she/he bought something was {user_row['Recency'].squeeze()} days ago.\n",
	        "She/he belongs to the clusters: ",
	    ]
	    # One bullet for every cluster column available in the dataframe.
	    output.extend(
	        f"- {cluster} = {user_row[cluster].squeeze()}"
	        for cluster in user_row.columns
	        if "_cluster" in cluster
	    )

	    st.write("\n".join(output))

	    return (
	        user_row["Recency_cluster"].squeeze(),
	        user_row["Frequency_cluster"].squeeze(),
	        user_row["Revenue_cluster"].squeeze(),
	    )


	def explain_cluster(cluster_info):
	    """Displays an expander explaining the meaning of the clusters.

	    Args:
	        cluster_info: mapping from a cluster column name (a key of
	            ``EXPLANATION_DICT``) to the values used to fill the placeholders
	            of the corresponding explanation template.
	    """
	    with st.expander("Show information about the clusters"):
	        # The trailing space after the first sentence keeps the two sentences
	        # apart once the literals are concatenated.
	        st.write(
	            "Note: these values are valid for these dataset. "
	            "Different dataset will have different number of clusters"
	            " and values"
	        )
	        for cluster, info in cluster_info.items():
	            st.write(EXPLANATION_DICT[cluster].format(*info))


	def categorize_user(recency_cluster, frequency_cluster, monetary_cluster):
	    """Write a short description of the customer given the three clusters.

	    The clusters are concatenated into a score (recency, frequency, monetary)
	    which is looked up among the exact matches first, then among the
	    prefix rules, and finally falls back to a generic description.
	    """
	    exact_descriptions = {
	        "111": "Tourist",
	        "133": "Former lover",
	        "123": "Former passionate client",
	        "113": "Spent a lot, but never come back",
	        "313": "Potential lover",
	        "312": "Interesting new client",
	        "311": "New customer",
	        "333": "Gold client",
	        "322": "Lovers",
	    }
	    # Rules applied on the first digit when there is no exact match.
	    prefix_descriptions = (
	        ("2", "Losing interest"),
	        ("1", "About to dump"),
	    )

	    score = f"{recency_cluster}{frequency_cluster}{monetary_cluster}"

	    description = exact_descriptions.get(score)
	    if description is None:
	        description = next(
	            (text for prefix, text in prefix_descriptions if score.startswith(prefix)),
	            "Average client",
	        )

	    st.write(f"The customer can be described as: {description}")


	def plot_rfm_distribution(df_rfm: pd.DataFrame, cluster_info: Dict[str, List[int]]):
	    """Plots 3 histograms for the RFM metrics.

	    A dotted vertical line marks the end (maximum value) of each cluster,
	    except for the cluster that reaches the largest value: its end is the
	    end of the data, so a line there would not separate anything.

	    Args:
	        df_rfm: dataframe with the ``Revenue``, ``Frequency`` and ``Recency``
	            columns.
	        cluster_info: for each ``<metric>_cluster`` key, the interleaved
	            ``[min, max, min, max, ...]`` values of the clusters, listed by
	            increasing cluster number.
	    """
	    for x in ("Revenue", "Frequency", "Recency"):
	        fig = px.histogram(df_rfm, x=x, log_y=True, title=f"{x} metric")

	        # Keep only the max of each cluster (the odd positions).
	        cluster_ends = cluster_info[f"{x}_cluster"][1::2]

	        # The cluster to skip is not always the last one: clusters ordered in
	        # descending fashion (e.g. Recency) have their largest values in
	        # cluster 1.
	        rightmost = max(
	            range(len(cluster_ends)),
	            key=cluster_ends.__getitem__,
	            default=None,
	        )

	        for position, cluster_end in enumerate(cluster_ends):
	            if position == rightmost:
	                continue
	            fig.add_vline(
	                x=cluster_end,
	                annotation_text=f"End of cluster {position + 1}",
	                line_dash="dot",
	                annotation=dict(textangle=90, font_color="red"),
	            )
	        st.plotly_chart(fig)


	def display_dataframe_heatmap(df_rfm: pd.DataFrame):
	    """Show, as a coloured table, how many clients fall in each cluster triple.

	    Rows are indexed by (Revenue_cluster, Frequency_cluster), columns by
	    Recency_cluster, and every cell holds the number of clients. The table
	    is rendered from the HTML produced by the pandas Styler.
	    """
	    cluster_columns = ["Recency_cluster", "Frequency_cluster", "Revenue_cluster"]

	    # Number of clients for each combination of clusters.
	    clients_per_group = df_rfm.groupby(cluster_columns)["CustomerID"].count()
	    count = clients_per_group.reset_index().rename(columns={"CustomerID": "Count"})
	    count = count.drop_duplicates(
	        ["Revenue_cluster", "Frequency_cluster", "Recency_cluster"]
	    )

	    # Reshape: one column per recency cluster, the counts as values.
	    heatmap = count.pivot(
	        index=["Revenue_cluster", "Frequency_cluster"],
	        columns="Recency_cluster",
	        values="Count",
	    )

	    # CSS rules for the cells, the index names and the headers.
	    table_styles = [
	        {"selector": "td", "props": "font-size:1.5em"},
	        {
	            "selector": ".index_name",
	            "props": "font-style: italic; color: Black; font-weight:normal;font-size:1.5em;",
	        },
	        {
	            "selector": "th:not(.index_name)",
	            "props": "background-color: White; color: black; font-size:1.5em",
	        },
	    ]

	    st.markdown("## Heatmap: how the client are distributed between clusters")

	    # The multi-index is lost when the dataframe is handed to streamlit
	    # directly, hence the detour through the HTML representation.
	    styler = heatmap.style.format(thousands=" ", precision=0, na_rep="0")
	    styler = styler.set_table_styles(table_styles)
	    styler = styler.background_gradient(cmap="coolwarm")
	    st.write(styler.to_html(), unsafe_allow_html=True)


	def main():
	    """Render the client clustering page, from the dataset to the explorer."""
	    st.sidebar.markdown(SIDEBAR_DESCRIPTION)

	    df, _, _ = load_and_preprocess_data()
	    df_rfm = cluster_clients(df)

	    dataset_intro = (
	        "# Dataset \n"
	        "This is the processed dataset with information about the clients,"
	        " such as the RFM values and the clusters they belong to."
	    )
	    st.markdown(dataset_intro)
	    st.dataframe(df_rfm.style.format(formatter={"Revenue": "{:.2f}"}))

	    # For every cluster column: [min, max, min, max, ...] of the metric,
	    # one (min, max) pair per cluster.
	    cluster_info_dict = defaultdict(list)

	    with st.expander("Show more details about the clusters"):
	        cluster_columns = [name for name in df_rfm.columns if "_cluster" in name]
	        for cluster in cluster_columns:
	            st.write(cluster)

	            # "Recency_cluster" -> "Recency": the metric behind the cluster.
	            metric = cluster.split("_")[0]
	            stats = df_rfm.groupby(cluster)[metric].describe()
	            stats = stats.reset_index(names="Cluster")

	            lows = stats["min"].astype(int)
	            highs = stats["max"].astype(int)
	            cluster_info_dict[cluster].extend(
	                itertools.chain.from_iterable(zip(lows, highs))
	            )

	            st.dataframe(stats)

	    st.markdown("## RFM metric distribution")
	    plot_rfm_distribution(df_rfm, cluster_info_dict)

	    display_dataframe_heatmap(df_rfm)

	    st.markdown("## Interactive exploration")

	    filter_by_cluster = st.checkbox(
	        "Filter client: only one client per cluster type",
	        value=True,
	    )

	    if filter_by_cluster:
	        # Keep the first client of every combination of clusters.
	        by_clusters = df_rfm.groupby(
	            ["Recency_cluster", "Frequency_cluster", "Revenue_cluster"]
	        )
	        client_to_select = by_clusters["CustomerID"].first().values
	    else:
	        client_to_select = df["CustomerID"].unique()

	    # The customer to investigate is picked by whoever is using the page.
	    user = st.selectbox(
	        "Select a customer to show more information about him.",
	        client_to_select,
	    )

	    show_purhcase_history(user, df)

	    recency, frequency, revenue = show_user_info(user, df_rfm)
	    categorize_user(recency, frequency, revenue)

	    explain_cluster(cluster_info_dict)


	# Script entry point: render the page whenever this file is executed.
	main()