tave-st committed on
Commit
29cc895
1 Parent(s): 9ca156e

change cluster vertical bar

Browse files
Files changed (1) hide show
  1. pages/clustering.py +7 -6
pages/clustering.py CHANGED
@@ -22,14 +22,14 @@ To cluster a client, we adopt the RFM metrics. They stand for:
22
 
23
  Given these 3 metrics, we can cluster the customers and find a suitable
24
  "definition" based on the clusters they belong to. Since the dataset
25
- we're using right now as about 5000 distinct customers, we identify
26
  3 clusters for each metric.
27
 
28
  ## How we compute the clusters
29
 
30
- We resort to a simple KMeans algorithm. It tries to find the clusters
31
- based on the distance between points. In particular, near points tend to be associated
32
- with the same cluster, while further points should belong to different clusters.
33
  """.lstrip()
34
 
35
  FREQUENCY_CLUSTERS_EXPLAIN = """
@@ -238,7 +238,8 @@ def plot_rfm_distribution(df_rfm: pd.DataFrame, cluster_info: Dict[str, List[int
238
  # Get the max value in the cluster info. The cluster info is a list of min - max
239
  # values per cluster.
240
  values = cluster_info[f"{x}_cluster"]
241
- for n_cluster, i in enumerate(range(1, len(values), 2)):
 
242
  fig.add_vline(
243
  x=values[i],
244
  annotation_text=f"End of cluster {n_cluster+1}",
@@ -299,7 +300,7 @@ def display_dataframe_heatmap(df_rfm: pd.DataFrame):
299
  # and then display it.
300
  st.markdown("## Heatmap: how the client are distributed between clusters")
301
  st.write(
302
- count.style.format(thousands=" ", precision=0, na_rep="Missing")
303
  .set_table_styles([cell_hover, index_names, headers])
304
  .background_gradient(cmap="coolwarm")
305
  .to_html(),
 
22
 
23
  Given these 3 metrics, we can cluster the customers and find a suitable
24
  "definition" based on the clusters they belong to. Since the dataset
25
+ we're using right now has about 5000 distinct customers, we identify
26
  3 clusters for each metric.
27
 
28
  ## How we compute the clusters
29
 
30
+ We resort to a GaussianMixture algorithm. We can think of GaussianMixture
31
+ as generalized k-means clustering that incorporates information about
32
+ the covariance structure of the data as well as the centers of the clusters.
33
  """.lstrip()
34
 
35
  FREQUENCY_CLUSTERS_EXPLAIN = """
 
238
  # Get the max value in the cluster info. The cluster info is a list of min - max
239
  # values per cluster.
240
  values = cluster_info[f"{x}_cluster"]
241
+ # Add vertical bar on each cluster end. But skip the last cluster.
242
+ for n_cluster, i in enumerate(range(1, len(values)-1, 2)):
243
  fig.add_vline(
244
  x=values[i],
245
  annotation_text=f"End of cluster {n_cluster+1}",
 
300
  # and then display it.
301
  st.markdown("## Heatmap: how the client are distributed between clusters")
302
  st.write(
303
+ count.style.format(thousands=" ", precision=0, na_rep="0")
304
  .set_table_styles([cell_hover, index_names, headers])
305
  .background_gradient(cmap="coolwarm")
306
  .to_html(),