Spaces:

jaothan
/

reco

No application file

App Files Files Community

jaothan commited on 29 days ago

Commit

5bd5343

verified ·

1 Parent(s): 6324f20

Upload 5 files

Browse files

Files changed (5) hide show

recommender.py +126 -0
recommender_system.py +372 -0
requirements.txt +12 -0
requirements_freezed.txt +68 -0
utils.py +47 -0

recommender.py ADDED Viewed

	@@ -0,0 +1,126 @@

+from implicit.als import AlternatingLeastSquares
+from implicit.lmf import LogisticMatrixFactorization
+from implicit.bpr import BayesianPersonalizedRanking
+from implicit.nearest_neighbours import bm25_weight
+from scipy.sparse import csr_matrix
+from typing import Dict, Any
# Registry mapping the short model name accepted by `_get_model` to the
# `implicit` model class that gets instantiated for it.
MODEL = {
    "lmf": LogisticMatrixFactorization,
    "als": AlternatingLeastSquares,
    "bpr": BayesianPersonalizedRanking,
}
+def _get_sparse_matrix(values, user_idx, product_idx):
+    return csr_matrix(
+        (values, (user_idx, product_idx)),
+        shape=(len(user_idx.unique()), len(product_idx.unique())),
+    )
def _get_model(name: str, **params):
    """Instantiate the model registered in ``MODEL`` under ``name``.

    Args:
        name: key of the model class in the ``MODEL`` registry.
        **params: keyword arguments forwarded to the model constructor.

    Returns:
        A new, unfitted model instance.

    Raises:
        ValueError: if ``name`` is not a key of ``MODEL``.
    """
    if name not in MODEL:
        raise ValueError(f"No model with name {name}")
    model_cls = MODEL[name]
    return model_cls(**params)
class InternalStatusError(Exception):
    """Signals that an object was used while in an unsuitable internal state.

    In this module it is raised when recommendations are requested before
    the model has been fitted.
    """
class Recommender:
    """Wrapper that builds an interaction matrix and fits an ``implicit`` model.

    The instance keeps the user x product sparse matrix, the index vectors it
    was built from, and (after ``create_and_fit``) the fitted model.
    """

    def __init__(
        self,
        values,
        user_idx,
        product_idx,
    ):
        """Store the indices and build the user x product sparse matrix.

        Args:
            values: interaction strength of each observation.
            user_idx: row (user) index of each observation.
            product_idx: column (product) index of each observation.
        """
        self.user_product_matrix = _get_sparse_matrix(values, user_idx, product_idx)
        self.user_idx = user_idx
        self.product_idx = product_idx
        # These attributes are set during the training phase
        self.model = None
        self.fitted = False

    def _ensure_fitted(self, message: str):
        """Raise ``InternalStatusError(message)`` unless a model was fitted."""
        if not self.fitted:
            raise InternalStatusError(message)

    def create_and_fit(
        self,
        model_name: str,
        weight_strategy: str = "bm25",
        model_params: "Dict[str, Any] | None" = None,
    ):
        """Create the model called ``model_name`` and fit it on the matrix.

        Args:
            model_name: key of the model in the ``MODEL`` registry.
            weight_strategy: how the raw matrix is re-weighted before fitting;
                one of ``"bm25"``, ``"balanced"`` or ``"same"``
                (case-insensitive).
            model_params: keyword arguments for the model constructor;
                ``None`` means no extra arguments.

        Returns:
            ``self``, so that calls can be chained.

        Raises:
            ValueError: if the weight strategy or the model name is unknown,
                or if ``"balanced"`` is requested on a matrix whose entries
                sum to zero.
        """
        weight_strategy = weight_strategy.lower()
        if weight_strategy == "bm25":
            data = bm25_weight(
                self.user_product_matrix,
                K1=1.2,
                B=0.75,
            )
        elif weight_strategy == "balanced":
            # Balance the positive and negative (nan) entries
            # http://stanford.edu/~rezab/nips2014workshop/submits/logmat.pdf
            n_rows, n_cols = self.user_product_matrix.shape
            total_size = n_rows * n_cols
            total_interactions = self.user_product_matrix.sum()
            if total_interactions == 0:
                # The scaling factor below would be a division by zero.
                raise ValueError(
                    "Cannot balance a matrix whose interactions sum to zero"
                )
            num_zeros = total_size - self.user_product_matrix.count_nonzero()
            data = self.user_product_matrix.multiply(num_zeros / total_interactions)
        elif weight_strategy == "same":
            data = self.user_product_matrix
        else:
            raise ValueError("Weight strategy not supported")

        model = _get_model(model_name, **(model_params or {}))
        model.fit(data)
        # Publish the model only once fitting succeeded, so that a failed
        # fit never leaves the instance flagged as fitted.
        self.model = model
        self.fitted = True
        return self

    def recommend_products(
        self,
        user_id,
        items_to_recommend=5,
    ):
        """Finds the recommended items for the user.

        Items the user already interacted with are filtered out.

        Args:
            user_id: row index of the user in the interaction matrix.
            items_to_recommend: number of items to return.

        Returns:
            The result of the model's ``recommend`` call: an
            (items, scores) pair.

        Raises:
            InternalStatusError: if the model has not been fitted yet.
        """
        self._ensure_fitted(
            "Cannot recommend products without previously fitting the model."
            " Please, consider fitting the model before recommening products."
        )
        return self.model.recommend(
            user_id,
            self.user_product_matrix[user_id],
            filter_already_liked_items=True,
            N=items_to_recommend,
        )

    def explain_recommendation(
        self,
        user_id,
        suggested_item_id,
        recommended_items,
    ):
        """Return the per-item score contributions behind a suggestion.

        Args:
            user_id: row index of the user in the interaction matrix.
            suggested_item_id: index of the suggested item to explain.
            recommended_items: forwarded as ``N`` to the model's ``explain``.

        Returns:
            The second element of the model's ``explain`` result.

        Raises:
            InternalStatusError: if the model has not been fitted yet.
        """
        self._ensure_fitted(
            "Cannot explain a recommendation without previously fitting the model."
        )
        _, items_score_contrib, _ = self.model.explain(
            user_id,
            self.user_product_matrix,
            suggested_item_id,
            N=recommended_items,
        )
        return items_score_contrib

    def similar_users(self, user_id):
        """Return the model's ``similar_users`` result for ``user_id``.

        Raises:
            InternalStatusError: if the model has not been fitted yet.
        """
        self._ensure_fitted(
            "Cannot find similar users without previously fitting the model."
        )
        return self.model.similar_users(user_id)

    @property
    def item_factors(self):
        """The ``item_factors`` attribute of the underlying model."""
        return self.model.item_factors

recommender_system.py ADDED Viewed

	@@ -0,0 +1,372 @@

+import streamlit as st
+import pandas as pd
+import altair as alt
+from recommender import Recommender
+from sklearn.decomposition import PCA
+from sklearn.manifold import TSNE
+from os import cpu_count
+import numpy as np
+import time
+from utils import load_and_preprocess_data
+import matplotlib.pyplot as plt
+from typing import Union, List, Dict, Any
+import plotly.graph_objects as go
# Columns dropped from the dataframes before they are shown in the app
# (used by `display_user_char` and `main`).
COLUMN_NOT_DISPLAY = [
    "StockCode",
    "UnitPrice",
    "Country",
    "CustomerIndex",
    "ProductIndex",
]
# Markdown text rendered in the sidebar by `main`; leading whitespace is
# stripped so the text starts directly with the first heading.
SIDEBAR_DESCRIPTION = """
# Recommender system
## What is it?
A recommender system is a tool that suggests something new to a particular
user that she/he might be interested in. It becomes useful when
the number of items a user can choose from is high.
## How does it work?
A recommender system internally finds similar users and similar items,
based on a suitable definition of "similarity".
For example, users that purchased the same items can be considered similar.
When we want to suggest new items to a user, a recommender system exploits
the items bought by similar users as a starting point for the suggestion.
The items bought by similar users are compared to the items that the user
already bought. If they are new and similar, the model suggests them.
## How we prepare the data
For each user, we compute the quantity purchased for every single item.
This will be the metric the value considered by the model to compute
the similarity. The item that a user has never bought will
be left at zero. These zeros will be the subject of the recommendation.
""".lstrip()
@st.cache(allow_output_mutation=True)
def create_and_fit_recommender(
    model_name: str,
    values: Union[pd.DataFrame, "np.ndarray"],
    users: Union[pd.DataFrame, "np.ndarray"],
    products: Union[pd.DataFrame, "np.ndarray"],
) -> Recommender:
    """Build a ``Recommender`` from the interactions and fit it.

    The result is cached by Streamlit, so the model is not refitted on
    every rerun of the script.

    Args:
        model_name: name of the model to create and fit.
        values: interaction strength of each observation.
        users: user index of each observation.
        products: product index of each observation.

    Returns:
        The fitted recommender.
    """
    # Fine-tuned values
    tuned_params = {
        "factors": 190,
        "alpha": 0.6,
        "regularization": 0.06,
        "random_state": 42,
    }
    fitted_recommender = Recommender(values, users, products)
    fitted_recommender.create_and_fit(model_name, model_params=tuned_params)
    return fitted_recommender
def explain_recommendation(
    recommender: Recommender,
    user_id: int,
    suggestions: List[int],
    df: pd.DataFrame,
):
    """Write, inside an expander, why each suggestion was recommended.

    For every suggested product the recommender is asked for the items that
    contributed to the suggestion; their descriptions are looked up in ``df``
    and rendered as a bullet list under the suggested product's description.

    Args:
        recommender: fitted recommender used to explain the suggestions.
        user_id: index of the user the suggestions were computed for.
        suggestions: product indices suggested to the user.
        df: dataframe with ``ProductIndex`` and ``Description`` columns.
    """
    lookup_columns = ["Description", "ProductIndex"]
    how_many = len(suggestions)
    lines = []
    for suggestion in suggestions:
        contributing = recommender.explain_recommendation(
            user_id, suggestion, how_many
        )
        # The first element of each entry is the product index.
        contributing_ids = [entry[0] for entry in contributing]

        suggested_rows = df.loc[df.ProductIndex == suggestion][lookup_columns]
        suggested_description = suggested_rows.drop_duplicates(
            subset=["ProductIndex"]
        )["Description"].unique()[0]

        contributing_rows = df.loc[df["ProductIndex"].isin(contributing_ids)][
            lookup_columns
        ]
        contributing_descriptions = contributing_rows.drop_duplicates(
            subset=["ProductIndex"]
        )["Description"].unique()

        lines.append(
            f"The item **{suggested_description.strip()}** "
            "has been suggested because it is similar to the following products"
            " bought by the user:"
        )
        lines.extend(f"- {text.strip()}" for text in contributing_descriptions)

    with st.expander("See why the model recommended these products"):
        st.write("\n".join(lines))
    st.write("------")
def print_suggestions(suggestions: List[int], df: pd.DataFrame):
    """Write the descriptions of the suggested products as a bullet list.

    Args:
        suggestions: product indices suggested by the model.
        df: dataframe with ``ProductIndex`` and ``Description`` columns.
    """
    suggested_rows = df.loc[df["ProductIndex"].isin(suggestions)][
        ["Description", "ProductIndex"]
    ]
    descriptions = suggested_rows.drop_duplicates(subset=["ProductIndex"])[
        "Description"
    ].unique()
    bullets = [f"- {text.strip()}" for text in descriptions]
    st.write("\n".join(["The model suggests the following products:", *bullets]))
def display_user_char(user: int, data: pd.DataFrame):
    """Show how many distinct products a user bought and the purchase history.

    Args:
        user: value of ``CustomerIndex`` identifying the user.
        data: dataframe of purchases; must contain ``CustomerIndex``,
            ``Description``, ``InvoiceDate``, ``CustomerID`` and the columns
            listed in ``COLUMN_NOT_DISPLAY``.
    """
    purchases = data[data.CustomerIndex == user]
    distinct_products = purchases["Description"].nunique()
    st.write(
        f"The user {user} bought {distinct_products} distinct products. "
        "Here is the purchase history: "
    )
    # The customer column is hidden too, since the table is already
    # restricted to a single customer.
    hidden_columns = COLUMN_NOT_DISPLAY + ["CustomerID"]
    history = purchases.sort_values("InvoiceDate").drop(hidden_columns, axis=1)
    st.dataframe(history)
    st.write("-----")
+def _extract_description(df, products):
+    desc = df[df["ProductIndex"].isin(products)].drop_duplicates(
+        "ProductIndex", ignore_index=True
+    )[["ProductIndex", "Description"]]
+    return desc.set_index("ProductIndex")
def display_recommendation_plots(
    user_id: int,
    suggestions: List[int],
    df: pd.DataFrame,
    model: Recommender,
):
    """Plot a t-SNE with the suggested items, together with the purchases of
    similar users.

    Three groups of item factors are decomposed together and drawn as
    separate traces: the suggested items, the items that contribute to the
    suggestions, and the items bought by similar users.

    Args:
        user_id: index of the user the suggestions were computed for.
        suggestions: product indices suggested to the user.
        df: dataframe with ``ProductIndex`` and ``Description`` columns, used
            for the hover text.
        model: fitted recommender.

    Returns:
        The plotly figure holding the scatter plot.
    """
    # Get the purchased items that contribute the most to the suggestions
    contributions = []
    n_recommended = len(suggestions)
    for suggestion in suggestions:
        items_and_score = model.explain_recommendation(
            user_id, suggestion, n_recommended
        )
        contributions.append([t[0] for t in items_and_score])
    contributions = np.unique(np.concatenate(contributions))
    print("Contribution computed")
    print(contributions)
    print("=" * 80)

    # Find the purchases of similar users
    bought_by_similar_users = []
    sim_users, _ = model.similar_users(user_id)
    for u in sim_users:
        _, sim_purchases = model.user_product_matrix[u].nonzero()
        bought_by_similar_users.append(sim_purchases)
    bought_by_similar_users = np.unique(np.concatenate(bought_by_similar_users))
    print("Similar bought computed")
    print(bought_by_similar_users)
    print("=" * 80)

    # Compute the t-sne
    # Concatenate all the vectors to compute the decomposition a single time
    to_decompose = np.concatenate(
        (
            model.item_factors[suggestions],
            model.item_factors[contributions],
            model.item_factors[bought_by_similar_users],
        )
    )
    print(f"Shape to decompose: {to_decompose.shape}")
    with st.spinner("Computing plots (this might take around 60 seconds)..."):
        started_at = time.time()
        decomposed = _tsne_decomposition(
            to_decompose,
            dict(
                perplexity=30,
                metric="euclidean",
                n_iter=1_000,
                random_state=42,
            ),
        )
    elapsed = time.time() - started_at
    print(f"TSNE computed in {elapsed}")
    print("=" * 80)

    # Extract the decomposed vectors, in the same order they were stacked.
    # Explicit offsets are used for the last group: a negative-length slice
    # (``[-n:]``) would return the whole array when ``n`` is zero.
    contributions_start = len(suggestions)
    others_start = contributions_start + len(contributions)
    suggestion_dec = decomposed[:contributions_start, :]
    contribution_dec = decomposed[contributions_start:others_start, :]
    items_others_dec = decomposed[others_start:, :]

    # Also, extract the description to create a nice hover in
    # the final plot.
    contribution_description = _extract_description(df, contributions)
    items_other_description = _extract_description(df, bought_by_similar_users)
    suggestion_description = _extract_description(df, suggestions)

    # Plot the scatterplot
    fig = go.Figure()
    fig.add_trace(
        go.Scatter(
            x=contribution_dec[:, 0],
            y=contribution_dec[:, 1],
            mode="markers",
            opacity=0.8,
            name="Similar bought by user",
            marker_symbol="square-open",
            marker_color="#010CFA",
            marker_size=10,
            hovertext=contribution_description.loc[contributions].values.squeeze(),
        )
    )
    fig.add_trace(
        go.Scatter(
            x=items_others_dec[:, 0],
            y=items_others_dec[:, 1],
            mode="markers",
            name="Product bought by similar users",
            opacity=0.7,
            marker_symbol="circle-open",
            marker_color="#FA5F19",
            marker_size=10,
            hovertext=items_other_description.loc[
                bought_by_similar_users
            ].values.squeeze(),
        )
    )
    fig.add_trace(
        go.Scatter(
            x=suggestion_dec[:, 0],
            y=suggestion_dec[:, 1],
            mode="markers",
            name="Suggested",
            marker_color="#1A9626",
            marker_symbol="star",
            marker_size=10,
            hovertext=suggestion_description.loc[suggestions].values.squeeze(),
        )
    )
    fig.update_xaxes(visible=False)
    fig.update_yaxes(visible=False)
    fig.update_layout(plot_bgcolor="white")
    return fig
def _tsne_decomposition(data: np.ndarray, tsne_args: Dict[str, Any]):
    """Project ``data`` to two dimensions with t-SNE.

    When the input has more than 50 features it is first reduced with PCA,
    so that t-SNE runs on at most 50 dimensions.

    Args:
        data: 2-D array with one row per sample.
        tsne_args: extra keyword arguments forwarded to ``TSNE``.

    Returns:
        The 2-D embedding, one row per input sample.
    """
    n_samples, n_features = data.shape
    if n_features > 50:
        print("Performing PCA...")
        # PCA cannot extract more components than there are samples, so the
        # target of 50 is capped for small inputs instead of raising.
        n_components = min(50, n_samples)
        data = PCA(n_components=n_components).fit_transform(data)
    return TSNE(
        n_components=2,
        n_jobs=cpu_count(),
        **tsne_args,
    ).fit_transform(data)
def main():
    """Render the Streamlit page: dataset, sidebar and interactive suggestions."""
    # Load and process data
    data, users, products = load_and_preprocess_data()
    recommender = create_and_fit_recommender("als", data["Quantity"], users, products)

    st.markdown(
        "# Recommender system\n"
        "The dataset used for these computations is the following:\n"
        "        "
    )
    st.sidebar.markdown(SIDEBAR_DESCRIPTION)

    # Convert to int just to display the column without trailing decimals.
    # @note: I know I can use the "format" function of pandas, but I found out
    #   it is super slow when formatting large tables.
    to_display = data.drop(COLUMN_NOT_DISPLAY, axis=1).assign(
        Price=lambda frame: frame["Price"].astype(int)
    )
    # Show the data
    st.dataframe(to_display)

    st.markdown("## Interactive suggestion")
    with st.form("recommend"):
        # Let the user select the user to investigate
        user = st.selectbox(
            "Select a customer to get his recommendations",
            users.unique(),
        )
        items_to_recommend = st.slider("How many items to recommend?", 1, 10, 5)
        print(items_to_recommend)
        if st.form_submit_button("Recommend!"):
            display_user_char(user, data)
            # The first element of the result holds the suggested products.
            suggested_products = recommender.recommend_products(
                user, items_to_recommend
            )[0]
            print_suggestions(suggested_products, data)
            explain_recommendation(recommender, user, suggested_products, data)
            st.markdown(
                "## How the purchases of similar users influnce the recommendation"
            )
            st.plotly_chart(
                display_recommendation_plots(
                    user, suggested_products, data, recommender
                )
            )
main()

requirements.txt ADDED Viewed

	@@ -0,0 +1,12 @@

+pandas
+scikit-learn
+implicit
+scipy
+tqdm
+numpy
+matplotlib
+seaborn
+mlxtend
+plotly==5.9.0
+black
+altair<5

requirements_freezed.txt ADDED Viewed

	@@ -0,0 +1,68 @@

+altair==4.2.0
+attrs==22.1.0
+black==22.10.0
+blinker==1.5
+cachetools==5.2.0
+certifi==2022.9.24
+charset-normalizer==2.1.1
+click==8.1.3
+commonmark==0.9.1
+contourpy==1.0.5
+cycler==0.11.0
+decorator==5.1.1
+entrypoints==0.4
+fonttools==4.37.4
+gitdb==4.0.9
+GitPython==3.1.29
+idna==3.4
+implicit==0.6.1
+importlib-metadata==5.0.0
+Jinja2==3.1.2
+joblib==1.2.0
+jsonschema==4.16.0
+kiwisolver==1.4.4
+MarkupSafe==2.1.1
+matplotlib==3.6.0
+mlxtend==0.21.0
+mypy-extensions==0.4.3
+numpy==1.23.4
+packaging==21.3
+pandas==1.5.0
+pathspec==0.10.1
+Pillow==9.2.0
+platformdirs==2.5.2
+plotly==5.9.0
+protobuf==3.20.3
+pyarrow==9.0.0
+pydeck==0.8.0b4
+Pygments==2.13.0
+Pympler==1.0.1
+pyparsing==3.0.9
+pyrsistent==0.18.1
+python-dateutil==2.8.2
+pytz==2022.5
+pytz-deprecation-shim==0.1.0.post0
+requests==2.28.1
+rich==12.6.0
+scikit-learn==1.1.2
+scipy==1.9.2
+seaborn==0.12.1
+semver==2.13.0
+six==1.16.0
+sklearn==0.0
+smmap==5.0.0
+streamlit==1.13.0
+tenacity==8.1.0
+threadpoolctl==3.1.0
+toml==0.10.2
+tomli==2.0.1
+toolz==0.12.0
+tornado==6.2
+tqdm==4.64.1
+typing_extensions==4.4.0
+tzdata==2022.5
+tzlocal==4.2
+urllib3==1.26.12
+validators==0.20.0
+watchdog==2.1.9
+zipp==3.9.0

utils.py ADDED Viewed

	@@ -0,0 +1,47 @@

+import streamlit as st
+import pandas as pd
@st.cache
def load_and_preprocess_data(path: str = "Data/OnlineRetail.csv"):
    """Load the retail dataset and derive the columns used by the recommender.

    Rows with missing values or non-positive quantities are discarded. The
    columns ``Price``, ``CustomerIndex`` and ``ProductIndex`` are added.

    Args:
        path: location of the CSV file; it is read with latin-1 encoding and
            must provide the ``Quantity``, ``InvoiceDate``, ``CustomerID``,
            ``UnitPrice`` and ``StockCode`` columns.

    Returns:
        A ``(df, user_idx, product_idx)`` tuple: the processed dataframe and
        the integer category codes of the customer and of the product of
        each row.
    """
    df = pd.read_csv(
        path,
        encoding="latin-1",
    )
    # Remove nans values
    df = df.dropna()
    # Use only positive quantities. This is not a robust approach,
    # but it is good enough to keep things simple.
    # The explicit copy makes the column assignments below operate on an
    # independent frame rather than on a filtered selection.
    df = df[df["Quantity"] > 0].copy()
    # Parse the date column and add 10 years, just for better visualization
    df["InvoiceDate"] = pd.to_datetime(df["InvoiceDate"]).dt.floor(
        "d"
    ) + pd.offsets.DateOffset(years=10)
    # Change customer id to int
    df["CustomerID"] = df["CustomerID"].astype(int)
    # Add price column
    df["Price"] = df["Quantity"] * df["UnitPrice"]
    # Get unique entries in the dataset of users and products
    users = df["CustomerID"].unique()
    products = df["StockCode"].unique()
    # Create a categorical type for users and products. Categories are sorted
    # and ordered to ensure reproducibility of the codes.
    user_cat = pd.CategoricalDtype(categories=sorted(users), ordered=True)
    product_cat = pd.CategoricalDtype(categories=sorted(products), ordered=True)
    # Transform and get the indexes of the columns
    user_idx = df["CustomerID"].astype(user_cat).cat.codes
    product_idx = df["StockCode"].astype(product_cat).cat.codes
    # Add the categorical index to the starting dataframe
    df["CustomerIndex"] = user_idx
    df["ProductIndex"] = product_idx
    return df, user_idx, product_idx