# Spaces: Running Running  (page-header residue from the hosting web UI; not part of the app)
import json | |
import os | |
import streamlit as st | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
import plotly.express as px | |
from utils import load_data_csv, load_data_pickle, load_model_pickle, load_numpy | |
from st_pages import add_indentation | |
# Page setup and introduction.
# The word-cloud helper (built on the `wordcloud` package), an earlier
# `st.set_page_config` call and the `add_indentation()` call are switched off.
st.set_page_config(layout="wide")

# --- Static texts shown in the introduction --------------------------------
_WHAT_IS_TOPIC_MODELING = """
Topic modeling is a text-mining technique used to **identify topics within a collection of documents**.
It is a useful tool for organizing and summarizing vast amounts of textual data as well as automate the discovery of hidden thematic structures in a corpus of text data, without any prior knowledge.
"""

_COMMON_APPLICATIONS = """Common applications of Topic Modeling include:
- **Search Engine Optimization (SEO): π** Determine the main topics/keywords present on a website to optimize content and improve search engine rankings.
- **Customer Support** βοΈ: Analyze customer support tickets, emails, and chat transcripts to identify common questions and complaints.
- **Fraud Detection and Risk Management: π¦** : Detect fraudulent activities, compliance violations, and operational risks by analyzing textual data such as transaction descriptions, audit reports and regulatory filings.
- **Market Research π**: Gain competitive intelligence and make informed decisions regarding product development, marketing strategies, and market positioning by analyzing research reports and industry news.
"""

_USE_CASE_SUMMARY = """In this use case, we will use a topic model to categorize around 20 000 e-commerce products using text descriptions and identify
the main types of products solds."""

## Start of Streamlit app
st.title("Topic Modeling π")
st.markdown("### What is Topic Modeling ?")
st.info(_WHAT_IS_TOPIC_MODELING)
st.markdown(" ")

# Three columns are created; only the middle one is filled so the image is
# framed by empty space on both sides.
with st.columns([0.25, 0.4, 0.35])[1]:
    st.image(
        "images/topic_modeling.gif",
        caption="An example of Topic Modeling",
        use_column_width=True,
    )

st.markdown(_COMMON_APPLICATIONS)
st.markdown(" ")
st.divider()

# --- Use case header --------------------------------------------------------
st.markdown("# Topic modeling on product descriptions ποΈ")
st.info(_USE_CASE_SUMMARY)

with st.columns([0.2, 0.6, 0.2])[1]:
    st.image("images/e-commerce.jpg")

st.markdown(" ")
# Location of every artefact used by this page (pickles, JSON, numpy arrays).
# The raw CSV ("data-topicmodeling.csv") is no longer loaded here.
path_data = "data/topic-modeling"

# Topic table, read later by the results section.
topic_info = load_data_pickle(path_data, 'topic_info.pkl')

_ABOUT_DATA = """You were provided a dataset with around 20 000 products from a large e-commerce retailer. <br>
This dataset contains the products' title and description on the website."""

_RAW_DESCRIPTION_NOTE = """**Note**: Some of the descriptions featured below are shown in their 'raw' form, meaning they contain unprocessed html code and special characters.
These descriptions were first 'cleaned' (by removing unwanted characters) before being used in the model."""

_ABOUT_MODEL = """**Topic models** can be seen as unsupervised clustering models where text documents are grouped into topics/clusters based on their similarities.
We will use here a topic model to automatically categorize/group the retailer's products based on their description,
as well as understand what are the most common type of products being sold."""

_TOP_WORDS_NOTE = """**Note**: In topic modeling, the final topics are represented by the model using 'top words'.
A topic's top words are chosen based on how much they appear in the topic's documents."""

##### ABOUT THE USE CASE
st.markdown("#### About the data π")
st.markdown(_ABOUT_DATA, unsafe_allow_html=True)
st.info(_RAW_DESCRIPTION_NOTE)

# NOTE(review): the widget key mentions "credit_score" although this page is
# about topic modeling - presumably carried over from another page; confirm
# before renaming, since the key identifies the widget's state.
see_data = st.checkbox('**See the data**', key="credit_score_data")
if see_data:
    st.markdown(" ")
    st.warning("This view only shows a subset of the 20 000 product description used.")
    # The preview file is only read when the user asks to see it.
    data = load_data_pickle(path_data, "data-tm-view.pkl")
    st.dataframe(
        data[["TITLE", "DESCRIPTION"]].reset_index(drop=True),
        use_container_width=True,
    )

# Two blank markdown blocks act as a vertical spacer before the next section.
for _ in range(2):
    st.markdown(" ")

# RUN THE MODEL
st.markdown("#### About the model π")
st.markdown(_ABOUT_MODEL, unsafe_allow_html=True)
st.info(_TOP_WORDS_NOTE)
def _render_overall_results_tab():
    """Render the "Overall results" tab: a table with one row per topic.

    Reads the module-level ``topic_info`` table and shows, for each topic, its
    title, the first five entries of its ``Representation`` and its
    ``Percentage`` drawn as a progress bar.
    """
    st.header("Overall results")
    st.markdown("""This tab showcases all of the **topics identified** within the product dataset, each topic's most significant words (**top words**), as well as the **proportion**
of products that were assigned to the specific topic.""")

    # Work on a copy so the shared `topic_info` table is never modified.
    summary_table = topic_info[['Title', 'Representation', 'Percentage']].copy()
    summary_table['Top Words'] = summary_table['Representation'].apply(lambda x: x[:5])
    summary_table = summary_table[["Title", "Top Words", "Percentage"]]
    summary_table = summary_table.rename(columns={"Title": "Topic Title"})

    st.data_editor(
        summary_table,
        column_config={
            "Percentage": st.column_config.ProgressColumn(
                "Proportion %",
                help="Proportion of documents within each topic",
                format="%.1f%%",
                min_value=0,
                max_value=100,
            )
        },
        use_container_width=True,
    )

    st.info("""**Note**: The topic 'titles' were not provided by the model but instead were generated by feeding the topic's top words to an LLM.
Traditional topic models define topics using representative/top words but weren't built to generate a specific title to each topic.""")


def _render_topic_details_tab():
    """Render the "Specific Topic Details" tab for a topic picked by the user.

    Loads the per-topic top words (JSON), the topic similarity table (pickle)
    and the similarity score matrix (numpy) from ``path_data``, then draws two
    bar charts side by side: the selected topic's top words and the topics
    most similar to it.
    """
    # Load top words
    with open(os.path.join(path_data, "topics_top_words.json"), "r", encoding="utf-8") as json_file:
        top_words_dict = json.load(json_file)

    # Load similarity df and scores
    similarity_df = load_data_pickle(path_data, "similarity_topic_df.pkl")
    similarity_scores = load_numpy(path_data, "similarity_topic_scores.npy")

    st.header("Learn more about each topic")
    st.markdown("""You can **select a specific topic** to get more information on its **top words**, as well as the
**other topics that are most similar to it**.""")
    st.markdown(" ")

    # Select topic
    topics = topic_info["Title"].sort_values().to_list()
    selected_topic = st.selectbox('**Select a Topic**', topics)
    # Row index into `similarity_scores`.
    # NOTE(review): the +1 presumably compensates for topic ids starting at -1
    # (an "outlier" topic) - confirm against how the score matrix was built.
    selected_topic_id = topic_info[topic_info['Title'] == selected_topic]["Topic"].to_numpy()[0] + 1
    st.markdown(" ")

    col1, col2 = st.columns(2)

    # Top words
    with col1:
        top_words_df = pd.DataFrame(top_words_dict[selected_topic], columns=["Word", "Importance"])
        top_words_df = top_words_df.sort_values(by=["Importance"], ascending=False)
        top_words_df["Importance"] = top_words_df["Importance"].round(2)

        fig = px.bar(top_words_df, x='Word', y='Importance', color="Importance", title="Top words", text_auto=True)
        fig.update_layout(yaxis=dict(range=[0, 1]), xaxis_title="", showlegend=False)
        st.plotly_chart(fig, use_container_width=True)

        st.info("""**Note:** Each score was computed based on the words importance in the particular topic using
a popular metric in NLP called TF-IDF (Term Frequency-Inverse Document Frequency). """)

    # Similar topics to the selected topic
    with col2:
        # Explicit copy: a column is added below, and writing to a filtered
        # slice of the loaded frame would be a chained assignment
        # (SettingWithCopyWarning / possibly lost write).
        similar_topics = similarity_df.loc[similarity_df["Topic"] == selected_topic].copy()
        # assumes the score row has exactly one value per row kept above - TODO confirm
        similar_topics["scores"] = 100 * similarity_scores[selected_topic_id, :]
        # Positional rename: relies on the frame having exactly these four
        # columns in this order once "scores" has been appended.
        similar_topics.columns = ["Original Topic", "Rank", "Topic", "Similarity (%)"]

        fig = px.bar(similar_topics, y='Similarity (%)', x='Topic', color="Topic", title="Five most similar topics", text_auto=True)
        fig.update_layout(yaxis=dict(range=[0, 100]),
                          xaxis_title="",
                          showlegend=False)
        st.plotly_chart(fig, use_container_width=True)

        st.info("""**Note:** Topics with a high similarity score can be merged together as to reduce the number of topics, as
well as improve the topics coherence.""")

    # Disabled ideas kept for reference: a word cloud of the topic's words and
    # a display of the topic's most representative document.


def show_results():
    """Render the results section: a title and two tabs.

    The first tab summarises every topic, the second gives details on one
    topic chosen by the user. A third "Search Similar Topics" tab is currently
    disabled. Relies on the module-level ``topic_info`` and ``path_data``.
    Returns ``None``.
    """
    st.markdown("#### See the results βοΈ")
    tab1, tab2 = st.tabs(["Overall results", "Specific Topic Details", ])
    st.markdown(" ")

    # Tab 1: Summary Table
    with tab1:
        _render_overall_results_tab()

    # Tab 2: Specific Topic Details
    with tab2:
        _render_topic_details_tab()
# The flag remembers, across Streamlit reruns of this script, that the results
# were already requested once in this session.
if 'button_clicked' not in st.session_state:
    st.session_state['button_clicked'] = False


def run_model():
    """Show the "Run the model" button and the results once it has been used.

    The results are rendered on the run where the button is clicked and on
    every later run of the session, using the ``button_clicked`` flag stored
    in ``st.session_state``.
    """
    clicked_now = st.button("**Run the model**", type="primary")
    st.markdown(" ")
    st.markdown(" ")

    already_shown = st.session_state['button_clicked']
    if not (already_shown or clicked_now):
        # Nothing requested yet: leave the page with just the button.
        return

    show_results()

    if not already_shown:
        # Set only after the results rendered, so a failed first render does
        # not mark the model as run.
        st.session_state['button_clicked'] = True


# Script entry point: draw the button (and the results when applicable).
run_model()