Spaces:

hi-paris
/

app-ai-ds-hec

Running

App Files Files Community

app-ai-ds-hec / pages /topic_modeling.py

laudavid

modify app version

aa667a1 11 months ago

raw

history blame

10.2 kB


	import json
	import os
	import streamlit as st
	import pandas as pd
	import matplotlib.pyplot as plt
	import plotly.express as px

	from utils import load_data_csv, load_data_pickle, load_model_pickle, load_numpy
	from st_pages import add_indentation

	# from wordcloud import WordCloud

	# Page configuration
	#st.set_page_config(layout="wide")
	#add_indentation()


	# Function to generate word clouds
	# def generate_wordcloud(text):
	# wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
	# fig, ax = plt.subplots()
	# ax.imshow(wordcloud, interpolation='bilinear')
	# ax.axis('off')
	# return fig


	# Page configuration: let the page use the full browser width.
	st.set_page_config(layout="wide")

	# ---------------------------------------------------------------------
	# Introduction: what topic modeling is and where it is used
	# ---------------------------------------------------------------------
	st.title("Topic Modeling 📚")

	st.markdown("### What is Topic Modeling ?")

	st.info("""
	Topic modeling is a text-mining technique used to identify topics within a collection of documents.
	It is a useful tool for organizing and summarizing vast amounts of textual data as well as automate the discovery of hidden thematic structures in a corpus of text data, without any prior knowledge.
	""")

	st.markdown(" ")
	# Three columns are created but only the middle one receives content.
	_, gif_col, _ = st.columns([0.25, 0.4, 0.35])
	with gif_col:
	    st.image("images/topic_modeling.gif", caption="An example of Topic Modeling", use_column_width=True)

	st.markdown("""Common applications of Topic Modeling include:
	- Search Engine Optimization (SEO): 🔎 Determine the main topics/keywords present on a website to optimize content and improve search engine rankings.
	- Customer Support ✍️: Analyze customer support tickets, emails, and chat transcripts to identify common questions and complaints.
	- Fraud Detection and Risk Management: 🏦 : Detect fraudulent activities, compliance violations, and operational risks by analyzing textual data such as transaction descriptions, audit reports and regulatory filings.
	- Market Research 🌎: Gain competitive intelligence and make informed decisions regarding product development, marketing strategies, and market positioning by analyzing research reports and industry news.
	""")

	st.markdown(" ")
	st.divider()

	# ---------------------------------------------------------------------
	# Use case: topic modeling on e-commerce product descriptions
	# ---------------------------------------------------------------------
	st.markdown("# Topic modeling on product descriptions 🛍️")
	st.info("""In this use case, we will use a topic model to categorize around 20 000 e-commerce products using text descriptions and identify
	the main types of products solds.""")

	_, photo_col, _ = st.columns([0.2, 0.6, 0.2])
	with photo_col:
	    st.image("images/e-commerce.jpg")

	st.markdown(" ")

	# Folder holding every artefact of this page, and the topic data loaded
	# from it. Both names are read again by show_results() further down.
	path_data = "data/topic-modeling"
	topic_info = load_data_pickle(path_data, "topic_info.pkl")

	# ---------------------------------------------------------------------
	# About the data
	# ---------------------------------------------------------------------
	st.markdown("#### About the data 📋")
	st.markdown("""You were provided a dataset with around 20 000 products from a large e-commerce retailer. <br>
	This dataset contains the products' title and description on the website.""", unsafe_allow_html=True)
	st.info("""Note: Some of the descriptions featured below are shown in their 'raw' form, meaning they contain unprocessed html code and special characters.
	These descriptions were first 'cleaned' (by removing unwanted characters) before being used in the model.""")

	# NOTE(review): the widget key says "credit_score" although this is the
	# topic-modeling page - presumably copied from another page; confirm that
	# nothing else relies on it before renaming.
	if st.checkbox("See the data", key="credit_score_data"):
	    st.markdown(" ")
	    st.warning("This view only shows a subset of the 20 000 product description used.")
	    products_view = load_data_pickle(path_data, "data-tm-view.pkl")
	    st.dataframe(
	        products_view[["TITLE", "DESCRIPTION"]].reset_index(drop=True),
	        use_container_width=True,
	    )

	st.markdown(" ")
	st.markdown(" ")

	# ---------------------------------------------------------------------
	# About the model
	# ---------------------------------------------------------------------
	st.markdown("#### About the model 📚")
	st.markdown("""Topic models can be seen as unsupervised clustering models where text documents are grouped into topics/clusters based on their similarities.
	We will use here a topic model to automatically categorize/group the retailer's products based on their description,
	as well as understand what are the most common type of products being sold.""", unsafe_allow_html=True)

	st.info("""Note: In topic modeling, the final topics are represented by the model using 'top words'.
	A topic's top words are chosen based on how much they appear in the topic's documents.""")

	def show_results():
	    """Render the "See the results" section of the page.

	    Two tabs are drawn:

	    * "Overall results" lists every row of the module-level ``topic_info``
	      with its title, the first five entries of its ``Representation`` and
	      its ``Percentage`` shown as a progress bar.
	    * "Specific Topic Details" lets the user pick one topic title and shows
	      a bar chart of that topic's top words next to a bar chart of the
	      topics stored as similar to it.

	    Reads the module-level names ``topic_info`` and ``path_data``.

	    Returns:
	        None.
	    """
	    st.markdown("#### See the results ☑️")
	    tab1, tab2 = st.tabs(["Overall results", "Specific Topic Details"])
	    st.markdown(" ")

	    # Tab 1: summary table of all topics
	    with tab1:
	        st.header("Overall results")
	        st.markdown("""This tab showcases all of the topics identified within the product dataset, each topic's most significant words (top words), as well as the proportion
	of products that were assigned to the specific topic.""")

	        # Work on a copy so the shared topic_info frame is never modified.
	        summary_table = topic_info[["Title", "Representation", "Percentage"]].copy()
	        summary_table["Top Words"] = summary_table["Representation"].apply(lambda words: words[:5])
	        summary_table = summary_table[["Title", "Top Words", "Percentage"]]
	        summary_table = summary_table.rename(columns={"Title": "Topic Title"})

	        st.data_editor(
	            summary_table,
	            column_config={
	                "Percentage": st.column_config.ProgressColumn(
	                    "Proportion %",
	                    help="Proportion of documents within each topic",
	                    format="%.1f%%",
	                    min_value=0,
	                    max_value=100,
	                )
	            },
	            use_container_width=True,
	        )

	        st.info("""Note: The topic 'titles' were not provided by the model but instead were generated by feeding the topic's top words to an LLM.
	Traditional topic models define topics using representative/top words but weren't built to generate a specific title to each topic.""")

	    # Tab 2: details for one selected topic
	    with tab2:
	        # Read the JSON as UTF-8 explicitly instead of relying on the
	        # platform's default text encoding.
	        top_words_path = os.path.join(path_data, "topics_top_words.json")
	        with open(top_words_path, "r", encoding="utf-8") as json_file:
	            top_words_dict = json.load(json_file)

	        # Similarity table and the matching score matrix
	        similarity_df = load_data_pickle(path_data, "similarity_topic_df.pkl")
	        similarity_scores = load_numpy(path_data, "similarity_topic_scores.npy")

	        st.header("Learn more about each topic")
	        st.markdown("""You can select a specific topic to get more information on its top words, as well as the
	other topics that are most similar to it.""")

	        st.markdown(" ")

	        # Topic selection, titles sorted alphabetically
	        topics = topic_info["Title"].sort_values().to_list()
	        selected_topic = st.selectbox("Select a Topic", topics)
	        # The score matrix is indexed with the topic's "Topic" value plus one.
	        # NOTE(review): presumably topic ids start at -1 - confirm against the data.
	        selected_topic_id = topic_info[topic_info["Title"] == selected_topic]["Topic"].to_numpy()[0] + 1

	        st.markdown(" ")
	        col1, col2 = st.columns(2)

	        # Left column: top words of the selected topic
	        with col1:
	            top_words_df = pd.DataFrame(top_words_dict[selected_topic], columns=["Word", "Importance"])
	            top_words_df = top_words_df.sort_values(by=["Importance"], ascending=False)
	            top_words_df["Importance"] = top_words_df["Importance"].round(2)

	            fig = px.bar(top_words_df, x="Word", y="Importance", color="Importance", title="Top words", text_auto=True)
	            fig.update_layout(yaxis=dict(range=[0, 1]), xaxis_title="", showlegend=False)
	            st.plotly_chart(fig, use_container_width=True)
	            st.info("""Note: Each score was computed based on the words importance in the particular topic using
	a popular metric in NLP called TF-IDF (Term Frequency-Inverse Document Frequency). """)

	        # Right column: topics most similar to the selected one
	        with col2:
	            # .copy() gives an independent frame: adding a column to a
	            # filtered slice is chained assignment and makes pandas emit
	            # SettingWithCopyWarning.
	            similar_topics = similarity_df.loc[similarity_df["Topic"] == selected_topic].copy()
	            similar_topics["scores"] = 100 * similarity_scores[selected_topic_id, :]
	            # Positional rename: the three loaded columns plus the new one.
	            similar_topics.columns = ["Original Topic", "Rank", "Topic", "Similarity (%)"]

	            fig = px.bar(
	                similar_topics,
	                y="Similarity (%)",
	                x="Topic",
	                color="Topic",
	                title="Five most similar topics",
	                text_auto=True,
	            )
	            fig.update_layout(yaxis=dict(range=[0, 100]), xaxis_title="", showlegend=False)

	            st.plotly_chart(fig, use_container_width=True)
	            st.info("""Note: Topics with a high similarity score can be merged together as to reduce the number of topics, as
	well as improve the topics coherence.""")

	# Flag kept in the session state: False until the results have been shown
	# once, True afterwards.
	if "button_clicked" not in st.session_state:
	    st.session_state["button_clicked"] = False


	def run_model():
	    """Draw the "Run the model" button and display the results.

	    The results are shown when the ``button_clicked`` flag in
	    ``st.session_state`` is already set, or when the button reports a click
	    during this run; in the latter case the flag is set after the results
	    have been rendered.

	    Returns:
	        None.
	    """
	    clicked = st.button("Run the model", type="primary")
	    st.markdown(" ")
	    st.markdown(" ")

	    if st.session_state["button_clicked"]:
	        # Results were revealed on an earlier run: keep showing them.
	        show_results()
	    elif clicked:
	        # First click: show the results, then remember it.
	        show_results()
	        st.session_state["button_clicked"] = True


	run_model()