Spaces:
Running
Running
File size: 8,122 Bytes
8bf791d 4183828 8bf791d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 |
import streamlit as st
import pandas as pd, numpy as np
from bertopic import BERTopic
from datetime import datetime
import math
from helper import visualize_topics_over_time, visualize_topics_per_class
@st.cache_data
def get_df(url):
    """Read a CSV from *url* into a DataFrame; cached across Streamlit reruns."""
    frame = pd.read_csv(url)
    return frame
@st.cache_resource
def get_model(url):
    """Load a saved BERTopic model from *url*; cached as a shared resource."""
    model = BERTopic.load(url)
    return model
@st.cache_data
def get_topics_over_time(frame, lens):
    """Compute topic prevalence over time for the model in session state.

    Parameters
    ----------
    frame : pd.DataFrame
        Must carry ``proc2`` (preprocessed text) and ``date`` columns.
    lens : int
        Not used in the body; it is part of Streamlit's cache key so the
        cached result is invalidated when the dataframe length changes.

    Returns the frame produced by ``BERTopic.topics_over_time``.
    """
    strings = frame.proc2.apply(lambda x: str(x))
    date = pd.to_datetime(frame.date, format=st.session_state.datetime_format)
    # One time bin per ~3 distinct dates; guard against nr_bins == 0 when
    # the data holds fewer than 3 unique dates (floor(2/3) == 0 would break
    # the binning inside BERTopic).
    nr_bins = max(1, len(frame.date.unique()) // 3)
    return st.session_state.model.topics_over_time(strings, date, nr_bins=nr_bins)
@st.cache_data
def get_topics_per_class(frame, colname):
    """Compute topic distribution per class, where classes come from
    ``frame[colname]``.

    BUG FIX: classes were previously read from ``st.session_state.df`` rather
    than the ``frame`` argument. The visible callers always pass
    ``st.session_state.df`` so behavior is unchanged, but reading the
    parameter keeps this consistent with ``strings`` above and makes
    Streamlit's cache key (which hashes the arguments) honest.
    """
    strings = frame.proc2.apply(lambda x: str(x))
    classes = frame[colname].apply(lambda x: str(x))
    return st.session_state.model.topics_per_class(strings, classes=classes)
# --- Page chrome --------------------------------------------------------
st.set_page_config(
    page_title="BoardTopic",
    page_icon="🤖",
    layout="wide"
)
st.header("🤖 BoardTopic")
st.subheader("Turning your data into insight with behavioral data science")

# --- First-run flow: no model in session yet ----------------------------
if "model" not in st.session_state:
    st.markdown("Welcome to BoardTopic, a friendly way to understand your big data.")
    st.markdown("If you do not have a BoardTopic model trained, please go to the 'Create Model' tab.")
    st.markdown("If you already have a BoardTopic model trained, please enter the information below:")
    model_name = st.text_input("Please enter model file name (e.g., 'model')")
    df_name = st.text_input("Please enter dataframe file name (e.g., 'df_small.csv')")
    # NOTE(review): the uploaded file's contents are never read — the upload
    # only acts as a "go" trigger; the model and dataframe are loaded from
    # the local 'models/' folder by the names typed above. Confirm intended.
    uploaded_file2 = st.file_uploader("Choose a file")
    #datetime_format = st.text_input("Please enter the date format (e.g., '%d.%m.%Y')", value="")
    st.session_state.datetime_format = None #if datetime_format == "" else datetime_format
    if uploaded_file2 is not None:
        st.session_state.model = get_model(f'models/{model_name}')
        st.session_state.df = get_df(f'models/{df_name}')
        st.success("Model and dataframe loaded!")
# --- Main dashboard: runs once a model + dataframe are in session state --
if "model" in st.session_state:
    # ---- Date format ----------------------------------------------------
    # BUG FIX: the original assigned the widget's value back into
    # st.session_state.datetime_format while the text_input itself owned
    # key="datetime_format"; Streamlit raises when a widget-owned session
    # key is reassigned. Use an un-keyed widget and store the value once.
    if "datetime_format" not in st.session_state:
        fmt = st.text_input("Please enter the date format (e.g., '%d.%m.%Y')", value="")
        st.session_state.datetime_format = None if fmt == "" else fmt

    # Human-readable topic labels: first six words of each representation.
    st.session_state.model.set_topic_labels(
        st.session_state.model.generate_topic_labels(nr_words=6, topic_prefix=False, word_length=10, separator=", ")
    )

    # Merge BERTopic's per-document info with the raw dataframe on row id.
    st.session_state.model_df = st.session_state.model.get_document_info(st.session_state.df.proc)
    st.session_state.df["id"] = st.session_state.model_df.index
    st.session_state.model_df["id"] = st.session_state.model_df.index
    st.session_state.model_df = pd.merge(st.session_state.model_df, st.session_state.df, how="left", on="id")
    st.session_state.model_df["date"] = pd.to_datetime(
        st.session_state.model_df.date, format=st.session_state.datetime_format
    )

    # ---- Topics over time -----------------------------------------------
    topics_over_time = get_topics_over_time(st.session_state.df, len(st.session_state.df))
    largest_topics = (
        st.session_state.model_df.groupby("Topic").agg("count")
        .sort_values("Document", ascending=False)
        .head(10)
    )
    st.write(visualize_topics_over_time(
        st.session_state.model, topics_over_time, topics=list(largest_topics.index),
        custom_labels=True, title="10 most popular narratives over time"
    ))

    st.markdown("#### Overall document distribution")
    grouped = st.session_state.model_df.groupby("date").agg("count")
    grouped['date'] = pd.to_datetime(grouped.index, format=st.session_state.datetime_format)
    st.bar_chart(data=grouped, x='date', y='Document')

    # ---- Emotions -------------------------------------------------------
    st.markdown("#### Emotions")
    # Binarise emotion scores: a document "has" an emotion when its
    # classifier score exceeds 0.9.
    joy = st.session_state.model_df.joy.apply(lambda x: 1 if x > 0.9 else 0)
    sadness = st.session_state.model_df.sadness.apply(lambda x: 1 if x > 0.9 else 0)
    surprise = st.session_state.model_df.surprise.apply(lambda x: 1 if x > 0.9 else 0)
    fear = st.session_state.model_df.fear.apply(lambda x: 1 if x > 0.9 else 0)
    anger = st.session_state.model_df.anger.apply(lambda x: 1 if x > 0.9 else 0)
    emotions = pd.DataFrame({"date": st.session_state.model_df.date, "source": st.session_state.model_df.source,
                             "joy": joy, "sadness": sadness, "surprise": surprise, "fear": fear, "anger": anger})

    st.markdown("##### Percent with emotion by platform")
    st.bar_chart(emotions.groupby("source").agg("mean").T * 100)
    st.markdown("##### Platform breakdown")
    st.bar_chart(emotions.groupby("source").agg("mean") * 100)
    emotionsgr = emotions.groupby("date").agg("mean") * 100
    # BUG FIX: the date axis was previously rebuilt from `grouped.index`
    # (the document-count frame above); use this frame's own index so the
    # x-axis always matches the plotted rows.
    emotionsgr['date'] = pd.to_datetime(emotionsgr.index, format=st.session_state.datetime_format)
    st.markdown("##### Emotional dynamics over time")
    st.line_chart(emotionsgr, x="date")

    # ---- Topics per class -----------------------------------------------
    st.markdown("#### Topics per class")
    if "source" in st.session_state.df.columns:
        topics_per_class1 = get_topics_per_class(st.session_state.df, "source")
        st.plotly_chart(visualize_topics_per_class(
            st.session_state.model, topics_per_class1, top_n_topics=20, width=900, height=600,
            custom_labels=True, title="20 most popular narratives per platform"
        ))
    # Dominant emotion per document = column with the highest score.
    st.session_state.df["emotion"] = st.session_state.df[["joy", "sadness", "surprise", "fear", 'anger', 'no_emotion']].idxmax(axis=1)
    topics_per_class2 = get_topics_per_class(st.session_state.df, "emotion")
    st.plotly_chart(visualize_topics_per_class(
        st.session_state.model, topics_per_class2, top_n_topics=20, width=900, height=600,
        custom_labels=True, title="20 most popular narratives per emotion"
    ))

    # ---- All topics table -----------------------------------------------
    st.markdown("#### All topics")
    # NOTE(review): despite the name, this is the full dataframe, not the
    # last week of data — the variable name was carried over as-is.
    last_week = st.session_state.model_df
    largest_topics_last_week = last_week.groupby("Topic").agg("count").sort_values("Document", ascending=False)
    largest_topics_last_week["Name"] = [
        list(last_week[last_week.Topic == i]["CustomName"])[0] for i in largest_topics_last_week.index
    ]
    largest_topics_last_week["Count"] = largest_topics_last_week["Document"]
    largest_topics_last_week["Percent"] = round(100 * largest_topics_last_week["Count"] / len(st.session_state.model_df), 3)
    st.table(largest_topics_last_week[["Name", "Count", "Percent"]])

    # ---- Representative documents ---------------------------------------
    # Positional label index -> label text, for the selectbox display.
    dictionary = dict(enumerate(st.session_state.model.custom_labels_))

    def mapping(item):
        """Selectbox format_func: show the custom label for a topic id.

        Falls back to the raw id so an id absent from ``dictionary``
        (presumably the -1 outlier topic — confirm against
        ``custom_labels_`` ordering) no longer raises KeyError.
        """
        return dictionary.get(item, str(item))

    st.markdown("#### Explore representative documents")
    st.selectbox("Select topic", list(st.session_state.model_df.Topic.unique()), key="selected_topic", format_func=mapping)
    # NOTE(review): relies on a private BERTopic API; pin the BERTopic
    # version or this call may break on upgrade.
    repr_docs_mappings, repr_docs, repr_docs_indices = st.session_state.model._extract_representative_docs(
        st.session_state.model.c_tf_idf_, st.session_state.model_df, st.session_state.model.topic_representations_
    )
    ind = repr_docs_indices[st.session_state.selected_topic]
    for j, doc in enumerate(st.session_state.model_df.iloc[ind].Document, start=1):
        st.markdown(f"**Representative document {j}**")
        st.text(doc)
    st.markdown("---")

    # ---- Persistence -----------------------------------------------------
    st.markdown("### Save current model")
    name = st.text_input("Please name this model file (e.g., 'my_cool_model')")
    if st.button("Save this model"):
        st.session_state.model.save(f"models/model_{name}")
        st.session_state.df.to_csv(f"models/df_{name}.csv")
        st.success("Model and dataframe saved in folder 'models'!")
    if st.button("Restart"):
        st.cache_data.clear()
        st.cache_resource.clear()
        # BUG FIX: snapshot the keys first — deleting entries while
        # iterating the live key view raises
        # "dictionary changed size during iteration".
        for key in list(st.session_state.keys()):
            del st.session_state[key]
|