# Update w/ LDA
import gradio as gr
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import nltk, spacy, gensim
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from pprint import pprint
def concat_comments(sup_comment: list[str], comment: list[str]) -> list[str]:
    """Join positionally aligned texts into single two-line strings.

    Args:
        sup_comment: Texts placed on the first line of each result.
        comment: Texts placed on the second line of each result.

    Returns:
        One string per aligned pair: the ``sup_comment`` item, a newline,
        then the ``comment`` item. Pairing uses ``zip``, so the result is
        as long as the shorter input.
    """
    return [f"{s}\n{c}" for s, c in zip(sup_comment, comment)]
def sent_to_words(sentences):
    """Lazily tokenize each item of ``sentences`` with gensim.

    Args:
        sentences: Iterable of texts. Every item is passed through ``str()``
            before tokenizing, so non-string values are tokenized by their
            string form.

    Yields:
        The token list returned by ``gensim.utils.simple_preprocess`` for
        each item, in input order.
    """
    for sentence in sentences:
        # deacc=True strips accent marks from the tokens; punctuation is
        # already discarded by simple_preprocess's own tokenization.
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)
def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV'), nlp=None):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Args:
        texts: Iterable of token lists; each list is joined with spaces and
            run through the spaCy pipeline as one document.
        allowed_postags: Coarse POS tags (``token.pos_``) to keep. Any
            container supporting ``in`` works (list, tuple, set).
        nlp: Callable spaCy pipeline. When omitted, a module-level ``nlp``
            is used if one is defined; otherwise ``en_core_web_sm`` is
            loaded with the parser and NER disabled.

    Returns:
        One space-joined string of lemmas per input document. A lemma equal
        to ``'-PRON-'`` is replaced by an empty string rather than dropped,
        so it still contributes a separator to the joined result.
    """
    if nlp is None:
        # Callers that bind the pipeline at module level keep working; the
        # lookup goes through globals() because the parameter shadows it.
        nlp = globals().get("nlp")
    if nlp is None:
        nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append(" ".join(
            "" if token.lemma_ == "-PRON-" else token.lemma_
            for token in doc
            if token.pos_ in allowed_postags
        ))
    return texts_out
def _fit_lda(documents):
    """Vectorize lemmatized documents and fit an LDA model on the counts.

    Prints the fitted model, its log-likelihood (higher is better) and its
    perplexity (lower is better) as diagnostics.

    Returns:
        ``(vectorizer, lda_model)``, both fitted on ``documents``.
    """
    # NOTE(review): no CountVectorizer arguments are visible in the reviewed
    # source, so library defaults are used here. Restore the original
    # settings (stop words, min_df, token pattern, ...) if any were intended.
    vectorizer = CountVectorizer()
    data_vectorized = vectorizer.fit_transform(documents)

    # NOTE(review): only evaluate_every and n_jobs are visible in the reviewed
    # source; every other hyperparameter (including the number of topics)
    # falls back to the library default. Confirm against the intended setup.
    lda_model = LatentDirichletAllocation(
        evaluate_every=-1,
        n_jobs=-1,
    )
    lda_model.fit(data_vectorized)

    print(lda_model)  # Model attributes
    print("Log Likelihood: ", lda_model.score(data_vectorized))
    print("Perplexity: ", lda_model.perplexity(data_vectorized))
    return vectorizer, lda_model


def _predict_topics(texts, vectorizer, lda_model, topics):
    """Return the label of the highest-scoring topic for every text.

    The texts go through the same tokenize -> lemmatize -> vectorize steps
    as the training data, in one batch rather than one document at a time.
    """
    words = list(sent_to_words(texts))
    # NOTE(review): prediction keeps four POS classes while the model is
    # fitted on NOUN/ADJ only; lemmas outside the fitted vocabulary are
    # ignored by vectorizer.transform. Confirm the mismatch is intended.
    lemmas = lemmatization(words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
    scores = lda_model.transform(vectorizer.transform(lemmas))
    return [topics[i] for i in np.argmax(scores, axis=1)]


def _plot_topic_shares(df, topics, n_subreddits=22):
    """Draw, per subreddit, the stacked share of comments in each topic.

    For the ``n_subreddits`` most frequent subreddits, each topic
    contributes a bar segment whose height is the fraction of that
    subreddit's rows assigned to the topic. A hatched overlay on each
    segment shows the fraction of the subreddit's rows that are in the
    topic and have ``label == 1`` (assumed to flag ironic comments,
    TODO confirm).
    """
    subreddit_totals = df.subreddit.value_counts()
    subreddits = subreddit_totals.index[:n_subreddits]
    totals = subreddit_totals.loc[subreddits]
    is_labelled = df.label == 1

    width = 0.9
    fig, ax = plt.subplots(figsize=(10, 7))
    ax.axhline(0.5, color='red', ls=":", alpha=.3)
    bottom = np.zeros(len(subreddits))
    for topic in topics:
        in_topic = df.Topic_key_word == topic
        # reindex(fill_value=0) gives a zero share to subreddits where the
        # topic never occurs instead of failing on the missing label.
        share = (
            df.subreddit[in_topic].value_counts().reindex(subreddits, fill_value=0) / totals
        ).to_numpy()
        labelled_share = (
            df.subreddit[in_topic & is_labelled].value_counts().reindex(subreddits, fill_value=0) / totals
        ).to_numpy()
        ax.bar(subreddits, share, width, label=topic, bottom=bottom)
        ax.bar(subreddits, labelled_share, width - 0.01, bottom=bottom, color='black', edgecolor='white', alpha=.2, hatch='\\')
        bottom = bottom + share
    ax.set_title("Perc of topics for each subreddit")
    ax.legend(loc="upper right")
    return fig


def main(button, choose_context='sup comment'):
    """Fit an LDA topic model and plot topic shares per subreddit.

    Reads ``./data/results.csv`` (first column as index; the columns
    ``comment``, ``sup_comment``, ``subreddit`` and ``label`` are accessed),
    fits LDA on the text selected by ``choose_context``, assigns every
    ``comment`` its highest-scoring topic and returns a stacked bar chart.

    Args:
        button: Selected plot type. Currently unused: the same chart is
            produced for every value.
        choose_context: Which text the model is fitted on: ``'comment'``,
            ``'sup comment'`` or ``'sup comment + comment'``. Defaults to
            ``'sup comment'`` so a caller that supplies only ``button``
            still works.

    Returns:
        The matplotlib ``Figure`` holding the chart.

    Raises:
        ValueError: If ``choose_context`` is not one of the three options.
    """
    # lemmatization() looks the spaCy pipeline up as a module-level name, so
    # it is bound globally here, and only once instead of on every call.
    global nlp

    df = pd.read_csv('./data/results.csv', index_col=0)
    if choose_context == 'comment':
        data = df.comment
    elif choose_context == 'sup comment':
        data = df.sup_comment
    elif choose_context == 'sup comment + comment':
        data = concat_comments(df.sup_comment, df.comment)
    else:
        raise ValueError(f"Unknown LDA context: {choose_context!r}")

    if "nlp" not in globals():
        nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

    data_words = list(sent_to_words(data))
    # Keep nouns and adjectives only.
    data_lemmatized = lemmatization(data_words, allowed_postags=["NOUN", "ADJ"])

    vectorizer, lda_model = _fit_lda(data_lemmatized)
    topics = [f'Topic {i}' for i in range(lda_model.n_components)]

    # Topics are always assigned from the `comment` column, whichever
    # context the model was fitted on.
    df["Topic_key_word"] = _predict_topics(df['comment'], vectorizer, lda_model, topics)

    return _plot_topic_shares(df, topics)
# Gradio UI: two radio selectors feeding `main`, whose figure is shown in `plot`.
with gr.Blocks() as demo:
    button = gr.Radio(
        label="Plot type",
        choices=['scatter_plot', 'heatmap', 'us_map', 'interactive_barplot', "radial", "multiline"], value='scatter_plot'
    )
    choose_context = gr.Radio(
        label="Context LDA",
        choices=['comment', 'sup comment', 'sup comment + comment'], value='sup comment'
    )
    plot = gr.Plot(label="Plot")

    # `main` takes both selections, so every trigger passes both inputs.
    callback_inputs = [button, choose_context]
    button.change(main, inputs=callback_inputs, outputs=[plot])
    # Re-run when the context changes too; otherwise the selector has no
    # effect until the plot type is changed.
    choose_context.change(main, inputs=callback_inputs, outputs=[plot])
    demo.load(main, inputs=callback_inputs, outputs=[plot])

if __name__ == "__main__":
    # NOTE(review): the guard had no body in the reviewed source; launching
    # the Blocks app is assumed to be the intended entry point. Confirm.
    demo.launch()