Spaces:

DanielSc4
/

DataAnalyticsNLP

Runtime error

App Files Files Community

DataAnalyticsNLP / app.py

DanielSc4

Update w/ LDA

5affbbc over 1 year ago

raw

history blame

7.38 kB

	import gradio as gr

	import pandas as pd
	import matplotlib.pyplot as plt
	import numpy as np
	import nltk, spacy, gensim
	from sklearn.decomposition import LatentDirichletAllocation
	from sklearn.feature_extraction.text import CountVectorizer
	from pprint import pprint

	def concat_comments(sup_comment: list[str], comment: list[str]) -> list[str]:
	    """Pair each parent comment with its reply as a single two-line string.

	    The i-th result is ``sup_comment[i]``, a newline, then ``comment[i]``.
	    Pairing follows ``zip``, so it stops at the end of the shorter input.
	    """
	    template = "{s}\n{c}"
	    merged = []
	    for parent_text, reply_text in zip(sup_comment, comment):
	        merged.append(template.format(s=parent_text, c=reply_text))
	    return merged

	def sent_to_words(sentences):
	    """Lazily turn each item of ``sentences`` into a list of tokens.

	    Every item is coerced with ``str`` and tokenized by
	    ``gensim.utils.simple_preprocess``; one token list is yielded per input
	    item, in input order.
	    """
	    for raw_sentence in sentences:
	        # deacc=True is gensim's option for stripping accent marks from tokens.
	        tokens = gensim.utils.simple_preprocess(str(raw_sentence), deacc=True)
	        yield tokens

	def _default_nlp():
	    """Return the module-level spaCy pipeline, loading it on first use."""
	    global nlp
	    if globals().get("nlp") is None:
	        # Same model and disabled components that main() asks for.
	        nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
	    return nlp


	def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV'), nlp=None):
	    """Lemmatize tokenized documents, keeping only selected parts of speech.

	    Args:
	        texts: Iterable of documents, each an iterable of string tokens.
	        allowed_postags: Collection of ``token.pos_`` values to keep.
	        nlp: Optional callable that maps a string to an iterable of tokens
	            exposing ``lemma_`` and ``pos_``. When omitted, the module-level
	            ``nlp`` pipeline is used, and is loaded first if it does not
	            exist yet (previously this function raised ``NameError`` unless
	            a global ``nlp`` happened to be defined).

	    Returns:
	        A list with one string per document: the kept lemmas joined by
	        single spaces. A lemma equal to ``'-PRON-'`` is replaced by an empty
	        string, so it still occupies a slot in the join.
	    """
	    pipeline = nlp if nlp is not None else _default_nlp()
	    texts_out = []
	    for sent in texts:
	        doc = pipeline(" ".join(sent))
	        texts_out.append(" ".join(
	            '' if token.lemma_ == '-PRON-' else token.lemma_
	            for token in doc
	            if token.pos_ in allowed_postags
	        ))
	    return texts_out


	def _select_corpus(df, choose_context):
	    """Return the texts the topic model is trained on for the chosen context."""
	    if choose_context == 'comment':
	        return df.comment
	    if choose_context == 'sup comment':
	        return df.sup_comment
	    if choose_context == 'sup comment + comment':
	        return concat_comments(df.sup_comment, df.comment)
	    # Previously an unknown value fell through and failed later with an
	    # UnboundLocalError on `data`.
	    raise ValueError(f"Unknown LDA context: {choose_context!r}")


	def _fit_topic_model(documents):
	    """Fit a word-count vectorizer and a 5-topic LDA model on ``documents``.

	    Prints the fitted model, its log-likelihood, perplexity and parameters,
	    then returns ``(vectorizer, lda_model)``.
	    """
	    vectorizer = CountVectorizer(
	        analyzer='word',
	        min_df=10,
	        stop_words='english',
	        lowercase=True,
	        token_pattern='[a-zA-Z0-9]{3,}',
	    )
	    doc_term = vectorizer.fit_transform(documents)

	    lda_model = LatentDirichletAllocation(
	        n_components=5,
	        max_iter=10,
	        learning_method='online',
	        random_state=100,
	        batch_size=128,
	        evaluate_every=-1,
	        n_jobs=-1,
	    )
	    lda_model.fit(doc_term)
	    print(lda_model)  # Model attributes

	    # Log likelihood: higher is better.
	    print("Log Likelihood: ", lda_model.score(doc_term))
	    # Perplexity: lower is better. Perplexity = exp(-1. * log-likelihood per word)
	    print("Perplexity: ", lda_model.perplexity(doc_term))
	    # See model parameters
	    pprint(lda_model.get_params())
	    return vectorizer, lda_model


	def _label_comments(comments, vectorizer, lda_model, topics):
	    """Return, for every comment, the label of its highest-scoring topic.

	    This is a batched form of the ``predict_topic`` helper whose definition
	    was commented out while ``apply_predict_topic`` still called it.
	    """
	    words = list(sent_to_words(comments))
	    # NOTE(review): the disabled predict_topic kept four POS tags here while
	    # training keeps only NOUN/ADJ; preserved as written - confirm intent.
	    lemmas = lemmatization(words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
	    scores = lda_model.transform(vectorizer.transform(lemmas))
	    return [topics[i] for i in scores.argmax(axis=1)]


	def _topic_shares(df, subreddits, topics):
	    """Compute per-topic shares of each subreddit's rows.

	    Returns ``(weight_counts, irony_percs)``. Both map a topic label to a
	    list aligned with ``subreddits``: the fraction of the subreddit's rows
	    assigned to that topic, and the fraction that are assigned to that topic
	    and also have ``label == 1``.
	    """
	    weight_counts = {t: [] for t in topics}
	    irony_percs = {t: [] for t in topics}
	    for subreddit in subreddits:
	        rows = df[df.subreddit == subreddit]
	        n_rows = len(rows)  # > 0: subreddits come from value_counts()
	        labelled_one = rows.label == 1
	        for t in topics:
	            in_topic = rows.Topic_key_word == t
	            # Counting a boolean mask yields 0 for a topic that is absent
	            # from this subreddit, where value_counts()[...] raised KeyError.
	            weight_counts[t].append(in_topic.sum() / n_rows)
	            irony_percs[t].append((in_topic & labelled_one).sum() / n_rows)
	    return weight_counts, irony_percs


	def _plot_topic_shares(subreddits, weight_counts, irony_percs, width=0.9):
	    """Draw the stacked per-subreddit topic bars and return the figure."""
	    fig, ax = plt.subplots(figsize=(10, 7))
	    ax.axhline(0.5, color='red', ls=":", alpha=.3)

	    bottom = np.zeros(len(subreddits))
	    for topic, shares in weight_counts.items():
	        ax.bar(subreddits, shares, width, label=topic, bottom=bottom)
	        # Hatched overlay: the label == 1 part of the same stacked segment.
	        ax.bar(
	            subreddits, irony_percs[topic], width - 0.01, bottom=bottom,
	            color='black', edgecolor='white', alpha=.2, hatch='\\',
	        )
	        bottom += np.asarray(shares)

	    ax.set_title("Perc of topics for each subreddit")
	    ax.legend(loc="upper right")
	    ax.tick_params(axis="x", labelrotation=70)
	    return fig


	def main(button, choose_context='sup comment'):
	    """Fit an LDA topic model and plot topic shares per subreddit.

	    Reads ``./data/results.csv``, trains the model on the texts selected by
	    ``choose_context``, assigns every row's ``comment`` its most probable
	    topic, and draws a stacked bar chart for the 22 most frequent subreddits.

	    Args:
	        button: Selected plot type. Currently unused; the same chart is
	            produced for every value.
	        choose_context: ``'comment'``, ``'sup comment'`` or
	            ``'sup comment + comment'``. Defaults to ``'sup comment'``, the
	            initial value of the UI radio, so callers that pass only
	            ``button`` keep working.

	    Returns:
	        The matplotlib figure containing the chart.

	    Raises:
	        ValueError: If ``choose_context`` is not one of the three options.

	    Side effects:
	        Prints model diagnostics to stdout and binds the module-level
	        ``nlp`` spaCy pipeline on first use.
	    """
	    df = pd.read_csv('./data/results.csv', index_col=0)
	    data = _select_corpus(df, choose_context)

	    # lemmatization() looks `nlp` up as a module-level name, so the pipeline
	    # must be bound globally; a local assignment left it undefined there.
	    # Loading once also avoids reloading the model on every UI event.
	    global nlp
	    if globals().get("nlp") is None:
	        nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

	    data_words = list(sent_to_words(data))
	    # Train on nouns and adjectives only.
	    data_lemmatized = lemmatization(data_words, allowed_postags=["NOUN", "ADJ"])
	    vectorizer, lda_model = _fit_topic_model(data_lemmatized)

	    topics = [f'Topic {i}' for i in range(lda_model.n_components)]
	    df["Topic_key_word"] = _label_comments(df['comment'], vectorizer, lda_model, topics)

	    # value_counts() sorts by frequency, so this keeps the 22 largest subreddits.
	    subreddits = list(df.subreddit.value_counts().index[:22])
	    weight_counts, irony_percs = _topic_shares(df, subreddits, topics)
	    return _plot_topic_shares(subreddits, weight_counts, irony_percs)


	# UI: two radio selectors feeding main(), whose figure is shown in `plot`.
	with gr.Blocks() as demo:
	    button = gr.Radio(
	        label="Plot type",
	        choices=['scatter_plot', 'heatmap', 'us_map', 'interactive_barplot', "radial", "multiline"], value='scatter_plot'
	    )
	    choose_context = gr.Radio(
	        label="Context LDA",
	        choices=['comment', 'sup comment', 'sup comment + comment'], value='sup comment'
	    )
	    plot = gr.Plot(label="Plot")

	    # Every listener passes both selectors, in main()'s parameter order.
	    # demo.load used to pass only `button`, leaving main() short one argument.
	    main_inputs = [button, choose_context]
	    button.change(main, inputs=main_inputs, outputs=[plot])
	    # Redraw when the LDA context changes too; it had no listener before.
	    choose_context.change(main, inputs=main_inputs, outputs=[plot])
	    demo.load(main, inputs=main_inputs, outputs=[plot])


	if __name__ == "__main__":
	    demo.launch()