DanielSc4 committed
Commit 5affbbc
Parent: 9cbeac4

Update w/ LDA

Files changed (2):
  1. app.py +136 -11
  2. requirements.txt +1 -1
app.py CHANGED
@@ -3,8 +3,10 @@ import gradio as gr
 import pandas as pd
 import matplotlib.pyplot as plt
 import numpy as np
+import nltk, spacy, gensim
 from sklearn.decomposition import LatentDirichletAllocation
 from sklearn.feature_extraction.text import CountVectorizer
+from pprint import pprint
 
 def concat_comments(sup_comment: list[str], comment: list[str]) -> list[str]:
     format_s = "{s}\n{c}"
@@ -12,15 +14,138 @@ def concat_comments(sup_comment: list[str], comment: list[str]) -> list[str]:
         format_s.format(s=s, c=c) for s, c in zip(sup_comment, comment)
     ]
 
-
-
-def main(button, chose_context):
+def sent_to_words(sentences):
+    for sentence in sentences:
+        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)  # deacc=True removes punctuation
+
+def lemmatization(texts, nlp, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
+    texts_out = []
+    for sent in texts:
+        doc = nlp(" ".join(sent))
+        texts_out.append(" ".join([
+            token.lemma_ if token.lemma_ not in ['-PRON-'] else '' for token in doc if token.pos_ in allowed_postags
+        ]))
+    return texts_out
+
+
+def main(button, choose_context):
     df = pd.read_csv('./data/results.csv', index_col=0)
-    print(chose_context)
-
-    data = concat_comments(df.sup_comment, df.comment)
-
 
+    if choose_context == 'comment':
+        data = df.comment
+    elif choose_context == 'sup comment':
+        data = df.sup_comment
+    elif choose_context == 'sup comment + comment':
+        data = concat_comments(df.sup_comment, df.comment)
+
+    data_words = list(sent_to_words(data))
+    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
+    data_lemmatized = lemmatization(data_words, nlp, allowed_postags=["NOUN", "ADJ"])  # keep only nouns and adjectives
+
+    vectorizer = CountVectorizer(
+        analyzer='word',
+        min_df=10,                       # ignore terms in fewer than 10 documents
+        stop_words='english',
+        lowercase=True,
+        token_pattern='[a-zA-Z0-9]{3,}'  # tokens of at least 3 characters
+    )
+    data_vectorized = vectorizer.fit_transform(data_lemmatized)
+
+
+    lda_model = LatentDirichletAllocation(
+        n_components=5,            # number of topics
+        max_iter=10,
+        learning_method='online',
+        random_state=100,
+        batch_size=128,
+        evaluate_every=-1,
+        n_jobs=-1,                 # use all available CPUs
+    )
+    lda_output = lda_model.fit_transform(data_vectorized)
+    print(lda_model)  # model attributes
+
+    # Log-likelihood: higher is better
+    print("Log Likelihood: ", lda_model.score(data_vectorized))
+    # Perplexity: lower is better; perplexity = exp(-1. * log-likelihood per word)
+    print("Perplexity: ", lda_model.perplexity(data_vectorized))
+    # See model parameters
+    pprint(lda_model.get_params())
+
+    best_lda_model = lda_model
+
+    lda_output = best_lda_model.transform(data_vectorized)
+
+    topicnames = ["Topic" + str(i) for i in range(best_lda_model.n_components)]
+    docnames = ["Doc" + str(i) for i in range(len(data))]
+    df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)
+
+    dominant_topic = np.argmax(df_document_topic.values, axis=1)
+    df_document_topic["dominant_topic"] = dominant_topic
+
+    # Topic-keyword matrix: one row per topic, one column per vocabulary term
+    df_topic_keywords = pd.DataFrame(best_lda_model.components_)
+
+    # Assign column and index labels
+    df_topic_keywords.columns = vectorizer.get_feature_names_out()
+    df_topic_keywords.index = topicnames
+    # each row now maps a topic to its per-term weights
+
+
+    # Show top n keywords for each topic
+    def show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=20):
+        keywords = np.array(vectorizer.get_feature_names_out())
+        topic_keywords = []
+        for topic_weights in lda_model.components_:
+            top_keyword_locs = (-topic_weights).argsort()[:n_words]
+            topic_keywords.append(keywords.take(top_keyword_locs))
+        return topic_keywords
+    topic_keywords = show_topics(vectorizer=vectorizer, lda_model=best_lda_model, n_words=15)
+    # Topic-keywords dataframe
+    df_topic_keywords = pd.DataFrame(topic_keywords)
+    df_topic_keywords.columns = ['Word ' + str(i) for i in range(df_topic_keywords.shape[1])]
+    df_topic_keywords.index = ['Topic ' + str(i) for i in range(df_topic_keywords.shape[0])]
+
+
+    topics = [
+        f'Topic {i}' for i in range(len(df_topic_keywords))
+    ]
+    df_topic_keywords["Topics"] = topics
+
+
+    # Predict the topic of a given text document,
+    # reusing the spaCy pipeline loaded above
+    def predict_topic(text, nlp=nlp):
+        # sent_to_words and lemmatization are the module-level helpers;
+        # they are readable here without any global declaration
+        # Step 1: clean with simple_preprocess
+        mytext_2 = list(sent_to_words(text))
+        # Step 2: lemmatize
+        mytext_3 = lemmatization(mytext_2, nlp, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
+        # Step 3: vectorize with the fitted CountVectorizer
+        mytext_4 = vectorizer.transform(mytext_3)
+        # Step 4: transform with the fitted LDA model
+        topic_probability_scores = best_lda_model.transform(mytext_4)
+        topic = df_topic_keywords.iloc[np.argmax(topic_probability_scores), 1:14].values.tolist()
+
+        # Step 5: infer the topic label
+        infer_topic = df_topic_keywords.iloc[np.argmax(topic_probability_scores), -1]
+
+        # topic_guess = df_topic_keywords.iloc[np.argmax(topic_probability_scores), Topics]
+        return infer_topic, topic, topic_probability_scores
+
+    # Example: predict the topic of a new document
+    # mytext = ["This is a test of a random topic where I talk about politics"]
+    # infer_topic, topic, prob_scores = predict_topic(text=mytext)
+
+    def apply_predict_topic(text):
+        text = [text]
+        infer_topic, topic, prob_scores = predict_topic(text=text)
+        return infer_topic
+
+    df["Topic_key_word"] = df['comment'].apply(apply_predict_topic)
+
+    # plot
     subreddits = df.subreddit.value_counts().index[:22]
 
     weight_counts = {
@@ -55,7 +180,7 @@ def main(button, chose_context):
     ax.legend(loc="upper right")
     plt.xticks(rotation=70)
 
-    plt.show()
+    return fig
 
 
 with gr.Blocks() as demo:
@@ -63,12 +188,12 @@ with gr.Blocks() as demo:
         label="Plot type",
         choices=['scatter_plot', 'heatmap', 'us_map', 'interactive_barplot', "radial", "multiline"], value='scatter_plot'
     )
-    chose_context = gr.Radio(
+    choose_context = gr.Radio(
         label="Context LDA",
-        choices=['comment', 'sup comment', 'sup comment + comment'], value='scatter_plot'
+        choices=['comment', 'sup comment', 'sup comment + comment'], value='sup comment'
     )
     plot = gr.Plot(label="Plot")
-    button.change(main, inputs=[button, chose_context], outputs=[plot])
+    button.change(main, inputs=[button, choose_context], outputs=[plot])
     demo.load(main, inputs=[button], outputs=[plot])
 
 
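The preprocessing added here chains gensim's `simple_preprocess` tokenizer with spaCy lemmatization and part-of-speech filtering. A minimal sketch of that chain on a toy sentence (the input is illustrative; it assumes the `en_core_web_sm` model is installed):

```python
import gensim
import spacy

nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

# tokenize, lowercase, and strip punctuation/accents
tokens = gensim.utils.simple_preprocess("The plots looked surprisingly funny!", deacc=True)
# tokens == ['the', 'plots', 'looked', 'surprisingly', 'funny']

# lemmatize, keeping only the requested parts of speech
doc = nlp(" ".join(tokens))
lemmas = [t.lemma_ for t in doc if t.pos_ in {"NOUN", "ADJ"}]
print(lemmas)  # something like ['plot', 'funny'], depending on the model version
```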
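The topic-keyword step ranks each row of `components_` (one vector of unnormalized term weights per topic) and keeps the highest-weighted vocabulary entries, which is what `show_topics` does above. The same idea in a self-contained sketch on a dummy corpus (data and hyperparameters are purely illustrative):

```python
import numpy as np
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

corpus = [
    "the cat sat on the mat", "dogs and cats are pets",
    "stocks fell as markets opened", "investors bought more stocks",
]
vectorizer = CountVectorizer(stop_words="english")
X = vectorizer.fit_transform(corpus)

lda = LatentDirichletAllocation(n_components=2, random_state=0).fit(X)
print("log likelihood:", lda.score(X))   # higher is better
print("perplexity:", lda.perplexity(X))  # lower is better

vocab = vectorizer.get_feature_names_out()
for i, weights in enumerate(lda.components_):
    top = vocab[np.argsort(weights)[::-1][:3]]  # three highest-weighted terms
    print(f"Topic {i}:", ", ".join(top))
```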
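`predict_topic` reuses the fitted `vectorizer` and LDA model on new text: transform, then `argmax` over the returned document-topic distribution. A self-contained sketch of that inference path (toy corpus again, illustrative only):

```python
import numpy as np
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

corpus = ["cats chase mice", "dogs chase cats", "markets fell", "stocks fell hard"]
vectorizer = CountVectorizer()
lda = LatentDirichletAllocation(n_components=2, random_state=0)
lda.fit(vectorizer.fit_transform(corpus))

# score a new document with the *fitted* vectorizer and model
doc_topic = lda.transform(vectorizer.transform(["stocks and markets"]))
print(np.round(doc_topic, 2))                  # shape (1, 2); each row sums to 1
print("dominant topic:", int(np.argmax(doc_topic)))
```

Note that `df['comment'].apply(apply_predict_topic)` runs the full spaCy pipeline once per comment; batching documents through `nlp.pipe` would be the usual speedup.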
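One wiring detail: `main` now takes two inputs, but the unchanged `demo.load(main, inputs=[button], outputs=[plot])` line still passes only `button`, and nothing listens for changes on the context radio. A sketch of wiring that keeps both controls live (it assumes the `main` from this commit; the exact event API can vary across Gradio versions):

```python
import gradio as gr

with gr.Blocks() as demo:
    button = gr.Radio(
        label="Plot type",
        choices=['scatter_plot', 'heatmap', 'us_map', 'interactive_barplot', "radial", "multiline"],
        value='scatter_plot',
    )
    choose_context = gr.Radio(
        label="Context LDA",
        choices=['comment', 'sup comment', 'sup comment + comment'],
        value='sup comment',
    )
    plot = gr.Plot(label="Plot")

    # re-render when either control changes, and once on page load
    button.change(main, inputs=[button, choose_context], outputs=[plot])
    choose_context.change(main, inputs=[button, choose_context], outputs=[plot])
    demo.load(main, inputs=[button, choose_context], outputs=[plot])

demo.launch()
```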
 
requirements.txt CHANGED
@@ -1,4 +1,4 @@
 nltk
 spacy
 gensim
-sklearn
+scikit-learn
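The dependency swap matters at install time: `sklearn` on PyPI is a long-deprecated placeholder package (recent pip versions refuse to install it), while `scikit-learn` is the actual distribution behind the `sklearn` imports used in app.py.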