topic discovery added
- app.py +129 -28
- learn_multi_doc_model.py +352 -0
- models/model_0.0001_100.pkl +3 -0
- requirements.txt +3 -0
- topic_discovery/.DS_Store +0 -0
- topic_discovery/cvect_25000_ar.pkl +3 -0
- topic_discovery/cvect_25000_bn.pkl +3 -0
- topic_discovery/cvect_25000_de.pkl +3 -0
- topic_discovery/cvect_25000_el.pkl +3 -0
- topic_discovery/cvect_25000_en.pkl +3 -0
- topic_discovery/cvect_25000_es.pkl +3 -0
- topic_discovery/cvect_25000_fr.pkl +3 -0
- topic_discovery/cvect_25000_it.pkl +3 -0
- topic_discovery/cvect_25000_jp.pkl +3 -0
- topic_discovery/cvect_25000_mg.pkl +3 -0
- topic_discovery/cvect_25000_mk.pkl +3 -0
- topic_discovery/cvect_25000_nl.pkl +3 -0
- topic_discovery/cvect_25000_pl.pkl +3 -0
- topic_discovery/cvect_25000_pt.pkl +3 -0
- topic_discovery/cvect_25000_ru.pkl +3 -0
- topic_discovery/cvect_25000_zhs.pkl +3 -0
- topic_discovery/cvect_25000_zht.pkl +3 -0
- topic_discovery/cvects.key +17 -0
app.py
CHANGED
@@ -1,21 +1,33 @@
 import gradio as gr
 import numpy as np
+import pandas as pd
 import pickle
+import sklearn
+import plotly.express as px
 from sentence_transformers import SentenceTransformer
+from sklearn.cluster import MiniBatchKMeans
+from learn_multi_doc_model import Model


-#css_code='body {background-image:url("https://picsum.photos/seed/picsum/200/300");} div.gradio-container {background: white;}'
+#css_code='body {background-image:url("https://picsum.photos/seed/picsum/200/300");} div.gradio-container {background: white;}, button#component-8{background-color: rgb(158,202,225);}'
+css_code='button#component-8{background-color: rgb(158,202,225);}'

+import __main__
+setattr(__main__, "Model", Model)

-categories = ["Censorship","Development","Digital Activism","Disaster","Economics & Business","Education","Environment","Governance","Health","History","Humanitarian Response","International Relations","Law","Media & Journalism","Migration & Immigration","Politics","Protest","Religion","Sport","Travel","War & Conflict","
-
+categories = ["Censorship","Development","Digital Activism","Disaster","Economics & Business","Education","Environment","Governance","Health","History","Humanitarian Response","International Relations","Law","Media & Journalism","Migration & Immigration","Politics","Protest","Religion","Sport","Travel","War & Conflict","Technology + Science","Women & Gender + LGBTQ + Youth","Freedom of Speech + Human Rights","Literature + Arts & Culture"]
+input_cvect_key_file = 'topic_discovery/cvects.key'
+model_labse = SentenceTransformer('sentence-transformers/LaBSE')
 with open('models/MLP_classifier_average_en.pkl', 'rb') as f:
     classifier = pickle.load(f)
+mul_model = None
+with open('models/model_0.0001_100.pkl', 'rb') as f:
+    mul_model = pickle.load(f)

 def get_embedding(text):
     if text is None:
         text = ""
-    return
+    return model_labse.encode(text)

 def get_categories(y_pred):
     indices = []
@@ -25,6 +37,53 @@ def get_categories(y_pred):
     cats = [categories[i] for i in indices]
     return cats

+def get_words(doc_emb):
+    # load countvectorizers
+    cvects = {}
+    vocab = {}  # load vocabulary of words for each lang
+    with open(input_cvect_key_file, "r") as fpr:
+        for line in fpr:
+            #print(line)
+            lang, fpath = line.strip().split()
+            with open(fpath, "rb") as fpr:
+                #print(f"loading {fpath}")
+                cvects[lang] = pickle.load(fpr)
+                vocab[lang] = cvects[lang].get_feature_names()
+
+            #print(
+            #    "Loaded CountVectorizer for lang",
+            #    lang,
+            #    "with vocab size:",
+            #    len(vocab[lang]),
+            #)
+
+    topn = 10  # top N words per cluster
+
+    #print(vocab["en"])
+    #print("MODEL KEYS")
+    #print(mul_model.E.keys())
+
+    doc_emb = doc_emb.flatten()
+
+    words_dict = {}
+
+    for lang in mul_model.E.keys():
+
+        #print(lang, end=": ")
+
+        scores = mul_model.E[lang] @ (doc_emb).T
+        k_ixs = np.argsort(scores)[::-1][:topn].squeeze()  # sort them in descending order and pick topn
+        tmp = []
+        for i in k_ixs:
+            #print(vocab[lang][i], end=", ")
+            tmp.append(vocab[lang][i])
+
+        words_dict[lang] = tmp
+        #print()
+
+    return words_dict
+
+
 def generate_output(article):
     paragraphs = article.split("\n")
     embdds = []
@@ -33,32 +92,74 @@ def generate_output(article):
     embedding = np.average(embdds, axis=0)

     #y_pred = classifier.predict_proba(embedding.reshape(1, 768))
-
-    y_pred =
+    reshaped = embedding.reshape(1, 768)
+    #y_pred = classifier.predict(reshaped)
+    #y_pred = y_pred.flatten()
+
+    y_prob = classifier.predict_proba(reshaped)
+    y_prob = y_prob.reshape(len(categories),1)
+
+    y_pred = [1 if x >= 0.5 else 0 for x in y_prob]
+
     classes = get_categories(y_pred)
+    if len(classes) > 1:
+        classes_string = ', '.join(classes)
+    elif len(classes) == 1:
+        classes_string = classes[0]
+    else:
+        classes_string = 'No category was found.'
+
+
+
+    data = pd.DataFrame()
+    data['Category'] = categories
+    data['Probability'] = y_prob
+    fig = px.bar(data, x='Probability', y='Category', orientation='h', height=600)#, title="Category probability")
+    fig.update_xaxes(range=[0, 1])
+    fig.update_layout(margin=dict(l=5, r=5, t=20, b=5)) #paper_bgcolor="LightSteelBlue")
+    fig.update_traces(marker_color='rgb(158,202,225)')
+
+    #print(f"LEN Y_PROB {len(y_prob)}")
+    #print(f"LEN CAT {len(categories)}")

-
-
-
-
-
-
-
-
-
-
-#
-#
-#
-#
-
-
-demo = gr.Interface(fn=generate_output,
-                    inputs=gr.Textbox(lines=6, placeholder="Insert text of the article here...", label="Article"),
-                    outputs=[gr.Textbox(lines=1, label="Category"), gr.Textbox(lines=5, label="Topic discovery")],
-                    title="Article classification & topic discovery demo",
-                    flagging_options=["Incorrect"],
-                    theme=gr.themes.Base())
+    words_dict = get_words(reshaped)
+    words_string = ""
+
+    for lang, w in words_dict.items():
+        words_string += f"{lang}: "
+        words_string += ', '.join(w)
+        words_string += "\n"
+
+    return (classes_string, fig, words_string)
+
+# demo = gr.Interface(fn=generate_output,
+#                     inputs=gr.Textbox(lines=6, placeholder="Insert text of the article here...", label="Article"),
+#                     outputs=[gr.Textbox(lines=1, label="Category"), gr.Plot(label="Category probability"), gr.Textbox(lines=5, label="Topic discovery")],
+#                     title="Article classification & topic discovery demo",
+#                     flagging_options=["Incorrect"],
+#                     theme=gr.themes.Base())
 #css=css_code)

+demo = gr.Blocks(css=css_code, theme=gr.themes.Base(), title="Article classification & topic discovery demo")
+
+with demo:
+    with gr.Row():
+        my_title = gr.HTML("<h1 align='center'>Article classification & topic discovery demo</h1>")
+    with gr.Row():
+        with gr.Column():
+            input_text = gr.Textbox(lines=22, placeholder="Insert text of the article here...", label="Article")
+            with gr.Row():
+                clear_button = gr.Button("Clear")
+                submit_button = gr.Button("Submit")
+        with gr.Column():
+            with gr.Tabs():
+                with gr.TabItem("Classification"):
+                    category_text = gr.Textbox(lines=1, label="Category")
+                    category_plot = gr.Plot()
+                with gr.TabItem("Topic discovery"):
+                    topic_text = gr.Textbox(lines=22, label="The most representative words")
+
+    submit_button.click(generate_output, inputs=input_text, outputs=[category_text, category_plot, topic_text])
+    clear_button.click(lambda: None, None, input_text, queue=False)
+
 demo.launch()
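In the updated get_words, topic discovery ranks each language's vocabulary by the dot product between that language's word-embedding matrix mul_model.E[lang] and the averaged LaBSE document embedding, then keeps the ten highest-scoring words. A minimal sketch of that ranking step with toy arrays (shapes and variable names here are illustrative, not taken from the repo):

import numpy as np

# toy stand-ins: 5 vocabulary words with 4-dimensional embeddings, one 4-dim document embedding
E_lang = np.random.randn(5, 4)        # rows correspond to vocabulary entries for one language
doc_emb = np.random.randn(4)          # averaged LaBSE embedding of the article
vocab_lang = ["w0", "w1", "w2", "w3", "w4"]

scores = E_lang @ doc_emb             # one relevance score per vocabulary word
k_ixs = np.argsort(scores)[::-1][:3]  # indices of the top-3 scoring words, descending
print([vocab_lang[i] for i in k_ixs])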
learn_multi_doc_model.py
ADDED
@@ -0,0 +1,352 @@
#!/usr/bin/env python3

import argparse
import os
import numpy as np
import scipy
import pickle
from scipy.special import log_softmax
from time import time
from packaging import version

assert version.parse(scipy.__version__) >= version.parse(
    "1.7.0"
), f"Requires scipy >= 1.7.0. Found {scipy.__version__}"


class Model:
    """Model definition, parameters and helper functions to compute log-likelihood"""

    def __init__(self, vocab: dict, emb_dim: int):
        """Initialize our model

        Args:
            vocab: vocab size for each language {'en': 25000, 'de': 25000}
            emb_dim: embedding dimension, will be same across languages
        """

        self.L = len(vocab)
        self.vocab = vocab
        self.emb_dim = emb_dim

        # word embeddings matrix / subspace for each language
        self.E = {}

        # bias vector for each language
        self.b = {}

        n1 = 1.0 / np.sqrt(emb_dim)

        # initialize word embeddings and bias vectors randomly
        for lang, vocab_size in vocab.items():
            n2 = 1.0 / np.sqrt(vocab_size)
            self.E[lang] = np.random.uniform(-n2, n1, size=(vocab_size, emb_dim))
            self.b[lang] = np.random.randn(vocab_size, 1) * 0.0001

    def init_bias_with_log_unigram_dist(self, X, lang):
        """We will initialize the bias vector with log of unigram distribution over vocabulary.
        This should help us with better initialization.

        b = \log (\sum_d x_d) / (\sum_d \sum_i x_{di})
        """

        # if X is a sparse matrix, X.A gives the dense version of it in numpy array format
        if isinstance(X, np.ndarray):
            X = X + 1e-08  # to avoid zeros
        else:
            X = X.A + 1e-08  # to avoid any zeros

        self.b[lang][:, 0] = np.log(
            X.sum(axis=0) / X.sum()
        )  # we would like b to be of size (W, 1)

    def compute_log_thetas(self, lang: str, DE_lang: np.ndarray, sanity_check=False):
        """Compute log of thetas, where theta_d is the unigram distribution over document `d`
        estimated from the current params (word-embedding matrix, bias vector) and document embedding a_d.

        Args:
        ----
            lang (str): Language ID (eg: en, de, es ...)
            DE_lang (np.ndarray): Document embeddings of language
        """

        mat = self.b[lang] + (self.E[lang] @ DE_lang)  # shape is vocab_size x n_docs
        mat = mat.T  # shape is D x W

        # log_norm = logsumexp(mat, axis=1)
        # log_thetas = mat - log_norm

        # the following single step is the same as the two above steps combined
        log_thetas = log_softmax(mat, axis=1)  # shape is n_docs x vocab_size

        if sanity_check:
            n_docs = DE_lang.shape[0]
            # sanity-check:
            # since each document is a proper distribution, it should sum up to 1,
            # so the sum of the matrix should be equal to the number of documents
            print(
                "Sanity check for log-thetas:",
                np.allclose(np.exp(log_thetas).sum(), n_docs),
            )

        return log_thetas

    def compute_log_likelihood(self, lang, DE_lang, X):
        """Compute log-likelihood of the data, given the current parameters / embeddings

        Each summation could be implemented using a for-loop but that would be very slow;
        since we have everything stored in matrices and a sparse matrix, we will do it via
        matrix multiplications and additions.

        Args:
            lang: language ID (eg: en, es, fr)
            DE_lang: document embeddings for the given language
            X: doc-by-word counts in scipy.sparse format for a specific language

        Returns:
            float: log-likelihood of the data
        """

        log_thetas = self.compute_log_thetas(lang, DE_lang)

        # log-likelihood is the product of counts with the respective log-probability values.
        if isinstance(X, np.ndarray):
            llh = (X * log_thetas).sum()
        else:
            # X is a scipy sparse matrix
            llh = (X.multiply(log_thetas)).sum()

        return llh


def gradients_WE(model, lang, DE_lang, X, alpha):
    """Gradient of the log-likelihood with respect to the language-specific word embedding matrix `E`

    Args:
        model (Model): The object of the model
        lang (str): Language ID
        DE_lang: document embeddings for the given language
        X (scipy.sparse_matrix): The doc-by-word counts
        alpha (float): L2 reg. weight

    Returns:
        np.ndarray: Gradient of log-likelihood w.r.t word embeddings, i.e, grad of llh w.r.t model.E
    """

    # grads = np.zeros_like(model.E)  # initialize empty gradients to be the same shape as word embeddings (W, K)

    # compute log_thetas as they are needed in the gradient
    log_thetas = model.compute_log_thetas(lang, DE_lang)

    # the gradient computation can be done using for-loops to reflect the equation,
    # or it can be done efficiently using matrix multiplications

    # 1. simple way using a for-loop
    # iterate over all documents
    # for d in range(model.D):

    #     iterate over every word
    #     for k in range(model.W):
    #         x_dk = X[d, k]  # count of word k in doc d
    #         rel_x_dk = X[d, :].sum() * np.exp(log_thetas)[d, k]  # relative / estimated count of word k in doc d
    #         grads[k, :] += ((x_dk - rel_x_dk) * model.A[:, d])  # doc embeddings are column-wise in model.A

    # 2. Efficient way of obtaining gradients using matrix operations

    ef_grads = np.zeros_like(model.E)

    tmp = (
        X - np.multiply(X.sum(axis=1).reshape(-1, 1), np.exp(log_thetas))
    ).A  # .A will convert matrix to np ndarray
    ef_grads = (DE_lang @ tmp).T - (alpha * 0.5 * model.E[lang]).sum()

    # Sanity check to see if gradients computed in both ways are numerically identical
    # print('- All close grad_E:', np.allclose(ef_grads, grads))

    return ef_grads


def update_parameters(params, gradient, learning_rate):
    """Update the parameters

    Args:
        params (np.ndarray): Word embedding matrix or the document embedding matrix
        gradient (np.ndarray): Gradients of all word embeddings or document embeddings. Should be the same size as params
        learning_rate (float): The learning_rate can also be seen as step size, i.e, the size of the step to be taken
            along the direction of the gradient. Too big steps can overshoot our estimate, whereas too small steps
            can take longer for the model to reach the optimum.

    Returns:
        np.ndarray: the updated params
    """

    assert (
        params.shape == gradient.shape
    ), "The params and gradient must have the same shape, \
({:d}, {:d}) != ({:d} {:d})".format(
        *params.shape, *gradient.shape
    )

    new_params = params + (
        learning_rate * gradient
    )  # since we are doing gradient ascent
    return new_params


def train(model, bow, DE, args):
    """Training scheme for the model"""

    print("\nTraining started ..")
    learning_rate = args.lr
    llh_0 = 0.0
    for lang, X in bow.items():
        llh_0 += model.compute_log_likelihood(lang, DE[lang].T, X)
    print(" Initial log-likelihood: {:16.2f}".format(llh_0))

    llhs = [llh_0]

    for i in range(1, args.epochs + 1):

        llh_ei = 0.0
        for lang, X in bow.items():

            # update word embeddings E for lang, keeping doc-embeddings A fixed
            grad_E = gradients_WE(model, lang, DE[lang].T, X, args.alpha)

            model.E[lang] = update_parameters(model.E[lang], grad_E, learning_rate)

            llh_ei += model.compute_log_likelihood(lang, DE[lang].T, X)

        print(
            "Epoch {:4d} / {:4d} | Log-likelihood: {:16.2f} | Learning rate: {:f}".format(
                i, args.epochs, llh_ei, learning_rate
            )
        )

        if llh_ei < llhs[-1]:
            print(
                "The log-likelihood should improve after every epoch.",
                "Instead it decreased, which means the updates have overshot.",
                "Halving the learning_rate.",
            )
            learning_rate = learning_rate * 0.5

        llhs.append(llh_ei)

        # learning_rate scheduler
        # we reduce the learning_rate by 10 % after every 10 epochs
        # if i % 10 == 0:
        #     print("Reducing the learning rate by a factor of 0.1 every 10 epochs")
        #     learning_rate -= learning_rate * 0.1
        if i % 100 == 0:
            with open(
                os.path.join(args.out_dir, f"model_{args.alpha}_{i}.pkl"), "wb"
            ) as fpw:
                pickle.dump(model, fpw)
            np.savetxt(
                os.path.join(args.out_dir, f"llh_{args.alpha}_{args.epochs}.txt"),
                np.asarray(llhs),
            )

    return model, llhs


def main():
    """main"""

    args = parse_arguments()

    os.makedirs(args.out_dir, exist_ok=True)

    emb_dim = 0
    # load doc embeddings for each language
    doc_embs = {}  # {lang_1: np.ndarray, lang_2: np.ndarray, ...}
    with open(args.input_embedding_key_file, "r") as fpr:
        for line in fpr:
            lang, fpath = line.strip().split()
            doc_embs[lang] = np.load(fpath)
            print("Loaded embeddings:", lang, doc_embs[lang].shape)

            if emb_dim == 0:
                emb_dim = doc_embs[lang].shape[1]

    # load bag of words for each language
    bows = {}  # {lang_1: scipy.sparse, lang_2: scipy.sparse, ...}
    vocab = {}  # {lang_1: vocab_size}
    with open(args.input_bag_of_words_key_file, "r") as fpr:
        for line in fpr:
            lang, fpath = line.strip().split()
            bows[lang] = scipy.sparse.load_npz(fpath)
            print("Loaded bag-of-words:", lang, bows[lang].shape)

            vocab[lang] = bows[lang].shape[1]

            # assert the number of docs per language is the same in embeddings and bag-of-words
            assert (
                bows[lang].shape[0] == doc_embs[lang].shape[0]
            ), "Number of docs in BoW ({:d}) != number of docs in embeddings ({:d}) for language: {:s}".format(
                bows[lang].shape[0], doc_embs[lang].shape[0], lang
            )

    model = Model(vocab, emb_dim)
    for lang, bow in bows.items():
        model.init_bias_with_log_unigram_dist(bow, lang)

    print("Model params:")
    for lang in model.vocab:
        print(" ", lang, model.E[lang].shape, model.b[lang].shape)

    if args.resume:
        with open(args.resume, "rb") as fpr:
            model = pickle.load(fpr)

    # start the training
    model, llhs = train(model, bows, doc_embs, args)

    with open(
        os.path.join(args.out_dir, f"model_{args.alpha}_{args.epochs}.pkl"), "wb"
    ) as fpw:
        pickle.dump(model, fpw)

    np.savetxt(
        os.path.join(args.out_dir, f"llh_{args.alpha}_{args.epochs}.txt"),
        np.asarray(llhs),
    )

    print("Saved in", args.out_dir)


def parse_arguments():

    parser = argparse.ArgumentParser(
        description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument(
        "input_embedding_key_file",
        help="path to file that has paths to embeddings for each language",
    )

    parser.add_argument(
        "input_bag_of_words_key_file", help="path to input bag of words dictionary file"
    )

    parser.add_argument("out_dir", help="out dir to save the model/word embeddings")

    parser.add_argument("--epochs", type=int, default=100, help="number of epochs")
    parser.add_argument("--lr", type=float, default=0.0001, help="learning rate")
    parser.add_argument(
        "--alpha", type=float, default=1e-4, help="L2 reg. weight / weight decay"
    )

    parser.add_argument(
        "--resume", default="", help="path to trained model to resume training"
    )

    args = parser.parse_args()

    return args


if __name__ == "__main__":
    main()
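Going by the argparse setup above, the trainer takes two key files, each line mapping a language code to a data file (the same "lang path" layout as topic_discovery/cvects.key): one pointing to per-language document-embedding .npy arrays and one to per-language bag-of-words .npz sparse matrices, plus an output directory; --epochs, --lr, --alpha and --resume are optional. A hypothetical invocation, with placeholder key-file and data paths, that would produce a checkpoint named like the shipped models/model_0.0001_100.pkl (alpha 0.0001, 100 epochs):

python learn_multi_doc_model.py doc_embs.key bows.key models/ --epochs 100 --lr 0.0001 --alpha 0.0001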
models/model_0.0001_100.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d48ed6671bf0990a14476301a7845362092852c8e6bb624271f3943252e954c1
size 2166342600
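This Git LFS pointer tracks the pickled Model instance produced by learn_multi_doc_model.py. Because app.py aliases Model onto __main__ before unpickling, the object was presumably pickled from a context where the class lived in __main__, so any standalone loader needs the same workaround. A minimal loading sketch, assuming the LFS object has been fetched locally:

import pickle
import __main__
from learn_multi_doc_model import Model

# pickle looks the class up by module name, so expose Model where the pickler recorded it
setattr(__main__, "Model", Model)

with open("models/model_0.0001_100.pkl", "rb") as f:
    mul_model = pickle.load(f)

# per-language word-embedding matrices and the shared embedding dimension
print(mul_model.emb_dim, sorted(mul_model.E.keys()))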
requirements.txt
CHANGED
@@ -1,2 +1,5 @@
 numpy==1.24.2
 sentence-transformers==2.2.2
+pandas==1.5.2
+plotly
+scikit-learn==0.24.2
topic_discovery/.DS_Store
ADDED
Binary file (8.2 kB)
topic_discovery/cvect_25000_ar.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b37e9e016646662718993e2368f9e88c4c21141f8944f23449f27c6d59e03221
size 3047285
topic_discovery/cvect_25000_bn.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:05b3adf720d522a38762fda2bb6da2c948389a437b2138004698d326181d971d
size 157149
topic_discovery/cvect_25000_de.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e551d8934e6a8e23c841437805bbed1b0e17eb2f3ab3e260b9104c1e30f452ad
size 2037400
topic_discovery/cvect_25000_el.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d5419f509f5666ae55a7f5cdfb1cf7ea41f3fa102ec639c19c4aeea8b2dffe32
size 3681045
topic_discovery/cvect_25000_en.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:cb0ee36e4ef6738d408e30132c5d970be2e05728c305fccce06dc67b3941bea2
size 4143980
topic_discovery/cvect_25000_es.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d28eb842e6f4717a791de9c8c61014131dbea8d26f84f90c62cd54b05595a1c9
size 4235561
topic_discovery/cvect_25000_fr.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:74ff26b2269c2033f78ecb1e5870c449423d42d668975e5e98e899b6d2489f64
size 2967490
topic_discovery/cvect_25000_it.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4e8892d88fd88e0d9e121e57e1b77810e47d34909944b2e65e2094d426f17daa
size 2477565
topic_discovery/cvect_25000_jp.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2c075e83209a4a23afe290aef6a301717f4eadfd118a278114ea142fdf882c20
size 3082086
topic_discovery/cvect_25000_mg.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:958dd98498097b8463b1fbc6f068b512650d40397b9e53659dc2238032126181
size 3643714
topic_discovery/cvect_25000_mk.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6758e48f3626b7c91b7359097d27aedb6beaeb36c6a6632901c3fae3f6da5ea3
size 2152452
topic_discovery/cvect_25000_nl.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5f81d4942757d07cde33715cd00fe150c377b19070f57cc992230b8c6eeacb06
size 1466263
topic_discovery/cvect_25000_pl.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ad1d1d8853aa424ba47c81d52ab6fdd708d1a440901652d680482d092a88a44a
size 2063425
topic_discovery/cvect_25000_pt.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:baef6e3fe017ed4feb3ac2e08701b77b4425ade9f39d700ab3d1b4a2d89059d6
size 2001188
topic_discovery/cvect_25000_ru.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:89bfa381364b0df772b0a181df8740bf597733e328410c464e6690d58e8e212f
size 5482015
topic_discovery/cvect_25000_zhs.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1369c082d071340da56006eef8ffc380625c39fef4a7034b7d1e2927b1f54717
size 9390903
topic_discovery/cvect_25000_zht.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:030a1c4b66cfecf4645de14f77d90d56886e8927225581c94e45a93006c0c633
size 9965443
topic_discovery/cvects.key
ADDED
@@ -0,0 +1,17 @@
en topic_discovery/cvect_25000_en.pkl
es topic_discovery/cvect_25000_es.pkl
fr topic_discovery/cvect_25000_fr.pkl
mg topic_discovery/cvect_25000_mg.pkl
it topic_discovery/cvect_25000_it.pkl
el topic_discovery/cvect_25000_el.pkl
zhs topic_discovery/cvect_25000_zhs.pkl
zht topic_discovery/cvect_25000_zht.pkl
bn topic_discovery/cvect_25000_bn.pkl
ru topic_discovery/cvect_25000_ru.pkl
pt topic_discovery/cvect_25000_pt.pkl
ar topic_discovery/cvect_25000_ar.pkl
de topic_discovery/cvect_25000_de.pkl
jp topic_discovery/cvect_25000_jp.pkl
mk topic_discovery/cvect_25000_mk.pkl
pl topic_discovery/cvect_25000_pl.pkl
nl topic_discovery/cvect_25000_nl.pkl