Spaces:
Running
Running
Sean-Case
committed on
Commit
·
ff32b4a
1
Parent(s):
82b1ab1
Fixed llm_config, environment variable, and zero-shot topic model errors with quick embeddings
Browse files- README.md +1 -1
- app.py +52 -58
- funcs/embeddings.py +4 -16
- funcs/representation_model.py +9 -4
README.md
CHANGED
@@ -6,7 +6,7 @@ colorTo: yellow
|
|
6 |
sdk: gradio
|
7 |
sdk_version: 3.50.0
|
8 |
app_file: app.py
|
9 |
-
pinned:
|
10 |
license: apache-2.0
|
11 |
---
|
12 |
|
|
|
6 |
sdk: gradio
|
7 |
sdk_version: 3.50.0
|
8 |
app_file: app.py
|
9 |
+
pinned: true
|
10 |
license: apache-2.0
|
11 |
---
|
12 |
|
app.py
CHANGED
@@ -10,9 +10,12 @@ from sklearn.pipeline import make_pipeline
|
|
10 |
from sklearn.decomposition import TruncatedSVD
|
11 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
12 |
import funcs.anonymiser as anon
|
|
|
13 |
|
14 |
from torch import cuda, backends, version
|
15 |
|
|
|
|
|
16 |
# Check for torch cuda
|
17 |
print("Is CUDA enabled? ", cuda.is_available())
|
18 |
print("Is a CUDA device available on this computer?", backends.cudnn.enabled)
|
@@ -71,9 +74,14 @@ elif low_resource_mode == "Yes":
|
|
71 |
TruncatedSVD(2) # 100 # set to 2 to be compatible with zero shot topics - can't be higher than number of topics
|
72 |
)
|
73 |
|
|
|
|
|
|
|
|
|
74 |
|
75 |
def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embeddings_super_compress, low_resource_mode, create_llm_topic_labels):
|
76 |
|
|
|
77 |
file_list = [string.name for string in in_file]
|
78 |
|
79 |
data_file_names = [string.lower() for string in file_list if "tokenised" not in string and "npz" not in string.lower() and "gz" not in string.lower()]
|
@@ -90,7 +98,9 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
|
|
90 |
if anonymise_drop == "Yes":
|
91 |
in_files_anon_col, anonymisation_success = anon.anonymise_script(in_files, in_colnames_list_first, anon_strat="replace")
|
92 |
in_files[in_colnames_list_first] = in_files_anon_col[in_colnames_list_first]
|
93 |
-
|
|
|
|
|
94 |
|
95 |
docs = list(in_files[in_colnames_list_first].str.lower())
|
96 |
label_col = in_files[in_label_list_first]
|
@@ -116,49 +126,34 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
|
|
116 |
print("Choosing low resource TfIDF model")
|
117 |
embedding_model_pipe = make_pipeline(
|
118 |
TfidfVectorizer(),
|
119 |
-
TruncatedSVD(
|
120 |
)
|
121 |
embedding_model = embedding_model_pipe
|
122 |
|
123 |
embeddings_out, reduced_embeddings = make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embedding_model, return_intermediate_files, embeddings_super_compress, low_resource_mode, create_llm_topic_labels)
|
124 |
|
125 |
-
# all_lengths = [len(embedding) for embedding in embeddings_out]
|
126 |
-
# if len(set(all_lengths)) > 1:
|
127 |
-
# print("Inconsistent lengths found in embeddings_out:", set(all_lengths))
|
128 |
-
# else:
|
129 |
-
# print("All lengths are the same.")
|
130 |
-
|
131 |
-
# print("Embeddings type: ", type(embeddings_out))
|
132 |
-
|
133 |
-
# if isinstance(embeddings_out, np.ndarray):
|
134 |
-
# print("my_object is a NumPy ndarray")
|
135 |
-
# else:
|
136 |
-
# print("my_object is not a NumPy ndarray")
|
137 |
-
|
138 |
-
# Clustering set to K-means (not used)
|
139 |
-
#cluster_model = KMeans(n_clusters=max_topics_slider)
|
140 |
-
|
141 |
-
# Countvectoriser removes stopwords, combines terms up to 2 together:
|
142 |
-
#if min_docs_slider < 3:
|
143 |
-
# min_df_val = min_docs_slider
|
144 |
-
#else:
|
145 |
-
# min_df_val = 3
|
146 |
|
147 |
-
|
148 |
|
149 |
vectoriser_model = CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=0.1)
|
150 |
-
|
151 |
|
152 |
from funcs.prompts import capybara_prompt, capybara_start, open_hermes_prompt, open_hermes_start, stablelm_prompt, stablelm_start
|
153 |
-
from funcs.representation_model import create_representation_model,
|
154 |
|
155 |
print("Create LLM topic labels:", create_llm_topic_labels)
|
156 |
-
representation_model = create_representation_model(create_llm_topic_labels,
|
|
|
|
|
|
|
|
|
|
|
157 |
|
158 |
if not candidate_topics:
|
|
|
|
|
159 |
topic_model = BERTopic( embedding_model=embedding_model_pipe,
|
160 |
-
#hdbscan_model=cluster_model,
|
161 |
vectorizer_model=vectoriser_model,
|
|
|
162 |
min_topic_size= min_docs_slider,
|
163 |
nr_topics = max_topics_slider,
|
164 |
representation_model=representation_model,
|
@@ -167,17 +162,26 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
|
|
167 |
topics_text, probs = topic_model.fit_transform(docs, embeddings_out)
|
168 |
|
169 |
|
170 |
-
# Do this if you have pre-
|
171 |
-
else:
|
|
|
|
|
|
|
|
|
|
|
|
|
172 |
zero_shot_topics = read_file(candidate_topics.name)
|
173 |
-
#print(zero_shot_topics)
|
174 |
zero_shot_topics_lower = list(zero_shot_topics.iloc[:, 0].str.lower())
|
175 |
|
176 |
-
|
|
|
|
|
|
|
|
|
177 |
|
178 |
topic_model = BERTopic( embedding_model=embedding_model_pipe,
|
179 |
-
#hdbscan_model=cluster_model,
|
180 |
vectorizer_model=vectoriser_model,
|
|
|
181 |
min_topic_size = min_docs_slider,
|
182 |
nr_topics = max_topics_slider,
|
183 |
zeroshot_topic_list = zero_shot_topics_lower,
|
@@ -188,7 +192,7 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
|
|
188 |
topics_text, probs = topic_model.fit_transform(docs, embeddings_out)
|
189 |
|
190 |
if not topics_text:
|
191 |
-
return "No topics found
|
192 |
|
193 |
else:
|
194 |
print("Preparing topic model outputs.")
|
@@ -199,8 +203,9 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
|
|
199 |
if topic_dets.shape[0] == 1:
|
200 |
topic_det_output_name = "topic_details_" + data_file_name_no_ext + "_" + today_rev + ".csv"
|
201 |
topic_dets.to_csv(topic_det_output_name)
|
|
|
202 |
|
203 |
-
return "No topics found, original file returned",
|
204 |
|
205 |
# Replace original labels with LLM labels
|
206 |
if "Mistral" in topic_model.get_topic_info().columns:
|
@@ -213,17 +218,16 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
|
|
213 |
|
214 |
topic_det_output_name = "topic_details_" + data_file_name_no_ext + "_" + today_rev + ".csv"
|
215 |
topic_dets.to_csv(topic_det_output_name)
|
|
|
216 |
|
217 |
doc_det_output_name = "doc_details_" + data_file_name_no_ext + "_" + today_rev + ".csv"
|
218 |
doc_dets = topic_model.get_document_info(docs)[["Document", "Topic", "Name", "Representative_document"]] # "Probability",
|
219 |
doc_dets.to_csv(doc_det_output_name)
|
|
|
220 |
|
221 |
topics_text_out_str = str(topic_dets["Name"])
|
222 |
output_text = "Topics: " + topics_text_out_str
|
223 |
|
224 |
-
embedding_file_name = data_file_name_no_ext + '_' + 'embeddings.npz'
|
225 |
-
np.savez_compressed(embedding_file_name, embeddings_out)
|
226 |
-
|
227 |
#if low_resource_mode == "No":
|
228 |
topic_model_save_name_folder = "output_model/" + data_file_name_no_ext + "_topics_" + today_rev# + ".safetensors"
|
229 |
topic_model_save_name_zip = topic_model_save_name_folder + ".zip"
|
@@ -236,19 +240,12 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
|
|
236 |
# Zip file example
|
237 |
|
238 |
zip_folder(topic_model_save_name_folder, topic_model_save_name_zip)
|
|
|
239 |
|
240 |
# Visualise the topics:
|
241 |
topics_vis = topic_model.visualize_documents(label_col, reduced_embeddings=reduced_embeddings, hide_annotations=True, hide_document_hover=False, custom_labels=True)
|
242 |
|
243 |
-
|
244 |
-
|
245 |
-
#elif low_resource_mode == "Yes":
|
246 |
-
# # Visualise the topics:
|
247 |
-
# topics_vis = topic_model.visualize_documents(label_col, reduced_embeddings=reduced_embeddings, hide_annotations=True, hide_document_hover=False, custom_labels=True)
|
248 |
-
|
249 |
-
# return output_text, [doc_det_output_name, topic_det_output_name, embedding_file_name], topics_vis
|
250 |
-
|
251 |
-
return output_text, [doc_det_output_name, topic_det_output_name, embedding_file_name, topic_model_save_name_zip], topics_vis
|
252 |
|
253 |
# , topic_model_save_name
|
254 |
|
@@ -262,13 +259,10 @@ with block:
|
|
262 |
|
263 |
gr.Markdown(
|
264 |
"""
|
265 |
-
#
|
266 |
-
|
267 |
""")
|
268 |
-
|
269 |
-
#with gr.Accordion("I will copy and paste my open text", open = False):
|
270 |
-
# in_text = gr.Textbox(label="Copy and paste your open text here", lines = 5)
|
271 |
-
|
272 |
with gr.Tab("Load files and find topics"):
|
273 |
with gr.Accordion("Load data file", open = True):
|
274 |
in_files = gr.File(label="Input text from file", file_count="multiple")
|
@@ -276,8 +270,8 @@ with block:
|
|
276 |
in_colnames = gr.Dropdown(choices=["Choose a column"], multiselect = True, label="Select column to find topics (first will be chosen if multiple selected).")
|
277 |
in_label = gr.Dropdown(choices=["Choose a column"], multiselect = True, label="Select column to for labelling documents in the output visualisation.")
|
278 |
|
279 |
-
with gr.Accordion("I have my own list of topics
|
280 |
-
candidate_topics = gr.File(label="Input topics from file (csv)")
|
281 |
|
282 |
with gr.Row():
|
283 |
min_docs_slider = gr.Slider(minimum = 2, maximum = 1000, value = 15, step = 1, label = "Minimum number of documents needed to create topic")
|
@@ -292,14 +286,14 @@ with block:
|
|
292 |
|
293 |
plot = gr.Plot(label="Visualise your topics here:")
|
294 |
|
295 |
-
with gr.Tab("
|
296 |
-
with gr.Accordion("
|
297 |
with gr.Row():
|
298 |
anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Anonymise data on file load. Names and other details are replaced with tags e.g. '<person>'.")
|
299 |
return_intermediate_files = gr.Dropdown(label = "Return intermediate processing files from file preparation. Files can be loaded in to save processing time in future.", value="No", choices=["Yes", "No"])
|
300 |
embedding_super_compress = gr.Dropdown(label = "Round embeddings to three dp for smaller files with less accuracy.", value="No", choices=["Yes", "No"])
|
301 |
with gr.Row():
|
302 |
-
low_resource_mode_opt = gr.Dropdown(label = "
|
303 |
create_llm_topic_labels = gr.Dropdown(label = "Create LLM-generated topic labels.", value="No", choices=["Yes", "No"])
|
304 |
|
305 |
# Update column names dropdown when file uploaded
|
|
|
10 |
from sklearn.decomposition import TruncatedSVD
|
11 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
12 |
import funcs.anonymiser as anon
|
13 |
+
from umap import UMAP
|
14 |
|
15 |
from torch import cuda, backends, version
|
16 |
|
17 |
+
random_seed = 42
|
18 |
+
|
19 |
# Check for torch cuda
|
20 |
print("Is CUDA enabled? ", cuda.is_available())
|
21 |
print("Is a CUDA device available on this computer?", backends.cudnn.enabled)
|
|
|
74 |
TruncatedSVD(2) # 100 # set to 2 to be compatible with zero shot topics - can't be higher than number of topics
|
75 |
)
|
76 |
|
77 |
+
# Model used for representing topics
|
78 |
+
hf_model_name = 'TheBloke/phi-2-orange-GGUF' #'NousResearch/Nous-Capybara-7B-V1.9-GGUF' # 'second-state/stablelm-2-zephyr-1.6b-GGUF'
|
79 |
+
hf_model_file = 'phi-2-orange.Q5_K_M.gguf' #'Capybara-7B-V1.9-Q5_K_M.gguf' # 'stablelm-2-zephyr-1_6b-Q5_K_M.gguf'
|
80 |
+
|
81 |
|
82 |
def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embeddings_super_compress, low_resource_mode, create_llm_topic_labels):
|
83 |
|
84 |
+
output_list = []
|
85 |
file_list = [string.name for string in in_file]
|
86 |
|
87 |
data_file_names = [string.lower() for string in file_list if "tokenised" not in string and "npz" not in string.lower() and "gz" not in string.lower()]
|
|
|
98 |
if anonymise_drop == "Yes":
|
99 |
in_files_anon_col, anonymisation_success = anon.anonymise_script(in_files, in_colnames_list_first, anon_strat="replace")
|
100 |
in_files[in_colnames_list_first] = in_files_anon_col[in_colnames_list_first]
|
101 |
+
anonymise_data_name = "anonymised_data.csv"
|
102 |
+
in_files.to_csv(anonymise_data_name)
|
103 |
+
output_list.append(anonymise_data_name)
|
104 |
|
105 |
docs = list(in_files[in_colnames_list_first].str.lower())
|
106 |
label_col = in_files[in_label_list_first]
|
|
|
126 |
print("Choosing low resource TfIDF model")
|
127 |
embedding_model_pipe = make_pipeline(
|
128 |
TfidfVectorizer(),
|
129 |
+
TruncatedSVD(100) # 100 # To be compatible with zero shot, this needs to be lower than number of suggested topics
|
130 |
)
|
131 |
embedding_model = embedding_model_pipe
|
132 |
|
133 |
embeddings_out, reduced_embeddings = make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embedding_model, return_intermediate_files, embeddings_super_compress, low_resource_mode, create_llm_topic_labels)
|
134 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
135 |
|
136 |
+
|
137 |
|
138 |
vectoriser_model = CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=0.1)
|
|
|
139 |
|
140 |
from funcs.prompts import capybara_prompt, capybara_start, open_hermes_prompt, open_hermes_start, stablelm_prompt, stablelm_start
|
141 |
+
from funcs.representation_model import create_representation_model, llm_config, chosen_start_tag
|
142 |
|
143 |
print("Create LLM topic labels:", create_llm_topic_labels)
|
144 |
+
representation_model = create_representation_model(create_llm_topic_labels, llm_config, hf_model_name, hf_model_file, chosen_start_tag)
|
145 |
+
|
146 |
+
|
147 |
+
|
148 |
+
|
149 |
+
|
150 |
|
151 |
if not candidate_topics:
|
152 |
+
umap_model = UMAP(n_neighbors=15, n_components=5, random_state=random_seed)
|
153 |
+
|
154 |
topic_model = BERTopic( embedding_model=embedding_model_pipe,
|
|
|
155 |
vectorizer_model=vectoriser_model,
|
156 |
+
umap_model=umap_model,
|
157 |
min_topic_size= min_docs_slider,
|
158 |
nr_topics = max_topics_slider,
|
159 |
representation_model=representation_model,
|
|
|
162 |
topics_text, probs = topic_model.fit_transform(docs, embeddings_out)
|
163 |
|
164 |
|
165 |
+
# Do this if you have pre-defined topics
|
166 |
+
else:
|
167 |
+
if low_resource_mode == "Yes":
|
168 |
+
error_message = "Zero shot topic modelling currently not compatible with low-resource embeddings. Please change this option to 'No' on the options tab and retry."
|
169 |
+
print(error_message)
|
170 |
+
|
171 |
+
return error_message, output_list, None
|
172 |
+
|
173 |
zero_shot_topics = read_file(candidate_topics.name)
|
|
|
174 |
zero_shot_topics_lower = list(zero_shot_topics.iloc[:, 0].str.lower())
|
175 |
|
176 |
+
if len(zero_shot_topics_lower) < 15:
|
177 |
+
umap_neighbours = len(zero_shot_topics_lower)
|
178 |
+
else: umap_neighbours = 15
|
179 |
+
|
180 |
+
umap_model = UMAP(n_neighbors=umap_neighbours, n_components=5, random_state=random_seed)
|
181 |
|
182 |
topic_model = BERTopic( embedding_model=embedding_model_pipe,
|
|
|
183 |
vectorizer_model=vectoriser_model,
|
184 |
+
umap_model=umap_model,
|
185 |
min_topic_size = min_docs_slider,
|
186 |
nr_topics = max_topics_slider,
|
187 |
zeroshot_topic_list = zero_shot_topics_lower,
|
|
|
192 |
topics_text, probs = topic_model.fit_transform(docs, embeddings_out)
|
193 |
|
194 |
if not topics_text:
|
195 |
+
return "No topics found.", data_file_name, None
|
196 |
|
197 |
else:
|
198 |
print("Preparing topic model outputs.")
|
|
|
203 |
if topic_dets.shape[0] == 1:
|
204 |
topic_det_output_name = "topic_details_" + data_file_name_no_ext + "_" + today_rev + ".csv"
|
205 |
topic_dets.to_csv(topic_det_output_name)
|
206 |
+
output_list.append(topic_det_output_name)
|
207 |
|
208 |
+
return "No topics found, original file returned", output_list, None
|
209 |
|
210 |
# Replace original labels with LLM labels
|
211 |
if "Mistral" in topic_model.get_topic_info().columns:
|
|
|
218 |
|
219 |
topic_det_output_name = "topic_details_" + data_file_name_no_ext + "_" + today_rev + ".csv"
|
220 |
topic_dets.to_csv(topic_det_output_name)
|
221 |
+
output_list.append(topic_det_output_name)
|
222 |
|
223 |
doc_det_output_name = "doc_details_" + data_file_name_no_ext + "_" + today_rev + ".csv"
|
224 |
doc_dets = topic_model.get_document_info(docs)[["Document", "Topic", "Name", "Representative_document"]] # "Probability",
|
225 |
doc_dets.to_csv(doc_det_output_name)
|
226 |
+
output_list.append(doc_det_output_name)
|
227 |
|
228 |
topics_text_out_str = str(topic_dets["Name"])
|
229 |
output_text = "Topics: " + topics_text_out_str
|
230 |
|
|
|
|
|
|
|
231 |
#if low_resource_mode == "No":
|
232 |
topic_model_save_name_folder = "output_model/" + data_file_name_no_ext + "_topics_" + today_rev# + ".safetensors"
|
233 |
topic_model_save_name_zip = topic_model_save_name_folder + ".zip"
|
|
|
240 |
# Zip file example
|
241 |
|
242 |
zip_folder(topic_model_save_name_folder, topic_model_save_name_zip)
|
243 |
+
output_list.append(topic_model_save_name_zip)
|
244 |
|
245 |
# Visualise the topics:
|
246 |
topics_vis = topic_model.visualize_documents(label_col, reduced_embeddings=reduced_embeddings, hide_annotations=True, hide_document_hover=False, custom_labels=True)
|
247 |
|
248 |
+
return output_text, output_list, topics_vis
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
249 |
|
250 |
# , topic_model_save_name
|
251 |
|
|
|
259 |
|
260 |
gr.Markdown(
|
261 |
"""
|
262 |
+
# Topic modeller
|
263 |
+
Generate topics from open text in tabular data. Upload a file (csv, xlsx, or parquet), then specify the columns that you want to use to generate topics and use for labels in the visualisation. If you have an embeddings .npz file of the text made using the 'jina-embeddings-v2-small-en' model, you can load this in at the same time to skip the first modelling step. If you have a pre-defined list of topics, you can upload this as a csv file under 'I have my own list of topics...'. Further configuration options are available under the 'Options' tab.
|
264 |
""")
|
265 |
+
|
|
|
|
|
|
|
266 |
with gr.Tab("Load files and find topics"):
|
267 |
with gr.Accordion("Load data file", open = True):
|
268 |
in_files = gr.File(label="Input text from file", file_count="multiple")
|
|
|
270 |
in_colnames = gr.Dropdown(choices=["Choose a column"], multiselect = True, label="Select column to find topics (first will be chosen if multiple selected).")
|
271 |
in_label = gr.Dropdown(choices=["Choose a column"], multiselect = True, label="Select column to for labelling documents in the output visualisation.")
|
272 |
|
273 |
+
with gr.Accordion("I have my own list of topics (zero shot topic modelling).", open = False):
|
274 |
+
candidate_topics = gr.File(label="Input topics from file (csv). File should have at least one column with a header and topic keywords in cells below. Topics will be taken from the first column of the file. Currently not compatible with low-resource embeddings.")
|
275 |
|
276 |
with gr.Row():
|
277 |
min_docs_slider = gr.Slider(minimum = 2, maximum = 1000, value = 15, step = 1, label = "Minimum number of documents needed to create topic")
|
|
|
286 |
|
287 |
plot = gr.Plot(label="Visualise your topics here:")
|
288 |
|
289 |
+
with gr.Tab("Options"):
|
290 |
+
with gr.Accordion("Data load and processing options", open = True):
|
291 |
with gr.Row():
|
292 |
anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Anonymise data on file load. Names and other details are replaced with tags e.g. '<person>'.")
|
293 |
return_intermediate_files = gr.Dropdown(label = "Return intermediate processing files from file preparation. Files can be loaded in to save processing time in future.", value="No", choices=["Yes", "No"])
|
294 |
embedding_super_compress = gr.Dropdown(label = "Round embeddings to three dp for smaller files with less accuracy.", value="No", choices=["Yes", "No"])
|
295 |
with gr.Row():
|
296 |
+
low_resource_mode_opt = gr.Dropdown(label = "Use low resource embeddings model based on TF-IDF (consider if embedding generation is slow).", value="No", choices=["Yes", "No"])
|
297 |
create_llm_topic_labels = gr.Dropdown(label = "Create LLM-generated topic labels.", value="No", choices=["Yes", "No"])
|
298 |
|
299 |
# Update column names dropdown when file uploaded
|
funcs/embeddings.py
CHANGED
@@ -6,6 +6,8 @@ from sklearn.decomposition import TruncatedSVD
|
|
6 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
7 |
from umap import UMAP
|
8 |
|
|
|
|
|
9 |
if cuda.is_available():
|
10 |
torch_device = "gpu"
|
11 |
else:
|
@@ -23,8 +25,6 @@ def make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embedding_mo
|
|
23 |
if "compress" in embeddings_file_names[0]:
|
24 |
embeddings_out /= 100
|
25 |
|
26 |
-
# print("embeddings loaded: ", embeddings_out)
|
27 |
-
|
28 |
if not embeddings_file_names:
|
29 |
tic = time.perf_counter()
|
30 |
print("Starting to embed documents.")
|
@@ -49,27 +49,15 @@ def make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embedding_mo
|
|
49 |
elif low_resource_mode_opt == "No":
|
50 |
print("Creating dense embeddings based on transformers model")
|
51 |
|
52 |
-
#print("Embedding model is: ", embedding_model)
|
53 |
-
|
54 |
embeddings_out = embedding_model.encode(sentences=docs, max_length=1024, show_progress_bar = True, batch_size = 32) # For Jina # #
|
55 |
|
56 |
-
#import torch
|
57 |
-
#from torch.nn.utils.rnn import pad_sequence
|
58 |
-
|
59 |
-
# Assuming embeddings_out is a list of tensors
|
60 |
-
#embeddings_out = [torch.tensor(embedding) for embedding in embeddings_out]
|
61 |
-
|
62 |
-
# Pad the sequences
|
63 |
-
# Set batch_first=True if you want the batch dimension to be the first dimension
|
64 |
-
#embeddings_out = pad_sequence(embeddings_out, batch_first=True, padding_value=0)
|
65 |
-
|
66 |
-
|
67 |
toc = time.perf_counter()
|
68 |
time_out = f"The embedding took {toc - tic:0.1f} seconds"
|
69 |
print(time_out)
|
70 |
|
71 |
# If you want to save your files for next time
|
72 |
if return_intermediate_files == "Yes":
|
|
|
73 |
if embeddings_super_compress == "No":
|
74 |
semantic_search_file_name = data_file_name_no_ext + '_' + 'embeddings.npz'
|
75 |
np.savez_compressed(semantic_search_file_name, embeddings_out)
|
@@ -81,7 +69,7 @@ def make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embedding_mo
|
|
81 |
|
82 |
# Pre-reduce embeddings for visualisation purposes
|
83 |
if reduce_embeddings == "Yes":
|
84 |
-
reduced_embeddings = UMAP(n_neighbors=15, n_components=2, min_dist=0.0, metric='cosine', random_state=
|
85 |
return embeddings_out, reduced_embeddings
|
86 |
|
87 |
return embeddings_out, None
|
|
|
6 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
7 |
from umap import UMAP
|
8 |
|
9 |
+
random_seed = 42
|
10 |
+
|
11 |
if cuda.is_available():
|
12 |
torch_device = "gpu"
|
13 |
else:
|
|
|
25 |
if "compress" in embeddings_file_names[0]:
|
26 |
embeddings_out /= 100
|
27 |
|
|
|
|
|
28 |
if not embeddings_file_names:
|
29 |
tic = time.perf_counter()
|
30 |
print("Starting to embed documents.")
|
|
|
49 |
elif low_resource_mode_opt == "No":
|
50 |
print("Creating dense embeddings based on transformers model")
|
51 |
|
|
|
|
|
52 |
embeddings_out = embedding_model.encode(sentences=docs, max_length=1024, show_progress_bar = True, batch_size = 32) # For Jina # #
|
53 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
toc = time.perf_counter()
|
55 |
time_out = f"The embedding took {toc - tic:0.1f} seconds"
|
56 |
print(time_out)
|
57 |
|
58 |
# If you want to save your files for next time
|
59 |
if return_intermediate_files == "Yes":
|
60 |
+
print("Saving embeddings to file")
|
61 |
if embeddings_super_compress == "No":
|
62 |
semantic_search_file_name = data_file_name_no_ext + '_' + 'embeddings.npz'
|
63 |
np.savez_compressed(semantic_search_file_name, embeddings_out)
|
|
|
69 |
|
70 |
# Pre-reduce embeddings for visualisation purposes
|
71 |
if reduce_embeddings == "Yes":
|
72 |
+
reduced_embeddings = UMAP(n_neighbors=15, n_components=2, min_dist=0.0, metric='cosine', random_state=random_seed).fit_transform(embeddings_out)
|
73 |
return embeddings_out, reduced_embeddings
|
74 |
|
75 |
return embeddings_out, None
|
funcs/representation_model.py
CHANGED
@@ -10,8 +10,7 @@ from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, T
|
|
10 |
from funcs.prompts import capybara_prompt, capybara_start, open_hermes_prompt, open_hermes_start, stablelm_prompt, stablelm_start
|
11 |
|
12 |
|
13 |
-
|
14 |
-
hf_model_file = 'phi-2-orange.Q5_K_M.gguf' #'Capybara-7B-V1.9-Q5_K_M.gguf' # 'stablelm-2-zephyr-1_6b-Q5_K_M.gguf'
|
15 |
chosen_prompt = open_hermes_prompt # stablelm_prompt
|
16 |
chosen_start_tag = open_hermes_start # stablelm_start
|
17 |
|
@@ -46,7 +45,7 @@ def find_model_file(hf_model_name, hf_model_file, search_folder):
|
|
46 |
found_file = find_file(folder_path, file_to_find)
|
47 |
return found_file
|
48 |
|
49 |
-
|
50 |
|
51 |
# Currently set n_gpu_layers to 0 even with cuda due to persistent bugs in implementation with cuda
|
52 |
if torch.cuda.is_available():
|
@@ -120,10 +119,16 @@ llm_config = LLamacppInitConfigGpu(last_n_tokens_size=last_n_tokens_size,
|
|
120 |
# KeyBERT
|
121 |
keybert = KeyBERTInspired()
|
122 |
|
123 |
-
def create_representation_model(create_llm_topic_labels, llm_config,
|
124 |
|
125 |
if create_llm_topic_labels == "Yes":
|
126 |
# Use llama.cpp to load in model
|
|
|
|
|
|
|
|
|
|
|
|
|
127 |
llm = Llama(model_path=found_file, stop=chosen_start_tag, n_gpu_layers=llm_config.n_gpu_layers, n_ctx=llm_config.n_ctx) #**llm_config.model_dump())#
|
128 |
#print(llm.n_gpu_layers)
|
129 |
llm_model = LlamaCPP(llm, prompt=chosen_prompt)#, **gen_config.model_dump())
|
|
|
10 |
from funcs.prompts import capybara_prompt, capybara_start, open_hermes_prompt, open_hermes_start, stablelm_prompt, stablelm_start
|
11 |
|
12 |
|
13 |
+
|
|
|
14 |
chosen_prompt = open_hermes_prompt # stablelm_prompt
|
15 |
chosen_start_tag = open_hermes_start # stablelm_start
|
16 |
|
|
|
45 |
found_file = find_file(folder_path, file_to_find)
|
46 |
return found_file
|
47 |
|
48 |
+
|
49 |
|
50 |
# Currently set n_gpu_layers to 0 even with cuda due to persistent bugs in implementation with cuda
|
51 |
if torch.cuda.is_available():
|
|
|
119 |
# KeyBERT
|
120 |
keybert = KeyBERTInspired()
|
121 |
|
122 |
+
def create_representation_model(create_llm_topic_labels, llm_config, hf_model_name, hf_model_file, chosen_start_tag):
|
123 |
|
124 |
if create_llm_topic_labels == "Yes":
|
125 |
# Use llama.cpp to load in model
|
126 |
+
|
127 |
+
# Check for HF_HOME environment variable and supply a default value if it's not found (current folder)
|
128 |
+
hf_home_value = os.getenv("HF_HOME", '.')
|
129 |
+
|
130 |
+
found_file = find_model_file(hf_model_name, hf_model_file, hf_home_value)
|
131 |
+
|
132 |
llm = Llama(model_path=found_file, stop=chosen_start_tag, n_gpu_layers=llm_config.n_gpu_layers, n_ctx=llm_config.n_ctx) #**llm_config.model_dump())#
|
133 |
#print(llm.n_gpu_layers)
|
134 |
llm_model = LlamaCPP(llm, prompt=chosen_prompt)#, **gen_config.model_dump())
|