Sean-Case committed on
Commit
ff32b4a
·
1 Parent(s): 82b1ab1

Fixed llm_config, environment variable, and zero-shot topic model errors with quick embeddings

Browse files
Files changed (4) hide show
  1. README.md +1 -1
  2. app.py +52 -58
  3. funcs/embeddings.py +4 -16
  4. funcs/representation_model.py +9 -4
README.md CHANGED
@@ -6,7 +6,7 @@ colorTo: yellow
6
  sdk: gradio
7
  sdk_version: 3.50.0
8
  app_file: app.py
9
- pinned: false
10
  license: apache-2.0
11
  ---
12
 
 
6
  sdk: gradio
7
  sdk_version: 3.50.0
8
  app_file: app.py
9
+ pinned: true
10
  license: apache-2.0
11
  ---
12
 
app.py CHANGED
@@ -10,9 +10,12 @@ from sklearn.pipeline import make_pipeline
10
  from sklearn.decomposition import TruncatedSVD
11
  from sklearn.feature_extraction.text import TfidfVectorizer
12
  import funcs.anonymiser as anon
 
13
 
14
  from torch import cuda, backends, version
15
 
 
 
16
  # Check for torch cuda
17
  print("Is CUDA enabled? ", cuda.is_available())
18
  print("Is a CUDA device available on this computer?", backends.cudnn.enabled)
@@ -71,9 +74,14 @@ elif low_resource_mode == "Yes":
71
  TruncatedSVD(2) # 100 # set to 2 to be compatible with zero shot topics - can't be higher than number of topics
72
  )
73
 
 
 
 
 
74
 
75
  def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embeddings_super_compress, low_resource_mode, create_llm_topic_labels):
76
 
 
77
  file_list = [string.name for string in in_file]
78
 
79
  data_file_names = [string.lower() for string in file_list if "tokenised" not in string and "npz" not in string.lower() and "gz" not in string.lower()]
@@ -90,7 +98,9 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
90
  if anonymise_drop == "Yes":
91
  in_files_anon_col, anonymisation_success = anon.anonymise_script(in_files, in_colnames_list_first, anon_strat="replace")
92
  in_files[in_colnames_list_first] = in_files_anon_col[in_colnames_list_first]
93
- in_files.to_csv("anonymised_data.csv")
 
 
94
 
95
  docs = list(in_files[in_colnames_list_first].str.lower())
96
  label_col = in_files[in_label_list_first]
@@ -116,49 +126,34 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
116
  print("Choosing low resource TfIDF model")
117
  embedding_model_pipe = make_pipeline(
118
  TfidfVectorizer(),
119
- TruncatedSVD(2) # 100 # To be compatible with zero shot, this needs to be lower than number of suggested topics
120
  )
121
  embedding_model = embedding_model_pipe
122
 
123
  embeddings_out, reduced_embeddings = make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embedding_model, return_intermediate_files, embeddings_super_compress, low_resource_mode, create_llm_topic_labels)
124
 
125
- # all_lengths = [len(embedding) for embedding in embeddings_out]
126
- # if len(set(all_lengths)) > 1:
127
- # print("Inconsistent lengths found in embeddings_out:", set(all_lengths))
128
- # else:
129
- # print("All lengths are the same.")
130
-
131
- # print("Embeddings type: ", type(embeddings_out))
132
-
133
- # if isinstance(embeddings_out, np.ndarray):
134
- # print("my_object is a NumPy ndarray")
135
- # else:
136
- # print("my_object is not a NumPy ndarray")
137
-
138
- # Clustering set to K-means (not used)
139
- #cluster_model = KMeans(n_clusters=max_topics_slider)
140
-
141
- # Countvectoriser removes stopwords, combines terms up to 2 together:
142
- #if min_docs_slider < 3:
143
- # min_df_val = min_docs_slider
144
- #else:
145
- # min_df_val = 3
146
 
147
- #print(min_df_val)
148
 
149
  vectoriser_model = CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=0.1)
150
-
151
 
152
  from funcs.prompts import capybara_prompt, capybara_start, open_hermes_prompt, open_hermes_start, stablelm_prompt, stablelm_start
153
- from funcs.representation_model import create_representation_model, found_file, gpu_config, chosen_start_tag
154
 
155
  print("Create LLM topic labels:", create_llm_topic_labels)
156
- representation_model = create_representation_model(create_llm_topic_labels, gpu_config, found_file, chosen_start_tag)
 
 
 
 
 
157
 
158
  if not candidate_topics:
 
 
159
  topic_model = BERTopic( embedding_model=embedding_model_pipe,
160
- #hdbscan_model=cluster_model,
161
  vectorizer_model=vectoriser_model,
 
162
  min_topic_size= min_docs_slider,
163
  nr_topics = max_topics_slider,
164
  representation_model=representation_model,
@@ -167,17 +162,26 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
167
  topics_text, probs = topic_model.fit_transform(docs, embeddings_out)
168
 
169
 
170
- # Do this if you have pre-assigned topics
171
- else:
 
 
 
 
 
 
172
  zero_shot_topics = read_file(candidate_topics.name)
173
- #print(zero_shot_topics)
174
  zero_shot_topics_lower = list(zero_shot_topics.iloc[:, 0].str.lower())
175
 
176
- print(zero_shot_topics_lower)
 
 
 
 
177
 
178
  topic_model = BERTopic( embedding_model=embedding_model_pipe,
179
- #hdbscan_model=cluster_model,
180
  vectorizer_model=vectoriser_model,
 
181
  min_topic_size = min_docs_slider,
182
  nr_topics = max_topics_slider,
183
  zeroshot_topic_list = zero_shot_topics_lower,
@@ -188,7 +192,7 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
188
  topics_text, probs = topic_model.fit_transform(docs, embeddings_out)
189
 
190
  if not topics_text:
191
- return "No topics found, original file returned", data_file_name, None
192
 
193
  else:
194
  print("Preparing topic model outputs.")
@@ -199,8 +203,9 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
199
  if topic_dets.shape[0] == 1:
200
  topic_det_output_name = "topic_details_" + data_file_name_no_ext + "_" + today_rev + ".csv"
201
  topic_dets.to_csv(topic_det_output_name)
 
202
 
203
- return "No topics found, original file returned", [data_file_name, topic_det_output_name], None
204
 
205
  # Replace original labels with LLM labels
206
  if "Mistral" in topic_model.get_topic_info().columns:
@@ -213,17 +218,16 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
213
 
214
  topic_det_output_name = "topic_details_" + data_file_name_no_ext + "_" + today_rev + ".csv"
215
  topic_dets.to_csv(topic_det_output_name)
 
216
 
217
  doc_det_output_name = "doc_details_" + data_file_name_no_ext + "_" + today_rev + ".csv"
218
  doc_dets = topic_model.get_document_info(docs)[["Document", "Topic", "Name", "Representative_document"]] # "Probability",
219
  doc_dets.to_csv(doc_det_output_name)
 
220
 
221
  topics_text_out_str = str(topic_dets["Name"])
222
  output_text = "Topics: " + topics_text_out_str
223
 
224
- embedding_file_name = data_file_name_no_ext + '_' + 'embeddings.npz'
225
- np.savez_compressed(embedding_file_name, embeddings_out)
226
-
227
  #if low_resource_mode == "No":
228
  topic_model_save_name_folder = "output_model/" + data_file_name_no_ext + "_topics_" + today_rev# + ".safetensors"
229
  topic_model_save_name_zip = topic_model_save_name_folder + ".zip"
@@ -236,19 +240,12 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
236
  # Zip file example
237
 
238
  zip_folder(topic_model_save_name_folder, topic_model_save_name_zip)
 
239
 
240
  # Visualise the topics:
241
  topics_vis = topic_model.visualize_documents(label_col, reduced_embeddings=reduced_embeddings, hide_annotations=True, hide_document_hover=False, custom_labels=True)
242
 
243
- #return output_text, [doc_det_output_name, topic_det_output_name, embedding_file_name, topic_model_save_name_zip], topics_vis
244
-
245
- #elif low_resource_mode == "Yes":
246
- # # Visualise the topics:
247
- # topics_vis = topic_model.visualize_documents(label_col, reduced_embeddings=reduced_embeddings, hide_annotations=True, hide_document_hover=False, custom_labels=True)
248
-
249
- # return output_text, [doc_det_output_name, topic_det_output_name, embedding_file_name], topics_vis
250
-
251
- return output_text, [doc_det_output_name, topic_det_output_name, embedding_file_name, topic_model_save_name_zip], topics_vis
252
 
253
  # , topic_model_save_name
254
 
@@ -262,13 +259,10 @@ with block:
262
 
263
  gr.Markdown(
264
  """
265
- # Extract topics from text
266
- Enter open text below to get topics. You can copy and paste text directly, or upload a file and specify the column that you want to topics.
267
  """)
268
-
269
- #with gr.Accordion("I will copy and paste my open text", open = False):
270
- # in_text = gr.Textbox(label="Copy and paste your open text here", lines = 5)
271
-
272
  with gr.Tab("Load files and find topics"):
273
  with gr.Accordion("Load data file", open = True):
274
  in_files = gr.File(label="Input text from file", file_count="multiple")
@@ -276,8 +270,8 @@ with block:
276
  in_colnames = gr.Dropdown(choices=["Choose a column"], multiselect = True, label="Select column to find topics (first will be chosen if multiple selected).")
277
  in_label = gr.Dropdown(choices=["Choose a column"], multiselect = True, label="Select column to for labelling documents in the output visualisation.")
278
 
279
- with gr.Accordion("I have my own list of topics. File should have at least one column with a header and topic keywords in cells below. Topics will be taken from the first column of the file", open = False):
280
- candidate_topics = gr.File(label="Input topics from file (csv)")
281
 
282
  with gr.Row():
283
  min_docs_slider = gr.Slider(minimum = 2, maximum = 1000, value = 15, step = 1, label = "Minimum number of documents needed to create topic")
@@ -292,14 +286,14 @@ with block:
292
 
293
  plot = gr.Plot(label="Visualise your topics here:")
294
 
295
- with gr.Tab("Load and data processing options"):
296
- with gr.Accordion("Process data on load", open = True):
297
  with gr.Row():
298
  anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Anonymise data on file load. Names and other details are replaced with tags e.g. '<person>'.")
299
  return_intermediate_files = gr.Dropdown(label = "Return intermediate processing files from file preparation. Files can be loaded in to save processing time in future.", value="No", choices=["Yes", "No"])
300
  embedding_super_compress = gr.Dropdown(label = "Round embeddings to three dp for smaller files with less accuracy.", value="No", choices=["Yes", "No"])
301
  with gr.Row():
302
- low_resource_mode_opt = gr.Dropdown(label = "Low resource mode (non-AI embeddings, no LLM-generated topic names).", value="No", choices=["Yes", "No"])
303
  create_llm_topic_labels = gr.Dropdown(label = "Create LLM-generated topic labels.", value="No", choices=["Yes", "No"])
304
 
305
  # Update column names dropdown when file uploaded
 
10
  from sklearn.decomposition import TruncatedSVD
11
  from sklearn.feature_extraction.text import TfidfVectorizer
12
  import funcs.anonymiser as anon
13
+ from umap import UMAP
14
 
15
  from torch import cuda, backends, version
16
 
17
+ random_seed = 42
18
+
19
  # Check for torch cuda
20
  print("Is CUDA enabled? ", cuda.is_available())
21
  print("Is a CUDA device available on this computer?", backends.cudnn.enabled)
 
74
  TruncatedSVD(2) # 100 # set to 2 to be compatible with zero shot topics - can't be higher than number of topics
75
  )
76
 
77
+ # Model used for representing topics
78
+ hf_model_name = 'TheBloke/phi-2-orange-GGUF' #'NousResearch/Nous-Capybara-7B-V1.9-GGUF' # 'second-state/stablelm-2-zephyr-1.6b-GGUF'
79
+ hf_model_file = 'phi-2-orange.Q5_K_M.gguf' #'Capybara-7B-V1.9-Q5_K_M.gguf' # 'stablelm-2-zephyr-1_6b-Q5_K_M.gguf'
80
+
81
 
82
  def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embeddings_super_compress, low_resource_mode, create_llm_topic_labels):
83
 
84
+ output_list = []
85
  file_list = [string.name for string in in_file]
86
 
87
  data_file_names = [string.lower() for string in file_list if "tokenised" not in string and "npz" not in string.lower() and "gz" not in string.lower()]
 
98
  if anonymise_drop == "Yes":
99
  in_files_anon_col, anonymisation_success = anon.anonymise_script(in_files, in_colnames_list_first, anon_strat="replace")
100
  in_files[in_colnames_list_first] = in_files_anon_col[in_colnames_list_first]
101
+ anonymise_data_name = "anonymised_data.csv"
102
+ in_files.to_csv(anonymise_data_name)
103
+ output_list.append(anonymise_data_name)
104
 
105
  docs = list(in_files[in_colnames_list_first].str.lower())
106
  label_col = in_files[in_label_list_first]
 
126
  print("Choosing low resource TfIDF model")
127
  embedding_model_pipe = make_pipeline(
128
  TfidfVectorizer(),
129
+ TruncatedSVD(100) # 100 # To be compatible with zero shot, this needs to be lower than number of suggested topics
130
  )
131
  embedding_model = embedding_model_pipe
132
 
133
  embeddings_out, reduced_embeddings = make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embedding_model, return_intermediate_files, embeddings_super_compress, low_resource_mode, create_llm_topic_labels)
134
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
 
136
+
137
 
138
  vectoriser_model = CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=0.1)
 
139
 
140
  from funcs.prompts import capybara_prompt, capybara_start, open_hermes_prompt, open_hermes_start, stablelm_prompt, stablelm_start
141
+ from funcs.representation_model import create_representation_model, llm_config, chosen_start_tag
142
 
143
  print("Create LLM topic labels:", create_llm_topic_labels)
144
+ representation_model = create_representation_model(create_llm_topic_labels, llm_config, hf_model_name, hf_model_file, chosen_start_tag)
145
+
146
+
147
+
148
+
149
+
150
 
151
  if not candidate_topics:
152
+ umap_model = UMAP(n_neighbors=15, n_components=5, random_state=random_seed)
153
+
154
  topic_model = BERTopic( embedding_model=embedding_model_pipe,
 
155
  vectorizer_model=vectoriser_model,
156
+ umap_model=umap_model,
157
  min_topic_size= min_docs_slider,
158
  nr_topics = max_topics_slider,
159
  representation_model=representation_model,
 
162
  topics_text, probs = topic_model.fit_transform(docs, embeddings_out)
163
 
164
 
165
+ # Do this if you have pre-defined topics
166
+ else:
167
+ if low_resource_mode == "Yes":
168
+ error_message = "Zero shot topic modelling currently not compatible with low-resource embeddings. Please change this option to 'No' on the options tab and retry."
169
+ print(error_message)
170
+
171
+ return error_message, output_list, None
172
+
173
  zero_shot_topics = read_file(candidate_topics.name)
 
174
  zero_shot_topics_lower = list(zero_shot_topics.iloc[:, 0].str.lower())
175
 
176
+ if len(zero_shot_topics_lower) < 15:
177
+ umap_neighbours = len(zero_shot_topics_lower)
178
+ else: umap_neighbours = 15
179
+
180
+ umap_model = UMAP(n_neighbors=umap_neighbours, n_components=5, random_state=random_seed)
181
 
182
  topic_model = BERTopic( embedding_model=embedding_model_pipe,
 
183
  vectorizer_model=vectoriser_model,
184
+ umap_model=umap_model,
185
  min_topic_size = min_docs_slider,
186
  nr_topics = max_topics_slider,
187
  zeroshot_topic_list = zero_shot_topics_lower,
 
192
  topics_text, probs = topic_model.fit_transform(docs, embeddings_out)
193
 
194
  if not topics_text:
195
+ return "No topics found.", data_file_name, None
196
 
197
  else:
198
  print("Preparing topic model outputs.")
 
203
  if topic_dets.shape[0] == 1:
204
  topic_det_output_name = "topic_details_" + data_file_name_no_ext + "_" + today_rev + ".csv"
205
  topic_dets.to_csv(topic_det_output_name)
206
+ output_list.append(topic_det_output_name)
207
 
208
+ return "No topics found, original file returned", output_list, None
209
 
210
  # Replace original labels with LLM labels
211
  if "Mistral" in topic_model.get_topic_info().columns:
 
218
 
219
  topic_det_output_name = "topic_details_" + data_file_name_no_ext + "_" + today_rev + ".csv"
220
  topic_dets.to_csv(topic_det_output_name)
221
+ output_list.append(topic_det_output_name)
222
 
223
  doc_det_output_name = "doc_details_" + data_file_name_no_ext + "_" + today_rev + ".csv"
224
  doc_dets = topic_model.get_document_info(docs)[["Document", "Topic", "Name", "Representative_document"]] # "Probability",
225
  doc_dets.to_csv(doc_det_output_name)
226
+ output_list.append(doc_det_output_name)
227
 
228
  topics_text_out_str = str(topic_dets["Name"])
229
  output_text = "Topics: " + topics_text_out_str
230
 
 
 
 
231
  #if low_resource_mode == "No":
232
  topic_model_save_name_folder = "output_model/" + data_file_name_no_ext + "_topics_" + today_rev# + ".safetensors"
233
  topic_model_save_name_zip = topic_model_save_name_folder + ".zip"
 
240
  # Zip file example
241
 
242
  zip_folder(topic_model_save_name_folder, topic_model_save_name_zip)
243
+ output_list.append(topic_model_save_name_zip)
244
 
245
  # Visualise the topics:
246
  topics_vis = topic_model.visualize_documents(label_col, reduced_embeddings=reduced_embeddings, hide_annotations=True, hide_document_hover=False, custom_labels=True)
247
 
248
+ return output_text, output_list, topics_vis
 
 
 
 
 
 
 
 
249
 
250
  # , topic_model_save_name
251
 
 
259
 
260
  gr.Markdown(
261
  """
262
+ # Topic modeller
263
+ Generate topics from open text in tabular data. Upload a file (csv, xlsx, or parquet), then specify the columns that you want to use to generate topics and use for labels in the visualisation. If you have an embeddings .npz file of the text made using the 'jina-embeddings-v2-small-en' model, you can load this in at the same time to skip the first modelling step. If you have a pre-defined list of topics, you can upload this as a csv file under 'I have my own list of topics...'. Further configuration options are available under the 'Options' tab.
264
  """)
265
+
 
 
 
266
  with gr.Tab("Load files and find topics"):
267
  with gr.Accordion("Load data file", open = True):
268
  in_files = gr.File(label="Input text from file", file_count="multiple")
 
270
  in_colnames = gr.Dropdown(choices=["Choose a column"], multiselect = True, label="Select column to find topics (first will be chosen if multiple selected).")
271
  in_label = gr.Dropdown(choices=["Choose a column"], multiselect = True, label="Select column to for labelling documents in the output visualisation.")
272
 
273
+ with gr.Accordion("I have my own list of topics (zero shot topic modelling).", open = False):
274
+ candidate_topics = gr.File(label="Input topics from file (csv). File should have at least one column with a header and topic keywords in cells below. Topics will be taken from the first column of the file. Currently not compatible with low-resource embeddings.")
275
 
276
  with gr.Row():
277
  min_docs_slider = gr.Slider(minimum = 2, maximum = 1000, value = 15, step = 1, label = "Minimum number of documents needed to create topic")
 
286
 
287
  plot = gr.Plot(label="Visualise your topics here:")
288
 
289
+ with gr.Tab("Options"):
290
+ with gr.Accordion("Data load and processing options", open = True):
291
  with gr.Row():
292
  anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Anonymise data on file load. Names and other details are replaced with tags e.g. '<person>'.")
293
  return_intermediate_files = gr.Dropdown(label = "Return intermediate processing files from file preparation. Files can be loaded in to save processing time in future.", value="No", choices=["Yes", "No"])
294
  embedding_super_compress = gr.Dropdown(label = "Round embeddings to three dp for smaller files with less accuracy.", value="No", choices=["Yes", "No"])
295
  with gr.Row():
296
+ low_resource_mode_opt = gr.Dropdown(label = "Use low resource embeddings model based on TF-IDF (consider if embedding generation is slow).", value="No", choices=["Yes", "No"])
297
  create_llm_topic_labels = gr.Dropdown(label = "Create LLM-generated topic labels.", value="No", choices=["Yes", "No"])
298
 
299
  # Update column names dropdown when file uploaded
funcs/embeddings.py CHANGED
@@ -6,6 +6,8 @@ from sklearn.decomposition import TruncatedSVD
6
  from sklearn.feature_extraction.text import TfidfVectorizer
7
  from umap import UMAP
8
 
 
 
9
  if cuda.is_available():
10
  torch_device = "gpu"
11
  else:
@@ -23,8 +25,6 @@ def make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embedding_mo
23
  if "compress" in embeddings_file_names[0]:
24
  embeddings_out /= 100
25
 
26
- # print("embeddings loaded: ", embeddings_out)
27
-
28
  if not embeddings_file_names:
29
  tic = time.perf_counter()
30
  print("Starting to embed documents.")
@@ -49,27 +49,15 @@ def make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embedding_mo
49
  elif low_resource_mode_opt == "No":
50
  print("Creating dense embeddings based on transformers model")
51
 
52
- #print("Embedding model is: ", embedding_model)
53
-
54
  embeddings_out = embedding_model.encode(sentences=docs, max_length=1024, show_progress_bar = True, batch_size = 32) # For Jina # #
55
 
56
- #import torch
57
- #from torch.nn.utils.rnn import pad_sequence
58
-
59
- # Assuming embeddings_out is a list of tensors
60
- #embeddings_out = [torch.tensor(embedding) for embedding in embeddings_out]
61
-
62
- # Pad the sequences
63
- # Set batch_first=True if you want the batch dimension to be the first dimension
64
- #embeddings_out = pad_sequence(embeddings_out, batch_first=True, padding_value=0)
65
-
66
-
67
  toc = time.perf_counter()
68
  time_out = f"The embedding took {toc - tic:0.1f} seconds"
69
  print(time_out)
70
 
71
  # If you want to save your files for next time
72
  if return_intermediate_files == "Yes":
 
73
  if embeddings_super_compress == "No":
74
  semantic_search_file_name = data_file_name_no_ext + '_' + 'embeddings.npz'
75
  np.savez_compressed(semantic_search_file_name, embeddings_out)
@@ -81,7 +69,7 @@ def make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embedding_mo
81
 
82
  # Pre-reduce embeddings for visualisation purposes
83
  if reduce_embeddings == "Yes":
84
- reduced_embeddings = UMAP(n_neighbors=15, n_components=2, min_dist=0.0, metric='cosine', random_state=42).fit_transform(embeddings_out)
85
  return embeddings_out, reduced_embeddings
86
 
87
  return embeddings_out, None
 
6
  from sklearn.feature_extraction.text import TfidfVectorizer
7
  from umap import UMAP
8
 
9
+ random_seed = 42
10
+
11
  if cuda.is_available():
12
  torch_device = "gpu"
13
  else:
 
25
  if "compress" in embeddings_file_names[0]:
26
  embeddings_out /= 100
27
 
 
 
28
  if not embeddings_file_names:
29
  tic = time.perf_counter()
30
  print("Starting to embed documents.")
 
49
  elif low_resource_mode_opt == "No":
50
  print("Creating dense embeddings based on transformers model")
51
 
 
 
52
  embeddings_out = embedding_model.encode(sentences=docs, max_length=1024, show_progress_bar = True, batch_size = 32) # For Jina # #
53
 
 
 
 
 
 
 
 
 
 
 
 
54
  toc = time.perf_counter()
55
  time_out = f"The embedding took {toc - tic:0.1f} seconds"
56
  print(time_out)
57
 
58
  # If you want to save your files for next time
59
  if return_intermediate_files == "Yes":
60
+ print("Saving embeddings to file")
61
  if embeddings_super_compress == "No":
62
  semantic_search_file_name = data_file_name_no_ext + '_' + 'embeddings.npz'
63
  np.savez_compressed(semantic_search_file_name, embeddings_out)
 
69
 
70
  # Pre-reduce embeddings for visualisation purposes
71
  if reduce_embeddings == "Yes":
72
+ reduced_embeddings = UMAP(n_neighbors=15, n_components=2, min_dist=0.0, metric='cosine', random_state=random_seed).fit_transform(embeddings_out)
73
  return embeddings_out, reduced_embeddings
74
 
75
  return embeddings_out, None
funcs/representation_model.py CHANGED
@@ -10,8 +10,7 @@ from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, T
10
  from funcs.prompts import capybara_prompt, capybara_start, open_hermes_prompt, open_hermes_start, stablelm_prompt, stablelm_start
11
 
12
 
13
- hf_model_name = 'TheBloke/phi-2-orange-GGUF' #'NousResearch/Nous-Capybara-7B-V1.9-GGUF' # 'second-state/stablelm-2-zephyr-1.6b-GGUF'
14
- hf_model_file = 'phi-2-orange.Q5_K_M.gguf' #'Capybara-7B-V1.9-Q5_K_M.gguf' # 'stablelm-2-zephyr-1_6b-Q5_K_M.gguf'
15
  chosen_prompt = open_hermes_prompt # stablelm_prompt
16
  chosen_start_tag = open_hermes_start # stablelm_start
17
 
@@ -46,7 +45,7 @@ def find_model_file(hf_model_name, hf_model_file, search_folder):
46
  found_file = find_file(folder_path, file_to_find)
47
  return found_file
48
 
49
- found_file = find_model_file(hf_model_name, hf_model_file, os.environ["HF_HOME"])#".")
50
 
51
  # Currently set n_gpu_layers to 0 even with cuda due to persistent bugs in implementation with cuda
52
  if torch.cuda.is_available():
@@ -120,10 +119,16 @@ llm_config = LLamacppInitConfigGpu(last_n_tokens_size=last_n_tokens_size,
120
  # KeyBERT
121
  keybert = KeyBERTInspired()
122
 
123
- def create_representation_model(create_llm_topic_labels, llm_config, found_file, chosen_start_tag):
124
 
125
  if create_llm_topic_labels == "Yes":
126
  # Use llama.cpp to load in model
 
 
 
 
 
 
127
  llm = Llama(model_path=found_file, stop=chosen_start_tag, n_gpu_layers=llm_config.n_gpu_layers, n_ctx=llm_config.n_ctx) #**llm_config.model_dump())#
128
  #print(llm.n_gpu_layers)
129
  llm_model = LlamaCPP(llm, prompt=chosen_prompt)#, **gen_config.model_dump())
 
10
  from funcs.prompts import capybara_prompt, capybara_start, open_hermes_prompt, open_hermes_start, stablelm_prompt, stablelm_start
11
 
12
 
13
+
 
14
  chosen_prompt = open_hermes_prompt # stablelm_prompt
15
  chosen_start_tag = open_hermes_start # stablelm_start
16
 
 
45
  found_file = find_file(folder_path, file_to_find)
46
  return found_file
47
 
48
+
49
 
50
  # Currently set n_gpu_layers to 0 even with cuda due to persistent bugs in implementation with cuda
51
  if torch.cuda.is_available():
 
119
  # KeyBERT
120
  keybert = KeyBERTInspired()
121
 
122
+ def create_representation_model(create_llm_topic_labels, llm_config, hf_model_name, hf_model_file, chosen_start_tag):
123
 
124
  if create_llm_topic_labels == "Yes":
125
  # Use llama.cpp to load in model
126
+
127
+ # Check for HF_HOME environment variable and supply a default value if it's not found (current folder)
128
+ hf_home_value = os.getenv("HF_HOME", '.')
129
+
130
+ found_file = find_model_file(hf_model_name, hf_model_file, hf_home_value)
131
+
132
  llm = Llama(model_path=found_file, stop=chosen_start_tag, n_gpu_layers=llm_config.n_gpu_layers, n_ctx=llm_config.n_ctx) #**llm_config.model_dump())#
133
  #print(llm.n_gpu_layers)
134
  llm_model = LlamaCPP(llm, prompt=chosen_prompt)#, **gen_config.model_dump())