Spaces:
Sleeping
Sleeping
seanpedrickcase
committed on
Commit
•
34f1e83
1
Parent(s):
89c4d20
Added example of how to run function from command line. Updated packages. Embedding model default now smaller and at fp16.
Browse files- .dockerignore +1 -1
- .gitignore +1 -1
- README.md +2 -2
- app.py +43 -11
- download_model.py +1 -1
- funcs/anonymiser.py +5 -4
- funcs/aws_functions.py +211 -0
- funcs/embeddings.py +3 -1
- funcs/helper_functions.py +12 -2
- funcs/presidio_analyzer_custom.py +3 -7
- funcs/topic_core_funcs.py +22 -8
- requirements.txt +21 -16
- requirements_aws.txt +16 -16
- requirements_gpu.txt +21 -15
- run_cmd_line_example_command.txt +3 -0
- run_from_cmd_line.py +88 -0
.dockerignore
CHANGED
@@ -18,7 +18,7 @@
|
|
18 |
old_code/*
|
19 |
model/*
|
20 |
output_model/*
|
21 |
-
|
22 |
build_deps/*
|
23 |
dist/*
|
24 |
build/*
|
|
|
18 |
old_code/*
|
19 |
model/*
|
20 |
output_model/*
|
21 |
+
examples/*
|
22 |
build_deps/*
|
23 |
dist/*
|
24 |
build/*
|
.gitignore
CHANGED
@@ -18,7 +18,7 @@
|
|
18 |
old_code/*
|
19 |
model/*
|
20 |
output_model/*
|
21 |
-
|
22 |
build_deps/*
|
23 |
dist/*
|
24 |
build/*
|
|
|
18 |
old_code/*
|
19 |
model/*
|
20 |
output_model/*
|
21 |
+
examples/*
|
22 |
build_deps/*
|
23 |
dist/*
|
24 |
build/*
|
README.md
CHANGED
@@ -4,7 +4,7 @@ emoji: π
|
|
4 |
colorFrom: red
|
5 |
colorTo: yellow
|
6 |
sdk: gradio
|
7 |
-
sdk_version:
|
8 |
app_file: app.py
|
9 |
pinned: true
|
10 |
license: apache-2.0
|
@@ -14,7 +14,7 @@ license: apache-2.0
|
|
14 |
|
15 |
Generate topics from open text in tabular data, based on [BERTopic](https://maartengr.github.io/BERTopic/). Upload a data file (csv, xlsx, or parquet), then specify the open text column that you want to use to generate topics. Click 'Extract topics' after you have selected the minimum similar documents per topic and maximum total topics. Duplicate this space, or clone to your computer to avoid queues here!
|
16 |
|
17 |
-
Uses fast TF-IDF
|
18 |
|
19 |
For small datasets, consider breaking up your text into sentences under 'Clean data' -> 'Split open text...' before topic modelling.
|
20 |
|
|
|
4 |
colorFrom: red
|
5 |
colorTo: yellow
|
6 |
sdk: gradio
|
7 |
+
sdk_version: 5.6.0
|
8 |
app_file: app.py
|
9 |
pinned: true
|
10 |
license: apache-2.0
|
|
|
14 |
|
15 |
Generate topics from open text in tabular data, based on [BERTopic](https://maartengr.github.io/BERTopic/). Upload a data file (csv, xlsx, or parquet), then specify the open text column that you want to use to generate topics. Click 'Extract topics' after you have selected the minimum similar documents per topic and maximum total topics. Duplicate this space, or clone to your computer to avoid queues here!
|
16 |
|
17 |
+
Uses fast TF-IDF based embeddings by default, which are fast but do not lead to high-quality clustering. Change to higher quality [mxbai-embed-xsmall-v1](https://huggingface.co/mixedbread-ai/mxbai-embed-xsmall-v1) model embeddings (384 dimensions) for better results but slower processing time. If you have an embeddings .npz file previously made using this model, you can load this in at the same time to skip the first modelling step. If you have a pre-defined list of topics for zero-shot modelling, you can upload this as a csv file under 'I have my own list of topics...'. Further configuration options are available, such as maximum topics allowed, minimum documents per topic, etc. Topic representation with LLMs is currently based on [Llama-3.2-3B-Instruct-Q5_K_M.gguf](https://huggingface.co/bartowski/Llama-3.2-3B-Instruct-GGUF), which is quite slow on CPU, so use a GPU-enabled computer if possible, building from the requirements_gpu.txt file in the base folder.
|
18 |
|
19 |
For small datasets, consider breaking up your text into sentences under 'Clean data' -> 'Split open text...' before topic modelling.
|
20 |
|
app.py
CHANGED
@@ -1,11 +1,13 @@
|
|
1 |
import os
|
|
|
2 |
import gradio as gr
|
3 |
import pandas as pd
|
4 |
import numpy as np
|
|
|
5 |
|
6 |
from funcs.topic_core_funcs import pre_clean, optimise_zero_shot, extract_topics, reduce_outliers, represent_topics, visualise_topics, save_as_pytorch_model, change_default_vis_col
|
7 |
from funcs.helper_functions import initial_file_load, custom_regex_load, ensure_output_folder_exists, output_folder, get_connection_params, get_or_create_env_var
|
8 |
-
from funcs.
|
9 |
from sklearn.feature_extraction.text import CountVectorizer
|
10 |
from funcs.auth import authenticate_user, download_file_from_s3
|
11 |
|
@@ -14,11 +16,18 @@ max_word_occurence_slider_default = 0.95
|
|
14 |
|
15 |
ensure_output_folder_exists()
|
16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
# Gradio app
|
18 |
|
19 |
-
|
20 |
|
21 |
-
with
|
22 |
|
23 |
original_data_state = gr.State(pd.DataFrame())
|
24 |
data_state = gr.State(pd.DataFrame())
|
@@ -32,15 +41,27 @@ with block:
|
|
32 |
label_list_state = gr.State(pd.DataFrame())
|
33 |
vectoriser_state = gr.State(CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=min_word_occurence_slider_default, max_df=max_word_occurence_slider_default))
|
34 |
|
35 |
-
|
36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
|
38 |
gr.Markdown(
|
39 |
"""
|
40 |
# Topic modeller
|
41 |
Generate topics from open text in tabular data, based on [BERTopic](https://maartengr.github.io/BERTopic/). Upload a data file (csv, xlsx, or parquet), then specify the open text column that you want to use to generate topics. Click 'Extract topics' after you have selected the minimum similar documents per topic and maximum total topics. Duplicate this space, or clone to your computer to avoid queues here!
|
42 |
|
43 |
-
Uses fast TF-IDF
|
44 |
|
45 |
For small datasets, consider breaking up your text into sentences under 'Clean data' -> 'Split open text...' before topic modelling.
|
46 |
|
@@ -57,7 +78,7 @@ with block:
|
|
57 |
with gr.Row():
|
58 |
clean_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Remove html, URLs, non-ASCII, multiple digits, emails, postcodes (UK).")
|
59 |
drop_duplicate_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Remove duplicate text, drop < 50 character strings.")
|
60 |
-
anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="
|
61 |
#with gr.Row():
|
62 |
split_sentence_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Split text into sentences. Useful for small datasets.")
|
63 |
#additional_custom_delimiters_drop = gr.Dropdown(choices=["and", ",", "as well as", "also"], multiselect=True, label="Additional custom delimiters to split sentences.")
|
@@ -159,14 +180,25 @@ with block:
|
|
159 |
plot_btn.click(fn=visualise_topics, inputs=[topic_model_state, data_state, data_file_name_no_ext_state, quality_mode_drop, embeddings_state, in_label, in_colnames, legend_label, sample_slide, visualisation_type_radio, seed_number], outputs=[vis_output_single_text, out_plot_file, plot, plot_2], api_name="plot")
|
160 |
|
161 |
# Get session hash from connection parameters
|
162 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
163 |
|
164 |
COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
|
165 |
print(f'The value of COGNITO_AUTH is {COGNITO_AUTH}')
|
166 |
|
167 |
-
|
168 |
if __name__ == "__main__":
|
169 |
if os.environ['COGNITO_AUTH'] == "1":
|
170 |
-
|
171 |
else:
|
172 |
-
|
|
|
1 |
import os
|
2 |
+
import socket
|
3 |
import gradio as gr
|
4 |
import pandas as pd
|
5 |
import numpy as np
|
6 |
+
from datetime import datetime
|
7 |
|
8 |
from funcs.topic_core_funcs import pre_clean, optimise_zero_shot, extract_topics, reduce_outliers, represent_topics, visualise_topics, save_as_pytorch_model, change_default_vis_col
|
9 |
from funcs.helper_functions import initial_file_load, custom_regex_load, ensure_output_folder_exists, output_folder, get_connection_params, get_or_create_env_var
|
10 |
+
from funcs.aws_functions import upload_file_to_s3
|
11 |
from sklearn.feature_extraction.text import CountVectorizer
|
12 |
from funcs.auth import authenticate_user, download_file_from_s3
|
13 |
|
|
|
16 |
|
17 |
ensure_output_folder_exists()
|
18 |
|
19 |
+
host_name = socket.gethostname()
|
20 |
+
|
21 |
+
today_rev = datetime.now().strftime("%Y%m%d")
|
22 |
+
feedback_logs_folder = 'feedback/' + today_rev + '/' + host_name + '/'
|
23 |
+
access_logs_folder = 'logs/' + today_rev + '/' + host_name + '/'
|
24 |
+
usage_logs_folder = 'usage/' + today_rev + '/' + host_name + '/'
|
25 |
+
|
26 |
# Gradio app
|
27 |
|
28 |
+
app = gr.Blocks(theme = gr.themes.Base())
|
29 |
|
30 |
+
with app:
|
31 |
|
32 |
original_data_state = gr.State(pd.DataFrame())
|
33 |
data_state = gr.State(pd.DataFrame())
|
|
|
41 |
label_list_state = gr.State(pd.DataFrame())
|
42 |
vectoriser_state = gr.State(CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=min_word_occurence_slider_default, max_df=max_word_occurence_slider_default))
|
43 |
|
44 |
+
# Some invisible textboxes to hold some state values
|
45 |
+
session_hash_textbox = gr.Textbox("", visible=False, label="session_hash_textbox")
|
46 |
+
s3_output_folder_textbox = gr.Textbox("", visible=False, label="s3_output_folder_textbox")
|
47 |
+
s3_logs_output_textbox = gr.Textbox("", visible=False, label="s3_logs_output_textbox")
|
48 |
+
|
49 |
+
# Logging state
|
50 |
+
log_file_name = 'log.csv'
|
51 |
+
|
52 |
+
feedback_logs_state = gr.State(feedback_logs_folder + log_file_name)
|
53 |
+
feedback_s3_logs_loc_state = gr.State(feedback_logs_folder)
|
54 |
+
access_logs_state = gr.State(access_logs_folder + log_file_name)
|
55 |
+
access_s3_logs_loc_state = gr.State(access_logs_folder)
|
56 |
+
usage_logs_state = gr.State(usage_logs_folder + log_file_name)
|
57 |
+
usage_s3_logs_loc_state = gr.State(usage_logs_folder)
|
58 |
|
59 |
gr.Markdown(
|
60 |
"""
|
61 |
# Topic modeller
|
62 |
Generate topics from open text in tabular data, based on [BERTopic](https://maartengr.github.io/BERTopic/). Upload a data file (csv, xlsx, or parquet), then specify the open text column that you want to use to generate topics. Click 'Extract topics' after you have selected the minimum similar documents per topic and maximum total topics. Duplicate this space, or clone to your computer to avoid queues here!
|
63 |
|
64 |
+
Uses fast TF-IDF based embeddings by default, which are fast but does not lead to high quality clusering. Change to higher quality [mxbai-embed-xsmall-v1](mixedbread-ai/mxbai-embed-xsmall-v1) model embeddings (384 dimensions) for better results but slower processing time. If you have an embeddings .npz file previously made using this model, you can load this in at the same time to skip the first modelling step. If you have a pre-defined list of topics for zero-shot modelling, you can upload this as a csv file under 'I have my own list of topics...'. Further configuration options are available such as maximum topics allowed, minimum documents per topic etc.. Topic representation with LLMs currently based on [Llama-3.2-3B-Instruct-Q5_K_M.gguf](https://huggingface.co/bartowski/Llama-3.2-3B-Instruct-GGUF), which is quite slow on CPU, so use a GPU-enabled computer if possible, building from the requirements_gpu.txt file in the base folder.
|
65 |
|
66 |
For small datasets, consider breaking up your text into sentences under 'Clean data' -> 'Split open text...' before topic modelling.
|
67 |
|
|
|
78 |
with gr.Row():
|
79 |
clean_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Remove html, URLs, non-ASCII, multiple digits, emails, postcodes (UK).")
|
80 |
drop_duplicate_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Remove duplicate text, drop < 50 character strings.")
|
81 |
+
anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Redact personal information - not 100% effective and slow!")
|
82 |
#with gr.Row():
|
83 |
split_sentence_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Split text into sentences. Useful for small datasets.")
|
84 |
#additional_custom_delimiters_drop = gr.Dropdown(choices=["and", ",", "as well as", "also"], multiselect=True, label="Additional custom delimiters to split sentences.")
|
|
|
180 |
plot_btn.click(fn=visualise_topics, inputs=[topic_model_state, data_state, data_file_name_no_ext_state, quality_mode_drop, embeddings_state, in_label, in_colnames, legend_label, sample_slide, visualisation_type_radio, seed_number], outputs=[vis_output_single_text, out_plot_file, plot, plot_2], api_name="plot")
|
181 |
|
182 |
# Get session hash from connection parameters
|
183 |
+
app.load(get_connection_params, inputs=None, outputs=[session_hash_textbox, s3_output_folder_textbox])
|
184 |
+
|
185 |
+
# Log usernames and times of access to file (to know who is using the app when running on AWS)
|
186 |
+
access_callback = gr.CSVLogger(dataset_file_name=log_file_name)
|
187 |
+
access_callback.setup([session_hash_textbox], access_logs_folder)
|
188 |
+
session_hash_textbox.change(lambda *args: access_callback.flag(list(args)), [session_hash_textbox], None, preprocess=False).\
|
189 |
+
then(fn = upload_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
190 |
+
|
191 |
+
# Log processing time/token usage when making a query
|
192 |
+
usage_callback = gr.CSVLogger(dataset_file_name=log_file_name)
|
193 |
+
usage_callback.setup([session_hash_textbox], usage_logs_folder)
|
194 |
+
output_single_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, data_file_name_no_ext_state], None, preprocess=False).\
|
195 |
+
then(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
196 |
|
197 |
COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
|
198 |
print(f'The value of COGNITO_AUTH is {COGNITO_AUTH}')
|
199 |
|
|
|
200 |
if __name__ == "__main__":
|
201 |
if os.environ['COGNITO_AUTH'] == "1":
|
202 |
+
app.queue().launch(show_error=True, auth=authenticate_user)
|
203 |
else:
|
204 |
+
app.queue().launch(show_error=True, inbrowser=True)
|
download_model.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
from huggingface_hub import hf_hub_download
|
2 |
|
3 |
# Define the repository and files to download
|
4 |
-
repo_id = "sentence-transformers/all-MiniLM-L6-v2" #"BAAI/bge-small-en-v1.5"
|
5 |
files_to_download = [
|
6 |
"config.json",
|
7 |
"pytorch_model.bin",
|
|
|
1 |
from huggingface_hub import hf_hub_download
|
2 |
|
3 |
# Define the repository and files to download
|
4 |
+
repo_id = "mixedbread-ai/mxbai-embed-xsmall-v1" #"sentence-transformers/all-MiniLM-L6-v2" #"BAAI/bge-small-en-v1.5"
|
5 |
files_to_download = [
|
6 |
"config.json",
|
7 |
"pytorch_model.bin",
|
funcs/anonymiser.py
CHANGED
@@ -30,6 +30,7 @@ import re
|
|
30 |
import secrets
|
31 |
import base64
|
32 |
import time
|
|
|
33 |
|
34 |
import pandas as pd
|
35 |
|
@@ -42,7 +43,7 @@ from presidio_anonymizer.entities import OperatorConfig
|
|
42 |
from typing import List
|
43 |
|
44 |
# Function to Split Text and Create DataFrame using SpaCy
|
45 |
-
def expand_sentences_spacy(df:pd.DataFrame, colname:str, custom_delimiters:List[str]=[], nlp=nlp):
|
46 |
'''
|
47 |
Expand passages into sentences using Spacy's built in NLP capabilities
|
48 |
'''
|
@@ -50,7 +51,7 @@ def expand_sentences_spacy(df:pd.DataFrame, colname:str, custom_delimiters:List[
|
|
50 |
|
51 |
df = df.drop('index', axis = 1, errors="ignore").reset_index(names='index')
|
52 |
|
53 |
-
for index, row in df.iterrows():
|
54 |
doc = nlp(row[colname])
|
55 |
for sent in doc.sents:
|
56 |
expanded_data.append({'original_index':row['original_index'],'document_index': row['index'], colname: sent.text})
|
@@ -201,11 +202,11 @@ def anonymise_script(df, chosen_col, anon_strat):
|
|
201 |
analyse_tic = time.perf_counter()
|
202 |
#analyzer_results = batch_analyzer.analyze_dict(df_dict, language="en")
|
203 |
analyzer_results = analyze_dict(batch_analyzer, df_dict, language="en")
|
204 |
-
|
205 |
analyzer_results = list(analyzer_results)
|
206 |
|
207 |
analyse_toc = time.perf_counter()
|
208 |
-
analyse_time_out = f"
|
209 |
print(analyse_time_out)
|
210 |
|
211 |
# Generate a 128-bit AES key. Then encode the key using base64 to get a string representation
|
|
|
30 |
import secrets
|
31 |
import base64
|
32 |
import time
|
33 |
+
from gradio import Progress
|
34 |
|
35 |
import pandas as pd
|
36 |
|
|
|
43 |
from typing import List
|
44 |
|
45 |
# Function to Split Text and Create DataFrame using SpaCy
|
46 |
+
def expand_sentences_spacy(df:pd.DataFrame, colname:str, custom_delimiters:List[str]=[], nlp=nlp, progress=Progress(track_tqdm=True)):
|
47 |
'''
|
48 |
Expand passages into sentences using Spacy's built in NLP capabilities
|
49 |
'''
|
|
|
51 |
|
52 |
df = df.drop('index', axis = 1, errors="ignore").reset_index(names='index')
|
53 |
|
54 |
+
for index, row in progress.tqdm(df.iterrows(), unit = "rows", desc="Splitting sentences"):
|
55 |
doc = nlp(row[colname])
|
56 |
for sent in doc.sents:
|
57 |
expanded_data.append({'original_index':row['original_index'],'document_index': row['index'], colname: sent.text})
|
|
|
202 |
analyse_tic = time.perf_counter()
|
203 |
#analyzer_results = batch_analyzer.analyze_dict(df_dict, language="en")
|
204 |
analyzer_results = analyze_dict(batch_analyzer, df_dict, language="en")
|
205 |
+
|
206 |
analyzer_results = list(analyzer_results)
|
207 |
|
208 |
analyse_toc = time.perf_counter()
|
209 |
+
analyse_time_out = f"Anonymising the text took {analyse_toc - analyse_tic:0.1f} seconds."
|
210 |
print(analyse_time_out)
|
211 |
|
212 |
# Generate a 128-bit AES key. Then encode the key using base64 to get a string representation
|
funcs/aws_functions.py
ADDED
@@ -0,0 +1,211 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Type, List
|
2 |
+
import pandas as pd
|
3 |
+
import boto3
|
4 |
+
import tempfile
|
5 |
+
import os
|
6 |
+
from funcs.helper_functions import get_or_create_env_var
|
7 |
+
|
8 |
+
PandasDataFrame = Type[pd.DataFrame]
|
9 |
+
|
10 |
+
# Get AWS credentials
|
11 |
+
bucket_name=""
|
12 |
+
|
13 |
+
RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "0")
|
14 |
+
print(f'The value of RUN_AWS_FUNCTIONS is {RUN_AWS_FUNCTIONS}')
|
15 |
+
|
16 |
+
AWS_REGION = get_or_create_env_var('AWS_REGION', 'eu-west-2')
|
17 |
+
print(f'The value of AWS_REGION is {AWS_REGION}')
|
18 |
+
|
19 |
+
|
20 |
+
|
21 |
+
def get_assumed_role_info():
|
22 |
+
sts_endpoint = 'https://sts.' + AWS_REGION + '.amazonaws.com'
|
23 |
+
sts = boto3.client('sts', region_name=AWS_REGION, endpoint_url=sts_endpoint)
|
24 |
+
response = sts.get_caller_identity()
|
25 |
+
|
26 |
+
# Extract ARN of the assumed role
|
27 |
+
assumed_role_arn = response['Arn']
|
28 |
+
|
29 |
+
# Extract the name of the assumed role from the ARN
|
30 |
+
assumed_role_name = assumed_role_arn.split('/')[-1]
|
31 |
+
|
32 |
+
return assumed_role_arn, assumed_role_name
|
33 |
+
|
34 |
+
if RUN_AWS_FUNCTIONS == "1":
|
35 |
+
try:
|
36 |
+
bucket_name = os.environ['TOPIC_MODELLING_BUCKET']
|
37 |
+
session = boto3.Session()
|
38 |
+
|
39 |
+
except Exception as e:
|
40 |
+
print(e)
|
41 |
+
|
42 |
+
try:
|
43 |
+
assumed_role_arn, assumed_role_name = get_assumed_role_info()
|
44 |
+
|
45 |
+
print("Assumed Role ARN:", assumed_role_arn)
|
46 |
+
print("Assumed Role Name:", assumed_role_name)
|
47 |
+
|
48 |
+
except Exception as e:
|
49 |
+
print(e)
|
50 |
+
|
51 |
+
# Download direct from S3 - requires login credentials
|
52 |
+
def download_file_from_s3(bucket_name, key, local_file_path_and_name):
|
53 |
+
|
54 |
+
s3 = boto3.client('s3')
|
55 |
+
s3.download_file(bucket_name, key, local_file_path_and_name)
|
56 |
+
print(f"File downloaded from S3: s3://{bucket_name}/{key} to {local_file_path_and_name}")
|
57 |
+
|
58 |
+
def download_folder_from_s3(bucket_name, s3_folder, local_folder):
|
59 |
+
"""
|
60 |
+
Download all files from an S3 folder to a local folder.
|
61 |
+
"""
|
62 |
+
s3 = boto3.client('s3')
|
63 |
+
|
64 |
+
# List objects in the specified S3 folder
|
65 |
+
response = s3.list_objects_v2(Bucket=bucket_name, Prefix=s3_folder)
|
66 |
+
|
67 |
+
# Download each object
|
68 |
+
for obj in response.get('Contents', []):
|
69 |
+
# Extract object key and construct local file path
|
70 |
+
object_key = obj['Key']
|
71 |
+
local_file_path = os.path.join(local_folder, os.path.relpath(object_key, s3_folder))
|
72 |
+
|
73 |
+
# Create directories if necessary
|
74 |
+
os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
|
75 |
+
|
76 |
+
# Download the object
|
77 |
+
try:
|
78 |
+
s3.download_file(bucket_name, object_key, local_file_path)
|
79 |
+
print(f"Downloaded 's3://{bucket_name}/{object_key}' to '{local_file_path}'")
|
80 |
+
except Exception as e:
|
81 |
+
print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)
|
82 |
+
|
83 |
+
def download_files_from_s3(bucket_name, s3_folder, local_folder, filenames):
|
84 |
+
"""
|
85 |
+
Download specific files from an S3 folder to a local folder.
|
86 |
+
"""
|
87 |
+
s3 = boto3.client('s3')
|
88 |
+
|
89 |
+
print("Trying to download file: ", filenames)
|
90 |
+
|
91 |
+
if filenames == '*':
|
92 |
+
# List all objects in the S3 folder
|
93 |
+
print("Trying to download all files in AWS folder: ", s3_folder)
|
94 |
+
response = s3.list_objects_v2(Bucket=bucket_name, Prefix=s3_folder)
|
95 |
+
|
96 |
+
print("Found files in AWS folder: ", response.get('Contents', []))
|
97 |
+
|
98 |
+
filenames = [obj['Key'].split('/')[-1] for obj in response.get('Contents', [])]
|
99 |
+
|
100 |
+
print("Found filenames in AWS folder: ", filenames)
|
101 |
+
|
102 |
+
for filename in filenames:
|
103 |
+
object_key = os.path.join(s3_folder, filename)
|
104 |
+
local_file_path = os.path.join(local_folder, filename)
|
105 |
+
|
106 |
+
# Create directories if necessary
|
107 |
+
os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
|
108 |
+
|
109 |
+
# Download the object
|
110 |
+
try:
|
111 |
+
s3.download_file(bucket_name, object_key, local_file_path)
|
112 |
+
print(f"Downloaded 's3://{bucket_name}/{object_key}' to '{local_file_path}'")
|
113 |
+
except Exception as e:
|
114 |
+
print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)
|
115 |
+
|
116 |
+
def load_data_from_aws(in_aws_keyword_file, aws_password="", bucket_name=bucket_name):
|
117 |
+
|
118 |
+
temp_dir = tempfile.mkdtemp()
|
119 |
+
local_address_stub = temp_dir + '/topic-modelling/'
|
120 |
+
files = []
|
121 |
+
|
122 |
+
if not 'TEST_PASSWORD' in os.environ:
|
123 |
+
out_message = "Can't verify password for dataset access. Do you have a valid AWS connection? Data not loaded."
|
124 |
+
return files, out_message
|
125 |
+
|
126 |
+
if aws_password:
|
127 |
+
if "Test_file" in in_aws_keyword_file and aws_password == os.environ['TEST_PASSWORD']:
|
128 |
+
|
129 |
+
s3_folder_stub = 'example-data/test-data/latest/'
|
130 |
+
|
131 |
+
local_folder_path = local_address_stub
|
132 |
+
|
133 |
+
# Check if folder exists
|
134 |
+
if not os.path.exists(local_folder_path):
|
135 |
+
print(f"Folder {local_folder_path} does not exist! Making folder.")
|
136 |
+
|
137 |
+
os.mkdir(local_folder_path)
|
138 |
+
|
139 |
+
# Check if folder is empty
|
140 |
+
if len(os.listdir(local_folder_path)) == 0:
|
141 |
+
print(f"Folder {local_folder_path} is empty")
|
142 |
+
# Download data
|
143 |
+
download_files_from_s3(bucket_name, s3_folder_stub, local_folder_path, filenames='*')
|
144 |
+
|
145 |
+
print("AWS data downloaded")
|
146 |
+
|
147 |
+
else:
|
148 |
+
print(f"Folder {local_folder_path} is not empty")
|
149 |
+
|
150 |
+
#files = os.listdir(local_folder_stub)
|
151 |
+
#print(files)
|
152 |
+
|
153 |
+
files = [os.path.join(local_folder_path, f) for f in os.listdir(local_folder_path) if os.path.isfile(os.path.join(local_folder_path, f))]
|
154 |
+
|
155 |
+
out_message = "Data successfully loaded from AWS"
|
156 |
+
print(out_message)
|
157 |
+
|
158 |
+
else:
|
159 |
+
out_message = "Data not loaded from AWS"
|
160 |
+
print(out_message)
|
161 |
+
else:
|
162 |
+
out_message = "No password provided. Please ask the data team for access if you need this."
|
163 |
+
print(out_message)
|
164 |
+
|
165 |
+
return files, out_message
|
166 |
+
|
167 |
+
def upload_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=bucket_name):
|
168 |
+
"""
|
169 |
+
Uploads a file from local machine to Amazon S3.
|
170 |
+
|
171 |
+
Args:
|
172 |
+
- local_file_path: Local file path(s) of the file(s) to upload.
|
173 |
+
- s3_key: Key (path) to the file in the S3 bucket.
|
174 |
+
- s3_bucket: Name of the S3 bucket.
|
175 |
+
|
176 |
+
Returns:
|
177 |
+
- Message as variable/printed to console
|
178 |
+
"""
|
179 |
+
final_out_message = []
|
180 |
+
|
181 |
+
s3_client = boto3.client('s3')
|
182 |
+
|
183 |
+
if isinstance(local_file_paths, str):
|
184 |
+
local_file_paths = [local_file_paths]
|
185 |
+
|
186 |
+
for file in local_file_paths:
|
187 |
+
if s3_client:
|
188 |
+
#print(s3_client)
|
189 |
+
try:
|
190 |
+
# Get file name off file path
|
191 |
+
file_name = os.path.basename(file)
|
192 |
+
|
193 |
+
s3_key_full = s3_key + file_name
|
194 |
+
print("S3 key: ", s3_key_full)
|
195 |
+
|
196 |
+
s3_client.upload_file(file, s3_bucket, s3_key_full)
|
197 |
+
out_message = "File " + file_name + " uploaded successfully!"
|
198 |
+
print(out_message)
|
199 |
+
|
200 |
+
except Exception as e:
|
201 |
+
out_message = f"Error uploading file(s): {e}"
|
202 |
+
print(out_message)
|
203 |
+
|
204 |
+
final_out_message.append(out_message)
|
205 |
+
final_out_message_str = '\n'.join(final_out_message)
|
206 |
+
|
207 |
+
else: final_out_message_str = "Could not connect to AWS."
|
208 |
+
|
209 |
+
return final_out_message_str
|
210 |
+
|
211 |
+
|
funcs/embeddings.py
CHANGED
@@ -70,7 +70,9 @@ def make_or_load_embeddings(docs: list, file_list: list, embeddings_out: np.ndar
|
|
70 |
|
71 |
elif high_quality_mode_opt == "Yes":
|
72 |
print("Creating dense embeddings based on transformers model")
|
73 |
-
|
|
|
|
|
74 |
embeddings_out = embedding_model.encode(sentences=docs, show_progress_bar = True, batch_size = 32)#, precision="int8") # For large
|
75 |
|
76 |
toc = time.perf_counter()
|
|
|
70 |
|
71 |
elif high_quality_mode_opt == "Yes":
|
72 |
print("Creating dense embeddings based on transformers model")
|
73 |
+
|
74 |
+
# Convert model to half precision (fp16)
|
75 |
+
embedding_model.half()
|
76 |
embeddings_out = embedding_model.encode(sentences=docs, show_progress_bar = True, batch_size = 32)#, precision="int8") # For large
|
77 |
|
78 |
toc = time.perf_counter()
|
funcs/helper_functions.py
CHANGED
@@ -162,7 +162,11 @@ def initial_file_load(in_file):
|
|
162 |
topic_model = None
|
163 |
embeddings = np.array([])
|
164 |
|
165 |
-
|
|
|
|
|
|
|
|
|
166 |
|
167 |
data_file_names = [string for string in file_list if "npz" not in string.lower() and "pkl" not in string.lower() and "topic_list.csv" not in string.lower()]
|
168 |
if data_file_names:
|
@@ -207,7 +211,11 @@ def custom_regex_load(in_file):
|
|
207 |
|
208 |
custom_regex = pd.DataFrame()
|
209 |
|
210 |
-
|
|
|
|
|
|
|
|
|
211 |
|
212 |
regex_file_names = [string for string in file_list if "csv" in string.lower()]
|
213 |
if regex_file_names:
|
@@ -241,6 +249,8 @@ def get_file_path_end_with_ext(file_path):
|
|
241 |
|
242 |
filename_end = match.group(2) if match else ''
|
243 |
|
|
|
|
|
244 |
return filename_end
|
245 |
|
246 |
# Zip the above to export file
|
|
|
162 |
topic_model = None
|
163 |
embeddings = np.array([])
|
164 |
|
165 |
+
# If in_file is a string file path, otherwise assume it is a Gradio file input component
|
166 |
+
if isinstance(in_file, str):
|
167 |
+
file_list = [in_file]
|
168 |
+
else:
|
169 |
+
file_list = [string.name for string in in_file]
|
170 |
|
171 |
data_file_names = [string for string in file_list if "npz" not in string.lower() and "pkl" not in string.lower() and "topic_list.csv" not in string.lower()]
|
172 |
if data_file_names:
|
|
|
211 |
|
212 |
custom_regex = pd.DataFrame()
|
213 |
|
214 |
+
# If in_file is a string file path, otherwise assume it is a Gradio file input component
|
215 |
+
if isinstance(in_file, str):
|
216 |
+
file_list = [in_file]
|
217 |
+
else:
|
218 |
+
file_list = [string.name for string in in_file]
|
219 |
|
220 |
regex_file_names = [string for string in file_list if "csv" in string.lower()]
|
221 |
if regex_file_names:
|
|
|
249 |
|
250 |
filename_end = match.group(2) if match else ''
|
251 |
|
252 |
+
print("filename_end:", filename_end)
|
253 |
+
|
254 |
return filename_end
|
255 |
|
256 |
# Zip the above to export file
|
funcs/presidio_analyzer_custom.py
CHANGED
@@ -26,16 +26,12 @@ def analyze_iterator_custom(
|
|
26 |
texts = self._validate_types(texts)
|
27 |
|
28 |
# Process the texts as batch for improved performance
|
29 |
-
nlp_artifacts_batch:
|
30 |
-
Tuple[str, NlpArtifacts]
|
31 |
-
] = self.analyzer_engine.nlp_engine.process_batch(
|
32 |
texts=texts, language=language
|
33 |
-
)
|
34 |
-
|
35 |
-
|
36 |
|
37 |
list_results = []
|
38 |
-
for text, nlp_artifacts in
|
39 |
results = self.analyzer_engine.analyze(
|
40 |
text=str(text), nlp_artifacts=nlp_artifacts, language=language, **kwargs
|
41 |
)
|
|
|
26 |
texts = self._validate_types(texts)
|
27 |
|
28 |
# Process the texts as batch for improved performance
|
29 |
+
nlp_artifacts_batch: List[Tuple[str, NlpArtifacts]] = list(self.analyzer_engine.nlp_engine.process_batch(
|
|
|
|
|
30 |
texts=texts, language=language
|
31 |
+
))
|
|
|
|
|
32 |
|
33 |
list_results = []
|
34 |
+
for text, nlp_artifacts in tqdm(nlp_artifacts_batch, total=list_length, desc="Analysing text for personal information", unit="rows"):
|
35 |
results = self.analyzer_engine.analyze(
|
36 |
text=str(text), nlp_artifacts=nlp_artifacts, language=language, **kwargs
|
37 |
)
|
funcs/topic_core_funcs.py
CHANGED
@@ -13,7 +13,7 @@ PandasDataFrame = Type[pd.DataFrame]
|
|
13 |
|
14 |
from funcs.clean_funcs import initial_clean, regex_clean
|
15 |
from funcs.anonymiser import expand_sentences_spacy
|
16 |
-
from funcs.helper_functions import read_file, zip_folder, delete_files_in_folder, save_topic_outputs, output_folder, get_or_create_env_var
|
17 |
from funcs.embeddings import make_or_load_embeddings, torch_device
|
18 |
from funcs.bertopic_vis_documents import visualize_documents_custom, visualize_hierarchical_documents_custom, hierarchical_topics_custom, visualize_hierarchy_custom
|
19 |
from funcs.representation_model import create_representation_model, llm_config, chosen_start_tag, random_seed, RUNNING_ON_AWS
|
@@ -37,13 +37,13 @@ today_rev = datetime.now().strftime("%Y%m%d")
|
|
37 |
|
38 |
# Load embeddings
|
39 |
if RUNNING_ON_AWS=="0":
|
40 |
-
embeddings_name = "mixedbread-ai/mxbai-embed-
|
41 |
else:
|
42 |
-
embeddings_name = "
|
43 |
|
44 |
# LLM model used for representing topics
|
45 |
-
hf_model_name =
|
46 |
-
hf_model_file =
|
47 |
|
48 |
# When topic modelling column is chosen, change the default visualisation column to the same
|
49 |
def change_default_vis_col(in_colnames:List[str]):
|
@@ -55,7 +55,7 @@ def change_default_vis_col(in_colnames:List[str]):
|
|
55 |
else:
|
56 |
return gr.Dropdown()
|
57 |
|
58 |
-
def pre_clean(data: pd.DataFrame, in_colnames: list, data_file_name_no_ext: str, custom_regex: pd.DataFrame, clean_text: str, drop_duplicate_text: str, anonymise_drop: str, sentence_split_drop: str, min_sentence_length: int, embeddings_state: dict, progress: gr.Progress = gr.Progress(track_tqdm=True)) -> tuple:
|
59 |
"""
|
60 |
Pre-processes the input data by cleaning text, removing duplicates, anonymizing data, and splitting sentences based on the provided options.
|
61 |
|
@@ -70,6 +70,7 @@ def pre_clean(data: pd.DataFrame, in_colnames: list, data_file_name_no_ext: str,
|
|
70 |
sentence_split_drop (str): Option to split text into sentences ("Yes" or "No").
|
71 |
min_sentence_length (int): Minimum length of sentences after split (integer value of character length)
|
72 |
embeddings_state (dict): State of the embeddings.
|
|
|
73 |
progress (gr.Progress, optional): Progress tracker for the cleaning process.
|
74 |
|
75 |
Returns:
|
@@ -81,6 +82,10 @@ def pre_clean(data: pd.DataFrame, in_colnames: list, data_file_name_no_ext: str,
|
|
81 |
|
82 |
progress(0, desc = "Cleaning data")
|
83 |
|
|
|
|
|
|
|
|
|
84 |
if not in_colnames:
|
85 |
error_message = "Please enter one column name to use for cleaning and finding topics."
|
86 |
print(error_message)
|
@@ -132,7 +137,7 @@ def pre_clean(data: pd.DataFrame, in_colnames: list, data_file_name_no_ext: str,
|
|
132 |
#print("Data shape after duplicate/null removal: ", data.shape)
|
133 |
|
134 |
if anonymise_drop == "Yes":
|
135 |
-
progress(0.
|
136 |
|
137 |
if '_anon' not in data_file_name_no_ext:
|
138 |
data_file_name_no_ext = data_file_name_no_ext + "_anon"
|
@@ -261,7 +266,12 @@ def extract_topics(
|
|
261 |
vectoriser_state = CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=min_word_occurence_slider, max_df=max_word_occurence_slider)
|
262 |
|
263 |
output_list = []
|
264 |
-
|
|
|
|
|
|
|
|
|
|
|
265 |
|
266 |
if calc_probs == "No":
|
267 |
calc_probs = False
|
@@ -352,6 +362,10 @@ def extract_topics(
|
|
352 |
else:
|
353 |
embeddings_file_name = output_folder + data_file_name_no_ext + '_' + 'large_embeddings_compress.npz'
|
354 |
|
|
|
|
|
|
|
|
|
355 |
np.savez_compressed(embeddings_file_name, embeddings_out)
|
356 |
|
357 |
output_list.append(embeddings_file_name)
|
|
|
13 |
|
14 |
from funcs.clean_funcs import initial_clean, regex_clean
|
15 |
from funcs.anonymiser import expand_sentences_spacy
|
16 |
+
from funcs.helper_functions import read_file, zip_folder, delete_files_in_folder, save_topic_outputs, output_folder, get_or_create_env_var, custom_regex_load
|
17 |
from funcs.embeddings import make_or_load_embeddings, torch_device
|
18 |
from funcs.bertopic_vis_documents import visualize_documents_custom, visualize_hierarchical_documents_custom, hierarchical_topics_custom, visualize_hierarchy_custom
|
19 |
from funcs.representation_model import create_representation_model, llm_config, chosen_start_tag, random_seed, RUNNING_ON_AWS
|
|
|
37 |
|
38 |
# Load embeddings
|
39 |
if RUNNING_ON_AWS=="0":
|
40 |
+
embeddings_name = "mixedbread-ai/mxbai-embed-xsmall-v1" #"mixedbread-ai/mxbai-embed-large-v1"
|
41 |
else:
|
42 |
+
embeddings_name = "mixedbread-ai/mxbai-embed-xsmall-v1"
|
43 |
|
44 |
# LLM model used for representing topics
|
45 |
+
hf_model_name = "bartowski/Llama-3.2-3B-Instruct-GGUF" #"bartowski/Phi-3.1-mini-128k-instruct-GGUF"
|
46 |
+
hf_model_file = "Llama-3.2-3B-Instruct-Q5_K_M.gguf" #"Phi-3.1-mini-128k-instruct-Q4_K_M.gguf"
|
47 |
|
48 |
# When topic modelling column is chosen, change the default visualisation column to the same
|
49 |
def change_default_vis_col(in_colnames:List[str]):
|
|
|
55 |
else:
|
56 |
return gr.Dropdown()
|
57 |
|
58 |
+
def pre_clean(data: pd.DataFrame, in_colnames: list, data_file_name_no_ext: str, custom_regex: pd.DataFrame, clean_text: str, drop_duplicate_text: str, anonymise_drop: str, sentence_split_drop: str, min_sentence_length: int, embeddings_state: dict, output_folder: str = output_folder, progress: gr.Progress = gr.Progress(track_tqdm=True)) -> tuple:
|
59 |
"""
|
60 |
Pre-processes the input data by cleaning text, removing duplicates, anonymizing data, and splitting sentences based on the provided options.
|
61 |
|
|
|
70 |
sentence_split_drop (str): Option to split text into sentences ("Yes" or "No").
|
71 |
min_sentence_length (int): Minimum length of sentences after split (integer value of character length)
|
72 |
embeddings_state (dict): State of the embeddings.
|
73 |
+
output_folder (str, optional): Output folder. Defaults to output_folder.
|
74 |
progress (gr.Progress, optional): Progress tracker for the cleaning process.
|
75 |
|
76 |
Returns:
|
|
|
82 |
|
83 |
progress(0, desc = "Cleaning data")
|
84 |
|
85 |
+
# If custom_regex is a string, assume this is a string path, and load in the data from the path
|
86 |
+
if isinstance(custom_regex, str):
|
87 |
+
custom_regex_text, custom_regex = custom_regex_load(custom_regex)
|
88 |
+
|
89 |
if not in_colnames:
|
90 |
error_message = "Please enter one column name to use for cleaning and finding topics."
|
91 |
print(error_message)
|
|
|
137 |
#print("Data shape after duplicate/null removal: ", data.shape)
|
138 |
|
139 |
if anonymise_drop == "Yes":
|
140 |
+
progress(0.4, desc= "Anonymising data")
|
141 |
|
142 |
if '_anon' not in data_file_name_no_ext:
|
143 |
data_file_name_no_ext = data_file_name_no_ext + "_anon"
|
|
|
266 |
vectoriser_state = CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=min_word_occurence_slider, max_df=max_word_occurence_slider)
|
267 |
|
268 |
output_list = []
|
269 |
+
|
270 |
+
# If in_file is a string file path, otherwise assume it is a Gradio file input component
|
271 |
+
if isinstance(in_files, str):
|
272 |
+
file_list = [in_files]
|
273 |
+
else:
|
274 |
+
file_list = [string.name for string in in_files]
|
275 |
|
276 |
if calc_probs == "No":
|
277 |
calc_probs = False
|
|
|
362 |
else:
|
363 |
embeddings_file_name = output_folder + data_file_name_no_ext + '_' + 'large_embeddings_compress.npz'
|
364 |
|
365 |
+
print("output_folder:", output_folder)
|
366 |
+
print("data_file_name_no_ext:", data_file_name_no_ext)
|
367 |
+
print("embeddings_file_name:", embeddings_file_name)
|
368 |
+
|
369 |
np.savez_compressed(embeddings_file_name, embeddings_out)
|
370 |
|
371 |
output_list.append(embeddings_file_name)
|
requirements.txt
CHANGED
@@ -1,18 +1,23 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
|
|
|
|
|
|
|
|
|
|
7 |
spacy==3.8.0
|
8 |
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
|
9 |
-
pyarrow
|
10 |
-
openpyxl
|
11 |
-
Faker
|
12 |
-
presidio_analyzer==2.2.
|
13 |
-
presidio_anonymizer==2.2.
|
14 |
-
scipy
|
15 |
-
polars
|
16 |
-
sentence-transformers==3.0
|
17 |
-
llama-cpp-python==0.2
|
18 |
-
numpy==1.26.4
|
|
|
1 |
+
hdbscan==0.8.40
|
2 |
+
pandas==2.2.3
|
3 |
+
plotly==5.24.1
|
4 |
+
scikit-learn==1.5.2
|
5 |
+
umap-learn==0.5.7
|
6 |
+
gradio==5.6.0
|
7 |
+
boto3==1.35.64
|
8 |
+
transformers==4.46.3
|
9 |
+
accelerate==1.1.1
|
10 |
+
torch==2.5.1
|
11 |
+
bertopic==0.16.4
|
12 |
spacy==3.8.0
|
13 |
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
|
14 |
+
pyarrow
|
15 |
+
openpyxl
|
16 |
+
Faker
|
17 |
+
presidio_analyzer==2.2.355
|
18 |
+
presidio_anonymizer==2.2.355
|
19 |
+
scipy
|
20 |
+
polars
|
21 |
+
sentence-transformers==3.2.0
|
22 |
+
llama-cpp-python==0.3.2 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
|
23 |
+
#numpy==1.26.4
|
requirements_aws.txt
CHANGED
@@ -1,18 +1,18 @@
|
|
1 |
-
hdbscan==0.8.
|
2 |
-
pandas==2.2.
|
3 |
-
plotly==5.
|
4 |
-
scikit-learn==1.5.
|
5 |
-
umap-learn==0.5.
|
6 |
-
boto3
|
7 |
spacy==3.8.0
|
8 |
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
|
9 |
-
gradio==
|
10 |
-
pyarrow
|
11 |
-
openpyxl
|
12 |
-
Faker
|
13 |
-
presidio_analyzer==2.2.
|
14 |
-
presidio_anonymizer==2.2.
|
15 |
-
scipy
|
16 |
-
polars
|
17 |
-
transformers==4.
|
18 |
-
numpy==1.26.4
|
|
|
1 |
+
hdbscan==0.8.40
|
2 |
+
pandas==2.2.3
|
3 |
+
plotly==5.24.1
|
4 |
+
scikit-learn==1.5.2
|
5 |
+
umap-learn==0.5.7
|
6 |
+
boto3==1.35.64
|
7 |
spacy==3.8.0
|
8 |
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
|
9 |
+
gradio==5.6.0
|
10 |
+
pyarrow
|
11 |
+
openpyxl
|
12 |
+
Faker
|
13 |
+
presidio_analyzer==2.2.35
|
14 |
+
presidio_anonymizer==2.2.35
|
15 |
+
scipy
|
16 |
+
polars
|
17 |
+
transformers==4.46.3
|
18 |
+
#numpy==1.26.4
|
requirements_gpu.txt
CHANGED
@@ -1,19 +1,25 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
spacy==3.8.0
|
7 |
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
|
8 |
-
pyarrow
|
9 |
-
openpyxl
|
10 |
-
Faker
|
11 |
-
presidio_analyzer==2.2.
|
12 |
-
presidio_anonymizer==2.2.
|
13 |
-
scipy
|
14 |
-
polars
|
15 |
-
llama-cpp-python==0.2
|
16 |
torch --index-url https://download.pytorch.org/whl/cu121
|
17 |
-
sentence-transformers==3.0
|
18 |
-
numpy==1.26.4
|
19 |
|
|
|
1 |
+
hdbscan==0.8.40
|
2 |
+
pandas==2.2.3
|
3 |
+
plotly==5.24.1
|
4 |
+
scikit-learn==1.5.2
|
5 |
+
umap-learn==0.5.7
|
6 |
+
gradio==5.6.0
|
7 |
+
boto3==1.35.64
|
8 |
+
transformers==4.46.3
|
9 |
+
accelerate==1.1.1
|
10 |
+
torch==2.5.1 --index-url https://download.pytorch.org/whl/cu121
|
11 |
+
bertopic==0.16.4
|
12 |
spacy==3.8.0
|
13 |
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
|
14 |
+
pyarrow
|
15 |
+
openpyxl
|
16 |
+
Faker
|
17 |
+
presidio_analyzer==2.2.355
|
18 |
+
presidio_anonymizer==2.2.355
|
19 |
+
scipy
|
20 |
+
polars
|
21 |
+
llama-cpp-python==0.3.2 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
|
22 |
torch --index-url https://download.pytorch.org/whl/cu121
|
23 |
+
sentence-transformers==3.2.0
|
24 |
+
#numpy==1.26.4
|
25 |
|
run_cmd_line_example_command.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
# To just get embeddings out
|
2 |
+
|
3 |
+
python run_from_cmd_line.py --data_file "C:\Users\SPedrickCase\OneDrive - Lambeth Council\Apps\topic_modelling\examples\combined_case_notes.csv" --in_colnames "Case Note" --clean_text Yes --drop_duplicate_text No --anonymise_drop Yes --split_sentence_drop No --custom_regex_file "C:\Users\SPedrickCase\OneDrive - Lambeth Council\Apps\topic_modelling\examples\regex_to_remove.csv" --embeddings_high_quality_mode Yes --return_only_embeddings_drop Yes --output_folder "C:\Users\SPedrickCase\OneDrive - Lambeth Council\2024\ASC Project\asc_predict\apps\topic_modelling\output"
|
run_from_cmd_line.py
ADDED
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
|
2 |
+
import pandas as pd
|
3 |
+
import numpy as np
|
4 |
+
from funcs.topic_core_funcs import pre_clean, extract_topics
|
5 |
+
from funcs.helper_functions import custom_regex_load, initial_file_load, output_folder
|
6 |
+
from sklearn.feature_extraction.text import CountVectorizer
|
7 |
+
|
8 |
+
print("Output folder:", output_folder)
|
9 |
+
|
10 |
+
def main():
    """
    Run the pre_clean and extract_topics steps from the command line.

    Parses command line arguments, loads the data file (and, if given, the
    custom regex file), cleans the chosen text column with pre_clean, then
    passes the cleaned data to extract_topics.

    Command line arguments:
        --data_file (required): Path to the data file (csv, xlsx, or parquet).
        --in_colnames (required): Name of the column to find topics in.
        --custom_regex_file: Optional path to a custom regex removal file.
        --output_folder: Folder that outputs are written to. Defaults to the
            output_folder imported from funcs.helper_functions.
        The remaining options mirror the cleaning and topic modelling settings
        described in each argument's help text.

    Returns:
        None. Results are produced as a side effect of pre_clean and
        extract_topics.
    """
    # Local import: only needed here to normalise the output folder path.
    import os

    parser = argparse.ArgumentParser(description="Run pre_clean and extract_topics from command line.")

    # Arguments for pre_clean
    parser.add_argument('--data_file', type=str, required=True, help='Path to the data file (csv, xlsx, or parquet).')
    parser.add_argument('--in_colnames', type=str, required=True, help='Column name to find topics.')
    parser.add_argument('--custom_regex_file', type=str, help='Path to custom regex removal file.', default=None)
    parser.add_argument('--clean_text', type=str, choices=['Yes', 'No'], default='No', help='Remove html, URLs, etc.')
    parser.add_argument('--drop_duplicate_text', type=str, choices=['Yes', 'No'], default='No', help='Remove duplicate text.')
    parser.add_argument('--anonymise_drop', type=str, choices=['Yes', 'No'], default='No', help='Redact personal information.')
    parser.add_argument('--split_sentence_drop', type=str, choices=['Yes', 'No'], default='No', help='Split text into sentences.')
    parser.add_argument('--min_sentence_length_num', type=int, default=5, help='Min char length of split sentences.')

    # Arguments for extract_topics
    parser.add_argument('--min_docs_slider', type=int, default=5, help='Minimum number of similar documents needed to make a topic.')
    parser.add_argument('--max_topics_slider', type=int, default=0, help='Maximum number of topics.')
    parser.add_argument('--min_word_occurence_slider', type=float, default=0.01, help='Minimum word occurrence proportion.')
    parser.add_argument('--max_word_occurence_slider', type=float, default=0.95, help='Maximum word occurrence proportion.')
    parser.add_argument('--embeddings_high_quality_mode', type=str, choices=['Yes', 'No'], default='No', help='Use high-quality embeddings.')
    parser.add_argument('--zero_shot_similarity', type=float, default=0.55, help='Minimum similarity for zero-shot topic assignment.')
    parser.add_argument('--seed_number', type=int, default=42, help='Random seed for processing.')
    parser.add_argument('--return_only_embeddings_drop', type=str, default="No", help='Return only embeddings from the function, do not assign topics.')
    parser.add_argument('--output_folder', type=str, default=output_folder, help='Output folder for results.')

    args = parser.parse_args()

    # Use the folder requested on the command line (previously the parsed
    # value was ignored in favour of the module-level default). Output file
    # paths are built elsewhere by plain string concatenation with the folder,
    # so make sure a non-empty folder ends with a path separator.
    chosen_output_folder = os.path.join(args.output_folder, "")

    # initial_file_load returns a 9-item tuple; only the loaded data (3rd item)
    # and the file name without extension (7th item) are needed here.
    _, _, data, _, _, _, data_file_name_no_ext, _, _ = initial_file_load(args.data_file)

    # custom_regex_load returns (message text, regex data). When no regex file
    # is supplied, fall back to an empty DataFrame rather than trying to unpack
    # one into two names (which raised a ValueError).
    if args.custom_regex_file:
        _, custom_regex = custom_regex_load(args.custom_regex_file)
    else:
        custom_regex = pd.DataFrame()

    print("data_file_name_no_ext:", data_file_name_no_ext)

    # Pre-clean data
    pre_clean_output = pre_clean(
        data=data,
        in_colnames=[args.in_colnames],
        data_file_name_no_ext=data_file_name_no_ext,
        custom_regex=custom_regex,
        clean_text=args.clean_text,
        drop_duplicate_text=args.drop_duplicate_text,
        anonymise_drop=args.anonymise_drop,
        sentence_split_drop=args.split_sentence_drop,
        min_sentence_length=args.min_sentence_length_num,
        embeddings_state=np.array([]),
        output_folder=chosen_output_folder
    )

    # Extract topics. The third item of the pre_clean output is used as the
    # input data for topic extraction.
    extract_topics_output = extract_topics(
        data=pre_clean_output[2],
        in_files=args.data_file,
        min_docs_slider=args.min_docs_slider,
        in_colnames=[args.in_colnames],
        max_topics_slider=args.max_topics_slider,
        candidate_topics=[],
        data_file_name_no_ext=data_file_name_no_ext,
        custom_labels_df=pd.DataFrame(),
        return_intermediate_files='Yes',
        embeddings_super_compress='No',
        high_quality_mode=args.embeddings_high_quality_mode,
        save_topic_model='No',
        embeddings_out=np.array([]),
        embeddings_type_state='',
        zero_shot_similarity=args.zero_shot_similarity,
        calc_probs='No',
        vectoriser_state=CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=args.min_word_occurence_slider, max_df=args.max_word_occurence_slider),
        min_word_occurence_slider=args.min_word_occurence_slider,
        max_word_occurence_slider=args.max_word_occurence_slider,
        split_sentence_drop=args.split_sentence_drop,
        random_seed=args.seed_number,
        return_only_embeddings_drop=args.return_only_embeddings_drop,
        output_folder=chosen_output_folder
    )
|
86 |
+
|
87 |
+
if __name__ == "__main__":
|
88 |
+
main()
|