seanpedrickcase committed on
Commit 34f1e83 • 1 Parent(s): 89c4d20

Added example of how to run function from command line. Updated packages. Embedding model default now smaller and at fp16.

.dockerignore CHANGED
@@ -18,7 +18,7 @@
old_code/*
model/*
output_model/*
- data/*
+ examples/*
build_deps/*
dist/*
build/*
.gitignore CHANGED
@@ -18,7 +18,7 @@
old_code/*
model/*
output_model/*
- data/*
+ examples/*
build_deps/*
dist/*
build/*
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 🚀
colorFrom: red
colorTo: yellow
sdk: gradio
- sdk_version: 4.44.0
+ sdk_version: 5.6.0
app_file: app.py
pinned: true
license: apache-2.0
@@ -14,7 +14,7 @@ license: apache-2.0

Generate topics from open text in tabular data, based on [BERTopic](https://maartengr.github.io/BERTopic/). Upload a data file (csv, xlsx, or parquet), then specify the open text column that you want to use to generate topics. Click 'Extract topics' after you have selected the minimum similar documents per topic and maximum total topics. Duplicate this space, or clone to your computer to avoid queues here!

- Uses fast TF-IDF-based embeddings by default, which are fast but does not lead to high quality clusering. Change to higher quality [mxbai-embed-large-v1](https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1) model embeddings (512 dimensions) for better results but slower processing time. If you have an embeddings .npz file previously made using this model, you can load this in at the same time to skip the first modelling step. If you have a pre-defined list of topics for zero-shot modelling, you can upload this as a csv file under 'I have my own list of topics...'. Further configuration options are available such as maximum topics allowed, minimum documents per topic etc.. Topic representation with LLMs currently based on [Phi-3-mini-128k-instruct-GGUF](https://huggingface.co/QuantFactory/Phi-3-mini-128k-instruct-GGUF), which is quite slow on CPU, so use a GPU-enabled computer if possible, building from the requirements_gpu.txt file in the base folder.
+ Uses fast TF-IDF based embeddings by default, which are fast but does not lead to high quality clusering. Change to higher quality [mxbai-embed-xsmall-v1](mixedbread-ai/mxbai-embed-xsmall-v1) model embeddings (384 dimensions) for better results but slower processing time. If you have an embeddings .npz file previously made using this model, you can load this in at the same time to skip the first modelling step. If you have a pre-defined list of topics for zero-shot modelling, you can upload this as a csv file under 'I have my own list of topics...'. Further configuration options are available such as maximum topics allowed, minimum documents per topic etc.. Topic representation with LLMs currently based on [Llama-3.2-3B-Instruct-Q5_K_M.gguf](https://huggingface.co/bartowski/Llama-3.2-3B-Instruct-GGUF), which is quite slow on CPU, so use a GPU-enabled computer if possible, building from the requirements_gpu.txt file in the base folder.

For small datasets, consider breaking up your text into sentences under 'Clean data' -> 'Split open text...' before topic modelling.

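As an aside (not part of this commit), the embeddings .npz file mentioned above is a standard NumPy archive written with np.savez_compressed, so it can be inspected before re-uploading it to the app; the path below is illustrative:

import numpy as np

# Illustrative path; the app writes files named like '<data file>_large_embeddings_compress.npz'
with np.load("output/my_data_large_embeddings_compress.npz") as archive:
    embeddings = archive["arr_0"]  # np.savez_compressed stores unnamed arrays under 'arr_0'

print(embeddings.shape)  # (number of documents, embedding dimension)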
app.py CHANGED
@@ -1,11 +1,13 @@
import os
+ import socket
import gradio as gr
import pandas as pd
import numpy as np
+ from datetime import datetime

from funcs.topic_core_funcs import pre_clean, optimise_zero_shot, extract_topics, reduce_outliers, represent_topics, visualise_topics, save_as_pytorch_model, change_default_vis_col
from funcs.helper_functions import initial_file_load, custom_regex_load, ensure_output_folder_exists, output_folder, get_connection_params, get_or_create_env_var
- from funcs.embeddings import make_or_load_embeddings
+ from funcs.aws_functions import upload_file_to_s3
from sklearn.feature_extraction.text import CountVectorizer
from funcs.auth import authenticate_user, download_file_from_s3

@@ -14,11 +16,18 @@ max_word_occurence_slider_default = 0.95

ensure_output_folder_exists()

+ host_name = socket.gethostname()
+
+ today_rev = datetime.now().strftime("%Y%m%d")
+ feedback_logs_folder = 'feedback/' + today_rev + '/' + host_name + '/'
+ access_logs_folder = 'logs/' + today_rev + '/' + host_name + '/'
+ usage_logs_folder = 'usage/' + today_rev + '/' + host_name + '/'
+
# Gradio app

- block = gr.Blocks(theme = gr.themes.Base())
+ app = gr.Blocks(theme = gr.themes.Base())

- with block:
+ with app:

original_data_state = gr.State(pd.DataFrame())
data_state = gr.State(pd.DataFrame())
@@ -32,15 +41,27 @@ with block:
label_list_state = gr.State(pd.DataFrame())
vectoriser_state = gr.State(CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=min_word_occurence_slider_default, max_df=max_word_occurence_slider_default))

- session_hash_state = gr.State("")
- s3_output_folder_state = gr.State("")
+ # Some invisible textboxes to hold some state values
+ session_hash_textbox = gr.Textbox("", visible=False, label="session_hash_textbox")
+ s3_output_folder_textbox = gr.Textbox("", visible=False, label="s3_output_folder_textbox")
+ s3_logs_output_textbox = gr.Textbox("", visible=False, label="s3_logs_output_textbox")
+
+ # Logging state
+ log_file_name = 'log.csv'
+
+ feedback_logs_state = gr.State(feedback_logs_folder + log_file_name)
+ feedback_s3_logs_loc_state = gr.State(feedback_logs_folder)
+ access_logs_state = gr.State(access_logs_folder + log_file_name)
+ access_s3_logs_loc_state = gr.State(access_logs_folder)
+ usage_logs_state = gr.State(usage_logs_folder + log_file_name)
+ usage_s3_logs_loc_state = gr.State(usage_logs_folder)

gr.Markdown(
"""
# Topic modeller
Generate topics from open text in tabular data, based on [BERTopic](https://maartengr.github.io/BERTopic/). Upload a data file (csv, xlsx, or parquet), then specify the open text column that you want to use to generate topics. Click 'Extract topics' after you have selected the minimum similar documents per topic and maximum total topics. Duplicate this space, or clone to your computer to avoid queues here!

- Uses fast TF-IDF-based embeddings by default, which are fast but does not lead to high quality clusering. Change to higher quality [mxbai-embed-large-v1](https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1) model embeddings (1024 dimensions) for better results but slower processing time. If you have an embeddings .npz file previously made using this model, you can load this in at the same time to skip the first modelling step. If you have a pre-defined list of topics for zero-shot modelling, you can upload this as a csv file under 'I have my own list of topics...'. Further configuration options are available such as maximum topics allowed, minimum documents per topic etc.. Topic representation with LLMs currently based on [Phi-3.1-mini-128k-instruct-GGUF](https://huggingface.co/bartowski/Phi-3.1-mini-128k-instruct-GGUF), which is quite slow on CPU, so use a GPU-enabled computer if possible, building from the requirements_gpu.txt file in the base folder.
+ Uses fast TF-IDF based embeddings by default, which are fast but does not lead to high quality clusering. Change to higher quality [mxbai-embed-xsmall-v1](mixedbread-ai/mxbai-embed-xsmall-v1) model embeddings (384 dimensions) for better results but slower processing time. If you have an embeddings .npz file previously made using this model, you can load this in at the same time to skip the first modelling step. If you have a pre-defined list of topics for zero-shot modelling, you can upload this as a csv file under 'I have my own list of topics...'. Further configuration options are available such as maximum topics allowed, minimum documents per topic etc.. Topic representation with LLMs currently based on [Llama-3.2-3B-Instruct-Q5_K_M.gguf](https://huggingface.co/bartowski/Llama-3.2-3B-Instruct-GGUF), which is quite slow on CPU, so use a GPU-enabled computer if possible, building from the requirements_gpu.txt file in the base folder.

For small datasets, consider breaking up your text into sentences under 'Clean data' -> 'Split open text...' before topic modelling.

@@ -57,7 +78,7 @@ with block:
with gr.Row():
clean_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Remove html, URLs, non-ASCII, multiple digits, emails, postcodes (UK).")
drop_duplicate_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Remove duplicate text, drop < 50 character strings.")
- anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Anonymise data on file load. Personal details are redacted - not 100% effective and slow!")
+ anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Redact personal information - not 100% effective and slow!")
#with gr.Row():
split_sentence_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Split text into sentences. Useful for small datasets.")
#additional_custom_delimiters_drop = gr.Dropdown(choices=["and", ",", "as well as", "also"], multiselect=True, label="Additional custom delimiters to split sentences.")
@@ -159,14 +180,25 @@ with block:
plot_btn.click(fn=visualise_topics, inputs=[topic_model_state, data_state, data_file_name_no_ext_state, quality_mode_drop, embeddings_state, in_label, in_colnames, legend_label, sample_slide, visualisation_type_radio, seed_number], outputs=[vis_output_single_text, out_plot_file, plot, plot_2], api_name="plot")

# Get session hash from connection parameters
- block.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state])
+ app.load(get_connection_params, inputs=None, outputs=[session_hash_textbox, s3_output_folder_textbox])
+
+ # Log usernames and times of access to file (to know who is using the app when running on AWS)
+ access_callback = gr.CSVLogger(dataset_file_name=log_file_name)
+ access_callback.setup([session_hash_textbox], access_logs_folder)
+ session_hash_textbox.change(lambda *args: access_callback.flag(list(args)), [session_hash_textbox], None, preprocess=False).\
+ then(fn = upload_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
+
+ # Log processing time/token usage when making a query
+ usage_callback = gr.CSVLogger(dataset_file_name=log_file_name)
+ usage_callback.setup([session_hash_textbox], usage_logs_folder)
+ output_single_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, data_file_name_no_ext_state], None, preprocess=False).\
+ then(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])

COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
print(f'The value of COGNITO_AUTH is {COGNITO_AUTH}')

-
if __name__ == "__main__":
if os.environ['COGNITO_AUTH'] == "1":
- block.queue().launch(show_error=True, auth=authenticate_user)
+ app.queue().launch(show_error=True, auth=authenticate_user)
else:
- block.queue().launch(show_error=True, inbrowser=True)
+ app.queue().launch(show_error=True, inbrowser=True)
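For reference, a pared-down sketch of the access-logging pattern introduced above, outside the full app; it assumes gradio==5.6.0 as pinned in requirements.txt, and the folder and file names are illustrative rather than taken from the repository:

import gradio as gr

# Logger that appends rows to <flagging dir>/log.csv
access_logger = gr.CSVLogger(dataset_file_name="log.csv")

with gr.Blocks() as demo:
    session_hash_box = gr.Textbox("", visible=False)
    # Register which components to record and the directory holding the CSV
    access_logger.setup([session_hash_box], "logs/")
    # Write a row whenever the hidden session-hash textbox changes
    session_hash_box.change(lambda *args: access_logger.flag(list(args)), [session_hash_box], None, preprocess=False)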
download_model.py CHANGED
@@ -1,7 +1,7 @@
from huggingface_hub import hf_hub_download

# Define the repository and files to download
- repo_id = "sentence-transformers/all-MiniLM-L6-v2" #"BAAI/bge-small-en-v1.5"
+ repo_id = "mixedbread-ai/mxbai-embed-xsmall-v1" #"sentence-transformers/all-MiniLM-L6-v2" #"BAAI/bge-small-en-v1.5"
files_to_download = [
"config.json",
"pytorch_model.bin",
funcs/anonymiser.py CHANGED
@@ -30,6 +30,7 @@ import re
import secrets
import base64
import time
+ from gradio import Progress

import pandas as pd

@@ -42,7 +43,7 @@ from presidio_anonymizer.entities import OperatorConfig
from typing import List

# Function to Split Text and Create DataFrame using SpaCy
- def expand_sentences_spacy(df:pd.DataFrame, colname:str, custom_delimiters:List[str]=[], nlp=nlp):
+ def expand_sentences_spacy(df:pd.DataFrame, colname:str, custom_delimiters:List[str]=[], nlp=nlp, progress=Progress(track_tqdm=True)):
'''
Expand passages into sentences using Spacy's built in NLP capabilities
'''
@@ -50,7 +51,7 @@ def expand_sentences_spacy(df:pd.DataFrame, colname:str, custom_delimiters:List[

df = df.drop('index', axis = 1, errors="ignore").reset_index(names='index')

- for index, row in df.iterrows():
+ for index, row in progress.tqdm(df.iterrows(), unit = "rows", desc="Splitting sentences"):
doc = nlp(row[colname])
for sent in doc.sents:
expanded_data.append({'original_index':row['original_index'],'document_index': row['index'], colname: sent.text})
@@ -201,11 +202,11 @@ def anonymise_script(df, chosen_col, anon_strat):
analyse_tic = time.perf_counter()
#analyzer_results = batch_analyzer.analyze_dict(df_dict, language="en")
analyzer_results = analyze_dict(batch_analyzer, df_dict, language="en")
- #print(analyzer_results)
+
analyzer_results = list(analyzer_results)

analyse_toc = time.perf_counter()
- analyse_time_out = f"Analysing the text took {analyse_toc - analyse_tic:0.1f} seconds."
+ analyse_time_out = f"Anonymising the text took {analyse_toc - analyse_tic:0.1f} seconds."
print(analyse_time_out)

# Generate a 128-bit AES key. Then encode the key using base64 to get a string representation
funcs/aws_functions.py ADDED
@@ -0,0 +1,211 @@
+ from typing import Type, List
+ import pandas as pd
+ import boto3
+ import tempfile
+ import os
+ from funcs.helper_functions import get_or_create_env_var
+
+ PandasDataFrame = Type[pd.DataFrame]
+
+ # Get AWS credentials
+ bucket_name=""
+
+ RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "0")
+ print(f'The value of RUN_AWS_FUNCTIONS is {RUN_AWS_FUNCTIONS}')
+
+ AWS_REGION = get_or_create_env_var('AWS_REGION', 'eu-west-2')
+ print(f'The value of AWS_REGION is {AWS_REGION}')
+
+
+
+ def get_assumed_role_info():
+ sts_endpoint = 'https://sts.' + AWS_REGION + '.amazonaws.com'
+ sts = boto3.client('sts', region_name=AWS_REGION, endpoint_url=sts_endpoint)
+ response = sts.get_caller_identity()
+
+ # Extract ARN of the assumed role
+ assumed_role_arn = response['Arn']
+
+ # Extract the name of the assumed role from the ARN
+ assumed_role_name = assumed_role_arn.split('/')[-1]
+
+ return assumed_role_arn, assumed_role_name
+
+ if RUN_AWS_FUNCTIONS == "1":
+ try:
+ bucket_name = os.environ['TOPIC_MODELLING_BUCKET']
+ session = boto3.Session()
+
+ except Exception as e:
+ print(e)
+
+ try:
+ assumed_role_arn, assumed_role_name = get_assumed_role_info()
+
+ print("Assumed Role ARN:", assumed_role_arn)
+ print("Assumed Role Name:", assumed_role_name)
+
+ except Exception as e:
+ print(e)
+
+ # Download direct from S3 - requires login credentials
+ def download_file_from_s3(bucket_name, key, local_file_path_and_name):
+
+ s3 = boto3.client('s3')
+ s3.download_file(bucket_name, key, local_file_path_and_name)
+ print(f"File downloaded from S3: s3://{bucket_name}/{key} to {local_file_path_and_name}")
+
+ def download_folder_from_s3(bucket_name, s3_folder, local_folder):
+ """
+ Download all files from an S3 folder to a local folder.
+ """
+ s3 = boto3.client('s3')
+
+ # List objects in the specified S3 folder
+ response = s3.list_objects_v2(Bucket=bucket_name, Prefix=s3_folder)
+
+ # Download each object
+ for obj in response.get('Contents', []):
+ # Extract object key and construct local file path
+ object_key = obj['Key']
+ local_file_path = os.path.join(local_folder, os.path.relpath(object_key, s3_folder))
+
+ # Create directories if necessary
+ os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
+
+ # Download the object
+ try:
+ s3.download_file(bucket_name, object_key, local_file_path)
+ print(f"Downloaded 's3://{bucket_name}/{object_key}' to '{local_file_path}'")
+ except Exception as e:
+ print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)
+
+ def download_files_from_s3(bucket_name, s3_folder, local_folder, filenames):
+ """
+ Download specific files from an S3 folder to a local folder.
+ """
+ s3 = boto3.client('s3')
+
+ print("Trying to download file: ", filenames)
+
+ if filenames == '*':
+ # List all objects in the S3 folder
+ print("Trying to download all files in AWS folder: ", s3_folder)
+ response = s3.list_objects_v2(Bucket=bucket_name, Prefix=s3_folder)
+
+ print("Found files in AWS folder: ", response.get('Contents', []))
+
+ filenames = [obj['Key'].split('/')[-1] for obj in response.get('Contents', [])]
+
+ print("Found filenames in AWS folder: ", filenames)
+
+ for filename in filenames:
+ object_key = os.path.join(s3_folder, filename)
+ local_file_path = os.path.join(local_folder, filename)
+
+ # Create directories if necessary
+ os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
+
+ # Download the object
+ try:
+ s3.download_file(bucket_name, object_key, local_file_path)
+ print(f"Downloaded 's3://{bucket_name}/{object_key}' to '{local_file_path}'")
+ except Exception as e:
+ print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)
+
+ def load_data_from_aws(in_aws_keyword_file, aws_password="", bucket_name=bucket_name):
+
+ temp_dir = tempfile.mkdtemp()
+ local_address_stub = temp_dir + '/topic-modelling/'
+ files = []
+
+ if not 'TEST_PASSWORD' in os.environ:
+ out_message = "Can't verify password for dataset access. Do you have a valid AWS connection? Data not loaded."
+ return files, out_message
+
+ if aws_password:
+ if "Test_file" in in_aws_keyword_file and aws_password == os.environ['TEST_PASSWORD']:
+
+ s3_folder_stub = 'example-data/test-data/latest/'
+
+ local_folder_path = local_address_stub
+
+ # Check if folder exists
+ if not os.path.exists(local_folder_path):
+ print(f"Folder {local_folder_path} does not exist! Making folder.")
+
+ os.mkdir(local_folder_path)
+
+ # Check if folder is empty
+ if len(os.listdir(local_folder_path)) == 0:
+ print(f"Folder {local_folder_path} is empty")
+ # Download data
+ download_files_from_s3(bucket_name, s3_folder_stub, local_folder_path, filenames='*')
+
+ print("AWS data downloaded")
+
+ else:
+ print(f"Folder {local_folder_path} is not empty")
+
+ #files = os.listdir(local_folder_stub)
+ #print(files)
+
+ files = [os.path.join(local_folder_path, f) for f in os.listdir(local_folder_path) if os.path.isfile(os.path.join(local_folder_path, f))]
+
+ out_message = "Data successfully loaded from AWS"
+ print(out_message)
+
+ else:
+ out_message = "Data not loaded from AWS"
+ print(out_message)
+ else:
+ out_message = "No password provided. Please ask the data team for access if you need this."
+ print(out_message)
+
+ return files, out_message
+
+ def upload_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=bucket_name):
+ """
+ Uploads a file from local machine to Amazon S3.
+
+ Args:
+ - local_file_path: Local file path(s) of the file(s) to upload.
+ - s3_key: Key (path) to the file in the S3 bucket.
+ - s3_bucket: Name of the S3 bucket.
+
+ Returns:
+ - Message as variable/printed to console
+ """
+ final_out_message = []
+
+ s3_client = boto3.client('s3')
+
+ if isinstance(local_file_paths, str):
+ local_file_paths = [local_file_paths]
+
+ for file in local_file_paths:
+ if s3_client:
+ #print(s3_client)
+ try:
+ # Get file name off file path
+ file_name = os.path.basename(file)
+
+ s3_key_full = s3_key + file_name
+ print("S3 key: ", s3_key_full)
+
+ s3_client.upload_file(file, s3_bucket, s3_key_full)
+ out_message = "File " + file_name + " uploaded successfully!"
+ print(out_message)
+
+ except Exception as e:
+ out_message = f"Error uploading file(s): {e}"
+ print(out_message)
+
+ final_out_message.append(out_message)
+ final_out_message_str = '\n'.join(final_out_message)
+
+ else: final_out_message_str = "Could not connect to AWS."
+
+ return final_out_message_str
+
+
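A hypothetical usage sketch for upload_file_to_s3 as defined above; the bucket name, key prefix and local path are illustrative, and s3_bucket otherwise defaults to the module-level bucket_name read from the TOPIC_MODELLING_BUCKET environment variable when RUN_AWS_FUNCTIONS is "1":

from funcs.aws_functions import upload_file_to_s3

# Accepts a single path string or a list of paths; the file name is appended to the key prefix
message = upload_file_to_s3(
    local_file_paths=["output/log.csv"],
    s3_key="usage/20240101/example-host/",
    s3_bucket="example-topic-modelling-bucket",
)
print(message)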
funcs/embeddings.py CHANGED
@@ -70,7 +70,9 @@ def make_or_load_embeddings(docs: list, file_list: list, embeddings_out: np.ndar

elif high_quality_mode_opt == "Yes":
print("Creating dense embeddings based on transformers model")
-
+
+ # Convert model to half precision (fp16)
+ embedding_model.half()
embeddings_out = embedding_model.encode(sentences=docs, show_progress_bar = True, batch_size = 32)#, precision="int8") # For large

toc = time.perf_counter()
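As a standalone illustration of the fp16 change (not code from the repository), a sentence-transformers model can be cast to half precision before encoding; this assumes the mxbai-embed-xsmall-v1 default and a PyTorch build that supports fp16 inference on the device in use:

from sentence_transformers import SentenceTransformer

# New default embedding model (384-dimensional); weights are downloaded on first use
embedding_model = SentenceTransformer("mixedbread-ai/mxbai-embed-xsmall-v1")

# SentenceTransformer is a torch.nn.Module, so .half() casts its weights to float16,
# roughly halving memory use at a small cost in numerical precision
embedding_model.half()

embeddings_out = embedding_model.encode(["an example document"], show_progress_bar=True, batch_size=32)
print(embeddings_out.shape)  # (1, 384)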
funcs/helper_functions.py CHANGED
@@ -162,7 +162,11 @@ def initial_file_load(in_file):
topic_model = None
embeddings = np.array([])

- file_list = [string.name for string in in_file]
+ # If in_file is a string file path, otherwise assume it is a Gradio file input component
+ if isinstance(in_file, str):
+ file_list = [in_file]
+ else:
+ file_list = [string.name for string in in_file]

data_file_names = [string for string in file_list if "npz" not in string.lower() and "pkl" not in string.lower() and "topic_list.csv" not in string.lower()]
if data_file_names:
@@ -207,7 +211,11 @@ def custom_regex_load(in_file):

custom_regex = pd.DataFrame()

- file_list = [string.name for string in in_file]
+ # If in_file is a string file path, otherwise assume it is a Gradio file input component
+ if isinstance(in_file, str):
+ file_list = [in_file]
+ else:
+ file_list = [string.name for string in in_file]

regex_file_names = [string for string in file_list if "csv" in string.lower()]
if regex_file_names:
@@ -241,6 +249,8 @@ def get_file_path_end_with_ext(file_path):

filename_end = match.group(2) if match else ''

+ print("filename_end:", filename_end)
+
return filename_end

# Zip the above to export file
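The same string-or-Gradio-file handling is now repeated in several functions; a small hypothetical helper (to_file_list is not in the codebase) that captures the pattern:

def to_file_list(in_file):
    # Accept either a plain path string or an iterable of Gradio file objects exposing .name
    if isinstance(in_file, str):
        return [in_file]
    return [f.name for f in in_file]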
funcs/presidio_analyzer_custom.py CHANGED
@@ -26,16 +26,12 @@ def analyze_iterator_custom(
texts = self._validate_types(texts)

# Process the texts as batch for improved performance
- nlp_artifacts_batch: Iterator[
- Tuple[str, NlpArtifacts]
- ] = self.analyzer_engine.nlp_engine.process_batch(
+ nlp_artifacts_batch: List[Tuple[str, NlpArtifacts]] = list(self.analyzer_engine.nlp_engine.process_batch(
texts=texts, language=language
- )
-
-
+ ))

list_results = []
- for text, nlp_artifacts in progress.tqdm(nlp_artifacts_batch, total = list_length, desc = "Analysing text for personal information", unit = "rows"):
+ for text, nlp_artifacts in tqdm(nlp_artifacts_batch, total=list_length, desc="Analysing text for personal information", unit="rows"):
results = self.analyzer_engine.analyze(
text=str(text), nlp_artifacts=nlp_artifacts, language=language, **kwargs
)
funcs/topic_core_funcs.py CHANGED
@@ -13,7 +13,7 @@ PandasDataFrame = Type[pd.DataFrame]

from funcs.clean_funcs import initial_clean, regex_clean
from funcs.anonymiser import expand_sentences_spacy
- from funcs.helper_functions import read_file, zip_folder, delete_files_in_folder, save_topic_outputs, output_folder, get_or_create_env_var
+ from funcs.helper_functions import read_file, zip_folder, delete_files_in_folder, save_topic_outputs, output_folder, get_or_create_env_var, custom_regex_load
from funcs.embeddings import make_or_load_embeddings, torch_device
from funcs.bertopic_vis_documents import visualize_documents_custom, visualize_hierarchical_documents_custom, hierarchical_topics_custom, visualize_hierarchy_custom
from funcs.representation_model import create_representation_model, llm_config, chosen_start_tag, random_seed, RUNNING_ON_AWS
@@ -37,13 +37,13 @@ today_rev = datetime.now().strftime("%Y%m%d")

# Load embeddings
if RUNNING_ON_AWS=="0":
- embeddings_name = "mixedbread-ai/mxbai-embed-large-v1" #"BAAI/large-small-en-v1.5" #"jinaai/jina-embeddings-v2-base-en"
+ embeddings_name = "mixedbread-ai/mxbai-embed-xsmall-v1" #"mixedbread-ai/mxbai-embed-large-v1"
else:
- embeddings_name = "sentence-transformers/all-MiniLM-L6-v2"
+ embeddings_name = "mixedbread-ai/mxbai-embed-xsmall-v1"

# LLM model used for representing topics
- hf_model_name = "bartowski/Phi-3.1-mini-128k-instruct-GGUF"#'second-state/stablelm-2-zephyr-1.6b-GGUF' #'TheBloke/phi-2-orange-GGUF' #'NousResearch/Nous-Capybara-7B-V1.9-GGUF'
- hf_model_file = "Phi-3.1-mini-128k-instruct-Q4_K_M.gguf"#'stablelm-2-zephyr-1_6b-Q5_K_M.gguf' # 'phi-2-orange.Q5_K_M.gguf' #'Capybara-7B-V1.9-Q5_K_M.gguf'
+ hf_model_name = "bartowski/Llama-3.2-3B-Instruct-GGUF" #"bartowski/Phi-3.1-mini-128k-instruct-GGUF"
+ hf_model_file = "Llama-3.2-3B-Instruct-Q5_K_M.gguf" #"Phi-3.1-mini-128k-instruct-Q4_K_M.gguf"

# When topic modelling column is chosen, change the default visualisation column to the same
def change_default_vis_col(in_colnames:List[str]):
@@ -55,7 +55,7 @@ def change_default_vis_col(in_colnames:List[str]):
else:
return gr.Dropdown()

- def pre_clean(data: pd.DataFrame, in_colnames: list, data_file_name_no_ext: str, custom_regex: pd.DataFrame, clean_text: str, drop_duplicate_text: str, anonymise_drop: str, sentence_split_drop: str, min_sentence_length: int, embeddings_state: dict, progress: gr.Progress = gr.Progress(track_tqdm=True)) -> tuple:
+ def pre_clean(data: pd.DataFrame, in_colnames: list, data_file_name_no_ext: str, custom_regex: pd.DataFrame, clean_text: str, drop_duplicate_text: str, anonymise_drop: str, sentence_split_drop: str, min_sentence_length: int, embeddings_state: dict, output_folder: str = output_folder, progress: gr.Progress = gr.Progress(track_tqdm=True)) -> tuple:
"""
Pre-processes the input data by cleaning text, removing duplicates, anonymizing data, and splitting sentences based on the provided options.

@@ -70,6 +70,7 @@ def pre_clean(data: pd.DataFrame, in_colnames: list, data_file_name_no_ext: str,
sentence_split_drop (str): Option to split text into sentences ("Yes" or "No").
min_sentence_length (int): Minimum length of sentences after split (integer value of character length)
embeddings_state (dict): State of the embeddings.
+ output_folder (str, optional): Output folder. Defaults to output_folder.
progress (gr.Progress, optional): Progress tracker for the cleaning process.

Returns:
@@ -81,6 +82,10 @@ def pre_clean(data: pd.DataFrame, in_colnames: list, data_file_name_no_ext: str,

progress(0, desc = "Cleaning data")

+ # If custom_regex is a string, assume this is a string path, and load in the data from the path
+ if isinstance(custom_regex, str):
+ custom_regex_text, custom_regex = custom_regex_load(custom_regex)
+
if not in_colnames:
error_message = "Please enter one column name to use for cleaning and finding topics."
print(error_message)
@@ -132,7 +137,7 @@ def pre_clean(data: pd.DataFrame, in_colnames: list, data_file_name_no_ext: str,
#print("Data shape after duplicate/null removal: ", data.shape)

if anonymise_drop == "Yes":
- progress(0.6, desc= "Anonymising data")
+ progress(0.4, desc= "Anonymising data")

if '_anon' not in data_file_name_no_ext:
data_file_name_no_ext = data_file_name_no_ext + "_anon"
@@ -261,7 +266,12 @@ def extract_topics(
vectoriser_state = CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=min_word_occurence_slider, max_df=max_word_occurence_slider)

output_list = []
- file_list = [string.name for string in in_files]
+
+ # If in_file is a string file path, otherwise assume it is a Gradio file input component
+ if isinstance(in_files, str):
+ file_list = [in_files]
+ else:
+ file_list = [string.name for string in in_files]

if calc_probs == "No":
calc_probs = False
@@ -352,6 +362,10 @@ def extract_topics(
else:
embeddings_file_name = output_folder + data_file_name_no_ext + '_' + 'large_embeddings_compress.npz'

+ print("output_folder:", output_folder)
+ print("data_file_name_no_ext:", data_file_name_no_ext)
+ print("embeddings_file_name:", embeddings_file_name)
+
np.savez_compressed(embeddings_file_name, embeddings_out)

output_list.append(embeddings_file_name)
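For context, the GGUF file named in the new defaults above can be fetched with huggingface_hub; this is a hedged sketch of how such a file is typically obtained, not necessarily how representation_model.py does it:

from huggingface_hub import hf_hub_download

# Repo and file names match the new defaults in topic_core_funcs.py
hf_model_name = "bartowski/Llama-3.2-3B-Instruct-GGUF"
hf_model_file = "Llama-3.2-3B-Instruct-Q5_K_M.gguf"

local_path = hf_hub_download(repo_id=hf_model_name, filename=hf_model_file)
print("Downloaded GGUF model to:", local_path)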
requirements.txt CHANGED
@@ -1,18 +1,23 @@
- gradio==4.44.1
- boto3
- transformers==4.41.2
- accelerate==0.26.1
- torch==2.4.0
- bertopic==0.16.2
+ hdbscan==0.8.40
+ pandas==2.2.3
+ plotly==5.24.1
+ scikit-learn==1.5.2
+ umap-learn==0.5.7
+ gradio==5.6.0
+ boto3==1.35.64
+ transformers==4.46.3
+ accelerate==1.1.1
+ torch==2.5.1
+ bertopic==0.16.4
spacy==3.8.0
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
- pyarrow==17.0.0
- openpyxl==3.1.2
- Faker==22.2.0
- presidio_analyzer==2.2.354
- presidio_anonymizer==2.2.354
- scipy==1.11.4
- polars==0.20.6
- sentence-transformers==3.0.1
- llama-cpp-python==0.2.87 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
- numpy==1.26.4
+ pyarrow
+ openpyxl
+ Faker
+ presidio_analyzer==2.2.355
+ presidio_anonymizer==2.2.355
+ scipy
+ polars
+ sentence-transformers==3.2.0
+ llama-cpp-python==0.3.2 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
+ #numpy==1.26.4
requirements_aws.txt CHANGED
@@ -1,18 +1,18 @@
- hdbscan==0.8.38.post1
- pandas==2.2.2
- plotly==5.23.0
- scikit-learn==1.5.1
- umap-learn==0.5.6
- boto3
+ hdbscan==0.8.40
+ pandas==2.2.3
+ plotly==5.24.1
+ scikit-learn==1.5.2
+ umap-learn==0.5.7
+ boto3==1.35.64
spacy==3.8.0
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
- gradio==4.44.1
- pyarrow==17.0.0
- openpyxl==3.1.2
- Faker==22.2.0
- presidio_analyzer==2.2.354
- presidio_anonymizer==2.2.354
- scipy==1.11.4
- polars==0.20.6
- transformers==4.44.0
- numpy==1.26.4
+ gradio==5.6.0
+ pyarrow
+ openpyxl
+ Faker
+ presidio_analyzer==2.2.35
+ presidio_anonymizer==2.2.35
+ scipy
+ polars
+ transformers==4.46.3
+ #numpy==1.26.4
requirements_gpu.txt CHANGED
@@ -1,19 +1,25 @@
- gradio==4.44.1
- boto3
- transformers==4.41.2
- accelerate==0.26.1
- bertopic==0.16.2
+ hdbscan==0.8.40
+ pandas==2.2.3
+ plotly==5.24.1
+ scikit-learn==1.5.2
+ umap-learn==0.5.7
+ gradio==5.6.0
+ boto3==1.35.64
+ transformers==4.46.3
+ accelerate==1.1.1
+ torch==2.5.1 --index-url https://download.pytorch.org/whl/cu121
+ bertopic==0.16.4
spacy==3.8.0
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
- pyarrow==17.0.0
- openpyxl==3.1.3
- Faker==22.2.0
- presidio_analyzer==2.2.354
- presidio_anonymizer==2.2.354
- scipy==1.11.4
- polars==0.20.6
- llama-cpp-python==0.2.87 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
+ pyarrow
+ openpyxl
+ Faker
+ presidio_analyzer==2.2.355
+ presidio_anonymizer==2.2.355
+ scipy
+ polars
+ llama-cpp-python==0.3.2 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
torch --index-url https://download.pytorch.org/whl/cu121
- sentence-transformers==3.0.1
- numpy==1.26.4
+ sentence-transformers==3.2.0
+ #numpy==1.26.4

run_cmd_line_example_command.txt ADDED
@@ -0,0 +1,3 @@
+ # To just get embeddings out
+
+ python run_from_cmd_line.py --data_file "C:\Users\SPedrickCase\OneDrive - Lambeth Council\Apps\topic_modelling\examples\combined_case_notes.csv" --in_colnames "Case Note" --clean_text Yes --drop_duplicate_text No --anonymise_drop Yes --split_sentence_drop No --custom_regex_file "C:\Users\SPedrickCase\OneDrive - Lambeth Council\Apps\topic_modelling\examples\regex_to_remove.csv" --embeddings_high_quality_mode Yes --return_only_embeddings_drop Yes --output_folder "C:\Users\SPedrickCase\OneDrive - Lambeth Council\2024\ASC Project\asc_predict\apps\topic_modelling\output"
run_from_cmd_line.py ADDED
@@ -0,0 +1,88 @@
+ import argparse
+ import pandas as pd
+ import numpy as np
+ from funcs.topic_core_funcs import pre_clean, extract_topics
+ from funcs.helper_functions import custom_regex_load, initial_file_load, output_folder
+ from sklearn.feature_extraction.text import CountVectorizer
+
+ print("Output folder:", output_folder)
+
+ def main():
+
+ parser = argparse.ArgumentParser(description="Run pre_clean and extract_topics from command line.")
+
+ # Arguments for pre_clean
+ parser.add_argument('--data_file', type=str, required=True, help='Path to the data file (csv, xlsx, or parquet).')
+ parser.add_argument('--in_colnames', type=str, required=True, help='Column name to find topics.')
+ parser.add_argument('--custom_regex_file', type=str, help='Path to custom regex removal file.', default=None)
+ parser.add_argument('--clean_text', type=str, choices=['Yes', 'No'], default='No', help='Remove html, URLs, etc.')
+ parser.add_argument('--drop_duplicate_text', type=str, choices=['Yes', 'No'], default='No', help='Remove duplicate text.')
+ parser.add_argument('--anonymise_drop', type=str, choices=['Yes', 'No'], default='No', help='Redact personal information.')
+ parser.add_argument('--split_sentence_drop', type=str, choices=['Yes', 'No'], default='No', help='Split text into sentences.')
+ parser.add_argument('--min_sentence_length_num', type=int, default=5, help='Min char length of split sentences.')
+
+ parser.add_argument('--min_docs_slider', type=int, default=5, help='Minimum number of similar documents needed to make a topic.')
+ parser.add_argument('--max_topics_slider', type=int, default=0, help='Maximum number of topics.')
+ parser.add_argument('--min_word_occurence_slider', type=float, default=0.01, help='Minimum word occurrence proportion.')
+ parser.add_argument('--max_word_occurence_slider', type=float, default=0.95, help='Maximum word occurrence proportion.')
+ parser.add_argument('--embeddings_high_quality_mode', type=str, choices=['Yes', 'No'], default='No', help='Use high-quality embeddings.')
+ parser.add_argument('--zero_shot_similarity', type=float, default=0.55, help='Minimum similarity for zero-shot topic assignment.')
+ parser.add_argument('--seed_number', type=int, default=42, help='Random seed for processing.')
+ parser.add_argument('--return_only_embeddings_drop', type=str, default="No", help='Return only embeddings from the function, do not assign topics.')
+ parser.add_argument('--output_folder', type=str, default=output_folder, help='Output folder for results.')
+
+ args = parser.parse_args()
+
+ # Load data
+ #data = pd.read_csv(args.data_file) if args.data_file.endswith('.csv') else pd.read_excel(args.data_file)
+ #custom_regex = pd.read_csv(args.custom_regex_file) if args.custom_regex_file else pd.DataFrame()
+
+ in_colnames_all, in_label, data, output_single_text, topic_model_state, embeddings_state, data_file_name_no_ext, label_list_state, original_data_state = initial_file_load(args.data_file)
+ custom_regex_output_text, custom_regex = custom_regex_load(args.custom_regex_file) if args.custom_regex_file else pd.DataFrame()
+
+ print("data_file_name_no_ext:", data_file_name_no_ext)
+
+ # Pre-clean data
+ pre_clean_output = pre_clean(
+ data=data,
+ in_colnames=[args.in_colnames],
+ data_file_name_no_ext=data_file_name_no_ext,
+ custom_regex=custom_regex,
+ clean_text=args.clean_text,
+ drop_duplicate_text=args.drop_duplicate_text,
+ anonymise_drop=args.anonymise_drop,
+ sentence_split_drop=args.split_sentence_drop,
+ min_sentence_length=args.min_sentence_length_num,
+ embeddings_state=np.array([]),
+ output_folder=output_folder
+ )
+
+ # Extract topics
+ extract_topics_output = extract_topics(
+ data=pre_clean_output[2],
+ in_files=args.data_file,
+ min_docs_slider=args.min_docs_slider,
+ in_colnames=[args.in_colnames],
+ max_topics_slider=args.max_topics_slider,
+ candidate_topics=[],
+ data_file_name_no_ext=data_file_name_no_ext,
+ custom_labels_df=pd.DataFrame(),
+ return_intermediate_files='Yes',
+ embeddings_super_compress='No',
+ high_quality_mode=args.embeddings_high_quality_mode,
+ save_topic_model='No',
+ embeddings_out=np.array([]),
+ embeddings_type_state='',
+ zero_shot_similarity=args.zero_shot_similarity,
+ calc_probs='No',
+ vectoriser_state=CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=args.min_word_occurence_slider, max_df=args.max_word_occurence_slider),
+ min_word_occurence_slider=args.min_word_occurence_slider,
+ max_word_occurence_slider=args.max_word_occurence_slider,
+ split_sentence_drop=args.split_sentence_drop,
+ random_seed=args.seed_number,
+ return_only_embeddings_drop=args.return_only_embeddings_drop,
+ output_folder=output_folder
+ )
+
+ if __name__ == "__main__":
+ main()