Spaces:
Running
Running
seanpedrickcase
committed on
Commit
·
1e2bb3e
1
Parent(s):
55f0ce3
Only aggregate topics that are not 'other'; allow a minimum sentence length; the default max_topics now auto-aggregates topics. Added Cognito auth functionality (boto3 with AWS).
Browse files- app.py +21 -12
- funcs/anonymiser.py +49 -6
- funcs/auth.py +54 -0
- funcs/clean_funcs.py +24 -10
- funcs/helper_functions.py +85 -16
- funcs/topic_core_funcs.py +21 -7
- requirements.txt +1 -0
- requirements_gpu.txt +2 -1
app.py
CHANGED
@@ -1,14 +1,12 @@
|
|
1 |
-
|
2 |
-
# import os
|
3 |
-
# os.system("pip install scipy==1.11.4")
|
4 |
-
|
5 |
import gradio as gr
|
6 |
import pandas as pd
|
7 |
import numpy as np
|
8 |
|
9 |
from funcs.topic_core_funcs import pre_clean, optimise_zero_shot, extract_topics, reduce_outliers, represent_topics, visualise_topics, save_as_pytorch_model, change_default_vis_col
|
10 |
-
from funcs.helper_functions import initial_file_load, custom_regex_load, ensure_output_folder_exists, output_folder, get_connection_params
|
11 |
from sklearn.feature_extraction.text import CountVectorizer
|
|
|
12 |
|
13 |
min_word_occurence_slider_default = 0.01
|
14 |
max_word_occurence_slider_default = 0.95
|
@@ -34,6 +32,7 @@ with block:
|
|
34 |
vectoriser_state = gr.State(CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=min_word_occurence_slider_default, max_df=max_word_occurence_slider_default))
|
35 |
|
36 |
session_hash_state = gr.State("")
|
|
|
37 |
|
38 |
gr.Markdown(
|
39 |
"""
|
@@ -55,10 +54,14 @@ with block:
|
|
55 |
|
56 |
with gr.Accordion("Clean data", open = False):
|
57 |
with gr.Row():
|
58 |
-
clean_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Remove html,
|
59 |
drop_duplicate_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Remove duplicate text, drop < 50 character strings.")
|
60 |
anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Anonymise data on file load. Personal details are redacted - not 100% effective and slow!")
|
|
|
61 |
split_sentence_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Split text into sentences. Useful for small datasets.")
|
|
|
|
|
|
|
62 |
with gr.Row():
|
63 |
custom_regex = gr.UploadButton(label="Import custom regex removal file", file_count="multiple")
|
64 |
gr.Markdown("""Import custom regex - csv table with one column of regex patterns with no header. Strings matching this pattern will be removed. Example pattern: (?i)roosevelt for case insensitive removal of this term.""")
|
@@ -76,8 +79,8 @@ with block:
|
|
76 |
with gr.Accordion("Topic modelling settings - change documents per topic, max topics, frequency of terms", open = False):
|
77 |
|
78 |
with gr.Row():
|
79 |
-
min_docs_slider = gr.Slider(minimum = 2, maximum = 1000, value =
|
80 |
-
max_topics_slider = gr.Slider(minimum =
|
81 |
with gr.Row():
|
82 |
min_word_occurence_slider = gr.Slider(minimum = 0.001, maximum = 0.9, value = min_word_occurence_slider_default, step = 0.001, label = "Keep terms that appear in this minimum proportion of documents. Avoids creating topics with very uncommon words.")
|
83 |
max_word_occurence_slider = gr.Slider(minimum = 0.1, maximum = 1.0, value =max_word_occurence_slider_default, step = 0.01, label = "Keep terms that appear in less than this maximum proportion of documents. Avoids very common words in topic names.")
|
@@ -131,7 +134,7 @@ with block:
|
|
131 |
|
132 |
# Clean data
|
133 |
custom_regex.upload(fn=custom_regex_load, inputs=[custom_regex], outputs=[custom_regex_text, custom_regex_state])
|
134 |
-
clean_btn.click(fn=pre_clean, inputs=[data_state, in_colnames, data_file_name_no_ext_state, custom_regex_state, clean_text, drop_duplicate_text, anonymise_drop, split_sentence_drop], outputs=[output_single_text, output_file, data_state, data_file_name_no_ext_state, embeddings_state], api_name="clean")
|
135 |
|
136 |
# Optimise for keeping only zero-shot topics
|
137 |
zero_shot_optimiser_btn.click(fn=optimise_zero_shot, outputs=[quality_mode_drop, min_docs_slider, max_topics_slider, min_word_occurence_slider, max_word_occurence_slider, zero_shot_similarity])
|
@@ -152,8 +155,14 @@ with block:
|
|
152 |
plot_btn.click(fn=visualise_topics, inputs=[topic_model_state, data_state, data_file_name_no_ext_state, quality_mode_drop, embeddings_state, in_label, in_colnames, legend_label, sample_slide, visualisation_type_radio, seed_number], outputs=[vis_output_single_text, out_plot_file, plot, plot_2], api_name="plot")
|
153 |
|
154 |
# Get session hash from connection parameters
|
155 |
-
block.load(get_connection_params, inputs=None, outputs=[session_hash_state])
|
|
|
|
|
|
|
|
|
156 |
|
157 |
-
# Launch the Gradio app
|
158 |
if __name__ == "__main__":
|
159 |
-
|
|
|
|
|
|
|
|
1 |
+
import os
|
|
|
|
|
|
|
2 |
import gradio as gr
|
3 |
import pandas as pd
|
4 |
import numpy as np
|
5 |
|
6 |
from funcs.topic_core_funcs import pre_clean, optimise_zero_shot, extract_topics, reduce_outliers, represent_topics, visualise_topics, save_as_pytorch_model, change_default_vis_col
|
7 |
+
from funcs.helper_functions import initial_file_load, custom_regex_load, ensure_output_folder_exists, output_folder, get_connection_params, get_or_create_env_var
|
8 |
from sklearn.feature_extraction.text import CountVectorizer
|
9 |
+
from funcs.auth import authenticate_user, download_file_from_s3
|
10 |
|
11 |
min_word_occurence_slider_default = 0.01
|
12 |
max_word_occurence_slider_default = 0.95
|
|
|
32 |
vectoriser_state = gr.State(CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=min_word_occurence_slider_default, max_df=max_word_occurence_slider_default))
|
33 |
|
34 |
session_hash_state = gr.State("")
|
35 |
+
s3_output_folder_state = gr.State("")
|
36 |
|
37 |
gr.Markdown(
|
38 |
"""
|
|
|
54 |
|
55 |
with gr.Accordion("Clean data", open = False):
|
56 |
with gr.Row():
|
57 |
+
clean_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Remove html, URLs, non-ASCII, multiple digits, emails, postcodes (UK).")
|
58 |
drop_duplicate_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Remove duplicate text, drop < 50 character strings.")
|
59 |
anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Anonymise data on file load. Personal details are redacted - not 100% effective and slow!")
|
60 |
+
#with gr.Row():
|
61 |
split_sentence_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Split text into sentences. Useful for small datasets.")
|
62 |
+
#additional_custom_delimiters_drop = gr.Dropdown(choices=["and", ",", "as well as", "also"], multiselect=True, label="Additional custom delimiters to split sentences.")
|
63 |
+
min_sentence_length_num = gr.Number(value=5, label="Min char length of split sentences")
|
64 |
+
|
65 |
with gr.Row():
|
66 |
custom_regex = gr.UploadButton(label="Import custom regex removal file", file_count="multiple")
|
67 |
gr.Markdown("""Import custom regex - csv table with one column of regex patterns with no header. Strings matching this pattern will be removed. Example pattern: (?i)roosevelt for case insensitive removal of this term.""")
|
|
|
79 |
with gr.Accordion("Topic modelling settings - change documents per topic, max topics, frequency of terms", open = False):
|
80 |
|
81 |
with gr.Row():
|
82 |
+
min_docs_slider = gr.Slider(minimum = 2, maximum = 1000, value = 5, step = 1, label = "Minimum number of similar documents needed to make a topic.")
|
83 |
+
max_topics_slider = gr.Slider(minimum = 0, maximum = 500, value = 0, step = 1, label = "Maximum number of topics. If set to 0, then will choose topics to merge automatically.")
|
84 |
with gr.Row():
|
85 |
min_word_occurence_slider = gr.Slider(minimum = 0.001, maximum = 0.9, value = min_word_occurence_slider_default, step = 0.001, label = "Keep terms that appear in this minimum proportion of documents. Avoids creating topics with very uncommon words.")
|
86 |
max_word_occurence_slider = gr.Slider(minimum = 0.1, maximum = 1.0, value =max_word_occurence_slider_default, step = 0.01, label = "Keep terms that appear in less than this maximum proportion of documents. Avoids very common words in topic names.")
|
|
|
134 |
|
135 |
# Clean data
|
136 |
custom_regex.upload(fn=custom_regex_load, inputs=[custom_regex], outputs=[custom_regex_text, custom_regex_state])
|
137 |
+
clean_btn.click(fn=pre_clean, inputs=[data_state, in_colnames, data_file_name_no_ext_state, custom_regex_state, clean_text, drop_duplicate_text, anonymise_drop, split_sentence_drop, min_sentence_length_num, embeddings_state], outputs=[output_single_text, output_file, data_state, data_file_name_no_ext_state, embeddings_state], api_name="clean")
|
138 |
|
139 |
# Optimise for keeping only zero-shot topics
|
140 |
zero_shot_optimiser_btn.click(fn=optimise_zero_shot, outputs=[quality_mode_drop, min_docs_slider, max_topics_slider, min_word_occurence_slider, max_word_occurence_slider, zero_shot_similarity])
|
|
|
155 |
plot_btn.click(fn=visualise_topics, inputs=[topic_model_state, data_state, data_file_name_no_ext_state, quality_mode_drop, embeddings_state, in_label, in_colnames, legend_label, sample_slide, visualisation_type_radio, seed_number], outputs=[vis_output_single_text, out_plot_file, plot, plot_2], api_name="plot")
|
156 |
|
157 |
# Get session hash from connection parameters
|
158 |
+
block.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state])
|
159 |
+
|
160 |
+
# Resolve the Cognito switch once; '0' (auth disabled) is the fallback passed to the helper.
COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
print(f'The value of COGNITO_AUTH is {COGNITO_AUTH}')

if __name__ == "__main__":
    # Branch on the value resolved above rather than re-reading os.environ,
    # so the launch mode cannot raise KeyError and always matches what was logged.
    if COGNITO_AUTH == "1":
        # Require a login; credentials are checked by authenticate_user.
        block.queue().launch(show_error=True, auth=authenticate_user)
    else:
        # No authentication: open the app in a local browser tab.
        block.queue().launch(show_error=True, inbrowser=True)
|
funcs/anonymiser.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
from spacy.cli import download
|
2 |
import spacy
|
|
|
3 |
from funcs.presidio_analyzer_custom import analyze_dict
|
4 |
spacy.prefer_gpu()
|
5 |
|
@@ -24,11 +25,6 @@ def spacy_model_installed(model_name):
|
|
24 |
model_name = "en_core_web_sm"
|
25 |
nlp = spacy_model_installed(model_name)
|
26 |
|
27 |
-
#spacy.load(model_name)
|
28 |
-
# Need to overwrite version of gradio present in Huggingface spaces as it doesn't have like buttons/avatars (Oct 2023)
|
29 |
-
#os.system("pip uninstall -y gradio")
|
30 |
-
#os.system("pip install gradio==3.50.0")
|
31 |
-
#os.system("python -m spacy download en_core_web_lg")
|
32 |
|
33 |
import re
|
34 |
import secrets
|
@@ -43,16 +39,63 @@ from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, PatternRecogn
|
|
43 |
from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
|
44 |
from presidio_anonymizer.entities import OperatorConfig
|
45 |
|
|
|
|
|
46 |
# Function to Split Text and Create DataFrame using SpaCy
|
47 |
-
def expand_sentences_spacy(df, colname, nlp=nlp):
|
48 |
expanded_data = []
|
|
|
|
|
|
|
|
|
49 |
df = df.drop('index', axis = 1, errors="ignore").reset_index(names='index')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
for index, row in df.iterrows():
|
51 |
doc = nlp(row[colname])
|
52 |
for sent in doc.sents:
|
53 |
expanded_data.append({'document_index': row['index'], colname: sent.text})
|
54 |
return pd.DataFrame(expanded_data)
|
55 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
56 |
def anon_consistent_names(df):
|
57 |
# ## Pick out common names and replace them with the same person value
|
58 |
df_dict = df.to_dict(orient="list")
|
|
|
1 |
from spacy.cli import download
|
2 |
import spacy
|
3 |
+
from spacy.pipeline import Sentencizer
|
4 |
from funcs.presidio_analyzer_custom import analyze_dict
|
5 |
spacy.prefer_gpu()
|
6 |
|
|
|
25 |
model_name = "en_core_web_sm"
|
26 |
nlp = spacy_model_installed(model_name)
|
27 |
|
|
|
|
|
|
|
|
|
|
|
28 |
|
29 |
import re
|
30 |
import secrets
|
|
|
39 |
from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
|
40 |
from presidio_anonymizer.entities import OperatorConfig
|
41 |
|
42 |
+
from typing import List
|
43 |
+
|
44 |
# Function to Split Text and Create DataFrame using SpaCy
|
45 |
+
def expand_sentences_spacy(df, colname, custom_delimiters:List[str]=None, nlp=nlp):
    """Split the text in one column into sentences, one output row per sentence.

    Args:
        df (pd.DataFrame): Input data. Any existing 'index' column is dropped
            and replaced by the DataFrame's own index.
        colname (str): Name of the column holding the text to split.
        custom_delimiters (List[str], optional): Currently unused; kept so
            existing callers that pass it keep working. Defaults to None
            (previously a shared mutable ``[]``).
        nlp: Callable returning an object with a ``.sents`` iterable of
            sentence spans (each exposing ``.text``). Defaults to the
            module-level spaCy pipeline.

    Returns:
        pd.DataFrame: Rows with 'document_index' (the source row's index
        value) and ``colname`` (the sentence text).
    """
    # Expose the original index as a column so each sentence can be traced
    # back to the document it came from.
    df = df.drop('index', axis = 1, errors="ignore").reset_index(names='index')

    expanded_data = []
    # Only two columns are needed, so walk them in lockstep instead of
    # building a full row Series per document with iterrows().
    for doc_index, text in zip(df['index'], df[colname]):
        expanded_data.extend(
            {'document_index': doc_index, colname: sent.text}
            for sent in nlp(text).sents
        )

    return pd.DataFrame(expanded_data)
|
66 |
|
67 |
+
# def expand_sentences_spacy(df, colname, custom_delimiters:List[str]=[], nlp=nlp):
|
68 |
+
|
69 |
+
# #print("Custom delimiters:", custom_delimiters)
|
70 |
+
|
71 |
+
# expanded_data = []
|
72 |
+
# df = df.drop('index', axis = 1, errors="ignore").reset_index(names='index')
|
73 |
+
|
74 |
+
# sentencizer = Sentencizer()
|
75 |
+
|
76 |
+
# new_punct_chars = sentencizer.default_punct_chars
|
77 |
+
# if custom_delimiters:
|
78 |
+
# new_punct_chars.extend(custom_delimiters)
|
79 |
+
|
80 |
+
# pattern = "(" + "|".join(re.escape(punct) for punct in new_punct_chars) + ")"
|
81 |
+
# #print("Patterns:", pattern)
|
82 |
+
# split_list = []
|
83 |
+
|
84 |
+
# for idx, string in enumerate(df[colname]):
|
85 |
+
# new_split = re.split(pattern, string)
|
86 |
+
# for n, sentence in enumerate(new_split):
|
87 |
+
# if sentence:
|
88 |
+
# # If there is a split delimiter in the 'sentence' after, add it to the previous sentence as it will be removed at a later step
|
89 |
+
# if n + 1 < len(new_split):
|
90 |
+
# if new_split[n + 1]:
|
91 |
+
# # If the next split is in the list of split characters, then add it to this current sentence
|
92 |
+
# if new_split[n + 1] in new_punct_chars:
|
93 |
+
# split_list.append({'document_index': idx, colname: sentence + new_split[n + 1]})
|
94 |
+
# else:
|
95 |
+
# split_list.append({'document_index': idx, colname: sentence})
|
96 |
+
|
97 |
+
# return pd.DataFrame(split_list)
|
98 |
+
|
99 |
def anon_consistent_names(df):
|
100 |
# ## Pick out common names and replace them with the same person value
|
101 |
df_dict = df.to_dict(orient="list")
|
funcs/auth.py
ADDED
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import boto3
|
2 |
+
from funcs.helper_functions import get_or_create_env_var
|
3 |
+
|
4 |
+
client_id = get_or_create_env_var('AWS_CLIENT_ID', 'aws_client_placeholder') # This client id is borrowed from async gradio app client
|
5 |
+
print(f'The value of AWS_CLIENT_ID is {client_id}')
|
6 |
+
|
7 |
+
user_pool_id = get_or_create_env_var('AWS_USER_POOL_ID', 'aws_user_pool_placeholder')
|
8 |
+
print(f'The value of AWS_USER_POOL_ID is {user_pool_id}')
|
9 |
+
|
10 |
+
def authenticate_user(username, password, user_pool_id=user_pool_id, client_id=client_id):
    """Authenticate a user against an AWS Cognito user pool.

    Args:
        username (str): The username of the user.
        password (str): The password of the user.
        user_pool_id (str): The ID of the Cognito user pool. Not used by the
            current implementation; kept so the signature stays compatible.
        client_id (str): The ID of the Cognito user pool client.

    Returns:
        bool: True if Cognito returned an 'AuthenticationResult', False
        otherwise (including on any error, which is printed and swallowed so
        a failed login never raises out of the auth callback).
    """
    # Build the client inside a try as well: a configuration problem should
    # deny the login with a logged message rather than propagate.
    try:
        client = boto3.client('cognito-idp') # Cognito Identity Provider client
    except Exception as e:
        print(f"An error occurred: {e}")
        return False

    try:
        response = client.initiate_auth(
            AuthFlow='USER_PASSWORD_AUTH',
            AuthParameters={
                'USERNAME': username,
                'PASSWORD': password,
            },
            ClientId=client_id
        )
    except (client.exceptions.NotAuthorizedException,
            client.exceptions.UserNotFoundException):
        # Wrong credentials or unknown user: a normal "login refused" outcome.
        return False
    except Exception as e:
        print(f"An error occurred: {e}")
        return False

    # A response without an AuthenticationResult is treated as a failed login.
    return bool(response.get('AuthenticationResult'))
|
48 |
+
|
49 |
+
|
50 |
+
def download_file_from_s3(bucket_name, key, local_file_path):
    """Download a single S3 object to a local path and print a confirmation.

    Args:
        bucket_name (str): Name of the S3 bucket.
        key (str): Object key inside the bucket.
        local_file_path (str): Destination path on the local filesystem.
    """
    s3_client = boto3.client('s3')
    s3_client.download_file(bucket_name, key, local_file_path)

    source_uri = f"s3://{bucket_name}/{key}"
    print(f"File downloaded from S3: {source_uri} to {local_file_path}")
|
funcs/clean_funcs.py
CHANGED
@@ -8,26 +8,40 @@ custom_words = []
|
|
8 |
my_stop_words = custom_words
|
9 |
|
10 |
# #### Some of my cleaning functions
|
|
|
11 |
html_pattern_regex = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0| '
|
12 |
html_start_pattern_end_dots_regex = r'<(.*?)\.\.'
|
|
|
13 |
email_pattern_regex = r'\S*@\S*\s?'
|
14 |
num_pattern_regex = r'[0-9]+'
|
15 |
-
nums_two_more_regex = r'\b[0-9]{2,}\b|\b[0-9]+\s[0-9]+\b'
|
16 |
postcode_pattern_regex = r'(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$)'
|
17 |
multiple_spaces_regex = r'\s{2,}'
|
18 |
|
19 |
def initial_clean(texts, custom_regex, progress=gr.Progress()):
|
|
|
20 |
texts = pl.Series(texts).str.strip_chars()
|
21 |
-
text = texts.str.replace_all(html_pattern_regex, ' ')
|
22 |
-
text = text.str.replace_all(html_start_pattern_end_dots_regex, ' ')
|
23 |
-
text = text.str.replace_all(email_pattern_regex, ' ')
|
24 |
-
text = text.str.replace_all(nums_two_more_regex, ' ')
|
25 |
-
text = text.str.replace_all(postcode_pattern_regex, ' ')
|
26 |
-
text = text.str.replace_all(multiple_spaces_regex, ' ')
|
27 |
-
|
28 |
-
text = text.to_list()
|
29 |
|
30 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
|
32 |
def regex_clean(texts, custom_regex, progress=gr.Progress()):
|
33 |
texts = pl.Series(texts).str.strip_chars()
|
|
|
8 |
my_stop_words = custom_words
|
9 |
|
10 |
# #### Some of my cleaning functions
|
11 |
+
url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+|(?:www\.)[a-zA-Z0-9._-]+\.[a-zA-Z]{2,}'
|
12 |
html_pattern_regex = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0| '
|
13 |
html_start_pattern_end_dots_regex = r'<(.*?)\.\.'
|
14 |
+
non_ascii_pattern = r'[^\x00-\x7F]+'
|
15 |
email_pattern_regex = r'\S*@\S*\s?'
|
16 |
num_pattern_regex = r'[0-9]+'
|
17 |
+
nums_two_more_regex = r'\b\d+[\.|\,]\d+\b|\b[0-9]{2,}\b|\b[0-9]+\s[0-9]+\b' # Should match two digit numbers or more, and also if there are full stops or commas in between
|
18 |
postcode_pattern_regex = r'(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$)'
|
19 |
multiple_spaces_regex = r'\s{2,}'
|
20 |
|
21 |
def initial_clean(texts, custom_regex, progress=gr.Progress()):
    """Strip each text and blank out URLs, HTML, non-ASCII runs, emails, numbers and postcodes.

    Every match of each module-level pattern is replaced by a single space.
    Patterns are applied in the order listed below; whitespace collapsing
    runs last so the spaces inserted by earlier substitutions are merged.

    Args:
        texts: Iterable of strings accepted by ``pl.Series``.
        custom_regex: Not used by this function.
        progress: Gradio progress tracker; not used by this function.

    Returns:
        list: The cleaned strings, in input order.
    """
    ordered_patterns = (
        url_pattern,
        html_pattern_regex,
        html_start_pattern_end_dots_regex,
        non_ascii_pattern,
        email_pattern_regex,
        nums_two_more_regex,
        postcode_pattern_regex,
        multiple_spaces_regex,
    )

    cleaned = pl.Series(texts).str.strip_chars()
    for regex in ordered_patterns:
        cleaned = cleaned.str.replace_all(regex, ' ')

    return cleaned.to_list()
|
45 |
|
46 |
def regex_clean(texts, custom_regex, progress=gr.Progress()):
|
47 |
texts = pl.Series(texts).str.strip_chars()
|
funcs/helper_functions.py
CHANGED
@@ -1,4 +1,3 @@
|
|
1 |
-
import sys
|
2 |
import os
|
3 |
import zipfile
|
4 |
import re
|
@@ -45,35 +44,66 @@ def ensure_output_folder_exists():
|
|
45 |
else:
|
46 |
print(f"The 'output/' folder already exists.")
|
47 |
|
48 |
-
def get_connection_params(request: gr.Request):
|
49 |
-
|
50 |
-
|
51 |
-
'''
|
52 |
if request:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
53 |
|
54 |
# print("Request headers dictionary:", request.headers)
|
55 |
# print("All host elements", request.client)
|
56 |
# print("IP address:", request.client.host)
|
57 |
# print("Query parameters:", dict(request.query_params))
|
|
|
|
|
58 |
print("Session hash:", request.session_hash)
|
59 |
|
60 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
61 |
out_session_hash = request.headers['x-cognito-id']
|
62 |
base_folder = "user-files/"
|
63 |
-
|
64 |
|
65 |
else:
|
66 |
out_session_hash = request.session_hash
|
67 |
base_folder = "temp-files/"
|
68 |
-
#print("Cognito ID not found. Using session hash as save folder
|
69 |
|
70 |
output_folder = base_folder + out_session_hash + "/"
|
71 |
-
#
|
|
|
72 |
|
73 |
-
return out_session_hash
|
74 |
else:
|
75 |
print("No session parameters found.")
|
76 |
-
return ""
|
77 |
|
78 |
def detect_file_type(filename):
|
79 |
"""Detect the file type based on its extension."""
|
@@ -286,6 +316,24 @@ def save_topic_outputs(topic_model: BERTopic, data_file_name_no_ext: str, output
|
|
286 |
columns_found = [column for column in columns_to_check if column in topic_model.get_document_info(docs).columns]
|
287 |
doc_dets = topic_model.get_document_info(docs)[columns_found]
|
288 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
289 |
# If you have created a 'sentence split' dataset from the cleaning options, map these sentences back to the original document.
|
290 |
try:
|
291 |
if split_sentence_drop == "Yes":
|
@@ -296,21 +344,42 @@ def save_topic_outputs(topic_model: BERTopic, data_file_name_no_ext: str, output
|
|
296 |
grouped = doc_dets.groupby('parent_document_index')
|
297 |
|
298 |
# 2. Aggregate Topics and Probabilities:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
299 |
def aggregate_topics(group):
|
300 |
original_text = ' '.join(group['Document'])
|
301 |
-
|
|
|
|
|
302 |
|
303 |
if 'Name' in group.columns:
|
304 |
-
|
|
|
305 |
else:
|
306 |
topic_names = None
|
307 |
|
308 |
if 'Probability' in group.columns:
|
309 |
-
probabilities
|
|
|
310 |
else:
|
311 |
-
probabilities = None
|
|
|
|
|
312 |
|
313 |
-
return pd.Series({'Document':original_text, 'Topic numbers': topics, 'Topic names': topic_names, 'Probabilities': probabilities})
|
314 |
|
315 |
#result_df = grouped.apply(aggregate_topics).reset_index()
|
316 |
doc_det_agg = grouped.apply(lambda x: aggregate_topics(x)).reset_index()
|
|
|
|
|
1 |
import os
|
2 |
import zipfile
|
3 |
import re
|
|
|
44 |
else:
|
45 |
print(f"The 'output/' folder already exists.")
|
46 |
|
47 |
+
async def get_connection_params(request: gr.Request):
    """Derive the session identifier and output folder for a Gradio request.

    If both the CUSTOM_CLOUDFRONT_HEADER and CUSTOM_CLOUDFRONT_HEADER_VALUE
    settings are non-empty and the request carries that header, the header's
    value must equal the configured value.

    The identifier is then taken from, in priority order:
      1. ``request.username`` (direct Cognito login)   -> "user-files/"
      2. the 'x-cognito-id' request header             -> "user-files/"
      3. ``request.session_hash``                      -> "temp-files/"

    Args:
        request (gr.Request): The incoming Gradio request.

    Returns:
        tuple: ``(out_session_hash, output_folder)`` where output_folder is
        ``base_folder + out_session_hash + "/"``; ``("", "")`` when no
        request is available.

    Raises:
        ValueError: If the custom Cloudfront header is present but its value
            does not match the expected one.
    """
    if not request:
        print("No session parameters found.")
        return "",""

    print("Session hash:", request.session_hash)

    # Retrieving or setting CUSTOM_CLOUDFRONT_HEADER and its expected value
    CUSTOM_CLOUDFRONT_HEADER_var = get_or_create_env_var('CUSTOM_CLOUDFRONT_HEADER', '')
    CUSTOM_CLOUDFRONT_HEADER_VALUE_var = get_or_create_env_var('CUSTOM_CLOUDFRONT_HEADER_VALUE', '')

    if CUSTOM_CLOUDFRONT_HEADER_var and CUSTOM_CLOUDFRONT_HEADER_VALUE_var:
        # NOTE(review): a request that omits the header entirely is still
        # accepted, as before - confirm that this is intended.
        if CUSTOM_CLOUDFRONT_HEADER_var in request.headers:
            supplied_cloudfront_custom_value = request.headers[CUSTOM_CLOUDFRONT_HEADER_var]
            if supplied_cloudfront_custom_value != CUSTOM_CLOUDFRONT_HEADER_VALUE_var:
                # Was `raise(ValueError, "...")`, which raises a tuple and so
                # fails with TypeError instead of the intended ValueError.
                raise ValueError("Custom Cloudfront header value does not match expected value.")
            print("Custom Cloudfront header found:", supplied_cloudfront_custom_value)

    # Get output save folder from 1 - username passed in from direct Cognito login,
    # 2 - Cognito ID header passed through a Lambda authenticator, 3 - the session hash.
    if request.username:
        out_session_hash = request.username
        base_folder = "user-files/"
    elif 'x-cognito-id' in request.headers:
        out_session_hash = request.headers['x-cognito-id']
        base_folder = "user-files/"
        print("Cognito ID found:", out_session_hash)
    else:
        out_session_hash = request.session_hash
        base_folder = "temp-files/"

    output_folder = base_folder + out_session_hash + "/"

    return out_session_hash, output_folder
|
107 |
|
108 |
def detect_file_type(filename):
|
109 |
"""Detect the file type based on its extension."""
|
|
|
316 |
columns_found = [column for column in columns_to_check if column in topic_model.get_document_info(docs).columns]
|
317 |
doc_dets = topic_model.get_document_info(docs)[columns_found]
|
318 |
|
319 |
+
### If there are full topic probabilities, join these on to the document details df
|
320 |
+
def is_valid_dataframe(df):
    """
    Tell whether an object is a pandas DataFrame that is not empty.

    Args:
        df: Any object, including None.

    Returns:
        True if df is a DataFrame and ``df.empty`` is False; False for None,
        for any non-DataFrame object, and for an empty DataFrame.
    """
    # None and every other non-DataFrame object fail this single type check.
    if not isinstance(df, pd.DataFrame):
        return False
    return not df.empty
|
333 |
+
|
334 |
+
if is_valid_dataframe(topic_model.probabilities_):
|
335 |
+
doc_dets = doc_dets.merge(topic_model.probabilities_, left_index=True, right_index=True, how="left")
|
336 |
+
|
337 |
# If you have created a 'sentence split' dataset from the cleaning options, map these sentences back to the original document.
|
338 |
try:
|
339 |
if split_sentence_drop == "Yes":
|
|
|
344 |
grouped = doc_dets.groupby('parent_document_index')
|
345 |
|
346 |
# 2. Aggregate Topics and Probabilities:
|
347 |
+
# def aggregate_topics(group):
|
348 |
+
# original_text = ' '.join(group['Document'])
|
349 |
+
# topics = group['Topic'].tolist()
|
350 |
+
|
351 |
+
# if 'Name' in group.columns:
|
352 |
+
# topic_names = group['Name'].tolist()
|
353 |
+
# else:
|
354 |
+
# topic_names = None
|
355 |
+
|
356 |
+
# if 'Probability' in group.columns:
|
357 |
+
# probabilities = group['Probability'].tolist()
|
358 |
+
# else:
|
359 |
+
# probabilities = None # Or any other default value you prefer
|
360 |
+
|
361 |
+
# return pd.Series({'Document':original_text, 'Topic numbers': topics, 'Topic names': topic_names, 'Probabilities': probabilities})
|
362 |
+
|
363 |
def aggregate_topics(group):
    """Collapse one group of sentence rows into a single per-document record.

    Rows whose 'Topic' value, rendered as a string, starts with '-1' are
    left out of the topic, name and probability lists. The joined document
    text still includes every row.

    Args:
        group (pd.DataFrame): Rows of one group, with 'Document' and 'Topic'
            columns and optionally 'Name' and 'Probability'.

    Returns:
        pd.Series: 'Document' (texts joined by a space), 'Topic numbers',
        'Topic names' and 'Probabilities' (each a list, or None when the
        corresponding source column is absent).
    """
    # One keep/drop flag per row, shared by all three filtered lists.
    keep_row = [not str(topic).startswith('-1') for topic in group['Topic']]

    def kept_values(column):
        # Values of `column` for the rows that are kept.
        return [value for value, keep in zip(group[column].tolist(), keep_row) if keep]

    available = group.columns
    record = {
        'Document': ' '.join(group['Document']),
        'Topic numbers': kept_values('Topic'),
        'Topic names': kept_values('Name') if 'Name' in available else None,
        'Probabilities': kept_values('Probability') if 'Probability' in available else None,
    }
    return pd.Series(record)
|
382 |
|
|
|
383 |
|
384 |
#result_df = grouped.apply(aggregate_topics).reset_index()
|
385 |
doc_det_agg = grouped.apply(lambda x: aggregate_topics(x)).reset_index()
|
funcs/topic_core_funcs.py
CHANGED
@@ -52,7 +52,7 @@ def change_default_vis_col(in_colnames:List[str]):
|
|
52 |
else:
|
53 |
return gr.Dropdown()
|
54 |
|
55 |
-
def pre_clean(data: pd.DataFrame, in_colnames: list, data_file_name_no_ext: str, custom_regex: pd.DataFrame, clean_text: str, drop_duplicate_text: str, anonymise_drop: str, sentence_split_drop: str, embeddings_state: dict, progress: gr.Progress = gr.Progress(track_tqdm=True)) -> tuple:
|
56 |
"""
|
57 |
Pre-processes the input data by cleaning text, removing duplicates, anonymizing data, and splitting sentences based on the provided options.
|
58 |
|
@@ -65,6 +65,7 @@ def pre_clean(data: pd.DataFrame, in_colnames: list, data_file_name_no_ext: str,
|
|
65 |
drop_duplicate_text (str): Option to drop duplicate text ("Yes" or "No").
|
66 |
anonymise_drop (str): Option to anonymize data ("Yes" or "No").
|
67 |
sentence_split_drop (str): Option to split text into sentences ("Yes" or "No").
|
|
|
68 |
embeddings_state (dict): State of the embeddings.
|
69 |
progress (gr.Progress, optional): Progress tracker for the cleaning process.
|
70 |
|
@@ -140,6 +141,8 @@ def pre_clean(data: pd.DataFrame, in_colnames: list, data_file_name_no_ext: str,
|
|
140 |
anon_toc = time.perf_counter()
|
141 |
time_out = f"Anonymising text took {anon_toc - anon_tic:0.1f} seconds"
|
142 |
|
|
|
|
|
143 |
if sentence_split_drop == "Yes":
|
144 |
progress(0.6, desc= "Splitting text into sentences")
|
145 |
|
@@ -149,11 +152,14 @@ def pre_clean(data: pd.DataFrame, in_colnames: list, data_file_name_no_ext: str,
|
|
149 |
anon_tic = time.perf_counter()
|
150 |
|
151 |
data = expand_sentences_spacy(data, in_colnames_list_first)
|
152 |
-
data = data[data[in_colnames_list_first].str.len()
|
|
|
153 |
data.reset_index(inplace=True, drop=True)
|
154 |
|
155 |
anon_toc = time.perf_counter()
|
156 |
-
time_out = f"
|
|
|
|
|
157 |
|
158 |
out_data_name = output_folder + data_file_name_no_ext + "_" + today_rev + ".csv"
|
159 |
data.to_csv(out_data_name)
|
@@ -252,6 +258,9 @@ def extract_topics(
|
|
252 |
elif calc_probs == "Yes":
|
253 |
print("Calculating all probabilities.")
|
254 |
calc_probs = True
|
|
|
|
|
|
|
255 |
|
256 |
if not in_colnames:
|
257 |
error_message = "Please enter one column name to use for cleaning and finding topics."
|
@@ -279,7 +288,7 @@ def extract_topics(
|
|
279 |
# Attempt to load the model from each local location
|
280 |
for location in local_embeddings_locations:
|
281 |
try:
|
282 |
-
embedding_model = SentenceTransformer(location
|
283 |
print(f"Found local model installation at: {location}")
|
284 |
break # Exit the loop if the model is found
|
285 |
except Exception as e:
|
@@ -287,7 +296,7 @@ def extract_topics(
|
|
287 |
continue
|
288 |
else:
|
289 |
# If the loop completes without finding the model in any local location
|
290 |
-
embedding_model = SentenceTransformer(embeddings_name
|
291 |
print("Could not find local model installation. Downloading from Huggingface")
|
292 |
|
293 |
#embedding_model = SentenceTransformer(embeddings_name, truncate_dim=512)
|
@@ -343,6 +352,7 @@ def extract_topics(
|
|
343 |
assigned_topics, probs = topic_model.fit_transform(docs, embeddings_out)
|
344 |
|
345 |
if calc_probs == True:
|
|
|
346 |
topics_probs_out = pd.DataFrame(topic_model.probabilities_)
|
347 |
topics_probs_out_name = output_folder + "topic_full_probs_" + data_file_name_no_ext + "_" + today_rev + ".csv"
|
348 |
topics_probs_out.to_csv(topics_probs_out_name)
|
@@ -385,6 +395,10 @@ def extract_topics(
|
|
385 |
assigned_topics, probs = topic_model.fit_transform(docs, embeddings_out)
|
386 |
|
387 |
if calc_probs == True:
|
|
|
|
|
|
|
|
|
388 |
topics_probs_out = pd.DataFrame(topic_model.probabilities_)
|
389 |
topics_probs_out_name = output_folder + "topic_full_probs_" + data_file_name_no_ext + "_" + today_rev + ".csv"
|
390 |
topics_probs_out.to_csv(topics_probs_out_name)
|
@@ -424,7 +438,7 @@ def extract_topics(
|
|
424 |
|
425 |
# Tidy up topic label format a bit to have commas and spaces by default
|
426 |
if not candidate_topics:
|
427 |
-
print("Zero shot topics found, so not renaming")
|
428 |
new_topic_labels = topic_model.generate_topic_labels(nr_words=3, separator=", ")
|
429 |
topic_model.set_topic_labels(new_topic_labels)
|
430 |
if candidate_topics:
|
@@ -447,7 +461,7 @@ def extract_topics(
|
|
447 |
# If you want to save your embedding files
|
448 |
if return_intermediate_files == "Yes":
|
449 |
print("Saving embeddings to file")
|
450 |
-
if high_quality_mode == "
|
451 |
embeddings_file_name = output_folder + data_file_name_no_ext + '_' + 'tfidf_embeddings.npz'
|
452 |
else:
|
453 |
if embeddings_super_compress == "No":
|
|
|
52 |
else:
|
53 |
return gr.Dropdown()
|
54 |
|
55 |
+
def pre_clean(data: pd.DataFrame, in_colnames: list, data_file_name_no_ext: str, custom_regex: pd.DataFrame, clean_text: str, drop_duplicate_text: str, anonymise_drop: str, sentence_split_drop: str, min_sentence_length: int, embeddings_state: dict, progress: gr.Progress = gr.Progress(track_tqdm=True)) -> tuple:
|
56 |
"""
|
57 |
Pre-processes the input data by cleaning text, removing duplicates, anonymizing data, and splitting sentences based on the provided options.
|
58 |
|
|
|
65 |
drop_duplicate_text (str): Option to drop duplicate text ("Yes" or "No").
|
66 |
anonymise_drop (str): Option to anonymize data ("Yes" or "No").
|
67 |
sentence_split_drop (str): Option to split text into sentences ("Yes" or "No").
|
68 |
+
min_sentence_length (int): Minimum length of sentences after split (integer value of character length)
|
69 |
embeddings_state (dict): State of the embeddings.
|
70 |
progress (gr.Progress, optional): Progress tracker for the cleaning process.
|
71 |
|
|
|
141 |
anon_toc = time.perf_counter()
|
142 |
time_out = f"Anonymising text took {anon_toc - anon_tic:0.1f} seconds"
|
143 |
|
144 |
+
print(time_out)
|
145 |
+
|
146 |
if sentence_split_drop == "Yes":
|
147 |
progress(0.6, desc= "Splitting text into sentences")
|
148 |
|
|
|
152 |
anon_tic = time.perf_counter()
|
153 |
|
154 |
data = expand_sentences_spacy(data, in_colnames_list_first)
|
155 |
+
data = data[data[in_colnames_list_first].str.len() > min_sentence_length] # Keep only rows longer than min_sentence_length characters
|
156 |
+
data[in_colnames_list_first] = data[in_colnames_list_first].str.strip()
|
157 |
data.reset_index(inplace=True, drop=True)
|
158 |
|
159 |
anon_toc = time.perf_counter()
|
160 |
+
time_out = f"Splitting text took {anon_toc - anon_tic:0.1f} seconds"
|
161 |
+
|
162 |
+
print(time_out)
|
163 |
|
164 |
out_data_name = output_folder + data_file_name_no_ext + "_" + today_rev + ".csv"
|
165 |
data.to_csv(out_data_name)
|
|
|
258 |
elif calc_probs == "Yes":
|
259 |
print("Calculating all probabilities.")
|
260 |
calc_probs = True
|
261 |
+
|
262 |
+
if max_topics_slider == 0:
|
263 |
+
max_topics_slider = 'auto'
|
264 |
|
265 |
if not in_colnames:
|
266 |
error_message = "Please enter one column name to use for cleaning and finding topics."
|
|
|
288 |
# Attempt to load the model from each local location
|
289 |
for location in local_embeddings_locations:
|
290 |
try:
|
291 |
+
embedding_model = SentenceTransformer(location)#, truncate_dim=512)
|
292 |
print(f"Found local model installation at: {location}")
|
293 |
break # Exit the loop if the model is found
|
294 |
except Exception as e:
|
|
|
296 |
continue
|
297 |
else:
|
298 |
# If the loop completes without finding the model in any local location
|
299 |
+
embedding_model = SentenceTransformer(embeddings_name)#, truncate_dim=512)
|
300 |
print("Could not find local model installation. Downloading from Huggingface")
|
301 |
|
302 |
#embedding_model = SentenceTransformer(embeddings_name, truncate_dim=512)
|
|
|
352 |
assigned_topics, probs = topic_model.fit_transform(docs, embeddings_out)
|
353 |
|
354 |
if calc_probs == True:
|
355 |
+
|
356 |
topics_probs_out = pd.DataFrame(topic_model.probabilities_)
|
357 |
topics_probs_out_name = output_folder + "topic_full_probs_" + data_file_name_no_ext + "_" + today_rev + ".csv"
|
358 |
topics_probs_out.to_csv(topics_probs_out_name)
|
|
|
395 |
assigned_topics, probs = topic_model.fit_transform(docs, embeddings_out)
|
396 |
|
397 |
if calc_probs == True:
|
398 |
+
|
399 |
+
assigned_topics, probs = topic_model.transform(docs, embeddings_out)
|
400 |
+
print("Probs:", probs)
|
401 |
+
topic_model.probabilities_ = probs
|
402 |
topics_probs_out = pd.DataFrame(topic_model.probabilities_)
|
403 |
topics_probs_out_name = output_folder + "topic_full_probs_" + data_file_name_no_ext + "_" + today_rev + ".csv"
|
404 |
topics_probs_out.to_csv(topics_probs_out_name)
|
|
|
438 |
|
439 |
# Tidy up topic label format a bit to have commas and spaces by default
|
440 |
if not candidate_topics:
|
441 |
+
print("Zero shot topics not found, so not renaming")
|
442 |
new_topic_labels = topic_model.generate_topic_labels(nr_words=3, separator=", ")
|
443 |
topic_model.set_topic_labels(new_topic_labels)
|
444 |
if candidate_topics:
|
|
|
461 |
# If you want to save your embedding files
|
462 |
if return_intermediate_files == "Yes":
|
463 |
print("Saving embeddings to file")
|
464 |
+
if high_quality_mode == "No":
|
465 |
embeddings_file_name = output_folder + data_file_name_no_ext + '_' + 'tfidf_embeddings.npz'
|
466 |
else:
|
467 |
if embeddings_super_compress == "No":
|
requirements.txt
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
gradio # Not specified version due to interaction with spacy - reinstall latest version after requirements.txt load
|
|
|
2 |
transformers==4.41.2
|
3 |
accelerate==0.26.1
|
4 |
torch==2.3.1
|
|
|
1 |
gradio # Not specified version due to interaction with spacy - reinstall latest version after requirements.txt load
|
2 |
+
boto3
|
3 |
transformers==4.41.2
|
4 |
accelerate==0.26.1
|
5 |
torch==2.3.1
|
requirements_gpu.txt
CHANGED
@@ -1,11 +1,12 @@
|
|
1 |
gradio # Not specified version due to interaction with spacy - reinstall latest version after requirements.txt load
|
|
|
2 |
transformers==4.41.2
|
3 |
accelerate==0.26.1
|
4 |
bertopic==0.16.2
|
5 |
spacy==3.7.4
|
6 |
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
|
7 |
pyarrow==14.0.2
|
8 |
-
openpyxl==3.1.
|
9 |
Faker==22.2.0
|
10 |
presidio_analyzer==2.2.354
|
11 |
presidio_anonymizer==2.2.354
|
|
|
1 |
gradio # Not specified version due to interaction with spacy - reinstall latest version after requirements.txt load
|
2 |
+
boto3
|
3 |
transformers==4.41.2
|
4 |
accelerate==0.26.1
|
5 |
bertopic==0.16.2
|
6 |
spacy==3.7.4
|
7 |
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
|
8 |
pyarrow==14.0.2
|
9 |
+
openpyxl==3.1.3
|
10 |
Faker==22.2.0
|
11 |
presidio_analyzer==2.2.354
|
12 |
presidio_anonymizer==2.2.354
|