seanpedrickcase committed on
Commit
01c88c0
1 Parent(s): e08f9b8

Added logging, anonymising all Excel sheets, simple redaction tags, some Dockerfile optimisation

Browse files
.dockerignore CHANGED
@@ -14,4 +14,5 @@ poppler/*
14
  build/*
15
  dist/*
16
  build_deps/*
 
17
  doc_redaction_amplify_app/*
 
14
  build/*
15
  dist/*
16
  build_deps/*
17
+ logs/*
18
  doc_redaction_amplify_app/*
.gitignore CHANGED
@@ -14,4 +14,5 @@ poppler/*
14
  build/*
15
  dist/*
16
  build_deps/*
 
17
  doc_redaction_amplify_app/*
 
14
  build/*
15
  dist/*
16
  build_deps/*
17
+ logs/*
18
  doc_redaction_amplify_app/*
Dockerfile CHANGED
@@ -1,12 +1,8 @@
1
- FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm
 
2
 
3
  # Install system dependencies. Need to specify -y for poppler to get it to install
4
  RUN apt-get update \
5
- && apt-get install -y \
6
- tesseract-ocr -y \
7
- poppler-utils -y \
8
- libgl1-mesa-glx -y \
9
- libglib2.0-0 -y \
10
  && apt-get clean \
11
  && rm -rf /var/lib/apt/lists/*
12
 
@@ -14,19 +10,34 @@ WORKDIR /src
14
 
15
  COPY requirements.txt .
16
 
17
- RUN pip install --no-cache-dir -r requirements.txt
 
 
18
 
19
- RUN pip install --no-cache-dir gradio==4.36.1
 
 
 
 
 
 
 
 
 
 
 
20
 
21
  # Set up a new user named "user" with user ID 1000
22
  RUN useradd -m -u 1000 user
23
 
24
- # Change ownership of /home/user directory
25
- #RUN chown -R user:user /home/user
26
-
27
  # Make output folder
28
- RUN mkdir -p /home/user/app/output && chown -R user:user /home/user/app/output
29
- RUN mkdir -p /home/user/app/tld && chown -R user:user /home/user/app/tld
 
 
 
 
 
30
 
31
  # Switch to the "user" user
32
  USER user
@@ -34,18 +45,15 @@ USER user
34
  # Set environmental variables
35
  ENV HOME=/home/user \
36
  PATH=/home/user/.local/bin:$PATH \
37
- PYTHONPATH=$HOME/app \
38
  PYTHONUNBUFFERED=1 \
 
39
  GRADIO_ALLOW_FLAGGING=never \
40
  GRADIO_NUM_PORTS=1 \
41
  GRADIO_SERVER_NAME=0.0.0.0 \
42
  GRADIO_SERVER_PORT=7860 \
43
  GRADIO_THEME=huggingface \
44
  TLDEXTRACT_CACHE=$HOME/app/tld/.tld_set_snapshot \
45
- #GRADIO_TEMP_DIR=$HOME/tmp \
46
- #GRADIO_ROOT_PATH=/address-match \
47
- # gunicorn keep alive timeout limit extended for GUI-based work - https://github.com/tiangolo/uvicorn-gunicorn-fastapi-docker?tab=readme-ov-file#timeout
48
- KEEP_ALIVE=60 \
49
  SYSTEM=spaces
50
 
51
  # Set the working directory to the user's home directory
@@ -53,6 +61,5 @@ WORKDIR $HOME/app
53
 
54
  # Copy the current directory contents into the container at $HOME/app setting the owner to the user
55
  COPY --chown=user . $HOME/app
56
- #COPY . $HOME/app
57
 
58
  CMD ["python", "app.py"]
 
1
+ # Stage 1: Build dependencies and download models
2
+ FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm AS builder
3
 
4
  # Install system dependencies. Need to specify -y for poppler to get it to install
5
  RUN apt-get update \
 
 
 
 
 
6
  && apt-get clean \
7
  && rm -rf /var/lib/apt/lists/*
8
 
 
10
 
11
  COPY requirements.txt .
12
 
13
+ RUN pip install --no-cache-dir --target=/install -r requirements.txt
14
+
15
+ RUN rm requirements.txt
16
 
17
+ # Stage 2: Final runtime image
18
+ FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm
19
+
20
+ # Install system dependencies. Need to specify -y for poppler to get it to install
21
+ RUN apt-get update \
22
+ && apt-get install -y \
23
+ tesseract-ocr \
24
+ poppler-utils \
25
+ libgl1-mesa-glx \
26
+ libglib2.0-0 \
27
+ && apt-get clean \
28
+ && rm -rf /var/lib/apt/lists/*
29
 
30
  # Set up a new user named "user" with user ID 1000
31
  RUN useradd -m -u 1000 user
32
 
 
 
 
33
  # Make output folder
34
+ RUN mkdir -p /home/user/app/output \
35
+ && mkdir -p /home/user/app/tld \
36
+ && mkdir -p /home/user/app/logs \
37
+ && chown -R user:user /home/user/app
38
+
39
+ # Copy installed packages from builder stage
40
+ COPY --from=builder /install /usr/local/lib/python3.11/site-packages/
41
 
42
  # Switch to the "user" user
43
  USER user
 
45
  # Set environmental variables
46
  ENV HOME=/home/user \
47
  PATH=/home/user/.local/bin:$PATH \
48
+ PYTHONPATH=/home/user/app \
49
  PYTHONUNBUFFERED=1 \
50
+ PYTHONDONTWRITEBYTECODE=1 \
51
  GRADIO_ALLOW_FLAGGING=never \
52
  GRADIO_NUM_PORTS=1 \
53
  GRADIO_SERVER_NAME=0.0.0.0 \
54
  GRADIO_SERVER_PORT=7860 \
55
  GRADIO_THEME=huggingface \
56
  TLDEXTRACT_CACHE=$HOME/app/tld/.tld_set_snapshot \
 
 
 
 
57
  SYSTEM=spaces
58
 
59
  # Set the working directory to the user's home directory
 
61
 
62
  # Copy the current directory contents into the container at $HOME/app setting the owner to the user
63
  COPY --chown=user . $HOME/app
 
64
 
65
  CMD ["python", "app.py"]
app.py CHANGED
@@ -6,7 +6,7 @@ os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'
6
  from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var
7
  from tools.file_redaction import choose_and_run_redactor
8
  from tools.file_conversion import prepare_image_or_text_pdf
9
- from tools.data_anonymise import do_anonymise
10
  from tools.auth import authenticate_user
11
  #from tools.aws_functions import load_data_from_aws
12
  import gradio as gr
@@ -28,6 +28,7 @@ with app:
28
  prepared_pdf_state = gr.State([])
29
  output_image_files_state = gr.State([])
30
  output_file_list_state = gr.State([])
 
31
 
32
  session_hash_state = gr.State()
33
  s3_output_folder_state = gr.State()
@@ -51,7 +52,8 @@ with app:
51
 
52
  with gr.Row():
53
  output_summary = gr.Textbox(label="Output summary")
54
- output_file = gr.File(label="Output file")
 
55
 
56
  with gr.Row():
57
  convert_text_pdf_to_img_btn = gr.Button(value="Convert pdf to image-based pdf to apply redactions", variant="secondary", visible=False)
@@ -64,16 +66,19 @@ with app:
64
  )
65
  with gr.Accordion("Paste open text", open = False):
66
  in_text = gr.Textbox(label="Enter open text", lines=10)
67
- with gr.Accordion("Upload xlsx (first sheet read only) or csv file(s)", open = False):
68
- in_file_text = gr.File(label="Choose an xlsx (first sheet read only) or csv files", file_count= "multiple", file_types=['.xlsx', '.csv', '.parquet', '.csv.gz'])
 
 
69
 
70
- in_colnames = gr.Dropdown(choices=["Choose a column"], multiselect = True, label="Select columns that you want to anonymise. Ensure that at least one named column exists in all files.")
71
 
72
- match_btn = gr.Button("Anonymise text", variant="primary")
73
 
74
  with gr.Row():
75
  text_output_summary = gr.Textbox(label="Output result")
76
- text_output_file = gr.File(label="Output file")
 
77
 
78
  with gr.Tab(label="Redaction settings"):
79
  gr.Markdown(
@@ -83,13 +88,16 @@ with app:
83
  with gr.Accordion("Settings for documents", open = True):
84
  in_redaction_method = gr.Radio(label="Default document redaction method - text analysis is faster is not useful for image-based PDFs. Imaged-based is slightly less accurate in general.", value = "Text analysis", choices=["Text analysis", "Image analysis"])
85
  with gr.Accordion("Settings for open text or xlsx/csv files", open = True):
86
- anon_strat = gr.Radio(choices=["replace", "redact", "hash", "mask", "encrypt", "fake_first_name"], label="Select an anonymisation method.", value = "replace")
87
 
88
  with gr.Accordion("Settings for documents and open text/xlsx/csv files", open = True):
89
  in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Entities to redact (click close to down arrow for full list)")
90
  with gr.Row():
91
  in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language (only English currently supported)", multiselect=False)
92
  in_allow_list = gr.Dataframe(label="Allow list - enter a new term to ignore for redaction on each row e.g. Lambeth -> add new row -> Lambeth 2030", headers=["Allow list"], row_count=1, col_count=(1, 'fixed'), value=[[""]], type="array", column_widths=["100px"], datatype='str')
 
 
 
93
 
94
  # AWS options - not yet implemented
95
  # with gr.Tab(label="Advanced options"):
@@ -104,26 +112,38 @@ with app:
104
  # ### Loading AWS data ###
105
  # load_aws_data_button.click(fn=load_data_from_aws, inputs=[in_aws_file, aws_password_box], outputs=[in_file, aws_log_box])
106
 
 
107
 
108
  # Document redaction
109
- redact_btn.click(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list],
110
  outputs=[output_summary, prepared_pdf_state], api_name="prepare").\
111
- then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list],
112
- outputs=[output_summary, output_file, output_file_list_state], api_name="redact_doc")#.\
113
- #then(fn = convert_text_pdf_to_img_pdf, inputs=[in_file, output_file_list_state],
114
- #outputs=[output_summary, output_file])
115
 
116
- #convert_text_pdf_to_img_btn.click(fn = convert_text_pdf_to_img_pdf, inputs=[in_file, output_file_list_state],
117
- # outputs=[output_summary, output_file], api_name="convert_to_img")
 
 
 
 
 
 
 
 
 
 
 
 
 
118
 
119
- # Open text interaction
120
- in_file_text.upload(fn=put_columns_in_df, inputs=[in_file_text], outputs=[in_colnames])
121
- match_btn.click(fn=do_anonymise, inputs=[in_file_text, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list], outputs=[text_output_summary, text_output_file], api_name="redact_text")
122
 
123
- app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state])
 
124
 
125
  # Launch the Gradio app
126
- COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '1')
127
  print(f'The value of COGNITO_AUTH is {COGNITO_AUTH}')
128
 
129
  if __name__ == "__main__":
 
6
  from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var
7
  from tools.file_redaction import choose_and_run_redactor
8
  from tools.file_conversion import prepare_image_or_text_pdf
9
+ from tools.data_anonymise import anonymise_data_files
10
  from tools.auth import authenticate_user
11
  #from tools.aws_functions import load_data_from_aws
12
  import gradio as gr
 
28
  prepared_pdf_state = gr.State([])
29
  output_image_files_state = gr.State([])
30
  output_file_list_state = gr.State([])
31
+ text_output_file_list_state = gr.State([])
32
 
33
  session_hash_state = gr.State()
34
  s3_output_folder_state = gr.State()
 
52
 
53
  with gr.Row():
54
  output_summary = gr.Textbox(label="Output summary")
55
+ output_file = gr.File(label="Output files")
56
+ text_documents_done = gr.Number(value=0, label="Number of documents redacted", interactive=False)
57
 
58
  with gr.Row():
59
  convert_text_pdf_to_img_btn = gr.Button(value="Convert pdf to image-based pdf to apply redactions", variant="secondary", visible=False)
 
66
  )
67
  with gr.Accordion("Paste open text", open = False):
68
  in_text = gr.Textbox(label="Enter open text", lines=10)
69
+ with gr.Accordion("Upload xlsx or csv files", open = True):
70
+ in_data_files = gr.File(label="Choose Excel or csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
71
+
72
+ in_excel_sheets = gr.Dropdown(choices=["Choose Excel sheets to anonymise"], multiselect = True, label="Select Excel sheets that you want to anonymise (showing sheets present across all Excel files).", visible=False, allow_custom_value=True)
73
 
74
+ in_colnames = gr.Dropdown(choices=["Choose columns to anonymise"], multiselect = True, label="Select columns that you want to anonymise (showing columns present across all files).")
75
 
76
+ tabular_data_redact_btn = gr.Button("Anonymise text", variant="primary")
77
 
78
  with gr.Row():
79
  text_output_summary = gr.Textbox(label="Output result")
80
+ text_output_file = gr.File(label="Output files")
81
+ text_tabular_files_done = gr.Number(value=0, label="Number of tabular files redacted", interactive=False)
82
 
83
  with gr.Tab(label="Redaction settings"):
84
  gr.Markdown(
 
88
  with gr.Accordion("Settings for documents", open = True):
89
  in_redaction_method = gr.Radio(label="Default document redaction method - text analysis is faster is not useful for image-based PDFs. Imaged-based is slightly less accurate in general.", value = "Text analysis", choices=["Text analysis", "Image analysis"])
90
  with gr.Accordion("Settings for open text or xlsx/csv files", open = True):
91
+ anon_strat = gr.Radio(choices=["replace with <REDACTED>", "replace with <ENTITY_NAME>", "redact", "hash", "mask", "encrypt", "fake_first_name"], label="Select an anonymisation method.", value = "replace with <REDACTED>")
92
 
93
  with gr.Accordion("Settings for documents and open text/xlsx/csv files", open = True):
94
  in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Entities to redact (click close to down arrow for full list)")
95
  with gr.Row():
96
  in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language (only English currently supported)", multiselect=False)
97
  in_allow_list = gr.Dataframe(label="Allow list - enter a new term to ignore for redaction on each row e.g. Lambeth -> add new row -> Lambeth 2030", headers=["Allow list"], row_count=1, col_count=(1, 'fixed'), value=[[""]], type="array", column_widths=["100px"], datatype='str')
98
+
99
+ # Invisible text box to hold the session hash/username just for logging purposes
100
+ session_hash_textbox = gr.Textbox(value="", visible=False)
101
 
102
  # AWS options - not yet implemented
103
  # with gr.Tab(label="Advanced options"):
 
112
  # ### Loading AWS data ###
113
  # load_aws_data_button.click(fn=load_data_from_aws, inputs=[in_aws_file, aws_password_box], outputs=[in_file, aws_log_box])
114
 
115
+ callback = gr.CSVLogger()
116
 
117
  # Document redaction
118
+ redact_btn.click(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary],
119
  outputs=[output_summary, prepared_pdf_state], api_name="prepare").\
120
+ then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state],
121
+ outputs=[output_summary, output_file, output_file_list_state, text_documents_done], api_name="redact_doc")
 
 
122
 
123
+ # If the output file count text box changes, keep going with redacting each document until done
124
+ text_documents_done.change(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary],
125
+ outputs=[output_summary, prepared_pdf_state]).\
126
+ then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state],
127
+ outputs=[output_summary, output_file, output_file_list_state, text_documents_done])
128
+
129
+ # Tabular data redaction
130
+ in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets])
131
+
132
+ tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, in_excel_sheets], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done], api_name="redact_text")
133
+
134
+ # If the output file count text box changes, keep going with redacting each data file until done
135
+ text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, in_excel_sheets], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done])
136
+
137
+ app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])
138
 
139
+ # This needs to be called at some point prior to the first call to callback.flag()
140
+ callback.setup([session_hash_textbox], "logs")
 
141
 
142
+ #app.load(lambda *args: callback.flag(list(args)), [session_hash_textbox], None, preprocess=False)
143
+ session_hash_textbox.change(lambda *args: callback.flag(list(args)), [session_hash_textbox], None, preprocess=False)
144
 
145
  # Launch the Gradio app
146
+ COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
147
  print(f'The value of COGNITO_AUTH is {COGNITO_AUTH}')
148
 
149
  if __name__ == "__main__":
requirements.txt CHANGED
@@ -6,10 +6,10 @@ presidio_anonymizer==2.2.354
6
  presidio-image-redactor==0.0.52
7
  pikepdf==8.15.1
8
  pandas==2.2.2
9
- spacy # Not specified as latest versions create a conflict with latest versions of gradio
10
  en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1.tar.gz
11
- gradio # Not specified as latest versions create a conflict with latest versions of spacy
12
- boto3==1.34.103
13
- faker
14
- openpyxl
15
- pyarrow
 
6
  presidio-image-redactor==0.0.52
7
  pikepdf==8.15.1
8
  pandas==2.2.2
9
+ spacy==3.7.5
10
  en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1.tar.gz
11
+ gradio>=4.26.0
12
+ boto3==1.34.158
13
+ pyarrow==14.0.2
14
+ openpyxl==3.1.2
15
+ Faker==22.2.0
tools/data_anonymise.py CHANGED
@@ -11,9 +11,9 @@ from typing import List
11
 
12
  from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine
13
  from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
14
- from presidio_anonymizer.entities import OperatorConfig
15
 
16
- from tools.helper_functions import output_folder, get_file_path_end, read_file
17
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
18
 
19
  # Use custom version of analyze_dict to be able to track progress
@@ -116,17 +116,20 @@ def anon_consistent_names(df):
116
 
117
  return scrubbed_df_consistent_names
118
 
119
- def anonymise_script(df, anon_strat, language:str, chosen_redact_entities:List[str], allow_list:List[str]=[], progress=Progress(track_tqdm=False)):
 
 
 
120
  # DataFrame to dict
121
  df_dict = df.to_dict(orient="list")
122
 
123
- if allow_list:
124
- allow_list_flat = [item for sublist in allow_list for item in sublist]
125
 
126
  #analyzer = nlp_analyser #AnalyzerEngine()
127
  batch_analyzer = BatchAnalyzerEngine(analyzer_engine=nlp_analyser)
128
 
129
- anonymizer = AnonymizerEngine()
130
 
131
  batch_anonymizer = BatchAnonymizerEngine(anonymizer_engine = anonymizer)
132
 
@@ -134,19 +137,19 @@ def anonymise_script(df, anon_strat, language:str, chosen_redact_entities:List[s
134
  # entities=chosen_redact_entities,
135
  # score_threshold=score_threshold,
136
  # return_decision_process=False,
137
- # allow_list=allow_list_flat)
138
 
139
  print("Identifying personal information")
140
  analyse_tic = time.perf_counter()
141
 
142
- print("Allow list:", allow_list)
143
 
144
  # Use custom analyzer to be able to track progress with Gradio
145
  analyzer_results = analyze_dict(batch_analyzer, df_dict, language=language,
146
  entities=chosen_redact_entities,
147
  score_threshold=score_threshold,
148
  return_decision_process=False,
149
- allow_list=allow_list_flat)
150
  analyzer_results = list(analyzer_results)
151
  #analyzer_results
152
 
@@ -154,9 +157,7 @@ def anonymise_script(df, anon_strat, language:str, chosen_redact_entities:List[s
154
  analyse_time_out = f"Analysing the text took {analyse_toc - analyse_tic:0.1f} seconds."
155
  print(analyse_time_out)
156
 
157
- # Generate a 128-bit AES key. Then encode the key using base64 to get a string representation
158
- key = secrets.token_bytes(16) # 128 bits = 16 bytes
159
- key_string = base64.b64encode(key).decode('utf-8')
160
 
161
  # Create faker function (note that it has to receive a value)
162
 
@@ -166,6 +167,7 @@ def anonymise_script(df, anon_strat, language:str, chosen_redact_entities:List[s
166
  return fake.first_name()
167
 
168
  # Set up the anonymization configuration WITHOUT DATE_TIME
 
169
  replace_config = eval('{"DEFAULT": OperatorConfig("replace")}')
170
  redact_config = eval('{"DEFAULT": OperatorConfig("redact")}')
171
  hash_config = eval('{"DEFAULT": OperatorConfig("hash")}')
@@ -173,12 +175,16 @@ def anonymise_script(df, anon_strat, language:str, chosen_redact_entities:List[s
173
  people_encrypt_config = eval('{"PERSON": OperatorConfig("encrypt", {"key": key_string})}') # The encryption is using AES cypher in CBC mode and requires a cryptographic key as an input for both the encryption and the decryption.
174
  fake_first_name_config = eval('{"PERSON": OperatorConfig("custom", {"lambda": fake_first_name})}')
175
 
176
-
177
- if anon_strat == "replace": chosen_mask_config = replace_config
178
  if anon_strat == "redact": chosen_mask_config = redact_config
179
  if anon_strat == "hash": chosen_mask_config = hash_config
180
  if anon_strat == "mask": chosen_mask_config = mask_config
181
- if anon_strat == "encrypt": chosen_mask_config = people_encrypt_config
 
 
 
 
182
  elif anon_strat == "fake_first_name": chosen_mask_config = fake_first_name_config
183
 
184
  # I think in general people will want to keep date / times
@@ -190,17 +196,10 @@ def anonymise_script(df, anon_strat, language:str, chosen_redact_entities:List[s
190
  anonymizer_results = batch_anonymizer.anonymize_dict(analyzer_results, operators=combined_config)
191
 
192
  scrubbed_df = pd.DataFrame(anonymizer_results)
193
-
194
- # Create reporting message
195
- out_message = "Successfully anonymised"
196
-
197
- if anon_strat == "encrypt":
198
- out_message = out_message + ". Your decryption key is " + key_string + "."
199
 
200
- return scrubbed_df, out_message
201
 
202
- def do_anonymise(in_file, in_text:str, anon_strat:str, chosen_cols:List[str], language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
203
-
204
  def check_lists(list1, list2):
205
  return any(string in list2 for string in list1)
206
 
@@ -221,69 +220,164 @@ def do_anonymise(in_file, in_text:str, anon_strat:str, chosen_cols:List[str], la
221
  common_strings.append(string)
222
  return common_strings
223
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
224
  # Load file
 
 
 
 
 
 
 
 
 
 
225
 
226
  anon_df = pd.DataFrame()
227
- out_files_list = []
228
 
229
  # Check if files and text exist
230
- if not in_file:
231
  if in_text:
232
- in_file=['open_text']
233
  else:
234
  out_message = "Please enter text or a file to redact."
235
- return out_message, None
 
 
 
 
 
 
236
 
237
- for match_file in progress.tqdm(in_file, desc="Anonymising files", unit = "file"):
 
 
238
 
239
- if match_file=='open_text':
240
  anon_df = pd.DataFrame(data={'text':[in_text]})
241
  chosen_cols=['text']
242
- out_file_part = match_file
243
  else:
244
- anon_df = read_file(match_file)
245
- out_file_part = get_file_path_end(match_file.name)
 
246
 
247
-
 
 
 
 
 
 
 
248
 
249
- # Check for chosen col, skip file if not found
250
- all_cols_original_order = list(anon_df.columns)
251
 
252
- any_cols_found = check_lists(chosen_cols, all_cols_original_order)
 
253
 
254
- if any_cols_found == False:
255
- out_message = "No chosen columns found in dataframe: " + out_file_part
256
- print(out_message)
257
- continue
258
- else:
259
- chosen_cols_in_anon_df = get_common_strings(chosen_cols, all_cols_original_order)
260
 
261
- # Split dataframe to keep only selected columns
262
- print("Remaining columns to redact:", chosen_cols_in_anon_df)
263
-
264
- anon_df_part = anon_df[chosen_cols_in_anon_df]
265
- anon_df_remain = anon_df.drop(chosen_cols_in_anon_df, axis = 1)
266
-
267
- # Anonymise the selected columns
268
- anon_df_part_out, out_message = anonymise_script(anon_df_part, anon_strat, language, chosen_redact_entities, allow_list)
269
-
270
- # Rejoin the dataframe together
271
- anon_df_out = pd.concat([anon_df_part_out, anon_df_remain], axis = 1)
272
- anon_df_out = anon_df_out[all_cols_original_order]
273
-
274
- # Export file
275
-
276
-
277
- # out_file_part = re.sub(r'\.csv', '', match_file.name)
278
 
279
- anon_export_file_name = output_folder + out_file_part + "_anon_" + anon_strat + ".csv"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
280
 
281
- anon_df_out.to_csv(anon_export_file_name, index = None)
 
282
 
283
- out_files_list.append(anon_export_file_name)
284
 
285
- # Print result text to output text box if just anonymising open text
286
- if match_file=='open_text':
287
- out_message = anon_df_out['text'][0]
288
 
289
- return out_message, out_files_list
 
11
 
12
  from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine
13
  from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
14
+ from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
15
 
16
+ from tools.helper_functions import output_folder, get_file_path_end, read_file, detect_file_type
17
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
18
 
19
  # Use custom version of analyze_dict to be able to track progress
 
116
 
117
  return scrubbed_df_consistent_names
118
 
119
+ def anonymise_script(df, anon_strat, language:str, chosen_redact_entities:List[str], in_allow_list:List[str]=[], progress=Progress(track_tqdm=False)):
120
+
121
+ key_string = ""
122
+
123
  # DataFrame to dict
124
  df_dict = df.to_dict(orient="list")
125
 
126
+ if in_allow_list:
127
+ in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
128
 
129
  #analyzer = nlp_analyser #AnalyzerEngine()
130
  batch_analyzer = BatchAnalyzerEngine(analyzer_engine=nlp_analyser)
131
 
132
+ anonymizer = AnonymizerEngine()#conflict_resolution=ConflictResolutionStrategy.MERGE_SIMILAR_OR_CONTAINED)
133
 
134
  batch_anonymizer = BatchAnonymizerEngine(anonymizer_engine = anonymizer)
135
 
 
137
  # entities=chosen_redact_entities,
138
  # score_threshold=score_threshold,
139
  # return_decision_process=False,
140
+ # in_allow_list=in_allow_list_flat)
141
 
142
  print("Identifying personal information")
143
  analyse_tic = time.perf_counter()
144
 
145
+ print("Allow list:", in_allow_list)
146
 
147
  # Use custom analyzer to be able to track progress with Gradio
148
  analyzer_results = analyze_dict(batch_analyzer, df_dict, language=language,
149
  entities=chosen_redact_entities,
150
  score_threshold=score_threshold,
151
  return_decision_process=False,
152
+ allow_list=in_allow_list_flat)
153
  analyzer_results = list(analyzer_results)
154
  #analyzer_results
155
 
 
157
  analyse_time_out = f"Analysing the text took {analyse_toc - analyse_tic:0.1f} seconds."
158
  print(analyse_time_out)
159
 
160
+
 
 
161
 
162
  # Create faker function (note that it has to receive a value)
163
 
 
167
  return fake.first_name()
168
 
169
  # Set up the anonymization configuration WITHOUT DATE_TIME
170
+ simple_replace_config = eval('{"DEFAULT": OperatorConfig("replace", {"new_value": "REDACTED"})}')
171
  replace_config = eval('{"DEFAULT": OperatorConfig("replace")}')
172
  redact_config = eval('{"DEFAULT": OperatorConfig("redact")}')
173
  hash_config = eval('{"DEFAULT": OperatorConfig("hash")}')
 
175
  people_encrypt_config = eval('{"PERSON": OperatorConfig("encrypt", {"key": key_string})}') # The encryption is using AES cypher in CBC mode and requires a cryptographic key as an input for both the encryption and the decryption.
176
  fake_first_name_config = eval('{"PERSON": OperatorConfig("custom", {"lambda": fake_first_name})}')
177
 
178
+ if anon_strat == "replace with <REDACTED>": chosen_mask_config = simple_replace_config
179
+ if anon_strat == "replace with <ENTITY_NAME>": chosen_mask_config = replace_config
180
  if anon_strat == "redact": chosen_mask_config = redact_config
181
  if anon_strat == "hash": chosen_mask_config = hash_config
182
  if anon_strat == "mask": chosen_mask_config = mask_config
183
+ if anon_strat == "encrypt":
184
+ chosen_mask_config = people_encrypt_config
185
+ # Generate a 128-bit AES key. Then encode the key using base64 to get a string representation
186
+ key = secrets.token_bytes(16) # 128 bits = 16 bytes
187
+ key_string = base64.b64encode(key).decode('utf-8')
188
  elif anon_strat == "fake_first_name": chosen_mask_config = fake_first_name_config
189
 
190
  # I think in general people will want to keep date / times
 
196
  anonymizer_results = batch_anonymizer.anonymize_dict(analyzer_results, operators=combined_config)
197
 
198
  scrubbed_df = pd.DataFrame(anonymizer_results)
 
 
 
 
 
 
199
 
200
+ return scrubbed_df, key_string
201
 
202
+ def anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, excel_sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, anon_xlsx_export_file_name):
 
203
  def check_lists(list1, list2):
204
  return any(string in list2 for string in list1)
205
 
 
220
  common_strings.append(string)
221
  return common_strings
222
 
223
+ # Check for chosen col, skip file if not found
224
+ all_cols_original_order = list(anon_df.columns)
225
+
226
+ any_cols_found = check_lists(chosen_cols, all_cols_original_order)
227
+
228
+ if any_cols_found == False:
229
+ out_message = "No chosen columns found in dataframe: " + out_file_part
230
+ print(out_message)
231
+ else:
232
+ chosen_cols_in_anon_df = get_common_strings(chosen_cols, all_cols_original_order)
233
+
234
+ # Split dataframe to keep only selected columns
235
+ print("Remaining columns to redact:", chosen_cols_in_anon_df)
236
+
237
+ anon_df_part = anon_df[chosen_cols_in_anon_df]
238
+ anon_df_remain = anon_df.drop(chosen_cols_in_anon_df, axis = 1)
239
+
240
+ # Anonymise the selected columns
241
+ anon_df_part_out, key_string = anonymise_script(anon_df_part, anon_strat, language, chosen_redact_entities, in_allow_list)
242
+
243
+ # Rejoin the dataframe together
244
+ anon_df_out = pd.concat([anon_df_part_out, anon_df_remain], axis = 1)
245
+ anon_df_out = anon_df_out[all_cols_original_order]
246
+
247
+ # Export file
248
+
249
+ # Rename anonymisation strategy for file path naming
250
+ if anon_strat == "replace with <REDACTED>": anon_strat_txt = "redact_simple"
251
+ elif anon_strat == "replace with <ENTITY_NAME>": anon_strat_txt = "redact_entity_type"
252
+ else: anon_strat_txt = anon_strat
253
+
254
+ # If the file is an xlsx, add a new sheet to the existing xlsx. Otherwise, write to csv
255
+ if file_type == 'xlsx':
256
+
257
+ anon_export_file_name = anon_xlsx_export_file_name
258
+
259
+ # Create a pandas Excel writer using openpyxl as the engine, opened in append mode ('a') on the existing xlsx file.
260
+ with pd.ExcelWriter(anon_xlsx_export_file_name, engine='openpyxl', mode='a') as writer:
261
+ # Write the anonymised DataFrame to the worksheet named by excel_sheet_name.
262
+ anon_df_out.to_excel(writer, sheet_name=excel_sheet_name, index=None)
263
+
264
+ else:
265
+ anon_export_file_name = output_folder + out_file_part + "_" + excel_sheet_name + "_anon_" + anon_strat_txt + ".csv"
266
+ anon_df_out.to_csv(anon_export_file_name, index = None)
267
+
268
+ out_file_paths.append(anon_export_file_name)
269
+
270
+ # As files are created in a loop, there is a risk of duplicate file names being output. Use set to keep uniques.
271
+ out_file_paths = list(set(out_file_paths))
272
+
273
+ # Print result text to output text box if just anonymising open text
274
+ if anon_file=='open_text':
275
+ out_message = [anon_df_out['text'][0]]
276
+
277
+ return out_file_paths, out_message, key_string
278
+
279
def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chosen_cols:List[str], language:str, chosen_redact_entities:List[str], in_allow_list:List[str]=None, latest_file_completed:int=0, out_message:list=None, out_file_paths:list=None, in_excel_sheets:list=None, progress=Progress(track_tqdm=True)):
    """
    Anonymise the next pending input (one uploaded file, or the open text box).

    Each call handles exactly one entry of ``file_paths`` - the one at index
    ``latest_file_completed`` - and returns the incremented counter so the
    caller can invoke the function again for the following file.

    Args:
        file_paths: Uploaded files. When empty and ``in_text`` is given, the
            text is processed instead under the pseudo file name 'open_text'.
        in_text: Free text to anonymise when no files are supplied.
        anon_strat: Anonymisation strategy, passed on to ``anon_wrapper_func``.
            "encrypt" additionally adds the decryption key to the message.
        chosen_cols: Columns to anonymise (forced to ['text'] for open text).
        language: Passed through to ``anon_wrapper_func``.
        chosen_redact_entities: Passed through to ``anon_wrapper_func``.
        in_allow_list: Passed through to ``anon_wrapper_func``.
        latest_file_completed: Number of files already processed.
        out_message: Messages accumulated by previous calls (list or string).
        out_file_paths: Output files accumulated by previous calls.
        in_excel_sheets: Sheet names to anonymise in xlsx inputs.
        progress: Progress tracker used to wrap the processing loop.

    Returns:
        Tuple of (message string, output file paths, output file paths,
        latest_file_completed). The file list is returned twice, as in the
        sibling redaction function.
    """
    tic = time.perf_counter()

    # Work on private copies so neither a caller's list nor a shared default
    # object is mutated by the appends further down.
    if out_message is None:
        out_message = []
    elif isinstance(out_message, str):
        out_message = [out_message]
    else:
        out_message = list(out_message)

    out_file_paths = list(out_file_paths) if out_file_paths else []
    in_excel_sheets = in_excel_sheets or []

    # Check if files and text exist
    if not file_paths:
        if in_text:
            file_paths = ['open_text']
        else:
            out_message = "Please enter text or a file to redact."
            return out_message, out_file_paths, out_file_paths, latest_file_completed

    latest_file_completed = int(latest_file_completed)

    # If we have already redacted the last file, return the input out_message
    # and file list to the relevant components. '>=' also protects the index
    # lookup below from a counter that has run past the end.
    if latest_file_completed >= len(file_paths):
        print("Last file reached, returning files:", str(latest_file_completed))
        final_out_message = '\n'.join(out_message)
        return final_out_message, out_file_paths, out_file_paths, latest_file_completed

    file_path_loop = [file_paths[latest_file_completed]]

    # Bound up front: not every path below reaches anon_wrapper_func.
    key_string = ""
    out_file_part = ""
    file_processed = True

    for anon_file in progress.tqdm(file_path_loop, desc="Anonymising files", unit = "file"):

        if anon_file == 'open_text':
            anon_df = pd.DataFrame(data={'text': [in_text]})
            chosen_cols = ['text']
            out_file_part = anon_file
            # The original never anonymised open text. anon_wrapper_func has a
            # dedicated 'open_text' case, so route the text through it with an
            # empty sheet name and file type (i.e. the csv export path).
            out_file_paths, out_message, key_string = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, "", anon_strat, language, chosen_redact_entities, in_allow_list, "", "")
        else:
            # If file is an xlsx, we are going to run through all the Excel sheets to anonymise them separately.
            file_type = detect_file_type(anon_file)
            print("File type is:", file_type)

            out_file_part = get_file_path_end(anon_file.name)

            if file_type == 'xlsx':
                print("Running through all xlsx sheets")
                if not in_excel_sheets:
                    out_message.append("No Excel sheets selected. Please select at least one to anonymise.")
                    # Leave the counter untouched and do not report success.
                    file_processed = False
                    continue

                # Create an empty workbook that anon_wrapper_func appends one sheet at a time to.
                anon_xlsx_export_file_name = output_folder + out_file_part + ".xlsx"

                from openpyxl import Workbook

                wb = Workbook()
                wb.save(anon_xlsx_export_file_name)

                # Open the source workbook once and make sure it is closed again.
                with pd.ExcelFile(anon_file) as anon_xlsx:
                    for sheet_name in in_excel_sheets:
                        # Selected sheets may come from another uploaded workbook
                        if sheet_name not in anon_xlsx.sheet_names:
                            continue

                        anon_df = pd.read_excel(anon_xlsx, sheet_name=sheet_name)

                        # Only the sheet name is logged; the rows are still un-anonymised here.
                        print(f"Sheet Name: {sheet_name}")

                        out_file_paths, out_message, key_string = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, anon_xlsx_export_file_name)

            else:
                sheet_name = ""
                anon_df = read_file(anon_file)
                out_file_paths, out_message, key_string = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "")

        # Move on to the next file for the following call
        print("Completed file number:", str(latest_file_completed))
        latest_file_completed += 1

    toc = time.perf_counter()
    out_time = f"in {toc - tic:0.1f} seconds."
    print(out_time)

    if not file_processed:
        return '\n'.join(out_message), out_file_paths, out_file_paths, latest_file_completed

    # key_string stays empty when nothing was anonymised (e.g. no selected sheet exists in the workbook)
    if anon_strat == "encrypt" and key_string:
        out_message.append(". Your decryption key is " + key_string + ".")

    # out_time already starts with "in", so the message must not end with it
    out_message.append("Anonymisation of file '" + out_file_part + "' successfully completed")

    out_message_out = '\n'.join(out_message)
    out_message_out = out_message_out + " " + out_time

    return out_message_out, out_file_paths, out_file_paths, latest_file_completed
tools/file_conversion.py CHANGED
@@ -89,15 +89,31 @@ def process_file(file_path):
89
 
90
  return img_object
91
 
92
- def prepare_image_or_text_pdf(file_paths:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, progress=Progress(track_tqdm=True)):
93
 
94
- out_message = ''
95
- out_file_paths = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
 
97
  #in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
98
 
 
 
99
  #for file in progress.tqdm(file_paths, desc="Preparing files"):
100
- for file in file_paths:
101
  file_path = file.name
102
 
103
  #if file_path:
@@ -112,7 +128,7 @@ def prepare_image_or_text_pdf(file_paths:List[str], in_redact_method:str, in_all
112
  if is_pdf_or_image(file_path) == False:
113
  out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
114
  print(out_message)
115
- return out_message, None
116
 
117
  out_file_path = process_file(file_path)
118
  print("Out file path at image conversion step:", out_file_path)
@@ -121,7 +137,7 @@ def prepare_image_or_text_pdf(file_paths:List[str], in_redact_method:str, in_all
121
  if is_pdf(file_path) == False:
122
  out_message = "Please upload a PDF file for text analysis."
123
  print(out_message)
124
- return out_message, None
125
 
126
  out_file_path = file_path
127
 
@@ -151,10 +167,4 @@ def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
151
 
152
  print("Out file paths:", out_file_paths)
153
 
154
- return out_message, out_file_paths
155
-
156
-
157
-
158
-
159
-
160
-
 
89
 
90
  return img_object
91
 
92
+ def prepare_image_or_text_pdf(file_paths:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, latest_file_completed:int=0, out_message:list=[], progress=Progress(track_tqdm=True)):
93
 
94
+ # If out message or out_file_paths are blank, change to a list so it can be appended to
95
+ #if isinstance(out_message, str):
96
+ # out_message = [out_message]
97
+
98
+ if not file_paths:
99
+ file_paths = []
100
+
101
+ out_file_paths = file_paths
102
+
103
+ latest_file_completed = int(latest_file_completed)
104
+
105
+ # If we have already redacted the last file, return the input out_message and file list to the relevant components
106
+ if latest_file_completed == len(out_file_paths):
107
+ print("Last file reached, returning files:", str(latest_file_completed))
108
+ #final_out_message = '\n'.join(out_message)
109
+ return out_message, out_file_paths
110
 
111
  #in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
112
 
113
+ file_paths_loop = [out_file_paths[int(latest_file_completed)]]
114
+
115
  #for file in progress.tqdm(file_paths, desc="Preparing files"):
116
+ for file in file_paths_loop:
117
  file_path = file.name
118
 
119
  #if file_path:
 
128
  if is_pdf_or_image(file_path) == False:
129
  out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
130
  print(out_message)
131
+ return out_message, out_file_paths
132
 
133
  out_file_path = process_file(file_path)
134
  print("Out file path at image conversion step:", out_file_path)
 
137
  if is_pdf(file_path) == False:
138
  out_message = "Please upload a PDF file for text analysis."
139
  print(out_message)
140
+ return out_message, out_file_paths
141
 
142
  out_file_path = file_path
143
 
 
167
 
168
  print("Out file paths:", out_file_paths)
169
 
170
+ return out_message, out_file_paths
 
 
 
 
 
 
tools/file_redaction.py CHANGED
@@ -17,20 +17,36 @@ from tools.file_conversion import process_file, is_pdf, convert_text_pdf_to_img_
17
  import gradio as gr
18
 
19
 
20
- def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, progress=gr.Progress(track_tqdm=True)):
21
 
22
  tic = time.perf_counter()
23
 
24
- out_message = []
25
- out_file_paths = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
  if in_allow_list:
28
  in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
29
-
30
 
31
  print("File paths:", file_paths)
32
 
33
- for file in progress.tqdm(file_paths, desc="Redacting files", unit = "files"):
34
  file_path = file.name
35
 
36
  if file_path:
@@ -42,7 +58,7 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
42
  else:
43
  out_message = "No file selected"
44
  print(out_message)
45
- return out_message, out_file_paths
46
 
47
  if in_redact_method == "Image analysis":
48
  # Analyse and redact image-based pdf or image
@@ -57,6 +73,11 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
57
  out_file_paths.append(out_image_file_path)
58
  out_message.append("File '" + file_path_without_ext + "' successfully redacted and saved to file.")
59
 
 
 
 
 
 
60
  elif in_redact_method == "Text analysis":
61
  if is_pdf(file_path) == False:
62
  return "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'.", None, None
@@ -81,21 +102,26 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
81
  out_file_paths.extend(img_output_file_path)
82
 
83
  # Add confirmation for converting to image if you want
84
- # out_message.append(img_output_summary)
 
 
 
 
85
 
86
  else:
87
  out_message = "No redaction method selected"
88
  print(out_message)
89
- return out_message, out_file_paths
 
90
 
91
  toc = time.perf_counter()
92
- out_time = f"Time taken: {toc - tic:0.1f} seconds."
93
  print(out_time)
94
 
95
  out_message_out = '\n'.join(out_message)
96
- out_message_out = out_message_out + "\n\n" + out_time
97
 
98
- return out_message_out, out_file_paths, out_file_paths
99
 
100
  def merge_img_bboxes(bboxes, horizontal_threshold=150, vertical_threshold=25):
101
  merged_bboxes = []
 
17
  import gradio as gr
18
 
19
 
20
+ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list = [], progress=gr.Progress(track_tqdm=True)):
21
 
22
  tic = time.perf_counter()
23
 
24
+ # If out message is string or out_file_paths are blank, change to a list so it can be appended to
25
+ if isinstance(out_message, str):
26
+ out_message = [out_message]
27
+
28
+ if not out_file_paths:
29
+ out_file_paths = []
30
+
31
+ print("Latest file completed is:", str(latest_file_completed))
32
+
33
+ latest_file_completed = int(latest_file_completed)
34
+
35
+ # If we have already redacted the last file, return the input out_message and file list to the relevant components
36
+ if latest_file_completed == len(file_paths):
37
+ print("Last file reached, returning files:", str(latest_file_completed))
38
+ final_out_message = '\n'.join(out_message)
39
+ return final_out_message, out_file_paths, out_file_paths, latest_file_completed
40
+
41
+ file_paths_loop = [file_paths[int(latest_file_completed)]]
42
 
43
  if in_allow_list:
44
  in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
45
+
46
 
47
  print("File paths:", file_paths)
48
 
49
+ for file in progress.tqdm(file_paths_loop, desc="Redacting files", unit = "files"):
50
  file_path = file.name
51
 
52
  if file_path:
 
58
  else:
59
  out_message = "No file selected"
60
  print(out_message)
61
+ return out_message, out_file_paths, out_file_paths, latest_file_completed
62
 
63
  if in_redact_method == "Image analysis":
64
  # Analyse and redact image-based pdf or image
 
73
  out_file_paths.append(out_image_file_path)
74
  out_message.append("File '" + file_path_without_ext + "' successfully redacted and saved to file.")
75
 
76
+ # Increase latest file completed count unless we are at the last file
77
+ if latest_file_completed != len(file_paths):
78
+ print("Completed file number:", str(latest_file_completed))
79
+ latest_file_completed += 1
80
+
81
  elif in_redact_method == "Text analysis":
82
  if is_pdf(file_path) == False:
83
  return "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'.", None, None
 
102
  out_file_paths.extend(img_output_file_path)
103
 
104
  # Add confirmation for converting to image if you want
105
+ # out_message.append(img_output_summary)
106
+
107
+ if latest_file_completed != len(file_paths):
108
+ print("Completed file number:", str(latest_file_completed))
109
+ latest_file_completed += 1
110
 
111
  else:
112
  out_message = "No redaction method selected"
113
  print(out_message)
114
+ return out_message, out_file_paths, out_file_paths, latest_file_completed
115
+
116
 
117
  toc = time.perf_counter()
118
+ out_time = f"in {toc - tic:0.1f} seconds."
119
  print(out_time)
120
 
121
  out_message_out = '\n'.join(out_message)
122
+ out_message_out = out_message_out + " " + out_time
123
 
124
+ return out_message_out, out_file_paths, out_file_paths, latest_file_completed
125
 
126
  def merge_img_bboxes(bboxes, horizontal_threshold=150, vertical_threshold=25):
127
  merged_bboxes = []
tools/helper_functions.py CHANGED
@@ -76,17 +76,46 @@ def ensure_output_folder_exists():
76
def put_columns_in_df(in_file):
    """
    Collect the column names of every uploaded file into one dropdown.

    Each file is loaded with read_file via its .name attribute; the distinct
    column names become both the choices and the selected value of the
    returned gr.Dropdown.
    """
    every_column = []

    for uploaded in in_file:
        every_column += list(read_file(uploaded.name).columns)

    # A set removes column names shared between files
    unique_columns = list(set(every_column))

    return gr.Dropdown(choices=unique_columns, value=unique_columns)
 
 
 
90
 
91
  # Following function is only relevant for locally-created executable files based on this app (when using pyinstaller it creates a _internal folder that contains tesseract and poppler. These need to be added to the system path to enable the app to run)
92
  def add_folder_to_path(folder_path: str):
@@ -104,7 +133,7 @@ def add_folder_to_path(folder_path: str):
104
  if absolute_path not in current_path.split(os.pathsep):
105
  full_path_extension = absolute_path + os.pathsep + current_path
106
  os.environ['PATH'] = full_path_extension
107
- print(f"Updated PATH with: ", full_path_extension)
108
  else:
109
  print(f"Directory {folder_path} already exists in PATH.")
110
  else:
@@ -167,7 +196,7 @@ async def get_connection_params(request: gr.Request):
167
  #if bucket_name:
168
  # print("S3 output folder is: " + "s3://" + bucket_name + "/" + output_folder)
169
 
170
- return out_session_hash, output_folder
171
  else:
172
  print("No session parameters found.")
173
  return "",""
 
76
def put_columns_in_df(in_file):
    """
    Build the column and Excel-sheet dropdowns for the uploaded files.

    Column names are gathered from every file. For xlsx workbooks the columns
    of every sheet are included and the sheet names are collected as well;
    every other file type is loaded with read_file.

    Args:
        in_file: Uploaded file objects, each exposing its path as ``.name``.
            None or an empty list yields empty dropdowns.

    Returns:
        Tuple of two gr.Dropdown objects: the column selector (all columns
        pre-selected) and the sheet selector, which is visible with all sheets
        pre-selected when at least one xlsx file was uploaded and hidden
        otherwise.
    """
    concat_choices = []
    all_sheet_names = []
    number_of_excel_files = 0

    for file in in_file or []:
        file_name = file.name
        file_type = detect_file_type(file_name)
        print("File type is:", file_type)

        if file_type == 'xlsx':
            number_of_excel_files += 1
            print("Running through all xlsx sheets")

            # Open the workbook once for all sheets and close it afterwards
            with pd.ExcelFile(file_name) as anon_xlsx:
                new_sheet_names = list(anon_xlsx.sheet_names)

                for sheet_name in new_sheet_names:
                    # Only the sheet name is logged; the rows are raw, un-anonymised data.
                    print(f"Sheet Name: {sheet_name}")
                    df = pd.read_excel(anon_xlsx, sheet_name=sheet_name)
                    concat_choices.extend(list(df.columns))

            all_sheet_names.extend(new_sheet_names)

        else:
            df = read_file(file_name)
            concat_choices.extend(list(df.columns))

    # Drop duplicates while keeping first-seen order, so the dropdowns are
    # stable between runs (a set would order them arbitrarily).
    concat_choices = list(dict.fromkeys(concat_choices))
    all_sheet_names = list(dict.fromkeys(all_sheet_names))

    if number_of_excel_files > 0:
        return gr.Dropdown(choices=concat_choices, value=concat_choices), gr.Dropdown(choices=all_sheet_names, value=all_sheet_names, visible=True)

    return gr.Dropdown(choices=concat_choices, value=concat_choices), gr.Dropdown(visible=False)
119
 
120
  # Following function is only relevant for locally-created executable files based on this app (when using pyinstaller it creates a _internal folder that contains tesseract and poppler. These need to be added to the system path to enable the app to run)
121
  def add_folder_to_path(folder_path: str):
 
133
  if absolute_path not in current_path.split(os.pathsep):
134
  full_path_extension = absolute_path + os.pathsep + current_path
135
  os.environ['PATH'] = full_path_extension
136
+ #print(f"Updated PATH with: ", full_path_extension)
137
  else:
138
  print(f"Directory {folder_path} already exists in PATH.")
139
  else:
 
196
  #if bucket_name:
197
  # print("S3 output folder is: " + "s3://" + bucket_name + "/" + output_folder)
198
 
199
+ return out_session_hash, output_folder, out_session_hash
200
  else:
201
  print("No session parameters found.")
202
  return "",""