seanpedrickcase committed on
Commit
34bd97b
1 Parent(s): d0b63c6

Created custom csvlogger to try to overcome AWS Lambda's incompatibility with multithread locks

Browse files
Files changed (3) hide show
  1. Dockerfile +0 -5
  2. app.py +5 -4
  3. tools/custom_csvlogger.py +171 -0
Dockerfile CHANGED
@@ -1,5 +1,3 @@
1
-
2
-
3
  # Stage 1: Build dependencies and download models
4
  FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm AS builder
5
 
@@ -83,9 +81,6 @@ WORKDIR $HOME/app
83
  # Copy the app code to the container
84
  COPY --chown=user . $HOME/app
85
 
86
- # Default entrypoint (can be overridden by build argument)
87
- ARG APP_MODE=gradio
88
-
89
  ENTRYPOINT [ "/entrypoint.sh" ]
90
 
91
  # Default command for Lambda mode
 
 
 
1
  # Stage 1: Build dependencies and download models
2
  FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm AS builder
3
 
 
81
  # Copy the app code to the container
82
  COPY --chown=user . $HOME/app
83
 
 
 
 
84
  ENTRYPOINT [ "/entrypoint.sh" ]
85
 
86
  # Default command for Lambda mode
app.py CHANGED
@@ -17,6 +17,7 @@ from tools.redaction_review import apply_redactions, modify_existing_page_redact
17
  from tools.data_anonymise import anonymise_data_files
18
  from tools.auth import authenticate_user
19
  from tools.load_spacy_model_custom_recognisers import custom_entities
 
20
 
21
 
22
  today_rev = datetime.now().strftime("%Y%m%d")
@@ -372,25 +373,25 @@ with app:
372
  app.load(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])
373
 
374
  # Log usernames and times of access to file (to know who is using the app when running on AWS)
375
- access_callback = gr.CSVLogger(dataset_file_name=log_file_name)
376
  access_callback.setup([session_hash_textbox], access_logs_folder)
377
  session_hash_textbox.change(lambda *args: access_callback.flag(list(args)), [session_hash_textbox], None, preprocess=False).\
378
  then(fn = upload_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
379
 
380
  # User submitted feedback for pdf redactions
381
- pdf_callback = gr.CSVLogger(dataset_file_name=log_file_name)
382
  pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], feedback_logs_folder)
383
  pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args)), [pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], None, preprocess=False).\
384
  then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
385
 
386
  # User submitted feedback for data redactions
387
- data_callback = gr.CSVLogger(dataset_file_name=log_file_name)
388
  data_callback.setup([data_feedback_radio, data_further_details_text, data_file_name_textbox], feedback_logs_folder)
389
  data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args)), [data_feedback_radio, data_further_details_text, data_file_name_textbox], None, preprocess=False).\
390
  then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
391
 
392
  # Log processing time/token usage when making a query
393
- usage_callback = gr.CSVLogger(dataset_file_name=log_file_name)
394
  usage_callback.setup([session_hash_textbox, doc_file_name_no_extension_textbox, data_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox, pii_identification_method_drop, comprehend_query_number], usage_logs_folder)
395
  latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_no_extension_textbox, data_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox, pii_identification_method_drop, comprehend_query_number], None, preprocess=False).\
396
  then(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
 
17
  from tools.data_anonymise import anonymise_data_files
18
  from tools.auth import authenticate_user
19
  from tools.load_spacy_model_custom_recognisers import custom_entities
20
+ from tools.custom_csvlogger import CSVLogger_custom
21
 
22
 
23
  today_rev = datetime.now().strftime("%Y%m%d")
 
373
  app.load(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])
374
 
375
  # Log usernames and times of access to file (to know who is using the app when running on AWS)
376
+ access_callback = CSVLogger_custom(dataset_file_name=log_file_name)
377
  access_callback.setup([session_hash_textbox], access_logs_folder)
378
  session_hash_textbox.change(lambda *args: access_callback.flag(list(args)), [session_hash_textbox], None, preprocess=False).\
379
  then(fn = upload_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
380
 
381
  # User submitted feedback for pdf redactions
382
+ pdf_callback = CSVLogger_custom(dataset_file_name=log_file_name)
383
  pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], feedback_logs_folder)
384
  pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args)), [pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], None, preprocess=False).\
385
  then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
386
 
387
  # User submitted feedback for data redactions
388
+ data_callback = CSVLogger_custom(dataset_file_name=log_file_name)
389
  data_callback.setup([data_feedback_radio, data_further_details_text, data_file_name_textbox], feedback_logs_folder)
390
  data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args)), [data_feedback_radio, data_further_details_text, data_file_name_textbox], None, preprocess=False).\
391
  then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
392
 
393
  # Log processing time/token usage when making a query
394
+ usage_callback = CSVLogger_custom(dataset_file_name=log_file_name)
395
  usage_callback.setup([session_hash_textbox, doc_file_name_no_extension_textbox, data_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox, pii_identification_method_drop, comprehend_query_number], usage_logs_folder)
396
  latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_no_extension_textbox, data_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox, pii_identification_method_drop, comprehend_query_number], None, preprocess=False).\
397
  then(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
tools/custom_csvlogger.py ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from __future__ import annotations

import contextlib
import csv
import datetime
import os
import re
import threading
from collections.abc import Sequence
from multiprocessing import Lock
from pathlib import Path
# NOTE: threading.Lock deliberately shadows multiprocessing.Lock above —
# multiprocessing synchronisation primitives are unavailable on AWS Lambda.
from threading import Lock
from typing import TYPE_CHECKING, Any

import gradio as gr
from gradio import utils, wasm_utils
# FlaggingCallback is used as a runtime base class, so it must be imported
# outside the TYPE_CHECKING guard (importing it only for type checking raises
# NameError when the `class CSVLogger_custom(FlaggingCallback)` statement runs).
from gradio.flagging import FlaggingCallback
from gradio_client import utils as client_utils

if TYPE_CHECKING:
    from gradio.components import Component
22
class CSVLogger_custom(FlaggingCallback):
    """
    Flagging callback that logs each flagged sample (inputs and outputs) to a
    CSV file with headers on the machine running the Gradio app.

    Drop-in replacement for ``gr.CSVLogger`` whose file writes are guarded by
    a ``threading.Lock`` instead of a ``multiprocessing.Lock``: AWS Lambda
    does not support multiprocessing synchronisation primitives (no /dev/shm),
    so creating one there fails. A new dataset file is created whenever the
    CSV headers (derived from the component labels) change, and "flag" /
    "username" columns are only created when a flag_option / username is
    actually supplied.

    Example:
        import gradio as gr
        def image_classifier(inp):
            return {'cat': 0.3, 'dog': 0.7}
        demo = gr.Interface(fn=image_classifier, inputs="image", outputs="label",
                            flagging_callback=CSVLogger_custom())
    Guides: using-flagging
    """

    def __init__(
        self,
        simplify_file_data: bool = True,
        verbose: bool = True,
        dataset_file_name: str | None = None,
    ):
        """
        Parameters:
            simplify_file_data: If True, the file data will be simplified before being written to the CSV file. If CSVLogger is being used to cache examples, this is set to False to preserve the original FileData class
            verbose: If True, prints messages to the console about the dataset file creation
            dataset_file_name: The name of the dataset file to be created (should end in ".csv"). If None, the dataset file will be named "dataset1.csv" or the next available number.
        """
        self.simplify_file_data = simplify_file_data
        self.verbose = verbose
        self.dataset_file_name = dataset_file_name
        # Explicit threading.Lock, NOT multiprocessing.Lock: Lambda cannot
        # create multiprocessing semaphores. On Lite/WASM there are no threads
        # at all, so a no-op context manager is used instead.
        self.lock = (
            threading.Lock() if not wasm_utils.IS_WASM else contextlib.nullcontext()
        )

    def setup(
        self,
        components: Sequence[Component],
        flagging_dir: str | Path,
    ):
        """Record the components to be logged and the directory to log into."""
        self.components = components
        self.flagging_dir = Path(flagging_dir)
        # Dataset file creation is deferred to the first flag() call, because
        # the optional "flag"/"username" headers are only known at that point.
        self.first_time = True

    def _create_dataset_file(self, additional_headers: list[str] | None = None):
        """Choose (or create) the CSV dataset file and write its header row."""
        os.makedirs(self.flagging_dir, exist_ok=True)

        if additional_headers is None:
            additional_headers = []
        headers = (
            [
                getattr(component, "label", None) or f"component {idx}"
                for idx, component in enumerate(self.components)
            ]
            + additional_headers
            + [
                "timestamp",
            ]
        )
        headers = utils.sanitize_list_for_csv(headers)
        dataset_files = list(Path(self.flagging_dir).glob("dataset*.csv"))

        if self.dataset_file_name:
            # An explicitly configured file name always wins over auto-numbering.
            self.dataset_filepath = self.flagging_dir / self.dataset_file_name
        elif dataset_files:
            try:
                # Reuse the highest-numbered existing dataset file; start a new
                # one only when its headers no longer match the current components.
                latest_file = max(
                    dataset_files, key=lambda f: int(re.findall(r"\d+", f.stem)[0])
                )
                latest_num = int(re.findall(r"\d+", latest_file.stem)[0])

                with open(latest_file, newline="", encoding="utf-8") as csvfile:
                    reader = csv.reader(csvfile)
                    existing_headers = next(reader, None)

                if existing_headers != headers:
                    new_num = latest_num + 1
                    self.dataset_filepath = self.flagging_dir / f"dataset{new_num}.csv"
                else:
                    self.dataset_filepath = latest_file
            except Exception:
                # Unparseable file name or unreadable CSV: fall back to default.
                self.dataset_filepath = self.flagging_dir / "dataset1.csv"
        else:
            self.dataset_filepath = self.flagging_dir / "dataset1.csv"

        if not Path(self.dataset_filepath).exists():
            with open(
                self.dataset_filepath, "w", newline="", encoding="utf-8"
            ) as csvfile:
                writer = csv.writer(csvfile)
                # headers were already sanitized above; no need to sanitize twice.
                writer.writerow(headers)
            if self.verbose:
                print("Created dataset file at:", self.dataset_filepath)
        elif self.verbose:
            print("Using existing dataset file at:", self.dataset_filepath)

    def flag(
        self,
        flag_data: list[Any],
        flag_option: str | None = None,
        username: str | None = None,
    ) -> int:
        """
        Append one flagged sample as a CSV row.

        Parameters:
            flag_data: one value per component registered in setup().
            flag_option: optional flag label; adds a "flag" column on first use.
            username: optional user name; adds a "username" column on first use.
        Returns:
            The number of data rows in the dataset file (header excluded).
        """
        if self.first_time:
            # Only create the optional columns whose values are supplied on
            # the very first flag; later calls reuse the same header layout.
            additional_headers = []
            if flag_option is not None:
                additional_headers.append("flag")
            if username is not None:
                additional_headers.append("username")
            self._create_dataset_file(additional_headers=additional_headers)
            self.first_time = False

        csv_data = []
        for idx, (component, sample) in enumerate(
            zip(self.components, flag_data, strict=False)
        ):
            # Each component gets its own sub-directory for flagged file assets.
            save_dir = (
                self.flagging_dir
                / client_utils.strip_invalid_filename_characters(
                    getattr(component, "label", None) or f"component {idx}"
                )
            )
            if utils.is_prop_update(sample):
                # Property updates (e.g. gr.update(...)) are logged verbatim.
                csv_data.append(str(sample))
            else:
                data = (
                    component.flag(sample, flag_dir=save_dir)
                    if sample is not None
                    else ""
                )
                if self.simplify_file_data:
                    data = utils.simplify_file_data_in_str(data)
                csv_data.append(data)

        if flag_option is not None:
            csv_data.append(flag_option)
        if username is not None:
            csv_data.append(username)
        # NOTE(review): naive local time, matching gr.CSVLogger's behaviour.
        csv_data.append(str(datetime.datetime.now()))

        # Guard both the append and the row count with the thread lock so
        # concurrent flags from different Gradio events cannot interleave rows.
        with self.lock:
            with open(
                self.dataset_filepath, "a", newline="", encoding="utf-8"
            ) as csvfile:
                writer = csv.writer(csvfile)
                writer.writerow(utils.sanitize_list_for_csv(csv_data))
            with open(self.dataset_filepath, encoding="utf-8") as csvfile:
                line_count = len(list(csv.reader(csvfile))) - 1

        return line_count