Spaces:
Runtime error
Runtime error
Alex Strick van Linschoten
committed on
Commit
•
f4f594a
1
Parent(s):
c6ed7c3
add area calculations and delete model
Browse files- 2022-01-15-vfnet-post-self-train.pth +0 -3
- app.py +46 -4
2022-01-15-vfnet-post-self-train.pth
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:8db6b7adeef1d66f4e8684bdca6fb9fb4720ad149e4994e10a5af3e26bfc2507
|
3 |
-
size 131183383
|
|
|
|
|
|
|
|
app.py
CHANGED
@@ -13,7 +13,6 @@ from icevision.all import *
|
|
13 |
from icevision.models.checkpoint import *
|
14 |
from PIL import Image as PILImage
|
15 |
|
16 |
-
# checkpoint_path = "./2022-01-15-vfnet-post-self-train.pth"
|
17 |
checkpoint_path = "./allsynthetic-imgsize768.pth"
|
18 |
checkpoint_and_model = model_from_checkpoint(checkpoint_path)
|
19 |
model = checkpoint_and_model["model"]
|
@@ -33,11 +32,38 @@ learn = load_learner(
|
|
33 |
labels = learn.dls.vocab
|
34 |
|
35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
def predict(pdf, confidence, generate_file):
|
37 |
filename_without_extension = pdf.name[:-4]
|
38 |
document = fitz.open(pdf.name)
|
39 |
results = []
|
40 |
images = []
|
|
|
41 |
tmp_dir = tempfile.gettempdir()
|
42 |
for page_num, page in enumerate(document, start=1):
|
43 |
image_pixmap = page.get_pixmap()
|
@@ -77,6 +103,9 @@ def predict(pdf, confidence, generate_file):
|
|
77 |
tmp_dir, filename_without_extension, "redacted_pages.pdf"
|
78 |
)
|
79 |
if generate_file:
|
|
|
|
|
|
|
80 |
pdf = FPDF()
|
81 |
pdf.set_auto_page_break(0)
|
82 |
imagelist = sorted(
|
@@ -109,7 +138,11 @@ def predict(pdf, confidence, generate_file):
|
|
109 |
font_size=16,
|
110 |
label_color="#FF59D6",
|
111 |
)
|
112 |
-
|
|
|
|
|
|
|
|
|
113 |
pred_dict["img"].save(
|
114 |
os.path.join(
|
115 |
tmp_dir, filename_without_extension, f"pred-{image}"
|
@@ -123,10 +156,19 @@ def predict(pdf, confidence, generate_file):
|
|
123 |
)
|
124 |
pdf.output(report, "F")
|
125 |
|
126 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
127 |
|
128 |
if generate_file:
|
129 |
-
return text_output, images, report
|
130 |
else:
|
131 |
return text_output, images, None
|
132 |
|
|
|
13 |
from icevision.models.checkpoint import *
|
14 |
from PIL import Image as PILImage
|
15 |
|
|
|
16 |
checkpoint_path = "./allsynthetic-imgsize768.pth"
|
17 |
checkpoint_and_model = model_from_checkpoint(checkpoint_path)
|
18 |
model = checkpoint_and_model["model"]
|
|
|
32 |
labels = learn.dls.vocab
|
33 |
|
34 |
|
35 |
+
def get_content_area(pred_dict) -> int:
    """Return the area of the first bounding box labelled ``"content"``.

    ``pred_dict["labels"]`` and ``pred_dict["bboxes"]`` are read as parallel
    sequences: the box for a label is looked up at the label's own index.
    Each box must expose ``xmin``, ``xmax``, ``ymin`` and ``ymax`` attributes.

    Returns 0 when no label equals ``"content"``; otherwise the width times
    the height of the first matching box. Any further ``"content"`` boxes are
    collected but do not contribute to the result.

    NOTE(review): assumes the prediction dict exposes ``"labels"`` and
    ``"bboxes"`` at its top level -- confirm against the detector's actual
    output schema, which is not visible from here.
    """
    labels = pred_dict["labels"]
    if "content" in labels:
        matching_boxes = []
        for position, name in enumerate(labels):
            if name == "content":
                matching_boxes.append(pred_dict["bboxes"][position])
        # Only the first "content" box is measured.
        first_box = matching_boxes[0]
        box_width = first_box.xmax - first_box.xmin
        box_height = first_box.ymax - first_box.ymin
        return box_width * box_height
    return 0
|
45 |
+
|
46 |
+
|
47 |
+
def get_redaction_area(pred_dict) -> int:
    """Return the summed area of every bounding box labelled ``"redaction"``.

    ``pred_dict["labels"]`` and ``pred_dict["bboxes"]`` are read as parallel
    sequences: the box for a label is looked up at the label's own index.
    Each box must expose ``xmin``, ``xmax``, ``ymin`` and ``ymax`` attributes.

    Returns 0 when no label equals ``"redaction"``; otherwise the sum of
    width times height over all matching boxes. Overlapping boxes are not
    de-duplicated, so shared regions are counted once per box.

    NOTE(review): assumes the prediction dict exposes ``"labels"`` and
    ``"bboxes"`` at its top level -- confirm against the detector's actual
    output schema, which is not visible from here.
    """
    labels = pred_dict["labels"]
    total_area = 0
    if "redaction" in labels:
        # Gather all matching boxes first, then accumulate their areas.
        matching_boxes = [
            pred_dict["bboxes"][position]
            for position, name in enumerate(labels)
            if name == "redaction"
        ]
        for box in matching_boxes:
            box_area = (box.xmax - box.xmin) * (box.ymax - box.ymin)
            total_area = total_area + box_area
    return total_area
|
59 |
+
|
60 |
+
|
61 |
def predict(pdf, confidence, generate_file):
|
62 |
filename_without_extension = pdf.name[:-4]
|
63 |
document = fitz.open(pdf.name)
|
64 |
results = []
|
65 |
images = []
|
66 |
+
total_redacted_image_areas = 0
|
67 |
tmp_dir = tempfile.gettempdir()
|
68 |
for page_num, page in enumerate(document, start=1):
|
69 |
image_pixmap = page.get_pixmap()
|
|
|
103 |
tmp_dir, filename_without_extension, "redacted_pages.pdf"
|
104 |
)
|
105 |
if generate_file:
|
106 |
+
total_image_areas = 0
|
107 |
+
total_content_areas = 0
|
108 |
+
total_redaction_area = 0
|
109 |
pdf = FPDF()
|
110 |
pdf.set_auto_page_break(0)
|
111 |
imagelist = sorted(
|
|
|
138 |
font_size=16,
|
139 |
label_color="#FF59D6",
|
140 |
)
|
141 |
+
|
142 |
+
total_image_areas += pred_dict["width"] * pred_dict["height"]
|
143 |
+
total_content_areas += get_content_area(pred_dict)
|
144 |
+
total_redaction_area += get_redaction_area(pred_dict)
|
145 |
+
|
146 |
pred_dict["img"].save(
|
147 |
os.path.join(
|
148 |
tmp_dir, filename_without_extension, f"pred-{image}"
|
|
|
156 |
)
|
157 |
pdf.output(report, "F")
|
158 |
|
159 |
+
total_redaction_proportion = round(
|
160 |
+
(total_redaction_area / total_image_areas) * 100, 1
|
161 |
+
)
|
162 |
+
content_redaction_proportion = round(
|
163 |
+
(total_redaction_area / total_content_areas) * 100, 1
|
164 |
+
)
|
165 |
+
|
166 |
+
text_output = f"A total of {len(redacted_pages)} pages were redacted. \n\n The redacted page numbers were: {', '.join(redacted_pages)}. "
|
167 |
+
|
168 |
+
redaction_analysis = f"{total_redaction_proportion}% of the total area of the redacted pages was redacted. \n {content_redaction_proportion}% of the actual content of those redacted pages was redacted."
|
169 |
|
170 |
if generate_file:
|
171 |
+
return text_output + redaction_analysis, images, report
|
172 |
else:
|
173 |
return text_output, images, None
|
174 |
|