Alex Strick van Linschoten
upload app
64c717a
raw
history blame
3.09 kB
import gradio as gr
import skimage
from fastai.learner import load_learner
from fastai.vision.all import *
from huggingface_hub import hf_hub_download
import fitz
import tempfile
import os
from fpdf import FPDF
# Download the exported learner file from the Hugging Face Hub, then load it.
# NOTE(review): the artifact is a pickle ("model.pkl"); loading it presumably
# unpickles arbitrary objects, so the hub repository must be trusted.
_model_path = hf_hub_download("strickvl/redaction-classifier-fastai", "model.pkl")
learn = load_learner(_model_path)

# Class vocabulary of the learner; predict() indexes it in parallel with the
# probabilities returned by learn.predict.
labels = learn.dls.vocab
def predict(pdf, confidence, generate_file):
    """Classify every page of a PDF and collect the pages judged redacted.

    Args:
        pdf: Uploaded file object; only its ``name`` attribute (a path on
            disk) is read.
        confidence: Threshold in percent. A page counts as redacted when its
            "redacted" probability is strictly greater than
            ``confidence / 100``.
        generate_file: When true, also write a PDF report containing one
            image per redacted page.

    Returns:
        A tuple ``(text_output, images, report)`` where ``text_output`` is a
        human-readable summary, ``images`` is a list of
        ``[caption, png_path]`` pairs (one per redacted page, in page order)
        and ``report`` is the path of the generated PDF, or ``None`` when
        ``generate_file`` is false.
    """
    threshold = confidence / 100

    # Use a fresh directory per call. Writing into the shared system temp
    # directory let PNGs from earlier calls (or unrelated files) end up in the
    # report and let concurrent calls overwrite each other's files. The
    # directory is intentionally not removed here because the returned paths
    # are read by the caller after this function returns.
    tmp_dir = tempfile.mkdtemp(prefix="redaction-")

    images = []
    redacted_pages = []

    # Close the document deterministically once all pages are rendered.
    with fitz.open(pdf.name) as document:
        for page_num, page in enumerate(document, start=1):
            image_pixmap = page.get_pixmap()
            _, _, probs = learn.predict(image_pixmap.tobytes())
            page_probs = {
                labels[i]: float(probs[i]) for i in range(len(labels))
            }

            # Single criterion for both the gallery and the summary: look the
            # class up by name instead of assuming it sits at index 0.
            if page_probs["redacted"] <= threshold:
                continue

            image_path = os.path.join(tmp_dir, f"page-{page_num}.png")
            image_pixmap.save(image_path)
            images.append(
                [
                    f"Redacted page #{len(images) + 1} on page {page_num}",
                    image_path,
                ]
            )
            redacted_pages.append(str(page_num))

    report = None
    if generate_file:
        report = os.path.join(tmp_dir, "redacted_pages.pdf")
        # Named report_pdf so the ``pdf`` argument is not shadowed.
        report_pdf = FPDF()
        report_pdf.set_auto_page_break(0)
        # Iterate the collected pages directly: this keeps numeric page order
        # (a filename sort would put page-10 before page-2) and includes only
        # the images produced by this call.
        for _, image_path in images:
            report_pdf.add_page()
            report_pdf.image(image_path, w=190, h=280)
        report_pdf.output(report, "F")

    text_output = f"A total of {len(redacted_pages)} pages were redacted. \n\n The redacted page numbers were: {', '.join(redacted_pages)}."

    return text_output, images, report
title = "Redaction Detector"
description = "A classifier trained on publicly released redacted (and unredacted) FOIA documents, using [fastai](https://github.com/fastai/fastai)."

# Long-form text passed to the interface as its ``article``. The encoding is
# given explicitly so the markdown is decoded the same way regardless of the
# system locale.
with open("article.md", encoding="utf-8") as f:
    article = f.read()

# Each example row supplies one value per interface input, in input order:
# [file, confidence slider value, checkbox state].
examples = [["test1.pdf", 80, False], ["test2.pdf", 80, False]]
interpretation = "default"
enable_queue = True
theme = "grass"
allow_flagging = "never"

demo = gr.Interface(
    fn=predict,
    # Inputs map positionally onto predict(pdf, confidence, generate_file).
    inputs=[
        "file",
        gr.inputs.Slider(
            minimum=0,
            maximum=100,
            step=None,
            default=80,
            label="Confidence",
            optional=False,
        ),
        "checkbox",
    ],
    # Outputs map positionally onto predict's (text, images, report) return.
    outputs=[
        gr.outputs.Textbox(label="Document Analysis"),
        gr.outputs.Carousel(["text", "image"], label="Redacted pages"),
        gr.outputs.File(label="Download redacted pages"),
    ],
    title=title,
    description=description,
    article=article,
    theme=theme,
    allow_flagging=allow_flagging,
    examples=examples,
    interpretation=interpretation,
)

demo.launch(
    cache_examples=True,
    enable_queue=enable_queue,
)