Spaces:

strickvl
/

redaction-detector

Runtime error

File size: 3,088 Bytes

64c717a

import gradio as gr
import skimage
from fastai.learner import load_learner
from fastai.vision.all import *
from huggingface_hub import hf_hub_download
import fitz
import tempfile
import os
from fpdf import FPDF

# Download the exported classifier from the Hugging Face Hub, then load it.
# NOTE(review): "model.pkl" is presumably a pickle, so loading it can run
# arbitrary code -- confirm the hub repository is trusted.
_model_path = hf_hub_download(
    repo_id="strickvl/redaction-classifier-fastai",
    filename="model.pkl",
)
learn = load_learner(_model_path)

# Class names of the classifier; predict() pairs labels[i] with probs[i].
labels = learn.dls.vocab


def predict(pdf, confidence, generate_file):
    """Classify each page of a PDF and collect the pages judged redacted.

    Every page is rendered to an image and passed to ``learn.predict``. A
    page counts as redacted when the probability stored under the
    "redacted" label is strictly greater than ``confidence / 100``.

    Args:
        pdf: Uploaded file object; only its ``name`` attribute (a path on
            disk) is read.
        confidence: Threshold expressed in percent (e.g. ``80``).
        generate_file: When truthy, additionally write a PDF report that
            contains one image per redacted page, in page order.

    Returns:
        A 3-tuple ``(text_output, images, report)`` where ``text_output`` is
        a human-readable summary, ``images`` is a list of
        ``[caption, png_path]`` pairs (one per redacted page) and ``report``
        is the path of the generated PDF, or ``None`` when ``generate_file``
        is falsy.
    """
    threshold = confidence / 100

    # Use a private directory per call. Writing into the shared system temp
    # directory let the report pick up stale PNGs from earlier requests (or
    # unrelated files) and let concurrent calls overwrite each other's
    # output. The directory is intentionally not removed here: the returned
    # paths must stay readable after this function returns.
    work_dir = tempfile.mkdtemp(prefix="redaction-")

    images = []
    redacted_pages = []

    document = fitz.open(pdf.name)
    try:
        for page_num, page in enumerate(document, start=1):
            image_pixmap = page.get_pixmap()
            _, _, probs = learn.predict(image_pixmap.tobytes())
            page_probs = {
                label: float(probs[i]) for i, label in enumerate(labels)
            }

            # Look the score up by label rather than assuming "redacted" is
            # index 0, so the image list and the page list cannot disagree.
            if page_probs["redacted"] > threshold:
                image_path = os.path.join(work_dir, f"page-{page_num}.png")
                image_pixmap.save(image_path)
                images.append(
                    [
                        f"Redacted page #{len(images) + 1} on page {page_num}",
                        image_path,
                    ]
                )
                redacted_pages.append(str(page_num))
    finally:
        # Release the file handle even if rendering or prediction fails.
        document.close()

    report = None
    if generate_file:
        report = os.path.join(work_dir, "redacted_pages.pdf")
        report_pdf = FPDF()
        report_pdf.set_auto_page_break(0)
        # Iterate the images collected above: they are already in page order
        # (a lexicographic sort of file names would put page-10 before
        # page-2) and belong to this document only.
        for _, image_path in images:
            report_pdf.add_page()
            report_pdf.image(image_path, w=190, h=280)
        report_pdf.output(report, "F")

    text_output = f"A total of {len(redacted_pages)} pages were redacted. \n\n The redacted page numbers were: {', '.join(redacted_pages)}."
    return text_output, images, report


# Static text and options for the Gradio interface built further down.
title = "Redaction Detector"

description = "A classifier trained on publicly released redacted (and unredacted) FOIA documents, using [fastai](https://github.com/fastai/fastai)."

# Read the article text as UTF-8 explicitly; without an encoding argument
# open() uses the locale default, which can fail or mis-decode non-ASCII
# characters depending on the host environment.
with open("article.md", encoding="utf-8") as f:
    article = f.read()

# Each example row supplies one value per input: file, confidence, checkbox.
examples = [["test1.pdf", 80, False], ["test2.pdf", 80, False]]
interpretation = "default"
enable_queue = True
theme = "grass"
allow_flagging = "never"

# Input widgets, in the positional order of
# predict(pdf, confidence, generate_file).
_confidence_slider = gr.inputs.Slider(
    minimum=0,
    maximum=100,
    step=None,
    default=80,
    label="Confidence",
    optional=False,
)
_input_components = ["file", _confidence_slider, "checkbox"]

# Output widgets, in the order of the tuple returned by predict:
# summary text, redacted page images, optional PDF report.
_summary_box = gr.outputs.Textbox(label="Document Analysis")
_page_carousel = gr.outputs.Carousel(["text", "image"], label="Redacted pages")
_report_file = gr.outputs.File(label="Download redacted pages")
_output_components = [_summary_box, _page_carousel, _report_file]

demo = gr.Interface(
    fn=predict,
    inputs=_input_components,
    outputs=_output_components,
    title=title,
    description=description,
    article=article,
    theme=theme,
    allow_flagging=allow_flagging,
    examples=examples,
    interpretation=interpretation,
)

# Start the app; runs at import time, as in the original script.
demo.launch(cache_examples=True, enable_queue=enable_queue)