import gradio as gr
import skimage
from fastai.learner import load_learner
from fastai.vision.all import *
from huggingface_hub import hf_hub_download
import fitz
import tempfile
import os
from fpdf import FPDF

# Fetch the exported learner from the Hugging Face Hub and load it once at
# import time so every call to `predict` reuses the same model.
learn = load_learner(
    hf_hub_download("strickvl/redaction-classifier-fastai", "model.pkl")
)

# Class names of the classifier; `predict` relies on "redacted" being one of
# them.
labels = learn.dls.vocab


def predict(pdf, confidence, generate_file):
    """Classify every page of a PDF and collect the pages flagged as redacted.

    Args:
        pdf: Uploaded file object; only its ``name`` attribute (a path that
            ``fitz.open`` can read) is used.
        confidence: Threshold as a percentage (0-100). A page is flagged when
            its "redacted" probability is strictly greater than
            ``confidence / 100``.
        generate_file: When truthy, also build a PDF made of the flagged page
            images.

    Returns:
        A 3-tuple ``(text_output, images, report)``:
        ``text_output`` is the human-readable summary, ``images`` is a list of
        ``[caption, png_path]`` pairs (one per flagged page, in page order) and
        ``report`` is the path of the generated PDF, or ``None`` when
        ``generate_file`` is falsy.
    """
    threshold = confidence / 100

    # Use a fresh directory per call. Writing into the shared system temp
    # directory let PNGs from earlier runs (or from unrelated programs) leak
    # into the report and let concurrent calls overwrite each other's files.
    # The directory is intentionally not removed here: the returned paths
    # must stay readable after this function returns.
    work_dir = tempfile.mkdtemp(prefix="redaction-")

    results = []
    images = []
    redacted_pages = []

    document = fitz.open(pdf.name)
    try:
        for page_num, page in enumerate(document, start=1):
            image_pixmap = page.get_pixmap()
            _, _, probs = learn.predict(image_pixmap.tobytes())
            scores = {
                label: float(probs[i]) for i, label in enumerate(labels)
            }
            results.append(scores)

            # Single source of truth for "is this page redacted": the same
            # named score drives the image list and the textual summary, so
            # the two can never disagree.
            if scores["redacted"] > threshold:
                image_path = os.path.join(work_dir, f"page-{page_num}.png")
                image_pixmap.save(image_path)
                redaction_count = len(images)
                images.append(
                    [
                        f"Redacted page #{redaction_count + 1} on page {page_num}",
                        image_path,
                    ]
                )
                redacted_pages.append(str(page_num))
    finally:
        # Release the file handle even if prediction fails part-way through.
        document.close()

    report = None
    if generate_file:
        report = os.path.join(work_dir, "redacted_pages.pdf")
        report_pdf = FPDF()
        report_pdf.set_auto_page_break(0)
        # Iterate the paths saved above (already in page order) instead of
        # listing the directory: a lexicographic sort of file names would put
        # "page-10.png" before "page-2.png".
        for _, image_path in images:
            report_pdf.add_page()
            report_pdf.image(image_path, w=190, h=280)
        report_pdf.output(report, "F")

    text_output = f"A total of {len(redacted_pages)} pages were redacted. \n\n The redacted page numbers were: {', '.join(redacted_pages)}."

    return text_output, images, report


title = "Redaction Detector"

description = "A classifier trained on publicly released redacted (and unredacted) FOIA documents, using [fastai](https://github.com/fastai/fastai)."
# Load the markdown text that is handed to the interface as its `article`.
with open("article.md") as article_file:
    article = article_file.read()

# Each example row supplies one value per input: file, confidence, checkbox.
examples = [
    ["test1.pdf", 80, False],
    ["test2.pdf", 80, False],
]

# Interface / launch settings.
interpretation = "default"
enable_queue = True
theme = "grass"
allow_flagging = "never"

# Input components, in the order of `predict`'s parameters.
confidence_slider = gr.inputs.Slider(
    minimum=0,
    maximum=100,
    step=None,
    default=80,
    label="Confidence",
    optional=False,
)
input_components = ["file", confidence_slider, "checkbox"]

# Output components, in the order of `predict`'s return tuple.
output_components = [
    gr.outputs.Textbox(label="Document Analysis"),
    gr.outputs.Carousel(["text", "image"], label="Redacted pages"),
    gr.outputs.File(label="Download redacted pages"),
]

demo = gr.Interface(
    fn=predict,
    inputs=input_components,
    outputs=output_components,
    title=title,
    description=description,
    article=article,
    theme=theme,
    allow_flagging=allow_flagging,
    examples=examples,
    interpretation=interpretation,
)

demo.launch(cache_examples=True, enable_queue=enable_queue)