Spaces:

strickvl
/

redaction-detector

Runtime error

Alex Strick van Linschoten

upload app

64c717a almost 3 years ago

3.09 kB

	import gradio as gr
	import skimage
	from fastai.learner import load_learner
	from fastai.vision.all import *
	from huggingface_hub import hf_hub_download
	import fitz
	import tempfile
	import os
	from fpdf import FPDF

	# Fetch "model.pkl" from the "strickvl/redaction-classifier-fastai" repo via
	# huggingface_hub, then load it as a fastai learner. This runs once at import.
	_model_file = hf_hub_download(
	    repo_id="strickvl/redaction-classifier-fastai",
	    filename="model.pkl",
	)
	learn = load_learner(_model_file)

	# Class vocabulary of the learner's DataLoaders; `predict` pairs each
	# probability with the label at the same position.
	labels = learn.dls.vocab


	def predict(pdf, confidence, generate_file):
	    """Find the pages of an uploaded PDF that the classifier flags as redacted.

	    Each page is rendered to a pixmap, classified with the module-level
	    ``learn`` model, and flagged when the probability for the ``"redacted"``
	    label is strictly greater than ``confidence / 100``.

	    Args:
	        pdf: Uploaded file object; only its ``name`` attribute (a path that
	            ``fitz.open`` can read) is used.
	        confidence: Threshold as a percentage (the UI slider gives 0-100).
	        generate_file: When truthy, also write a PDF made of the flagged
	            page images.

	    Returns:
	        A 3-tuple ``(text_output, images, report)``:
	        ``text_output`` is a summary string; ``images`` is a list of
	        ``[caption, png_path]`` pairs, one per flagged page, in page order;
	        ``report`` is the path of the generated PDF, or ``None`` when
	        ``generate_file`` is falsy.
	    """
	    threshold = confidence / 100

	    # Use a fresh directory per call. Writing into the shared system temp
	    # directory and later globbing it for "*.png" pulled in stale pages from
	    # earlier requests (and unrelated PNGs) when building the report.
	    # The directory is intentionally not removed here: the returned paths
	    # must still exist when the caller reads them.
	    work_dir = tempfile.mkdtemp(prefix="redaction-detector-")

	    images = []
	    redacted_pages = []
	    # The context manager closes the document once all pages are processed.
	    with fitz.open(pdf.name) as document:
	        for page_num, page in enumerate(document, start=1):
	            image_pixmap = page.get_pixmap()
	            _, _, probs = learn.predict(image_pixmap.tobytes())
	            scores = {
	                label: float(prob) for label, prob in zip(labels, probs)
	            }
	            # Single decision point, keyed by label name, so the carousel,
	            # the page list and the report can never disagree.
	            if scores["redacted"] > threshold:
	                image_path = os.path.join(work_dir, f"page-{page_num}.png")
	                image_pixmap.save(image_path)
	                images.append(
	                    [
	                        f"Redacted page #{len(images) + 1} on page {page_num}",
	                        image_path,
	                    ]
	                )
	                redacted_pages.append(str(page_num))

	    report = None
	    if generate_file:
	        report = os.path.join(work_dir, "redacted_pages.pdf")
	        # Named report_pdf so the `pdf` argument is not shadowed.
	        report_pdf = FPDF()
	        report_pdf.set_auto_page_break(0)
	        # Iterate the collected paths directly: this keeps true page order
	        # (a filename sort would put page-10 before page-2).
	        for _, image_path in images:
	            report_pdf.add_page()
	            report_pdf.image(image_path, w=190, h=280)
	        report_pdf.output(report, "F")

	    text_output = f"A total of {len(redacted_pages)} pages were redacted. \n\n The redacted page numbers were: {', '.join(redacted_pages)}."
	    return text_output, images, report


	# Static text and settings passed to the Gradio interface and launch call.
	title = "Redaction Detector"

	description = "A classifier trained on publicly released redacted (and unredacted) FOIA documents, using [fastai](https://github.com/fastai/fastai)."

	# Read the article explicitly as UTF-8 so the result does not depend on the
	# platform's default text encoding.
	with open("article.md", encoding="utf-8") as f:
	    article = f.read()

	# Each example row supplies predict's arguments in order:
	# [pdf path, confidence, generate_file].
	examples = [["test1.pdf", 80, False], ["test2.pdf", 80, False]]
	interpretation = "default"
	enable_queue = True
	theme = "grass"
	allow_flagging = "never"

	# Input components, in the order of predict's parameters:
	# (pdf, confidence, generate_file).
	interface_inputs = [
	    "file",
	    gr.inputs.Slider(
	        minimum=0,
	        maximum=100,
	        step=None,
	        default=80,
	        label="Confidence",
	        optional=False,
	    ),
	    "checkbox",
	]

	# Output components, in the order of predict's return tuple:
	# (text_output, images, report).
	interface_outputs = [
	    gr.outputs.Textbox(label="Document Analysis"),
	    gr.outputs.Carousel(["text", "image"], label="Redacted pages"),
	    gr.outputs.File(label="Download redacted pages"),
	]

	demo = gr.Interface(
	    fn=predict,
	    inputs=interface_inputs,
	    outputs=interface_outputs,
	    title=title,
	    description=description,
	    article=article,
	    theme=theme,
	    allow_flagging=allow_flagging,
	    examples=examples,
	    interpretation=interpretation,
	)

	# Start the app, passing the module-level queue setting and turning on
	# example caching.
	demo.launch(enable_queue=enable_queue, cache_examples=True)