Spaces:

chrisfinlayson
/

foundry-pdf-redact

Sleeping

Chris Finlayson

Fix to file input

d96c1bc 12 months ago

7.18 kB

	import gradio as gr
	import os
	import fitz
	import re

	# Compiled PII detectors.
	#
	# Every pattern is a raw string so that regex escapes such as \d and \s reach
	# the regex engine untouched.  Alternatives are separated by a bare "|": an
	# escaped "\|" would only ever match a literal pipe character.

	# Textual dates ("12th of January 2020", "Jan 3, 1999") and numeric dates
	# ("03/04/2021", "3-4-21").
	date = re.compile(
	    r"(?:(?<!\:)(?<!\:\d)[0-3]?\d(?:st|nd|rd|th)?\s+(?:of\s+)?(?:jan\.?|january|feb\.?|february|mar\.?|march|apr\.?|april|may|jun\.?|june|jul\.?|july|aug\.?|august|sep\.?|september|oct\.?|october|nov\.?|november|dec\.?|december)|(?:jan\.?|january|feb\.?|february|mar\.?|march|apr\.?|april|may|jun\.?|june|jul\.?|july|aug\.?|august|sep\.?|september|oct\.?|october|nov\.?|november|dec\.?|december)\s+(?<!\:)(?<!\:\d)[0-3]?\d(?:st|nd|rd|th)?)(?:\,)?\s*(?:\d{4})?|[0-3]?\d[-\./][0-3]?\d[-\./]\d{2,4}",
	    re.IGNORECASE,
	)

	# Clock times such as "5:30 pm", "17:45" or "9am".
	time = re.compile(r"\d{1,2}:\d{2} ?(?:[ap]\.?m\.?)?|\d[ap]\.?m\.?", re.IGNORECASE)

	# Phone numbers with an optional country code and optional parenthesised
	# area code, or the "+NN NN NNN NNNN" layout.
	phone = re.compile(
	    r"((?:(?<![\d-])(?:\+?\d{1,3}[-.\s*]?)?(?:\(?\d{3}\)?[-.\s*]?)?\d{3}[-.\s*]?\d{4}(?![\d-]))|(?:(?<![\d-])(?:(?:\(\+?\d{2}\))|(?:\+?\d{2}))\s*\d{2}\s*\d{3}\s*\d{4}(?![\d-])))"
	)

	# Phone numbers followed by an extension marker ("#", "x", "ext", "extension").
	phones_with_exts = re.compile(
	    r"((?:(?:\+?1\s*(?:[.-]\s*)?)?(?:\(\s*(?:[2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9])\s*\)|(?:[2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9]))\s*(?:[.-]\s*)?)?(?:[2-9]1[02-9]|[2-9][02-9]1|[2-9][02-9]{2})\s*(?:[.-]\s*)?(?:[0-9]{4})(?:\s*(?:#|x\.?|ext\.?|extension)\s*(?:\d+)?))",
	    re.IGNORECASE,
	)

	# E-mail addresses.
	email = re.compile(
	    r"([a-z0-9!#$%&'*+\/=?^_`{|.}~-]+@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)",
	    re.IGNORECASE,
	)

	# Dotted-quad IPv4 addresses (each octet limited to 0-255).
	ip = re.compile(
	    r"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)",
	    re.IGNORECASE,
	)

	# IPv6 addresses, including "::" compression and an embedded IPv4 tail.
	ipv6 = re.compile(
	    r"\s*(?!.*::.*::)(?:(?!:)|:(?=:))(?:[0-9a-f]{0,4}(?:(?<=::)|(?<!::):)){6}(?:[0-9a-f]{0,4}(?:(?<=::)|(?<!::):)[0-9a-f]{0,4}(?:(?<=::)|(?<!:)|(?<=:)(?<!::):)|(?:25[0-4]|2[0-4]\d|1\d\d|[1-9]?\d)(?:\.(?:25[0-4]|2[0-4]\d|1\d\d|[1-9]?\d)){3})\s*",
	    re.VERBOSE | re.IGNORECASE | re.DOTALL,
	)

	# 15/16-digit card numbers, optionally grouped in fours by "-" or " ".
	credit_card = re.compile(r"((?:(?:\d{4}[- ]?){3}\d{4}|\d{15,16}))(?![\d])")

	# Base58 strings starting with 1 or 3 (Bitcoin address shape).
	btc_address = re.compile(
	    r"(?<![a-km-zA-HJ-NP-Z0-9])[13][a-km-zA-HJ-NP-Z0-9]{26,33}(?![a-km-zA-HJ-NP-Z0-9])"
	)

	# House number, up to 20 characters of street name, then a street-type word.
	street_address = re.compile(
	    r"\d{1,4} [\w\s]{1,20}(?:street|st|avenue|ave|road|rd|highway|hwy|square|sq|trail|trl|drive|dr|court|ct|park|parkway|pkwy|circle|cir|boulevard|blvd)\W?(?=\s|$)",
	    re.IGNORECASE,
	)

	# Five-digit ZIP codes with an optional four-digit suffix.
	zip_code = re.compile(r"\b\d{5}(?:[-\s]\d{4})?\b")

	# "PO Box 123", "P.O. Box 123", ...
	po_box = re.compile(r"P\.? ?O\.? Box \d+", re.IGNORECASE)

	# UK postcodes (including the special case "GIR 0AA").
	postcodes = re.compile(
	    r"([gG][iI][rR] {0,}0[aA]{2})|((([a-pr-uwyzA-PR-UWYZ][a-hk-yA-HK-Y]?[0-9][0-9]?)|(([a-pr-uwyzA-PR-UWYZ][0-9][a-hjkstuwA-HJKSTUW])|([a-pr-uwyzA-PR-UWYZ][a-hk-yA-HK-Y][0-9][abehmnprv-yABEHMNPRV-Y]))) {0,}[0-9][abd-hjlnp-uw-zABD-HJLNP-UW-Z]{2})"
	)

	# UK phone numbers: "0207"/"0208" numbers at the start of the text, or
	# five-digit-area-code numbers at the end of the text.
	# The subscriber part is "[1-9][0-9]{2}": a missing "]" would turn the
	# quantifier into part of a character class and reject real numbers.
	ukphones = re.compile(
	    r"^\s*\(?(020[78]{1}\)?[ ]?[1-9]{1}[0-9]{2}[ ]?[0-9]{4})|(0[1-8]{1}[0-9]{3}\)?[ ]?[1-9]{1}[0-9]{2}[ ]?[0-9]{3})\s*$"
	)

	# Registry mapping each PII category name to its compiled detector.
	regexes = {
	    "dates": date,
	    "times": time,
	    "phones": phone,
	    "phones_with_exts": phones_with_exts,
	    "emails": email,
	    "ips": ip,
	    "ipv6s": ipv6,
	    "credit_cards": credit_card,
	    "btc_addresses": btc_address,
	    "street_addresses": street_address,
	    "zip_codes": zip_code,
	    "po_boxes": po_box,
	    "postcodes": postcodes,
	    "ukphones": ukphones,
	}

	"""
	This class is used to compile and find all matches of a given regex pattern in a text.
	It takes an object and a regex pattern as input.
	"""

	class regex:
	    """Pair a compiled pattern with an object that exposes a ``text`` attribute.

	    Calling an instance yields a finder function.  The finder runs
	    ``findall`` on the text it receives, or on the bound object's current
	    ``text`` when it is called with no text or with an empty one.
	    """

	    def __init__(self, obj, regex):
	        self.obj, self.regex = obj, regex

	    def __call__(self, *args):
	        binding = self

	        def regex_method(text=None):
	            # Fall back to the owner's text at call time, not at bind time.
	            subject = text if text else binding.obj.text
	            return list(binding.regex.findall(subject))

	        return regex_method

	"""
	This class runs every pattern in `regexes` against a piece of text and stores the matches for each PII category in an attribute named after that category.
	It takes the text to scan as input.
	"""

	class PiiRegex(object):
	    """Apply every detector in ``regexes`` to one piece of text.

	    For each key of ``regexes`` the instance gets an attribute of the same
	    name.  When the instance holds non-empty text the attribute is the list
	    returned by ``findall``; when it was built without text the attribute is
	    a finder callable that accepts an optional ``text`` argument.
	    """

	    def __init__(self, text=""):
	        """Store ``text`` and, if it is non-empty, evaluate all detectors."""
	        self.text = text
	        self._bind_finders()
	        if text:
	            self._evaluate_finders()

	    def _bind_finders(self):
	        """Attach one finder callable per category to this instance."""
	        for name, pattern in regexes.items():
	            setattr(self, name, regex(self, pattern)(self))

	    def _evaluate_finders(self):
	        """Replace each finder callable with its result for ``self.text``."""
	        for name in regexes:
	            setattr(self, name, getattr(self, name)())

	    def any_match(self, text=""):
	        """Return True if any detector matches the text.

	        If ``text`` is non-empty it replaces the stored text and all
	        detectors are re-evaluated; otherwise the stored text is used.
	        Each matching category is reported on standard output.
	        """
	        if text:
	            self.text = text
	            self._bind_finders()
	            self._evaluate_finders()

	        matches = []
	        for name in regexes:
	            found = getattr(self, name)
	            if callable(found):
	                # Built without text: the attribute is still a finder, which
	                # is always truthy.  Run it so that the result is tested.
	                found = found()
	            if found:
	                print (f"PII located in document: {name}")
	                matches.append(name)

	        return bool(matches)

	"""
	This class is used to redact sensitive information from a PDF file.
	It takes a file as input and redacts all the sensitive information found in the file.
	The redacted file is saved as a new PDF file.
	"""

	class Redactor:
	    """Redact every text line of a PDF in which ``PiiRegex`` finds a match.

	    ``file`` may be a path (``str`` or ``os.PathLike``) or an object whose
	    ``name`` attribute is the path, such as an uploaded temporary file.
	    """

	    # static methods work independent of class object
	    @staticmethod
	    def get_sensitive_data(lines):
	        """Return the non-empty lines for which ``PiiRegex.any_match`` is true."""
	        return [line for line in lines if line and PiiRegex(line).any_match()]

	    @staticmethod
	    def _source_path(file):
	        """Return the filesystem path behind ``file`` as a string."""
	        if isinstance(file, (str, os.PathLike)):
	            return os.fspath(file)
	        return file.name

	    # constructor
	    def __init__(self, file):
	        self.file = file

	    def redaction(self):
	        """Redact the document and save it next to the source file.

	        The output path is the source path with its extension replaced by
	        ``_redacted.pdf``.

	        Returns:
	            The path of the saved, redacted PDF.
	        """
	        source_path = self._source_path(self.file)
	        redacted_file = os.path.splitext(source_path)[0] + '_redacted.pdf'

	        # The context manager closes the document even if redaction fails.
	        with fitz.open(source_path) as doc:
	            for page in doc:
	                lines = page.get_text("text").split('\n')
	                for data in self.get_sensitive_data(lines):
	                    # Mark every occurrence of the sensitive line on the page.
	                    for area in page.search_for(data):
	                        annot = page.add_redact_annot(area.quad, text='REDACTED', fontname=None, fontsize=11, fill=(1, 1, 1), text_color=(0, 0, 0), cross_out=True)
	                        annot.update()
	                # Apply all redaction annotations added to this page.
	                page.apply_redactions()
	            doc.save(redacted_file)

	        print(f"Successfully redacted. The redacted file is saved as {redacted_file}")
	        return redacted_file



	def redact_pdf(file):
	    """Redact an uploaded PDF and return the path of the redacted copy.

	    Args:
	        file: The upload, either a path (``str`` or ``os.PathLike``) or an
	            object whose ``name`` attribute is the path.  ``None`` means
	            nothing was uploaded.

	    Returns:
	        The source path with its extension replaced by ``_redacted.pdf``,
	        or ``None`` when no file was given.
	    """
	    if file is None:
	        # Nothing uploaded: there is no document to process.
	        return None

	    if isinstance(file, (str, os.PathLike)):
	        source_path = os.fspath(file)
	    else:
	        source_path = file.name

	    # Load the redaction pipeline
	    redactor = Redactor(file)
	    redactor.redaction()
	    # Return the redacted pdf file
	    return os.path.splitext(source_path)[0] + '_redacted.pdf'


	# Gradio front end: one uploaded PDF in, one redacted PDF out.
	description = (
	    'This tool detects and redacts the following types of PII information: '
	    'dates, times, phone numbers, emails, IP addresses, credit card numbers, '
	    'Bitcoin addresses, street addresses, zip codes, PO boxes, UK postcodes, '
	    'and UK phone numbers.'
	)

	inputs = [gr.File(label="Upload PDF")]
	outputs = [gr.File(label="Redacted PDF")]

	iface = gr.Interface(
	    fn=redact_pdf,
	    inputs=inputs,
	    outputs=outputs,
	    title="PDF Redactor",
	    description=description,
	)
	iface.launch()