Spaces:

RenAzum
/

documentAnalyzer

Sleeping

App Files Files Community

documentAnalyzer / app.py

RenAzum

upload file fix

7ec8f89 about 1 month ago

raw

history blame contribute delete

4.24 kB

	import streamlit as st
	import fitz # PyMuPDF
	import docx
	from difflib import HtmlDiff, SequenceMatcher
	import os

	# Directory where uploaded files are stored (relative to the working directory).
	UPLOAD_DIR = "uploaded_files"
	# exist_ok=True avoids the check-then-create race of a separate
	# os.path.exists() test when the script is run more than once concurrently.
	os.makedirs(UPLOAD_DIR, exist_ok=True)

	# Functions to save, extract text, and metadata
	def save_uploaded_file(uploaded_file):
	    """Write an uploaded file into UPLOAD_DIR and return the path it was saved to.

	    The stored file name is ``<content-hash>_<basename>``:

	    * only the base name of ``uploaded_file.name`` is used, so a name that
	      contains directory components cannot place the file outside UPLOAD_DIR;
	    * the content-hash prefix keeps two uploads that share a file name but
	      differ in content from overwriting each other, while re-saving the
	      same upload reuses the same path.

	    Args:
	        uploaded_file: Object exposing ``name`` (str) and ``getbuffer()``
	            (bytes-like content), as used by the caller in this file.

	    Returns:
	        The path of the saved file; it keeps the original file extension.
	    """
	    import hashlib

	    data = uploaded_file.getbuffer()

	    # Normalise Windows separators first so basename() also strips "..\\" parts.
	    safe_name = os.path.basename(uploaded_file.name.replace("\\", "/")) or "upload"
	    digest = hashlib.sha256(data).hexdigest()[:16]

	    # The directory may have been removed since start-up; recreate it if needed.
	    os.makedirs(UPLOAD_DIR, exist_ok=True)
	    file_path = os.path.join(UPLOAD_DIR, f"{digest}_{safe_name}")
	    with open(file_path, "wb") as f:
	        f.write(data)
	    return file_path

	def extract_text_pdf(file_path):
	    """Return the text of every page of a PDF, concatenated in page order.

	    Args:
	        file_path: Path of the PDF file to read.

	    Returns:
	        A single string made of each page's ``get_text()`` output.
	    """
	    # The context manager closes the document (and its file handle) even if
	    # text extraction raises part-way through.
	    with fitz.open(file_path) as doc:
	        return "".join(page.get_text() for page in doc)

	def extract_text_word(file_path):
	    """Return the paragraph text of a Word document, one paragraph per line.

	    Args:
	        file_path: Path of the .docx file to read.

	    Returns:
	        The text of each paragraph in ``doc.paragraphs`` joined with newlines.
	    """
	    paragraphs = docx.Document(file_path).paragraphs
	    return "\n".join(paragraph.text for paragraph in paragraphs)

	def extract_metadata_pdf(file_path):
	    """Return the metadata object PyMuPDF exposes for a PDF.

	    Args:
	        file_path: Path of the PDF file to read.

	    Returns:
	        The value of the opened document's ``metadata`` attribute.
	    """
	    # Close the document once the metadata has been read instead of leaving
	    # the file handle open until garbage collection.
	    with fitz.open(file_path) as doc:
	        return doc.metadata

	def extract_metadata_word(file_path):
	    """Collect the author and timestamp core properties of a Word document.

	    Args:
	        file_path: Path of the .docx file to read.

	    Returns:
	        A dict with the keys ``"author"``, ``"created"`` and ``"modified"``
	        taken from the document's core properties.
	    """
	    props = docx.Document(file_path).core_properties
	    return dict(
	        author=props.author,
	        created=props.created,
	        modified=props.modified,
	    )

	# Function to compare text and return highlighted HTML differences
	def compare_texts(text1, text2):
	    """Build a standalone HTML page showing the line differences of two texts.

	    Args:
	        text1: Original text.
	        text2: Edited text.

	    Returns:
	        The HTML document produced by ``difflib.HtmlDiff.make_file`` in
	        context mode, with two lines of context around each change.
	    """
	    original_lines = text1.splitlines()
	    edited_lines = text2.splitlines()
	    html_differ = HtmlDiff()
	    return html_differ.make_file(
	        original_lines,
	        edited_lines,
	        context=True,
	        numlines=2,
	    )

	# Function to calculate similarity score
	def calculate_similarity(text1, text2, autojunk=False):
	    """Return a character-level similarity ratio between two texts.

	    Args:
	        text1: Original text.
	        text2: Edited text.
	        autojunk: Passed through to ``difflib.SequenceMatcher``. It defaults
	            to False because the automatic junk heuristic ignores characters
	            that make up more than about 1% of a text of 200+ characters,
	            which covers most characters of a real document and makes the
	            ratio far too low. Pass True to trade accuracy for speed on
	            very large inputs.

	    Returns:
	        A float in [0.0, 1.0]; 1.0 means the texts are identical.
	    """
	    # Identical texts (including two empty ones) need no matching pass.
	    if text1 == text2:
	        return 1.0
	    matcher = SequenceMatcher(None, text1, text2, autojunk=autojunk)
	    return matcher.ratio()

	# Streamlit App Interface
	st.title("Document Edit Detection POC")

	st.write("Upload both the original and edited documents below:")

	# File upload
	original_file = st.file_uploader("Upload Original Document", type=["pdf", "docx"])
	edited_file = st.file_uploader("Upload Edited Document", type=["pdf", "docx"])

	# Process if both files are uploaded
	if original_file and edited_file:
	    # Lower-case the extensions so "report.PDF" and "report.pdf" count as the
	    # same type and an upper-case ".PDF" is not routed to the Word parser.
	    original_ext = os.path.splitext(original_file.name)[1].lower()
	    edited_ext = os.path.splitext(edited_file.name)[1].lower()

	    # Check the types before touching the disk: nothing is saved for a
	    # mismatched pair.
	    if original_ext != edited_ext:
	        st.error("Both documents must be of the same type (PDF or DOCX).")
	    else:
	        # Save uploaded files
	        original_file_path = save_uploaded_file(original_file)
	        edited_file_path = save_uploaded_file(edited_file)

	        # Pick the extractor pair for this document type
	        if original_ext == ".pdf":
	            extract_text, extract_metadata = extract_text_pdf, extract_metadata_pdf
	        else:
	            extract_text, extract_metadata = extract_text_word, extract_metadata_word

	        try:
	            # Extract text and metadata
	            original_text = extract_text(original_file_path)
	            edited_text = extract_text(edited_file_path)
	            original_metadata = extract_metadata(original_file_path)
	            edited_metadata = extract_metadata(edited_file_path)
	        except Exception as exc:
	            # Top-level UI boundary: a corrupt or mislabelled upload should
	            # produce a readable message instead of a raw traceback.
	            st.error(f"Could not read the uploaded documents: {exc}")
	        else:
	            # Display Metadata
	            st.subheader("Metadata Comparison")
	            metadata_match = original_metadata == edited_metadata
	            st.write("Metadata Match:", metadata_match)

	            st.write("Original Document Metadata:")
	            st.write(original_metadata)

	            st.write("Edited Document Metadata:")
	            st.write(edited_metadata)

	            # Compare text
	            st.subheader("Text Comparison")
	            text_diff_html = compare_texts(original_text, edited_text)
	            similarity_score = calculate_similarity(original_text, edited_text)
	            similarity_percent = round(similarity_score * 100, 2)

	            st.write("Similarity Score:", similarity_percent, "%")
	            text_match = similarity_score == 1.0
	            st.write("Text Match:", text_match)

	            # Display highlighted text differences
	            st.write("Differences:")
	            st.components.v1.html(text_diff_html, height=400, scrolling=True)

	            # Report Generation
	            st.subheader("Report Summary")
	            st.write("Metadata Match:", metadata_match)
	            st.write("Text Match:", text_match)
	            st.write("Similarity Score:", similarity_percent, "%")
	else:
	    st.info("Please upload both the original and edited documents to proceed.")