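# NOTE: the next three lines appear to be residue from the hosting page
# (avatar caption, commit message, commit hash), kept here as comments.
# RenAzum's picture
# upload file fix
# 7ec8f89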
import streamlit as st
import fitz # PyMuPDF
import docx
from difflib import HtmlDiff, SequenceMatcher
import os
# Directory to save uploaded files
UPLOAD_DIR = "uploaded_files"
# exist_ok=True creates the directory only when it is missing and avoids the
# check-then-create race when several script runs start at the same moment.
os.makedirs(UPLOAD_DIR, exist_ok=True)
# Functions to save, extract text, and metadata
def save_uploaded_file(uploaded_file):
    """Write an uploaded file into ``UPLOAD_DIR`` and return the saved path.

    The stored file name is ``<content-hash>_<original base name>``, so the
    original extension is preserved, two uploads that share a name but differ
    in content never overwrite each other, and re-saving the same upload
    reuses the same path instead of piling up copies.

    Args:
        uploaded_file: Object exposing ``name`` (the client-supplied file
            name) and ``getbuffer()`` (the raw file bytes).

    Returns:
        Path of the written file inside ``UPLOAD_DIR``.
    """
    import hashlib  # local import: only this function needs it

    # The name is supplied by the client. Keep only its last path component
    # (treating backslashes as separators too) so it cannot escape UPLOAD_DIR.
    base_name = os.path.basename(uploaded_file.name.replace("\\", "/")) or "upload"

    data = uploaded_file.getbuffer()
    # A short content digest keeps same-named uploads with different bytes apart.
    digest = hashlib.sha256(data).hexdigest()[:16]

    os.makedirs(UPLOAD_DIR, exist_ok=True)
    file_path = os.path.join(UPLOAD_DIR, f"{digest}_{base_name}")
    with open(file_path, "wb") as f:
        f.write(data)
    return file_path
def extract_text_pdf(file_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        A single string made of each page's ``get_text()`` output, with no
        separator added between pages.
    """
    # The context manager closes the document (and its file handle) even if
    # extracting a page raises.
    with fitz.open(file_path) as doc:
        return "".join(page.get_text() for page in doc)
def extract_text_word(file_path):
    """Return the paragraph text of a Word document, one paragraph per line.

    Args:
        file_path: Path of the .docx file to read.

    Returns:
        The ``text`` of each entry in the document's ``paragraphs``, joined
        with newline characters.
    """
    paragraphs = docx.Document(file_path).paragraphs
    return "\n".join(paragraph.text for paragraph in paragraphs)
def extract_metadata_pdf(file_path):
    """Return the metadata of the PDF at *file_path*.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The value of the opened document's ``metadata`` attribute.
    """
    # Close the document as soon as the metadata has been read so the file
    # handle is not left open.
    with fitz.open(file_path) as doc:
        return doc.metadata
def extract_metadata_word(file_path):
    """Return selected core properties of the Word document at *file_path*.

    Args:
        file_path: Path of the .docx file to read.

    Returns:
        A dict with the keys ``"author"``, ``"created"`` and ``"modified"``,
        each holding the same-named attribute of the document's
        ``core_properties``.
    """
    properties = docx.Document(file_path).core_properties
    wanted_fields = ("author", "created", "modified")
    return {field: getattr(properties, field) for field in wanted_fields}
# Function to compare text and return highlighted HTML differences
def compare_texts(text1, text2):
    """Build a complete HTML page showing the line differences of two texts.

    Args:
        text1: Text shown on the left ("from") side of the comparison.
        text2: Text shown on the right ("to") side of the comparison.

    Returns:
        The HTML produced by ``HtmlDiff.make_file`` in contextual mode, with
        two lines of context around each change.
    """
    before_lines = text1.splitlines()
    after_lines = text2.splitlines()
    html_builder = HtmlDiff()
    return html_builder.make_file(
        before_lines,
        after_lines,
        context=True,
        numlines=2,
    )
# Function to calculate similarity score
def calculate_similarity(text1, text2, *, autojunk=True):
    """Return a similarity ratio between 0.0 and 1.0 for two texts.

    Args:
        text1: First text.
        text2: Second text.
        autojunk: Passed through to ``SequenceMatcher``. The default keeps the
            library's heuristic, which is fast but ignores very frequent
            characters as match anchors once the second text has 200 or more
            characters, so scores for long texts are approximate. Pass
            ``False`` for an exact (and potentially much slower) comparison.

    Returns:
        ``1.0`` when the texts are equal, otherwise
        ``SequenceMatcher.ratio()`` for the two texts.
    """
    # With the autojunk heuristic, identical long texts can score below 1.0
    # (even 0.0 when every character is frequent). Equal inputs are a perfect
    # match by definition, so answer that case directly.
    if text1 == text2:
        return 1.0
    matcher = SequenceMatcher(None, text1, text2, autojunk=autojunk)
    return matcher.ratio()
# Streamlit App Interface
st.title("Document Edit Detection POC")
st.write("Upload both the original and edited documents below:")

# File upload
original_file = st.file_uploader("Upload Original Document", type=["pdf", "docx"])
edited_file = st.file_uploader("Upload Edited Document", type=["pdf", "docx"])

# Process if both files are uploaded
if original_file and edited_file:
    # Identify file types. Lower-casing makes "REPORT.PDF" and "report.pdf"
    # count as the same type and keeps an upper-case ".PDF" out of the Word
    # branch below.
    original_ext = os.path.splitext(original_file.name)[1].lower()
    edited_ext = os.path.splitext(edited_file.name)[1].lower()

    # Check if both files are of the same type (before anything is written
    # to disk, so a rejected pair leaves no files behind)
    if original_ext != edited_ext:
        st.error("Both documents must be of the same type (PDF or DOCX).")
    else:
        # Pick the extractor pair that matches the document type
        if original_ext == ".pdf":
            extract_text, extract_metadata = extract_text_pdf, extract_metadata_pdf
        else:
            extract_text, extract_metadata = extract_text_word, extract_metadata_word

        # Save and fully read the original before the edited file is saved:
        # if both uploads resolve to the same path on disk, the original has
        # already been read by the time the edited file is written.
        original_file_path = save_uploaded_file(original_file)
        original_text = extract_text(original_file_path)
        original_metadata = extract_metadata(original_file_path)

        edited_file_path = save_uploaded_file(edited_file)
        edited_text = extract_text(edited_file_path)
        edited_metadata = extract_metadata(edited_file_path)

        # Display Metadata
        st.subheader("Metadata Comparison")
        metadata_match = original_metadata == edited_metadata
        st.write("Metadata Match:", metadata_match)
        st.write("Original Document Metadata:")
        st.write(original_metadata)
        st.write("Edited Document Metadata:")
        st.write(edited_metadata)

        # Compare text
        st.subheader("Text Comparison")
        text_diff_html = compare_texts(original_text, edited_text)
        similarity_score = calculate_similarity(original_text, edited_text)
        # Computed once and shown both here and in the report summary
        similarity_percent = round(similarity_score * 100, 2)
        st.write("Similarity Score:", similarity_percent, "%")
        text_match = similarity_score == 1.0
        st.write("Text Match:", text_match)

        # Display highlighted text differences
        st.write("Differences:")
        st.components.v1.html(text_diff_html, height=400, scrolling=True)

        # Report Generation
        st.subheader("Report Summary")
        st.write("Metadata Match:", metadata_match)
        st.write("Text Match:", text_match)
        st.write("Similarity Score:", similarity_percent, "%")
else:
    st.info("Please upload both the original and edited documents to proceed.")