doc-analysis

Sleeping

File size: 4,241 Bytes

import streamlit as st
import fitz  # PyMuPDF
import docx
from difflib import HtmlDiff, SequenceMatcher
import os

# Directory where uploaded files are written before being parsed.
UPLOAD_DIR = "uploaded_files"
# exist_ok=True avoids the check-then-create race: two sessions starting at
# the same time could both see the directory missing and one would crash.
os.makedirs(UPLOAD_DIR, exist_ok=True)

# Functions to save, extract text, and metadata
def save_uploaded_file(uploaded_file):
    """Write an uploaded file into ``UPLOAD_DIR`` and return the saved path.

    Args:
        uploaded_file: Object exposing ``name`` (the client-supplied file
            name) and ``getbuffer()`` (the file's bytes).

    Returns:
        Path of the written file inside ``UPLOAD_DIR``.

    Raises:
        ValueError: If the supplied name has no usable file-name component.
    """
    # The name is supplied by the client; keep only its final path component
    # so a name such as "../../app.py" cannot be written outside UPLOAD_DIR.
    # Backslashes are normalised first because os.path.basename does not
    # treat them as separators on POSIX.
    safe_name = os.path.basename(uploaded_file.name.replace("\\", "/"))
    if safe_name in ("", ".", ".."):
        raise ValueError(f"Invalid upload file name: {uploaded_file.name!r}")

    # Re-create the directory in case it was removed after start-up.
    os.makedirs(UPLOAD_DIR, exist_ok=True)

    file_path = os.path.join(UPLOAD_DIR, safe_name)
    with open(file_path, "wb") as f:
        f.write(uploaded_file.getbuffer())
    return file_path

def extract_text_pdf(file_path):
    """Return the text of every page of the PDF at *file_path*, concatenated.

    Args:
        file_path: Path to a PDF file that PyMuPDF (``fitz``) can open.

    Returns:
        One string made of each page's ``get_text()`` output, in page order,
        with nothing inserted between pages.
    """
    # The context manager closes the document (and its file handle) even if
    # extraction raises; previously the document was never closed.
    with fitz.open(file_path) as doc:
        # join() builds the result in one pass instead of growing a string
        # with += once per page.
        return "".join(page.get_text() for page in doc)

def extract_text_word(file_path):
    """Return the paragraph text of the Word document at *file_path*.

    Args:
        file_path: Path to a ``.docx`` file readable by ``python-docx``.

    Returns:
        The ``text`` of each entry in the document's ``paragraphs``,
        joined with newline characters.
    """
    paragraphs = docx.Document(file_path).paragraphs
    return "\n".join(paragraph.text for paragraph in paragraphs)

def extract_metadata_pdf(file_path):
    """Return the metadata of the PDF at *file_path*.

    Args:
        file_path: Path to a PDF file that PyMuPDF (``fitz``) can open.

    Returns:
        The value of the opened document's ``metadata`` attribute.
    """
    # Close the document once the metadata has been read; previously the
    # handle was left open for the garbage collector to clean up.
    with fitz.open(file_path) as doc:
        return doc.metadata

def extract_metadata_word(file_path):
    """Return selected core properties of the Word document at *file_path*.

    Args:
        file_path: Path to a ``.docx`` file readable by ``python-docx``.

    Returns:
        A dict with the keys ``"author"``, ``"created"`` and ``"modified"``,
        each holding the same-named attribute of the document's
        ``core_properties``.
    """
    properties = docx.Document(file_path).core_properties
    wanted = ("author", "created", "modified")
    return {field: getattr(properties, field) for field in wanted}

# Render the line-by-line differences between two texts as an HTML page
def compare_texts(text1, text2):
    """Return a complete HTML document highlighting how *text2* differs from *text1*.

    Both inputs are split into lines and handed to ``difflib.HtmlDiff``.
    Only changed regions are shown, each with two lines of surrounding
    context.

    Args:
        text1: The original text.
        text2: The edited text.

    Returns:
        The HTML produced by ``HtmlDiff.make_file`` as a string.
    """
    original_lines = text1.splitlines()
    edited_lines = text2.splitlines()
    return HtmlDiff().make_file(
        fromlines=original_lines,
        tolines=edited_lines,
        context=True,
        numlines=2,
    )

# Function to calculate similarity score
def calculate_similarity(text1, text2):
    """Return a character-level similarity ratio between two texts.

    Args:
        text1: The original text.
        text2: The edited text.

    Returns:
        A float in ``[0.0, 1.0]``; ``1.0`` means the texts are identical.
    """
    # Identical inputs are a perfect match by definition; this also skips
    # the expensive matching pass in the common "nothing changed" case.
    if text1 == text2:
        return 1.0

    # autojunk must be off: with the default heuristic, any character that
    # makes up more than 1% of a 200+ character text is dropped from
    # matching, which badly distorts character-level scores on real
    # documents (identical texts could even score 0.0).
    # NOTE: disabling it makes the comparison slower on very large texts.
    matcher = SequenceMatcher(None, text1, text2, autojunk=False)
    return matcher.ratio()

# Streamlit App Interface
st.title("Document Edit Detection POC")

st.write("Upload both the original and edited documents below:")

# File upload
original_file = st.file_uploader("Upload Original Document", type=["pdf", "docx"])
edited_file = st.file_uploader("Upload Edited Document", type=["pdf", "docx"])


def _load_document(uploaded_file, ext):
    """Save one upload to disk and return its ``(text, metadata)`` pair.

    Args:
        uploaded_file: The uploaded file object to persist and parse.
        ext: Lower-cased file extension including the dot; ``".pdf"``
            selects the PDF extractors, anything else the Word extractors.
    """
    file_path = save_uploaded_file(uploaded_file)
    if ext == ".pdf":
        return extract_text_pdf(file_path), extract_metadata_pdf(file_path)
    return extract_text_word(file_path), extract_metadata_word(file_path)


# Process if both files are uploaded
if original_file and edited_file:
    # Identify file types. Lower-case the extension so that "REPORT.PDF" is
    # handled as a PDF rather than falling through to the Word branch.
    original_ext = os.path.splitext(original_file.name)[1].lower()
    edited_ext = os.path.splitext(edited_file.name)[1].lower()

    # Check if both files are of the same type
    if original_ext != edited_ext:
        st.error("Both documents must be of the same type (PDF or DOCX).")
    else:
        # Save and read each document fully before touching the next one.
        # Uploads are stored under their own file name, so when both files
        # share a name (the usual case for an edited copy) saving both first
        # would overwrite the original and compare the edited file with
        # itself.
        original_text, original_metadata = _load_document(original_file, original_ext)
        edited_text, edited_metadata = _load_document(edited_file, edited_ext)

        # Display Metadata
        st.subheader("Metadata Comparison")
        metadata_match = original_metadata == edited_metadata
        st.write("Metadata Match:", metadata_match)

        st.write("Original Document Metadata:")
        st.write(original_metadata)

        st.write("Edited Document Metadata:")
        st.write(edited_metadata)

        # Compare text
        st.subheader("Text Comparison")
        text_diff_html = compare_texts(original_text, edited_text)
        similarity_score = calculate_similarity(original_text, edited_text)
        similarity_percent = round(similarity_score * 100, 2)

        st.write("Similarity Score:", similarity_percent, "%")
        # Decide "match" by direct comparison rather than by testing a float
        # similarity ratio for exact equality with 1.0.
        text_match = original_text == edited_text
        st.write("Text Match:", text_match)

        # Display highlighted text differences
        st.write("Differences:")
        st.components.v1.html(text_diff_html, height=400, scrolling=True)

        # Report Generation
        st.subheader("Report Summary")
        st.write("Metadata Match:", metadata_match)
        st.write("Text Match:", text_match)
        st.write("Similarity Score:", similarity_percent, "%")
else:
    st.info("Please upload both the original and edited documents to proceed.")