Spaces:

SathvikGanta
/

UC2_Image_Based_PDF_omparison

Sleeping

App Files Community

Update app.py

by SuriRaja - opened Dec 1, 2024

base: refs/heads/main

←

from: refs/pr/1

Discussion Files changed

+41

-72

Files changed (1) hide show

app.py +41 -72

app.py CHANGED Viewed

@@ -1,86 +1,43 @@
-import gradio as gr
-import fitz  # PyMuPDF
-import cv2
-from pdf2image import convert_from_path
-import pytesseract
-import numpy as np
-import os
-from fpdf import FPDF
-# Convert PDFs to images
-def convert_pdf_to_images(pdf_path, dpi=300):
-    images = convert_from_path(pdf_path, dpi=dpi, poppler_path="/usr/bin")
-    return [cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR) for image in images]
-# Align images
-def align_images(img1, img2):
-    gray1 = cv2.cvtColor(img1, cv2.COLOR_BGR2GRAY)
-    gray2 = cv2.cvtColor(img2, cv2.COLOR_BGR2GRAY)
-    orb = cv2.ORB_create()
-    kp1, des1 = orb.detectAndCompute(gray1, None)
-    kp2, des2 = orb.detectAndCompute(gray2, None)
-    bf = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
-    matches = bf.match(des1, des2)
-    matches = sorted(matches, key=lambda x: x.distance)
-    src_pts = np.float32([kp1[m.queryIdx].pt for m in matches]).reshape(-1, 1, 2)
-    dst_pts = np.float32([kp2[m.trainIdx].pt for m in matches]).reshape(-1, 1, 2)
-    matrix, _ = cv2.findHomography(src_pts, dst_pts, cv2.RANSAC, 5.0)
-    # Validate if alignment is good enough
-    if matrix is None or len(matches) < 10:  # Check if sufficient matches exist
-        raise ValueError("Alignment failed. Insufficient matches between images.")
-    aligned_img = cv2.warpPerspective(img2, matrix, (img1.shape[1], img1.shape[0]))
-    return aligned_img
-# Compare images with noise reduction and filtering
-def compare_images(img1, img2):
-    diff = cv2.absdiff(img1, img2)
-    gray_diff = cv2.cvtColor(diff, cv2.COLOR_BGR2GRAY)
-    # Apply Gaussian blur to reduce noise
-    blurred_diff = cv2.GaussianBlur(gray_diff, (5, 5), 0)
-    # Apply thresholding
-    _, thresh = cv2.threshold(blurred_diff, 40, 255, cv2.THRESH_BINARY)
-    # Morphological operations to smooth out noise
-    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
-    cleaned = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel)
-    # Filter out small regions
-    contours, _ = cv2.findContours(cleaned, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-    filtered_mask = np.zeros_like(cleaned)
-    for cnt in contours:
-        if cv2.contourArea(cnt) > 100:  # Ignore small differences (area < 100 pixels)
-            cv2.drawContours(filtered_mask, [cnt], -1, 255, -1)
-    return filtered_mask
-# Highlight changes
 def highlight_changes(img, mask):
     overlay = img.copy()
     contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
     for cnt in contours:
-        if cv2.contourArea(cnt) > 100:  # Filter based on area to reduce false positives
-            x, y, w, h = cv2.boundingRect(cnt)
-            cv2.rectangle(overlay, (x, y), (x + w, y + h), (0, 0, 255), 2)  # Red for changes
-    return overlay
-# Generate comparison PDF
 def generate_comparison_pdf(original_pdf, edited_pdf):
     original_images = convert_pdf_to_images(original_pdf)
     edited_images = convert_pdf_to_images(edited_pdf)
     combined_images = []
-    for orig_img, edit_img in zip(original_images, edited_images):
         aligned_img = align_images(orig_img, edit_img)
         diff_mask = compare_images(orig_img, aligned_img)
-        highlighted_img = highlight_changes(edit_img, diff_mask)
         # Ensure dimensions match
         height = min(orig_img.shape[0], highlighted_img.shape[0])
         orig_img_resized = orig_img[:height]
         highlighted_img_resized = highlighted_img[:height]
         combined_images.append(np.hstack((orig_img_resized, highlighted_img_resized)))
     output_path = "outputs/comparison_result.pdf"
     pdf = FPDF()
     for img in combined_images:
@@ -89,8 +46,17 @@ def generate_comparison_pdf(original_pdf, edited_pdf):
         pdf.add_page()
         pdf.image(temp_path, x=10, y=10, w=190)
         os.remove(temp_path)
     pdf.output(output_path)
-    return output_path
 # Gradio interface function
 def pdf_comparison(original_pdf, edited_pdf):
@@ -103,8 +69,8 @@ def pdf_comparison(original_pdf, edited_pdf):
         return "Error: File size exceeds 50 MB. Please upload smaller files."
     # Proceed with PDF comparison
-    result_path = generate_comparison_pdf(original_pdf.name, edited_pdf.name)
-    return result_path
 # Gradio interface
 interface = gr.Interface(
@@ -113,7 +79,10 @@ interface = gr.Interface(
         gr.File(label="Upload Original PDF", file_types=[".pdf"]),
         gr.File(label="Upload Edited PDF", file_types=[".pdf"])
     ],
-    outputs=gr.File(label="Download Comparison Report"),
 )
 if __name__ == "__main__":

+# Highlight changes and categorize small and large differences
 def highlight_changes(img, mask):
     overlay = img.copy()
     contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    summary = []
     for cnt in contours:
+        area = cv2.contourArea(cnt)
+        x, y, w, h = cv2.boundingRect(cnt)
+        if area > 500:  # Major differences
+            cv2.rectangle(overlay, (x, y), (x + w, y + h), (0, 0, 255), 2)  # Red
+            summary.append(f"Major Difference: Location=({x}, {y}), Size=({w}x{h}), Area={area}")
+        elif 100 < area <= 500:  # Small differences
+            cv2.rectangle(overlay, (x, y), (x + w, y + h), (255, 0, 0), 2)  # Blue
+            summary.append(f"Small Difference: Location=({x}, {y}), Size=({w}x{h}), Area={area}")
+    return overlay, summary
+# Generate comparison PDF with detailed summary
 def generate_comparison_pdf(original_pdf, edited_pdf):
     original_images = convert_pdf_to_images(original_pdf)
     edited_images = convert_pdf_to_images(edited_pdf)
     combined_images = []
+    all_summaries = []
+    for page_num, (orig_img, edit_img) in enumerate(zip(original_images, edited_images), start=1):
         aligned_img = align_images(orig_img, edit_img)
         diff_mask = compare_images(orig_img, aligned_img)
+        highlighted_img, summary = highlight_changes(edit_img, diff_mask)
+        # Add page number to summary
+        page_summary = [f"Page {page_num}:"]
+        page_summary.extend(summary)
+        all_summaries.extend(page_summary)
         # Ensure dimensions match
         height = min(orig_img.shape[0], highlighted_img.shape[0])
         orig_img_resized = orig_img[:height]
         highlighted_img_resized = highlighted_img[:height]
         combined_images.append(np.hstack((orig_img_resized, highlighted_img_resized)))
+    # Generate the PDF
     output_path = "outputs/comparison_result.pdf"
     pdf = FPDF()
     for img in combined_images:
         pdf.add_page()
         pdf.image(temp_path, x=10, y=10, w=190)
         os.remove(temp_path)
+    # Add detailed summary to the PDF
+    summary_path = "outputs/summary.txt"
+    with open(summary_path, "w") as f:
+        f.write("\n".join(all_summaries))
+    pdf.add_page()
+    pdf.set_font("Arial", size=12)
+    pdf.multi_cell(0, 10, "\n".join(all_summaries))
     pdf.output(output_path)
+    return output_path, summary_path
 # Gradio interface function
 def pdf_comparison(original_pdf, edited_pdf):
         return "Error: File size exceeds 50 MB. Please upload smaller files."
     # Proceed with PDF comparison
+    result_path, summary_path = generate_comparison_pdf(original_pdf.name, edited_pdf.name)
+    return result_path, summary_path
 # Gradio interface
 interface = gr.Interface(
         gr.File(label="Upload Original PDF", file_types=[".pdf"]),
         gr.File(label="Upload Edited PDF", file_types=[".pdf"])
     ],
+    outputs=[
+        gr.File(label="Download Comparison Report"),
+        gr.File(label="Download Detailed Summary")
+    ],
 )
 if __name__ == "__main__":