Spaces:

wjbmattingly
/

medieval-htr-page

Running on Zero

App Files Files Community

wjbmattingly committed on Aug 9, 2024

Commit

0d4066e

verified ·

1 Parent(s): 228b5a6

Update app.py

Browse files

Files changed (1) hide show

app.py +94 -58

app.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import gradio as gr
-from transformers import TrOCRProcessor, VisionEncoderDecoderModel
 import torch
 import subprocess
 import json
 from PIL import Image, ImageDraw
@@ -23,97 +23,133 @@ MODEL_OPTIONS = {
     "Medieval Print": "medieval-data/trocr-medieval-print"
 }
 # Pre-commit version of the loader: the most recently loaded processor/model pair
 # is kept in the three module-level globals below.
-# Global variables to store the current model and processor
-current_model = None
-current_processor = None
-current_model_name = None
 def load_model(model_name):
     # Return (processor, model) for the MODEL_OPTIONS entry named model_name.
     # The checkpoint is (re)loaded only when model_name differs from the cached name.
-    global current_model, current_processor, current_model_name
-    if model_name != current_model_name:
-        model_id = MODEL_OPTIONS[model_name]
-        current_processor = TrOCRProcessor.from_pretrained(model_id)
-        current_model = VisionEncoderDecoderModel.from_pretrained(model_id)
-        current_model_name = model_name
-        # Move model to GPU if available, else use CPU
-        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-        current_model = current_model.to(device)
-    return current_processor, current_model
-def process_image(image, model_name):
     # Pre-commit pipeline: segment the page with Kraken, transcribe each detected line
     # with the selected TrOCR model, and return (image, newline-joined transcriptions).
     # The returned image is the input image itself with the baselines drawn on it.
-    # Save the uploaded image to a temporary file
-    with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_img:
-        image.save(temp_img, format="JPEG")
-        temp_img_path = temp_img.name
     # Run Kraken for line detection
     lines_json_path = "lines.json"
-    kraken_command = f"kraken -i {temp_img_path} {lines_json_path} binarize segment -bl"
     subprocess.run(kraken_command, shell=True, check=True)
     # Load the lines from the JSON file
     with open(lines_json_path, 'r') as f:
         lines_data = json.load(f)
-    processor, model = load_model(model_name)
-    # Determine device
-    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-    # Process each line
-    transcriptions = []
-    for line in lines_data['lines']:
-        # Extract line coordinates
-        x1, y1 = line['baseline'][0]
-        x2, y2 = line['baseline'][-1]
         # NOTE(review): the crop box is built from the first and last baseline points only,
         # so its height is just the vertical offset between those two points - confirm
         # that this really captures the text line.
         # Crop the line from the original image
         line_image = image.crop((x1, y1, x2, y2))
-        # Convert to RGB mode (3 channels)
-        line_image = line_image.convert('RGB')
-        # Prepare image for TrOCR
-        pixel_values = processor(line_image, return_tensors="pt").pixel_values
-        pixel_values = pixel_values.to(device)
         # Generate (no beam search)
-        with torch.no_grad():
-            generated_ids = model.generate(pixel_values)
         # Decode
         generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
         transcriptions.append(generated_text)
-    # Clean up temporary files
-    os.unlink(temp_img_path)
-    os.unlink(lines_json_path)
     # The drawing below happens on the caller's image object (no copy is made first).
-    # Create an image with bounding boxes
-    draw = ImageDraw.Draw(image)
-    for line in lines_data['lines']:
-        coords = line['baseline']
-        draw.line(coords, fill="red", width=2)
-    return image, "\n".join(transcriptions)
 # Gradio interface
 # Pre-commit UI: a row with the image input and model dropdown, a row with the two
 # outputs, and a button that runs process_image on click.
 with gr.Blocks() as iface:
-    gr.Markdown("# Medieval Document Transcription")
-    gr.Markdown("Upload an image of a medieval document and select a model to transcribe it. The tool will detect lines and transcribe each line separately.")
-    with gr.Row():
-        input_image = gr.Image(type="pil", label="Input Image")
-        model_dropdown = gr.Dropdown(choices=list(MODEL_OPTIONS.keys()), label="Select Model", value="Medieval Base")
     with gr.Row():
         output_image = gr.Image(type="pil", label="Detected Lines")
-        transcription_output = gr.Textbox(label="Transcription", lines=10)
-    submit_button = gr.Button("Transcribe")
     # The handler's two return values map, in order, onto the two output components.
-    submit_button.click(fn=process_image, inputs=[input_image, model_dropdown], outputs=[output_image, transcription_output])
-iface.launch(share=True)

 import gradio as gr
 import torch
+from transformers import TrOCRProcessor, VisionEncoderDecoderModel
 import subprocess
 import json
 from PIL import Image, ImageDraw
     "Medieval Print": "medieval-data/trocr-medieval-print"
 }
 def load_model(model_name):
+    model_id = MODEL_OPTIONS[model_name]
+    processor = TrOCRProcessor.from_pretrained(model_id)
+    model = VisionEncoderDecoderModel.from_pretrained(model_id)
+    # Move model to GPU if available, else use CPU
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    model = model.to(device)
+    return processor, model
+def detect_lines(image_path):
     # Run Kraken for line detection
     lines_json_path = "lines.json"
+    kraken_command = f"kraken -i {image_path} {lines_json_path} segment -bl"
     subprocess.run(kraken_command, shell=True, check=True)
     # Load the lines from the JSON file
     with open(lines_json_path, 'r') as f:
         lines_data = json.load(f)
+    # Clean up temporary file
+    os.unlink(lines_json_path)
+    return lines_data['lines']
+def extract_line_images(image, lines):
+    line_images = []
+    for line in lines:
+        polygon = line['boundary']
+        # Calculate bounding box
+        x_coords, y_coords = zip(*polygon)
+        x1, y1, x2, y2 = int(min(x_coords)), int(min(y_coords)), int(max(x_coords)), int(max(y_coords))
         # Crop the line from the original image
         line_image = image.crop((x1, y1, x2, y2))
+        # Create a mask for the polygon
+        mask = Image.new('L', (x2-x1, y2-y1), 0)
+        adjusted_polygon = [(int(x-x1), int(y-y1)) for x, y in polygon]
+        ImageDraw.Draw(mask).polygon(adjusted_polygon, outline=255, fill=255)
+        # Convert images to numpy arrays
+        line_array = np.array(line_image)
+        mask_array = np.array(mask)
+        # Apply the mask
+        masked_line = np.where(mask_array[:,:,np.newaxis] == 255, line_array, 255)
+        # Convert back to PIL Image
+        masked_line_image = Image.fromarray(masked_line.astype('uint8'), 'RGB')
+        line_images.append(masked_line_image)
+    return line_images
+def visualize_lines(image, lines):
+    output_image = image.copy()
+    draw = ImageDraw.Draw(output_image)
+    for line in lines:
+        polygon = [(int(x), int(y)) for x, y in line['boundary']]
+        draw.polygon(polygon, outline="red")
+    return output_image
+def transcribe_lines(line_images, model_name):
+    processor, model = load_model(model_name)
+    transcriptions = []
+    for line_image in line_images:
+        # Process the line image
+        pixel_values = processor(images=line_image, return_tensors="pt").pixel_values
         # Generate (no beam search)
+        generated_ids = model.generate(pixel_values)
         # Decode
         generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
         transcriptions.append(generated_text)
+    return transcriptions
+def process_document(image, model_name):
+    # Save the uploaded image temporarily
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as temp_file:
+        image.save(temp_file, format="JPEG")
+        temp_file_path = temp_file.name
+    # Step 1: Detect lines
+    lines = detect_lines(temp_file_path)
+    # Visualize detected lines
+    output_image = visualize_lines(image, lines)
+    # Step 2: Extract line images
+    line_images = extract_line_images(image, lines)
+    # Step 3: Transcribe lines
+    transcriptions = transcribe_lines(line_images, model_name)
+    # Clean up temporary file
+    os.unlink(temp_file_path)
+    return output_image, "\n".join(transcriptions)
 # Gradio interface
+def gradio_process_document(image, model_name):
+    output_image, transcriptions = process_document(image, model_name)
+    return output_image, transcriptions
 # Build the page: a title and description, a column with the inputs and the trigger
 # button, then a row with the two outputs.
 with gr.Blocks() as iface:
+    gr.Markdown("# Document OCR and Transcription")
+    gr.Markdown("Upload an image and select a model to detect lines and transcribe the text.")
+    with gr.Column():
+        input_image = gr.Image(type="pil", label="Upload Image", height=300, width=300)  # Adjusted size here
         # Choices are the keys of MODEL_OPTIONS; "Medieval Base" is preselected.
+        model_dropdown = gr.Dropdown(choices=list(MODEL_OPTIONS.keys()), value="Medieval Base", label="Select Model")
+        submit_button = gr.Button("Process")
     with gr.Row():
         output_image = gr.Image(type="pil", label="Detected Lines")
+        output_text = gr.Textbox(label="Transcription")
     # The handler's two return values map, in order, onto output_image and output_text.
+    submit_button.click(
+        fn=gradio_process_document,
+        inputs=[input_image, model_dropdown],
+        outputs=[output_image, output_text]
+    )
 # Start the app once the layout and the click handler are in place.
+iface.launch()