madhavkotecha committed: Update app.py

app.py CHANGED
@@ -1,13 +1,13 @@
-
+import gradio as gr
 from ultralytics import YOLO
-from transformers import TrOCRProcessor, VisionEncoderDecoderModel, pipeline, AutoModelForMaskedLM
+from transformers import TrOCRProcessor, VisionEncoderDecoderModel, AutoModelForMaskedLM
 from PIL import Image
 import numpy as np
 import pandas as pd
+import tempfile
 from nltk.translate import bleu_score
 from nltk.translate.bleu_score import SmoothingFunction
 import torch
-import gradio as gr
 
 yolo_weights_path = "final_wts.pt"
 
@@ -15,15 +15,16 @@ device = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is
 
 processor = TrOCRProcessor.from_pretrained('microsoft/trocr-large-handwritten')
 trocr_model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-large-handwritten').to(device)
-trocr_model.config.num_beams =
+trocr_model.config.num_beams = 2
 
-yolo_model = YOLO(yolo_weights_path).to(
+yolo_model = YOLO(yolo_weights_path).to(device)
-unmasker_large = pipeline('fill-mask', model='roberta-large', device=device)
 roberta_model = AutoModelForMaskedLM.from_pretrained("roberta-large").to(device)
 
-print(f'TrOCR and YOLO Models loaded on {device}')
+print(f'TrOCR, YOLO and Roberta Models loaded on {device}')
 
+CONFIDENCE_THRESHOLD = 0.72
+BLEU_THRESHOLD = 0.6
 
 
 CONFIDENCE_THRESHOLD = 0.72
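For context: num_beams = 2 switches TrOCR's decoder from greedy decoding to a 2-way beam search, and the YOLO model is now explicitly moved to the selected device. A minimal sketch of the same load-and-generate pattern on a single cropped line image (the file name line.png is a hypothetical stand-in, not part of the commit):

import torch
from PIL import Image
from transformers import TrOCRProcessor, VisionEncoderDecoderModel

device = 'cuda' if torch.cuda.is_available() else 'cpu'
processor = TrOCRProcessor.from_pretrained('microsoft/trocr-large-handwritten')
model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-large-handwritten').to(device)
model.config.num_beams = 2  # beam width set by this commit

image = Image.open('line.png').convert('RGB')  # hypothetical cropped line image
pixel_values = processor(image, return_tensors='pt').pixel_values.to(device)
ids = model.generate(pixel_values)
print(processor.batch_decode(ids, skip_special_tokens=True)[0])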
@@ -61,7 +62,7 @@ def inference(image_path, debug=False, return_texts='final'):
     for i in range(len(generated_texts)):
         if len(generated_texts[i]) > 2 and generated_texts[i][:2] == '# ':
             generated_texts[i] = generated_texts[i][2:]
-
+
         if len(generated_texts[i]) > 2 and generated_texts[i][-2:] == ' #':
             generated_texts[i] = generated_texts[i][:-2]
     return generated_texts
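The hunk above only re-indents the blank line between the two cleanup branches; the surrounding code strips the '# ' prefix and ' #' suffix that TrOCR emits around some lines. The same cleanup can be written with str.removeprefix/str.removesuffix (Python 3.9+); a sketch, not what the commit ships:

def strip_hash_markers(texts):
    # Drop the '# ... #' markers around OCR lines, mirroring the loop above.
    return [t.removeprefix('# ').removesuffix(' #') for t in texts]

print(strip_hash_markers(['# hello world #', 'plain line']))  # ['hello world', 'plain line']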
@@ -107,11 +108,29 @@ def inference(image_path, debug=False, return_texts='final'):
         new = qualified_texts[i]['bleu'] < BLEU_THRESHOLD
         return final_texts
 
+    def get_lm_logits(ocr_tokens, confidence):
+        tokens = ocr_tokens.clone()
+        indices = torch.where(confidence < 0.5)
+        for i, j in zip(indices[0], indices[1]):
+            if i != 6:
+                continue
+            tokens[i, j] = torch.tensor(50264)
+        inputs = tokens.reshape(1, -1)
+        with torch.no_grad():
+            outputs = roberta_model(input_ids=inputs, attention_mask=torch.ones(inputs.shape).to(device))
+        lm_logits = outputs.logits
+        return lm_logits.reshape(ocr_tokens.shape[0], ocr_tokens.shape[1], -1), indices
+
+
+
     cropped_images, y, bounding_box_path = get_cropped_images(image_path)
     if debug:
         print('Number of cropped images:', len(cropped_images))
     generated_texts, logits, gen_tokens = get_model_output(cropped_images)
     normalised_scores = get_scores(logits)
+    generated_df = pd.DataFrame({
+        'text': generated_texts,
+    })
     if return_texts == 'generated':
         return pd.DataFrame({
             'text': generated_texts,
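The new get_lm_logits helper folds the old script-level experiment into inference(): token positions whose OCR confidence falls below 0.5 are overwritten with RoBERTa's mask token id (50264 for roberta-large) and the whole sequence is rescored by the masked language model. Note that the committed helper still carries the if i != 6: continue guard from the old experiment, so only the line at index 6 is masked. A self-contained sketch of the fill-mask rescoring idea (the sentence and the random confidences are stand-ins):

import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('roberta-large')
model = AutoModelForMaskedLM.from_pretrained('roberta-large')

tokens = tokenizer('The quick brown fox jumps', return_tensors='pt').input_ids
confidence = torch.rand(tokens.shape)               # stand-in for OCR confidences
masked = tokens.clone()
masked[confidence < 0.5] = tokenizer.mask_token_id  # 50264 for roberta-large
with torch.no_grad():
    lm_logits = model(input_ids=masked).logits      # (1, seq_len, vocab_size)
print(tokenizer.decode(lm_logits.argmax(-1)[0]))    # LM's best guess per position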
@@ -133,81 +152,47 @@ def inference(image_path, debug=False, return_texts='final'):
         return pd.DataFrame(qualified_texts)
     final_texts = remove_overlapping_texts(qualified_texts)
     final_texts_df = pd.DataFrame(final_texts, columns=['text', 'score', 'y'])
-    final_tokens = [text['tokens'] for text in final_texts]
     final_logits = [text['logits'] for text in final_texts]
+    logits = torch.stack([logit for logit in final_logits], dim=0)
+    tokens = logits.argmax(-1)
+    confidence = logits.softmax(-1).max(-1).values
     if return_texts == 'final':
         return final_texts_df
-
-    return final_texts_df, bounding_box_path, final_tokens, final_logits, generated_texts
-
-
-# image_path = "raw_dataset/g06-037h.png"
-# df, bounding_path, tokens, logits, gen_texts = inference(image_path, debug=False, return_texts='final_v2')
-
-
-
-def get_new_logits(tokens):
-    inputs = tokens.reshape(1, -1)
-    # Get the logits from the model
-    with torch.no_grad():
-        outputs = roberta_model(input_ids=inputs, attention_mask=torch.ones(inputs.shape).to(device))
-    logits = outputs.logits
-
-
-    logits_flattened = logits.reshape(-1, slogits.shape[-1])
-    print(processor.batch_decode([logits_flattened.argmax(-1)], skip_special_tokens=True))
-    return logits.reshape(tokens.shape + (logits.shape[-1],))
-
-
-slogits = torch.stack([logit for logit in logits], dim=0)
-tokens = slogits.argmax(-1)
-confidence = slogits.softmax(-1).max(-1).values
-indices = torch.where(confidence < 0.5)
-# put 50264(mask) when confidence < 0.5
-for i, j in zip(indices[0], indices[1]):
-    if i != 6:
-        continue
-    tokens[i, j] = torch.tensor(50264)
-
-new_logits = get_new_logits(tokens)
-
-
-
-
-for i, j in zip(indices[0], indices[1]):
-    slogits[i, j] = slogits[i, j] * 0.1 + new_logits[i, j] * 0.5
-
-logits_flattened = slogits.reshape(-1, slogits.shape[-1])
-processor.batch_decode([logits_flattened.argmax(-1)], skip_special_tokens=True)
+
+    lm_logits, indices = get_lm_logits(tokens, confidence)
+    combined_logits = logits.clone()
+    for i, j in zip(indices[0], indices[1]):
+        combined_logits[i, j] = logits[i, j] * 0.9 + lm_logits[i, j] * 0.1
+
+    return final_texts_df, bounding_box_path, tokens, combined_logits, confidence, generated_df
 
-def gradio_inference(image_path):
-    """
-    Function to handle inference and output the generated texts and final processed texts.
-    """
-    df, bounding_path, tokens, logits, gen_texts = inference(image_path, debug=False, return_texts='final_v2')
-
-    # Convert the DataFrame for final texts to a readable format
-    final_texts = df.to_string(index=False)
-
-    # Convert the list of generated texts into a readable string
-    gen_texts_output = '\n'.join(gen_texts)
-
-    return gen_texts_output, final_texts
-
+
+def process_image(image):
+    text, bounding_path = "", ""
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp_image:
+        image.save(temp_image.name)
+        image_path = temp_image.name
+    df, bounding_path, tokens, logits, confidence, generated_df = inference(image_path, debug=False, return_texts='final_v2')
+    text = df['text'].str.cat(sep='\n')
+    before_text = generated_df['text'].str.cat(sep='\n')
+    bounding_img = Image.open(bounding_path)
+    return bounding_img, before_text, text
 
+# Define Gradio Interface
 interface = gr.Interface(
-    fn=
-    inputs=
-    outputs=[
-
-
+    fn=process_image,  # Call the process_image function
+    inputs=gr.Image(type="pil"),  # Expect an image input
+    outputs=[
+        gr.Image(type="pil", label="Bounding Box Image"),
+        gr.Textbox(label="Extracted Text"),
+        gr.Textbox(label="Post Processed Text"),
+    ],
+    title="OCR Pipeline with YOLO, TrOCR and Roberta",
+    description="Upload an image to detect text regions with YOLO, merge bounding boxes, and extract text using TrOCR which is then preprocessed with Roberta for contextual understanding.",
 )
 
-interface
+# Launch the interface
+if __name__ == "__main__":
+    interface.launch()
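Net effect of this hunk: the module-level experiment (which decoded RoBERTa logits with the TrOCR processor and weighted OCR/LM logits 0.1/0.5) is replaced by an in-function interpolation that keeps the OCR signal dominant at weights 0.9/0.1, applied only at low-confidence positions. A sketch of the combination step with stand-in tensors (shapes are assumptions):

import torch

ocr_logits = torch.randn(3, 10, 50265)      # stand-in: (lines, seq_len, vocab)
lm_logits = torch.randn(3, 10, 50265)       # stand-in for RoBERTa fill-mask logits
confidence = ocr_logits.softmax(-1).max(-1).values

combined = ocr_logits.clone()
rows, cols = torch.where(confidence < 0.5)  # rescore only low-confidence positions
combined[rows, cols] = ocr_logits[rows, cols] * 0.9 + lm_logits[rows, cols] * 0.1
tokens = combined.argmax(-1)                # final token ids per line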
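Usage note: with fn=process_image and inputs=gr.Image(type="pil"), Gradio passes the handler a PIL image, which is why the commit writes it to a temporary PNG before calling the path-based inference(). The handler can be exercised outside the UI like this (the sample scan is a hypothetical stand-in):

from PIL import Image

img = Image.open('sample_page.png')               # hypothetical handwriting scan
bounding_img, before_text, after_text = process_image(img)
print(before_text)   # raw TrOCR output per detected region
print(after_text)    # text after filtering and RoBERTa rescoring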