acverma committed on
Commit
3e11b40
1 Parent(s): c900483
Files changed (1)
  1. app.py +78 -55
app.py CHANGED
@@ -20,13 +20,13 @@ os.system('pip install torch==1.8.0+cu101 torchvision==0.9.0+cu101 -f https://do
 os.system('pip install -q detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu101/torch1.8/index.html')
 os.system('pip install -q pytesseract')
 
-#!pip install gradio
+!pip install gradio
 
-#!pip install -q git+https://github.com/huggingface/transformers.git
+!pip install -q git+https://github.com/huggingface/transformers.git
 
-#!pip install h5py
+!pip install h5py
 
-#!pip install -q datasets seqeval
+!pip install -q datasets seqeval
 
 import gradio as gr
 
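Note that `!pip install ...` is IPython/Jupyter magic syntax; executed as a plain `app.py` under the Python interpreter, those lines raise a `SyntaxError`. A minimal sketch of the same installs written as script-safe calls, mirroring the `os.system` pattern this file already uses for detectron2 and pytesseract:

import os

# Sketch: script-safe equivalents of the notebook-style "!pip install" lines above.
os.system('pip install gradio')
os.system('pip install -q git+https://github.com/huggingface/transformers.git')
os.system('pip install h5py')
os.system('pip install -q datasets seqeval')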
@@ -42,19 +42,16 @@ from transformers import AutoProcessor
 from datasets import Features, Sequence, ClassLabel, Value, Array2D, Array3D
 from datasets import load_dataset # this dataset uses the new Image feature :)
 
-from transformers import LayoutLMv3ForTokenClassification
-from transformers import AutoModelForTokenClassification
+from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification
 
 #import cv2
 from PIL import Image, ImageDraw, ImageFont
 
-dataset = load_dataset("nielsr/funsd-layoutlmv3")
-
-example = dataset["test"][0]
+processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=True)
 
-#image_path = "/root/.cache/huggingface/datasets/nielsr___funsd-layoutlmv3/funsd/1.0.0/0e3f4efdfd59aa1c3b4952c517894f7b1fc4d75c12ef01bcc8626a69e41c1bb9/funsd-layoutlmv3-test.arrow"
+model = LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-base")
 
-image_path = '/root/.cache/huggingface/datasets/nielsr___funsd-layoutlmv3/funsd/1.0.0/0e3f4efdfd59aa1c3b4952c517894f7b1fc4d75c12ef01bcc8626a69e41c1bb9'
+dataset = load_dataset("nielsr/funsd-layoutlmv3")
 
 example = dataset["test"][0]
 example["image"].save("example1.png")
@@ -65,22 +62,23 @@ example1["image"].save("example2.png")
 example2 = dataset["test"][2]
 example2["image"].save("example3.png")
 
-example2["image"]
+#example2["image"]
 
-#Image.open(dataset[2][image_path]).convert("RGB").save("example1.png")
-#Image.open(dataset[1]["image_path"]).convert("RGB").save("example2.png")
-#Image.open(dataset[0]["image_path"]).convert("RGB").save("example3.png")
+labels = dataset["test"].features['ner_tags'].feature.names
 
 words, boxes, ner_tags = example["tokens"], example["bboxes"], example["ner_tags"]
 
 features = dataset["test"].features
 
 column_names = dataset["test"].column_names
+
 image_column_name = "image"
 text_column_name = "tokens"
 boxes_column_name = "bboxes"
 label_column_name = "ner_tags"
 
+# In the event the labels are not a `Sequence[ClassLabel]`, we will need to go through the dataset to get the
+# unique labels.
 def get_label_list(labels):
     unique_labels = set()
     for label in labels:
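Because this dataset stores `ner_tags` as a `Sequence` of `ClassLabel`, the label names can be read straight off the feature metadata, which is what the added `labels = ...` line does; `get_label_list` is only the fallback scan for datasets without `ClassLabel` metadata. A short sketch of the metadata path (the exact label names shown are illustrative):

from datasets import load_dataset

ds = load_dataset("nielsr/funsd-layoutlmv3")
labels = ds["test"].features["ner_tags"].feature.names
id2label = dict(enumerate(labels))                    # e.g. {0: 'O', 1: 'B-HEADER', ...}
label2id = {name: i for i, name in enumerate(labels)}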
@@ -100,24 +98,34 @@ else:
 label2id = {v: k for k,v in enumerate(label_list)}
 num_labels = len(label_list)
 
-label2color = {'question':'blue', 'answer':'green', 'header':'orange', 'other':'violet'}
+def get_label_list(labels):
+    unique_labels = set()
+    for label in labels:
+        unique_labels = unique_labels | set(label)
+    label_list = list(unique_labels)
+    label_list.sort()
+    return label_list
 
-def prepare_examples(examples):
-    images = examples[image_column_name]
-    words = examples[text_column_name]
-    boxes = examples[boxes_column_name]
-    word_labels = examples[label_column_name]
+label2color = {'question':'blue', 'answer':'green', 'header':'orange', 'other':'violet'}
 
-    encoding = processor(images, words, boxes=boxes, word_labels=word_labels,
-                         truncation=True, padding="max_length")
+def unnormalize_box(bbox, width, height):
+    return [
+        width * (bbox[0] / 1000),
+        height * (bbox[1] / 1000),
+        width * (bbox[2] / 1000),
+        height * (bbox[3] / 1000),
+    ]
 
-    return encoding
+#def prepare_examples(examples):
+#    images = examples[image_column_name]
+#    words = examples[text_column_name]
+#    boxes = examples[boxes_column_name]
+#    word_labels = examples[label_column_name]
 
-processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
+#    encoding = processor(images, words, boxes=boxes, word_labels=word_labels,
+#                         truncation=True, padding="max_length")
 
-model = LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-base",
-                                                         id2label=id2label,
-                                                         label2id=label2id)
+#    return encoding
 
 # we need to define custom features for `set_format` (used later on) to work properly
 features = Features({
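LayoutLM-family models take bounding boxes normalized to a 0-1000 grid regardless of the page's pixel size; `unnormalize_box` maps them back to pixel coordinates for drawing. A worked example of the arithmetic on a hypothetical 762x1000 page:

# Worked example of unnormalize_box on a 762x1000 image.
bbox = [100, 200, 300, 400]          # model-space coordinates (0-1000 grid)
width, height = 762, 1000
pixels = [width * (bbox[0] / 1000), height * (bbox[1] / 1000),
          width * (bbox[2] / 1000), height * (bbox[3] / 1000)]
print(pixels)                        # [76.2, 200.0, 228.6, 400.0]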
@@ -128,22 +136,44 @@ features = Features({
     'labels': Sequence(feature=Value(dtype='int64')),
 })
 
-eval_dataset = dataset["test"].map(
-    prepare_examples,
-    batched=True,
-    remove_columns=column_names,
-    features=features,
-)
-
-def unnormalize_box(bbox, width, height):
-    return [
-        width * (bbox[0] / 1000),
-        height * (bbox[1] / 1000),
-        width * (bbox[2] / 1000),
-        height * (bbox[3] / 1000),
-    ]
-
-def process_image(image):
+def process_image(image):
+    width, height = image.size
+
+    # encode
+    encoding = processor(image, truncation=True, return_offsets_mapping=True, return_tensors="pt")
+    offset_mapping = encoding.pop('offset_mapping')
+
+    # forward pass
+    outputs = model(**encoding)
+
+    # get predictions
+    predictions = outputs.logits.argmax(-1).squeeze().tolist()
+    token_boxes = encoding.bbox.squeeze().tolist()
+
+    # only keep non-subword predictions
+    is_subword = np.array(offset_mapping.squeeze().tolist())[:,0] != 0
+    true_predictions = [id2label[pred] for idx, pred in enumerate(predictions) if not is_subword[idx]]
+    true_boxes = [unnormalize_box(box, width, height) for idx, box in enumerate(token_boxes) if not is_subword[idx]]
+
+    # draw predictions over the image
+    draw = ImageDraw.Draw(image)
+    font = ImageFont.load_default()
+
+    def iob_to_label(label):
+        label = label[2:]
+        if not label:
+            return 'other'
+        return label
+
+    label2color = {'question':'blue', 'answer':'green', 'header':'orange', 'other':'violet'}
+    for prediction, box in zip(true_predictions, true_boxes):
+        predicted_label = iob_to_label(prediction) #.lower()
+        draw.rectangle(box, outline=label2color[predicted_label])
+        draw.text((box[0]+10, box[1]-10), text=predicted_label, fill=label2color[predicted_label], font=font)
+
+    return image
+
+#def process_image(image):
 
     print(type(image))
     width, height = image.size
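The subword filter relies on `return_offsets_mapping`: each token carries its character span within its source word, and a nonzero start offset marks a word-internal subword whose prediction is dropped. A small self-contained illustration of that test:

import numpy as np

# offsets for four tokens, where the third continues the second's word
offset_mapping = [[0, 5], [0, 3], [3, 6], [0, 4]]
is_subword = np.array(offset_mapping)[:, 0] != 0
print(is_subword)                    # [False False  True False]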
@@ -157,11 +187,6 @@ def process_image(image):
         print(k,v.shape)
 
     # encode
-    #encoding = processor(image, truncation=True, return_offsets_mapping=True, return_tensors="pt")
-    #offset_mapping = encoding.pop('offset_mapping')
-
-    #encoding = processor(image, words, truncation=True, boxes=boxes, word_labels=word_labels, return_offsets_mapping=True, return_tensors="pt")
-    #offset_mapping = encoding.pop('offset_mapping')
 
     encoding = processor(image, truncation=True, boxes=boxes, word_labels=word_labels, return_offsets_mapping=True, return_tensors="pt")
     offset_mapping = encoding.pop('offset_mapping')
@@ -185,11 +210,7 @@ def process_image(image):
     token_boxes = encoding.bbox.squeeze().tolist()
     width, height = image.size
 
-    #true_predictions = [model.config.id2label[pred] for pred, label in zip(predictions, labels) if label != -100]
-    #true_labels = [model.config.id2label[label] for prediction, label in zip(predictions, labels) if label != -100]
-    #true_boxes = [unnormalize_box(box, width, height) for box, label in zip(token_boxes, labels) if label != -100]
-
-
+
     # only keep non-subword predictions
     is_subword = np.array(offset_mapping.squeeze().tolist())[:,0] != 0
     true_predictions = [id2label[pred] for idx, pred in enumerate(predictions) if not is_subword[idx]]
@@ -205,7 +226,7 @@ def process_image(image):
 
     return image
 
-title = "DocumentAI - Extraction of Key Information using LayoutLMv3 model"
+title = "DocumentAI - Extraction using LayoutLMv3 model"
 description = "Extraction of Form or Invoice Extraction - We use Microsoft's LayoutLMv3 trained on Invoice Dataset to predict the Biller Name, Biller Address, Biller post_code, Due_date, GST, Invoice_date, Invoice_number, Subtotal and Total. To use it, simply upload an image or use the example image below. Results will show up in a few seconds."
 
 article="<b>References</b><br>[1] Y. Xu et al., “LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking.” 2022. <a href='https://arxiv.org/abs/2204.08387'>Paper Link</a><br>[2] <a href='https://github.com/NielsRogge/Transformers-Tutorials/tree/master/LayoutLMv3'>LayoutLMv3 training and inference</a>"
@@ -216,7 +237,7 @@ css = """.output_image, .input_image {height: 600px !important}"""
 
 iface = gr.Interface(fn=process_image,
                      inputs=gr.inputs.Image(type="pil"),
-                     outputs=gr.outputs.Image(type="pil", label="annotated predict image"),
+                     outputs=gr.outputs.Image(type="pil", label="annotated image"),
                      title=title,
                      description=description,
                      article=article,
@@ -225,4 +246,6 @@ iface = gr.Interface(fn=process_image,
                      analytics_enabled=True, enable_queue=True
 )
 
-iface.launch(inline=False, share=False, debug=False)
+#iface.launch(inline=False, share=False, debug=False)
+
+iface.launch(inline=False)