ivelin committed on
Commit
e8e6698
1 Parent(s): 7b3f48a

Signed-off-by: ivelin <ivelin.eth@gmail.com>

Files changed (1) hide show
  1. app.py +9 -6
app.py CHANGED
@@ -4,7 +4,6 @@ from PIL import Image, ImageDraw
4
  import math
5
  import torch
6
  import html
7
- import json
8
  from transformers import DonutProcessor, VisionEncoderDecoderModel
9
 
10
  pretrained_repo_name = "ivelin/donut-refexp-draft"
@@ -56,7 +55,6 @@ def process_refexp(image: Image, prompt: str):
56
  print(
57
  fr"predicted decoder sequence before token2json: {html.escape(sequence)}")
58
  bbox = processor.token2json(sequence)
59
- bbox = json.loads(bbox)
60
  print(f"predicted bounding box: {bbox}")
61
 
62
  print(f"image object: {image}")
@@ -65,10 +63,15 @@ def process_refexp(image: Image, prompt: str):
65
  print(f"image width, height: {width, height}")
66
  print(f"processed prompt: {prompt}")
67
 
68
- xmin = math.floor(width*bbox["xmin"]) if bbox.get("xmin") else 0
69
- ymin = math.floor(height*bbox["ymin"]) if bbox.get("ymin") else 0
70
- xmax = math.floor(width*bbox["xmax"]) if bbox.get("xmax") else 1
71
- ymax = math.floor(height*bbox["ymax"]) if bbox.get("ymax") else 1
 
 
 
 
 
72
 
73
  print(
74
  f"to image pixel values: xmin, ymin, xmax, ymax: {xmin, ymin, xmax, ymax}")
 
4
  import math
5
  import torch
6
  import html
 
7
  from transformers import DonutProcessor, VisionEncoderDecoderModel
8
 
9
  pretrained_repo_name = "ivelin/donut-refexp-draft"
 
55
  print(
56
  fr"predicted decoder sequence before token2json: {html.escape(sequence)}")
57
  bbox = processor.token2json(sequence)
 
58
  print(f"predicted bounding box: {bbox}")
59
 
60
  print(f"image object: {image}")
 
63
  print(f"image width, height: {width, height}")
64
  print(f"processed prompt: {prompt}")
65
 
66
+ # safeguard in case text prediction is missing some bounding box coordinates
67
+ xmin = math.floor(width*float(bbox["xmin"])
68
+ ) if bbox.get("xmin") is not None else 0
69
+ ymin = math.floor(
70
+ height*float(bbox["ymin"])) if bbox.get("ymin") is not None else 0
71
+ xmax = math.floor(width*float(bbox["xmax"])
72
+ ) if bbox.get("xmax") is not None else 1
73
+ ymax = math.floor(
74
+ height*float(bbox["ymax"])) if bbox.get("ymax") is not None else 1
75
 
76
  print(
77
  f"to image pixel values: xmin, ymin, xmax, ymax: {xmin, ymin, xmax, ymax}")