ivelin committed on
Commit
7b3f48a
·
1 Parent(s): 72d0321

Signed-off-by: ivelin <ivelin.eth@gmail.com>

Files changed (1) hide show
  1. app.py +6 -4
app.py CHANGED
@@ -4,6 +4,7 @@ from PIL import Image, ImageDraw
4
  import math
5
  import torch
6
  import html
 
7
  from transformers import DonutProcessor, VisionEncoderDecoderModel
8
 
9
  pretrained_repo_name = "ivelin/donut-refexp-draft"
@@ -55,6 +56,7 @@ def process_refexp(image: Image, prompt: str):
55
  print(
56
  fr"predicted decoder sequence before token2json: {html.escape(sequence)}")
57
  bbox = processor.token2json(sequence)
 
58
  print(f"predicted bounding box: {bbox}")
59
 
60
  print(f"image object: {image}")
@@ -63,10 +65,10 @@ def process_refexp(image: Image, prompt: str):
63
  print(f"image width, height: {width, height}")
64
  print(f"processed prompt: {prompt}")
65
 
66
- xmin = math.floor(width*float(bbox["xmin"])) if bbox.get("xmin") else 0
67
- ymin = math.floor(height*float(bbox["ymin"])) if bbox.get("ymin") else 0
68
- xmax = math.floor(width*float(bbox["xmax"])) if bbox.get("xmax") else 1
69
- ymax = math.floor(height*float(bbox["ymax"])) if bbox.get("ymax") else 1
70
 
71
  print(
72
  f"to image pixel values: xmin, ymin, xmax, ymax: {xmin, ymin, xmax, ymax}")
 
4
  import math
5
  import torch
6
  import html
7
+ import json
8
  from transformers import DonutProcessor, VisionEncoderDecoderModel
9
 
10
  pretrained_repo_name = "ivelin/donut-refexp-draft"
 
56
  print(
57
  fr"predicted decoder sequence before token2json: {html.escape(sequence)}")
58
  bbox = processor.token2json(sequence)
59
+ bbox = json.loads(bbox)
60
  print(f"predicted bounding box: {bbox}")
61
 
62
  print(f"image object: {image}")
 
65
  print(f"image width, height: {width, height}")
66
  print(f"processed prompt: {prompt}")
67
 
68
+ xmin = math.floor(width*bbox["xmin"]) if bbox.get("xmin") else 0
69
+ ymin = math.floor(height*bbox["ymin"]) if bbox.get("ymin") else 0
70
+ xmax = math.floor(width*bbox["xmax"]) if bbox.get("xmax") else 1
71
+ ymax = math.floor(height*bbox["ymax"]) if bbox.get("ymax") else 1
72
 
73
  print(
74
  f"to image pixel values: xmin, ymin, xmax, ymax: {xmin, ymin, xmax, ymax}")