DevBM committed on
Commit
2f651f8
1 Parent(s): 86ee8b4

Upload 32 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ imgs/windows_home.png filter=lfs diff=lfs merge=lfs -text
__pycache__/omniparser.cpython-312.pyc ADDED
Binary file (3.76 kB). View file
 
__pycache__/utils.cpython-312.pyc ADDED
Binary file (22.8 kB). View file
 
__pycache__/utils.cpython-39.pyc ADDED
Binary file (19.7 kB). View file
 
app.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional
2
+
3
+ import gradio as gr
4
+ import numpy as np
5
+ import torch
6
+ from PIL import Image
7
+ import io
8
+
9
+
10
+ import base64, os
11
+ from utils import check_ocr_box, get_yolo_model, get_caption_model_processor, get_som_labeled_img
12
+ import torch
13
+ from PIL import Image
14
+
15
# Both models are created once, at import time, through the helpers imported
# from `utils`; `process` reuses these module-level objects on every request.
yolo_model = get_yolo_model(
    model_path='weights/icon_detect/best.pt',
)
caption_model_processor = get_caption_model_processor(
    model_name="florence2",
    model_name_or_path="weights/icon_caption_florence",
)
17
# Kind of screenshot the demo is tuned for; it selects the drawing preset below.
platform = 'pc'

# Box-drawing preset handed to get_som_labeled_img as `draw_bbox_config`.
# 'web' and 'mobile' currently use the same values; any other platform value
# leaves `draw_bbox_config` undefined.
if platform == 'pc':
    draw_bbox_config = dict(
        text_scale=0.8,
        text_thickness=2,
        text_padding=2,
        thickness=2,
    )
elif platform in ('web', 'mobile'):
    draw_bbox_config = dict(
        text_scale=0.8,
        text_thickness=2,
        text_padding=3,
        thickness=3,
    )
39
+
40
+
41
+
42
+ MARKDOWN = """
43
+ # OmniParser for Pure Vision Based General GUI Agent 🔥
44
+ <div>
45
+ <a href="https://arxiv.org/pdf/2408.00203">
46
+ <img src="https://img.shields.io/badge/arXiv-2408.00203-b31b1b.svg" alt="Arxiv" style="display:inline-block;">
47
+ </a>
48
+ </div>
49
+
50
+ OmniParser is a screen parsing tool to convert general GUI screen to structured elements.
51
+ """
52
+
53
+ DEVICE = torch.device('cpu')
54
+
55
+ # @spaces.GPU
56
+ # @torch.inference_mode()
57
+ # @torch.autocast(device_type="cuda", dtype=torch.bfloat16)
58
def process(
    image_input,
    box_threshold,
    iou_threshold
) -> tuple[Image.Image, str]:
    """Parse an uploaded screenshot and return the annotated image and text.

    The image is written to a private temporary PNG (the helpers from `utils`
    take a file path), passed through `check_ocr_box` and
    `get_som_labeled_img`, and the temporary file is removed afterwards.

    Args:
        image_input: PIL image supplied by the Gradio image component, or
            None when nothing was uploaded.
        box_threshold: Value forwarded as `BOX_TRESHOLD` to
            `get_som_labeled_img`.
        iou_threshold: Value forwarded as `iou_threshold` to
            `get_som_labeled_img`.

    Returns:
        A `(image, text)` pair: the labelled screenshot decoded from the
        base64 payload returned by `get_som_labeled_img`, and the parsed
        content entries joined with newlines.

    Raises:
        gr.Error: If no image was uploaded.
    """
    import tempfile

    if image_input is None:
        # Surface a readable message in the UI instead of an AttributeError.
        raise gr.Error('Please upload an image before submitting.')

    # A per-request temporary file: a fixed path would be overwritten by
    # concurrent requests and would require the `imgs/` directory to exist.
    fd, image_save_path = tempfile.mkstemp(suffix='.png')
    os.close(fd)
    try:
        image_input.save(image_save_path)

        ocr_bbox_rslt, _ = check_ocr_box(
            image_save_path,
            display_img=False,
            output_bb_format='xyxy',
            goal_filtering=None,
            easyocr_args={'paragraph': False, 'text_threshold': 0.9},
        )
        text, ocr_bbox = ocr_bbox_rslt

        dino_labled_img, _, parsed_content_list = get_som_labeled_img(
            image_save_path,
            yolo_model,
            BOX_TRESHOLD=box_threshold,
            output_coord_in_ratio=True,
            ocr_bbox=ocr_bbox,
            draw_bbox_config=draw_bbox_config,
            caption_model_processor=caption_model_processor,
            ocr_text=text,
            iou_threshold=iou_threshold,
        )
    finally:
        try:
            os.remove(image_save_path)
        except OSError:
            # Best-effort cleanup; a leftover temp file must not fail the request.
            pass

    image = Image.open(io.BytesIO(base64.b64decode(dino_labled_img)))
    print('finish processing')
    return image, '\n'.join(parsed_content_list)
76
+
77
+
78
+
79
with gr.Blocks() as demo:
    gr.Markdown(MARKDOWN)
    with gr.Row():
        # Left column: the input image, the two thresholds and the submit button.
        with gr.Column():
            image_input_component = gr.Image(type='pil', label='Upload image')
            # Threshold for dropping low-confidence bounding boxes (default 0.05).
            box_threshold_component = gr.Slider(
                label='Box Threshold',
                minimum=0.01,
                maximum=1.0,
                step=0.01,
                value=0.05,
            )
            # Threshold for dropping heavily overlapping bounding boxes (default 0.1).
            iou_threshold_component = gr.Slider(
                label='IOU Threshold',
                minimum=0.01,
                maximum=1.0,
                step=0.01,
                value=0.1,
            )
            submit_button_component = gr.Button(value='Submit', variant='primary')
        # Right column: the annotated image and the parsed element text.
        with gr.Column():
            image_output_component = gr.Image(type='pil', label='Image Output')
            text_output_component = gr.Textbox(
                label='Parsed screen elements',
                placeholder='Text Output',
            )

    # Wire the button to `process`; its two return values fill the two outputs.
    submit_button_component.click(
        fn=process,
        inputs=[
            image_input_component,
            box_threshold_component,
            iou_threshold_component,
        ],
        outputs=[image_output_component, text_output_component],
    )

# Starts the server at import time on port 7861. Per Gradio's launch()
# parameters, this binds all interfaces and requests a public share link.
demo.launch(share=True, server_port=7861, server_name='0.0.0.0')
img.png ADDED
imgs/google_page.png ADDED
imgs/logo.png ADDED
imgs/saved_image_demo.png ADDED
imgs/windows_home.png ADDED

Git LFS Details

  • SHA256: 036008abc32379393876e722fedab2bd02bda9b667b957bc150c2f83c725ebac
  • Pointer size: 132 Bytes
  • Size of remote file: 6.1 MB
imgs/windows_multitab.png ADDED
omniparser.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from utils import get_som_labeled_img, check_ocr_box, get_caption_model_processor, get_dino_model, get_yolo_model
2
+ import torch
3
+ from ultralytics import YOLO
4
+ from PIL import Image
5
+ from typing import Dict, Tuple, List
6
+ import io
7
+ import base64
8
+
9
+
10
# Default settings for `Omniparser`. Only 'som_model_path', 'draw_bbox_config'
# and 'BOX_TRESHOLD' are read by the class; 'device' is used by the timing
# script at the bottom of this module.
config = dict(
    som_model_path='finetuned_icon_detect.pt',
    device='cpu',
    caption_model_path='Salesforce/blip2-opt-2.7b',
    draw_bbox_config=dict(
        text_scale=0.8,
        text_thickness=2,
        text_padding=3,
        thickness=3,
    ),
    BOX_TRESHOLD=0.05,
)
22
+
23
+
24
class Omniparser(object):
    """Thin wrapper that runs OCR plus icon detection on a screenshot.

    Only the detection model is loaded; no caption model is created, and
    `parse` calls `get_som_labeled_img` with `caption_model_processor=None`
    and `use_local_semantics=False`.
    """

    def __init__(self, config: Dict):
        """Store `config` and load the detection model.

        Args:
            config: Settings dict; must provide 'som_model_path' here and
                'draw_bbox_config' / 'BOX_TRESHOLD' for `parse`.
        """
        self.config = config
        self.som_model = get_yolo_model(model_path=config['som_model_path'])

    def parse(self, image_path: str):
        """Parse the image at `image_path`.

        Returns:
            A two-item list `[image, elements]`: the labelled image decoded
            from the base64 payload returned by `get_som_labeled_img`, and a
            list of element dicts built by `_format_elements`.
        """
        print('Parsing image:', image_path)
        ocr_bbox_rslt, _ = check_ocr_box(
            image_path,
            display_img=False,
            output_bb_format='xyxy',
            goal_filtering=None,
            easyocr_args={'paragraph': False, 'text_threshold': 0.9},
        )
        text, ocr_bbox = ocr_bbox_rslt

        dino_labled_img, label_coordinates, parsed_content_list = get_som_labeled_img(
            image_path,
            self.som_model,
            BOX_TRESHOLD=self.config['BOX_TRESHOLD'],
            output_coord_in_ratio=False,
            ocr_bbox=ocr_bbox,
            draw_bbox_config=self.config['draw_bbox_config'],
            caption_model_processor=None,
            ocr_text=text,
            use_local_semantics=False,
        )

        image = Image.open(io.BytesIO(base64.b64decode(dino_labled_img)))
        return [image, self._format_elements(label_coordinates, parsed_content_list)]

    @staticmethod
    def _format_elements(label_coordinates, parsed_content_list):
        """Convert detector output into the list of element dicts.

        The first `len(parsed_content_list)` coordinates are emitted as
        'text' elements and the remaining ones as 'icon' elements with the
        text 'None'.

        Args:
            label_coordinates: Mapping whose values are 4-item coordinate
                sequences, exposed as x, y, width and height.
            parsed_content_list: Strings of the form '<label>: <text>'.

        Returns:
            List of dicts with the keys 'from', 'shape', 'text' and 'type'.
        """
        text_count = len(parsed_content_list)
        elements = []
        for index, coord in enumerate(label_coordinates.values()):
            if index < text_count:
                # Split on the first separator only, so text that itself
                # contains ': ' is kept whole; an entry without a separator
                # is kept as-is instead of raising IndexError.
                content = parsed_content_list[index].split(': ', 1)[-1]
                kind = 'text'
            else:
                content = 'None'
                kind = 'icon'
            elements.append({
                'from': 'omniparser',
                'shape': {
                    'x': coord[0],
                    'y': coord[1],
                    'width': coord[2],
                    'height': coord[3],
                },
                'text': content,
                'type': kind,
            })
        return elements
51
+
52
# Timing run executed at import time: build a parser from the module-level
# `config`, parse one example image and print the elapsed wall-clock seconds.
import time

parser = Omniparser(config)
image_path = 'examples/pc_1.png'

s = time.time()  # start of the timed section
image, parsed_content_list = parser.parse(image_path)
device = config['device']
print(f'Time taken for Omniparser on {device}:', time.time() - s)
requirements.txt ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ torch
2
+ easyocr
3
+ torchvision
4
+ supervision==0.18.0
5
+ openai==1.3.5
6
+ transformers
7
+ ultralytics==8.1.24
8
+ azure-identity
9
+ numpy
10
+ opencv-python
11
+ opencv-python-headless
12
+ gradio
13
+ dill
14
+ accelerate
15
+ timm
16
+ einops==0.8.0
util/__init__.py ADDED
File without changes
util/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (139 Bytes). View file
 
util/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (141 Bytes). View file
 
util/__pycache__/action_matching.cpython-39.pyc ADDED
Binary file (8.49 kB). View file
 
util/__pycache__/box_annotator.cpython-312.pyc ADDED
Binary file (9.79 kB). View file
 
util/__pycache__/box_annotator.cpython-39.pyc ADDED
Binary file (6.57 kB). View file
 
util/action_matching.py ADDED
@@ -0,0 +1,425 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ '''
2
+ Adapted from https://github.com/google-research/google-research/tree/master/android_in_the_wild
3
+ '''
4
+
5
+ import jax
6
+ import jax.numpy as jnp
7
+ import numpy as np
8
+
9
+ # import action_type as action_type_lib
10
+ import enum
11
+
12
class ActionType(enum.IntEnum):
    """Integer codes for agent actions and episode-status markers."""

    # Reserved values that do not correspond to any action.
    UNUSED_0 = 0
    UNUSED_1 = 1
    UNUSED_2 = 2
    UNUSED_8 = 8
    UNUSED_9 = 9

    # ---- Agent actions ----

    # Sends text to the emulator. It only sends the text: no click to focus
    # an element and no enter press to submit.
    TYPE = 3

    # Dual-point action; every gesture is represented with this type.
    DUAL_POINT = 4

    # Explicit presses of the back and home buttons performed through ADB,
    # kept separate from touches.
    PRESS_BACK = 5
    PRESS_HOME = 6

    # The ADB command for pressing enter was performed.
    PRESS_ENTER = 7

    # ---- Episode status actions ----

    # The desired task has been completed; resets the environment. Also used
    # when the task was already complete and nothing needs doing, e.g. the
    # task is to turn on Wi-Fi and it is already on.
    STATUS_TASK_COMPLETE = 10

    # The desired task is impossible to complete; resets the environment.
    # Possible causes include UI changes and Android version differences.
    STATUS_TASK_IMPOSSIBLE = 11
50
+
51
+
52
# Default distance (as a fraction of the screen) within which two taps are
# treated as matching when they do not share an annotation box.
_TAP_DISTANCE_THRESHOLD = 0.14

# Default fractions by which annotation boxes are grown before hit-testing.
ANNOTATION_WIDTH_AUGMENT_FRACTION = 1.4
ANNOTATION_HEIGHT_AUGMENT_FRACTION = 1.4

# A dual-point action whose touch-to-lift distance is at most this value is a
# tap; a longer one is a swipe.
_SWIPE_DISTANCE_THRESHOLD = 0.04
58
+
59
+
60
def _yx_in_bounding_boxes(
    yx, bounding_boxes
):
    """Test a single (y, x) point against every bounding box.

    Args:
      yx: The (y, x) coordinate of the point.
      bounding_boxes: A 2D array of shape (num_bboxes, 4) in the same units as
        `yx`. Each row is (y_top_left, x_top_left, box_height, box_width).

    Returns:
      A 1D bool array with one entry per box, True where the point lies inside
      that box. Points exactly on a box edge count as inside.
    """
    y, x = yx

    # Cut the (num_bboxes, 4) array into four (num_bboxes, 1) columns, then
    # drop the trailing axis of each column.
    columns = jnp.split(bounding_boxes, 4, axis=-1)
    top = jnp.squeeze(columns[0], axis=-1)
    left = jnp.squeeze(columns[1], axis=-1)
    height = jnp.squeeze(columns[2], axis=-1)
    width = jnp.squeeze(columns[3], axis=-1)

    # y grows downwards, so the bottom edge is top + height.
    bottom = top + height
    right = left + width

    inside_vertically = jnp.logical_and(y >= top, y <= bottom)
    inside_horizontally = jnp.logical_and(x >= left, x <= right)
    return inside_vertically & inside_horizontally
88
+
89
+
90
def _resize_annotation_bounding_boxes(
    annotation_positions, annotation_width_augment_fraction,
    annotation_height_augment_fraction):
    """Grow each bounding box around its centre by the given fractions.

    A fraction f adds f * size to a side, so 1.4 makes that side 2.4 times its
    original length before clamping.

    Args:
      annotation_positions: Array of shape (N, 4); each row is the
        (y, x, height, width) of a bounding box.
      annotation_width_augment_fraction: Fraction of the width to add.
      annotation_height_augment_fraction: Fraction of the height to add.

    Returns:
      Array of shape (N, 4) holding the resized (y, x, height, width). The
      top-left corner is clamped to be at least 0, and the height and width
      are clamped to be at most 1.
    """
    tops = annotation_positions[:, 0]
    lefts = annotation_positions[:, 1]
    heights = annotation_positions[:, 2]
    widths = annotation_positions[:, 3]

    extra_height = annotation_height_augment_fraction * heights
    extra_width = annotation_width_augment_fraction * widths

    # Shift the corner by half of the growth so the box stays centred, and
    # keep the result on the screen.
    new_tops = jnp.maximum(0, tops - (extra_height / 2))
    new_lefts = jnp.maximum(0, lefts - (extra_width / 2))
    new_heights = jnp.minimum(1, heights + extra_height)
    new_widths = jnp.minimum(1, widths + extra_width)

    return jnp.stack([new_tops, new_lefts, new_heights, new_widths], axis=1)
121
+
122
+
123
def is_tap_action(normalized_start_yx,
                  normalized_end_yx):
    """Return whether a dual-point gesture is short enough to be a tap.

    The gesture is a tap when the Euclidean distance between its start and end
    points is at most `_SWIPE_DISTANCE_THRESHOLD`.
    """
    start = jnp.array(normalized_start_yx)
    end = jnp.array(normalized_end_yx)
    travelled = jnp.linalg.norm(start - end)
    return travelled <= _SWIPE_DISTANCE_THRESHOLD
128
+
129
+
130
def _is_non_dual_point_action(action_type):
    """Return whether `action_type` is anything other than DUAL_POINT."""
    dual_point = ActionType.DUAL_POINT
    return jnp.not_equal(action_type, dual_point)
132
+
133
+
134
def _check_tap_actions_match(
    tap_1_yx,
    tap_2_yx,
    annotation_positions,
    matching_tap_distance_threshold_screen_percentage,
    annotation_width_augment_fraction,
    annotation_height_augment_fraction,
):
    """Determine whether two tap actions are the same.

    Two taps match when both fall inside the same (enlarged) annotation box,
    or when the Euclidean distance between them is within the threshold.

    Args:
      tap_1_yx: The (y, x) coordinates of the first tap.
      tap_2_yx: The (y, x) coordinates of the second tap.
      annotation_positions: Array of shape (num_bboxes, 4); each row is
        (y_top_left, x_top_left, box_height, box_width). May have zero rows.
      matching_tap_distance_threshold_screen_percentage: Maximum distance
        between the taps for them to match without a shared box.
      annotation_width_augment_fraction: Fraction to grow the box widths by.
      annotation_height_augment_fraction: Fraction to grow the box heights by.

    Returns:
      A boolean scalar array, True when the taps match.
    """
    resized_annotation_positions = _resize_annotation_bounding_boxes(
        annotation_positions,
        annotation_width_augment_fraction,
        annotation_height_augment_fraction,
    )

    # Per-box containment flags for each tap.
    tap1_in_box = _yx_in_bounding_boxes(tap_1_yx, resized_annotation_positions)
    tap2_in_box = _yx_in_bounding_boxes(tap_2_yx, resized_annotation_positions)
    # `any` equals `max` on a non-empty bool array, and is also defined for a
    # screen with no annotation boxes (False), where `max` would raise.
    both_in_box = jnp.any(tap1_in_box & tap2_in_box)

    # When the taps do not share a box (neither is in a box, or only one is),
    # fall back to comparing the points by Euclidean distance.
    within_threshold = (
        jnp.linalg.norm(jnp.array(tap_1_yx) - jnp.array(tap_2_yx))
        <= matching_tap_distance_threshold_screen_percentage
    )
    return jnp.logical_or(both_in_box, within_threshold)
163
+
164
+
165
def _check_drag_actions_match(
    drag_1_touch_yx,
    drag_1_lift_yx,
    drag_2_touch_yx,
    drag_2_lift_yx,
):
    """Determine whether two drag actions move along the same main axis.

    The main axis of a drag is the coordinate index with the largest absolute
    change from touch to lift. For example, a drag from (0, 0) to (0.3, 0.5)
    has main axis 1. Direction along the axis is not compared.
    """

    def _main_axis(touch_yx, lift_yx):
        # Index of the coordinate that changed the most between touch and lift.
        deltas = lift_yx - touch_yx
        return np.argmax(jnp.abs(deltas))

    drag_1_main_axis = _main_axis(drag_1_touch_yx, drag_1_lift_yx)
    drag_2_main_axis = _main_axis(drag_2_touch_yx, drag_2_lift_yx)
    return jnp.equal(drag_1_main_axis, drag_2_main_axis)
184
+
185
+
186
def check_actions_match(
    action_1_touch_yx,
    action_1_lift_yx,
    action_1_action_type,
    action_2_touch_yx,
    action_2_lift_yx,
    action_2_action_type,
    annotation_positions,
    tap_distance_threshold = _TAP_DISTANCE_THRESHOLD,
    annotation_width_augment_fraction = ANNOTATION_WIDTH_AUGMENT_FRACTION,
    annotation_height_augment_fraction = ANNOTATION_HEIGHT_AUGMENT_FRACTION,
):
    """Decide whether two actions count as the same action.

    "The same" means the two actions would lead to a similar screen state.

    Args:
      action_1_touch_yx: The (y, x) coordinates of the first action's touch.
      action_1_lift_yx: The (y, x) coordinates of the first action's lift.
      action_1_action_type: The action type of the first action.
      action_2_touch_yx: The (y, x) coordinates of the second action's touch.
      action_2_lift_yx: The (y, x) coordinates of the second action's lift.
      action_2_action_type: The action type of the second action.
      annotation_positions: Positions of the UI annotations for the screen: a
        2D array of shape (num_bboxes, 4) whose rows are (y_top_left,
        x_top_left, box_height, box_width). Containment includes the box
        edges.
      tap_distance_threshold: Maximum distance at which two taps still match
        when they do not fall in the same bounding box.
      annotation_width_augment_fraction: Fraction to increase the width of the
        bounding boxes by.
      annotation_height_augment_fraction: Fraction to increase the height of
        the bounding boxes by.

    Returns:
      A boolean scalar array, True when the two actions are the same.
    """
    touch_1 = jnp.asarray(action_1_touch_yx)
    lift_1 = jnp.asarray(action_1_lift_yx)
    touch_2 = jnp.asarray(action_2_touch_yx)
    lift_2 = jnp.asarray(action_2_lift_yx)
    action_1_touch_yx, action_1_lift_yx = touch_1, lift_1
    action_2_touch_yx, action_2_lift_yx = touch_2, lift_2

    # If either action is not DUAL_POINT, only the action types are compared.
    either_is_global = jnp.logical_or(
        _is_non_dual_point_action(action_1_action_type),
        _is_non_dual_point_action(action_2_action_type),
    )

    # Classify each dual-point gesture as tap or swipe.
    action_1_is_tap = is_tap_action(touch_1, lift_1)
    action_2_is_tap = is_tap_action(touch_2, lift_2)
    exactly_one_is_tap = jnp.logical_xor(action_1_is_tap, action_2_is_tap)
    both_are_taps = jnp.logical_and(action_1_is_tap, action_2_is_tap)

    # The tap comparison only counts when both gestures are taps.
    taps_match = jnp.logical_and(
        both_are_taps,
        _check_tap_actions_match(
            touch_1,
            touch_2,
            annotation_positions,
            tap_distance_threshold,
            annotation_width_augment_fraction,
            annotation_height_augment_fraction,
        ),
    )

    # The drag comparison is discarded when both gestures are taps.
    drags_match = jnp.where(
        both_are_taps,
        False,
        _check_drag_actions_match(touch_1, lift_1, touch_2, lift_2),
    )

    # A tap never matches a swipe.
    gestures_match = jnp.where(
        exactly_one_is_tap,
        False,
        jnp.logical_or(taps_match, drags_match),
    )

    return jnp.where(
        either_is_global,
        jnp.equal(action_1_action_type, action_2_action_type),
        gestures_match,
    )
277
+
278
+
279
def action_2_format(step_data):
    """Convert a test-set step into the format used to compute matching scores.

    Args:
        step_data: Mapping with "action_type_id". Dual-point steps (id 4) also
            need "action_type_text" ('click' or 'scroll down/up/left/right');
            clicks need "touch" and "lift"; typing steps (id 3) need
            "type_text".

    Returns:
        Dict with "action_type", "touch_point", "lift_point" and "typed_text".
        The two points have their coordinates swapped relative to the input,
        and the typed text is lower-cased. Actions that are not dual-point
        get [-1.0, -1.0] for both points.

    Raises:
        ValueError: If a dual-point step has an unrecognised
            "action_type_text".
    """
    # Fixed touch/lift points that stand in for each scroll direction.
    scroll_gestures = {
        'scroll down': ([0.5, 0.8], [0.5, 0.2]),
        'scroll up': ([0.5, 0.2], [0.5, 0.8]),
        'scroll left': ([0.2, 0.5], [0.8, 0.5]),
        'scroll right': ([0.8, 0.5], [0.2, 0.5]),
    }

    action_type = step_data["action_type_id"]

    if action_type == 4:
        gesture = step_data["action_type_text"]
        if gesture == 'click':
            touch_point = step_data["touch"]
            lift_point = step_data["lift"]
        elif gesture in scroll_gestures:
            touch_point, lift_point = scroll_gestures[gesture]
        else:
            # Previously this fell through with both points unbound and
            # failed later with an UnboundLocalError.
            raise ValueError(
                f"Unsupported action_type_text for a dual-point action: {gesture!r}")
    else:
        touch_point = [-1.0, -1.0]
        lift_point = [-1.0, -1.0]

    typed_text = step_data["type_text"] if action_type == 3 else ""

    return {
        "action_type": action_type,
        # Swap the two coordinates of each point.
        "touch_point": [touch_point[1], touch_point[0]],
        "lift_point": [lift_point[1], lift_point[0]],
        "typed_text": typed_text.lower(),
    }
317
+
318
+
319
def pred_2_format(step_data):
    """Convert model output into the format used for action matching.

    Action type 4 is a click at "click_point". Types 0, 1, 8 and 9 are mapped
    to type 4 with fixed touch/lift points. Every other type is kept, with
    both points set to [-1.0, -1.0], and type 3 also carries "typed_text".
    The returned points have their coordinates swapped and the typed text is
    lower-cased.
    """
    # (predicted code, touch point, lift point) for the fixed gestures.
    fixed_gestures = (
        (0, [0.5, 0.8], [0.5, 0.2]),
        (1, [0.5, 0.2], [0.5, 0.8]),
        (8, [0.2, 0.5], [0.8, 0.5]),
        (9, [0.8, 0.5], [0.2, 0.5]),
    )

    action_type = step_data["action_type"]
    typed_text = ""

    if action_type == 4:  # click
        action_type_new = 4
        touch_point = step_data["click_point"]
        lift_point = step_data["click_point"]
    else:
        for code, start, end in fixed_gestures:
            if action_type == code:
                action_type_new = 4
                touch_point, lift_point = start, end
                break
        else:
            # Not a gesture: keep the type and use placeholder points.
            action_type_new = action_type
            touch_point = [-1.0, -1.0]
            lift_point = [-1.0, -1.0]
            if action_type_new == 3:
                typed_text = step_data["typed_text"]

    return {
        "action_type": action_type_new,
        "touch_point": [touch_point[1], touch_point[0]],
        "lift_point": [lift_point[1], lift_point[0]],
        "typed_text": typed_text.lower(),
    }
364
+
365
+
366
def pred_2_format_simplified(step_data):
    """Convert model output with string action names into the matching format.

    'click' and 'scroll' (with "direction" down/up/left/right) become action
    type 4, 'type' becomes 3, 'navigate_back' becomes 5 and 'navigate_home'
    becomes 6. Anything else, including a scroll with an unknown direction,
    keeps its original action type with both points set to [-1.0, -1.0]. The
    returned points have their coordinates swapped and the typed text is
    lower-cased.
    """
    # (direction, touch point, lift point) for each supported scroll.
    scroll_directions = (
        ('down', [0.5, 0.8], [0.5, 0.2]),
        ('up', [0.5, 0.2], [0.5, 0.8]),
        ('left', [0.2, 0.5], [0.8, 0.5]),
        ('right', [0.8, 0.5], [0.2, 0.5]),
    )

    action_type = step_data["action_type"]

    # Defaults for every action that is neither a click nor a known scroll.
    action_type_new = action_type
    touch_point = [-1.0, -1.0]
    lift_point = [-1.0, -1.0]
    typed_text = ""

    if action_type == 'click':
        action_type_new = 4
        touch_point = step_data["click_point"]
        lift_point = step_data["click_point"]
    else:
        scroll_points = None
        if action_type == 'scroll':
            direction = step_data["direction"]
            for name, start, end in scroll_directions:
                if direction == name:
                    scroll_points = (start, end)
                    break

        if scroll_points is not None:
            action_type_new = 4
            touch_point, lift_point = scroll_points
        elif action_type == 'type':
            action_type_new = 3
            typed_text = step_data["text"]
        elif action_type == 'navigate_back':
            action_type_new = 5
        elif action_type == 'navigate_home':
            action_type_new = 6

    return {
        "action_type": action_type_new,
        "touch_point": [touch_point[1], touch_point[0]],
        "lift_point": [lift_point[1], lift_point[0]],
        "typed_text": typed_text.lower(),
    }
util/action_type.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ '''
2
+ Adapted from https://github.com/google-research/google-research/tree/master/android_in_the_wild
3
+ '''
4
+
5
+ import enum
6
+
7
class ActionType(enum.IntEnum):
    """Integer codes for agent actions and episode-status markers."""

    # Reserved values that do not correspond to any action.
    UNUSED_0 = 0
    UNUSED_1 = 1
    UNUSED_2 = 2
    UNUSED_8 = 8
    UNUSED_9 = 9

    # ---- Agent actions ----

    # Sends text to the emulator. It only sends the text: no click to focus
    # an element and no enter press to submit.
    TYPE = 3

    # Dual-point action; every gesture is represented with this type.
    DUAL_POINT = 4

    # Explicit presses of the back and home buttons performed through ADB,
    # kept separate from touches.
    PRESS_BACK = 5
    PRESS_HOME = 6

    # The ADB command for pressing enter was performed.
    PRESS_ENTER = 7

    # ---- Episode status actions ----

    # The desired task has been completed; resets the environment. Also used
    # when the task was already complete and nothing needs doing, e.g. the
    # task is to turn on Wi-Fi and it is already on.
    STATUS_TASK_COMPLETE = 10

    # The desired task is impossible to complete; resets the environment.
    # Possible causes include UI changes and Android version differences.
    STATUS_TASK_IMPOSSIBLE = 11
util/box_annotator.py ADDED
@@ -0,0 +1,262 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Optional, Union, Tuple
2
+
3
+ import cv2
4
+ import numpy as np
5
+
6
+ from supervision.detection.core import Detections
7
+ from supervision.draw.color import Color, ColorPalette
8
+
9
+
10
class BoxAnnotator:
    """
    Draws bounding boxes, and optionally a text label per box, on an image.

    Attributes:
        color (Union[Color, ColorPalette]): Colour of the boxes; either one
            colour or a palette indexed per detection. Default is
            ColorPalette.DEFAULT.
        thickness (int): Line thickness of the boxes, default is 3.
        text_color (Color): Stored, default is black. `annotate` does not read
            it; the label text is drawn black or white depending on the
            luminance of the box colour.
        text_scale (float): Font scale of the label text, default is 0.5.
        text_thickness (int): Stroke thickness of the label text, default is 2.
        text_padding (int): Padding around the label text, default is 10.
        avoid_overlap (bool): When True (the default), label placement is
            delegated to `get_optimal_label_pos`; otherwise the label sits
            directly above the top-left corner of the box.
    """

    def __init__(
        self,
        color: Union[Color, ColorPalette] = ColorPalette.DEFAULT,
        thickness: int = 3,
        text_color: Color = Color.BLACK,
        text_scale: float = 0.5,
        text_thickness: int = 2,
        text_padding: int = 10,
        avoid_overlap: bool = True,
    ):
        self.color: Union[Color, ColorPalette] = color
        self.thickness: int = thickness
        self.text_color: Color = text_color
        self.text_scale: float = text_scale
        self.text_thickness: int = text_thickness
        self.text_padding: int = text_padding
        self.avoid_overlap: bool = avoid_overlap

    def annotate(
        self,
        scene: np.ndarray,
        detections: Detections,
        labels: Optional[List[str]] = None,
        skip_label: bool = False,
        image_size: Optional[Tuple[int, int]] = None,
    ) -> np.ndarray:
        """
        Draw the detections' bounding boxes on `scene`, in place.

        Args:
            scene (np.ndarray): Image to draw on; it is modified and returned.
            detections (Detections): Detections whose boxes are drawn.
            labels (Optional[List[str]]): One label per detection. When it is
                None or its length differs from the number of detections, the
                detection's `class_id` is used as the label text instead.
            skip_label (bool): If True, only the boxes are drawn.
            image_size (Optional[Tuple[int, int]]): Forwarded unchanged to
                `get_optimal_label_pos` when `avoid_overlap` is True.
                NOTE(review): presumably required in that mode; confirm
                against `get_optimal_label_pos`.

        Returns:
            np.ndarray: The same `scene` array with the boxes drawn on it.

        Example:
            ```python
            box_annotator = BoxAnnotator()
            annotated = box_annotator.annotate(
                scene=image.copy(),
                detections=detections,
                labels=labels,
                image_size=image_size,
            )
            ```
        """
        font = cv2.FONT_HERSHEY_SIMPLEX
        pad = self.text_padding

        for i in range(len(detections)):
            x1, y1, x2, y2 = detections.xyxy[i].astype(int)

            # Palette colours are picked by class id, or by position when the
            # detections carry no class ids.
            class_id = None if detections.class_id is None else detections.class_id[i]
            idx = i if class_id is None else class_id
            if isinstance(self.color, ColorPalette):
                color = self.color.by_idx(idx)
            else:
                color = self.color

            cv2.rectangle(
                img=scene,
                pt1=(x1, y1),
                pt2=(x2, y2),
                color=color.as_bgr(),
                thickness=self.thickness,
            )
            if skip_label:
                continue

            labels_usable = labels is not None and len(detections) == len(labels)
            text = labels[i] if labels_usable else f"{class_id}"

            text_width, text_height = cv2.getTextSize(
                text=text,
                fontFace=font,
                fontScale=self.text_scale,
                thickness=self.text_thickness,
            )[0]

            if self.avoid_overlap:
                (
                    text_x,
                    text_y,
                    text_background_x1,
                    text_background_y1,
                    text_background_x2,
                    text_background_y2,
                ) = get_optimal_label_pos(
                    pad, text_width, text_height, x1, y1, x2, y2, detections, image_size)
            else:
                # Label sits on top of the box, aligned with its left edge.
                text_x = x1 + pad
                text_y = y1 - pad

                text_background_x1 = x1
                text_background_y1 = y1 - 2 * pad - text_height

                text_background_x2 = x1 + 2 * pad + text_width
                text_background_y2 = y1

            # Filled label background in the box colour.
            cv2.rectangle(
                img=scene,
                pt1=(text_background_x1, text_background_y1),
                pt2=(text_background_x2, text_background_y2),
                color=color.as_bgr(),
                thickness=cv2.FILLED,
            )

            # Black text on light backgrounds, white text on dark ones.
            red, green, blue = color.as_rgb()[0], color.as_rgb()[1], color.as_rgb()[2]
            luminance = 0.299 * red + 0.587 * green + 0.114 * blue
            text_color = (0, 0, 0) if luminance > 160 else (255, 255, 255)

            cv2.putText(
                img=scene,
                text=text,
                org=(text_x, text_y),
                fontFace=font,
                fontScale=self.text_scale,
                color=text_color,
                thickness=self.text_thickness,
                lineType=cv2.LINE_AA,
            )
        return scene
163
+
164
+
165
def box_area(box):
    """Return the area of a box given as (x1, y1, x2, y2)."""
    width = box[2] - box[0]
    height = box[3] - box[1]
    return width * height
167
+
168
def intersection_area(box1, box2):
    """Return the overlap area of two (x1, y1, x2, y2) boxes, or 0 if they are disjoint."""
    left, top = max(box1[0], box2[0]), max(box1[1], box2[1])
    right, bottom = min(box1[2], box2[2]), min(box1[3], box2[3])
    # Clamp each extent at zero so disjoint boxes never yield a negative size.
    overlap_w = max(0, right - left)
    overlap_h = max(0, bottom - top)
    return overlap_w * overlap_h
174
+
175
def IoU(box1, box2, return_max=True):
    """Score how strongly two (x1, y1, x2, y2) boxes overlap.

    Parameters:
        box1, box2: Boxes as indexable (x1, y1, x2, y2) sequences.
        return_max (bool): When True, return the largest of the plain IoU and
            the two "intersection over own area" ratios, so a small box fully
            contained in a large one scores 1.0. When False, return plain IoU.

    Returns:
        The overlap score. A zero union (both boxes have zero area) yields an
        IoU of 0.0 instead of dividing by zero.
    """
    intersection = intersection_area(box1, box2)
    area1, area2 = box_area(box1), box_area(box2)
    union = area1 + area2 - intersection
    # Two degenerate boxes make the union zero; treat that as "no overlap".
    iou = intersection / union if union != 0 else 0.0
    if not return_max:
        return iou
    if area1 > 0 and area2 > 0:
        ratio1 = intersection / area1
        ratio2 = intersection / area2
    else:
        ratio1, ratio2 = 0, 0
    return max(iou, ratio1, ratio2)
187
+
188
+
189
def get_optimal_label_pos(text_padding, text_width, text_height, x1, y1, x2, y2, detections, image_size):
    """Choose where to draw a box label so it avoids other detections.

    Candidate placements are tried in this order: top left, outer left,
    outer right, top right. The first candidate whose background rectangle
    neither scores above 0.3 against any detection (see ``IoU``) nor sticks
    out of the image is returned. If every candidate is rejected, the last
    one tried (top right) is returned.

    Parameters:
        text_padding, text_width, text_height: Label padding and text size.
        x1, y1, x2, y2: Corners of the box being labelled.
        detections: Object exposing ``len()`` and an ``xyxy`` array of boxes.
        image_size: (width, height) of the image.

    Returns:
        Tuple (text_x, text_y, background_x1, background_y1,
        background_x2, background_y2).
    """

    def is_rejected(bg_x1, bg_y1, bg_x2, bg_y2):
        label_box = [bg_x1, bg_y1, bg_x2, bg_y2]
        hits_detection = any(
            IoU(label_box, detections.xyxy[k].astype(int)) > 0.3
            for k in range(len(detections))
        )
        # A label that leaves the image is rejected just like an overlapping one.
        leaves_image = bg_x1 < 0 or bg_x2 > image_size[0] or bg_y1 < 0 or bg_y2 > image_size[1]
        return hits_detection or leaves_image

    # Each entry: (text_x, text_y, background_x1, background_y1, background_x2, background_y2)
    candidates = (
        # top left
        (x1 + text_padding,
         y1 - text_padding,
         x1,
         y1 - 2 * text_padding - text_height,
         x1 + 2 * text_padding + text_width,
         y1),
        # outer left
        (x1 - text_padding - text_width,
         y1 + text_padding + text_height,
         x1 - 2 * text_padding - text_width,
         y1,
         x1,
         y1 + 2 * text_padding + text_height),
        # outer right
        (x2 + text_padding,
         y1 + text_padding + text_height,
         x2,
         y1,
         x2 + 2 * text_padding + text_width,
         y1 + 2 * text_padding + text_height),
        # top right
        (x2 - text_padding - text_width,
         y1 - text_padding,
         x2 - 2 * text_padding - text_width,
         y1 - 2 * text_padding - text_height,
         x2,
         y1),
    )

    for placement in candidates:
        if not is_rejected(*placement[2:]):
            return placement
    # Nothing fits cleanly: fall back to the last candidate (top right).
    return candidates[-1]
utils.py ADDED
@@ -0,0 +1,402 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # from ultralytics import YOLO
2
+ import os
3
+ import io
4
+ import base64
5
+ import time
6
+ from PIL import Image, ImageDraw, ImageFont
7
+ import json
8
+ import requests
9
+ # utility function
10
+ import os
11
+ from openai import AzureOpenAI
12
+
13
+ import json
14
+ import sys
15
+ import os
16
+ import cv2
17
+ import numpy as np
18
+ # %matplotlib inline
19
+ from matplotlib import pyplot as plt
20
+ import easyocr
21
+ reader = easyocr.Reader(['en'])
22
+ import time
23
+ import base64
24
+
25
+ import os
26
+ import ast
27
+ import torch
28
+ from typing import Tuple, List
29
+ from torchvision.ops import box_convert
30
+ import re
31
+ from torchvision.transforms import ToPILImage
32
+ import supervision as sv
33
+ import torchvision.transforms as T
34
+
35
+
36
def get_caption_model_processor(model_name, model_name_or_path="Salesforce/blip2-opt-2.7b", device=None):
    """Load an icon-captioning model together with its processor.

    Parameters:
        model_name (str): Either "blip2" or "florence2".
        model_name_or_path (str): Checkpoint id or local path the model weights
            are loaded from. The processor is always loaded from the fixed base
            checkpoint of the chosen family.
        device: Target device. When falsy, "cuda" is used if available,
            otherwise "cpu".

    Returns:
        dict: {'model': <model moved to device>, 'processor': <processor>}.

    Raises:
        ValueError: If model_name is not one of the supported names.
    """
    # Fail early with a clear message instead of an unbound-variable error below.
    if model_name not in ("blip2", "florence2"):
        raise ValueError(
            f"Unsupported caption model {model_name!r}; expected 'blip2' or 'florence2'."
        )
    if not device:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    # Full precision on CPU, half precision on any other device.
    dtype = torch.float32 if device == 'cpu' else torch.float16

    if model_name == "blip2":
        from transformers import Blip2Processor, Blip2ForConditionalGeneration
        processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
        model = Blip2ForConditionalGeneration.from_pretrained(
            model_name_or_path, device_map=None, torch_dtype=dtype
        )
    else:  # "florence2"
        from transformers import AutoProcessor, AutoModelForCausalLM
        processor = AutoProcessor.from_pretrained("microsoft/Florence-2-base", trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(
            model_name_or_path, torch_dtype=dtype, trust_remote_code=True
        )
    return {'model': model.to(device), 'processor': processor}
58
+
59
+
60
def get_yolo_model(model_path):
    """Build and return an Ultralytics ``YOLO`` model from ``model_path``."""
    # Imported here so ultralytics is only needed when a YOLO model is requested.
    from ultralytics import YOLO
    return YOLO(model_path)
65
+
66
+
67
def get_parsed_content_icon(filtered_boxes, ocr_bbox, image_source, caption_model_processor, prompt=None):
    """Caption every non-OCR box by cropping it and running the caption model.

    Parameters:
        filtered_boxes: Boxes in ratio (0-1) xyxy form. When ``ocr_bbox`` is
            truthy, the first ``len(ocr_bbox)`` entries are skipped.
        ocr_bbox: OCR boxes, or a falsy value when there are none.
        image_source: Image array indexed as [row, column, channel].
        caption_model_processor (dict): Holds 'model' and 'processor'.
        prompt (str): Caption prompt. When falsy, "<CAPTION>" is used for
            Florence models and "The image shows" otherwise.

    Returns:
        list[str]: One stripped caption per cropped box, in box order.
    """
    icon_boxes = filtered_boxes[len(ocr_bbox):] if ocr_bbox else filtered_boxes

    pil_converter = ToPILImage()
    icon_crops = []
    for box in icon_boxes:
        # Ratio coordinates are scaled back to pixels before slicing.
        left, right = int(box[0]*image_source.shape[1]), int(box[2]*image_source.shape[1])
        top, bottom = int(box[1]*image_source.shape[0]), int(box[3]*image_source.shape[0])
        icon_crops.append(pil_converter(image_source[top:bottom, left:right, :]))

    model = caption_model_processor['model']
    processor = caption_model_processor['processor']
    if not prompt:
        prompt = "<CAPTION>" if 'florence' in model.config.name_or_path else "The image shows"

    batch_size = 10  # Number of samples per batch
    captions = []
    device = model.device

    for start in range(0, len(icon_crops), batch_size):
        chunk = icon_crops[start:start+batch_size]
        on_cuda = model.device.type == 'cuda'
        encoded = processor(images=chunk, text=[prompt]*len(chunk), return_tensors="pt")
        if on_cuda:
            inputs = encoded.to(device=device, dtype=torch.float16)
        else:
            inputs = encoded.to(device=device)

        if 'florence' in model.config.name_or_path:
            generated_ids = model.generate(input_ids=inputs["input_ids"],pixel_values=inputs["pixel_values"],max_new_tokens=1024,num_beams=3, do_sample=False)
        else:
            generated_ids = model.generate(**inputs, max_length=100, num_beams=5, no_repeat_ngram_size=2, early_stopping=True, num_return_sequences=1)

        decoded = processor.batch_decode(generated_ids, skip_special_tokens=True)
        captions.extend(text.strip() for text in decoded)

    return captions
106
+
107
+
108
+
109
def get_parsed_content_icon_phi3v(filtered_boxes, ocr_bbox, image_source, caption_model_processor):
    """Describe every non-OCR box with a phi3-v style chat model.

    Parameters:
        filtered_boxes: Boxes in ratio (0-1) xyxy form. When ``ocr_bbox`` is
            truthy, the first ``len(ocr_bbox)`` entries are skipped.
        ocr_bbox: OCR boxes, or a falsy value when there are none.
        image_source: Image array indexed as [row, column, channel].
        caption_model_processor (dict): Holds 'model' and 'processor'.

    Returns:
        list[str]: One stripped description per cropped box, in box order.
    """
    icon_boxes = filtered_boxes[len(ocr_bbox):] if ocr_bbox else filtered_boxes

    pil_converter = ToPILImage()
    icon_crops = []
    for box in icon_boxes:
        left, right = int(box[0]*image_source.shape[1]), int(box[2]*image_source.shape[1])
        top, bottom = int(box[1]*image_source.shape[0]), int(box[3]*image_source.shape[0])
        icon_crops.append(pil_converter(image_source[top:bottom, left:right, :]))

    model = caption_model_processor['model']
    processor = caption_model_processor['processor']
    device = model.device
    messages = [{"role": "user", "content": "<|image_1|>\ndescribe the icon in one sentence"}]
    prompt = processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    batch_size = 5  # Number of samples per batch
    descriptions = []

    for start in range(0, len(icon_crops), batch_size):
        images = icon_crops[start:start+batch_size]
        image_inputs = [processor.image_processor(x, return_tensors="pt") for x in images]

        # Encode each (image, prompt) pair on its own and collect the pieces per key.
        inputs = {'input_ids': [], 'attention_mask': [], 'pixel_values': [], 'image_sizes': []}
        for image_input in image_inputs:
            encoded = processor._convert_images_texts_to_inputs(image_input, prompt, return_tensors="pt")
            for key in inputs:
                inputs[key].append(encoded[key])

        # Left-pad ids and masks to a common length so they can be concatenated.
        max_len = max(ids.shape[1] for ids in inputs['input_ids'])
        for idx, ids in enumerate(inputs['input_ids']):
            pad = max_len - ids.shape[1]
            inputs['input_ids'][idx] = torch.cat([processor.tokenizer.pad_token_id * torch.ones(1, pad, dtype=torch.long), ids], dim=1)
            inputs['attention_mask'][idx] = torch.cat([torch.zeros(1, pad, dtype=torch.long), inputs['attention_mask'][idx]], dim=1)
        inputs_cat = {k: torch.concatenate(v).to(device) for k, v in inputs.items()}

        generation_args = {
            "max_new_tokens": 25,
            "temperature": 0.01,
            "do_sample": False,
        }
        generate_ids = model.generate(**inputs_cat, eos_token_id=processor.tokenizer.eos_token_id, **generation_args)
        # Drop the echoed prompt tokens; only newly generated tokens are decoded.
        generate_ids = generate_ids[:, inputs_cat['input_ids'].shape[1]:]
        decoded = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
        descriptions.extend(text.strip('\n').strip() for text in decoded)

    return descriptions
160
+
161
def remove_overlap(boxes, iou_threshold, ocr_bbox=None):
    """Filter overlapping detection boxes and merge them with OCR boxes.

    A detection is dropped when it overlaps another detection above
    ``iou_threshold`` and is the larger of the two, or when it overlaps any
    OCR box above ``iou_threshold``. OCR boxes are always kept and come first
    in the result.

    Parameters:
        boxes: Detections in xyxy form; must provide ``.tolist()``.
        iou_threshold (float): Overlap score above which boxes conflict.
        ocr_bbox (list | None): OCR boxes in the same xyxy form.

    Returns:
        torch.Tensor: Kept boxes with shape (N, 4). N may be 0; the result is
        still 2-D so downstream box conversion keeps working.
    """
    assert ocr_bbox is None or isinstance(ocr_bbox, list)

    def box_area(box):
        return (box[2] - box[0]) * (box[3] - box[1])

    def intersection_area(box1, box2):
        x1 = max(box1[0], box2[0])
        y1 = max(box1[1], box2[1])
        x2 = min(box1[2], box2[2])
        y2 = min(box1[3], box2[3])
        return max(0, x2 - x1) * max(0, y2 - y1)

    def IoU(box1, box2):
        # Max of plain IoU and both containment ratios, so a box nested in
        # another counts as fully overlapping.
        intersection = intersection_area(box1, box2)
        union = box_area(box1) + box_area(box2) - intersection + 1e-6
        if box_area(box1) > 0 and box_area(box2) > 0:
            ratio1 = intersection / box_area(box1)
            ratio2 = intersection / box_area(box2)
        else:
            ratio1, ratio2 = 0, 0
        return max(intersection / union, ratio1, ratio2)

    boxes = boxes.tolist()
    filtered_boxes = list(ocr_bbox) if ocr_bbox else []
    for i, box1 in enumerate(boxes):
        # Of two conflicting detections, the larger one is discarded.
        covers_smaller_box = any(
            i != j and IoU(box1, box2) > iou_threshold and box_area(box1) > box_area(box2)
            for j, box2 in enumerate(boxes)
        )
        if covers_smaller_box:
            continue
        # OCR boxes take precedence over detections that overlap them.
        if ocr_bbox and any(IoU(box1, box3) > iou_threshold for box3 in ocr_bbox):
            continue
        filtered_boxes.append(box1)
    # reshape keeps an empty result as (0, 4) rather than a flat (0,) tensor.
    return torch.tensor(filtered_boxes).reshape(-1, 4)
204
+
205
def load_image(image_path: str) -> Tuple[np.array, torch.Tensor]:
    """Load an image and return it raw and as a normalized tensor.

    The original body called ``T.RandomResize`` and ``transform(image, None)``,
    which belong to a different transforms API than the ``torchvision.transforms``
    module imported here as ``T``. It now uses the torchvision equivalents.

    Parameters:
        image_path (str): Path of the image file.

    Returns:
        Tuple of (RGB image as a numpy array, resized/normalized tensor).
    """
    transform = T.Compose(
        [
            # Resize with size 800, capping the other side at 1333.
            T.Resize(800, max_size=1333),
            T.ToTensor(),
            T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]
    )
    image_source = Image.open(image_path).convert("RGB")
    image = np.asarray(image_source)
    image_transformed = transform(image_source)
    return image, image_transformed
217
+
218
+
219
def annotate(image_source: np.ndarray, boxes: torch.Tensor, logits: torch.Tensor, phrases: List[str], text_scale: float,
             text_padding=5, text_thickness=2, thickness=3) -> Tuple[np.ndarray, dict]:
    """
    This function annotates an image with bounding boxes and numeric labels.

    Parameters:
    image_source (np.ndarray): The source image to be annotated; it is copied, not modified.
    boxes (torch.Tensor): Bounding boxes in cxcywh format, as ratios of image width/height
        (they are scaled to pixels here).
    logits (torch.Tensor): Confidence scores for each bounding box. Currently unused.
    phrases (List[str]): One key per bounding box, used for the returned coordinate mapping.
        The labels drawn on the image are the box indices, not these phrases.
    text_scale (float): The scale of the text to be displayed. 0.8 for mobile/web, 0.3 for desktop # 0.4 for mind2web
    text_padding, text_thickness, thickness: Drawing options forwarded to BoxAnnotator.

    Returns:
    Tuple[np.ndarray, dict]: The annotated image, and a mapping from str(phrase)
        to that box's pixel-scale xywh coordinates.
    """
    h, w, _ = image_source.shape
    boxes = boxes * torch.Tensor([w, h, w, h])
    xyxy = box_convert(boxes=boxes, in_fmt="cxcywh", out_fmt="xyxy").numpy()
    xywh = box_convert(boxes=boxes, in_fmt="cxcywh", out_fmt="xywh").numpy()
    detections = sv.Detections(xyxy=xyxy)

    # Each box is labelled with its running index.
    labels = [str(index) for index in range(boxes.shape[0])]

    from util.box_annotator import BoxAnnotator
    box_annotator = BoxAnnotator(text_scale=text_scale, text_padding=text_padding, text_thickness=text_thickness, thickness=thickness)
    annotated_frame = box_annotator.annotate(scene=image_source.copy(), detections=detections, labels=labels, image_size=(w, h))

    label_coordinates = {f"{phrase}": v for phrase, v in zip(phrases, xywh)}
    return annotated_frame, label_coordinates
249
+
250
+
251
def predict(model, image, caption, box_threshold, text_threshold):
    """ Run grounded object detection with a Hugging Face model/processor pair.

    Parameters:
        model (dict): Holds 'model' and 'processor'.
        image: Image exposing ``.size``; passed to the processor.
        caption: Text prompt passed to the processor.
        box_threshold, text_threshold: Forwarded to the processor's
            ``post_process_grounded_object_detection``.

    Returns:
        Tuple (boxes, scores, labels) taken from the first post-processed result.
    """
    detector, processor = model['model'], model['processor']

    inputs = processor(images=image, text=caption, return_tensors="pt").to(detector.device)
    with torch.no_grad():
        outputs = detector(**inputs)

    # image.size is reversed before being passed as the target size.
    first_result = processor.post_process_grounded_object_detection(
        outputs,
        inputs.input_ids,
        box_threshold=box_threshold,
        text_threshold=text_threshold,
        target_sizes=[image.size[::-1]],
    )[0]
    return first_result["boxes"], first_result["scores"], first_result["labels"]
270
+
271
+
272
def predict_yolo(model, image_path, box_threshold):
    """ Run the YOLO detector on one image and return its boxes.

    Parameters:
        model: Object whose ``predict(source=..., conf=...)`` returns a sequence of results.
        image_path: Image source forwarded to ``model.predict``.
        box_threshold: Confidence threshold forwarded as ``conf``.

    Returns:
        Tuple (boxes, conf, phrases): xyxy boxes and confidences of the first
        result, plus one string index ("0", "1", ...) per box.
    """
    results = model.predict(source=image_path, conf=box_threshold)
    detected = results[0].boxes
    boxes = detected.xyxy  # in pixel space
    conf = detected.conf
    phrases = list(map(str, range(len(boxes))))
    return boxes, conf, phrases
287
+
288
+
289
def get_som_labeled_img(img_path, model=None, BOX_TRESHOLD = 0.01, output_coord_in_ratio=False, ocr_bbox=None, text_scale=0.4, text_padding=5, draw_bbox_config=None, caption_model_processor=None, ocr_text=None, use_local_semantics=True, iou_threshold=0.9,prompt=None):
    """ Detect icons in a screenshot, merge them with OCR boxes and draw numbered labels.

    Parameters:
        img_path: Path of the image file.
        model: YOLO model passed to ``predict_yolo``.
        BOX_TRESHOLD (float): Detection confidence threshold.
        output_coord_in_ratio (bool): Return coordinates as ratios of width/height instead of pixels.
        ocr_bbox: list of xyxy format bbox, in pixel coordinates.
        text_scale, text_padding: Drawing options used when ``draw_bbox_config`` is not given.
        draw_bbox_config (dict): Keyword arguments forwarded to ``annotate`` instead of the two above.
        caption_model_processor (dict): 'model'/'processor' pair; required when ``use_local_semantics`` is True.
        ocr_text (list): OCR strings, one per OCR box. Defaults to no text.
        use_local_semantics (bool): Caption every non-OCR box with the caption model.
        iou_threshold (float): Overlap threshold used by ``remove_overlap``.
        prompt: Optional caption prompt.

    Returns:
        Tuple (base64-encoded PNG of the annotated image, {label: xywh coordinates},
        list of "Text Box ID ..." / "Icon Box ID ..." description strings).

    Raises:
        ValueError: If ``use_local_semantics`` is True but no caption model was given.
    """
    # None instead of a shared mutable [] default.
    ocr_text = [] if ocr_text is None else ocr_text
    if use_local_semantics and caption_model_processor is None:
        raise ValueError("caption_model_processor is required when use_local_semantics is True")

    image_source = Image.open(img_path).convert("RGB")
    w, h = image_source.size
    xyxy, logits, phrases = predict_yolo(model=model, image_path=img_path, box_threshold=BOX_TRESHOLD)
    # Detections come back in pixels; the rest of the pipeline works on ratios.
    xyxy = xyxy / torch.Tensor([w, h, w, h]).to(xyxy.device)
    image_source = np.asarray(image_source)
    h, w, _ = image_source.shape

    if ocr_bbox:
        ocr_bbox = torch.tensor(ocr_bbox) / torch.Tensor([w, h, w, h])
        ocr_bbox = ocr_bbox.tolist()
    else:
        print('no ocr bbox!!!')
        ocr_bbox = None
    filtered_boxes = remove_overlap(boxes=xyxy, iou_threshold=iou_threshold, ocr_bbox=ocr_bbox)

    # Text boxes are numbered first; icon captions continue the same numbering.
    ocr_text = [f"Text Box ID {i}: {txt}" for i, txt in enumerate(ocr_text)]
    if use_local_semantics:
        caption_model = caption_model_processor['model']
        if 'phi3_v' in caption_model.config.model_type:
            parsed_content_icon = get_parsed_content_icon_phi3v(filtered_boxes, ocr_bbox, image_source, caption_model_processor)
        else:
            parsed_content_icon = get_parsed_content_icon(filtered_boxes, ocr_bbox, image_source, caption_model_processor, prompt=prompt)
        icon_start = len(ocr_text)
        parsed_content_icon_ls = [
            f"Icon Box ID {str(i+icon_start)}: {txt}" for i, txt in enumerate(parsed_content_icon)
        ]
        parsed_content_merged = ocr_text + parsed_content_icon_ls
    else:
        parsed_content_merged = ocr_text

    filtered_boxes = box_convert(boxes=filtered_boxes, in_fmt="xyxy", out_fmt="cxcywh")
    phrases = [i for i in range(len(filtered_boxes))]

    # draw boxes
    if draw_bbox_config:
        annotated_frame, label_coordinates = annotate(image_source=image_source, boxes=filtered_boxes, logits=logits, phrases=phrases, **draw_bbox_config)
    else:
        annotated_frame, label_coordinates = annotate(image_source=image_source, boxes=filtered_boxes, logits=logits, phrases=phrases, text_scale=text_scale, text_padding=text_padding)

    pil_img = Image.fromarray(annotated_frame)
    buffered = io.BytesIO()
    pil_img.save(buffered, format="PNG")
    encoded_image = base64.b64encode(buffered.getvalue()).decode('ascii')
    if output_coord_in_ratio:
        label_coordinates = {k: [v[0]/w, v[1]/h, v[2]/w, v[3]/h] for k, v in label_coordinates.items()}
        assert w == annotated_frame.shape[1] and h == annotated_frame.shape[0]

    return encoded_image, label_coordinates, parsed_content_merged
353
+
354
+
355
def get_xywh(input):
    """Convert a corner list to integer (x, y, width, height).

    Only ``input[0]`` and ``input[2]`` are read, as opposite corners.
    """
    first_corner, opposite_corner = input[0], input[2]
    width = opposite_corner[0] - first_corner[0]
    height = opposite_corner[1] - first_corner[1]
    return int(first_corner[0]), int(first_corner[1]), int(width), int(height)
359
+
360
def get_xyxy(input):
    """Convert a corner list to integer (x1, y1, x2, y2).

    Only ``input[0]`` and ``input[2]`` are read, as opposite corners.
    """
    first_corner, opposite_corner = input[0], input[2]
    return (
        int(first_corner[0]),
        int(first_corner[1]),
        int(opposite_corner[0]),
        int(opposite_corner[1]),
    )
364
+
365
def get_xywh_yolo(input):
    """Convert a flat (x1, y1, x2, y2) box to integer (x, y, width, height)."""
    left, top = input[0], input[1]
    width = input[2] - left
    height = input[3] - top
    return int(left), int(top), int(width), int(height)
369
+
370
+
371
+
372
def check_ocr_box(image_path, display_img = True, output_bb_format='xywh', goal_filtering=None, easyocr_args=None):
    """Run EasyOCR on an image and return the recognized text with its boxes.

    Parameters:
        image_path: Image path passed to the OCR reader (and to ``cv2.imread`` when displaying).
        display_img (bool): Also draw the boxes on the image and show it with matplotlib.
        output_bb_format (str): 'xywh' or 'xyxy'; format of the returned boxes.
            It is honored whether or not the image is displayed.
        goal_filtering: Unused; kept for interface compatibility.
        easyocr_args (dict): Extra keyword arguments for ``reader.readtext``.

    Returns:
        Tuple ((text, bb), is_goal_filtered): the recognized strings, their
        boxes as integer tuples, and a flag that is always False.

    Raises:
        ValueError: If ``output_bb_format`` is not 'xywh' or 'xyxy'.
    """
    if output_bb_format == 'xywh':
        to_box = get_xywh
    elif output_bb_format == 'xyxy':
        to_box = get_xyxy
    else:
        raise ValueError(f"output_bb_format must be 'xywh' or 'xyxy', got {output_bb_format!r}")

    if easyocr_args is None:
        easyocr_args = {}
    result = reader.readtext(image_path, **easyocr_args)
    is_goal_filtered = False
    coord = [item[0] for item in result]
    text = [item[1] for item in result]
    bb = [to_box(item) for item in coord]

    if display_img:
        opencv_img = cv2.imread(image_path)
        # imread yields BGR; matplotlib expects RGB.
        opencv_img = cv2.cvtColor(opencv_img, cv2.COLOR_BGR2RGB)
        for item in coord:
            # Drawing always uses xywh, independent of the returned format.
            x, y, a, b = get_xywh(item)
            cv2.rectangle(opencv_img, (x, y), (x+a, y+b), (0, 255, 0), 2)
        # Display the image
        plt.imshow(opencv_img)
    return (text, bb), is_goal_filtered
400
+
401
+
402
+
weights/convert_safetensor_to_pt.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
import torch
from ultralytics.nn.tasks import DetectionModel
from safetensors.torch import load_file

# Input weights, model definition, and output checkpoint locations.
SAFETENSORS_PATH = "weights/icon_detect/model.safetensors"
MODEL_YAML_PATH = 'weights/icon_detect/model.yaml'
CHECKPOINT_PATH = 'weights/icon_detect/best.pt'

# Read the safetensors weights, load them into a DetectionModel built from the
# YAML definition, then save the model object under the 'model' key.
tensor_dict = load_file(SAFETENSORS_PATH)

model = DetectionModel(MODEL_YAML_PATH)
model.load_state_dict(tensor_dict)

torch.save({'model': model}, CHECKPOINT_PATH)
weights/icon_caption_florence/LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) Microsoft Corporation.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE
weights/icon_caption_florence/ckpt.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:553e45ab33fb7835894c9c74a13b4f582510cda44446094731d1716be78ff8b0
3
+ size 1084009247
weights/icon_caption_florence/config.json ADDED
@@ -0,0 +1,237 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "microsoft/Florence-2-base-ft",
3
+ "architectures": [
4
+ "Florence2ForConditionalGeneration"
5
+ ],
6
+ "auto_map": {
7
+ "AutoConfig": "microsoft/Florence-2-base-ft--configuration_florence2.Florence2Config",
8
+ "AutoModelForCausalLM": "microsoft/Florence-2-base-ft--modeling_florence2.Florence2ForConditionalGeneration"
9
+ },
10
+ "bos_token_id": 2,
11
+ "eos_token_id": 1,
12
+ "ignore_index": -100,
13
+ "is_encoder_decoder": true,
14
+ "model_type": "florence2",
15
+ "pad_token_id": 0,
16
+ "projection_dim": 768,
17
+ "text_config": {
18
+ "_name_or_path": "",
19
+ "activation_dropout": 0.1,
20
+ "activation_function": "gelu",
21
+ "add_bias_logits": false,
22
+ "add_cross_attention": false,
23
+ "add_final_layer_norm": false,
24
+ "architectures": null,
25
+ "attention_dropout": 0.1,
26
+ "bad_words_ids": null,
27
+ "begin_suppress_tokens": null,
28
+ "bos_token_id": 0,
29
+ "chunk_size_feed_forward": 0,
30
+ "classif_dropout": 0.1,
31
+ "classifier_dropout": 0.0,
32
+ "cross_attention_hidden_size": null,
33
+ "d_model": 768,
34
+ "decoder_attention_heads": 12,
35
+ "decoder_ffn_dim": 3072,
36
+ "decoder_layerdrop": 0.0,
37
+ "decoder_layers": 6,
38
+ "decoder_start_token_id": 2,
39
+ "diversity_penalty": 0.0,
40
+ "do_sample": false,
41
+ "dropout": 0.1,
42
+ "early_stopping": true,
43
+ "encoder_attention_heads": 12,
44
+ "encoder_ffn_dim": 3072,
45
+ "encoder_layerdrop": 0.0,
46
+ "encoder_layers": 6,
47
+ "encoder_no_repeat_ngram_size": 0,
48
+ "eos_token_id": 2,
49
+ "exponential_decay_length_penalty": null,
50
+ "finetuning_task": null,
51
+ "forced_bos_token_id": 0,
52
+ "forced_eos_token_id": 2,
53
+ "gradient_checkpointing": false,
54
+ "id2label": {
55
+ "0": "LABEL_0",
56
+ "1": "LABEL_1",
57
+ "2": "LABEL_2"
58
+ },
59
+ "init_std": 0.02,
60
+ "is_decoder": false,
61
+ "is_encoder_decoder": true,
62
+ "label2id": {
63
+ "LABEL_0": 0,
64
+ "LABEL_1": 1,
65
+ "LABEL_2": 2
66
+ },
67
+ "length_penalty": 1.0,
68
+ "max_length": 20,
69
+ "max_position_embeddings": 1024,
70
+ "min_length": 0,
71
+ "model_type": "florence2_language",
72
+ "no_repeat_ngram_size": 3,
73
+ "normalize_before": false,
74
+ "num_beam_groups": 1,
75
+ "num_beams": 3,
76
+ "num_hidden_layers": 6,
77
+ "num_return_sequences": 1,
78
+ "output_attentions": false,
79
+ "output_hidden_states": false,
80
+ "output_scores": false,
81
+ "pad_token_id": 1,
82
+ "prefix": null,
83
+ "problem_type": null,
84
+ "pruned_heads": {},
85
+ "remove_invalid_values": false,
86
+ "repetition_penalty": 1.0,
87
+ "return_dict": true,
88
+ "return_dict_in_generate": false,
89
+ "scale_embedding": false,
90
+ "sep_token_id": null,
91
+ "suppress_tokens": null,
92
+ "task_specific_params": null,
93
+ "temperature": 1.0,
94
+ "tf_legacy_loss": false,
95
+ "tie_encoder_decoder": false,
96
+ "tie_word_embeddings": true,
97
+ "tokenizer_class": null,
98
+ "top_k": 50,
99
+ "top_p": 1.0,
100
+ "torch_dtype": null,
101
+ "torchscript": false,
102
+ "typical_p": 1.0,
103
+ "use_bfloat16": false,
104
+ "use_cache": true,
105
+ "vocab_size": 51289
106
+ },
107
+ "torch_dtype": "float32",
108
+ "transformers_version": "4.40.2",
109
+ "vision_config": {
110
+ "_name_or_path": "",
111
+ "add_cross_attention": false,
112
+ "architectures": null,
113
+ "bad_words_ids": null,
114
+ "begin_suppress_tokens": null,
115
+ "bos_token_id": null,
116
+ "chunk_size_feed_forward": 0,
117
+ "cross_attention_hidden_size": null,
118
+ "decoder_start_token_id": null,
119
+ "depths": [
120
+ 1,
121
+ 1,
122
+ 9,
123
+ 1
124
+ ],
125
+ "dim_embed": [
126
+ 128,
127
+ 256,
128
+ 512,
129
+ 1024
130
+ ],
131
+ "diversity_penalty": 0.0,
132
+ "do_sample": false,
133
+ "drop_path_rate": 0.1,
134
+ "early_stopping": false,
135
+ "enable_checkpoint": false,
136
+ "encoder_no_repeat_ngram_size": 0,
137
+ "eos_token_id": null,
138
+ "exponential_decay_length_penalty": null,
139
+ "finetuning_task": null,
140
+ "forced_bos_token_id": null,
141
+ "forced_eos_token_id": null,
142
+ "id2label": {
143
+ "0": "LABEL_0",
144
+ "1": "LABEL_1"
145
+ },
146
+ "image_feature_source": [
147
+ "spatial_avg_pool",
148
+ "temporal_avg_pool"
149
+ ],
150
+ "image_pos_embed": {
151
+ "max_pos_embeddings": 50,
152
+ "type": "learned_abs_2d"
153
+ },
154
+ "is_decoder": false,
155
+ "is_encoder_decoder": false,
156
+ "label2id": {
157
+ "LABEL_0": 0,
158
+ "LABEL_1": 1
159
+ },
160
+ "length_penalty": 1.0,
161
+ "max_length": 20,
162
+ "min_length": 0,
163
+ "model_type": "davit",
164
+ "no_repeat_ngram_size": 0,
165
+ "num_beam_groups": 1,
166
+ "num_beams": 1,
167
+ "num_groups": [
168
+ 4,
169
+ 8,
170
+ 16,
171
+ 32
172
+ ],
173
+ "num_heads": [
174
+ 4,
175
+ 8,
176
+ 16,
177
+ 32
178
+ ],
179
+ "num_return_sequences": 1,
180
+ "output_attentions": false,
181
+ "output_hidden_states": false,
182
+ "output_scores": false,
183
+ "pad_token_id": null,
184
+ "patch_padding": [
185
+ 3,
186
+ 1,
187
+ 1,
188
+ 1
189
+ ],
190
+ "patch_prenorm": [
191
+ false,
192
+ true,
193
+ true,
194
+ true
195
+ ],
196
+ "patch_size": [
197
+ 7,
198
+ 3,
199
+ 3,
200
+ 3
201
+ ],
202
+ "patch_stride": [
203
+ 4,
204
+ 2,
205
+ 2,
206
+ 2
207
+ ],
208
+ "prefix": null,
209
+ "problem_type": null,
210
+ "projection_dim": 768,
211
+ "pruned_heads": {},
212
+ "remove_invalid_values": false,
213
+ "repetition_penalty": 1.0,
214
+ "return_dict": true,
215
+ "return_dict_in_generate": false,
216
+ "sep_token_id": null,
217
+ "suppress_tokens": null,
218
+ "task_specific_params": null,
219
+ "temperature": 1.0,
220
+ "tf_legacy_loss": false,
221
+ "tie_encoder_decoder": false,
222
+ "tie_word_embeddings": true,
223
+ "tokenizer_class": null,
224
+ "top_k": 50,
225
+ "top_p": 1.0,
226
+ "torch_dtype": null,
227
+ "torchscript": false,
228
+ "typical_p": 1.0,
229
+ "use_bfloat16": false,
230
+ "visual_temporal_embedding": {
231
+ "max_temporal_embeddings": 100,
232
+ "type": "COSINE"
233
+ },
234
+ "window_size": 12
235
+ },
236
+ "vocab_size": 51289
237
+ }
weights/icon_caption_florence/generation_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 2,
4
+ "decoder_start_token_id": 2,
5
+ "early_stopping": true,
6
+ "eos_token_id": 1,
7
+ "forced_bos_token_id": 0,
8
+ "forced_eos_token_id": 2,
9
+ "no_repeat_ngram_size": 3,
10
+ "num_beams": 3,
11
+ "pad_token_id": 0,
12
+ "transformers_version": "4.40.2"
13
+ }
weights/icon_caption_florence/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c0d7a6bedf8c6dbabe7d40b9f78ada36e78c1a93617506bc06a93279a78dfb14
3
+ size 1083916964
weights/icon_detect/LICENSE ADDED
@@ -0,0 +1,661 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ GNU AFFERO GENERAL PUBLIC LICENSE
2
+ Version 3, 19 November 2007
3
+
4
+ Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
5
+ Everyone is permitted to copy and distribute verbatim copies
6
+ of this license document, but changing it is not allowed.
7
+
8
+ Preamble
9
+
10
+ The GNU Affero General Public License is a free, copyleft license for
11
+ software and other kinds of works, specifically designed to ensure
12
+ cooperation with the community in the case of network server software.
13
+
14
+ The licenses for most software and other practical works are designed
15
+ to take away your freedom to share and change the works. By contrast,
16
+ our General Public Licenses are intended to guarantee your freedom to
17
+ share and change all versions of a program--to make sure it remains free
18
+ software for all its users.
19
+
20
+ When we speak of free software, we are referring to freedom, not
21
+ price. Our General Public Licenses are designed to make sure that you
22
+ have the freedom to distribute copies of free software (and charge for
23
+ them if you wish), that you receive source code or can get it if you
24
+ want it, that you can change the software or use pieces of it in new
25
+ free programs, and that you know you can do these things.
26
+
27
+ Developers that use our General Public Licenses protect your rights
28
+ with two steps: (1) assert copyright on the software, and (2) offer
29
+ you this License which gives you legal permission to copy, distribute
30
+ and/or modify the software.
31
+
32
+ A secondary benefit of defending all users' freedom is that
33
+ improvements made in alternate versions of the program, if they
34
+ receive widespread use, become available for other developers to
35
+ incorporate. Many developers of free software are heartened and
36
+ encouraged by the resulting cooperation. However, in the case of
37
+ software used on network servers, this result may fail to come about.
38
+ The GNU General Public License permits making a modified version and
39
+ letting the public access it on a server without ever releasing its
40
+ source code to the public.
41
+
42
+ The GNU Affero General Public License is designed specifically to
43
+ ensure that, in such cases, the modified source code becomes available
44
+ to the community. It requires the operator of a network server to
45
+ provide the source code of the modified version running there to the
46
+ users of that server. Therefore, public use of a modified version, on
47
+ a publicly accessible server, gives the public access to the source
48
+ code of the modified version.
49
+
50
+ An older license, called the Affero General Public License and
51
+ published by Affero, was designed to accomplish similar goals. This is
52
+ a different license, not a version of the Affero GPL, but Affero has
53
+ released a new version of the Affero GPL which permits relicensing under
54
+ this license.
55
+
56
+ The precise terms and conditions for copying, distribution and
57
+ modification follow.
58
+
59
+ TERMS AND CONDITIONS
60
+
61
+ 0. Definitions.
62
+
63
+ "This License" refers to version 3 of the GNU Affero General Public License.
64
+
65
+ "Copyright" also means copyright-like laws that apply to other kinds of
66
+ works, such as semiconductor masks.
67
+
68
+ "The Program" refers to any copyrightable work licensed under this
69
+ License. Each licensee is addressed as "you". "Licensees" and
70
+ "recipients" may be individuals or organizations.
71
+
72
+ To "modify" a work means to copy from or adapt all or part of the work
73
+ in a fashion requiring copyright permission, other than the making of an
74
+ exact copy. The resulting work is called a "modified version" of the
75
+ earlier work or a work "based on" the earlier work.
76
+
77
+ A "covered work" means either the unmodified Program or a work based
78
+ on the Program.
79
+
80
+ To "propagate" a work means to do anything with it that, without
81
+ permission, would make you directly or secondarily liable for
82
+ infringement under applicable copyright law, except executing it on a
83
+ computer or modifying a private copy. Propagation includes copying,
84
+ distribution (with or without modification), making available to the
85
+ public, and in some countries other activities as well.
86
+
87
+ To "convey" a work means any kind of propagation that enables other
88
+ parties to make or receive copies. Mere interaction with a user through
89
+ a computer network, with no transfer of a copy, is not conveying.
90
+
91
+ An interactive user interface displays "Appropriate Legal Notices"
92
+ to the extent that it includes a convenient and prominently visible
93
+ feature that (1) displays an appropriate copyright notice, and (2)
94
+ tells the user that there is no warranty for the work (except to the
95
+ extent that warranties are provided), that licensees may convey the
96
+ work under this License, and how to view a copy of this License. If
97
+ the interface presents a list of user commands or options, such as a
98
+ menu, a prominent item in the list meets this criterion.
99
+
100
+ 1. Source Code.
101
+
102
+ The "source code" for a work means the preferred form of the work
103
+ for making modifications to it. "Object code" means any non-source
104
+ form of a work.
105
+
106
+ A "Standard Interface" means an interface that either is an official
107
+ standard defined by a recognized standards body, or, in the case of
108
+ interfaces specified for a particular programming language, one that
109
+ is widely used among developers working in that language.
110
+
111
+ The "System Libraries" of an executable work include anything, other
112
+ than the work as a whole, that (a) is included in the normal form of
113
+ packaging a Major Component, but which is not part of that Major
114
+ Component, and (b) serves only to enable use of the work with that
115
+ Major Component, or to implement a Standard Interface for which an
116
+ implementation is available to the public in source code form. A
117
+ "Major Component", in this context, means a major essential component
118
+ (kernel, window system, and so on) of the specific operating system
119
+ (if any) on which the executable work runs, or a compiler used to
120
+ produce the work, or an object code interpreter used to run it.
121
+
122
+ The "Corresponding Source" for a work in object code form means all
123
+ the source code needed to generate, install, and (for an executable
124
+ work) run the object code and to modify the work, including scripts to
125
+ control those activities. However, it does not include the work's
126
+ System Libraries, or general-purpose tools or generally available free
127
+ programs which are used unmodified in performing those activities but
128
+ which are not part of the work. For example, Corresponding Source
129
+ includes interface definition files associated with source files for
130
+ the work, and the source code for shared libraries and dynamically
131
+ linked subprograms that the work is specifically designed to require,
132
+ such as by intimate data communication or control flow between those
133
+ subprograms and other parts of the work.
134
+
135
+ The Corresponding Source need not include anything that users
136
+ can regenerate automatically from other parts of the Corresponding
137
+ Source.
138
+
139
+ The Corresponding Source for a work in source code form is that
140
+ same work.
141
+
142
+ 2. Basic Permissions.
143
+
144
+ All rights granted under this License are granted for the term of
145
+ copyright on the Program, and are irrevocable provided the stated
146
+ conditions are met. This License explicitly affirms your unlimited
147
+ permission to run the unmodified Program. The output from running a
148
+ covered work is covered by this License only if the output, given its
149
+ content, constitutes a covered work. This License acknowledges your
150
+ rights of fair use or other equivalent, as provided by copyright law.
151
+
152
+ You may make, run and propagate covered works that you do not
153
+ convey, without conditions so long as your license otherwise remains
154
+ in force. You may convey covered works to others for the sole purpose
155
+ of having them make modifications exclusively for you, or provide you
156
+ with facilities for running those works, provided that you comply with
157
+ the terms of this License in conveying all material for which you do
158
+ not control copyright. Those thus making or running the covered works
159
+ for you must do so exclusively on your behalf, under your direction
160
+ and control, on terms that prohibit them from making any copies of
161
+ your copyrighted material outside their relationship with you.
162
+
163
+ Conveying under any other circumstances is permitted solely under
164
+ the conditions stated below. Sublicensing is not allowed; section 10
165
+ makes it unnecessary.
166
+
167
+ 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
168
+
169
+ No covered work shall be deemed part of an effective technological
170
+ measure under any applicable law fulfilling obligations under article
171
+ 11 of the WIPO copyright treaty adopted on 20 December 1996, or
172
+ similar laws prohibiting or restricting circumvention of such
173
+ measures.
174
+
175
+ When you convey a covered work, you waive any legal power to forbid
176
+ circumvention of technological measures to the extent such circumvention
177
+ is effected by exercising rights under this License with respect to
178
+ the covered work, and you disclaim any intention to limit operation or
179
+ modification of the work as a means of enforcing, against the work's
180
+ users, your or third parties' legal rights to forbid circumvention of
181
+ technological measures.
182
+
183
+ 4. Conveying Verbatim Copies.
184
+
185
+ You may convey verbatim copies of the Program's source code as you
186
+ receive it, in any medium, provided that you conspicuously and
187
+ appropriately publish on each copy an appropriate copyright notice;
188
+ keep intact all notices stating that this License and any
189
+ non-permissive terms added in accord with section 7 apply to the code;
190
+ keep intact all notices of the absence of any warranty; and give all
191
+ recipients a copy of this License along with the Program.
192
+
193
+ You may charge any price or no price for each copy that you convey,
194
+ and you may offer support or warranty protection for a fee.
195
+
196
+ 5. Conveying Modified Source Versions.
197
+
198
+ You may convey a work based on the Program, or the modifications to
199
+ produce it from the Program, in the form of source code under the
200
+ terms of section 4, provided that you also meet all of these conditions:
201
+
202
+ a) The work must carry prominent notices stating that you modified
203
+ it, and giving a relevant date.
204
+
205
+ b) The work must carry prominent notices stating that it is
206
+ released under this License and any conditions added under section
207
+ 7. This requirement modifies the requirement in section 4 to
208
+ "keep intact all notices".
209
+
210
+ c) You must license the entire work, as a whole, under this
211
+ License to anyone who comes into possession of a copy. This
212
+ License will therefore apply, along with any applicable section 7
213
+ additional terms, to the whole of the work, and all its parts,
214
+ regardless of how they are packaged. This License gives no
215
+ permission to license the work in any other way, but it does not
216
+ invalidate such permission if you have separately received it.
217
+
218
+ d) If the work has interactive user interfaces, each must display
219
+ Appropriate Legal Notices; however, if the Program has interactive
220
+ interfaces that do not display Appropriate Legal Notices, your
221
+ work need not make them do so.
222
+
223
+ A compilation of a covered work with other separate and independent
224
+ works, which are not by their nature extensions of the covered work,
225
+ and which are not combined with it such as to form a larger program,
226
+ in or on a volume of a storage or distribution medium, is called an
227
+ "aggregate" if the compilation and its resulting copyright are not
228
+ used to limit the access or legal rights of the compilation's users
229
+ beyond what the individual works permit. Inclusion of a covered work
230
+ in an aggregate does not cause this License to apply to the other
231
+ parts of the aggregate.
232
+
233
+ 6. Conveying Non-Source Forms.
234
+
235
+ You may convey a covered work in object code form under the terms
236
+ of sections 4 and 5, provided that you also convey the
237
+ machine-readable Corresponding Source under the terms of this License,
238
+ in one of these ways:
239
+
240
+ a) Convey the object code in, or embodied in, a physical product
241
+ (including a physical distribution medium), accompanied by the
242
+ Corresponding Source fixed on a durable physical medium
243
+ customarily used for software interchange.
244
+
245
+ b) Convey the object code in, or embodied in, a physical product
246
+ (including a physical distribution medium), accompanied by a
247
+ written offer, valid for at least three years and valid for as
248
+ long as you offer spare parts or customer support for that product
249
+ model, to give anyone who possesses the object code either (1) a
250
+ copy of the Corresponding Source for all the software in the
251
+ product that is covered by this License, on a durable physical
252
+ medium customarily used for software interchange, for a price no
253
+ more than your reasonable cost of physically performing this
254
+ conveying of source, or (2) access to copy the
255
+ Corresponding Source from a network server at no charge.
256
+
257
+ c) Convey individual copies of the object code with a copy of the
258
+ written offer to provide the Corresponding Source. This
259
+ alternative is allowed only occasionally and noncommercially, and
260
+ only if you received the object code with such an offer, in accord
261
+ with subsection 6b.
262
+
263
+ d) Convey the object code by offering access from a designated
264
+ place (gratis or for a charge), and offer equivalent access to the
265
+ Corresponding Source in the same way through the same place at no
266
+ further charge. You need not require recipients to copy the
267
+ Corresponding Source along with the object code. If the place to
268
+ copy the object code is a network server, the Corresponding Source
269
+ may be on a different server (operated by you or a third party)
270
+ that supports equivalent copying facilities, provided you maintain
271
+ clear directions next to the object code saying where to find the
272
+ Corresponding Source. Regardless of what server hosts the
273
+ Corresponding Source, you remain obligated to ensure that it is
274
+ available for as long as needed to satisfy these requirements.
275
+
276
+ e) Convey the object code using peer-to-peer transmission, provided
277
+ you inform other peers where the object code and Corresponding
278
+ Source of the work are being offered to the general public at no
279
+ charge under subsection 6d.
280
+
281
+ A separable portion of the object code, whose source code is excluded
282
+ from the Corresponding Source as a System Library, need not be
283
+ included in conveying the object code work.
284
+
285
+ A "User Product" is either (1) a "consumer product", which means any
286
+ tangible personal property which is normally used for personal, family,
287
+ or household purposes, or (2) anything designed or sold for incorporation
288
+ into a dwelling. In determining whether a product is a consumer product,
289
+ doubtful cases shall be resolved in favor of coverage. For a particular
290
+ product received by a particular user, "normally used" refers to a
291
+ typical or common use of that class of product, regardless of the status
292
+ of the particular user or of the way in which the particular user
293
+ actually uses, or expects or is expected to use, the product. A product
294
+ is a consumer product regardless of whether the product has substantial
295
+ commercial, industrial or non-consumer uses, unless such uses represent
296
+ the only significant mode of use of the product.
297
+
298
+ "Installation Information" for a User Product means any methods,
299
+ procedures, authorization keys, or other information required to install
300
+ and execute modified versions of a covered work in that User Product from
301
+ a modified version of its Corresponding Source. The information must
302
+ suffice to ensure that the continued functioning of the modified object
303
+ code is in no case prevented or interfered with solely because
304
+ modification has been made.
305
+
306
+ If you convey an object code work under this section in, or with, or
307
+ specifically for use in, a User Product, and the conveying occurs as
308
+ part of a transaction in which the right of possession and use of the
309
+ User Product is transferred to the recipient in perpetuity or for a
310
+ fixed term (regardless of how the transaction is characterized), the
311
+ Corresponding Source conveyed under this section must be accompanied
312
+ by the Installation Information. But this requirement does not apply
313
+ if neither you nor any third party retains the ability to install
314
+ modified object code on the User Product (for example, the work has
315
+ been installed in ROM).
316
+
317
+ The requirement to provide Installation Information does not include a
318
+ requirement to continue to provide support service, warranty, or updates
319
+ for a work that has been modified or installed by the recipient, or for
320
+ the User Product in which it has been modified or installed. Access to a
321
+ network may be denied when the modification itself materially and
322
+ adversely affects the operation of the network or violates the rules and
323
+ protocols for communication across the network.
324
+
325
+ Corresponding Source conveyed, and Installation Information provided,
326
+ in accord with this section must be in a format that is publicly
327
+ documented (and with an implementation available to the public in
328
+ source code form), and must require no special password or key for
329
+ unpacking, reading or copying.
330
+
331
+ 7. Additional Terms.
332
+
333
+ "Additional permissions" are terms that supplement the terms of this
334
+ License by making exceptions from one or more of its conditions.
335
+ Additional permissions that are applicable to the entire Program shall
336
+ be treated as though they were included in this License, to the extent
337
+ that they are valid under applicable law. If additional permissions
338
+ apply only to part of the Program, that part may be used separately
339
+ under those permissions, but the entire Program remains governed by
340
+ this License without regard to the additional permissions.
341
+
342
+ When you convey a copy of a covered work, you may at your option
343
+ remove any additional permissions from that copy, or from any part of
344
+ it. (Additional permissions may be written to require their own
345
+ removal in certain cases when you modify the work.) You may place
346
+ additional permissions on material, added by you to a covered work,
347
+ for which you have or can give appropriate copyright permission.
348
+
349
+ Notwithstanding any other provision of this License, for material you
350
+ add to a covered work, you may (if authorized by the copyright holders of
351
+ that material) supplement the terms of this License with terms:
352
+
353
+ a) Disclaiming warranty or limiting liability differently from the
354
+ terms of sections 15 and 16 of this License; or
355
+
356
+ b) Requiring preservation of specified reasonable legal notices or
357
+ author attributions in that material or in the Appropriate Legal
358
+ Notices displayed by works containing it; or
359
+
360
+ c) Prohibiting misrepresentation of the origin of that material, or
361
+ requiring that modified versions of such material be marked in
362
+ reasonable ways as different from the original version; or
363
+
364
+ d) Limiting the use for publicity purposes of names of licensors or
365
+ authors of the material; or
366
+
367
+ e) Declining to grant rights under trademark law for use of some
368
+ trade names, trademarks, or service marks; or
369
+
370
+ f) Requiring indemnification of licensors and authors of that
371
+ material by anyone who conveys the material (or modified versions of
372
+ it) with contractual assumptions of liability to the recipient, for
373
+ any liability that these contractual assumptions directly impose on
374
+ those licensors and authors.
375
+
376
+ All other non-permissive additional terms are considered "further
377
+ restrictions" within the meaning of section 10. If the Program as you
378
+ received it, or any part of it, contains a notice stating that it is
379
+ governed by this License along with a term that is a further
380
+ restriction, you may remove that term. If a license document contains
381
+ a further restriction but permits relicensing or conveying under this
382
+ License, you may add to a covered work material governed by the terms
383
+ of that license document, provided that the further restriction does
384
+ not survive such relicensing or conveying.
385
+
386
+ If you add terms to a covered work in accord with this section, you
387
+ must place, in the relevant source files, a statement of the
388
+ additional terms that apply to those files, or a notice indicating
389
+ where to find the applicable terms.
390
+
391
+ Additional terms, permissive or non-permissive, may be stated in the
392
+ form of a separately written license, or stated as exceptions;
393
+ the above requirements apply either way.
394
+
395
+ 8. Termination.
396
+
397
+ You may not propagate or modify a covered work except as expressly
398
+ provided under this License. Any attempt otherwise to propagate or
399
+ modify it is void, and will automatically terminate your rights under
400
+ this License (including any patent licenses granted under the third
401
+ paragraph of section 11).
402
+
403
+ However, if you cease all violation of this License, then your
404
+ license from a particular copyright holder is reinstated (a)
405
+ provisionally, unless and until the copyright holder explicitly and
406
+ finally terminates your license, and (b) permanently, if the copyright
407
+ holder fails to notify you of the violation by some reasonable means
408
+ prior to 60 days after the cessation.
409
+
410
+ Moreover, your license from a particular copyright holder is
411
+ reinstated permanently if the copyright holder notifies you of the
412
+ violation by some reasonable means, this is the first time you have
413
+ received notice of violation of this License (for any work) from that
414
+ copyright holder, and you cure the violation prior to 30 days after
415
+ your receipt of the notice.
416
+
417
+ Termination of your rights under this section does not terminate the
418
+ licenses of parties who have received copies or rights from you under
419
+ this License. If your rights have been terminated and not permanently
420
+ reinstated, you do not qualify to receive new licenses for the same
421
+ material under section 10.
422
+
423
+ 9. Acceptance Not Required for Having Copies.
424
+
425
+ You are not required to accept this License in order to receive or
426
+ run a copy of the Program. Ancillary propagation of a covered work
427
+ occurring solely as a consequence of using peer-to-peer transmission
428
+ to receive a copy likewise does not require acceptance. However,
429
+ nothing other than this License grants you permission to propagate or
430
+ modify any covered work. These actions infringe copyright if you do
431
+ not accept this License. Therefore, by modifying or propagating a
432
+ covered work, you indicate your acceptance of this License to do so.
433
+
434
+ 10. Automatic Licensing of Downstream Recipients.
435
+
436
+ Each time you convey a covered work, the recipient automatically
437
+ receives a license from the original licensors, to run, modify and
438
+ propagate that work, subject to this License. You are not responsible
439
+ for enforcing compliance by third parties with this License.
440
+
441
+ An "entity transaction" is a transaction transferring control of an
442
+ organization, or substantially all assets of one, or subdividing an
443
+ organization, or merging organizations. If propagation of a covered
444
+ work results from an entity transaction, each party to that
445
+ transaction who receives a copy of the work also receives whatever
446
+ licenses to the work the party's predecessor in interest had or could
447
+ give under the previous paragraph, plus a right to possession of the
448
+ Corresponding Source of the work from the predecessor in interest, if
449
+ the predecessor has it or can get it with reasonable efforts.
450
+
451
+ You may not impose any further restrictions on the exercise of the
452
+ rights granted or affirmed under this License. For example, you may
453
+ not impose a license fee, royalty, or other charge for exercise of
454
+ rights granted under this License, and you may not initiate litigation
455
+ (including a cross-claim or counterclaim in a lawsuit) alleging that
456
+ any patent claim is infringed by making, using, selling, offering for
457
+ sale, or importing the Program or any portion of it.
458
+
459
+ 11. Patents.
460
+
461
+ A "contributor" is a copyright holder who authorizes use under this
462
+ License of the Program or a work on which the Program is based. The
463
+ work thus licensed is called the contributor's "contributor version".
464
+
465
+ A contributor's "essential patent claims" are all patent claims
466
+ owned or controlled by the contributor, whether already acquired or
467
+ hereafter acquired, that would be infringed by some manner, permitted
468
+ by this License, of making, using, or selling its contributor version,
469
+ but do not include claims that would be infringed only as a
470
+ consequence of further modification of the contributor version. For
471
+ purposes of this definition, "control" includes the right to grant
472
+ patent sublicenses in a manner consistent with the requirements of
473
+ this License.
474
+
475
+ Each contributor grants you a non-exclusive, worldwide, royalty-free
476
+ patent license under the contributor's essential patent claims, to
477
+ make, use, sell, offer for sale, import and otherwise run, modify and
478
+ propagate the contents of its contributor version.
479
+
480
+ In the following three paragraphs, a "patent license" is any express
481
+ agreement or commitment, however denominated, not to enforce a patent
482
+ (such as an express permission to practice a patent or covenant not to
483
+ sue for patent infringement). To "grant" such a patent license to a
484
+ party means to make such an agreement or commitment not to enforce a
485
+ patent against the party.
486
+
487
+ If you convey a covered work, knowingly relying on a patent license,
488
+ and the Corresponding Source of the work is not available for anyone
489
+ to copy, free of charge and under the terms of this License, through a
490
+ publicly available network server or other readily accessible means,
491
+ then you must either (1) cause the Corresponding Source to be so
492
+ available, or (2) arrange to deprive yourself of the benefit of the
493
+ patent license for this particular work, or (3) arrange, in a manner
494
+ consistent with the requirements of this License, to extend the patent
495
+ license to downstream recipients. "Knowingly relying" means you have
496
+ actual knowledge that, but for the patent license, your conveying the
497
+ covered work in a country, or your recipient's use of the covered work
498
+ in a country, would infringe one or more identifiable patents in that
499
+ country that you have reason to believe are valid.
500
+
501
+ If, pursuant to or in connection with a single transaction or
502
+ arrangement, you convey, or propagate by procuring conveyance of, a
503
+ covered work, and grant a patent license to some of the parties
504
+ receiving the covered work authorizing them to use, propagate, modify
505
+ or convey a specific copy of the covered work, then the patent license
506
+ you grant is automatically extended to all recipients of the covered
507
+ work and works based on it.
508
+
509
+ A patent license is "discriminatory" if it does not include within
510
+ the scope of its coverage, prohibits the exercise of, or is
511
+ conditioned on the non-exercise of one or more of the rights that are
512
+ specifically granted under this License. You may not convey a covered
513
+ work if you are a party to an arrangement with a third party that is
514
+ in the business of distributing software, under which you make payment
515
+ to the third party based on the extent of your activity of conveying
516
+ the work, and under which the third party grants, to any of the
517
+ parties who would receive the covered work from you, a discriminatory
518
+ patent license (a) in connection with copies of the covered work
519
+ conveyed by you (or copies made from those copies), or (b) primarily
520
+ for and in connection with specific products or compilations that
521
+ contain the covered work, unless you entered into that arrangement,
522
+ or that patent license was granted, prior to 28 March 2007.
523
+
524
+ Nothing in this License shall be construed as excluding or limiting
525
+ any implied license or other defenses to infringement that may
526
+ otherwise be available to you under applicable patent law.
527
+
528
+ 12. No Surrender of Others' Freedom.
529
+
530
+ If conditions are imposed on you (whether by court order, agreement or
531
+ otherwise) that contradict the conditions of this License, they do not
532
+ excuse you from the conditions of this License. If you cannot convey a
533
+ covered work so as to satisfy simultaneously your obligations under this
534
+ License and any other pertinent obligations, then as a consequence you may
535
+ not convey it at all. For example, if you agree to terms that obligate you
536
+ to collect a royalty for further conveying from those to whom you convey
537
+ the Program, the only way you could satisfy both those terms and this
538
+ License would be to refrain entirely from conveying the Program.
539
+
540
+ 13. Remote Network Interaction; Use with the GNU General Public License.
541
+
542
+ Notwithstanding any other provision of this License, if you modify the
543
+ Program, your modified version must prominently offer all users
544
+ interacting with it remotely through a computer network (if your version
545
+ supports such interaction) an opportunity to receive the Corresponding
546
+ Source of your version by providing access to the Corresponding Source
547
+ from a network server at no charge, through some standard or customary
548
+ means of facilitating copying of software. This Corresponding Source
549
+ shall include the Corresponding Source for any work covered by version 3
550
+ of the GNU General Public License that is incorporated pursuant to the
551
+ following paragraph.
552
+
553
+ Notwithstanding any other provision of this License, you have
554
+ permission to link or combine any covered work with a work licensed
555
+ under version 3 of the GNU General Public License into a single
556
+ combined work, and to convey the resulting work. The terms of this
557
+ License will continue to apply to the part which is the covered work,
558
+ but the work with which it is combined will remain governed by version
559
+ 3 of the GNU General Public License.
560
+
561
+ 14. Revised Versions of this License.
562
+
563
+ The Free Software Foundation may publish revised and/or new versions of
564
+ the GNU Affero General Public License from time to time. Such new versions
565
+ will be similar in spirit to the present version, but may differ in detail to
566
+ address new problems or concerns.
567
+
568
+ Each version is given a distinguishing version number. If the
569
+ Program specifies that a certain numbered version of the GNU Affero General
570
+ Public License "or any later version" applies to it, you have the
571
+ option of following the terms and conditions either of that numbered
572
+ version or of any later version published by the Free Software
573
+ Foundation. If the Program does not specify a version number of the
574
+ GNU Affero General Public License, you may choose any version ever published
575
+ by the Free Software Foundation.
576
+
577
+ If the Program specifies that a proxy can decide which future
578
+ versions of the GNU Affero General Public License can be used, that proxy's
579
+ public statement of acceptance of a version permanently authorizes you
580
+ to choose that version for the Program.
581
+
582
+ Later license versions may give you additional or different
583
+ permissions. However, no additional obligations are imposed on any
584
+ author or copyright holder as a result of your choosing to follow a
585
+ later version.
586
+
587
+ 15. Disclaimer of Warranty.
588
+
589
+ THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
590
+ APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
591
+ HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
592
+ OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
593
+ THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
594
+ PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
595
+ IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
596
+ ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
597
+
598
+ 16. Limitation of Liability.
599
+
600
+ IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
601
+ WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
602
+ THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
603
+ GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
604
+ USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
605
+ DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
606
+ PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
607
+ EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
608
+ SUCH DAMAGES.
609
+
610
+ 17. Interpretation of Sections 15 and 16.
611
+
612
+ If the disclaimer of warranty and limitation of liability provided
613
+ above cannot be given local legal effect according to their terms,
614
+ reviewing courts shall apply local law that most closely approximates
615
+ an absolute waiver of all civil liability in connection with the
616
+ Program, unless a warranty or assumption of liability accompanies a
617
+ copy of the Program in return for a fee.
618
+
619
+ END OF TERMS AND CONDITIONS
620
+
621
+ How to Apply These Terms to Your New Programs
622
+
623
+ If you develop a new program, and you want it to be of the greatest
624
+ possible use to the public, the best way to achieve this is to make it
625
+ free software which everyone can redistribute and change under these terms.
626
+
627
+ To do so, attach the following notices to the program. It is safest
628
+ to attach them to the start of each source file to most effectively
629
+ state the exclusion of warranty; and each file should have at least
630
+ the "copyright" line and a pointer to where the full notice is found.
631
+
632
+ <one line to give the program's name and a brief idea of what it does.>
633
+ Copyright (C) <year> <name of author>
634
+
635
+ This program is free software: you can redistribute it and/or modify
636
+ it under the terms of the GNU Affero General Public License as published by
637
+ the Free Software Foundation, either version 3 of the License, or
638
+ (at your option) any later version.
639
+
640
+ This program is distributed in the hope that it will be useful,
641
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
642
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
643
+ GNU Affero General Public License for more details.
644
+
645
+ You should have received a copy of the GNU Affero General Public License
646
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
647
+
648
+ Also add information on how to contact you by electronic and paper mail.
649
+
650
+ If your software can interact with users remotely through a computer
651
+ network, you should also make sure that it provides a way for users to
652
+ get its source. For example, if your program is a web application, its
653
+ interface could display a "Source" link that leads users to an archive
654
+ of the code. There are many ways you could offer source, and different
655
+ solutions will be better for different programs; see section 13 for the
656
+ specific requirements.
657
+
658
+ You should also get your employer (if you work as a programmer) or school,
659
+ if any, to sign a "copyright disclaimer" for the program, if necessary.
660
+ For more information on this, and how to apply and follow the GNU AGPL, see
661
+ <https://www.gnu.org/licenses/>.
weights/icon_detect/best.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5bac552a2d3ed748db7bf799f64b5633bcb94f827e41d6e147082a2a46ede22d
3
+ size 12222450
weights/icon_detect/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:84caa6c7e0607d7d6edddb46f2affd0d0d86ddf34e71fbcce7b4ba461bd97574
3
+ size 6075790
weights/icon_detect/model.yaml ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ backbone:
2
+ - - -1
3
+ - 1
4
+ - Conv
5
+ - - 64
6
+ - 3
7
+ - 2
8
+ - - -1
9
+ - 1
10
+ - Conv
11
+ - - 128
12
+ - 3
13
+ - 2
14
+ - - -1
15
+ - 3
16
+ - C2f
17
+ - - 128
18
+ - true
19
+ - - -1
20
+ - 1
21
+ - Conv
22
+ - - 256
23
+ - 3
24
+ - 2
25
+ - - -1
26
+ - 6
27
+ - C2f
28
+ - - 256
29
+ - true
30
+ - - -1
31
+ - 1
32
+ - Conv
33
+ - - 512
34
+ - 3
35
+ - 2
36
+ - - -1
37
+ - 6
38
+ - C2f
39
+ - - 512
40
+ - true
41
+ - - -1
42
+ - 1
43
+ - Conv
44
+ - - 1024
45
+ - 3
46
+ - 2
47
+ - - -1
48
+ - 3
49
+ - C2f
50
+ - - 1024
51
+ - true
52
+ - - -1
53
+ - 1
54
+ - SPPF
55
+ - - 1024
56
+ - 5
57
+ ch: 3
58
+ depth_multiple: 0.33
59
+ head:
60
+ - - -1
61
+ - 1
62
+ - nn.Upsample
63
+ - - None
64
+ - 2
65
+ - nearest
66
+ - - - -1
67
+ - 6
68
+ - 1
69
+ - Concat
70
+ - - 1
71
+ - - -1
72
+ - 3
73
+ - C2f
74
+ - - 512
75
+ - - -1
76
+ - 1
77
+ - nn.Upsample
78
+ - - None
79
+ - 2
80
+ - nearest
81
+ - - - -1
82
+ - 4
83
+ - 1
84
+ - Concat
85
+ - - 1
86
+ - - -1
87
+ - 3
88
+ - C2f
89
+ - - 256
90
+ - - -1
91
+ - 1
92
+ - Conv
93
+ - - 256
94
+ - 3
95
+ - 2
96
+ - - - -1
97
+ - 12
98
+ - 1
99
+ - Concat
100
+ - - 1
101
+ - - -1
102
+ - 3
103
+ - C2f
104
+ - - 512
105
+ - - -1
106
+ - 1
107
+ - Conv
108
+ - - 512
109
+ - 3
110
+ - 2
111
+ - - - -1
112
+ - 9
113
+ - 1
114
+ - Concat
115
+ - - 1
116
+ - - -1
117
+ - 3
118
+ - C2f
119
+ - - 1024
120
+ - - - 15
121
+ - 18
122
+ - 21
123
+ - 1
124
+ - Detect
125
+ - - nc
126
+ nc: 1
127
+ width_multiple: 0.25