h-siyuan committed on
Commit
48a56a5
1 Parent(s): 2392f1e

remove large local files to save storage space

showui-2b/.gitattributes DELETED
@@ -1,35 +0,0 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
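These deleted `.gitattributes` rules are the standard Hugging Face defaults that route large binaries through Git LFS. A minimal sketch of how such glob patterns classify files, using Python's `fnmatch` (which only approximates git's pattern matching; the file names below are illustrative):

```python
from fnmatch import fnmatch

# Subset of the deleted LFS patterns (illustrative; git matching differs slightly from fnmatch).
lfs_patterns = ["*.bin", "*.safetensors", "*.zip", "*tfevents*"]

def stored_in_lfs(filename: str) -> bool:
    """Return True if the file name matches any LFS-tracked pattern."""
    return any(fnmatch(filename, pattern) for pattern in lfs_patterns)

print(stored_in_lfs("pytorch_model.bin"))  # True  -> stored as an LFS pointer
print(stored_in_lfs("config.json"))        # False -> stored as a regular git blob
```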
showui-2b/README.md DELETED
@@ -1,188 +0,0 @@
- ---
- tags:
- - GUI agents
- - vision-language-action model
- - computer use
- ---
- [Github](https://github.com/showlab/ShowUI/tree/main) | [Quick Start](https://huggingface.co/showlab/ShowUI-2B)
- <img src="examples/showui.png" alt="ShowUI" width="640">
-
- ShowUI is a lightweight (2B) vision-language-action model designed for GUI agents.
- [![demo](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Models-blue)](https://huggingface.co/showlab/ShowUI-2B)&nbsp;
- ## ⭐ Quick Start
-
- 1. Load model
- ```python
- import ast
- import requests
- import torch
- from io import BytesIO
- from PIL import Image, ImageDraw
- from IPython.display import display
- from qwen_vl_utils import process_vision_info
- from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
-
- def draw_point(image_input, point=None, radius=5):
-     # Accept either a path/URL or an already-loaded PIL image.
-     if isinstance(image_input, str):
-         image = Image.open(BytesIO(requests.get(image_input).content)) if image_input.startswith('http') else Image.open(image_input)
-     else:
-         image = image_input
-
-     if point:
-         # point is a relative [x, y] in [0, 1]; scale it to pixel coordinates before drawing.
-         x, y = point[0] * image.width, point[1] * image.height
-         ImageDraw.Draw(image).ellipse((x - radius, y - radius, x + radius, y + radius), fill='red')
-     display(image)
-     return
-
- model = Qwen2VLForConditionalGeneration.from_pretrained(
-     "showlab/ShowUI-2B",
-     torch_dtype=torch.bfloat16,
-     device_map="auto"
- )
-
- min_pixels = 256*28*28
- max_pixels = 1344*28*28
-
- processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)
- ```
-
- 2. **UI Grounding**
- ```python
- img_url = 'examples/web_dbd7514b-9ca3-40cd-b09a-990f7b955da1.png'
- query = "Nahant"
-
-
- _SYSTEM = "Based on the screenshot of the page, I give a text description and you give its corresponding location. The coordinate represents a clickable location [x, y] for an element, which is a relative coordinate on the screenshot, scaled from 0 to 1."
- messages = [
-     {
-         "role": "user",
-         "content": [
-             {"type": "text", "text": _SYSTEM},
-             {"type": "image", "image": img_url, "min_pixels": min_pixels, "max_pixels": max_pixels},
-             {"type": "text", "text": query}
-         ],
-     }
- ]
-
- text = processor.apply_chat_template(
-     messages, tokenize=False, add_generation_prompt=True,
- )
- image_inputs, video_inputs = process_vision_info(messages)
- inputs = processor(
-     text=[text],
-     images=image_inputs,
-     videos=video_inputs,
-     padding=True,
-     return_tensors="pt",
- )
- inputs = inputs.to("cuda")
-
- generated_ids = model.generate(**inputs, max_new_tokens=128)
- generated_ids_trimmed = [
-     out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
- ]
- output_text = processor.batch_decode(
-     generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
- )[0]
-
- click_xy = ast.literal_eval(output_text)
- # [0.73, 0.21]
-
- draw_point(img_url, click_xy, 10)
- ```
-
- This visualizes the grounding result; the red point marks the predicted relative coordinate [x, y].
-
- ![download](https://github.com/user-attachments/assets/8fe2783d-05b6-44e6-a26c-8718d02b56cb)
-
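Since the model returns coordinates scaled to 0–1, downstream tooling usually has to rescale them to pixels; a minimal sketch of that conversion (editor's addition, not part of the deleted README; the path and point value are illustrative):

```python
from PIL import Image

def to_pixels(image_path: str, rel_point):
    """Convert a relative [x, y] in [0, 1] to absolute pixel coordinates."""
    with Image.open(image_path) as img:
        return rel_point[0] * img.width, rel_point[1] * img.height

# Example with the grounding output shown above (values illustrative):
# x_px, y_px = to_pixels('examples/web_dbd7514b-9ca3-40cd-b09a-990f7b955da1.png', [0.73, 0.21])
```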
- 3. **UI Navigation**
- - Set up system prompt.
- ```python
- _NAV_SYSTEM = """You are an assistant trained to navigate the {_APP} screen.
- Given a task instruction, a screen observation, and an action history sequence,
- output the next action and wait for the next observation.
- Here is the action space:
- {_ACTION_SPACE}
- """
-
- _NAV_FORMAT = """
- Format the action as a dictionary with the following keys:
- {'action': 'ACTION_TYPE', 'value': 'element', 'position': [x,y]}
-
- If value or position is not applicable, set it as `None`.
- Position might be [[x1,y1], [x2,y2]] if the action requires a start and end position.
- Position represents the relative coordinates on the screenshot and should be scaled to a range of 0-1.
- """
-
- action_map = {
-     'web': """
- 1. `CLICK`: Click on an element, value is not applicable and the position [x,y] is required.
- 2. `INPUT`: Type a string into an element, value is a string to type and the position [x,y] is required.
- 3. `SELECT`: Select a value for an element, value is not applicable and the position [x,y] is required.
- 4. `HOVER`: Hover on an element, value is not applicable and the position [x,y] is required.
- 5. `ANSWER`: Answer the question, value is the answer and the position is not applicable.
- 6. `ENTER`: Enter operation, value and position are not applicable.
- 7. `SCROLL`: Scroll the screen, value is the direction to scroll and the position is not applicable.
- 8. `SELECT_TEXT`: Select some text content, value is not applicable and position [[x1,y1], [x2,y2]] is the start and end position of the select operation.
- 9. `COPY`: Copy the text, value is the text to copy and the position is not applicable.
- """,
-
-     'phone': """
- 1. `INPUT`: Type a string into an element, value is not applicable and the position [x,y] is required.
- 2. `SWIPE`: Swipe the screen, value is not applicable and the position [[x1,y1], [x2,y2]] is the start and end position of the swipe operation.
- 3. `TAP`: Tap on an element, value is not applicable and the position [x,y] is required.
- 4. `ANSWER`: Answer the question, value is the status (e.g., 'task complete') and the position is not applicable.
- 5. `ENTER`: Enter operation, value and position are not applicable.
- """
- }
-
- _NAV_USER = """{system}
- Task: {task}
- Observation: <|image_1|>
- Action History: {action_history}
- What is the next action?
- """
- ```
-
- ```python
- img_url = 'examples/chrome.png'
- split = 'web'
- system_prompt = _NAV_SYSTEM.format(_APP=split, _ACTION_SPACE=action_map[split])
- query = "Search the weather for New York City."
-
- messages = [
-     {
-         "role": "user",
-         "content": [
-             {"type": "text", "text": system_prompt},
-             {"type": "image", "image": img_url, "min_pixels": min_pixels, "max_pixels": max_pixels},
-             {"type": "text", "text": query}
-         ],
-     }
- ]
-
- text = processor.apply_chat_template(
-     messages, tokenize=False, add_generation_prompt=True,
- )
- image_inputs, video_inputs = process_vision_info(messages)
- inputs = processor(
-     text=[text],
-     images=image_inputs,
-     videos=video_inputs,
-     padding=True,
-     return_tensors="pt",
- )
- inputs = inputs.to("cuda")
-
- generated_ids = model.generate(**inputs, max_new_tokens=128)
- generated_ids_trimmed = [
-     out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
- ]
- output_text = processor.batch_decode(
-     generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
- )[0]
-
- print(output_text)
- # {'action': 'CLICK', 'value': None, 'position': [0.49, 0.42]},
- # {'action': 'INPUT', 'value': 'weather for New York city', 'position': [0.49, 0.42]},
- # {'action': 'ENTER', 'value': None, 'position': None}
- ```
-
- ![download](https://github.com/user-attachments/assets/624097ea-06f2-4c8f-83f6-b6b9ee439c0c)
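The navigation output above is printed as Python-literal action dictionaries. A minimal sketch (editor's addition, not part of the deleted README) of turning one such line into a structured action, assuming the model emits exactly the dictionary format requested by `_NAV_FORMAT`:

```python
import ast

def parse_action(output_text: str) -> dict:
    """Parse a single action dictionary such as
    "{'action': 'CLICK', 'value': None, 'position': [0.49, 0.42]}"."""
    action = ast.literal_eval(output_text.strip().rstrip(','))
    assert {'action', 'value', 'position'} <= action.keys()
    return action

example = "{'action': 'INPUT', 'value': 'weather for New York city', 'position': [0.49, 0.42]}"
print(parse_action(example)['action'])  # INPUT
```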
showui-2b/added_tokens.json DELETED
@@ -1,16 +0,0 @@
- {
-   "<|box_end|>": 151649,
-   "<|box_start|>": 151648,
-   "<|endoftext|>": 151643,
-   "<|im_end|>": 151645,
-   "<|im_start|>": 151644,
-   "<|image_pad|>": 151655,
-   "<|object_ref_end|>": 151647,
-   "<|object_ref_start|>": 151646,
-   "<|quad_end|>": 151651,
-   "<|quad_start|>": 151650,
-   "<|video_pad|>": 151656,
-   "<|vision_end|>": 151653,
-   "<|vision_pad|>": 151654,
-   "<|vision_start|>": 151652
- }
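These are the Qwen2-VL special-token ids registered by the deleted tokenizer files. A quick way to confirm the mapping against a still-hosted copy of the model (a sketch assuming `showlab/ShowUI-2B` remains available on the Hub):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("showlab/ShowUI-2B")

# Each added token maps to a fixed id, e.g. the image placeholder used by the processor.
print(tokenizer.convert_tokens_to_ids("<|image_pad|>"))  # 151655
print(tokenizer.convert_tokens_to_ids("<|im_end|>"))     # 151645 (also the eos token)
```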
showui-2b/config.json DELETED
@@ -1,48 +0,0 @@
- {
-   "_name_or_path": "Qwen/Qwen2-VL-2B-Instruct",
-   "architectures": [
-     "Qwen2VLForConditionalGeneration"
-   ],
-   "attention_dropout": 0.0,
-   "bos_token_id": 151643,
-   "eos_token_id": 151645,
-   "hidden_act": "silu",
-   "hidden_size": 1536,
-   "image_token_id": 151655,
-   "initializer_range": 0.02,
-   "intermediate_size": 8960,
-   "max_position_embeddings": 32768,
-   "max_window_layers": 28,
-   "model_type": "qwen2_vl",
-   "num_attention_heads": 12,
-   "num_hidden_layers": 28,
-   "num_key_value_heads": 2,
-   "rms_norm_eps": 1e-06,
-   "rope_scaling": {
-     "mrope_section": [
-       16,
-       24,
-       24
-     ],
-     "type": "mrope"
-   },
-   "rope_theta": 1000000.0,
-   "sliding_window": 32768,
-   "tie_word_embeddings": true,
-   "tokenizer_model_max_length": 4096,
-   "torch_dtype": "bfloat16",
-   "transformers_version": "4.45.0.dev0",
-   "use_cache": false,
-   "use_sliding_window": false,
-   "video_token_id": 151656,
-   "vision_config": {
-     "hidden_size": 1536,
-     "in_chans": 3,
-     "model_type": "qwen2_vl",
-     "spatial_patch_size": 14
-   },
-   "vision_end_token_id": 151653,
-   "vision_start_token_id": 151652,
-   "vision_token_id": 151654,
-   "vocab_size": 151936
- }
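The deleted config keeps the Qwen2-VL-2B architecture unchanged (28 layers, hidden size 1536, M-RoPE position scaling). A sketch for inspecting the same fields programmatically, assuming the model is still reachable on the Hub:

```python
from transformers import AutoConfig

config = AutoConfig.from_pretrained("showlab/ShowUI-2B")

print(config.model_type)          # qwen2_vl
print(config.num_hidden_layers)   # 28
print(config.hidden_size)         # 1536
print(config.rope_scaling)        # mrope with sections [16, 24, 24]
```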
showui-2b/examples/0730d43001da36204b8cb9495b61308.png DELETED
Binary file (67.6 kB)
 
showui-2b/examples/chrome.png DELETED
Binary file (67.6 kB)
 
showui-2b/examples/showui.png DELETED
Binary file (54 kB)
 
showui-2b/generation_config.json DELETED
@@ -1,14 +0,0 @@
- {
-   "_attn_implementation": "eager",
-   "bos_token_id": 151643,
-   "do_sample": true,
-   "eos_token_id": [
-     151645,
-     151643
-   ],
-   "pad_token_id": 151643,
-   "temperature": 0.01,
-   "top_k": 1,
-   "top_p": 0.001,
-   "transformers_version": "4.45.0.dev0"
- }
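These defaults make sampling effectively greedy (temperature 0.01, top_k 1, top_p 0.001). A sketch of reproducing them explicitly at generation time if the file is no longer shipped alongside the weights (values taken from the deleted file):

```python
from transformers import GenerationConfig

# Mirrors the deleted generation_config.json: nominally sampling, but nearly deterministic.
gen_config = GenerationConfig(
    do_sample=True,
    temperature=0.01,
    top_k=1,
    top_p=0.001,
    bos_token_id=151643,
    eos_token_id=[151645, 151643],
    pad_token_id=151643,
)

# outputs = model.generate(**inputs, generation_config=gen_config, max_new_tokens=128)
```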
showui-2b/merges.txt DELETED
The diff for this file is too large to render. See raw diff
 
showui-2b/preprocessor_config.json DELETED
@@ -1,29 +0,0 @@
- {
-   "do_convert_rgb": true,
-   "do_normalize": true,
-   "do_rescale": true,
-   "do_resize": true,
-   "image_mean": [
-     0.48145466,
-     0.4578275,
-     0.40821073
-   ],
-   "image_processor_type": "Qwen2VLImageProcessor",
-   "image_std": [
-     0.26862954,
-     0.26130258,
-     0.27577711
-   ],
-   "max_pixels": 12845056,
-   "merge_size": 2,
-   "min_pixels": 3136,
-   "patch_size": 14,
-   "processor_class": "Qwen2VLProcessor",
-   "resample": 3,
-   "rescale_factor": 0.00392156862745098,
-   "size": {
-     "max_pixels": 12845056,
-     "min_pixels": 3136
-   },
-   "temporal_patch_size": 2
- }
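With `patch_size` 14 and `merge_size` 2, every 28×28 pixel block of the resized image corresponds to roughly one visual token, so the pixel bounds translate directly into a token budget. A small worked example (editor's arithmetic, using the bounds above and the README's override):

```python
PATCH = 14
MERGE = 2
BLOCK = PATCH * MERGE  # 28x28 pixels per visual token after patch merging

def token_budget(pixels: int) -> int:
    """Approximate number of visual tokens for an image resized to `pixels` total pixels."""
    return pixels // (BLOCK * BLOCK)

print(token_budget(3136))             # 4     -> default min_pixels
print(token_budget(12845056))         # 16384 -> default max_pixels
print(token_budget(1344 * 28 * 28))   # 1344  -> max_pixels used in the ShowUI README
```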
showui-2b/pytorch_model.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:68080df785764e98976eb9cc93a07c6c69cf8a6933738496e02aef55b53d2aa3
- size 4418202778
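What is deleted here is only the Git LFS pointer; the ~4.4 GB weight file it referenced lived in LFS storage. If a local copy still exists, it can be checked against the pointer's oid with a sketch like this (the local path is illustrative):

```python
import hashlib

EXPECTED = "68080df785764e98976eb9cc93a07c6c69cf8a6933738496e02aef55b53d2aa3"

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    """Stream the file so a multi-GB checkpoint does not need to fit in memory."""
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

# print(sha256_of("showui-2b/pytorch_model.bin") == EXPECTED)
```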
showui-2b/special_tokens_map.json DELETED
@@ -1,24 +0,0 @@
- {
-   "additional_special_tokens": [
-     "<|im_start|>",
-     "<|im_end|>",
-     "<|object_ref_start|>",
-     "<|object_ref_end|>",
-     "<|box_start|>",
-     "<|box_end|>",
-     "<|quad_start|>",
-     "<|quad_end|>",
-     "<|vision_start|>",
-     "<|vision_end|>",
-     "<|vision_pad|>",
-     "<|image_pad|>",
-     "<|video_pad|>"
-   ],
-   "eos_token": {
-     "content": "<|im_end|>",
-     "lstrip": false,
-     "normalized": false,
-     "rstrip": false,
-     "single_word": false
-   }
- }
showui-2b/tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
 
showui-2b/tokenizer_config.json DELETED
@@ -1,144 +0,0 @@
- {
-   "add_prefix_space": false,
-   "added_tokens_decoder": {
-     "151643": {
-       "content": "<|endoftext|>",
-       "lstrip": false,
-       "normalized": false,
-       "rstrip": false,
-       "single_word": false,
-       "special": true
-     },
-     "151644": {
-       "content": "<|im_start|>",
-       "lstrip": false,
-       "normalized": false,
-       "rstrip": false,
-       "single_word": false,
-       "special": true
-     },
-     "151645": {
-       "content": "<|im_end|>",
-       "lstrip": false,
-       "normalized": false,
-       "rstrip": false,
-       "single_word": false,
-       "special": true
-     },
-     "151646": {
-       "content": "<|object_ref_start|>",
-       "lstrip": false,
-       "normalized": false,
-       "rstrip": false,
-       "single_word": false,
-       "special": true
-     },
-     "151647": {
-       "content": "<|object_ref_end|>",
-       "lstrip": false,
-       "normalized": false,
-       "rstrip": false,
-       "single_word": false,
-       "special": true
-     },
-     "151648": {
-       "content": "<|box_start|>",
-       "lstrip": false,
-       "normalized": false,
-       "rstrip": false,
-       "single_word": false,
-       "special": true
-     },
-     "151649": {
-       "content": "<|box_end|>",
-       "lstrip": false,
-       "normalized": false,
-       "rstrip": false,
-       "single_word": false,
-       "special": true
-     },
-     "151650": {
-       "content": "<|quad_start|>",
-       "lstrip": false,
-       "normalized": false,
-       "rstrip": false,
-       "single_word": false,
-       "special": true
-     },
-     "151651": {
-       "content": "<|quad_end|>",
-       "lstrip": false,
-       "normalized": false,
-       "rstrip": false,
-       "single_word": false,
-       "special": true
-     },
-     "151652": {
-       "content": "<|vision_start|>",
-       "lstrip": false,
-       "normalized": false,
-       "rstrip": false,
-       "single_word": false,
-       "special": true
-     },
-     "151653": {
-       "content": "<|vision_end|>",
-       "lstrip": false,
-       "normalized": false,
-       "rstrip": false,
-       "single_word": false,
-       "special": true
-     },
-     "151654": {
-       "content": "<|vision_pad|>",
-       "lstrip": false,
-       "normalized": false,
-       "rstrip": false,
-       "single_word": false,
-       "special": true
-     },
-     "151655": {
-       "content": "<|image_pad|>",
-       "lstrip": false,
-       "normalized": false,
-       "rstrip": false,
-       "single_word": false,
-       "special": true
-     },
-     "151656": {
-       "content": "<|video_pad|>",
-       "lstrip": false,
-       "normalized": false,
-       "rstrip": false,
-       "single_word": false,
-       "special": true
-     }
-   },
-   "additional_special_tokens": [
-     "<|im_start|>",
-     "<|im_end|>",
-     "<|object_ref_start|>",
-     "<|object_ref_end|>",
-     "<|box_start|>",
-     "<|box_end|>",
-     "<|quad_start|>",
-     "<|quad_end|>",
-     "<|vision_start|>",
-     "<|vision_end|>",
-     "<|vision_pad|>",
-     "<|image_pad|>",
-     "<|video_pad|>"
-   ],
-   "bos_token": null,
-   "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
-   "clean_up_tokenization_spaces": false,
-   "eos_token": "<|im_end|>",
-   "errors": "replace",
-   "model_max_length": 4096,
-   "pad_token": null,
-   "padding_side": "right",
-   "processor_class": "Qwen2VLProcessor",
-   "split_special_tokens": false,
-   "tokenizer_class": "Qwen2Tokenizer",
-   "unk_token": null
- }
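The `chat_template` above is what `apply_chat_template(...)` expands in the README examples: each image entry becomes `<|vision_start|><|image_pad|><|vision_end|>` and each turn is wrapped in `<|im_start|>`/`<|im_end|>`. A minimal sketch of rendering it directly from the tokenizer (assuming the Hub copy is still available; the message contents are illustrative):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("showlab/ShowUI-2B")

messages = [
    {"role": "user", "content": [
        {"type": "image", "image": "examples/chrome.png"},
        {"type": "text", "text": "Search the weather for New York City."},
    ]}
]

prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)  # <|im_start|>user ... <|vision_start|><|image_pad|><|vision_end|> ... <|im_start|>assistant
```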
showui-2b/vocab.json DELETED
The diff for this file is too large to render. See raw diff