elliesleightholm and Xenova (HF staff) committed
Commit ef343cf (parent: 8385fef)

Upload ONNX weights (+ quantizations) + Transformers.js support (#1)


- Upload ONNX weights (+ quantizations) (31a9a46f0afbbbe34a1668adcd72a03ce37d5b9b)
- Create config.json (a05d487c0285b3c35898c17db40f374058b42d81)
- Create preprocessor_config.json (8690aa35ac67dacfbc214cdff245602eaa83b5e2)
- Update tokenizer_config.json (cc93df47ec2c676dc83e0d3e92e3f56f57366066)
- Update README.md (4a45991a7a637ae0cf9ff467bcd6db7e166247d7)


Co-authored-by: Joshua <Xenova@users.noreply.huggingface.co>

README.md CHANGED
@@ -5,6 +5,7 @@ tags:
 - fashion
 - multimodal retrieval
 - siglip
+- transformers.js
 library_name: open_clip
 pipeline_tag: zero-shot-image-classification
 license: apache-2.0
@@ -25,6 +26,9 @@ The model was fine-tuned from ViT-B-16-SigLIP (webli).
 
 
 ## Usage
+
+### OpenCLIP
+
 The model can be seamlessly used with [OpenCLIP](https://github.com/mlfoundations/open_clip) by
 
 ```python
@@ -49,6 +53,55 @@ with torch.no_grad(), torch.cuda.amp.autocast():
 print("Label probs:", text_probs)
 ```
 
+### Transformers.js
+
+You can also run the model in JavaScript with the [Transformers.js](https://huggingface.co/docs/transformers.js) library.
+
+First, install it from [NPM](https://www.npmjs.com/package/@huggingface/transformers) using:
+
+```bash
+npm i @huggingface/transformers
+```
+
+Then, compute embeddings as follows:
+```js
+import { SiglipTextModel, SiglipVisionModel, AutoTokenizer, AutoProcessor, RawImage, softmax, dot } from '@huggingface/transformers';
+
+const model_id = 'Marqo/marqo-fashionSigLIP';
+
+// Load tokenizer and text model
+const tokenizer = await AutoTokenizer.from_pretrained(model_id);
+const text_model = await SiglipTextModel.from_pretrained(model_id);
+
+// Load processor and vision model
+const processor = await AutoProcessor.from_pretrained(model_id);
+const vision_model = await SiglipVisionModel.from_pretrained(model_id);
+
+// Run tokenization
+const texts = ['a hat', 'a t-shirt', 'shoes'];
+const text_inputs = tokenizer(texts, { padding: 'max_length', truncation: true });
+
+// Compute text embeddings
+const { text_embeds } = await text_model(text_inputs);
+
+// Read image and run processor
+const image = await RawImage.read('https://raw.githubusercontent.com/marqo-ai/marqo-FashionCLIP/main/docs/fashion-hippo.png');
+const image_inputs = await processor(image);
+
+// Compute vision embeddings
+const { image_embeds } = await vision_model(image_inputs);
+
+// Compute similarity scores
+const normalized_text_embeds = text_embeds.normalize().tolist();
+const normalized_image_embeds = image_embeds.normalize().tolist()[0];
+
+const text_probs = softmax(normalized_text_embeds.map((text_embed) =>
+    100.0 * dot(normalized_image_embeds, text_embed)
+));
+console.log(text_probs);
+// [0.9860219105287394, 0.00777916527489097, 0.006198924196369721]
+```
+
 ## Benchmark Results
 Average evaluation results on 6 public multimodal fashion datasets ([Atlas](https://huggingface.co/datasets/Marqo/atlas), [DeepFashion (In-shop)](https://huggingface.co/datasets/Marqo/deepfashion-inshop), [DeepFashion (Multimodal)](https://huggingface.co/datasets/Marqo/deepfashion-multimodal), [Fashion200k](https://huggingface.co/datasets/Marqo/fashion200k), [KAGL](https://huggingface.co/datasets/Marqo/KAGL), and [Polyvore](https://huggingface.co/datasets/Marqo/polyvore)) are reported below:
 
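
Note: the hunk above only shows the edges of the README's open_clip snippet (its context lines end at `with torch.no_grad(), torch.cuda.amp.autocast():` and `print("Label probs:", text_probs)`). As a minimal sketch of typical open_clip usage consistent with that context, not necessarily the exact snippet in the README, the image path and candidate labels below are illustrative assumptions:

```python
import torch
from PIL import Image
import open_clip

# Load the checkpoint from the Hugging Face Hub via open_clip (library_name: open_clip)
model, _, preprocess = open_clip.create_model_and_transforms('hf-hub:Marqo/marqo-fashionSigLIP')
tokenizer = open_clip.get_tokenizer('hf-hub:Marqo/marqo-fashionSigLIP')

image = preprocess(Image.open('fashion-hippo.png')).unsqueeze(0)  # assumed local image path
text = tokenizer(['a hat', 'a t-shirt', 'shoes'])                 # assumed candidate labels

with torch.no_grad(), torch.cuda.amp.autocast():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)
    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)

    text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)

print("Label probs:", text_probs)
```
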
config.json ADDED
@@ -0,0 +1,3 @@
+{
+  "model_type": "siglip"
+}
onnx/text_model.onnx ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c1d501b23bddf27ba828c037b0780e44fdf47ca4c0b925ef190ab5bcf7aaf6e6
+size 441361402
onnx/text_model_bnb4.onnx ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4be58194517cdafe1695c2675259ffdba8871b77f8c5828cd45598177453f5d5
+size 173734396
onnx/text_model_fp16.onnx ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0f5fcccd805e8910663dcbd6821c3cbe040bbe508c656964de08736272228806
+size 220817780
onnx/text_model_int8.onnx ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f5a7995e029c6ee9346fa46857661c4a171d28b164fa7703b85a527d73adf170
+size 111125229
onnx/text_model_q4.onnx ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1f62fd2b046d82ec1ed91881fe9f4f0b34d4e11bae539a33092712031ffee129
+size 178600156
onnx/text_model_q4f16.onnx ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e48e36d0acdc1579aac8edb3c593a3f2ef9d55f7646b9e3acd7baf8f00d7d0ce
+size 108904023
onnx/text_model_quantized.onnx ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f5a7995e029c6ee9346fa46857661c4a171d28b164fa7703b85a527d73adf170
+size 111125229
onnx/text_model_uint8.onnx ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e45a29f61825b0fdc4ab2648c84791d6af1b63fe86ce7bd7c7fee43fc3b1c4d
+size 111125261
onnx/vision_model.onnx ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a7e773846b27a699c45ba7e3978514b7fca420662d7e69e3b9226982f09f4a13
+size 371715502
onnx/vision_model_bnb4.onnx ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a7724073a1af61260bc4d250492ff1d2bfbf2e4d4e171e90c5044186e2198948
+size 55430656
onnx/vision_model_fp16.onnx ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a6d3e644416e543c62344c06f7dc8be5687633901a2931faccbe28688e30737e
+size 185947013
onnx/vision_model_int8.onnx ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dae6e934667af50590c3d0ca69c7b14edc71f4df1b4f670e5ba6bc623495b691
+size 93973410
onnx/vision_model_q4.onnx ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a864016eff2d8829ec6088b1593c3c9e75a70e0413e0f02d4a4bbfddc3ef89d3
+size 61181030
onnx/vision_model_q4f16.onnx ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:988c55937d35228f860bcc90742940ec3efe8d00054518249f6c82b66e1b4a7c
+size 53686874
onnx/vision_model_quantized.onnx ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f1009cfce0eedd409f601e8351eedab72e8529641105eee6517821a9a634a2f4
+size 93973443
onnx/vision_model_uint8.onnx ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f1009cfce0eedd409f601e8351eedab72e8529641105eee6517821a9a634a2f4
+size 93973443
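
The files above are Git LFS pointers to the exported ONNX text/vision encoders and their quantized variants. As a minimal sketch, assuming `huggingface_hub` and `onnxruntime` are installed and without asserting the exact input names of these exports, one way to fetch a variant and inspect its expected inputs is:

```python
from huggingface_hub import hf_hub_download
import onnxruntime as ort

# Download one of the quantized text encoders from the Hub (cached locally)
model_path = hf_hub_download("Marqo/marqo-fashionSigLIP", "onnx/text_model_quantized.onnx")

session = ort.InferenceSession(model_path)
# Inspect the input/output signatures instead of hard-coding them
print([(i.name, i.shape) for i in session.get_inputs()])
print([(o.name, o.shape) for o in session.get_outputs()])
```
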
preprocessor_config.json ADDED
@@ -0,0 +1,23 @@
+{
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_processor_type": "SiglipImageProcessor",
+  "image_mean": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "processor_class": "SiglipProcessor",
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "height": 224,
+    "width": 224
+  },
+  "image_std": [
+    0.5,
+    0.5,
+    0.5
+  ]
+}
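
These values describe the standard SigLIP preprocessing: bicubic resize to 224x224, rescale by 1/255, then normalize with per-channel mean and std of 0.5 (mapping pixels to roughly [-1, 1]). A minimal NumPy/Pillow sketch of the equivalent transform, for illustration only:

```python
import numpy as np
from PIL import Image

# Values taken from preprocessor_config.json
SIZE = (224, 224)                       # "size": {"height": 224, "width": 224}
RESCALE_FACTOR = 0.00392156862745098    # i.e. 1/255
IMAGE_MEAN = np.array([0.5, 0.5, 0.5], dtype=np.float32)
IMAGE_STD = np.array([0.5, 0.5, 0.5], dtype=np.float32)

def preprocess(path: str) -> np.ndarray:
    """Resize, rescale to [0, 1], then normalize; returns a (1, 3, 224, 224) array."""
    image = Image.open(path).convert("RGB").resize(SIZE, resample=3)  # 3 = bicubic
    pixels = np.asarray(image, dtype=np.float32) * RESCALE_FACTOR
    pixels = (pixels - IMAGE_MEAN) / IMAGE_STD
    return pixels.transpose(2, 0, 1)[None]  # channels-first, with batch dimension
```
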
tokenizer_config.json CHANGED
@@ -931,7 +931,7 @@
   "eos_token": "</s>",
   "extra_ids": 100,
   "legacy": false,
-  "model_max_length": 1000000000000000019884624838656,
+  "model_max_length": 64,
   "pad_token": "</s>",
   "sp_model_kwargs": {},
   "tokenizer_class": "T5Tokenizer",