Divyasreepat committed
Commit 4ec3ea9
1 Parent(s): 5563737
Upload folder using huggingface_hub
Browse files:
- README.md +12 -0
- assets/tokenizer/merges.txt +0 -0
- assets/tokenizer/vocabulary.json +0 -0
- config.json +49 -0
- image_converter.json +33 -0
- metadata.json +6 -0
- model.weights.h5 +3 -0
- tokenizer.json +21 -0
README.md
ADDED
@@ -0,0 +1,12 @@
---
library_name: keras-hub
---
This is a [`CLIP` model](https://keras.io/api/keras_hub/models/clip) uploaded using the KerasHub library and can be used with JAX, TensorFlow, and PyTorch backends.
Model config:
* **name:** clip_backbone
* **trainable:** True
* **vision_encoder:** {'module': 'keras_hub.src.models.clip.clip_vision_encoder', 'class_name': 'CLIPVisionEncoder', 'config': {'name': 'clip_vision_encoder', 'trainable': True, 'patch_size': 14, 'hidden_dim': 1024, 'num_layers': 24, 'num_heads': 16, 'intermediate_dim': 4096, 'intermediate_activation': 'quick_gelu', 'intermediate_output_index': None, 'image_shape': [336, 336, 3]}, 'registered_name': 'keras_hub>CLIPVisionEncoder'}
* **text_encoder:** {'module': 'keras_hub.src.models.clip.clip_text_encoder', 'class_name': 'CLIPTextEncoder', 'config': {'name': 'clip_text_encoder', 'trainable': True, 'vocabulary_size': 49408, 'embedding_dim': 768, 'hidden_dim': 768, 'num_layers': 12, 'num_heads': 12, 'intermediate_dim': 3072, 'intermediate_activation': 'quick_gelu', 'intermediate_output_index': None, 'max_sequence_length': 77}, 'registered_name': 'keras_hub>CLIPTextEncoder'}
* **projection_dim:** 768

This model card has been generated automatically and should be completed by the model author. See [Model Cards documentation](https://huggingface.co/docs/hub/model-cards) for more information.
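Since the upload is a standard KerasHub preset (config.json, model.weights.h5, and tokenizer assets, as listed in the files below), a minimal loading sketch might look like the following. The preset id is a placeholder, not something stated in this commit; a local path to this folder should work equally well.

```python
import os

# Keras 3 picks its backend from this variable; "jax", "tensorflow",
# or "torch" should all work for this checkpoint.
os.environ["KERAS_BACKEND"] = "jax"

import keras_hub

# Placeholder preset id: point it at this repo on the Hub
# (hf://<namespace>/<repo-name>) or at a local copy of this folder
# containing config.json and model.weights.h5.
backbone = keras_hub.models.CLIPBackbone.from_preset("hf://<namespace>/<repo-name>")
backbone.summary()
```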
assets/tokenizer/merges.txt
ADDED
The diff for this file is too large to render. See raw diff.
assets/tokenizer/vocabulary.json
ADDED
The diff for this file is too large to render. See raw diff.
config.json
ADDED
@@ -0,0 +1,49 @@
{
    "module": "keras_hub.src.models.clip.clip_backbone",
    "class_name": "CLIPBackbone",
    "config": {
        "name": "clip_backbone",
        "trainable": true,
        "vision_encoder": {
            "module": "keras_hub.src.models.clip.clip_vision_encoder",
            "class_name": "CLIPVisionEncoder",
            "config": {
                "name": "clip_vision_encoder",
                "trainable": true,
                "patch_size": 14,
                "hidden_dim": 1024,
                "num_layers": 24,
                "num_heads": 16,
                "intermediate_dim": 4096,
                "intermediate_activation": "quick_gelu",
                "intermediate_output_index": null,
                "image_shape": [
                    336,
                    336,
                    3
                ]
            },
            "registered_name": "keras_hub>CLIPVisionEncoder"
        },
        "text_encoder": {
            "module": "keras_hub.src.models.clip.clip_text_encoder",
            "class_name": "CLIPTextEncoder",
            "config": {
                "name": "clip_text_encoder",
                "trainable": true,
                "vocabulary_size": 49408,
                "embedding_dim": 768,
                "hidden_dim": 768,
                "num_layers": 12,
                "num_heads": 12,
                "intermediate_dim": 3072,
                "intermediate_activation": "quick_gelu",
                "intermediate_output_index": null,
                "max_sequence_length": 77
            },
            "registered_name": "keras_hub>CLIPTextEncoder"
        },
        "projection_dim": 768
    },
    "registered_name": "keras_hub>CLIPBackbone"
}
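The hyperparameters above fully determine the architecture: a ViT-L/14-style image tower at 336×336 input and a 12-layer text tower over a 49,408-token vocabulary, both projected to a 768-dim shared space. As a hedged sketch, assuming the keras_hub constructors mirror these config keys (the module paths are taken from config.json itself), the backbone could be rebuilt like this:

```python
from keras_hub.src.models.clip.clip_backbone import CLIPBackbone
from keras_hub.src.models.clip.clip_text_encoder import CLIPTextEncoder
from keras_hub.src.models.clip.clip_vision_encoder import CLIPVisionEncoder

# Image tower: 24-layer ViT with 14x14 patches at 336x336 (values from config.json).
vision_encoder = CLIPVisionEncoder(
    patch_size=14,
    hidden_dim=1024,
    num_layers=24,
    num_heads=16,
    intermediate_dim=4096,
    intermediate_activation="quick_gelu",
    image_shape=(336, 336, 3),
)

# Text tower: 12-layer Transformer over the 49,408-token BPE vocabulary.
text_encoder = CLIPTextEncoder(
    vocabulary_size=49408,
    embedding_dim=768,
    hidden_dim=768,
    num_layers=12,
    num_heads=12,
    intermediate_dim=3072,
    intermediate_activation="quick_gelu",
    max_sequence_length=77,
)

# Both towers project into a shared 768-dim embedding space.
backbone = CLIPBackbone(
    vision_encoder=vision_encoder,
    text_encoder=text_encoder,
    projection_dim=768,
)
```

Note this builds the architecture with freshly initialized weights; `from_preset` is the path that also restores model.weights.h5.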
image_converter.json
ADDED
@@ -0,0 +1,33 @@
{
    "module": "keras_hub.src.models.clip.clip_image_converter",
    "class_name": "CLIPImageConverter",
    "config": {
        "name": "clip_image_converter",
        "trainable": true,
        "dtype": {
            "module": "keras",
            "class_name": "DTypePolicy",
            "config": {
                "name": "float32"
            },
            "registered_name": null
        },
        "image_size": [
            336,
            336
        ],
        "scale": [
            0.014598426619242919,
            0.015007768493717055,
            0.014220065717024086
        ],
        "offset": [
            -1.79226253374815,
            -1.7520971281645974,
            -1.4802197687835659
        ],
        "interpolation": "bicubic",
        "crop_to_aspect_ratio": true
    },
    "registered_name": "keras_hub>CLIPImageConverter"
}
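The `scale` and `offset` values are what you get when per-channel normalization `(x / 255 - mean) / std` is folded into a single affine transform `x * scale + offset`, i.e. `scale = 1 / (255 * std)` and `offset = -mean / std`. Assuming the commonly quoted OpenAI CLIP statistics (not stated anywhere in this commit), a quick sanity check reproduces the numbers above:

```python
# Check that the image_converter scale/offset encode CLIP-style normalization:
# (x / 255 - mean) / std  ==  x * scale + offset.
# The mean/std below are the usual OpenAI CLIP values, assumed here.
mean = [0.48145466, 0.4578275, 0.40821073]
std = [0.26862954, 0.26130258, 0.27577711]

scale = [1.0 / (255.0 * s) for s in std]      # ~[0.014598, 0.015008, 0.014220]
offset = [-m / s for m, s in zip(mean, std)]  # ~[-1.79226, -1.75210, -1.48022]

print(scale)
print(offset)
```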
metadata.json
ADDED
@@ -0,0 +1,6 @@
{
    "keras_version": "3.6.0",
    "keras_hub_version": "0.17.0.dev0",
    "parameter_count": 427944770,
    "date_saved": "2024-10-29@23:00:18"
}
model.weights.h5
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f5e1ed23e32e204f2019c5c9af6fe03a6d90b048232cc3b4f4430b8bb00ce7bb
size 1712969228
tokenizer.json
ADDED
@@ -0,0 +1,21 @@
{
    "module": "keras_hub.src.models.clip.clip_tokenizer",
    "class_name": "CLIPTokenizer",
    "config": {
        "name": "clip_tokenizer",
        "trainable": true,
        "dtype": {
            "module": "keras",
            "class_name": "DTypePolicy",
            "config": {
                "name": "int32"
            },
            "registered_name": null
        },
        "config_file": "tokenizer.json",
        "sequence_length": null,
        "add_prefix_space": false,
        "pad_with_end_token": true
    },
    "registered_name": "keras_hub>CLIPTokenizer"
}
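The tokenizer config points at the BPE assets added under `assets/tokenizer/`. A hedged sketch of building it directly from those files, assuming the `CLIPTokenizer` constructor accepts vocabulary/merges file paths the way keras_hub byte-pair tokenizers generally do:

```python
from keras_hub.src.models.clip.clip_tokenizer import CLIPTokenizer

# Build the tokenizer from the BPE assets shipped in this commit.
tokenizer = CLIPTokenizer(
    vocabulary="assets/tokenizer/vocabulary.json",
    merges="assets/tokenizer/merges.txt",
    pad_with_end_token=True,  # matches tokenizer.json above
)

token_ids = tokenizer("a photo of a cat")
print(token_ids)
```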