Upload 11 files
- README.md +46 -0
- added_tokens.json +4 -0
- config.json +28 -0
- merges.txt +0 -0
- metaclip_h14_fullcc2.5b.bin +3 -0
- model.safetensors +3 -0
- preprocessor_config.json +28 -0
- pytorch_model.bin +3 -0
- special_tokens_map.json +6 -0
- tokenizer_config.json +33 -0
- vocab.json +0 -0
README.md
ADDED
@@ -0,0 +1,46 @@
---
license: cc-by-nc-4.0
tags:
- vision
- metaclip
widget:
- src: https://huggingface.co/datasets/mishig/sample_images/resolve/main/cat-dog-music.png
  candidate_labels: playing music, playing sports
  example_title: Cat & Dog
---

# MetaCLIP model, huge-sized version, patch resolution 14

MetaCLIP model applied to 2.5 billion data points of CommonCrawl (CC). It was introduced in the paper [Demystifying CLIP Data](https://arxiv.org/abs/2309.16671) by Xu et al. and first released in [this repository](https://github.com/facebookresearch/MetaCLIP).

Disclaimer: The team releasing MetaCLIP did not write a model card for this model, so this model card has been written by the Hugging Face team.

## Model description

The [Demystifying CLIP Data](https://arxiv.org/abs/2309.16671) paper aims to reveal CLIP's approach to training data curation, since OpenAI never open-sourced the code for its data preparation pipeline.

<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/clip_overview.jpg" alt="drawing" width="600"/>

<small> CLIP high-level overview. Taken from the <a href="https://arxiv.org/abs/2103.00020">CLIP paper</a>. </small>

## Intended uses & limitations

You can use the raw model for linking images with text in a shared embedding space. This enables things like zero-shot image classification, text-based image retrieval, image-based text retrieval, etc.

### How to use

We refer to the [docs](https://huggingface.co/docs/transformers/main/en/model_doc/clip#usage); just replace the model names with the ones on the hub, as in the sketch below.
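For example, a minimal zero-shot image classification sketch following the CLIP usage documented there. The repository id `facebook/metaclip-h14-fullcc2.5b` and the example image URL are assumptions; substitute the actual names from the hub.

```python
from PIL import Image
import requests
import torch
from transformers import CLIPModel, CLIPProcessor

# Assumed hub id for this checkpoint; replace with the actual repository name.
model_id = "facebook/metaclip-h14-fullcc2.5b"

model = CLIPModel.from_pretrained(model_id)
processor = CLIPProcessor.from_pretrained(model_id)

# Example image from the COCO validation set (any PIL image works).
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

inputs = processor(
    text=["a photo of a cat", "a photo of a dog"],
    images=image,
    return_tensors="pt",
    padding=True,
)

with torch.no_grad():
    outputs = model(**inputs)

# logits_per_image holds image-text similarity scores; softmax turns them into label probabilities.
probs = outputs.logits_per_image.softmax(dim=-1)
print(probs)
```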
### BibTeX entry and citation info

```bibtex
@misc{xu2023demystifying,
      title={Demystifying CLIP Data},
      author={Hu Xu and Saining Xie and Xiaoqing Ellen Tan and Po-Yao Huang and Russell Howes and Vasu Sharma and Shang-Wen Li and Gargi Ghosh and Luke Zettlemoyer and Christoph Feichtenhofer},
      year={2023},
      eprint={2309.16671},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
```
added_tokens.json
ADDED
@@ -0,0 +1,4 @@
{
  "<|endoftext|>": 49407,
  "<|startoftext|>": 49406
}
config.json
ADDED
@@ -0,0 +1,28 @@
{
  "architectures": [
    "CLIPModel"
  ],
  "initializer_factor": 1.0,
  "logit_scale_init_value": 2.6592,
  "model_type": "clip",
  "projection_dim": 1024,
  "text_config": {
    "hidden_size": 1024,
    "intermediate_size": 4096,
    "model_type": "clip_text_model",
    "num_attention_heads": 16,
    "num_hidden_layers": 24,
    "projection_dim": 1024
  },
  "torch_dtype": "float32",
  "transformers_version": "4.34.0",
  "vision_config": {
    "hidden_size": 1280,
    "intermediate_size": 5120,
    "model_type": "clip_vision_model",
    "num_attention_heads": 16,
    "num_hidden_layers": 32,
    "patch_size": 14,
    "projection_dim": 1024
  }
}
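The configuration describes a ViT-Huge vision tower (hidden size 1280, 32 layers, 14x14 patches) and a 24-layer text tower projected into a shared 1024-dimensional embedding space. A minimal sketch of reading these values back with `CLIPConfig`; the repository id `facebook/metaclip-h14-fullcc2.5b` is an assumption.

```python
from transformers import CLIPConfig

# Assumed hub id; replace with the actual repository name if it differs.
config = CLIPConfig.from_pretrained("facebook/metaclip-h14-fullcc2.5b")

# Shared image-text embedding dimension (1024 in this checkpoint).
print(config.projection_dim)

# ViT-Huge vision tower: 1280 hidden size, 32 layers, patch size 14.
print(config.vision_config.hidden_size,
      config.vision_config.num_hidden_layers,
      config.vision_config.patch_size)

# Text tower: 1024 hidden size, 24 layers.
print(config.text_config.hidden_size,
      config.text_config.num_hidden_layers)
```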
merges.txt
ADDED
The diff for this file is too large to render.
See raw diff
metaclip_h14_fullcc2.5b.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b790ce09d92080471aaa8718293126169f9d07b8147b9677bb2f14ddf8d9fff4
size 3944704310
model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6173e1a29c7b449ce5ebe788936625c909b4d9059eb89bc4ec80bbdc00bc6571
size 3944549372
preprocessor_config.json
ADDED
@@ -0,0 +1,28 @@
{
  "crop_size": {
    "height": 224,
    "width": 224
  },
  "do_center_crop": true,
  "do_convert_rgb": true,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.48145466,
    0.4578275,
    0.40821073
  ],
  "image_processor_type": "CLIPImageProcessor",
  "image_std": [
    0.26862954,
    0.26130258,
    0.27577711
  ],
  "processor_class": "CLIPProcessor",
  "resample": 3,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "shortest_edge": 224
  }
}
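These are the standard CLIP preprocessing settings: resize the shortest edge to 224 with bicubic resampling (`"resample": 3`), center-crop to 224x224, rescale pixel values by 1/255, and normalize with the CLIP mean/std. A minimal sketch of applying them via `CLIPImageProcessor`, assuming the repository id `facebook/metaclip-h14-fullcc2.5b`.

```python
from PIL import Image
from transformers import CLIPImageProcessor

# Assumed hub id; this processor is also loaded implicitly by CLIPProcessor.from_pretrained.
image_processor = CLIPImageProcessor.from_pretrained("facebook/metaclip-h14-fullcc2.5b")

image = Image.open("example.jpg")  # any local RGB image

# Resizes, center-crops, rescales, and normalizes according to the config above.
pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
print(pixel_values.shape)  # torch.Size([1, 3, 224, 224])
```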
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1930613c95917df0af9d07d368a20c68a181bc9a8821ac145648f0f7fcc095e5
size 3944739266
special_tokens_map.json
ADDED
@@ -0,0 +1,6 @@
{
  "bos_token": "<|startoftext|>",
  "eos_token": "<|endoftext|>",
  "pad_token": "<|endoftext|>",
  "unk_token": "<|endoftext|>"
}
tokenizer_config.json
ADDED
@@ -0,0 +1,33 @@
{
  "add_prefix_space": false,
  "added_tokens_decoder": {
    "49406": {
      "content": "<|startoftext|>",
      "lstrip": false,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "49407": {
      "content": "<|endoftext|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    }
  },
  "additional_special_tokens": [],
  "bos_token": "<|startoftext|>",
  "clean_up_tokenization_spaces": true,
  "do_lower_case": true,
  "eos_token": "<|endoftext|>",
  "errors": "replace",
  "model_max_length": 77,
  "pad_token": "<|endoftext|>",
  "processor_class": "CLIPProcessor",
  "tokenizer_class": "CLIPTokenizer",
  "tokenizer_file": "/Users/georgebredis/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/tokenizer.json",
  "unk_token": "<|endoftext|>"
}
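This is the standard lower-cased CLIP BPE tokenizer with a 77-token context length; `<|startoftext|>` and `<|endoftext|>` are added around each text automatically. A minimal sketch, again assuming the repository id `facebook/metaclip-h14-fullcc2.5b`.

```python
from transformers import CLIPTokenizer

# Assumed hub id; replace with the actual repository name if it differs.
tokenizer = CLIPTokenizer.from_pretrained("facebook/metaclip-h14-fullcc2.5b")

# Text is lower-cased, wrapped in <|startoftext|> ... <|endoftext|>,
# and padded/truncated to at most model_max_length (77) tokens.
enc = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
print(enc.input_ids.shape)
print(tokenizer.model_max_length)  # 77
```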
vocab.json
ADDED
The diff for this file is too large to render.
See raw diff