yujiepan's picture
Update README.md
82c57db verified
metadata
library_name: transformers
tags: []

yujiepan/clip-vit-tiny-random-patch14-336

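This model is intended for debugging only. It is a tiny, randomly initialized CLIP model whose configuration is derived from openai/clip-vit-large-patch14-336, so its outputs are not meaningful.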

Usage

from transformers import CLIPProcessor, CLIPModel, CLIPConfig
from PIL import Image
import requests
import torch

# Load the tiny random CLIP checkpoint and score one image against one caption.
model_id = "yujiepan/clip-vit-tiny-random-patch14-336"
# Fall back to CPU so the example also runs on machines without a GPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = CLIPModel.from_pretrained(model_id).to(device)
processor = CLIPProcessor.from_pretrained(model_id)

url = "https://assets-c4akfrf5b4d3f4b7.z01.azurefd.net/assets/2024/04/BMDataViz_661fb89f3845e.png"
# Bound the download time and fail loudly on an HTTP error instead of handing
# an error page to PIL.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)
text = "A description of the image"
inputs = processor(text=[text], images=image, return_tensors="pt", padding=True).to(device)
with torch.no_grad():
    outputs = model(**inputs)
logits_per_image = outputs.logits_per_image  # shape: [num_images, num_texts]
logits_per_text = outputs.logits_per_text  # shape: [num_texts, num_images]
# Normalize over the text axis: one probability distribution per image.
probs = logits_per_image.softmax(dim=1)  # shape: [num_images, num_texts]
print(probs)

Code used to create this model

from transformers import CLIPProcessor, CLIPModel, CLIPConfig
from PIL import Image
import requests
import torch

# Build a tiny, randomly initialized CLIP model that keeps the architecture of
# the reference checkpoint, sanity-check one forward pass, then publish it.
model_name = "openai/clip-vit-large-patch14-336"
repo_id = "yujiepan/clip-vit-tiny-random-patch14-336"

# Shrink both towers with the same overrides; every other setting is
# inherited unchanged from the reference configuration.
tiny_tower = {
    "hidden_size": 8,
    "projection_dim": 8,
    "intermediate_size": 16,
    "num_hidden_layers": 2,
    "num_attention_heads": 2,
}
config = CLIPConfig.from_pretrained(model_name).to_dict()
config["projection_dim"] = 8
for tower in ("text_config", "vision_config"):
    config[tower].update(tiny_tower)
config = CLIPConfig.from_dict(config)

# Weights are random (no pretrained state is loaded). The model is stored in
# fp16, and the sanity check below runs on a CUDA device.
model = CLIPModel(config).half().cuda()
processor = CLIPProcessor.from_pretrained(model_name)

url = "https://assets-c4akfrf5b4d3f4b7.z01.azurefd.net/assets/2024/04/BMDataViz_661fb89f3845e.png"
# Bound the download time and fail loudly on an HTTP error instead of handing
# an error page to PIL.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)
text = "A description of the image"
inputs = processor(text=[text], images=image, return_tensors="pt", padding=True).to("cuda")
# Match the fp16 model explicitly rather than relying on the model to cast
# its inputs internally; token ids and the attention mask stay integer.
inputs["pixel_values"] = inputs["pixel_values"].to(model.dtype)
with torch.no_grad():
    outputs = model(**inputs)
logits_per_image = outputs.logits_per_image  # shape: [num_images, num_texts]
logits_per_text = outputs.logits_per_text  # shape: [num_texts, num_images]
# Normalize over the text axis: one probability distribution per image.
probs = logits_per_image.softmax(dim=1)  # shape: [num_images, num_texts]
print(probs)

# Publish the model together with the (unchanged) reference processor.
model.push_to_hub(repo_id)
processor.push_to_hub(repo_id)