This is an image captioning model trained by Zayn.


import torch
from PIL import Image
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer

# Single source of truth for the Hub repository id; the model weights, the
# feature extractor and the tokenizer are all loaded from this same id.
MODEL_ID = "Zayn/AICVTG_What_if_a_machine_could_create_captions_automatically"

model = VisionEncoderDecoderModel.from_pretrained(MODEL_ID)
feature_extractor = ViTFeatureExtractor.from_pretrained(MODEL_ID)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

# Settings forwarded to model.generate() by predict_step.
max_length = 20
num_beams = 8
gen_kwargs = dict(max_length=max_length, num_beams=num_beams)


def predict_step(image_paths):
    """Generate one caption for each image in ``image_paths``.

    Args:
        image_paths: Iterable of image locations. Each entry is passed to
            ``Image.open`` as-is.

    Returns:
        A list of caption strings with surrounding whitespace stripped, one
        per input image and in the same order. An empty input returns ``[]``
        without invoking the feature extractor or the model.
    """
    images = []
    for image_path in image_paths:
        # Close the underlying file as soon as the pixels are read;
        # convert() returns an independent RGB copy that outlives the
        # context manager (also when the image is already RGB).
        with Image.open(image_path) as source:
            images.append(source.convert(mode="RGB"))

    # Nothing to caption: return early rather than handing an empty batch
    # to the feature extractor and the model.
    if not images:
        return []

    pixel_values = feature_extractor(images=images, return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)

    output_ids = model.generate(pixel_values, **gen_kwargs)

    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return [pred.strip() for pred in preds]


# Example: caption a single image. The entry is passed to Image.open() inside
# predict_step; 'Image URL.jpg' looks like a placeholder to be replaced with
# a real image path (TODO confirm). The returned caption list is neither
# stored nor printed here.
predict_step(['Image URL.jpg'])
Downloads last month
13
Safetensors
Model size
264M params
Tensor type
F32
·
U8
·
Inference Providers NEW
This model is not currently available via any of the supported Inference Providers.

Space using Zayn/AICVTG_What_if_a_machine_could_create_captions_automatically 1