---
metrics:
- bleu
- rouge
tags:
- image-to-text
- image-captioning
- vision-transformer
- ViT-B/16
language:
- id
- en
---

# Sample code

This model pairs a ViT-B/16 encoder with a GPT-2 decoder for image captioning. The snippet below loads the model and generates a caption for a single image; `load_image` and `get_caption` are minimal helper implementations so the example runs end to end.

```python
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, GPT2Tokenizer
import torch
from PIL import Image
from IPython.display import display  # assumes a notebook environment (e.g. Colab)

model = VisionEncoderDecoderModel.from_pretrained("evlinzxxx/best_model_ViTB16_GPT2")
image_processor = ViTImageProcessor.from_pretrained("evlinzxxx/best_model_ViTB16_GPT2")
tokenizer = GPT2Tokenizer.from_pretrained("evlinzxxx/best_model_ViTB16_GPT2")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def load_image(path):
    # open the image file and convert it to RGB
    return Image.open(path).convert("RGB")

def get_caption(model, image_processor, tokenizer, path):
    # preprocess the image into the pixel values expected by the ViT encoder
    pixel_values = image_processor(load_image(path), return_tensors="pt").pixel_values.to(device)
    # generate token ids with the GPT-2 decoder and decode them to text
    generated_ids = model.generate(pixel_values)
    return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

def show_image_and_captions(url):
    # load the image and display it
    display(load_image(url))
    # generate a caption for the image
    our_caption = get_caption(model, image_processor, tokenizer, url)
    # print the caption
    print(f"Our caption: {our_caption}")

show_image_and_captions("/content/drive/MyDrive/try/test_400/gl_16.jpg")
# Our caption: navigate around the obstacle ahead adjusting your route to bypass the parked car.
```
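By default, `generate` uses greedy decoding. Caption quality can sometimes be improved with beam search; the sketch below shows how, using illustrative decoding parameters (the beam count and length cap are assumptions, not this model's published settings).

```python
# Illustrative decoding settings: num_beams and max_length are assumptions
# for demonstration, not the model's published configuration.
pixel_values = image_processor(
    load_image("/content/drive/MyDrive/try/test_400/gl_16.jpg"),
    return_tensors="pt",
).pixel_values.to(device)

generated_ids = model.generate(
    pixel_values,
    num_beams=4,          # beam search instead of greedy decoding
    max_length=32,        # cap the caption length
    early_stopping=True,  # stop once all beams have finished
)
print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0])
```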