"""Gradio demo that classifies an image with Google's ViT (ImageNet-1k)."""

import base64
import os

import gradio as gr
from transformers import (
    AutoConfig,
    AutoModel,
    ViTForImageClassification,
    ViTImageProcessor,
)

# Hugging Face Hub checkpoint shared by the pre-processor and the classifier.
MODEL_NAME = 'google/vit-base-patch16-224'

processor = ViTImageProcessor.from_pretrained(MODEL_NAME)
model = ViTForImageClassification.from_pretrained(MODEL_NAME)
# NOTE(review): not referenced anywhere in this file; kept in case external
# code reads it.
images = 'room.jpg'


def image_classifier(image, top_k=3):
    """Return the most likely class labels for ``image`` with their scores.

    Args:
        image: Image supplied by the ``gr.Image`` input (a numpy array, as
            configured by ``type="numpy"`` below), or ``None`` when the user
            pressed the button without providing an image.
        top_k: Maximum number of classes to report. Clamped to the number of
            classes the model knows about.

    Returns:
        A dict mapping class label to softmax probability (plain Python
        floats), best class first. Empty when ``image`` is ``None``.
    """
    if image is None:
        # Nothing to classify; an empty mapping leaves the label output blank
        # instead of crashing inside the image processor.
        return {}

    inputs = processor(images=image, return_tensors="pt")
    outputs = model(**inputs)

    # Convert raw logits to probabilities so the label widget shows real
    # confidences. ``[0]`` selects the single image of the batch.
    probabilities = outputs.logits.detach().softmax(dim=-1)[0]

    k = max(0, min(top_k, probabilities.shape[-1]))
    top = probabilities.topk(k)

    # Pair each score with the class index it actually belongs to (the
    # previous code looked scores up by enumeration position instead).
    return {
        model.config.id2label[class_idx]: score
        for class_idx, score in zip(top.indices.tolist(), top.values.tolist())
    }


with gr.Blocks(title="Image Classification using Google Vision Transformer") as demo:
    gr.Markdown(
        """

The Vision Transformer (ViT)

Transformer encoder model (BERT-like) pretrained on a large collection of images in a supervised fashion, namely ImageNet-21k, at a resolution of 224x224 pixels. Next, the model was fine-tuned on ImageNet (also referred to as ILSVRC2012), a dataset comprising 1 million images and 1,000 classes, also at resolution 224x224.
"""
    )
    with gr.Row():
        with gr.Column():
            inputt = gr.Image(type="numpy", label="Input Image for Classification")
            button = gr.Button(value="Classify")
        with gr.Column():
            output = gr.Label()
    button.click(image_classifier, inputt, output)

demo.launch()