kushagra124 committed
Commit
894c286
1 Parent(s): 6f90e5a

adding app

Files changed (4)
  1. app.py +51 -0
  2. google_vit.ipynb +134 -0
  3. requirements.txt +5 -0
  4. room.jpg +0 -0
app.py ADDED
@@ -0,0 +1,51 @@
+ import gradio as gr
+ from transformers import ViTImageProcessor, ViTForImageClassification
+
+ # load the pretrained ViT-Base checkpoint once at startup
+ processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224')
+ model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224')
+
+
+ def image_classifier(image):
+     # preprocess the input image and run a forward pass
+     inputs = processor(images=image, return_tensors="pt")
+     outputs = model(**inputs)
+     logits = outputs.logits
+
+     # indices of the three highest-scoring classes (ascending order)
+     logits_np = logits.detach().cpu().numpy()
+     logits_args = logits_np.argsort()[0][-3:]
+
+     prediction_classes = [model.config.id2label[predicted_class_idx] for predicted_class_idx in logits_args]
+
+     # pair each label with its own logit: index with logits_args[i],
+     # not the loop counter, so scores line up with the right classes
+     result = {}
+     for i, item in enumerate(prediction_classes):
+         result[item] = float(logits_np[0][logits_args[i]])
+
+     return result
+
+
+ with gr.Blocks(title="Image Classification using Google Vision Transformer") as demo:
+     gr.Markdown(
+         """
+         <center>
+         <h1>
+         The Vision Transformer (ViT)
+         </h1>
+         A Transformer encoder model (BERT-like) pretrained on a large collection of images in a supervised fashion, namely ImageNet-21k, at a resolution of 224x224 pixels.
+         The model was then fine-tuned on ImageNet (also referred to as ILSVRC2012), a dataset comprising 1 million images and 1,000 classes, also at resolution 224x224.
+         </center>
+         """
+     )
+     with gr.Row():
+         with gr.Column():
+             # room.jpg (added in this commit) can be dropped in as a sample input
+             input_image = gr.Image(type="numpy", label="Input Image for Classification")
+             button = gr.Button(value="Classify")
+         with gr.Column():
+             output = gr.Label()
+     button.click(image_classifier, inputs=input_image, outputs=output)
+
+ demo.launch()
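A note on the output format: gr.Label renders a dict of label-to-score pairs as confidence bars, so values normalized to [0, 1] read more naturally than the raw logits returned above. A minimal alternative sketch, not part of this commit, assuming the `processor` and `model` objects from app.py are in scope (`classify_with_probs` is a hypothetical name):

    import torch

    def classify_with_probs(image, top_k=3):
        # assumes `processor` and `model` from app.py are already loaded
        inputs = processor(images=image, return_tensors="pt")
        with torch.no_grad():  # no gradients needed for inference
            logits = model(**inputs).logits
        # softmax turns logits into probabilities that sum to 1
        probs = logits.softmax(dim=-1)[0]
        top = probs.topk(top_k)
        return {model.config.id2label[int(i)]: float(p)
                for p, i in zip(top.values, top.indices)}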
google_vit.ipynb ADDED
@@ -0,0 +1,134 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/jarvis/.local/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+ " from .autonotebook import tqdm as notebook_tqdm\n"
+ ]
+ }
+ ],
+ "source": [
+ "from transformers import ViTImageProcessor, ViTForImageClassification\n",
+ "from PIL import Image\n",
+ "import requests"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "['tiger cat', 'tabby, tabby cat', 'Egyptian cat'] [282 281 285]\n"
+ ]
+ }
+ ],
+ "source": [
+ "url = 'http://images.cocodataset.org/val2017/000000039769.jpg'\n",
+ "image = Image.open(requests.get(url, stream=True).raw)\n",
+ "\n",
+ "processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224')\n",
+ "model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224')\n",
+ "\n",
+ "inputs = processor(images=image, return_tensors=\"pt\")\n",
+ "outputs = model(**inputs)\n",
+ "logits = outputs.logits\n",
+ "\n",
+ "# indices of the three highest-scoring classes (ascending order)\n",
+ "logits_np = logits.detach().cpu().numpy()\n",
+ "logits_args = logits_np.argsort()[0][-3:]\n",
+ "\n",
+ "prediction_classes = [model.config.id2label[predicted_class_idx] for predicted_class_idx in logits_args]\n",
+ "print(prediction_classes, logits_args)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'tiger cat': -0.27440035,\n",
+ " 'tabby, tabby cat': 0.8215165,\n",
+ " 'Egyptian cat': -0.08364794}"
+ ]
+ },
+ "execution_count": 46,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# pair each label with its own logit: index with logits_args[i],\n",
+ "# not the loop counter, so scores line up with the right classes\n",
+ "result = {}\n",
+ "for i, item in enumerate(prediction_classes):\n",
+ "    result[item] = logits_np[0][logits_args[i]]\n",
+ "\n",
+ "result"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['tiger cat', 'tabby, tabby cat', 'Egyptian cat']"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# the model predicts one of the 1,000 ImageNet classes\n",
+ "\n",
+ "prediction_classes = [model.config.id2label[predicted_class_idx] for predicted_class_idx in logits_args]\n",
+ "\n",
+ "prediction_classes\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "py_llm",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
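For reference, the single-label usage shown on the google/vit-base-patch16-224 model card condenses the notebook's top-3 logic down to an argmax. A short sketch, assuming the `processor`, `model`, and `image` objects from the cells above:

    import torch

    # assumes `processor`, `model`, and `image` from the notebook cells above
    inputs = processor(images=image, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits
    # index of the single highest-scoring ImageNet class
    predicted_class_idx = logits.argmax(-1).item()
    print(model.config.id2label[predicted_class_idx])  # "Egyptian cat" for the COCO cats image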
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ gradio
+ transformers
+ torch
+ sentencepiece
+ huggingface_hub
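Of these five dependencies, only gradio, transformers, and torch are imported by app.py; sentencepiece is a tokenizer dependency this vision-only app never exercises, and huggingface_hub is pulled in by transformers anyway. An illustrative sanity check (not part of the commit) that the installed environment satisfies the app's actual imports:

    # verify the top-level packages app.py needs and report their versions
    import gradio
    import transformers
    import torch

    print(gradio.__version__, transformers.__version__, torch.__version__)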
room.jpg ADDED