jarif committed on
Commit da9f201 · verified · 1 Parent(s): 113b8e7

Update app.py
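Summary of the change (editor's note, inferred from the diff below): the deprecated gr.inputs.Image input spec is replaced with the top-level gr.Image component (the gr.inputs namespace was deprecated in Gradio 3.x and removed in Gradio 4), and the unused matplotlib.pyplot import is dropped.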

Files changed (1)
  1. app.py +46 -47
app.py CHANGED
@@ -1,47 +1,46 @@
- import gradio as gr
- from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
- import torch
- from PIL import Image
- import matplotlib.pyplot as plt
-
- # Load model and tokenizer
- model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
- feature_extractor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
- tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
-
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
- model.to(device)
-
- # Define generation parameters
- max_length = 16
- num_beams = 4
- gen_kwargs = {"max_length": max_length, "num_beams": num_beams}
-
- def predict_step(image):
-     # Process the input image
-     i_image = Image.fromarray(image)
-     if i_image.mode != "RGB":
-         i_image = i_image.convert(mode="RGB")
-
-     # Prepare image for the model
-     pixel_values = feature_extractor(images=[i_image], return_tensors="pt").pixel_values
-     pixel_values = pixel_values.to(device)
-
-     # Generate prediction
-     output_ids = model.generate(pixel_values, **gen_kwargs)
-     preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
-     preds = [pred.strip() for pred in preds]
-
-     return preds[0]
-
- # Define Gradio interface
- interface = gr.Interface(
-     fn=predict_step,
-     inputs=gr.inputs.Image(type="numpy"),
-     outputs="text",
-     title="Image-to-Text Conversion",
-     description="Upload an image, and this model will generate a textual description of the image."
- )
-
- # Launch the interface
- interface.launch()
 
+ import gradio as gr
+ from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
+ import torch
+ from PIL import Image
+
+ # Load model and tokenizer
+ model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
+ feature_extractor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
+ tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ model.to(device)
+
+ # Define generation parameters
+ max_length = 16
+ num_beams = 4
+ gen_kwargs = {"max_length": max_length, "num_beams": num_beams}
+
+ def predict_step(image):
+     # Convert image from numpy array to PIL image
+     i_image = Image.fromarray(image)
+     if i_image.mode != "RGB":
+         i_image = i_image.convert(mode="RGB")
+
+     # Prepare image for the model
+     pixel_values = feature_extractor(images=[i_image], return_tensors="pt").pixel_values
+     pixel_values = pixel_values.to(device)
+
+     # Generate prediction
+     output_ids = model.generate(pixel_values, **gen_kwargs)
+     preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
+     preds = [pred.strip() for pred in preds]
+
+     return preds[0]
+
+ # Define Gradio interface
+ interface = gr.Interface(
+     fn=predict_step,
+     inputs=gr.Image(type="numpy"),  # Updated to use gr.Image
+     outputs="text",
+     title="Image-to-Text Conversion",
+     description="Upload an image, and this model will generate a textual description of the image."
+ )
+
+ # Launch the interface
+ interface.launch()
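
For a quick sanity check outside the web UI, predict_step can be called directly. A minimal sketch, assuming the snippet runs in the same module as predict_step (e.g. appended to app.py before the interface.launch() call) and that a local test file exists ("example.jpg" is a hypothetical name):

import numpy as np
from PIL import Image

# Load a test image as a numpy array, the same shape gr.Image(type="numpy") passes in
img = np.array(Image.open("example.jpg"))  # hypothetical sample file
print(predict_step(img))  # prints the generated caption

With num_beams=4 the model decodes with beam search, and max_length=16 caps captions at 16 tokens; both can be tuned in gen_kwargs to trade caption length and quality against latency.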