krishnapal2308 committed
Commit 7301eb7 · Parent: 4582e37

pipeline to manual

__pycache__/inference_script.cpython-310.pyc CHANGED
Binary files a/__pycache__/inference_script.cpython-310.pyc and b/__pycache__/inference_script.cpython-310.pyc differ
 
__pycache__/vit_gpt2.cpython-310.pyc CHANGED
Binary files a/__pycache__/vit_gpt2.cpython-310.pyc and b/__pycache__/vit_gpt2.cpython-310.pyc differ
 
app.py CHANGED
@@ -1,6 +1,4 @@
-import base64
 import tempfile
-import numpy as np
 import gradio as gr
 from gtts import gTTS
 import inference_script
inference_script.py CHANGED
@@ -1,9 +1,9 @@
 import numpy as np
-
 import tensorflow as tf
 import keras
 from keras.models import Model
-
+import warnings
+warnings.filterwarnings('ignore')
 
 class Encoder(Model):
     def __init__(self, embed_dim):
test.py CHANGED
@@ -1,6 +1,4 @@
-import base64
 import tempfile
-import numpy as np
 import gradio as gr
 from gtts import gTTS
 import inference_script
vit_gpt2.py CHANGED
@@ -1,39 +1,27 @@
-from transformers import pipeline
+from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
 from PIL import Image
+import warnings
+warnings.filterwarnings('ignore')
+
+model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
+feature_extractor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
+tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
+
+max_length = 16
+num_beams = 4
+gen_kwargs = {"max_length": max_length, "num_beams": num_beams}
 
 
 def predict_step(img_array):
-    image_to_text = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
     i_image = Image.fromarray(img_array)
 
     if i_image.mode != "RGB":
         i_image = i_image.convert(mode="RGB")
 
-    prediction = image_to_text(i_image)
-    return prediction[0]['generated_text']
-
-# from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
-# from PIL import Image
-#
-# model = VisionEncoderDecoderModel.from_pretrained("vit-gpt2-image-captioning")
-# feature_extractor = ViTImageProcessor.from_pretrained("vit-gpt2-image-captioning")
-# tokenizer = AutoTokenizer.from_pretrained("vit-gpt2-image-captioning")
-#
-# max_length = 16
-# num_beams = 4
-# gen_kwargs = {"max_length": max_length, "num_beams": num_beams}
-#
-#
-# def predict_step(img_array):
-#     i_image = Image.fromarray(img_array)
-#
-#     if i_image.mode != "RGB":
-#         i_image = i_image.convert(mode="RGB")
-#
-#     pixel_values = feature_extractor(images=i_image, return_tensors="pt", do_normalize=True).pixel_values
-#
-#     output_ids = model.generate(pixel_values, **gen_kwargs)
-#
-#     pred = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
-#     pred = [p.strip() for p in pred]
-#     return pred
+    pixel_values = feature_extractor(images=i_image, return_tensors="pt", do_normalize=True).pixel_values
+
+    output_ids = model.generate(pixel_values, **gen_kwargs)
+
+    pred = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
+    pred = [p.strip() for p in pred]
+    return pred
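
The vit_gpt2.py change swaps the transformers pipeline("image-to-text") helper for an explicit VisionEncoderDecoderModel, ViTImageProcessor, and AutoTokenizer load. A minimal usage sketch of the reworked predict_step follows; it is not part of the commit, and the image path "sample.jpg" is hypothetical.

# Hedged usage sketch: call the manual captioning path in vit_gpt2.py.
# Importing vit_gpt2 downloads/loads the model, processor, and tokenizer.
import numpy as np
from PIL import Image

import vit_gpt2

img_array = np.array(Image.open("sample.jpg"))   # hypothetical local image
captions = vit_gpt2.predict_step(img_array)      # list of stripped caption strings
print(captions[0])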