priyesh17 committed
Commit 325e144 · verified · 1 Parent(s): fe761a7

Image caption generator files

Files changed (6)
  1. CapGen.h5 +3 -0
  2. VGGModel.h5 +3 -0
  3. app.py +30 -0
  4. requirements.txt +7 -0
  5. tokenizer.pickle +3 -0
  6. util.py +70 -0
CapGen.h5 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:64ae41da5492e7f0eed48871c657d84b7e30a8773f2296082cbc4814a6370206
+ size 71970004
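The three lines above are a Git LFS pointer, not the model weights themselves: the oid line is the SHA-256 of the real file and size is its length in bytes (roughly 72 MB here). VGGModel.h5 and tokenizer.pickle below are stored the same way; after cloning, running git lfs pull fetches the actual binaries.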
VGGModel.h5 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:54d3f0c3eaff7acc305672af334af23bac9ac39da654296b0a6175c0fc7cdd87
+ size 537113328
app.py ADDED
@@ -0,0 +1,30 @@
+ import streamlit as st
+ import tensorflow as tf
+ from PIL import Image
+ import numpy as np
+ import io
+ from util import generate_caption
+
+ # Model loading and caption generation are handled in util.py
+
+ # Streamlit app
+ st.title("Image Caption Generator")
+
+ uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg"])
+
+ if uploaded_file is not None:
+     image = Image.open(uploaded_file)
+     image = image.resize((224, 224))
+     st.image(image, caption='Uploaded Image', use_column_width=True)
+     st.write("")
+     st.write("Generating caption...")
+     caption = generate_caption(image)
+     st.write(f"Caption: {caption}")
+
+ # Add some information about the app
+ st.sidebar.header("About")
+ st.sidebar.info("This app uses a deep learning (RNN) caption model along with a VGG16 feature extractor to generate captions for uploaded images.")
+ st.sidebar.info("Upload an image to get started!")
+ st.sidebar.info("The model is trained on the Flickr8k dataset.")
+ st.sidebar.info("By Priyesh Gawali")
+ st.sidebar.markdown("[GitHub repository](https://github.com/Roronoa-17/Image_Caption_Generator.git)")
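Note that util.py (added below) loads both Keras models at import time, so the first page load in a fresh session pays the full load cost. A minimal alternative sketch, if you prefer to let Streamlit manage the models explicitly, would cache them with st.cache_resource (load_models is a hypothetical helper; the paths are assumed from util.py):

import streamlit as st
import tensorflow as tf

@st.cache_resource  # available in the pinned streamlit==1.35.0
def load_models():
    # Same paths util.py uses; adjust if the .h5 files live elsewhere.
    cap_gen = tf.keras.models.load_model('models/CapGen.h5')
    vgg = tf.keras.models.load_model('models/VGGModel.h5')
    return cap_gen, vgg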
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ tensorflow
+ numpy
+ streamlit==1.35.0
+ scikit-learn
+ pickle-mixin
+ Pillow
+ gdown
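To reproduce the environment, pip install -r requirements.txt should be enough. One note: pickle ships with Python's standard library, so the pickle-mixin entry is likely unnecessary for the pickle.load call in util.py, and gdown appears unused in the files committed here.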
tokenizer.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e21b2723c91942491147ae3d21fc27cb9afac743712c76497f6ddc376b24d8bf
+ size 334824
util.py ADDED
@@ -0,0 +1,70 @@
+ import tensorflow as tf
+ from tensorflow.keras.applications.vgg16 import preprocess_input
+ from tensorflow.keras.preprocessing.image import img_to_array
+ from tensorflow.keras.preprocessing.sequence import pad_sequences
+ import numpy as np
+ import pickle
+
+ CapGenerator = tf.keras.models.load_model('models/CapGen.h5')
+ VGGMod = tf.keras.models.load_model('models/VGGModel.h5')
+ max_length = 35
+
+ with open('models/tokenizer.pickle', 'rb') as handle:
+     tokenizer = pickle.load(handle)
+
+ vocab_size = len(tokenizer.word_index) + 1
+
+ def idx_to_word(integer, tokenizer):
+     for word, index in tokenizer.word_index.items():
+         if index == integer:
+             return word
+     return None
+
+ def predict_caption(model, image, tokenizer, max_length=max_length):
+     # add the start tag for the generation process
+     in_text = 'startseq'
+     # iterate up to the maximum sequence length
+     for i in range(max_length):
+         # encode the input sequence
+         sequence = tokenizer.texts_to_sequences([in_text])[0]
+         # pad the sequence
+         sequence = pad_sequences([sequence], maxlen=max_length)
+         # predict the next word
+         yhat = model.predict([image, sequence], verbose=0)
+         # take the index with the highest probability (greedy decoding)
+         yhat = np.argmax(yhat)
+         # convert the index back to a word
+         word = idx_to_word(yhat, tokenizer)
+         # stop if the word is not found
+         if word is None:
+             break
+         # append the word as input for generating the next word
+         in_text += " " + word
+         # stop if we reach the end tag
+         if word == 'endseq':
+             break
+
+     return in_text
+
+ def feature_extractor(image):
+
+     # PIL image to NumPy array
+     image = img_to_array(image)
+
+     # add a batch dimension: (1, height, width, channels)
+     image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
+
+     # preprocessing for passing through VGG16
+     image = preprocess_input(image)
+
+     feature = VGGMod.predict(image, verbose=0)
+
+     return feature
+
+ def generate_caption(image):
+
+     y_pred = predict_caption(CapGenerator, feature_extractor(image), tokenizer, max_length)
+     y_pred = y_pred[8:-7].strip().upper()  # strip the 'startseq'/'endseq' tags
+
+     return y_pred
+
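For a quick smoke test outside Streamlit, something like the following should work from the repo root (sample.jpg is a hypothetical local test image; the resize mirrors what app.py does before calling generate_caption):

from PIL import Image
from util import generate_caption

# VGG16 expects 224x224 input; .convert('RGB') guards against grayscale or RGBA files.
img = Image.open('sample.jpg').convert('RGB').resize((224, 224))
print(generate_caption(img))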