import pickle
from io import BytesIO

import numpy as np
import streamlit as st
from gtts import gTTS
from PIL import Image
from tensorflow import keras
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

st.header("Image Captioner")
st.markdown("Building the framework may take up to a minute. Please be patient. Thank you!")

# Load the pre-extracted image features and the training captions.
features = pickle.load(open("features.pkl", "rb"))
all_captions = pickle.load(open("all_captions.pkl", "rb"))

# Rebuild the tokenizer from the training captions so the word indices
# match the ones the model was trained on.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
vocab_size = len(tokenizer.word_index) + 1
max_length = max(len(caption.split()) for caption in all_captions)

# Load the trained captioning model.
model = keras.models.load_model("best_model.h5")


def idx_to_word(integer, tokenizer):
    """Map a predicted token index back to its word, or None if not found."""
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None


def predict_caption(model, image, tokenizer, max_length):
    """Generate a caption for an image feature vector, one word at a time."""
    # Add the start tag for the generation process.
    in_text = "startseq"
    # Iterate up to the maximum caption length.
    for _ in range(max_length):
        # Encode the input sequence generated so far.
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        # Pad the sequence to the fixed length the model expects.
        sequence = pad_sequences([sequence], maxlen=max_length)
        # Predict the next word.
        yhat = model.predict([image, sequence], verbose=0)
        # Take the index with the highest probability.
        yhat = np.argmax(yhat)
        # Convert the index back to a word.
        word = idx_to_word(yhat, tokenizer)
        # Stop if no matching word is found.
        if word is None:
            break
        in_text += " " + word
        # Stop once the end tag is generated.
        if word == "endseq":
            break
    return in_text


# Use VGG16 without its classification head as the image feature extractor.
vgg_model = VGG16()
vgg_model = Model(inputs=vgg_model.inputs, outputs=vgg_model.layers[-2].output)

sound_file = BytesIO()

uploaded_image = st.file_uploader("Upload image to be captioned", type=["jpg", "png", "jpeg"])

if uploaded_image is not None:
    display_image = Image.open(uploaded_image)
    st.image(display_image)
    if st.button("Caption"):
        st.text("Please be patient...")
        # Resize to the 224x224 input size expected by VGG16.
        display_image = display_image.resize((224, 224))
        image = img_to_array(display_image)
        image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
        image = preprocess_input(image)
        # Extract the image feature vector.
        feature = vgg_model.predict(image, verbose=0)
        # Generate the caption and strip the startseq/endseq tags.
        final = predict_caption(model, feature, tokenizer, max_length)
        final_output = " ".join(final.split(" ")[1:-1])
        # Convert the caption to speech and play it back.
        tts = gTTS(final_output, lang="en")
        tts.write_to_fp(sound_file)
        st.text("Output:")
        st.text(final_output)
        st.audio(sound_file)