Spaces:
Runtime error
Runtime error
import streamlit as st
import pickle

# The header and notice are emitted before the TensorFlow imports and model
# loading below; presumably so the page shows something while those run.
st.header("Image Captioner")
st.markdown("Building the framework may take upto a minute. Please be patient. Thank you!")

# NOTE(review): pickle.load can execute arbitrary code from the file being
# read. These paths are local files shipped with the app; never point them at
# data that is not fully trusted.
# Context managers make sure both file handles are closed after loading.
with open("features.pkl", "rb") as features_file:
    features = pickle.load(features_file)
with open("all_captions.pkl", "rb") as captions_file:
    all_captions = pickle.load(captions_file)

from tensorflow.keras.preprocessing.text import Tokenizer

# Rebuild the tokenizer from the caption corpus rather than loading a saved one.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
# One more than the number of distinct words; presumably the extra slot is for
# the padding index 0 -- TODO confirm against the training code.
vocab_size = len(tokenizer.word_index) + 1
# Longest caption in the corpus, counted in whitespace-separated words.
max_length = max(len(caption.split()) for caption in all_captions)

from tensorflow import keras

# Trained captioning model; it is called with [image_features, padded_sequence]
# in predict_caption.
model = keras.models.load_model("best_model.h5")

from tensorflow.keras.preprocessing.sequence import pad_sequences
def idx_to_word(integer, tokenizer):
    """Return the vocabulary word mapped to index ``integer``.

    Args:
        integer: Word index to look up (a Python int or a NumPy integer
            scalar such as the result of ``np.argmax``).
        tokenizer: Object exposing a ``word_index`` mapping of word -> index.
            If it also exposes a non-empty ``index_word`` mapping of
            index -> word, that mapping is consulted first.

    Returns:
        The matching word, or ``None`` when no word has that index.
    """
    # Fast path: a direct dictionary lookup avoids scanning the whole
    # vocabulary, which matters because this runs once per generated token.
    reverse_index = getattr(tokenizer, "index_word", None)
    if reverse_index:
        word = reverse_index.get(integer)
        if word is not None:
            return word
    # Fallback: linear scan, for tokenizers that only provide ``word_index``
    # or whose reverse map has no entry for this index.
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None
# generate caption for an image | |
import numpy as np | |
def predict_caption(model, image, tokenizer, max_length):
    """Generate a caption by greedily picking the most likely next word.

    Starting from the ``'startseq'`` marker, the text generated so far is
    encoded with ``tokenizer``, padded to ``max_length`` and passed to
    ``model`` together with ``image``; the arg-max of the prediction is
    mapped back to a word and appended.

    Args:
        model: Object whose ``predict([image, sequence], verbose=0)`` returns
            scores over the vocabulary.
        image: Image input forwarded unchanged to ``model.predict``.
        tokenizer: Tokenizer providing ``texts_to_sequences`` and the mapping
            used by ``idx_to_word``.
        max_length: Maximum number of words to generate; also the padded
            sequence length.

    Returns:
        The space-joined caption, beginning with ``'startseq'`` and ending
        with ``'endseq'`` if the model produced it before the limit.
    """
    words = ['startseq']
    for _ in range(max_length):
        # Encode everything generated so far and pad it to the fixed length.
        encoded = tokenizer.texts_to_sequences([" ".join(words)])[0]
        padded = pad_sequences([encoded], max_length)
        # Highest-scoring index in the model output for this step.
        scores = model.predict([image, padded], verbose=0)
        next_word = idx_to_word(np.argmax(scores), tokenizer)
        # An index with no vocabulary entry ends generation early.
        if next_word is None:
            break
        words.append(next_word)
        # The end marker is kept in the output and stops generation.
        if next_word == 'endseq':
            break
    return " ".join(words)
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.models import Model
from gtts import gTTS
from io import BytesIO

# In-memory buffer that receives the synthesized speech for the caption.
sound_file = BytesIO()

# Feature extractor: VGG16 re-wired so its output is the second-to-last layer
# instead of the final layer.
vgg_model = VGG16()
vgg_model = Model(inputs=vgg_model.inputs, outputs=vgg_model.layers[-2].output)

from tensorflow.keras.preprocessing.image import img_to_array
from PIL import Image

uploaded_image = st.file_uploader("Upload image to be captioned", type=["jpg", "png", "jpeg", "webp"])
image_path = "bushman.jpeg"
if uploaded_image is not None:
    display_image = Image.open(uploaded_image)
    st.image(display_image)
    if st.button("Caption"):
        st.text("Please be patient...")
        # Force three colour channels: PNG/WebP uploads may be RGBA, palette
        # or grayscale, which would yield an array with a channel count other
        # than 3 and make the feature extractor reject the input.
        display_image = display_image.convert("RGB").resize((224, 224))
        image = img_to_array(display_image)
        # Add a leading batch dimension of size 1.
        image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
        image = preprocess_input(image)
        feature = vgg_model.predict(image, verbose=0)
        final = predict_caption(model, feature, tokenizer, max_length)
        # Remove the sequence markers only when they are actually present.
        # Blindly dropping the last token would lose a real word whenever
        # generation stops without emitting 'endseq'.
        caption_words = final.split()
        if caption_words and caption_words[0] == "startseq":
            caption_words = caption_words[1:]
        if caption_words and caption_words[-1] == "endseq":
            caption_words = caption_words[:-1]
        final_output = " ".join(caption_words)
        st.text("Output:")
        st.markdown(final_output)
        # gTTS refuses empty text, so only synthesize speech for a non-empty
        # caption.
        if final_output:
            tts = gTTS(final_output, lang='en')
            tts.write_to_fp(sound_file)
            # gTTS writes MP3 data; declare it so the player does not assume
            # the default WAV format.
            st.audio(sound_file, format="audio/mp3")