amgadhasan
/

arabic-image-captioner

TF-Keras

Model card Files Files and versions Community

amgadhasan committed on Jan 6, 2024

Commit

0198bb9

1 Parent(s): 386e8e5

Update image_captioner.py

Browse files

Files changed (1) hide show

image_captioner.py +130 -42

image_captioner.py CHANGED Viewed

@@ -1,49 +1,136 @@
-import os
-os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
 import tensorflow as tf
-from utils.constants import MAX_LENGTH, IMAGE_SIZE, HIDDEN_UNITS
 import json
-import io
-class ImageCaptioner():
     """
-    A custom class that builds the full model from the smaller sub models. It contains a cnn for feature extraction, a cnn_encoder to encode the features to a suitable dimension,
-    an RNN decoder that contains an attention layer and RNN layer to generate text from the last predicted token + encoded image features.
     """
-    def __init__(self, cnn, cnn_encoder, rnn_decoder, **kwargs):
         """
-        Initializes the ImageCaptioner class with the given arguments.
         Args:
-        cnn: A convolutional neural network that is used to extract features from images.
-        cnn_encoder: A model that encodes the image features into a lower-dimensional space.
-        rnn_decoder: A recurrent neural network that generates captions for the input images.
-        max_length: The maximum length of the captions that the model generates.
-        **kwargs: Additional keyword arguments that are not used in this implementation.
         """
-        self.cnn = cnn
-        self.cnn_encoder = cnn_encoder
-        self.rnn_decoder = rnn_decoder
-        self.MAX_LENGTH = MAX_LENGTH
-        self.START_TOKEN_INDEX = 1
-        self.END_TOKEN_INDEX = 2
-        self.HIDDEN_UNITS = HIDDEN_UNITS
-    def __call__(self, inputs):
         """
         Calls the MyCustomModel instance with the given inputs.
         Args:
-        inputs: A list of input tensors containing the decoder input, encoded features, and hidden state.
         Returns:
-        The output tensor of the RNN decoder.
         """
-        [decoder_input, encoded_features, hidden_state] = inputs
-        return self.rnn_decoder(decoder_input, encoded_features, hidden_state, training=False)
-    def predict(self, image):
         """
         Generates a caption for the given image.
@@ -53,9 +140,13 @@ class ImageCaptioner():
         Returns:
         A tuple containing the indices of the predicted tokens and the attention weights sequence.
         """
-        image_features = self.cnn(image)
-        reshaped_features = tf.reshape(image_features, (tf.shape(image_features)[0], -1, image_features.shape[3]))
-        encoded_features = self.cnn_encoder(reshaped_features)
         # Get the RNN's initial state and start token for each new sample
         # hidden_state = tf.zeros((1, 512))
@@ -64,18 +155,15 @@ class ImageCaptioner():
         # caption_probability = 1
         # predicted_tokens_indices = []
         # attention_weights_sequence = []
-        n_captions = 2
-        results = tf.Variable(tf.zeros(shape=(n_captions, self.MAX_LENGTH),dtype='int32'), )
-        scores = tf.ones(shape=(n_captions,))
         #hidden = decoder.get_initial_state(batch_size=1)
         #hiddens = self.rnn_decoder.get_initial_state(batch_size=n_captions)
-        hiddens = tf.zeros((n_captions, self.HIDDEN_UNITS))
-        #hiddens = [hidden for _ in range(n)]
-        #dec_input = tf.expand_dims([tokenizer.word_index['بب']], 0)
-        dec_inputs = tf.fill(dims=(n_captions,1), value=self.START_TOKEN_INDEX)
         batch_indices = list(range(n_captions)) # batch size
-        for i in range(self.MAX_LENGTH):
-            logits, hiddens, attention_weights = self.__call__([dec_inputs, encoded_features, hiddens])
             predicted_ids = tf.random.categorical(logits, num_samples=1, dtype=tf.int32)  # shape (batch_size,num_samples)
             predicted_ids = tf.squeeze(predicted_ids, axis=-1)
             #predicted_ids = tf.convert_to_tensor(predicted_ids, dtype=tf.int32)#tf.cast(predicted_ids, tf.int32)
@@ -97,7 +185,7 @@ class ImageCaptioner():
         most_probable_sequence_id = int(tf.math.argmax(scores))
         best_caption = list(results[most_probable_sequence_id].numpy())
         print(best_caption)
-        eos_loc = best_caption.index(self.END_TOKEN_INDEX)
         #caption_text = tokenizer.sequences_to_texts([best_caption[:eos_loc]])
         return best_caption[:eos_loc], None
@@ -111,4 +199,4 @@ class ImageCaptioner():
         #         break
         #     decoder_input = tf.expand_dims([tf.cast(predicted_token_index, tf.int32)], 0)
-        # return predicted_tokens_indices, attention_weights_sequence

 import tensorflow as tf
+from tensorflow.keras.models import load_model
+import pathlib
 import json
def load_config(path: pathlib.Path) -> dict:
    """
    A helper function to load a JSON config.

    Args:
        path (pathlib.Path): The path to the JSON config file.

    Returns:
        dict: The loaded config as a Python dict.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON is UTF-8 by specification; pinning the encoding keeps non-ASCII
    # content from depending on the platform's locale default.
    with open(path, encoding="utf-8") as f:
        return json.load(f)
class Tokenizer:
    """
    Bundles a saved Keras text tokenizer with its JSON configuration.

    Both are read from files inside one directory: ``tokenizer_config.json``
    (stored on ``self.config``) and ``tokenizer.json`` (stored on
    ``self.tokenizer``).
    """

    def __init__(self, path: pathlib.Path):
        """
        Loads the tokenizer config and the tokenizer from a directory.

        Args:
            path (pathlib.Path): Directory containing ``tokenizer_config.json``
                and ``tokenizer.json``. A plain string is accepted as well.
        """
        # The ``/`` joins below only work on Path objects, so coerce first.
        path = pathlib.Path(path)
        self.config = load_config(path / "tokenizer_config.json")
        self.tokenizer = self.load_from_json(path / "tokenizer.json")

    def load_from_json(self, file_path: pathlib.Path) -> tf.keras.preprocessing.text.Tokenizer:
        """
        A helper function to load tokenizer saved as JSON file.

        Args:
            file_path (pathlib.Path): The path to the tokenizer JSON file.

        Returns:
            tf.keras.preprocessing.text.Tokenizer: The loaded tokenizer.
        """
        # Explicit UTF-8 so the vocabulary decodes identically on every platform.
        with open(file_path, encoding="utf-8") as file:
            data = json.load(file)
        # NOTE(review): the parsed payload is handed straight to
        # tokenizer_from_json, so the file presumably stores the tokenizer's
        # JSON as a JSON-encoded string - confirm against the saved artifact.
        return tf.keras.preprocessing.text.tokenizer_from_json(data)
class Model:
    """
    Groups the three saved Keras sub-models used for captioning.

    The sub-models are loaded from the ``cnn``, ``cnn_projector`` and
    ``decoder`` entries of one directory, next to ``model_config.json``.
    """

    def __init__(self, path: pathlib.Path):
        """
        Loads the model config and the three sub-models from a directory.

        Args:
            path (pathlib.Path): Directory containing ``model_config.json``,
                ``cnn``, ``cnn_projector`` and ``decoder``. A plain string is
                accepted as well.
        """
        # The ``/`` joins below only work on Path objects, so coerce first.
        path = pathlib.Path(path)
        self.config = load_config(path / "model_config.json")
        self.cnn = self._load_model(path / "cnn")
        self.cnn_projector = self._load_model(path / "cnn_projector")
        self.rnn_decoder = self._load_model(path / "decoder")

    def _load_model(self, path: pathlib.Path) -> tf.keras.Model:
        """
        A helper function to load a saved Keras model from the given path.

        Args:
            path (pathlib.Path): The path to the saved model.

        Returns:
            tf.keras.Model: The loaded Keras model.
        """
        return load_model(path)

    def encode(self, images) -> tf.Tensor:
        """
        Encodes the input images and returns the encoded features.

        Args:
            images (tf.Tensor): The input images tensor.

        Returns:
            tf.Tensor: The encoded features tensor.
        """
        images_features = self.cnn(images)
        # Keep axis 0 and the size of axis 3, and flatten everything between
        # them into a single axis.
        # NOTE(review): assumes the CNN output is rank 4, e.g.
        # (batch, height, width, channels) - confirm against the saved model.
        reshaped_features = tf.reshape(images_features, (tf.shape(images_features)[0], -1, images_features.shape[3]))
        return self.cnn_projector(reshaped_features)

    def decode(self, decoder_inputs, encoded_features, hidden_states) -> dict:
        """
        Decodes the input and returns the logits, hidden states, and attention weights.

        Args:
            decoder_inputs (tf.Tensor): The decoder input tensor.
            encoded_features (tf.Tensor): The encoded features tensor.
            hidden_states (tf.Tensor): The hidden states tensor.

        Returns:
            dict: A dictionary with the keys ``"logits"``, ``"hidden_states"``
            and ``"attention_weights"``. Callers must index by key; unpacking
            the dict directly yields the key names, not the tensors.
        """
        # The decoder is called with its three inputs packed into one list.
        logits, hidden_states, attention_weights = self.rnn_decoder([decoder_inputs, encoded_features, hidden_states])
        return {"logits": logits, "hidden_states": hidden_states, "attention_weights": attention_weights}

    def __call__(self, images, decoder_inputs, hidden_states) -> dict:
        """
        Encodes the images and runs one decoding step on the result.

        Args:
            images (tf.Tensor): The input images tensor.
            decoder_inputs (tf.Tensor): The decoder input tensor.
            hidden_states (tf.Tensor): The hidden states tensor.

        Returns:
            dict: A dictionary containing the logits, hidden states, and attention weights.
        """
        encoded_features = self.encode(images)
        return self.decode(decoder_inputs, encoded_features, hidden_states)
+class ImageCaptioner():
+    """
+    A custom class that builds the full model from the smaller sub-models. It contains a CNN for feature extraction, a CNN encoder to encode the features to a suitable dimension,
+    an RNN decoder that contains an attention layer and RNN layer to generate text from the last predicted token + encoded image features.
+    """
+    def __init__(self, model_path: pathlib.Path, tokenizer_path, preprocessor):
+        """
+        Initializes the ImageCaptioner class with the given arguments.
+        Args:
+        path (pathlib.Path): The path to the directory containing the saved models and configuration files.
+        **kwargs: Additional keyword arguments that are not used in this implementation.
+        """
+        self.preprocessor = preprocessor
+        self.tokenizer = Tokenizer(tokenizer_path)
+        self.model = Model(model_path)
+    def predict(self, images, max_length, num_captions=5):
         """
         Generates a caption for the given image.
         Returns:
         A tuple containing the indices of the predicted tokens and the attention weights sequence.
         """
+        if not max_length or max_length > self.model.config['max_length']:
+            max_length = self.model.config['max_length']
+        images = tf.image.resize(images, self.model.config["image_size"])
+        images = self.preprocessor(images)
+        encoded_features = self.model.encode(images)
         # Get the RNN's initial state and start token for each new sample
         # hidden_state = tf.zeros((1, 512))
         # caption_probability = 1
         # predicted_tokens_indices = []
         # attention_weights_sequence = []
+        results = tf.Variable(tf.zeros(shape=(num_captions, max_length),dtype='int32'), )
+        scores = tf.ones(shape=(num_captions,))
         #hidden = decoder.get_initial_state(batch_size=1)
         #hiddens = self.rnn_decoder.get_initial_state(batch_size=n_captions)
+        hidden_states = tf.zeros((num_captions, self.model.config["num_hidden_units"]))
+        dec_inputs = tf.fill(dims=(n_captions,1), value=self.tokenizer_config['bos_token_id'])
         batch_indices = list(range(n_captions)) # batch size
+        for i in range(max_length):
+            logits, hidden_states, attention_weights = self.model.decode(decoder_inputs, encoded_features, hidden_states)
             predicted_ids = tf.random.categorical(logits, num_samples=1, dtype=tf.int32)  # shape (batch_size,num_samples)
             predicted_ids = tf.squeeze(predicted_ids, axis=-1)
             #predicted_ids = tf.convert_to_tensor(predicted_ids, dtype=tf.int32)#tf.cast(predicted_ids, tf.int32)
         most_probable_sequence_id = int(tf.math.argmax(scores))
         best_caption = list(results[most_probable_sequence_id].numpy())
         print(best_caption)
+        eos_loc = best_caption.index(self.tokenizer_config['eos_token_id'])
         #caption_text = tokenizer.sequences_to_texts([best_caption[:eos_loc]])
         return best_caption[:eos_loc], None
         #         break
         #     decoder_input = tf.expand_dims([tf.cast(predicted_token_index, tf.int32)], 0)
+        # return predicted_tokens_indices, attention_weights_sequence