amgadhasan committed on
Commit
0198bb9
·
1 Parent(s): 386e8e5

Update image_captioner.py

Browse files
Files changed (1) hide show
  1. image_captioner.py +130 -42
image_captioner.py CHANGED
@@ -1,49 +1,136 @@
1
- import os
2
- os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
3
  import tensorflow as tf
4
- from utils.constants import MAX_LENGTH, IMAGE_SIZE, HIDDEN_UNITS
 
5
  import json
6
- import io
7
 
8
 
9
- class ImageCaptioner():
10
  """
11
- A custom class that builds the full model from the smaller sub models. It contains a cnn for feature extraction, a cnn_encoder to encode the features to a suitable dimension,
12
- an RNN decoder that contains an attention layer and RNN layer to generate text from the last predicted token + encoded image features.
 
 
 
 
 
13
  """
14
- def __init__(self, cnn, cnn_encoder, rnn_decoder, **kwargs):
 
 
 
 
 
 
 
 
 
 
 
15
  """
16
- Initializes the ImageCaptioner class with the given arguments.
17
 
18
  Args:
19
- cnn: A convolutional neural network that is used to extract features from images.
20
- cnn_encoder: A model that encodes the image features into a lower-dimensional space.
21
- rnn_decoder: A recurrent neural network that generates captions for the input images.
22
- max_length: The maximum length of the captions that the model generates.
23
- **kwargs: Additional keyword arguments that are not used in this implementation.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  """
25
- self.cnn = cnn
26
- self.cnn_encoder = cnn_encoder
27
- self.rnn_decoder = rnn_decoder
28
- self.MAX_LENGTH = MAX_LENGTH
29
- self.START_TOKEN_INDEX = 1
30
- self.END_TOKEN_INDEX = 2
31
- self.HIDDEN_UNITS = HIDDEN_UNITS
32
-
33
- def __call__(self, inputs):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  """
35
  Calls the MyCustomModel instance with the given inputs.
36
 
37
  Args:
38
- inputs: A list of input tensors containing the decoder input, encoded features, and hidden state.
 
 
39
 
40
  Returns:
41
- The output tensor of the RNN decoder.
42
  """
43
- [decoder_input, encoded_features, hidden_state] = inputs
44
- return self.rnn_decoder(decoder_input, encoded_features, hidden_state, training=False)
 
45
 
46
- def predict(self, image):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  """
48
  Generates a caption for the given image.
49
 
@@ -53,9 +140,13 @@ class ImageCaptioner():
53
  Returns:
54
  A tuple containing the indices of the predicted tokens and the attention weights sequence.
55
  """
56
- image_features = self.cnn(image)
57
- reshaped_features = tf.reshape(image_features, (tf.shape(image_features)[0], -1, image_features.shape[3]))
58
- encoded_features = self.cnn_encoder(reshaped_features)
 
 
 
 
59
 
60
  # Get the RNN's initial state and start token for each new sample
61
  # hidden_state = tf.zeros((1, 512))
@@ -64,18 +155,15 @@ class ImageCaptioner():
64
  # caption_probability = 1
65
  # predicted_tokens_indices = []
66
  # attention_weights_sequence = []
67
- n_captions = 2
68
- results = tf.Variable(tf.zeros(shape=(n_captions, self.MAX_LENGTH),dtype='int32'), )
69
- scores = tf.ones(shape=(n_captions,))
70
  #hidden = decoder.get_initial_state(batch_size=1)
71
  #hiddens = self.rnn_decoder.get_initial_state(batch_size=n_captions)
72
- hiddens = tf.zeros((n_captions, self.HIDDEN_UNITS))
73
- #hiddens = [hidden for _ in range(n)]
74
- #dec_input = tf.expand_dims([tokenizer.word_index['بب']], 0)
75
- dec_inputs = tf.fill(dims=(n_captions,1), value=self.START_TOKEN_INDEX)
76
  batch_indices = list(range(n_captions)) # batch size
77
- for i in range(self.MAX_LENGTH):
78
- logits, hiddens, attention_weights = self.__call__([dec_inputs, encoded_features, hiddens])
79
  predicted_ids = tf.random.categorical(logits, num_samples=1, dtype=tf.int32) # shape (batch_size,num_samples)
80
  predicted_ids = tf.squeeze(predicted_ids, axis=-1)
81
  #predicted_ids = tf.convert_to_tensor(predicted_ids, dtype=tf.int32)#tf.cast(predicted_ids, tf.int32)
@@ -97,7 +185,7 @@ class ImageCaptioner():
97
  most_probable_sequence_id = int(tf.math.argmax(scores))
98
  best_caption = list(results[most_probable_sequence_id].numpy())
99
  print(best_caption)
100
- eos_loc = best_caption.index(self.END_TOKEN_INDEX)
101
  #caption_text = tokenizer.sequences_to_texts([best_caption[:eos_loc]])
102
 
103
  return best_caption[:eos_loc], None
@@ -111,4 +199,4 @@ class ImageCaptioner():
111
  # break
112
  # decoder_input = tf.expand_dims([tf.cast(predicted_token_index, tf.int32)], 0)
113
 
114
- # return predicted_tokens_indices, attention_weights_sequence
 
 
 
1
  import tensorflow as tf
2
+ from tensorflow.keras.models import load_model
3
+ import pathlib
4
  import json
 
5
 
6
 
7
def load_config(path: pathlib.Path) -> dict:
    """
    A helper function to load a JSON config.

    Args:
        path (pathlib.Path): The path to the JSON config file.

    Returns:
        dict: The loaded config as a Python dict.
    """
    # JSON text is UTF-8 by specification; being explicit keeps the result
    # independent of the platform's default locale encoding.
    with open(path, encoding="utf-8") as f:
        return json.load(f)
21
+
22
+
23
class Tokenizer:
    """
    Bundles a saved Keras tokenizer with its JSON configuration.

    After construction:
        config: the parsed contents of "tokenizer_config.json".
        tokenizer: the tokenizer rebuilt from "tokenizer.json".
    """

    def __init__(self, path: "str | pathlib.Path"):
        """
        Loads the tokenizer config and the tokenizer from a directory.

        Args:
            path (str | pathlib.Path): Directory containing
                "tokenizer_config.json" and "tokenizer.json".
        """
        # The signature advertises ``str``, but the ``/`` joins below only work
        # on Path objects; normalise first so both kinds of argument are accepted.
        path = pathlib.Path(path)
        self.config = load_config(path / "tokenizer_config.json")
        self.tokenizer = self.load_from_json(path / "tokenizer.json")

    def load_from_json(self, file_path: pathlib.Path) -> tf.keras.preprocessing.text.Tokenizer:
        """
        A helper function to load tokenizer saved as JSON file.

        Args:
            file_path (pathlib.Path): The path to the tokenizer JSON file.

        Returns:
            tf.keras.preprocessing.text.Tokenizer: The loaded tokenizer.
        """
        # Explicit UTF-8 so a non-ASCII vocabulary loads the same on every platform.
        with open(file_path, encoding="utf-8") as file:
            data = json.load(file)

        # NOTE(review): tokenizer_from_json takes a JSON *string*; this presumably
        # relies on the file holding a JSON-encoded string (i.e. the output of
        # Tokenizer.to_json() passed through json.dump) - confirm against the
        # code that writes tokenizer.json.
        return tf.keras.preprocessing.text.tokenizer_from_json(data)
43
+
44
class Model:
    """
    Groups the three saved Keras sub-models used for captioning.

    After construction:
        config: the parsed contents of "model_config.json".
        cnn: model loaded from the "cnn" sub-directory.
        cnn_projector: model loaded from the "cnn_projector" sub-directory.
        rnn_decoder: model loaded from the "decoder" sub-directory.
    """

    def __init__(self, path: "str | pathlib.Path"):
        """
        Loads the model config and the sub-models from a directory.

        Args:
            path (str | pathlib.Path): Directory containing "model_config.json"
                and the "cnn", "cnn_projector" and "decoder" saved models.
        """
        # The signature advertises ``str``, but the ``/`` joins below only work
        # on Path objects; normalise first so both kinds of argument are accepted.
        path = pathlib.Path(path)
        self.config = load_config(path / "model_config.json")
        self.cnn = self._load_model(path / "cnn")
        self.cnn_projector = self._load_model(path / "cnn_projector")
        self.rnn_decoder = self._load_model(path / "decoder")

    def _load_model(self, path: pathlib.Path) -> tf.keras.Model:
        """
        A helper function to load a saved Keras model from the given path.

        Args:
            path (pathlib.Path): The path to the saved model.

        Returns:
            tf.keras.Model: The loaded Keras model.
        """
        return load_model(path)

    def encode(self, images) -> tf.Tensor:
        """
        Encodes the input images and returns the encoded features.

        Args:
            images (tf.Tensor): The input images tensor.

        Returns:
            tf.Tensor: The encoded features tensor.
        """
        images_features = self.cnn(images)
        # Keep the leading (batch) axis and the axis-3 size, and flatten
        # everything in between into a single axis.
        reshaped_features = tf.reshape(
            images_features,
            (tf.shape(images_features)[0], -1, images_features.shape[3]),
        )
        encoded_features = self.cnn_projector(reshaped_features)

        return encoded_features

    def decode(self, decoder_inputs, encoded_features, hidden_states) -> dict:
        """
        Decodes the input and returns the logits, hidden states, and attention weights.

        Args:
            decoder_inputs (tf.Tensor): The decoder input tensor.
            encoded_features (tf.Tensor): The encoded features tensor.
            hidden_states (tf.Tensor): The hidden states tensor.

        Returns:
            dict: A dictionary with the keys "logits", "hidden_states" and
                "attention_weights". Callers must index it by key; it is not
                a tuple.
        """
        logits, hidden_states, attention_weights = self.rnn_decoder(
            [decoder_inputs, encoded_features, hidden_states]
        )

        return {"logits": logits, "hidden_states": hidden_states, "attention_weights": attention_weights}

    def __call__(self, images, decoder_inputs, hidden_states) -> dict:
        """
        Runs encode() on the images and then decode() on the result.

        Args:
            images (tf.Tensor): The input images tensor.
            decoder_inputs (tf.Tensor): The decoder input tensor.
            hidden_states (tf.Tensor): The hidden states tensor.

        Returns:
            dict: A dictionary containing the logits, hidden states, and attention weights.
        """
        encoded_features = self.encode(images)

        outputs = self.decode(decoder_inputs, encoded_features, hidden_states)

        return outputs
112
+
113
+
114
+ class ImageCaptioner():
115
+ """
116
+ A custom class that builds the full model from the smaller sub-models. It contains a CNN for feature extraction, a CNN encoder to encode the features to a suitable dimension,
117
+ an RNN decoder that contains an attention layer and RNN layer to generate text from the last predicted token + encoded image features.
118
+ """
119
def __init__(self, model_path: pathlib.Path, tokenizer_path, preprocessor):
    """
    Initializes the ImageCaptioner class with the given arguments.

    Args:
        model_path (pathlib.Path): Directory handed to Model (saved sub-models
            and model configuration).
        tokenizer_path: Directory handed to Tokenizer (tokenizer files and
            tokenizer configuration).
        preprocessor: Callable stored on the instance; predict() applies it
            to the resized images before encoding.
    """
    self.preprocessor = preprocessor

    # Model and Tokenizer join file names onto the directory with ``/``, which
    # needs Path objects; converting here lets callers pass plain strings too.
    self.tokenizer = Tokenizer(pathlib.Path(tokenizer_path))

    self.model = Model(pathlib.Path(model_path))
132
+
133
+ def predict(self, images, max_length, num_captions=5):
134
  """
135
  Generates a caption for the given image.
136
 
 
140
  Returns:
141
  A tuple containing the indices of the predicted tokens and the attention weights sequence.
142
  """
143
+ if not max_length or max_length > self.model.config['max_length']:
144
+ max_length = self.model.config['max_length']
145
+
146
+ images = tf.image.resize(images, self.model.config["image_size"])
147
+ images = self.preprocessor(images)
148
+
149
+ encoded_features = self.model.encode(images)
150
 
151
  # Get the RNN's initial state and start token for each new sample
152
  # hidden_state = tf.zeros((1, 512))
 
155
  # caption_probability = 1
156
  # predicted_tokens_indices = []
157
  # attention_weights_sequence = []
158
+ results = tf.Variable(tf.zeros(shape=(num_captions, max_length),dtype='int32'), )
159
+ scores = tf.ones(shape=(num_captions,))
 
160
  #hidden = decoder.get_initial_state(batch_size=1)
161
  #hiddens = self.rnn_decoder.get_initial_state(batch_size=n_captions)
162
+ hidden_states = tf.zeros((num_captions, self.model.config["num_hidden_units"]))
163
+ dec_inputs = tf.fill(dims=(n_captions,1), value=self.tokenizer_config['bos_token_id'])
 
 
164
  batch_indices = list(range(n_captions)) # batch size
165
+ for i in range(max_length):
166
+ logits, hidden_states, attention_weights = self.model.decode(decoder_inputs, encoded_features, hidden_states)
167
  predicted_ids = tf.random.categorical(logits, num_samples=1, dtype=tf.int32) # shape (batch_size,num_samples)
168
  predicted_ids = tf.squeeze(predicted_ids, axis=-1)
169
  #predicted_ids = tf.convert_to_tensor(predicted_ids, dtype=tf.int32)#tf.cast(predicted_ids, tf.int32)
 
185
  most_probable_sequence_id = int(tf.math.argmax(scores))
186
  best_caption = list(results[most_probable_sequence_id].numpy())
187
  print(best_caption)
188
+ eos_loc = best_caption.index(self.tokenizer_config['eos_token_id'])
189
  #caption_text = tokenizer.sequences_to_texts([best_caption[:eos_loc]])
190
 
191
  return best_caption[:eos_loc], None
 
199
  # break
200
  # decoder_input = tf.expand_dims([tf.cast(predicted_token_index, tf.int32)], 0)
201
 
202
+ # return predicted_tokens_indices, attention_weights_sequence