VenkateshRoshan committed
Commit 3138612 · 1 Parent(s): be7ebcc
Training script is now working
Browse files
- config/__pycache__/config.cpython-310.pyc  +0 -0
- config/config.py  +2 -1
- data/__pycache__/dataLoader.cpython-310.pyc  +0 -0
- data/dataLoader.py  +53 -46
- main.py  +0 -14
- models/__pycache__/model.cpython-310.pyc  +0 -0
- models/model.py  +56 -0
- train.py  +78 -0
config/__pycache__/config.cpython-310.pyc
CHANGED
Binary files a/config/__pycache__/config.cpython-310.pyc and b/config/__pycache__/config.cpython-310.pyc differ
config/config.py
CHANGED
@@ -8,4 +8,5 @@ class Config:
     EPOCHS = 10
     DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
     AWS_S3_BUCKET = 'your-s3-bucket-name'
-    DATASET_PATH = '../Datasets/Flickr8K/'
+    DATASET_PATH = '../Datasets/Flickr8K/'
+    BATCH_SIZE = 32
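model.py and train.py below also read Config.VIT_MODEL, Config.GPT2_MODEL, Config.MAX_SEQ_LEN and Config.LEARNING_RATE, which fall outside this hunk. A minimal sketch of what the full Config plausibly looks like; the model names and numeric values marked as assumptions are illustrative, not taken from this commit:

    import torch

    class Config:
        VIT_MODEL = 'google/vit-base-patch16-224'  # assumption: any ViT checkpoint name
        GPT2_MODEL = 'gpt2'                        # assumption
        MAX_SEQ_LEN = 128                          # assumption
        LEARNING_RATE = 1e-4                       # assumption
        EPOCHS = 10
        DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
        AWS_S3_BUCKET = 'your-s3-bucket-name'
        DATASET_PATH = '../Datasets/Flickr8K/'
        BATCH_SIZE = 32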
data/__pycache__/dataLoader.cpython-310.pyc
CHANGED
Binary files a/data/__pycache__/dataLoader.cpython-310.pyc and b/data/__pycache__/dataLoader.cpython-310.pyc differ
data/dataLoader.py
CHANGED
@@ -1,52 +1,59 @@
-import numpy as np
 import os
+import pandas as pd
 from PIL import Image
 from torchvision import transforms
+from torch.utils.data import Dataset
+
+class ImageCaptionDataset(Dataset):
+    """
+    Custom PyTorch Dataset class to handle loading and transforming image-caption pairs
+    where image paths and captions are provided in a CSV file.
+
+    Attributes:
+        caption_file (str): Path to the CSV file containing image paths and captions.
+        transform (torchvision.transforms.Compose): Transformations to apply on the images.
+    """

+    def __init__(self, caption_file: str, file_path: str, transform=None):
+        """
+        Initialize dataset with caption CSV file and optional transform.
+
+        Args:
+            caption_file (str): Path to the CSV file where each row has an image path and caption.
+            transform (callable, optional): Optional transform to apply on an image.
+        """
+        self.df = pd.read_csv(caption_file)
+        self.image_path = file_path
+        self.transform = transform or transforms.Compose([
+            transforms.Resize((224, 224)),  # Resize to 224x224 for ViT
+            transforms.ToTensor(),          # Convert to tensor
+            # Normalize to have values in the range [0, 1]
+            # transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
         ])

-    def get_all(self):
-        images = []
-        captions = []
-        for img_name in self.img_list:
-            images.append(self.get_image(img_name))
-            captions.append(self.get_caption(img_name))
-        return images, captions
+    def __len__(self):
+        """
+        Return the total number of samples in the dataset.
+        """
+        return len(self.df)
+
+    def __getitem__(self, idx):
+        """
+        Retrieve an image and its corresponding caption by index.
+
+        Args:
+            idx (int): Index of the data item.
+
+        Returns:
+            tuple: (image, caption) where image is the transformed image tensor and caption is the associated text.
+        """
+        img_path = self.df.iloc[idx, 0]  # The first column contains image paths
+        caption = self.df.iloc[idx, 1]   # The second column contains captions
+        # Load image
+        image = Image.open(self.image_path + img_path).convert('RGB')
+
+        # Apply transformations to the image
+        if self.transform:
+            image = self.transform(image)
+
+        return image, caption
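The new ImageCaptionDataset expects a CSV whose first column holds image filenames (relative to file_path) and whose second column holds the caption text. A quick smoke test, assuming captions.csv and an images/ folder under Config.DATASET_PATH as train.py wires them up:

    from torch.utils.data import DataLoader
    from config.config import Config
    from data.dataLoader import ImageCaptionDataset

    dataset = ImageCaptionDataset(
        caption_file=Config.DATASET_PATH + 'captions.csv',
        file_path=Config.DATASET_PATH + 'images/',
    )
    image, caption = dataset[0]   # single (tensor, str) pair
    print(image.shape)            # torch.Size([3, 224, 224]) after Resize + ToTensor

    loader = DataLoader(dataset, batch_size=Config.BATCH_SIZE, shuffle=True)
    batch_images, batch_captions = next(iter(loader))
    print(batch_images.shape, len(batch_captions))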
main.py
DELETED
@@ -1,14 +0,0 @@
-import numpy as np
-import os
-import cv2
-from PIL import Image
-from matplotlib import pyplot as plt
-
-from config.config import Config
-from data.dataLoader import dataLoader
-
-if __name__ == '__main__':
-    dl = dataLoader(Config.DATASET_PATH)
-    images, captions = dl.get_all()
-    print('Number of images:', len(images))
-    print('Number of captions:', len(captions))
models/__pycache__/model.cpython-310.pyc
ADDED
Binary file (2.69 kB).
models/model.py
ADDED
@@ -0,0 +1,56 @@
+import torch
+from transformers import ViTModel, ViTFeatureExtractor, GPT2LMHeadModel, GPT2Tokenizer
+from config.config import Config
+from torchsummary import summary
+from torchvision import transforms
+
+class ImageCaptioningModel:
+    def __init__(self):
+        """Initialize the ViT and GPT-2 models for image captioning."""
+        self.device = Config.DEVICE
+        self.vit_model = ViTModel.from_pretrained(Config.VIT_MODEL).to(self.device)
+        self.feature_extractor = ViTFeatureExtractor.from_pretrained(Config.VIT_MODEL)
+        self.gpt2_model = GPT2LMHeadModel.from_pretrained(Config.GPT2_MODEL).to(self.device)
+        self.tokenizer = GPT2Tokenizer.from_pretrained(Config.GPT2_MODEL)
+        self.tokenizer.pad_token = self.tokenizer.eos_token
+
+    def extract_image_features(self, images):
+        """Extract features from images using ViT."""
+        pixel_values = self.feature_extractor(images=images, return_tensors="pt", do_rescale=False).pixel_values.to(self.device)
+        with torch.no_grad():
+            outputs = self.vit_model(pixel_values)
+        return outputs.last_hidden_state[:, 0, :]  # [batch_size, hidden_size]
+
+    def prepare_gpt2_inputs(self, image_features, captions):
+        """Prepare GPT-2 inputs."""
+        # Tokenize the captions
+        tokenized_captions = self.tokenizer(captions, padding="longest", truncation=True,
+                                            max_length=Config.MAX_SEQ_LEN, return_tensors="pt").to(self.device)
+
+        # Get the word embeddings for the tokens
+        token_embeddings = self.gpt2_model.transformer.wte(tokenized_captions['input_ids'])
+
+        # Concatenate image features with token embeddings
+        image_features = image_features.unsqueeze(1)  # Reshape to [batch_size, 1, hidden_size]
+        inputs_embeds = torch.cat((image_features, token_embeddings), dim=1)  # Concatenate along the sequence dimension
+
+        # Adjust input_ids to account for the image feature token
+        batch_size = image_features.shape[0]
+        image_token_id = torch.full((batch_size, 1), fill_value=self.tokenizer.bos_token_id, device=self.device)
+        input_ids = torch.cat((image_token_id, tokenized_captions['input_ids']), dim=1)
+
+        # Adjust attention_mask to account for the image feature token
+        image_attention = torch.ones((batch_size, 1), device=self.device)
+        attention_mask = torch.cat((image_attention, tokenized_captions['attention_mask']), dim=1)
+
+        return inputs_embeds, input_ids, attention_mask
+
+    def save(self, path):
+        """Save model to disk."""
+        self.gpt2_model.save_pretrained(path)
+        self.tokenizer.save_pretrained(path)
+
+    def load(self, path):
+        """Load model from disk."""
+        self.gpt2_model = GPT2LMHeadModel.from_pretrained(path).to(self.device)
+        self.tokenizer = GPT2Tokenizer.from_pretrained(path).to(self.device)
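One caveat in load(): Hugging Face tokenizers are plain Python objects rather than torch modules, so GPT2Tokenizer.from_pretrained(path).to(self.device) will raise an AttributeError when called. A minimal corrected sketch, leaving the rest of the class as committed:

    def load(self, path):
        """Load model and tokenizer from disk; only the model is moved to the device."""
        self.gpt2_model = GPT2LMHeadModel.from_pretrained(path).to(self.device)
        self.tokenizer = GPT2Tokenizer.from_pretrained(path)  # tokenizers have no .to()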
train.py
ADDED
@@ -0,0 +1,78 @@
+import torch
+import numpy as np
+from tqdm import tqdm
+
+from torch.utils.data import DataLoader
+from data.dataLoader import ImageCaptionDataset
+from config.config import Config
+from models.model import ImageCaptioningModel
+
+from torchsummary import summary
+
+
+def train_model(model, dataLoader, optimizer, loss_fn):
+
+    model.gpt2_model.train()
+    for epoch in range(Config.EPOCHS):
+        epoch_loss = 0
+        for batch_idx, (images, captions) in tqdm(enumerate(dataLoader)):
+            print(f'\rBatch {batch_idx + 1}/{len(dataLoader)} , Loss : {epoch_loss/(batch_idx+1):.4f}\t', end='')
+            images = images.to(Config.DEVICE)
+            captions = [caption for caption in captions]
+
+            # extract image features
+            image_features = model.extract_image_features(images)
+            # print("Image Features shape:", image_features.shape)
+            input_embeds, input_ids, attention_mask = model.prepare_gpt2_inputs(image_features, captions)
+
+            # print("Input Embeds shape:", input_embeds.shape)
+            # print("Input IDs shape:", input_ids.shape)
+            # print("Attention Mask shape:", attention_mask.shape)
+            # Match Inputs Embeds and Input Ids and Attention Masks
+            assert input_embeds.shape[1] == input_ids.shape[1] == attention_mask.shape[1]
+
+            optimizer.zero_grad()
+            outputs = model.gpt2_model(inputs_embeds=input_embeds, labels=input_ids, attention_mask=attention_mask)
+
+            loss = outputs.loss
+            loss.backward()
+            optimizer.step()
+
+            epoch_loss += loss.item()
+
+        print(f'Epoch {epoch + 1}, Loss: {epoch_loss:.4f}')
+
+    # Save the model
+    model.save('model')
+
+    # return model
+
+
+if __name__ == '__main__':
+    # Initialize dataset using the CSV file
+    dataset = ImageCaptionDataset(
+        caption_file=Config.DATASET_PATH + 'captions.csv',  # Path to captions CSV file
+        file_path=Config.DATASET_PATH + '/images/',         # Path to images folder
+    )
+
+    # Create DataLoader for batch processing
+    dataloader = DataLoader(
+        dataset,
+        batch_size=Config.BATCH_SIZE,  # Specify the batch size
+        shuffle=True,                  # Shuffle the data
+        num_workers=4                  # Number of subprocesses for data loading
+    )
+
+    # # Iterate over the dataloader
+    # for batch_idx, (images, captions) in enumerate(dataloader):
+    #     print(f'Batch {batch_idx + 1}:')
+    #     print(f'Images shape: {images.shape}')
+    #     print(f'Captions: {captions}')
+    #     # Pass 'images' and 'captions' to your model for training/validation
+
+    # Initialize the ImageCaptioningModel
+    model = ImageCaptioningModel()
+    optimizer = torch.optim.Adam(model.gpt2_model.parameters(), lr=Config.LEARNING_RATE)
+    loss_fn = torch.nn.CrossEntropyLoss()
+    train_model(model, dataloader, optimizer, loss_fn)
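Because the tokenizer's pad_token is set to eos_token, padded caption positions currently count toward the loss as EOS targets. A common refinement, not part of this commit, is to mask those positions with the -100 ignore index before handing labels to GPT-2; a sketch against the tensors returned by prepare_gpt2_inputs:

    # inside the batch loop, after prepare_gpt2_inputs(...)
    labels = input_ids.clone()
    labels[attention_mask == 0] = -100  # positions labelled -100 are ignored by the LM loss
    outputs = model.gpt2_model(inputs_embeds=input_embeds,
                               labels=labels,
                               attention_mask=attention_mask)
    loss = outputs.loss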