Upload 6 files
Browse files
- .gitignore +46 -0
- app.py +132 -0
- get_data.py +24 -0
- preprocess_images.py +158 -0
- requirements.txt +68 -0
- setup.py +10 -0
.gitignore
ADDED
@@ -0,0 +1,46 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
*.egg-info/
*.egg

# Flask stuff:
instance/
.webassets-cache

data/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Environments
.env/

2008.02693.pdf
images/original_demo.avi
images/better_quality_demo.gif
app.py
ADDED
@@ -0,0 +1,132 @@
from transformers import Blip2ForConditionalGeneration
from transformers import Blip2Processor
from peft import PeftModel
import streamlit as st
from PIL import Image
#import torch
import os

preprocess_ckp = "Salesforce/blip2-opt-2.7b" #Checkpoint used to preprocess the image
base_model_ckp = "./model/blip2-opt-2.7b-fp16-sharded" #Base model checkpoint path
peft_model_ckp = "./model/blip2_peft" #PEFT model checkpoint path
sample_img_path = "./sample_images"

map_sampleid_name = {
    'dress': '00fe223d-9d1f-4bd3-a556-7ece9d28e6fb.jpeg',
    'earrings': '0b3862ae-f89e-419c-bc1e-57418abd4180.jpeg',
    'sweater': '0c21ba7b-ceb6-4136-94a4-1d4394499986.jpeg',
    'sunglasses': '0e44ec10-e53b-473a-a77f-ac8828bb5e01.jpeg',
    'shoe': '4cd37d6d-e7ea-4c6e-aab2-af700e480bc1.jpeg',
    'hat': '69aeb517-c66c-47b8-af7d-bdf1fde57ed0.jpeg',
    'heels': '447abc42-6ac7-4458-a514-bdcd570b1cd1.jpeg',
    'socks': 'd188836c-b734-4031-98e5-423d5ff1239d.jpeg',
    'tee': 'e2d8637a-5478-429d-a2a8-3d5859dbc64d.jpeg',
    'bracelet': 'e78518ac-0f54-4483-a233-fad6511f0b86.jpeg'
}

def init_model(init_model_required):

    if init_model_required:

        #Preprocessor for the input image
        processor = Blip2Processor.from_pretrained(preprocess_ckp)

        #Model
        #Inference on a GPU device. Raises an error on a CPU-only system, as "load_in_8bit" is a bitsandbytes setting that only works on GPU
        #model = Blip2ForConditionalGeneration.from_pretrained(base_model_ckp, load_in_8bit = True, device_map = "auto")

        #Inference on a CPU device
        model = Blip2ForConditionalGeneration.from_pretrained(base_model_ckp)

        model = PeftModel.from_pretrained(model, peft_model_ckp)

        init_model_required = False

    return processor, model, init_model_required

#def main():

st.header("Automate Fashion Image Captioning using BLIP-2")
st.caption("The fashion industry is worth trillions of dollars. The goal of any company/seller is to help customers find the right product from a huge corpus of products.")
st.caption("When customers find the right product, they are likely to add the item to their cart, which helps the company's revenue.")
st.caption("Accurate and enchanting descriptions of clothes on shopping websites can help customers without fashion knowledge to better understand the features (attributes, style, functionality, etc.) of the items and increase online sales by enticing more customers.")
st.caption("Also, when customers visit shopping websites they are often looking for a certain style or type of clothes they wish to purchase. They search for the item by providing a description, and the system finds the relevant items that match the query by computing the similarity score between the query and the item caption.")
st.caption("Given a clothing image, generate a short caption that describes the item. Compared with general image captioning datasets (e.g., COCO, Flickr), the descriptions of fashion items have unique features that make automatic caption generation challenging: fashion captioning needs to describe the attributes of an item, while image captioning generally narrates the objects and their relations in the image.")
st.caption("Solution: Used Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models [(BLIP-2)](https://huggingface.co/Salesforce/blip2-opt-2.7b) by Salesforce. The original model size was too large; it was quite challenging to fit and fine-tune the model on a 16GB GPU.")
st.caption("So, for this project I downloaded the pre-trained model [ybelkada/blip2-opt-2.7b-fp16-sharded](https://huggingface.co/ybelkada/blip2-opt-2.7b-fp16-sharded). This model uses the OPT-2.7b LLM with precision reduced to float16.")

st.caption("For more detail: [Github link](https://github.com/SmithaUpadhyaya/fashion_image_caption)") #write

#Select a few sample images for the clothing categories
with st.form("app", clear_on_submit = True):

    st.caption("Select image:")

    option = 'None'
    option = st.selectbox('From sample', ('None', 'dress', 'earrings', 'sweater', 'sunglasses', 'shoe', 'hat', 'heels', 'socks', 'tee', 'bracelet'), index = 0)

    st.text("Or")

    file_name = None
    file_name = st.file_uploader(label = "Upload an image", accept_multiple_files = False)


    btn_click = st.form_submit_button('Generate')
    st.caption("Application deployed on CPU basic with 16GB RAM")

    if btn_click:

        image = None
        if file_name is not None:

            image = Image.open(file_name)

        elif option != 'None':

            file_name = os.path.join(sample_img_path, map_sampleid_name[option])
            image = Image.open(file_name)

        if image is not None:

            image_col, caption_text = st.columns(2)
            image_col.header("Image")
            caption_text.header("Generated Caption")
            image_col.image(image.resize((252,252)), use_column_width = True)
            caption_text.text("")

            if 'init_model_required' not in st.session_state:
                with st.spinner('Initializing model...'):

                    init_model_required = True
                    processor, model, init_model_required = init_model(init_model_required)

                    #Save the initialized model in session state
                    if 'init_model_required' not in st.session_state:
                        st.session_state.init_model_required = init_model_required
                        st.session_state.processor = processor
                        st.session_state.model = model
            else:
                processor = st.session_state.processor
                model = st.session_state.model

            with st.spinner('Generating Caption...'):

                #Preprocess the image
                #Inference on GPU. Running this on CPU raises errors like: '"slow_conv2d_cpu" not implemented for 'Half'', 'Input type (float) and bias type (struct c10::Half)'
                #inputs = processor(images = image, return_tensors = "pt").to('cuda', torch.float16)

                #Inference on CPU
                inputs = processor(images = image, return_tensors = "pt")

                pixel_values = inputs.pixel_values

                #Predict the caption for the image
                generated_ids = model.generate(pixel_values = pixel_values, max_length = 10)
                generated_caption = processor.batch_decode(generated_ids, skip_special_tokens = True)[0]

                #Output the predicted text
                caption_text.text(generated_caption)


#if __name__ == "__main__":
#    main()
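Note: app.py only loads the already-trained LoRA adapter from ./model/blip2_peft; the fine-tuning code lives in the linked GitHub repository and is not part of this upload. The snippet below is a minimal, illustrative sketch of how such an adapter could be produced with peft. The LoRA hyperparameters and target_modules are assumptions, not the values used for the released checkpoint, and load_in_8bit requires a GPU with bitsandbytes installed.

# Illustrative only - not part of this upload. Sketch of producing a LoRA adapter
# like ./model/blip2_peft. Hyperparameters and target_modules are assumptions.
from transformers import Blip2ForConditionalGeneration, Blip2Processor
from peft import LoraConfig, get_peft_model

processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")

# 8-bit loading needs a GPU with bitsandbytes; on CPU drop load_in_8bit/device_map.
model = Blip2ForConditionalGeneration.from_pretrained(
    "ybelkada/blip2-opt-2.7b-fp16-sharded", load_in_8bit=True, device_map="auto"
)

# Keep the base model frozen and attach small trainable LoRA matrices to the
# attention projections of the OPT language model.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "k_proj"],
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # only a small fraction of weights train

# ...run the usual training loop on (image, caption) batches here...

# Persist only the adapter weights; this folder is what app.py loads with PeftModel.
model.save_pretrained("./model/blip2_peft")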
get_data.py
ADDED
@@ -0,0 +1,24 @@
import src.process_json_data as pj
import argparse

"""
Script to download the images from their image URLs and convert the json file to a .parquet file
"""

if __name__ == '__main__':

    ap = argparse.ArgumentParser()
    ap.add_argument("-n", "--noofrecords", required = True, type = int, help = "Number of records to read from the json file")
    args = vars(ap.parse_args())

    read_records = args["noofrecords"]

    numbers_records = pj.read_data(read_records)

    if numbers_records == 0:

        print('No records to process found')

    else:

        print(f'Successfully processed {numbers_records} records of raw data and saved them to the "{pj.processed_data}" file')
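get_data.py drives src/process_json_data.py, which is not included in this upload. The sketch below shows one plausible shape for pj.read_data; the raw file path, JSON field names and output location are assumptions for illustration only.

# Hypothetical sketch of the src.process_json_data helper used above; the real
# module is not part of this upload. Paths and JSON field names are assumptions.
import json
import os
import pandas as pd
import requests

raw_data = "data/raw/captions.json"               # assumed raw metadata file (one JSON object per line)
image_folder = "data/images"                      # assumed download target
processed_data = "data/processed/processed_data.parquet"

def read_data(noofrecords):
    rows = []
    with open(raw_data, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            if i >= noofrecords:
                break
            item = json.loads(line)

            # download the image and store it locally as <id>.jpeg
            response = requests.get(item["imageurl"], timeout=30)
            image_name = f"{item['id']}.jpeg"
            with open(os.path.join(image_folder, image_name), "wb") as img:
                img.write(response.content)

            rows.append({"id": item["id"], "image_name": image_name, "caption": item["caption"]})

    if rows:
        pd.DataFrame(rows).to_parquet(processed_data, index=False)
    return len(rows)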
preprocess_images.py
ADDED
@@ -0,0 +1,158 @@
from multiprocessing import cpu_count
from multiprocessing import Pool
import src.process_image as pi
import pyarrow.parquet as pq
from tqdm import tqdm
import pandas as pd
import argparse
import math
import gc
import os

"""
########################## Script to pre-process image files by converting them to numpy arrays and saving the arrays in hdf5 format #####################
Working with limited RAM, converting all 18k images to numpy arrays and saving them to hdf5 in one go always caused an "OutOfMemory" error, and it also took too long.
Faced memory issues loading the entire .parquet file as well.

Solution taken:
-> Load the .parquet file:
    Read data from the parquet file in batches of N. Each batch of N records is sent for processing.
-> Process the images to numpy:
    Create shared memory based on the batch size and the space needed for one image, so shared memory size = N * image_size.
    The shared memory holds N images at a time; any more than that and the shared memory cannot be created, giving an OutOfMemory error.
    The images were large, so the numpy arrays converted from them are also large.
    While experimenting, a batch size of 300 was found to be ideal.
    The shared memory is created only once; after each batch of N it is cleaned up to store the next batch's data.
-> Create a pool of processes; each process writes to the shared memory.
-> At the end of each batch of N records a .hdf5 file is created, so by the end of the script there are (Num_Records/N) .hdf5 files.
-> Finally, the .hdf5 files are combined into a single .hdf5 file.
"""
###################################################################################################

#Since we do not know how many batches will be generated for a given dataset file, create a generator to loop through and show progress
def generator():
    while True:
        yield

if __name__ == "__main__":

    # construct the argument parser and parse the arguments
    ap = argparse.ArgumentParser()
    #ap.add_argument("-i", "--images", required = True, type = str, help = "base path to the directory containing the images")
    ap.add_argument("-d", "--proc_data", required = False, type = str, default = 'data\\processed\\test_data.parquet', help = "filename with path of the dataset whose images need to be converted to numpy arrays")
    ap.add_argument("-p", "--procs", type = int, default = -1, help = "# of processes to spin up")
    #ap.add_argument("-r", "--datarng", type = str, default = '0:100', help = "range of records to process")
    ap.add_argument("-b", "--batch_size", type = int, default = 250, help = "chunk/batch size to read data from the file")
    ap.add_argument("-a", "--action_type", type = int, default = 1, help = "action the script will perform. 1-> convert images to numpy and combine files. 2-> combine the files. 3-> convert images to numpy")
    args = vars(ap.parse_args())

    # determine the number of concurrent processes to launch when
    # distributing the load across the system, then create the list
    # of process IDs
    NUM_WORKERS = args["procs"] if args["procs"] > 0 else cpu_count()

    #IMAGE_BASE_FOLDER = args["images"] #data\\images. Since we run parallel processes, we cannot init this variable per process, so it is defined globally
    DATA_TO_PROCESS = args["proc_data"] #data\\processed\\test_data.parquet

    BATCH_SIZE = args["batch_size"] #Batch size

    ACTION_TYPE = args["action_type"]
    shared_memory_created = False

    try:

        if (ACTION_TYPE == 1) or (ACTION_TYPE == 3):

            print(f'[INFO]: Process to read {DATA_TO_PROCESS}, convert images to numpy arrays and store them. Started...')

            # When working with the large valid (1902 records)/train (18k records) sets, allocating shared memory failed.
            # It gave "[WinError 1450] Insufficient system resources exist to complete the requested service".
            # To overcome the issue, split the records into ranges and then merge them, i.e. pass/read the data in batches.

            """
            # Code that reads a range from the input argument, then reads the entire dataset and slices the range.
            # But in this approach the entire file is loaded first, which takes memory space, and the script fails to allocate shared memory.
            # Method 1:
            RANGE = args["datarng"]
            start = int(RANGE.split(':')[0])
            end = int(RANGE.split(':')[1])

            data = pd.read_parquet(DATA_TO_PROCESS)

            if (start > 0) and (end > 0) and (start < end):
                data = data[start:end]
            """
            #Method 2: Read the data in batches using pyarrow
            parquet_file = pq.ParquetFile(DATA_TO_PROCESS)
            number_of_records = parquet_file.metadata.num_rows
            start = 0
            end = 0
            number_of_batch = math.ceil(number_of_records/BATCH_SIZE)

            with tqdm(total = number_of_batch) as pbar:

                for i in parquet_file.iter_batches(batch_size = BATCH_SIZE):

                    end += BATCH_SIZE
                    RANGE = str(start) + ':' + str(end)
                    data = i.to_pandas()
                    print(f'[INFO]: Processing of data range {RANGE} started.')

                    img_shm, img_id_shm = pi.create_shared_memory_nparray(data.shape[0], shared_memory_created)
                    shared_memory_created = True
                    print('[INFO]: Successfully created shared memory resource.')

                    process_args = list(zip(range(0, data.shape[0]), data['id'], data['image_name'], [data.shape[0]] * data.shape[0]))

                    print('[INFO]: Starting Pool process...')
                    with Pool(NUM_WORKERS) as pror_pool:

                        #tqdm with pool was not helpful
                        #for _ in tqdm(pror_pool.map(pi.process_images, process_args), total = data.shape[0]):
                        #    pass
                        pror_pool.map(pi.process_images, process_args)

                    print('[INFO]: Started saving data in hdf5 format...')
                    hdf5_filename, filename = os.path.split(DATA_TO_PROCESS)

                    hdf5_filename = os.path.join(hdf5_filename, filename.split('.')[0] + '_' + RANGE.replace(':','_') + '.h5')
                    pi.save_to_hdf(hdf5_filename, data.shape[0])

                    print(f'[INFO]: Processing of data range {RANGE} completed.')
                    start = end
                    del [data]
                    pbar.update(1)


            print('[INFO]: Process to convert images to numpy arrays and store them in separate files. Completed.')

        if (ACTION_TYPE == 1) or (ACTION_TYPE == 2):

            print('[INFO]: Combining multiple hdf5 files into one started...')

            path, name = os.path.split(DATA_TO_PROCESS)
            name = name.split('.')[0]
            pi.combine_multiple_hdf(name, path)


    except Exception as e:

        print(f'Error Occurred: {e}')

    finally:

        if shared_memory_created:
            pi.release_shared_memory()

        gc.collect()

        print('[INFO]: Script execution completed.')

    ############################################################
    #Sample commands
    #python preprocess_images.py -d data\processed\test_data.parquet
    #python preprocess_images.py -d data\processed\validate_data.parquet -r 0:100
    #python preprocess_images.py -d data\processed\validate_data.parquet
    #python preprocess_images.py -d data\processed\validate_data.parquet -b 300
    #python preprocess_images.py -d data\processed\train_data.parquet -b 300 -a 2
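preprocess_images.py calls helpers from src/process_image.py (create_shared_memory_nparray, process_images, save_to_hdf, combine_multiple_hdf, release_shared_memory) that are not part of this upload. Below is a minimal sketch of how the shared-memory pieces could be implemented with multiprocessing.shared_memory (Python 3.8+); the image shape, dtype and block name are assumptions, and the hdf5 helpers are omitted.

# Hypothetical sketch of the shared-memory helpers in src.process_image; the real
# module is not part of this upload. Shape, dtype and block name are assumptions.
from multiprocessing import shared_memory
import numpy as np

IMG_SHAPE = (224, 224, 3)   # assumed per-image array shape
IMG_DTYPE = np.uint8
SHM_NAME = "img_shm"

def create_shared_memory_nparray(batch_size, already_created):
    """Allocate (or reattach to) one shared block large enough for batch_size images."""
    nbytes = batch_size * int(np.prod(IMG_SHAPE)) * np.dtype(IMG_DTYPE).itemsize
    if already_created:
        shm = shared_memory.SharedMemory(name=SHM_NAME)
    else:
        shm = shared_memory.SharedMemory(create=True, size=nbytes, name=SHM_NAME)
    return shm, None  # second slot could hold a separate block for image ids

def process_images(args):
    """Worker: decode one image and write it into its slot of the shared block."""
    idx, image_id, image_name, batch_size = args
    shm = shared_memory.SharedMemory(name=SHM_NAME)
    batch = np.ndarray((batch_size,) + IMG_SHAPE, dtype=IMG_DTYPE, buffer=shm.buf)
    # ...load and resize the image here (e.g. with PIL or OpenCV), then:
    batch[idx] = np.zeros(IMG_SHAPE, dtype=IMG_DTYPE)  # placeholder for the decoded image
    shm.close()

def release_shared_memory():
    """Detach from and destroy the shared block once all batches are done."""
    shm = shared_memory.SharedMemory(name=SHM_NAME)
    shm.close()
    shm.unlink()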
requirements.txt
ADDED
@@ -0,0 +1,68 @@
# local package
-e .

#Python Version 3.7.9
# external requirements

#General libraries
pandas==1.3.5
numpy==1.20
tqdm

pyarrow==8.0.0 #Required to work with the parquet file type. This is one engine used for parquet files
fastparquet==0.8.1 #Required to work with the parquet file type. This is another engine used for parquet files

#Download the images from the web using the image URL
requests==2.28.2

#Not required since in the end worked with jpeg files only:
#found that hdf5 was large and reading image data from it took much longer than reading a jpeg file per image
#h5py==3.8.0 #Read and save numpy data in hdf5

#Libraries for EDA
ipywidgets
opencv-python==4.5.4.60
matplotlib==3.5.3
seaborn==0.12.2
WordCloud

#Text pre-processing libraries
#pycontractions #Does not work with py3.9, so copied the code from their GitHub
nltk==3.8.1

#Model libraries
scikit-learn
#tensorflow==2.11.0 #The final BLIP-2 model has no TF-compatible version in HuggingFace, so went with PyTorch

#Libraries required for the BLIP-2 and VisionEncoderDecoder models
rouge_score==0.1.2
accelerate==0.20.3
transformers==4.30.2 #Help: https://huggingface.co/docs/transformers/v4.20.1/en/installation#installation
datasets==2.13.0 #HuggingFace datasets, Help: https://huggingface.co/docs/datasets/installation
evaluate==0.4.0 #HuggingFace evaluate library, Help: https://huggingface.co/docs/evaluate/index
peft==0.4.0 #0.4.0 was a dev version. In case the version is not released, install using: pip install -q git+https://github.com/huggingface/peft.git
bitsandbytes==0.39.0
torch==2.0.0 #PyPI package name is "torch", not "pytorch"

#Deploy
streamlit==1.16.0

#Step 1: Create a virtual environment with VSCode inside the project folder
#py -<<python_version>> -m venv <<your_environment_name>>

#Step 2: Activate the virtual environment by calling Activate.bat
#\<<your_environment_name>>\Scripts\Activate.bat

#Step 3: Select this environment as the interpreter in VSCode
#-> Ctrl+Shift+P
#-> Select from the drop down or type: "Python: Select Interpreter"
#-> Select "Enter interpreter path..."
#-> Select "Find..." and browse to the new environment folder we created, then select "\Scripts\python.exe"

#Step 4 [Optional]: Upgrade pip in your environment
#-> Open the terminal
#-> The terminal should show <<your_environment_name>> in the command line. If not, execute Step 2 again
#-> pip install pip --upgrade

#Step 5: Install the requirements
#pip install -r requirements.txt
setup.py
ADDED
@@ -0,0 +1,10 @@
from setuptools import find_packages, setup

setup(
    name = 'Fashion_Image_Captioning',
    packages = find_packages(),
    version = '2.1.2',
    description = 'Fashion Image Captioning trained using the HuggingFace Transformers library, PEFT and LoRA',
    author = 'USmitha',
    license = 'MIT',
)