Spaces:

Upyaya
/

Fashion-Image-Captioning-using-BLIP-2

Paused

App Files Files

Upyaya committed on Jun 27, 2023

Commit

fd469c3

•

1 Parent(s): d173f19

Uploaded file not generating caption

Browse files

Add a progress bar to help understand why an uploaded image fails to generate a caption

Files changed (1) hide show

app.py +60 -27

app.py CHANGED Viewed

@@ -53,50 +53,83 @@ def main():
     st.caption("Accurate and enchanting descriptions of clothes on shopping websites can help customers without fashion knowledge to better understand the features (attributes, style, functionality, etc.) of the items and increase online sales by enticing more customers.")
     st.caption("Also, most of the time when any customer visits shopping websites, they are looking for a certain style or type of clothes that wish to purchase, they search for the item by providing a description of the item and the system finds the relevant items that match the search query by computing the similarity score between the query and the item caption.")
     st.caption("Given the clothes image provide a short caption that describes the item. In general, in image captioning datasets (e.g., COCO, Fliker), the descriptions of fashion items have three unique features, which makes the automatic generation of captions a challenging task. First, fashion captioning needs to describe the attributes of an item, while image captioning generally narrates the objects and their relations in the image.")
-    st.caption("Solution: Used Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models (BLIP-2)")
-    st.write("Github: [link](https://github.com/SmithaUpadhyaya/fashion_image_caption)")
     processor, model = init_model()
     #Select few sample images for the catagory of cloths
-    st.text("Select image:")
     option = st.selectbox('From sample', ('None', 'dress', 'earrings', 'sweater', 'sunglasses', 'shoe', 'hat', 'heels', 'socks', 'tee', 'bracelet'), index = 0)
     st.text("Or")
     file_name = st.file_uploader(label = "Upload an image", accept_multiple_files = False)
-    image = None
-    if file_name is not None:
-        image = Image.open(file_name)
-    elif option is not 'None':
-        file_name = os.path.join(sample_img_path, map_sampleid_name[option])
-        image = Image.open(file_name)
-    if image is not None:
-        image_col, caption_text = st.columns(2)
-        image_col.header("Image")
-        image_col.image(image, use_column_width = True)
-        #Preprocess the image
-        #Inferance on GPU. When used this on GPU will get errors like: "slow_conv2d_cpu" not implemented for 'Half'" , " Input type (float) and bias type (struct c10::Half)"
-        #inputs = processor(images = image, return_tensors = "pt").to('cuda', torch.float16)
-        #Inferance on CPU
-        inputs = processor(images = image, return_tensors = "pt")
-        pixel_values = inputs.pixel_values
-        #Predict the caption for the imahe
-        generated_ids = model.generate(pixel_values = pixel_values, max_length = 25)
-        generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-        #Output the predict text
-        caption_text.header("Generated Caption")
-        caption_text.text(generated_caption)
 if __name__ == "__main__":

     st.caption("Accurate and enchanting descriptions of clothes on shopping websites can help customers without fashion knowledge to better understand the features (attributes, style, functionality, etc.) of the items and increase online sales by enticing more customers.")
     st.caption("Also, most of the time when any customer visits shopping websites, they are looking for a certain style or type of clothes that wish to purchase, they search for the item by providing a description of the item and the system finds the relevant items that match the search query by computing the similarity score between the query and the item caption.")
     st.caption("Given the clothes image provide a short caption that describes the item. In general, in image captioning datasets (e.g., COCO, Fliker), the descriptions of fashion items have three unique features, which makes the automatic generation of captions a challenging task. First, fashion captioning needs to describe the attributes of an item, while image captioning generally narrates the objects and their relations in the image.")
+    st.caption("Solution: Used Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models [(BLIP-2)](https://huggingface.co/Salesforce/blip2-opt-6.7b) by Salesforce")
+    st.write("For more detail: [Github link](https://github.com/SmithaUpadhyaya/fashion_image_caption)")
+    footer = """<style>
+            a:link , a:visited{
+            color: blue;
+            background-color: transparent;
+            text-decoration: underline;
+            }
+            a:hover,  a:active {
+            color: red;
+            background-color: transparent;
+            text-decoration: underline;
+            }
+            .footer {
+            position: fixed;
+            left: 0;
+            bottom: 0;
+            width: 100%;
+            background-color: white;
+            color: black;
+            text-align: center;
+            }
+            </style>
+            <div class="footer">
+            <p>Application deployed on CPU with 16GB RAM</p>
+            </div>
+            """
+    st.markdown(footer,unsafe_allow_html=True)
     processor, model = init_model()
     #Select few sample images for the catagory of cloths
+    st.caption("Select image:")
     option = st.selectbox('From sample', ('None', 'dress', 'earrings', 'sweater', 'sunglasses', 'shoe', 'hat', 'heels', 'socks', 'tee', 'bracelet'), index = 0)
     st.text("Or")
     file_name = st.file_uploader(label = "Upload an image", accept_multiple_files = False)
+    btn_click = st.button('Generate')
+    if btn_click:
+        image = None
+        if file_name is not None:
+            image = Image.open(file_name)
+        elif option is not 'None':
+            file_name = os.path.join(sample_img_path, map_sampleid_name[option])
+            image = Image.open(file_name)
+        if image is not None:
+            with st.spinner('Generating Caption...'):
+                image_col, caption_text = st.columns(2)
+                image_col.header("Image")
+                image_col.image(image, use_column_width = True)
+                #Preprocess the image
+                #Inferance on GPU. When used this on GPU will get errors like: "slow_conv2d_cpu" not implemented for 'Half'" , " Input type (float) and bias type (struct c10::Half)"
+                #inputs = processor(images = image, return_tensors = "pt").to('cuda', torch.float16)
+                #Inferance on CPU
+                inputs = processor(images = image, return_tensors = "pt")
+                pixel_values = inputs.pixel_values
+                #Predict the caption for the imahe
+                generated_ids = model.generate(pixel_values = pixel_values, max_length = 25)
+                generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+                #Output the predict text
+                caption_text.header("Generated Caption")
+                caption_text.text(generated_caption)
 if __name__ == "__main__":