pain committed on
Commit 58f7258 · verified · 1 Parent(s): 1664c3c

Upload 7 files

Files changed (5)
  1. .gitignore +16 -0
  2. app.py +46 -37
  3. logo_araclip.png +0 -0
  4. requirements.txt +95 -3
  5. utils.py +9 -21
.gitignore ADDED
@@ -0,0 +1,16 @@
+ cashed_pickles/*
+ photos/*
+ .env/*
+ */__pycache__/*
+ .gradio/*
+ */.ipynb_checkpoints/*
+ */.vscode/*
+ */.git/*
+ */.gitignore
+ */.gitattributes
+ */.gitmodules
+ */.gitkeep
+ */.gitlab-ci.yml
+ */.gitlab/*
+ */.github/*
+ */
app.py CHANGED
@@ -12,8 +12,8 @@ with gr.Blocks() as demo_araclip:
 
  gr.Markdown("## Input parameters")
 
- txt = gr.Textbox(label="Text Query (Caption)")
- num = gr.Slider(label="Number of retrieved image", value=1, minimum=1)
+ txt = gr.Textbox(label="Text Query")
+ num = gr.Slider(label="Number of retrieved image", value=1, minimum=1, step=1)
 
 
  with gr.Row():
@@ -22,26 +22,15 @@ with gr.Blocks() as demo_araclip:
  gr.Markdown("## Retrieved Images")
 
  gallery = gr.Gallery(
- label="Generated images", show_label=True, elem_id="gallery"
+ show_label=False, elem_id="gallery"
  , columns=[5], rows=[1], object_fit="contain", height="auto")
 
 
  with gr.Row():
- lables = gr.Label(label="Text image similarity")
+ lables = gr.Label(label="Text-image similarity")
 
- with gr.Row():
-
- with gr.Column(scale=1):
- gr.Markdown("<div style='text-align: center; font-size: 24px; font-weight: bold;'>Data Retrieved based on Images Similarity</div>")
-
- json_output = gr.JSON()
-
- with gr.Column(scale=1):
- gr.Markdown("<div style='text-align: center; font-size: 24px; font-weight: bold;'>Data Retrieved based on Text similarity</div>")
- json_text = gr.JSON()
-
-
- btn.click(utils.predict, inputs=[txt, num, dadtaset_select], outputs=[gallery,lables, json_output, json_text])
+
+ btn.click(utils.predict, inputs=[txt, num, dadtaset_select], outputs=[gallery,lables])
 
 
  gr.Examples(
@@ -49,7 +38,7 @@ with gr.Blocks() as demo_araclip:
  ["وقوف قطة بمخالبها على فأرة حاسوب على المكتب", 10],
  ["صحن به شوربة صينية بالخضار، وإلى جانبه بطاطس مقلية وزجاجة ماء", 7]],
  inputs=[txt, num, dadtaset_select],
- outputs=[gallery,lables, json_output, json_text],
+ outputs=[gallery,lables],
  fn=utils.predict,
  cache_examples=False,
  )
@@ -64,8 +53,8 @@ with gr.Blocks() as demo_mclip:
 
  gr.Markdown("## Input parameters")
 
- txt = gr.Textbox(label="Text Query (Caption)")
- num = gr.Slider(label="Number of retrieved image", value=1, minimum=1)
+ txt = gr.Textbox(label="Text Query")
+ num = gr.Slider(label="Number of retrieved image", value=1, minimum=1, step=1)
 
  with gr.Row():
  btn = gr.Button("Retrieve images", scale=1)
@@ -79,37 +68,57 @@ with gr.Blocks() as demo_mclip:
 
  lables = gr.Label()
 
- with gr.Row():
-
- with gr.Column(scale=1):
- gr.Markdown("## Images Retrieved")
- json_output = gr.JSON()
-
- with gr.Column(scale=1):
- gr.Markdown("## Text Retrieved")
- json_text = gr.JSON()
-
- btn.click(utils.predict_mclip, inputs=[txt, num, dadtaset_select], outputs=[gallery,lables, json_output, json_text])
+ btn.click(utils.predict_mclip, inputs=[txt, num, dadtaset_select], outputs=[gallery,lables])
 
  gr.Examples(
  examples=[["تخطي لاعب فريق بيتسبرج بايرتس منطقة اللوحة الرئيسية في مباراة بدوري البيسبول", 5],
  ["وقوف قطة بمخالبها على فأرة حاسوب على المكتب", 10],
  ["صحن به شوربة صينية بالخضار، وإلى جانبه بطاطس مقلية وزجاجة ماء", 7]],
  inputs=[txt, num, dadtaset_select],
- outputs=[gallery,lables, json_output, json_text],
+ outputs=[gallery,lables],
  fn=utils.predict_mclip,
  cache_examples=False,
  )
 
 
+ # Define custom CSS to increase the size of the tabs
+ custom_css = """
+ .gr-tabbed-interface .gr-tab {
+ font-size: 50px; /* Increase the font size */
+ padding: 10px; /* Increase the padding */
+ }
+ """
+
  # Group the demos in a TabbedInterface
  with gr.Blocks() as demo:
 
- gr.Markdown("<font color=red size=10><center>AraClip: Arabic Image Retrieval Application</center></font>")
-
- gr.TabbedInterface([demo_araclip, demo_mclip], ["Our Model", "Mclip model"])
-
-
+ # gr.Image("statics/logo_araclip.png")
+ gr.Markdown("""
+ <center> <img src="/file=statics/logo_araclip.png" alt="Imgur" style="width:200px"></center>
+ """)
+ gr.Markdown("<center> <font color=red size=10>AraClip: Arabic Image Retrieval Application</font></center>")
+
+ gr.Markdown("""
+ <font size=4> To run the demo 🤗, please select the model, then the dataset you would like to search in, enter a text query, and specify the number of retrieved images.</font>
+
+ """)
+
+
+
+ gr.TabbedInterface([demo_araclip, demo_mclip], ["Our Model", "Mclip model"], css=custom_css)
+
+ gr.Markdown(
+ """
+ If you find this work helpful, please help us to ⭐ the repositories in <a href='https://github.com/Arabic-Clip' target='_blank'>Github Organization</a>. Thank you!
+
+ ---
+ 📝 **Citation**
+
+ To be shared soon.
+
+ 📋 **License**
+ """
+ )
  if __name__ == "__main__":
 
  demo.launch()
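For readers skimming the diff, here is a minimal, self-contained Gradio sketch of the wiring app.py moves to after this commit: the predict callback now feeds only a gallery and a label dictionary, with the two JSON panels removed. The dummy_predict function, the Radio choices, and the generated placeholder images are illustrative assumptions, not code from the repository.

import gradio as gr
import numpy as np

def dummy_predict(text, num, dataset_select):
    # Stand-in for utils.predict: the real callback runs AraClip/mCLIP retrieval.
    # gr.Gallery accepts (image, caption) tuples; gr.Label accepts a name -> score dict.
    images = [(np.full((96, 96, 3), 30 * (i + 1) % 256, dtype=np.uint8), "") for i in range(int(num))]
    scores = {f" Image # {i + 1}": 1.0 / (i + 1) for i in range(int(num))}
    return images, scores

with gr.Blocks() as demo:
    txt = gr.Textbox(label="Text Query")
    num = gr.Slider(label="Number of retrieved image", value=1, minimum=1, step=1)
    dataset_select = gr.Radio(["XTD dataset", "Flicker8k dataset"], value="XTD dataset", label="Dataset")
    btn = gr.Button("Retrieve images")
    gallery = gr.Gallery(show_label=False, columns=[5], object_fit="contain", height="auto")
    labels = gr.Label(label="Text-image similarity")
    # Only two outputs now; the json_output / json_text panels are gone.
    btn.click(dummy_predict, inputs=[txt, num, dataset_select], outputs=[gallery, labels])

if __name__ == "__main__":
    demo.launch()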
logo_araclip.png ADDED
requirements.txt CHANGED
@@ -1,5 +1,97 @@
+ aiofiles==23.2.1
+ altair==5.2.0
+ annotated-types==0.6.0
+ anyio==3.7.1
+ attrs==23.1.0
+ certifi==2023.11.17
+ charset-normalizer==3.3.2
+ click==8.1.7
+ colorama==0.4.6
+ contourpy==1.1.1
+ cycler==0.12.1
+ exceptiongroup==1.2.0
+ fastapi==0.105.0
+ ffmpy==0.3.1
+ filelock==3.13.1
+ fonttools==4.46.0
+ fsspec==2023.12.2
+ ftfy==6.1.3
+ gradio==4.38.1
+ gradio-client==1.1.0
+ h11==0.14.0
+ httpcore==1.0.5
+ httpx==0.27.0
+ huggingface-hub==0.19.4
+ idna==3.6
+ importlib-resources==6.1.1
+ Jinja2==3.1.2
+ jsonschema==4.20.0
+ jsonschema-specifications==2023.11.2
+ kiwisolver==1.4.5
+ markdown-it-py==3.0.0
+ MarkupSafe==2.1.3
+ matplotlib==3.7.4
+ mdurl==0.1.2
+ mpmath==1.3.0
+ multilingual-clip==1.0.10
+ networkx==3.1
+ numpy==1.24.4
+ nvidia-cublas-cu12==12.1.3.1
+ nvidia-cuda-cupti-cu12==12.1.105
+ nvidia-cuda-nvrtc-cu12==12.1.105
+ nvidia-cuda-runtime-cu12==12.1.105
+ nvidia-cudnn-cu12==8.9.2.26
+ nvidia-cufft-cu12==11.0.2.54
+ nvidia-curand-cu12==10.3.2.106
+ nvidia-cusolver-cu12==11.4.5.107
+ nvidia-cusparse-cu12==12.1.0.106
+ nvidia-nccl-cu12==2.18.1
+ nvidia-nvjitlink-cu12==12.3.101
+ nvidia-nvtx-cu12==12.1.105
  open-clip-torch==2.23.0
- transformers==4.36.1
+ orjson==3.9.10
+ packaging==23.2
+ pandas==2.0.3
+ Pillow==10.1.0
+ pkgutil-resolve-name==1.3.10
+ protobuf==4.25.1
+ pydantic==2.5.2
+ pydantic-core==2.14.5
+ pydub==0.25.1
+ pygments==2.17.2
+ pyparsing==3.1.1
+ python-dateutil==2.8.2
+ python-multipart==0.0.9
+ pytz==2023.3.post1
+ PyYAML==6.0.1
+ referencing==0.32.0
+ regex==2023.10.3
+ requests==2.31.0
+ rich==13.7.0
+ rpds-py==0.13.2
+ ruff==0.5.4
+ safetensors==0.4.1
+ semantic-version==2.10.0
+ sentencepiece==0.1.99
+ shellingham==1.5.4
+ six==1.16.0
+ sniffio==1.3.0
+ starlette==0.27.0
+ sympy==1.12
+ timm==0.9.12
+ tokenizers==0.15.0
+ tomlkit==0.12.0
+ toolz==0.12.0
  torch==2.1.1
- gradio==4.9.0
- multilingual-clip==1.0.10
+ torchvision==0.16.1
+ tqdm==4.66.1
+ transformers==4.36.1
+ triton==2.1.0
+ typer==0.12.3
+ typing-extensions==4.9.0
+ tzdata==2023.3
+ urllib3==2.1.0
+ uvicorn==0.24.0.post1
+ wcwidth==0.2.12
+ websockets==11.0.3
+ zipp==3.17.0
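As a quick, illustrative sanity check (not part of the repository), the key pins in the expanded requirements.txt, notably the Gradio bump from 4.9.0 to 4.38.1, can be compared against the active environment with importlib.metadata:

from importlib.metadata import version

# Hypothetical spot-check of a few pins from requirements.txt against the installed environment.
for pkg, pinned in [("gradio", "4.38.1"), ("torch", "2.1.1"),
                    ("transformers", "4.36.1"), ("open-clip-torch", "2.23.0")]:
    installed = version(pkg)
    status = "OK" if installed == pinned else "MISMATCH"
    print(f"{pkg}: installed {installed}, pinned {pinned} -> {status}")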
utils.py CHANGED
@@ -106,32 +106,20 @@ def find_image(language_model,clip_model, text_query, dataset, image_features, t
  probs = txt_logits.softmax(dim=-1).cpu().detach().numpy().T
 
  file_paths = []
- labels, json_data = {}, {}
+ labels = {}
 
  for i in range(1, num+1):
  idx = np.argsort(probs, axis=0)[-i, 0]
  path = images_path + dataset.get_image_name(idx)
 
- path_l = (path,f"{sorted_data[idx]['caption_ar']}")
+ path_l = (path, "")
 
  labels[f" Image # {i}"] = probs[idx]
- json_data[f" Image # {i}"] = sorted_data[idx]
 
  file_paths.append(path_l)
 
 
- json_text = {}
-
- for _, txt_logits_full in text_logits.items():
-
- probs_text = txt_logits_full.softmax(dim=-1).cpu().detach().numpy().T
-
- for j in range(1, num+1):
-
- idx = np.argsort(probs_text, axis=0)[-j, 0]
- json_text[f" Text # {j}"] = sorted_data[idx]
-
- return file_paths, labels, json_data, json_text
+ return file_paths, labels
 
 
 
@@ -163,12 +151,12 @@ araclip = AraClip()
  def predict(text, num, dadtaset_select):
 
  if dadtaset_select == "XTD dataset":
- image_paths, labels, json_data, json_text = find_image(araclip.language_model,araclip.clip_model, text, araclip.load_xtd_dataset(), araclip.load_pickle_file("cashed_pickles/XTD_pickles/araclip/image_features_XTD_1000_images_arabert_siglib_best_model.pickle") , araclip.load_pickle_file("cashed_pickles/XTD_pickles/araclip/image_features_XTD_1000_images_arabert_siglib_best_model.pickle"), araclip.sorted_data_xtd, 'photos/XTD10_dataset/', num=int(num))
+ image_paths, labels = find_image(araclip.language_model,araclip.clip_model, text, araclip.load_xtd_dataset(), araclip.load_pickle_file("cashed_pickles/XTD_pickles/araclip/image_features_XTD_1000_images_arabert_siglib_best_model.pickle") , araclip.load_pickle_file("cashed_pickles/XTD_pickles/araclip/image_features_XTD_1000_images_arabert_siglib_best_model.pickle"), araclip.sorted_data_xtd, 'photos/XTD10_dataset/', num=int(num))
 
  else:
- image_paths, labels, json_data, json_text = find_image(araclip.language_model,araclip.clip_model, text, araclip.load_flicker8k_dataset(), araclip.load_pickle_file("cashed_pickles/flicker_8k/araclip/image_features_flicker_8k_images_arabert_siglib_best_model.pickle") , araclip.load_pickle_file("cashed_pickles/flicker_8k/araclip/text_features_flicker_8k_images_arabert_siglib_best_model.pickle"), araclip.sorted_data_flicker8k, "photos/Flicker8k_Dataset/", num=int(num))
+ image_paths, labels = find_image(araclip.language_model,araclip.clip_model, text, araclip.load_flicker8k_dataset(), araclip.load_pickle_file("cashed_pickles/flicker_8k/araclip/image_features_flicker_8k_images_arabert_siglib_best_model.pickle") , araclip.load_pickle_file("cashed_pickles/flicker_8k/araclip/text_features_flicker_8k_images_arabert_siglib_best_model.pickle"), araclip.sorted_data_flicker8k, "photos/Flicker8k_Dataset/", num=int(num))
 
- return image_paths, labels, json_data, json_text
+ return image_paths, labels
 
 
  class Mclip():
@@ -203,10 +191,10 @@ def predict_mclip(text, num, dadtaset_select):
 
 
  if dadtaset_select == "XTD dataset":
- image_paths, labels, json_data, json_text = find_image(mclip.language_model_mclip,mclip.clip_model_mclip, text, mclip.load_xtd_dataset() , mclip.load_pickle_file("cashed_pickles/XTD_pickles/mclip/image_features_XTD_1000_images_XLM_Roberta_Large_Vit_B_16Plus_ar.pickle") , mclip.load_pickle_file("cashed_pickles/XTD_pickles/mclip/text_features_XTD_1000_images_XLM_Roberta_Large_Vit_B_16Plus_ar.pickle") , mclip.sorted_data_xtd , 'photos/XTD10_dataset/', num=int(num))
+ image_paths, labels = find_image(mclip.language_model_mclip,mclip.clip_model_mclip, text, mclip.load_xtd_dataset() , mclip.load_pickle_file("cashed_pickles/XTD_pickles/mclip/image_features_XTD_1000_images_XLM_Roberta_Large_Vit_B_16Plus_ar.pickle") , mclip.load_pickle_file("cashed_pickles/XTD_pickles/mclip/text_features_XTD_1000_images_XLM_Roberta_Large_Vit_B_16Plus_ar.pickle") , mclip.sorted_data_xtd , 'photos/XTD10_dataset/', num=int(num))
 
  else:
- image_paths, labels, json_data, json_text = find_image(mclip.language_model_mclip,mclip.clip_model_mclip, text, mclip.load_flicker8k_dataset() , mclip.load_pickle_file("cashed_pickles/flicker_8k/mclip/image_features_flicker_8k_images_XLM_Roberta_Large_Vit_B_16Plus_ar.pickle") , mclip.load_pickle_file("cashed_pickles/flicker_8k/mclip/text_features_flicker_8k_images_XLM_Roberta_Large_Vit_B_16Plus_ar.pickle") , mclip.sorted_data_flicker8k , 'photos/Flicker8k_Dataset/', num=int(num))
+ image_paths, labels = find_image(mclip.language_model_mclip,mclip.clip_model_mclip, text, mclip.load_flicker8k_dataset() , mclip.load_pickle_file("cashed_pickles/flicker_8k/mclip/image_features_flicker_8k_images_XLM_Roberta_Large_Vit_B_16Plus_ar.pickle") , mclip.load_pickle_file("cashed_pickles/flicker_8k/mclip/text_features_flicker_8k_images_XLM_Roberta_Large_Vit_B_16Plus_ar.pickle") , mclip.sorted_data_flicker8k , 'photos/Flicker8k_Dataset/', num=int(num))
 
 
- return image_paths, labels, json_data, json_text
+ return image_paths, labels
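A small sketch of the reduced contract that find_image now satisfies after this commit: the JSON side outputs are gone, and it returns only (path, caption) tuples for the gallery plus a name-to-score dict for the label. The rank_images helper, the toy probability vector, and the file names below are illustrative assumptions, not the repository's code.

import numpy as np

def rank_images(probs, image_names, images_path, num):
    # probs: softmaxed text-image scores with shape (n_images, 1), as in find_image.
    file_paths, labels = [], {}
    for i in range(1, num + 1):
        idx = np.argsort(probs, axis=0)[-i, 0]                    # index of the i-th highest score
        file_paths.append((images_path + image_names[idx], ""))   # (path, empty caption) for gr.Gallery
        labels[f" Image # {i}"] = float(probs[idx, 0])            # name -> score for gr.Label
    return file_paths, labels                                     # no json_data / json_text anymore

probs = np.array([[0.1], [0.6], [0.3]])                           # toy softmax output
print(rank_images(probs, ["a.jpg", "b.jpg", "c.jpg"], "photos/", num=2))
# -> ([('photos/b.jpg', ''), ('photos/c.jpg', '')], {' Image # 1': 0.6, ' Image # 2': 0.3})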