Vivien committed
Commit • 1945055
Parent(s): 74e4bcd
Adjust sidebar text
app.py CHANGED

```diff
@@ -85,7 +85,9 @@ source = {0: "\nSource: Unsplash", 1: "\nSource: The Movie Database (TMDB)"}
 
 
 def get_html(url_list, url_list_slip, height=150):
-    html = "<div style='display: flex; flex-wrap: wrap; justify-content: space-evenly;'>"
+    html = (
+        "<div style='display: flex; flex-wrap: wrap; justify-content: space-evenly;'>"
+    )
     html += "<span style='margin-top: 20px; max-width: 1200px; display: flex; align-content: flex-start; flex-wrap: wrap; justify-content: space-evenly; width: 50%'>"
     html += "<div style='width: 100%; text-align: center;'><b>CLIP</b> (<a href='https://arxiv.org/abs/2103.00020'>Arxiv</a>, <a href='https://github.com/openai/CLIP'>GitHub</a>) from OpenAI</div>"
     for url, title, link in url_list:
@@ -104,10 +106,12 @@ def get_html(url_list, url_list_slip, height=150):
     html += "</span></div>"
     return html
 
+
 def compute_text_embeddings(list_of_strings):
     inputs = processor(text=list_of_strings, return_tensors="pt", padding=True)
     return model.get_text_features(**inputs)
 
+
 def compute_text_embeddings_slip(list_of_strings):
     texts = tokenizer(list_of_strings)
     if cuda_available:
@@ -115,6 +119,7 @@ def compute_text_embeddings_slip(list_of_strings):
     texts = texts.view(-1, 77).contiguous()
     return slip_model.encode_text(texts)
 
+
 def image_search(query, corpus, n_results=24):
     text_embeddings = compute_text_embeddings([query]).detach().numpy()
     text_embeddings_slip = compute_text_embeddings_slip([query]).detach().numpy()
@@ -150,13 +155,14 @@ description = """
 
 **Enter your query and hit enter**
 
-CLIP and SLIP are ML models that encode images and texts as vectors so that the vectors of an image and its caption are similar. They can notably be used for zero-shot image classification, text-based image retrieval or image generation.
+CLIP and SLIP are ML models that encode images and texts as vectors so that the vectors of an image and its caption are similar. They can notably be used for zero-shot image classification, text-based image retrieval or image generation.
+
+Cf. this Twitter [thread](https://twitter.com/vivien000000/status/1475829936443334660) with some surprising differences between CLIP and SLIP.
 
 *Built with OpenAI's [CLIP](https://openai.com/blog/clip/) model, Meta AI's [SLIP](https://github.com/facebookresearch/SLIP) model, 🤗 Hugging Face's [transformers library](https://huggingface.co/transformers/), [Streamlit](https://streamlit.io/), 25k images from [Unsplash](https://unsplash.com/) and 8k images from [The Movie Database (TMDB)](https://www.themoviedb.org/)*
 """
 
 
-
 st.markdown(
     """
     <style>
@@ -196,4 +202,4 @@ query = c.text_input("", value="clouds at sunset")
 corpus = st.radio("", ["Unsplash", "Movies"])
 if len(query) > 0:
     results, results_slip = image_search(query, corpus)
-    st.markdown(get_html(results, results_slip), unsafe_allow_html=True)
+    st.markdown(get_html(results, results_slip), unsafe_allow_html=True)
```
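
The description in the diff says that CLIP and SLIP encode images and texts as vectors so that an image and its caption end up close together, which is what enables text-based image retrieval; `compute_text_embeddings` and `image_search` above are the app's take on that idea. The snippet below is only a minimal sketch of that retrieval pattern with 🤗 transformers and NumPy, not the app's actual code: the checkpoint name, the `image_embeddings` array, the `.npy` path and the helper names are illustrative assumptions, and the app's precomputed embeddings and its SLIP branch are not shown in this diff.

```python
import numpy as np
import torch
from transformers import CLIPModel, CLIPProcessor

# Illustrative checkpoint; the Space may use a different one.
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")


def embed_text(queries):
    # Tokenize the queries and project them into CLIP's joint text/image space.
    inputs = processor(text=queries, return_tensors="pt", padding=True)
    with torch.no_grad():
        features = model.get_text_features(**inputs)
    return features.numpy()


# Hypothetical precomputed image embeddings, one row per image (e.g. 512-d).
image_embeddings = np.load("image_embeddings.npy")


def search(query, k=24):
    # Rank images by cosine similarity between the query and image vectors.
    text = embed_text([query])
    text = text / np.linalg.norm(text, axis=1, keepdims=True)
    images = image_embeddings / np.linalg.norm(image_embeddings, axis=1, keepdims=True)
    scores = images @ text[0]
    return np.argsort(-scores)[:k]  # indices of the k closest images
```

Because CLIP is trained contrastively, ranking images by the normalized dot product with the query embedding is usually all such a retrieval step needs.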