Vivien committed
Commit • 1945055
Parent(s): 74e4bcd
Adjust sidebar text
app.py CHANGED

```diff
@@ -85,7 +85,9 @@ source = {0: "\nSource: Unsplash", 1: "\nSource: The Movie Database (TMDB)"}
 
 
 def get_html(url_list, url_list_slip, height=150):
-    html = "<div style='display: flex; flex-wrap: wrap; justify-content: space-evenly;'>"
+    html = (
+        "<div style='display: flex; flex-wrap: wrap; justify-content: space-evenly;'>"
+    )
     html += "<span style='margin-top: 20px; max-width: 1200px; display: flex; align-content: flex-start; flex-wrap: wrap; justify-content: space-evenly; width: 50%'>"
     html += "<div style='width: 100%; text-align: center;'><b>CLIP</b> (<a href='https://arxiv.org/abs/2103.00020'>Arxiv</a>, <a href='https://github.com/openai/CLIP'>GitHub</a>) from OpenAI</div>"
     for url, title, link in url_list:
@@ -104,10 +106,12 @@ def get_html(url_list, url_list_slip, height=150):
     html += "</span></div>"
     return html
 
+
 def compute_text_embeddings(list_of_strings):
     inputs = processor(text=list_of_strings, return_tensors="pt", padding=True)
     return model.get_text_features(**inputs)
 
+
 def compute_text_embeddings_slip(list_of_strings):
     texts = tokenizer(list_of_strings)
     if cuda_available:
@@ -115,6 +119,7 @@ def compute_text_embeddings_slip(list_of_strings):
     texts = texts.view(-1, 77).contiguous()
     return slip_model.encode_text(texts)
 
+
 def image_search(query, corpus, n_results=24):
     text_embeddings = compute_text_embeddings([query]).detach().numpy()
     text_embeddings_slip = compute_text_embeddings_slip([query]).detach().numpy()
@@ -150,13 +155,14 @@ description = """
 
 **Enter your query and hit enter**
 
-CLIP and SLIP are ML models that encode images and texts as vectors so that the vectors of an image and its caption are similar. They can notably be used for zero-shot image classification, text-based image retrieval or image generation.
+CLIP and SLIP are ML models that encode images and texts as vectors so that the vectors of an image and its caption are similar. They can notably be used for zero-shot image classification, text-based image retrieval or image generation.
+
+Cf. this Twitter [thread](https://twitter.com/vivien000000/status/1475829936443334660) with some surprising differences between CLIP and SLIP.
 
 *Built with OpenAI's [CLIP](https://openai.com/blog/clip/) model, Meta AI's [SLIP](https://github.com/facebookresearch/SLIP) model, 🤗 Hugging Face's [transformers library](https://huggingface.co/transformers/), [Streamlit](https://streamlit.io/), 25k images from [Unsplash](https://unsplash.com/) and 8k images from [The Movie Database (TMDB)](https://www.themoviedb.org/)*
 """
 
 
-
 st.markdown(
     """
     <style>
@@ -196,4 +202,4 @@ query = c.text_input("", value="clouds at sunset")
 corpus = st.radio("", ["Unsplash", "Movies"])
 if len(query) > 0:
     results, results_slip = image_search(query, corpus)
-    st.markdown(get_html(results, results_slip), unsafe_allow_html=True)
+    st.markdown(get_html(results, results_slip), unsafe_allow_html=True)
```
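
The description in the diff says that CLIP and SLIP encode images and texts as vectors so that an image and its caption end up close together, which is what enables text-based image retrieval; `compute_text_embeddings` and `image_search` above are the app's take on that idea. The snippet below is only a minimal sketch of that retrieval pattern with 🤗 transformers and NumPy, not the app's actual code: the checkpoint name, the `image_embeddings` array, the `.npy` path and the helper names are illustrative assumptions, and the app's precomputed embeddings and its SLIP branch are not shown in this diff.

```python
import numpy as np
import torch
from transformers import CLIPModel, CLIPProcessor

# Illustrative checkpoint; the Space may use a different one.
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")


def embed_text(queries):
    # Tokenize the queries and project them into CLIP's joint text/image space.
    inputs = processor(text=queries, return_tensors="pt", padding=True)
    with torch.no_grad():
        features = model.get_text_features(**inputs)
    return features.numpy()


# Hypothetical precomputed image embeddings, one row per image (e.g. 512-d).
image_embeddings = np.load("image_embeddings.npy")


def search(query, k=24):
    # Rank images by cosine similarity between the query and image vectors.
    text = embed_text([query])
    text = text / np.linalg.norm(text, axis=1, keepdims=True)
    images = image_embeddings / np.linalg.norm(image_embeddings, axis=1, keepdims=True)
    scores = images @ text[0]
    return np.argsort(-scores)[:k]  # indices of the k closest images
```

Because CLIP is trained contrastively, ranking images by the normalized dot product with the query embedding is usually all such a retrieval step needs.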