thomasht86 committed on
Commit
3b2eca4
·
verified ·
1 Parent(s): 1f02318

Upload folder using huggingface_hub

Browse files
Files changed (3) hide show
  1. README.md +30 -4
  2. frontend/app.py +47 -34
  3. main.py +36 -9
README.md CHANGED
@@ -27,11 +27,10 @@ preload_from_hub:
27
 
28
  # Visual Retrieval ColPali
29
 
30
-
31
  # Developing
32
 
33
  First, install `uv`:
34
-
35
  ```bash
36
  curl -LsSf https://astral.sh/uv/install.sh | sh
37
  ```
@@ -60,7 +59,8 @@ python hello.py
60
 
61
  First, set up your `.env` file by renaming `.env.example` to `.env` and filling in the required values.
62
  (Token can be shared with 1password, `HF_TOKEN` is personal and must be created at huggingface)
63
- If you are just connecting to a deployed Vespa app, you can skip to [Connecting to the Vespa app](#connecting-to-the-vespa-app-and-querying).
 
64
 
65
  ### Deploying the Vespa app
66
 
@@ -87,7 +87,8 @@ On Mac:
87
  brew install poppler
88
  ```
89
 
90
- First, you need to create a huggingface token, after you have accepted the term to use the model at https://huggingface.co/google/paligemma-3b-mix-448.
 
91
  Add the token to your environment variables as `HF_TOKEN`:
92
 
93
  ```bash
@@ -112,4 +113,29 @@ python query_vespa.py
112
 
113
  ```bash
114
  python main.py
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  ```
 
27
 
28
  # Visual Retrieval ColPali
29
 
 
30
  # Developing
31
 
32
  First, install `uv`:
33
+
34
  ```bash
35
  curl -LsSf https://astral.sh/uv/install.sh | sh
36
  ```
 
59
 
60
  First, set up your `.env` file by renaming `.env.example` to `.env` and filling in the required values.
61
  (Token can be shared with 1password, `HF_TOKEN` is personal and must be created at huggingface)
62
+ If you are just connecting to a deployed Vespa app, you can skip
63
+ to [Connecting to the Vespa app](#connecting-to-the-vespa-app-and-querying).
64
 
65
  ### Deploying the Vespa app
66
 
 
87
  brew install poppler
88
  ```
89
 
90
+ First, you need to create a huggingface token, after you have accepted the terms to use the model
91
+ at https://huggingface.co/google/paligemma-3b-mix-448.
92
  Add the token to your environment variables as `HF_TOKEN`:
93
 
94
  ```bash
 
113
 
114
  ```bash
115
  python main.py
116
+ ```
117
+
118
+ ## Deploy to huggingface 🤗
119
+
120
+ To deploy, run
121
+
122
+ ```bash
123
+ huggingface-cli upload vespa-engine/colpali-vespa-visual-retrieval . . --repo-type=space
124
+ ```
125
+
126
+ Note that you need to set the `HF_TOKEN` environment variable first.
127
+ This is personal, and must be created at [huggingface](https://huggingface.co/settings/tokens).
128
+ Make sure the token has `write` access.
129
+ Beware that this will not delete existing files, only modify or add,
130
+ see [huggingface-cli](https://huggingface.co/docs/huggingface_hub/en/guides/upload#upload-from-the-cli) for more
131
+ information.
132
+
133
+ ### Making changes to CSS
134
+
135
+ To make changes to output.css take effect, run
136
+
137
+ ```bash
138
+ shad4fast watch # watches all files passed through the tailwind.config.js content section
139
+
140
+ shad4fast build # minifies the current output.css file to reduce bundle size in production.
141
  ```
frontend/app.py CHANGED
@@ -204,6 +204,27 @@ def LoadingMessage():
204
  )
205
 
206
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
207
  def SearchResult(results: list, query_id: Optional[str] = None):
208
  if not results:
209
  return Div(
@@ -216,7 +237,7 @@ def SearchResult(results: list, query_id: Optional[str] = None):
216
 
217
  # Otherwise, display the search results
218
  result_items = []
219
- for result in results:
220
  fields = result["fields"] # Extract the 'fields' part of each result
221
  full_image_base64 = f"data:image/jpeg;base64,{fields['full_image']}"
222
 
@@ -224,21 +245,30 @@ def SearchResult(results: list, query_id: Optional[str] = None):
224
  sim_map_fields = {
225
  key: value
226
  for key, value in fields.items()
227
- if key.startswith("sim_map_") and len(key.split("_")[-1]) >= 4
 
 
228
  }
229
 
230
  # Generate buttons for the sim_map fields
231
  sim_map_buttons = []
232
  for key, value in sim_map_fields.items():
233
- sim_map_base64 = f"data:image/jpeg;base64,{value}"
234
- sim_map_buttons.append(
235
- Button(
236
- key.split("_")[-1],
237
- size="sm",
238
- data_image_src=sim_map_base64,
239
- cls="sim-map-button pointer-events-auto font-mono text-xs h-5 rounded-none px-2",
 
 
 
 
 
 
 
 
240
  )
241
- )
242
 
243
  # Add "Reset Image" button to restore the full image
244
  reset_button = Button(
@@ -249,11 +279,7 @@ def SearchResult(results: list, query_id: Optional[str] = None):
249
  cls="reset-button pointer-events-auto font-mono text-xs h-5 rounded-none px-2",
250
  )
251
 
252
- tokens_icon = (
253
- Lucide(icon="loader-circle", size="15", cls="animate-spin")
254
- if query_id is not None
255
- else Lucide(icon="images", size="15")
256
- )
257
 
258
  # Add "Tokens" button - this has no action, just a placeholder
259
  tokens_button = Button(
@@ -307,22 +333,9 @@ def SearchResult(results: list, query_id: Optional[str] = None):
307
  cls="grid grid-cols-1 md:grid-cols-2 col-span-2",
308
  )
309
  )
310
-
311
- if query_id is not None:
312
- return Div(
313
- *result_items,
314
- image_swapping,
315
- hx_get=f"/updated_search_results?query_id={query_id}",
316
- hx_trigger="every 1s",
317
- hx_target="#search-results",
318
- hx_swap="outerHTML",
319
- id="search-results",
320
- cls="grid grid-cols-2 gap-px bg-border",
321
- )
322
- else:
323
- return Div(
324
- *result_items,
325
- image_swapping,
326
- id="search-results",
327
- cls="grid grid-cols-2 gap-px bg-border",
328
- )
 
204
  )
205
 
206
 
207
+ def SimMapButtonReady(query_id, idx, token, img_src):
208
+ return Button(
209
+ token,
210
+ size="sm",
211
+ data_image_src=img_src,
212
+ id=f"sim-map-button-{query_id}-{idx}-{token}",
213
+ cls="sim-map-button pointer-events-auto font-mono text-xs h-5 rounded-none px-2",
214
+ )
215
+
216
+
217
+ def SimMapButtonPoll(query_id, idx, token):
218
+ return Button(
219
+ Lucide(icon="loader-circle", size="15", cls="animate-spin"),
220
+ size="sm",
221
+ disabled=True,
222
+ hx_get=f"/get_sim_map?query_id={query_id}&idx={idx}&token={token}",
223
+ hx_trigger="every 1s",
224
+ hx_swap="outerHTML",
225
+ )
226
+
227
+
228
  def SearchResult(results: list, query_id: Optional[str] = None):
229
  if not results:
230
  return Div(
 
237
 
238
  # Otherwise, display the search results
239
  result_items = []
240
+ for idx, result in enumerate(results):
241
  fields = result["fields"] # Extract the 'fields' part of each result
242
  full_image_base64 = f"data:image/jpeg;base64,{fields['full_image']}"
243
 
 
245
  sim_map_fields = {
246
  key: value
247
  for key, value in fields.items()
248
+ if key.startswith(
249
+ "sim_map_"
250
+ ) # filtering is done before creating with 'is_special_token'-function
251
  }
252
 
253
  # Generate buttons for the sim_map fields
254
  sim_map_buttons = []
255
  for key, value in sim_map_fields.items():
256
+ if value is not None:
257
+ sim_map_base64 = f"data:image/jpeg;base64,{value}"
258
+ sim_map_buttons.append(
259
+ SimMapButtonReady(
260
+ query_id=query_id,
261
+ idx=idx,
262
+ token=key.split("_")[-1],
263
+ img_src=sim_map_base64,
264
+ )
265
+ )
266
+ else:
267
+ sim_map_buttons.append(
268
+ SimMapButtonPoll(
269
+ query_id=query_id, idx=idx, token=key.split("_")[-1]
270
+ )
271
  )
 
272
 
273
  # Add "Reset Image" button to restore the full image
274
  reset_button = Button(
 
279
  cls="reset-button pointer-events-auto font-mono text-xs h-5 rounded-none px-2",
280
  )
281
 
282
+ tokens_icon = Lucide(icon="images", size="15")
 
 
 
 
283
 
284
  # Add "Tokens" button - this has no action, just a placeholder
285
  tokens_button = Button(
 
333
  cls="grid grid-cols-1 md:grid-cols-2 col-span-2",
334
  )
335
  )
336
+ return Div(
337
+ *result_items,
338
+ image_swapping,
339
+ id="search-results",
340
+ cls="grid grid-cols-2 gap-px bg-border",
341
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
main.py CHANGED
@@ -11,11 +11,19 @@ from backend.colpali import (
11
  get_result_from_query,
12
  get_query_embeddings_and_token_map,
13
  add_sim_maps_to_result,
 
14
  )
15
  from backend.vespa_app import get_vespa_app
16
  from backend.cache import LRUCache
17
  from backend.modelmanager import ModelManager
18
- from frontend.app import Home, Search, SearchBox, SearchResult
 
 
 
 
 
 
 
19
  from frontend.layout import Layout
20
  import hashlib
21
 
@@ -143,7 +151,15 @@ async def get(request, query: str, nn: bool = True):
143
  model, processor, query, q_embs, token_to_idx, result, query_id
144
  )
145
  )
 
 
 
 
 
146
  search_results = get_results_children(result)
 
 
 
147
  return SearchResult(search_results, query_id)
148
 
149
 
@@ -176,18 +192,29 @@ async def generate_similarity_map(
176
  task_cache.set(query_id, True)
177
 
178
 
179
- @app.get("/updated_search_results")
180
- async def updated_search_results(query_id: str):
 
 
 
 
 
181
  result = result_cache.get(query_id)
182
  if result is None:
183
- return HTMLResponse(status_code=204)
184
  search_results = get_results_children(result)
185
- # Check if task is completed - Stop polling if it is
186
- if task_cache.get(query_id):
187
- updated_content = SearchResult(results=search_results, query_id=None)
188
  else:
189
- updated_content = SearchResult(results=search_results, query_id=query_id)
190
- return updated_content
 
 
 
 
 
 
191
 
192
 
193
  @rt("/app")
 
11
  get_result_from_query,
12
  get_query_embeddings_and_token_map,
13
  add_sim_maps_to_result,
14
+ is_special_token,
15
  )
16
  from backend.vespa_app import get_vespa_app
17
  from backend.cache import LRUCache
18
  from backend.modelmanager import ModelManager
19
+ from frontend.app import (
20
+ Home,
21
+ Search,
22
+ SearchBox,
23
+ SearchResult,
24
+ SimMapButtonPoll,
25
+ SimMapButtonReady,
26
+ )
27
  from frontend.layout import Layout
28
  import hashlib
29
 
 
151
  model, processor, query, q_embs, token_to_idx, result, query_id
152
  )
153
  )
154
+ fields_to_add = [
155
+ f"sim_map_{token}"
156
+ for token in token_to_idx.keys()
157
+ if not is_special_token(token)
158
+ ]
159
  search_results = get_results_children(result)
160
+ for result in search_results:
161
+ for sim_map_key in fields_to_add:
162
+ result["fields"][sim_map_key] = None
163
  return SearchResult(search_results, query_id)
164
 
165
 
 
192
  task_cache.set(query_id, True)
193
 
194
 
195
+ @app.get("/get_sim_map")
196
+ async def get_sim_map(query_id: str, idx: int, token: str):
197
+ """
198
+ Endpoint that each sim map button polls to get the sim map image
199
+ when it is ready. If it is not ready, returns a SimMapButtonPoll, that
200
+ continues to poll every 1 second.
201
+ """
202
  result = result_cache.get(query_id)
203
  if result is None:
204
+ return SimMapButtonPoll(query_id=query_id, idx=idx, token=token)
205
  search_results = get_results_children(result)
206
+ # Check if idx exists in list of children
207
+ if idx >= len(search_results):
208
+ return SimMapButtonPoll(query_id=query_id, idx=idx, token=token)
209
  else:
210
+ sim_map_key = f"sim_map_{token}"
211
+ sim_map_b64 = search_results[idx]["fields"].get(sim_map_key, None)
212
+ if sim_map_b64 is None:
213
+ return SimMapButtonPoll(query_id=query_id, idx=idx, token=token)
214
+ sim_map_img_src = f"data:image/jpeg;base64,{sim_map_b64}"
215
+ return SimMapButtonReady(
216
+ query_id=query_id, idx=idx, token=token, img_src=sim_map_img_src
217
+ )
218
 
219
 
220
  @rt("/app")