Spaces:
Running
on
T4
Running
on
T4
thomasht86
committed on
Upload folder using huggingface_hub
Browse files- README.md +10 -0
- prepare_feed_deploy.py +59 -46
- requirements.txt +84 -2
- vespa_feed_to_hf_dataset.py +42 -0
README.md
CHANGED
@@ -126,6 +126,16 @@ python main.py
|
|
126 |
|
127 |
## Deploy to huggingface 🤗
|
128 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
129 |
To deploy, run
|
130 |
|
131 |
```bash
|
|
|
126 |
|
127 |
## Deploy to huggingface 🤗
|
128 |
|
129 |
+
### Compiling dependencies
|
130 |
+
|
131 |
+
Before deploying, if you have changed the dependencies, run this to compile the dependencies declared in `pyproject.toml` into `requirements.txt`:
|
132 |
+
|
133 |
+
```bash
|
134 |
+
uv pip compile pyproject.toml -o requirements.txt
|
135 |
+
```
|
136 |
+
|
137 |
+
### Deploying to huggingface
|
138 |
+
|
139 |
To deploy, run
|
140 |
|
141 |
```bash
|
prepare_feed_deploy.py
CHANGED
@@ -1,16 +1,16 @@
|
|
1 |
# %% [markdown]
|
2 |
# # Visual PDF Retrieval - demo application
|
3 |
-
#
|
4 |
# In this notebook, we will prepare the Vespa backend application for our visual retrieval demo.
|
5 |
# We will use ColPali as the model to extract patch vectors from images of pdf pages.
|
6 |
# At query time, we use MaxSim to retrieve and/or (based on the configuration) rank the page results.
|
7 |
-
#
|
8 |
# To see the application in action, visit TODO:
|
9 |
-
#
|
10 |
# The web application is written in FastHTML, meaning the complete application is written in python.
|
11 |
-
#
|
12 |
# The steps we will take in this notebook are:
|
13 |
-
#
|
14 |
# 0. Setup and configuration
|
15 |
# 1. Download the data
|
16 |
# 2. Prepare the data
|
@@ -18,14 +18,14 @@
|
|
18 |
# 4. Deploy the Vespa application
|
19 |
# 5. Create the Vespa application
|
20 |
# 6. Feed the data to the Vespa application
|
21 |
-
#
|
22 |
# All the steps that are needed to provision the Vespa application, including feeding the data, can be done from this notebook.
|
23 |
# We have tried to make it easy for others to run this notebook, to create your own PDF Enterprise Search application using Vespa.
|
24 |
-
#
|
25 |
|
26 |
# %% [markdown]
|
27 |
# ## 0. Setup and Configuration
|
28 |
-
#
|
29 |
|
30 |
# %%
|
31 |
import os
|
@@ -83,11 +83,11 @@ os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
|
83 |
|
84 |
# %% [markdown]
|
85 |
# ### Create a free trial in Vespa Cloud
|
86 |
-
#
|
87 |
# Create a tenant from [here](https://vespa.ai/free-trial/).
|
88 |
# The trial includes $300 credit.
|
89 |
# Take note of your tenant name.
|
90 |
-
#
|
91 |
|
92 |
# %%
|
93 |
VESPA_TENANT_NAME = "vespa-team"
|
@@ -95,17 +95,17 @@ VESPA_TENANT_NAME = "vespa-team"
|
|
95 |
# %% [markdown]
|
96 |
# Here, set your desired application name. (Will be created in later steps)
|
97 |
# Note that you cannot have a hyphen `-` or an underscore `_` in the application name.
|
98 |
-
#
|
99 |
|
100 |
# %%
|
101 |
-
VESPA_APPLICATION_NAME = "
|
102 |
VESPA_SCHEMA_NAME = "pdf_page"
|
103 |
|
104 |
# %% [markdown]
|
105 |
# Next, you need to create some tokens for feeding data, and querying the application.
|
106 |
# We recommend separate tokens for feeding and querying, (the former with write permission, and the latter with read permission).
|
107 |
# The tokens can be created from the [Vespa Cloud console](https://console.vespa-cloud.com/) in the 'Account' -> 'Tokens' section.
|
108 |
-
#
|
109 |
|
110 |
# %%
|
111 |
VESPA_TOKEN_ID_WRITE = "colpalidemo_write"
|
@@ -113,7 +113,7 @@ VESPA_TOKEN_ID_READ = "colpalidemo_read"
|
|
113 |
|
114 |
# %% [markdown]
|
115 |
# We also need to set the value of the write token to be able to feed data to the Vespa application.
|
116 |
-
#
|
117 |
|
118 |
# %%
|
119 |
VESPA_CLOUD_SECRET_TOKEN = os.getenv("VESPA_CLOUD_SECRET_TOKEN") or input(
|
@@ -124,7 +124,7 @@ VESPA_CLOUD_SECRET_TOKEN = os.getenv("VESPA_CLOUD_SECRET_TOKEN") or input(
|
|
124 |
# We will also use the Gemini API to create sample queries for our images.
|
125 |
# You can also use other VLM's to create these queries.
|
126 |
# Create a Gemini API key from [here](https://aistudio.google.com/app/apikey).
|
127 |
-
#
|
128 |
|
129 |
# %%
|
130 |
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY") or input(
|
@@ -152,21 +152,21 @@ processor = ColPaliProcessor.from_pretrained(MODEL_NAME)
|
|
152 |
|
153 |
# %% [markdown]
|
154 |
# ## 1. Download PDFs
|
155 |
-
#
|
156 |
# We are going to use public reports from the Norwegian Government Pension Fund Global (also known as the Oil Fund).
|
157 |
# The fund puts transparency at the forefront and publishes reports on its investments, holdings, and returns, as well as its strategy and governance.
|
158 |
-
#
|
159 |
# These reports are the ones we are going to use for this showcase.
|
160 |
# Here are some sample images:
|
161 |
-
#
|
162 |
# ![Sample1](./static/img/gfpg-sample-1.png)
|
163 |
# ![Sample2](./static/img/gfpg-sample-2.png)
|
164 |
-
#
|
165 |
|
166 |
# %% [markdown]
|
167 |
# As we can see, a lot of the information is in the form of tables, charts and numbers.
|
168 |
# These are not easily extractable using pdf-readers or OCR tools.
|
169 |
-
#
|
170 |
|
171 |
# %%
|
172 |
import requests
|
@@ -180,16 +180,20 @@ html_content = response.text
|
|
180 |
soup = BeautifulSoup(html_content, "html.parser")
|
181 |
|
182 |
links = []
|
|
|
183 |
|
184 |
-
# Find all
|
185 |
-
for
|
186 |
-
|
187 |
-
|
|
|
|
|
|
|
188 |
href = a_tag["href"]
|
189 |
full_url = urljoin(url, href)
|
190 |
links.append(full_url)
|
191 |
-
|
192 |
-
links
|
193 |
|
194 |
# %%
|
195 |
# Limit the number of PDFs to download
|
@@ -274,7 +278,8 @@ pdfs
|
|
274 |
|
275 |
# %% [markdown]
|
276 |
# ## 2. Convert PDFs to Images
|
277 |
-
#
|
|
|
278 |
|
279 |
# %%
|
280 |
def get_pdf_images(pdf_path):
|
@@ -300,6 +305,7 @@ for pdf in tqdm(pdfs):
|
|
300 |
pdf_pages.append(
|
301 |
{
|
302 |
"title": title,
|
|
|
303 |
"url": pdf["url"],
|
304 |
"path": pdf_file,
|
305 |
"image": image,
|
@@ -324,17 +330,17 @@ print(f"Number of text with length == 0: {Counter(text_lengths)[0]}")
|
|
324 |
|
325 |
# %% [markdown]
|
326 |
# ## 3. Generate Queries
|
327 |
-
#
|
328 |
# In this step, we want to generate queries for each page image.
|
329 |
# These will be useful for 2 reasons:
|
330 |
-
#
|
331 |
# 1. We can use these queries as typeahead suggestions in the search bar.
|
332 |
# 2. We can use the queries to generate an evaluation dataset. See [Improving Retrieval with LLM-as-a-judge](https://blog.vespa.ai/improving-retrieval-with-llm-as-a-judge/) for a deeper dive into this topic.
|
333 |
-
#
|
334 |
# The prompt for generating queries is taken from [this](https://danielvanstrien.xyz/posts/post-with-code/colpali/2024-09-23-generate_colpali_dataset.html#an-update-retrieval-focused-prompt) wonderful blog post by Daniel van Strien.
|
335 |
-
#
|
336 |
# We will use the Gemini API to generate these queries, with `gemini-1.5-flash-8b` as the model.
|
337 |
-
#
|
338 |
|
339 |
# %%
|
340 |
from pydantic import BaseModel
|
@@ -413,6 +419,7 @@ def generate_queries(image, prompt_text, pydantic_model):
|
|
413 |
}
|
414 |
return queries
|
415 |
|
|
|
416 |
# %%
|
417 |
for pdf in tqdm(pdf_pages):
|
418 |
image = pdf.get("image")
|
@@ -488,9 +495,10 @@ with open("output/pdf_pages.json", "w") as f:
|
|
488 |
|
489 |
# %% [markdown]
|
490 |
# ## 4. Generate embeddings
|
491 |
-
#
|
492 |
# Now that we have the queries, we can use the ColPali model to generate embeddings for each page image.
|
493 |
-
#
|
|
|
494 |
|
495 |
# %%
|
496 |
def generate_embeddings(images, model, processor, batch_size=2) -> np.ndarray:
|
@@ -530,6 +538,7 @@ def generate_embeddings(images, model, processor, batch_size=2) -> np.ndarray:
|
|
530 |
all_embeddings = np.concatenate(embeddings_list, axis=0)
|
531 |
return all_embeddings
|
532 |
|
|
|
533 |
# %%
|
534 |
# Generate embeddings for all images
|
535 |
images = [pdf["image"] for pdf in pdf_pages]
|
@@ -540,9 +549,10 @@ embeddings.shape
|
|
540 |
|
541 |
# %% [markdown]
|
542 |
# ## 5. Prepare Data in Vespa Format
|
543 |
-
#
|
544 |
# Now that we have all the data we need, all that remains is to make sure it is in the right format for Vespa.
|
545 |
-
#
|
|
|
546 |
|
547 |
# %%
|
548 |
def float_to_binary_embedding(float_query_embedding: dict) -> dict:
|
@@ -555,10 +565,12 @@ def float_to_binary_embedding(float_query_embedding: dict) -> dict:
|
|
555 |
binary_query_embeddings[k] = binary_vector
|
556 |
return binary_query_embeddings
|
557 |
|
|
|
558 |
# %%
|
559 |
vespa_feed = []
|
560 |
for pdf, embedding in zip(pdf_pages, embeddings):
|
561 |
url = pdf["url"]
|
|
|
562 |
title = pdf["title"]
|
563 |
image = pdf["image"]
|
564 |
text = pdf.get("text", "")
|
@@ -580,6 +592,7 @@ for pdf, embedding in zip(pdf_pages, embeddings):
|
|
580 |
"id": id_hash,
|
581 |
"url": url,
|
582 |
"title": title,
|
|
|
583 |
"page_number": page_no,
|
584 |
"blur_image": base_64_image,
|
585 |
"full_image": base_64_full_image,
|
@@ -616,7 +629,7 @@ len(vespa_feed)
|
|
616 |
|
617 |
# %% [markdown]
|
618 |
# ## 5. Prepare Vespa Application
|
619 |
-
#
|
620 |
|
621 |
# %%
|
622 |
# Define the Vespa schema
|
@@ -631,6 +644,7 @@ colpali_schema = Schema(
|
|
631 |
match=["word"],
|
632 |
),
|
633 |
Field(name="url", type="string", indexing=["summary", "index"]),
|
|
|
634 |
Field(
|
635 |
name="title",
|
636 |
type="string",
|
@@ -720,9 +734,7 @@ colpali_schema = Schema(
|
|
720 |
DocumentSummary(
|
721 |
name="suggestions",
|
722 |
summary_fields=[
|
723 |
-
Summary(
|
724 |
-
name="questions"
|
725 |
-
),
|
726 |
],
|
727 |
from_disk=True,
|
728 |
),
|
@@ -756,11 +768,12 @@ mapfunctions = [
|
|
756 |
# Define the 'bm25' rank profile
|
757 |
colpali_bm25_profile = RankProfile(
|
758 |
name="bm25",
|
759 |
-
inputs=[("query(qt)", "tensor<float>(querytoken{}, v[128])")],
|
760 |
first_phase="bm25(title) + bm25(text)",
|
761 |
functions=mapfunctions,
|
762 |
)
|
763 |
|
|
|
764 |
# A function to create an inherited rank profile which also returns quantized similarity scores
|
765 |
def with_quantized_similarity(rank_profile: RankProfile) -> RankProfile:
|
766 |
return RankProfile(
|
@@ -770,6 +783,7 @@ def with_quantized_similarity(rank_profile: RankProfile) -> RankProfile:
|
|
770 |
summary_features=["quantized"],
|
771 |
)
|
772 |
|
|
|
773 |
colpali_schema.add_rank_profile(colpali_bm25_profile)
|
774 |
colpali_schema.add_rank_profile(with_quantized_similarity(colpali_bm25_profile))
|
775 |
|
@@ -941,7 +955,7 @@ vespa_application_package = ApplicationPackage(
|
|
941 |
|
942 |
# %% [markdown]
|
943 |
# ## 6. Deploy Vespa Application
|
944 |
-
#
|
945 |
|
946 |
# %%
|
947 |
VESPA_TEAM_API_KEY = os.getenv("VESPA_TEAM_API_KEY") or input(
|
@@ -966,17 +980,18 @@ print(f"Application deployed. Token endpoint URL: {endpoint_url}")
|
|
966 |
# %% [markdown]
|
967 |
# Make sure to take note of the token endpoint_url.
|
968 |
# You need to put this in your `.env` file - `VESPA_APP_URL=https://abcd.vespa-app.cloud` - to access the Vespa application from your web application.
|
969 |
-
#
|
970 |
|
971 |
# %% [markdown]
|
972 |
# ## 8. Feed Data to Vespa
|
973 |
-
#
|
974 |
|
975 |
# %%
|
976 |
# Instantiate Vespa connection using token
|
977 |
app = Vespa(url=endpoint_url, vespa_cloud_secret_token=VESPA_CLOUD_SECRET_TOKEN)
|
978 |
app.get_application_status()
|
979 |
|
|
|
980 |
# %%
|
981 |
def callback(response: VespaResponse, id: str):
|
982 |
if not response.is_successful():
|
@@ -987,5 +1002,3 @@ def callback(response: VespaResponse, id: str):
|
|
987 |
|
988 |
# Feed data into Vespa asynchronously
|
989 |
app.feed_async_iterable(vespa_feed, schema=VESPA_SCHEMA_NAME, callback=callback)
|
990 |
-
|
991 |
-
|
|
|
1 |
# %% [markdown]
|
2 |
# # Visual PDF Retrieval - demo application
|
3 |
+
#
|
4 |
# In this notebook, we will prepare the Vespa backend application for our visual retrieval demo.
|
5 |
# We will use ColPali as the model to extract patch vectors from images of pdf pages.
|
6 |
# At query time, we use MaxSim to retrieve and/or (based on the configuration) rank the page results.
|
7 |
+
#
|
8 |
# To see the application in action, visit TODO:
|
9 |
+
#
|
10 |
# The web application is written in FastHTML, meaning the complete application is written in python.
|
11 |
+
#
|
12 |
# The steps we will take in this notebook are:
|
13 |
+
#
|
14 |
# 0. Setup and configuration
|
15 |
# 1. Download the data
|
16 |
# 2. Prepare the data
|
|
|
18 |
# 4. Deploy the Vespa application
|
19 |
# 5. Create the Vespa application
|
20 |
# 6. Feed the data to the Vespa application
|
21 |
+
#
|
22 |
# All the steps that are needed to provision the Vespa application, including feeding the data, can be done from this notebook.
|
23 |
# We have tried to make it easy for others to run this notebook, to create your own PDF Enterprise Search application using Vespa.
|
24 |
+
#
|
25 |
|
26 |
# %% [markdown]
|
27 |
# ## 0. Setup and Configuration
|
28 |
+
#
|
29 |
|
30 |
# %%
|
31 |
import os
|
|
|
83 |
|
84 |
# %% [markdown]
|
85 |
# ### Create a free trial in Vespa Cloud
|
86 |
+
#
|
87 |
# Create a tenant from [here](https://vespa.ai/free-trial/).
|
88 |
# The trial includes $300 credit.
|
89 |
# Take note of your tenant name.
|
90 |
+
#
|
91 |
|
92 |
# %%
|
93 |
VESPA_TENANT_NAME = "vespa-team"
|
|
|
95 |
# %% [markdown]
|
96 |
# Here, set your desired application name. (Will be created in later steps)
|
97 |
# Note that you cannot have a hyphen `-` or an underscore `_` in the application name.
|
98 |
+
#
|
99 |
|
100 |
# %%
|
101 |
+
VESPA_APPLICATION_NAME = "colpalidemo"
|
102 |
VESPA_SCHEMA_NAME = "pdf_page"
|
103 |
|
104 |
# %% [markdown]
|
105 |
# Next, you need to create some tokens for feeding data, and querying the application.
|
106 |
# We recommend separate tokens for feeding and querying, (the former with write permission, and the latter with read permission).
|
107 |
# The tokens can be created from the [Vespa Cloud console](https://console.vespa-cloud.com/) in the 'Account' -> 'Tokens' section.
|
108 |
+
#
|
109 |
|
110 |
# %%
|
111 |
VESPA_TOKEN_ID_WRITE = "colpalidemo_write"
|
|
|
113 |
|
114 |
# %% [markdown]
|
115 |
# We also need to set the value of the write token to be able to feed data to the Vespa application.
|
116 |
+
#
|
117 |
|
118 |
# %%
|
119 |
VESPA_CLOUD_SECRET_TOKEN = os.getenv("VESPA_CLOUD_SECRET_TOKEN") or input(
|
|
|
124 |
# We will also use the Gemini API to create sample queries for our images.
|
125 |
# You can also use other VLM's to create these queries.
|
126 |
# Create a Gemini API key from [here](https://aistudio.google.com/app/apikey).
|
127 |
+
#
|
128 |
|
129 |
# %%
|
130 |
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY") or input(
|
|
|
152 |
|
153 |
# %% [markdown]
|
154 |
# ## 1. Download PDFs
|
155 |
+
#
|
156 |
# We are going to use public reports from the Norwegian Government Pension Fund Global (also known as the Oil Fund).
|
157 |
# The fund puts transparency at the forefront and publishes reports on its investments, holdings, and returns, as well as its strategy and governance.
|
158 |
+
#
|
159 |
# These reports are the ones we are going to use for this showcase.
|
160 |
# Here are some sample images:
|
161 |
+
#
|
162 |
# ![Sample1](./static/img/gfpg-sample-1.png)
|
163 |
# ![Sample2](./static/img/gfpg-sample-2.png)
|
164 |
+
#
|
165 |
|
166 |
# %% [markdown]
|
167 |
# As we can see, a lot of the information is in the form of tables, charts and numbers.
|
168 |
# These are not easily extractable using pdf-readers or OCR tools.
|
169 |
+
#
|
170 |
|
171 |
# %%
|
172 |
import requests
|
|
|
180 |
soup = BeautifulSoup(html_content, "html.parser")
|
181 |
|
182 |
links = []
|
183 |
+
url_to_year = {}
|
184 |
|
185 |
+
# Find all 'div's with id starting with 'year-'
|
186 |
+
for year_div in soup.find_all("div", id=lambda x: x and x.startswith("year-")):
|
187 |
+
year_id = year_div.get("id", "")
|
188 |
+
year = year_id.replace("year-", "")
|
189 |
+
|
190 |
+
# Within this div, find all 'a' elements with the specific classes
|
191 |
+
for a_tag in year_div.select("a.button.button--download-secondary[href]"):
|
192 |
href = a_tag["href"]
|
193 |
full_url = urljoin(url, href)
|
194 |
links.append(full_url)
|
195 |
+
url_to_year[full_url] = year
|
196 |
+
links, url_to_year
|
197 |
|
198 |
# %%
|
199 |
# Limit the number of PDFs to download
|
|
|
278 |
|
279 |
# %% [markdown]
|
280 |
# ## 2. Convert PDFs to Images
|
281 |
+
#
|
282 |
+
|
283 |
|
284 |
# %%
|
285 |
def get_pdf_images(pdf_path):
|
|
|
305 |
pdf_pages.append(
|
306 |
{
|
307 |
"title": title,
|
308 |
+
"year": int(url_to_year[pdf["url"]]),
|
309 |
"url": pdf["url"],
|
310 |
"path": pdf_file,
|
311 |
"image": image,
|
|
|
330 |
|
331 |
# %% [markdown]
|
332 |
# ## 3. Generate Queries
|
333 |
+
#
|
334 |
# In this step, we want to generate queries for each page image.
|
335 |
# These will be useful for 2 reasons:
|
336 |
+
#
|
337 |
# 1. We can use these queries as typeahead suggestions in the search bar.
|
338 |
# 2. We can use the queries to generate an evaluation dataset. See [Improving Retrieval with LLM-as-a-judge](https://blog.vespa.ai/improving-retrieval-with-llm-as-a-judge/) for a deeper dive into this topic.
|
339 |
+
#
|
340 |
# The prompt for generating queries is taken from [this](https://danielvanstrien.xyz/posts/post-with-code/colpali/2024-09-23-generate_colpali_dataset.html#an-update-retrieval-focused-prompt) wonderful blog post by Daniel van Strien.
|
341 |
+
#
|
342 |
# We will use the Gemini API to generate these queries, with `gemini-1.5-flash-8b` as the model.
|
343 |
+
#
|
344 |
|
345 |
# %%
|
346 |
from pydantic import BaseModel
|
|
|
419 |
}
|
420 |
return queries
|
421 |
|
422 |
+
|
423 |
# %%
|
424 |
for pdf in tqdm(pdf_pages):
|
425 |
image = pdf.get("image")
|
|
|
495 |
|
496 |
# %% [markdown]
|
497 |
# ## 4. Generate embeddings
|
498 |
+
#
|
499 |
# Now that we have the queries, we can use the ColPali model to generate embeddings for each page image.
|
500 |
+
#
|
501 |
+
|
502 |
|
503 |
# %%
|
504 |
def generate_embeddings(images, model, processor, batch_size=2) -> np.ndarray:
|
|
|
538 |
all_embeddings = np.concatenate(embeddings_list, axis=0)
|
539 |
return all_embeddings
|
540 |
|
541 |
+
|
542 |
# %%
|
543 |
# Generate embeddings for all images
|
544 |
images = [pdf["image"] for pdf in pdf_pages]
|
|
|
549 |
|
550 |
# %% [markdown]
|
551 |
# ## 5. Prepare Data in Vespa Format
|
552 |
+
#
|
553 |
# Now that we have all the data we need, all that remains is to make sure it is in the right format for Vespa.
|
554 |
+
#
|
555 |
+
|
556 |
|
557 |
# %%
|
558 |
def float_to_binary_embedding(float_query_embedding: dict) -> dict:
|
|
|
565 |
binary_query_embeddings[k] = binary_vector
|
566 |
return binary_query_embeddings
|
567 |
|
568 |
+
|
569 |
# %%
|
570 |
vespa_feed = []
|
571 |
for pdf, embedding in zip(pdf_pages, embeddings):
|
572 |
url = pdf["url"]
|
573 |
+
year = pdf["year"]
|
574 |
title = pdf["title"]
|
575 |
image = pdf["image"]
|
576 |
text = pdf.get("text", "")
|
|
|
592 |
"id": id_hash,
|
593 |
"url": url,
|
594 |
"title": title,
|
595 |
+
"year": year,
|
596 |
"page_number": page_no,
|
597 |
"blur_image": base_64_image,
|
598 |
"full_image": base_64_full_image,
|
|
|
629 |
|
630 |
# %% [markdown]
|
631 |
# ## 5. Prepare Vespa Application
|
632 |
+
#
|
633 |
|
634 |
# %%
|
635 |
# Define the Vespa schema
|
|
|
644 |
match=["word"],
|
645 |
),
|
646 |
Field(name="url", type="string", indexing=["summary", "index"]),
|
647 |
+
Field(name="year", type="int", indexing=["summary", "attribute"]),
|
648 |
Field(
|
649 |
name="title",
|
650 |
type="string",
|
|
|
734 |
DocumentSummary(
|
735 |
name="suggestions",
|
736 |
summary_fields=[
|
737 |
+
Summary(name="questions"),
|
|
|
|
|
738 |
],
|
739 |
from_disk=True,
|
740 |
),
|
|
|
768 |
# Define the 'bm25' rank profile
|
769 |
colpali_bm25_profile = RankProfile(
|
770 |
name="bm25",
|
771 |
+
inputs=[("query(qt)", "tensor<float>(querytoken{}, v[128])")],
|
772 |
first_phase="bm25(title) + bm25(text)",
|
773 |
functions=mapfunctions,
|
774 |
)
|
775 |
|
776 |
+
|
777 |
# A function to create an inherited rank profile which also returns quantized similarity scores
|
778 |
def with_quantized_similarity(rank_profile: RankProfile) -> RankProfile:
|
779 |
return RankProfile(
|
|
|
783 |
summary_features=["quantized"],
|
784 |
)
|
785 |
|
786 |
+
|
787 |
colpali_schema.add_rank_profile(colpali_bm25_profile)
|
788 |
colpali_schema.add_rank_profile(with_quantized_similarity(colpali_bm25_profile))
|
789 |
|
|
|
955 |
|
956 |
# %% [markdown]
|
957 |
# ## 6. Deploy Vespa Application
|
958 |
+
#
|
959 |
|
960 |
# %%
|
961 |
VESPA_TEAM_API_KEY = os.getenv("VESPA_TEAM_API_KEY") or input(
|
|
|
980 |
# %% [markdown]
|
981 |
# Make sure to take note of the token endpoint_url.
|
982 |
# You need to put this in your `.env` file - `VESPA_APP_URL=https://abcd.vespa-app.cloud` - to access the Vespa application from your web application.
|
983 |
+
#
|
984 |
|
985 |
# %% [markdown]
|
986 |
# ## 8. Feed Data to Vespa
|
987 |
+
#
|
988 |
|
989 |
# %%
|
990 |
# Instantiate Vespa connection using token
|
991 |
app = Vespa(url=endpoint_url, vespa_cloud_secret_token=VESPA_CLOUD_SECRET_TOKEN)
|
992 |
app.get_application_status()
|
993 |
|
994 |
+
|
995 |
# %%
|
996 |
def callback(response: VespaResponse, id: str):
|
997 |
if not response.is_successful():
|
|
|
1002 |
|
1003 |
# Feed data into Vespa asynchronously
|
1004 |
app.feed_async_iterable(vespa_feed, schema=VESPA_SCHEMA_NAME, callback=callback)
|
|
|
|
requirements.txt
CHANGED
@@ -24,8 +24,15 @@ attrs==24.2.0
|
|
24 |
# via aiohttp
|
25 |
beautifulsoup4==4.12.3
|
26 |
# via python-fasthtml
|
|
|
|
|
27 |
cachetools==5.5.0
|
28 |
# via google-auth
|
|
|
|
|
|
|
|
|
|
|
29 |
certifi==2024.8.30
|
30 |
# via
|
31 |
# httpcore
|
@@ -39,16 +46,27 @@ click==8.1.7
|
|
39 |
# via
|
40 |
# typer
|
41 |
# uvicorn
|
|
|
|
|
42 |
colpali-engine==0.3.1
|
43 |
# via
|
44 |
# visual-retrieval-colpali (pyproject.toml)
|
45 |
# vidore-benchmark
|
|
|
|
|
|
|
|
|
46 |
contourpy==1.3.0
|
47 |
# via matplotlib
|
48 |
cryptography==43.0.1
|
49 |
# via pyvespa
|
50 |
cycler==0.12.1
|
51 |
# via matplotlib
|
|
|
|
|
|
|
|
|
|
|
52 |
datasets==2.21.0
|
53 |
# via
|
54 |
# mteb
|
@@ -168,11 +186,16 @@ itsdangerous==2.2.0
|
|
168 |
jinja2==3.1.4
|
169 |
# via
|
170 |
# pyvespa
|
|
|
171 |
# torch
|
172 |
joblib==1.4.2
|
173 |
# via scikit-learn
|
174 |
kiwisolver==1.4.7
|
175 |
# via matplotlib
|
|
|
|
|
|
|
|
|
176 |
loguru==0.7.2
|
177 |
# via vidore-benchmark
|
178 |
lucide-fasthtml==0.0.9
|
@@ -181,6 +204,8 @@ lxml==5.3.0
|
|
181 |
# via
|
182 |
# lucide-fasthtml
|
183 |
# pyvespa
|
|
|
|
|
184 |
markdown-it-py==3.0.0
|
185 |
# via rich
|
186 |
markupsafe==2.1.5
|
@@ -201,11 +226,17 @@ multidict==6.1.0
|
|
201 |
# yarl
|
202 |
multiprocess==0.70.16
|
203 |
# via datasets
|
|
|
|
|
|
|
|
|
|
|
204 |
networkx==3.3
|
205 |
# via torch
|
206 |
numpy==1.26.4
|
207 |
# via
|
208 |
# accelerate
|
|
|
209 |
# colpali-engine
|
210 |
# contourpy
|
211 |
# datasets
|
@@ -217,6 +248,8 @@ numpy==1.26.4
|
|
217 |
# scikit-learn
|
218 |
# scipy
|
219 |
# seaborn
|
|
|
|
|
220 |
# transformers
|
221 |
# vidore-benchmark
|
222 |
oauthlib==3.2.2
|
@@ -229,7 +262,10 @@ packaging==24.1
|
|
229 |
# huggingface-hub
|
230 |
# matplotlib
|
231 |
# peft
|
|
|
|
|
232 |
# transformers
|
|
|
233 |
pandas==2.2.3
|
234 |
# via
|
235 |
# datasets
|
@@ -247,8 +283,14 @@ pillow==10.4.0
|
|
247 |
# pdf2image
|
248 |
# sentence-transformers
|
249 |
# vidore-benchmark
|
|
|
|
|
250 |
polars==1.9.0
|
251 |
# via mteb
|
|
|
|
|
|
|
|
|
252 |
proto-plus==1.24.0
|
253 |
# via
|
254 |
# google-ai-generativelanguage
|
@@ -277,8 +319,12 @@ pycparser==2.22
|
|
277 |
# via cffi
|
278 |
pydantic==2.9.2
|
279 |
# via
|
|
|
280 |
# google-generativeai
|
281 |
# mteb
|
|
|
|
|
|
|
282 |
pydantic-core==2.23.4
|
283 |
# via pydantic
|
284 |
pygments==2.18.0
|
@@ -334,7 +380,9 @@ requests==2.32.3
|
|
334 |
# mteb
|
335 |
# pyvespa
|
336 |
# requests-toolbelt
|
|
|
337 |
# transformers
|
|
|
338 |
requests-toolbelt==1.0.0
|
339 |
# via pyvespa
|
340 |
rich==13.9.2
|
@@ -366,27 +414,47 @@ sentence-transformers==3.1.1
|
|
366 |
sentencepiece==0.2.0
|
367 |
# via vidore-benchmark
|
368 |
setuptools==75.1.0
|
369 |
-
# via
|
|
|
|
|
|
|
|
|
370 |
shad4fast==1.2.1
|
371 |
# via visual-retrieval-colpali (pyproject.toml)
|
372 |
shellingham==1.5.4
|
373 |
# via typer
|
374 |
six==1.16.0
|
375 |
# via python-dateutil
|
|
|
|
|
376 |
sniffio==1.3.1
|
377 |
# via
|
378 |
# anyio
|
379 |
# httpx
|
380 |
soupsieve==2.6
|
381 |
# via beautifulsoup4
|
|
|
|
|
|
|
|
|
|
|
|
|
382 |
sqlite-minutils==3.37.0.post3
|
383 |
# via fastlite
|
|
|
|
|
|
|
|
|
|
|
|
|
384 |
starlette==0.39.2
|
385 |
# via python-fasthtml
|
386 |
sympy==1.13.3
|
387 |
# via torch
|
388 |
tenacity==9.0.0
|
389 |
# via pyvespa
|
|
|
|
|
390 |
threadpoolctl==3.5.0
|
391 |
# via scikit-learn
|
392 |
tokenizers==0.20.0
|
@@ -408,6 +476,7 @@ tqdm==4.66.5
|
|
408 |
# mteb
|
409 |
# peft
|
410 |
# sentence-transformers
|
|
|
411 |
# transformers
|
412 |
transformers==4.45.1
|
413 |
# via
|
@@ -416,10 +485,14 @@ transformers==4.45.1
|
|
416 |
# sentence-transformers
|
417 |
# vidore-benchmark
|
418 |
typer==0.12.5
|
419 |
-
# via
|
|
|
|
|
|
|
420 |
typing-extensions==4.12.2
|
421 |
# via
|
422 |
# anyio
|
|
|
423 |
# google-generativeai
|
424 |
# huggingface-hub
|
425 |
# mteb
|
@@ -448,10 +521,19 @@ vespacli==8.391.23
|
|
448 |
# via visual-retrieval-colpali (pyproject.toml)
|
449 |
vidore-benchmark==4.0.0
|
450 |
# via visual-retrieval-colpali (pyproject.toml)
|
|
|
|
|
|
|
|
|
|
|
451 |
watchfiles==0.24.0
|
452 |
# via uvicorn
|
|
|
|
|
453 |
websockets==13.1
|
454 |
# via uvicorn
|
|
|
|
|
455 |
xxhash==3.5.0
|
456 |
# via datasets
|
457 |
yarl==1.13.1
|
|
|
24 |
# via aiohttp
|
25 |
beautifulsoup4==4.12.3
|
26 |
# via python-fasthtml
|
27 |
+
blis==0.7.11
|
28 |
+
# via thinc
|
29 |
cachetools==5.5.0
|
30 |
# via google-auth
|
31 |
+
catalogue==2.0.10
|
32 |
+
# via
|
33 |
+
# spacy
|
34 |
+
# srsly
|
35 |
+
# thinc
|
36 |
certifi==2024.8.30
|
37 |
# via
|
38 |
# httpcore
|
|
|
46 |
# via
|
47 |
# typer
|
48 |
# uvicorn
|
49 |
+
cloudpathlib==0.20.0
|
50 |
+
# via weasel
|
51 |
colpali-engine==0.3.1
|
52 |
# via
|
53 |
# visual-retrieval-colpali (pyproject.toml)
|
54 |
# vidore-benchmark
|
55 |
+
confection==0.1.5
|
56 |
+
# via
|
57 |
+
# thinc
|
58 |
+
# weasel
|
59 |
contourpy==1.3.0
|
60 |
# via matplotlib
|
61 |
cryptography==43.0.1
|
62 |
# via pyvespa
|
63 |
cycler==0.12.1
|
64 |
# via matplotlib
|
65 |
+
cymem==2.0.8
|
66 |
+
# via
|
67 |
+
# preshed
|
68 |
+
# spacy
|
69 |
+
# thinc
|
70 |
datasets==2.21.0
|
71 |
# via
|
72 |
# mteb
|
|
|
186 |
jinja2==3.1.4
|
187 |
# via
|
188 |
# pyvespa
|
189 |
+
# spacy
|
190 |
# torch
|
191 |
joblib==1.4.2
|
192 |
# via scikit-learn
|
193 |
kiwisolver==1.4.7
|
194 |
# via matplotlib
|
195 |
+
langcodes==3.4.1
|
196 |
+
# via spacy
|
197 |
+
language-data==1.2.0
|
198 |
+
# via langcodes
|
199 |
loguru==0.7.2
|
200 |
# via vidore-benchmark
|
201 |
lucide-fasthtml==0.0.9
|
|
|
204 |
# via
|
205 |
# lucide-fasthtml
|
206 |
# pyvespa
|
207 |
+
marisa-trie==1.2.1
|
208 |
+
# via language-data
|
209 |
markdown-it-py==3.0.0
|
210 |
# via rich
|
211 |
markupsafe==2.1.5
|
|
|
226 |
# yarl
|
227 |
multiprocess==0.70.16
|
228 |
# via datasets
|
229 |
+
murmurhash==1.0.10
|
230 |
+
# via
|
231 |
+
# preshed
|
232 |
+
# spacy
|
233 |
+
# thinc
|
234 |
networkx==3.3
|
235 |
# via torch
|
236 |
numpy==1.26.4
|
237 |
# via
|
238 |
# accelerate
|
239 |
+
# blis
|
240 |
# colpali-engine
|
241 |
# contourpy
|
242 |
# datasets
|
|
|
248 |
# scikit-learn
|
249 |
# scipy
|
250 |
# seaborn
|
251 |
+
# spacy
|
252 |
+
# thinc
|
253 |
# transformers
|
254 |
# vidore-benchmark
|
255 |
oauthlib==3.2.2
|
|
|
262 |
# huggingface-hub
|
263 |
# matplotlib
|
264 |
# peft
|
265 |
+
# spacy
|
266 |
+
# thinc
|
267 |
# transformers
|
268 |
+
# weasel
|
269 |
pandas==2.2.3
|
270 |
# via
|
271 |
# datasets
|
|
|
283 |
# pdf2image
|
284 |
# sentence-transformers
|
285 |
# vidore-benchmark
|
286 |
+
pip==24.3.1
|
287 |
+
# via visual-retrieval-colpali (pyproject.toml)
|
288 |
polars==1.9.0
|
289 |
# via mteb
|
290 |
+
preshed==3.0.9
|
291 |
+
# via
|
292 |
+
# spacy
|
293 |
+
# thinc
|
294 |
proto-plus==1.24.0
|
295 |
# via
|
296 |
# google-ai-generativelanguage
|
|
|
319 |
# via cffi
|
320 |
pydantic==2.9.2
|
321 |
# via
|
322 |
+
# confection
|
323 |
# google-generativeai
|
324 |
# mteb
|
325 |
+
# spacy
|
326 |
+
# thinc
|
327 |
+
# weasel
|
328 |
pydantic-core==2.23.4
|
329 |
# via pydantic
|
330 |
pygments==2.18.0
|
|
|
380 |
# mteb
|
381 |
# pyvespa
|
382 |
# requests-toolbelt
|
383 |
+
# spacy
|
384 |
# transformers
|
385 |
+
# weasel
|
386 |
requests-toolbelt==1.0.0
|
387 |
# via pyvespa
|
388 |
rich==13.9.2
|
|
|
414 |
sentencepiece==0.2.0
|
415 |
# via vidore-benchmark
|
416 |
setuptools==75.1.0
|
417 |
+
# via
|
418 |
+
# visual-retrieval-colpali (pyproject.toml)
|
419 |
+
# marisa-trie
|
420 |
+
# spacy
|
421 |
+
# thinc
|
422 |
shad4fast==1.2.1
|
423 |
# via visual-retrieval-colpali (pyproject.toml)
|
424 |
shellingham==1.5.4
|
425 |
# via typer
|
426 |
six==1.16.0
|
427 |
# via python-dateutil
|
428 |
+
smart-open==7.0.5
|
429 |
+
# via weasel
|
430 |
sniffio==1.3.1
|
431 |
# via
|
432 |
# anyio
|
433 |
# httpx
|
434 |
soupsieve==2.6
|
435 |
# via beautifulsoup4
|
436 |
+
spacy==3.7.5
|
437 |
+
# via visual-retrieval-colpali (pyproject.toml)
|
438 |
+
spacy-legacy==3.0.12
|
439 |
+
# via spacy
|
440 |
+
spacy-loggers==1.0.5
|
441 |
+
# via spacy
|
442 |
sqlite-minutils==3.37.0.post3
|
443 |
# via fastlite
|
444 |
+
srsly==2.4.8
|
445 |
+
# via
|
446 |
+
# confection
|
447 |
+
# spacy
|
448 |
+
# thinc
|
449 |
+
# weasel
|
450 |
starlette==0.39.2
|
451 |
# via python-fasthtml
|
452 |
sympy==1.13.3
|
453 |
# via torch
|
454 |
tenacity==9.0.0
|
455 |
# via pyvespa
|
456 |
+
thinc==8.2.5
|
457 |
+
# via spacy
|
458 |
threadpoolctl==3.5.0
|
459 |
# via scikit-learn
|
460 |
tokenizers==0.20.0
|
|
|
476 |
# mteb
|
477 |
# peft
|
478 |
# sentence-transformers
|
479 |
+
# spacy
|
480 |
# transformers
|
481 |
transformers==4.45.1
|
482 |
# via
|
|
|
485 |
# sentence-transformers
|
486 |
# vidore-benchmark
|
487 |
typer==0.12.5
|
488 |
+
# via
|
489 |
+
# spacy
|
490 |
+
# vidore-benchmark
|
491 |
+
# weasel
|
492 |
typing-extensions==4.12.2
|
493 |
# via
|
494 |
# anyio
|
495 |
+
# cloudpathlib
|
496 |
# google-generativeai
|
497 |
# huggingface-hub
|
498 |
# mteb
|
|
|
521 |
# via visual-retrieval-colpali (pyproject.toml)
|
522 |
vidore-benchmark==4.0.0
|
523 |
# via visual-retrieval-colpali (pyproject.toml)
|
524 |
+
wasabi==1.1.3
|
525 |
+
# via
|
526 |
+
# spacy
|
527 |
+
# thinc
|
528 |
+
# weasel
|
529 |
watchfiles==0.24.0
|
530 |
# via uvicorn
|
531 |
+
weasel==0.4.1
|
532 |
+
# via spacy
|
533 |
websockets==13.1
|
534 |
# via uvicorn
|
535 |
+
wrapt==1.16.0
|
536 |
+
# via smart-open
|
537 |
xxhash==3.5.0
|
538 |
# via datasets
|
539 |
yarl==1.13.1
|
vespa_feed_to_hf_dataset.py
ADDED
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
from dotenv import load_dotenv
|
3 |
+
import os
|
4 |
+
import base64
|
5 |
+
from PIL import Image
|
6 |
+
import io
|
7 |
+
from datasets import Dataset, Image as HFImage
|
8 |
+
from pathlib import Path
|
9 |
+
from tqdm import tqdm
|
10 |
+
|
11 |
+
load_dotenv()
|
12 |
+
|
13 |
+
df = pd.read_json("output/vespa_feed_full.jsonl", lines=True)
|
14 |
+
df = pd.json_normalize(df["fields"].tolist())
|
15 |
+
|
16 |
+
dataset_dir = Path("hf_dataset")
|
17 |
+
image_dir = dataset_dir / "images"
|
18 |
+
os.makedirs(image_dir, exist_ok=True)
|
19 |
+
|
20 |
+
|
21 |
+
def save_image(image_data, filename):
|
22 |
+
img_data = base64.b64decode(image_data)
|
23 |
+
img = Image.open(io.BytesIO(img_data))
|
24 |
+
img.save(filename)
|
25 |
+
|
26 |
+
|
27 |
+
for idx, row in tqdm(df.iterrows()):
|
28 |
+
blur_filename = os.path.join(image_dir, f"blur_{idx}.jpg")
|
29 |
+
full_filename = os.path.join(image_dir, f"full_{idx}.jpg")
|
30 |
+
save_image(row["blur_image"], blur_filename)
|
31 |
+
save_image(row["full_image"], full_filename)
|
32 |
+
df.at[idx, "blur_image"] = blur_filename
|
33 |
+
df.at[idx, "full_image"] = full_filename
|
34 |
+
|
35 |
+
|
36 |
+
# Step 3: Convert to Hugging Face Dataset
|
37 |
+
dataset = (
|
38 |
+
Dataset.from_dict(df.to_dict(orient="list"))
|
39 |
+
.cast_column("blur_image", HFImage())
|
40 |
+
.cast_column("full_image", HFImage())
|
41 |
+
)
|
42 |
+
dataset.push_to_hub("vespa-engine/gpfg-QA", private=True)
|