NEXAS committed on
Commit
1baa189
·
verified ·
1 Parent(s): cef1053

Update src/utils/ingest_image.py

Browse files
Files changed (1) hide show
  1. src/utils/ingest_image.py +49 -49
src/utils/ingest_image.py CHANGED
@@ -1,49 +1,49 @@
1
- import os
2
- import fitz
3
- import chromadb
4
- from chromadb.utils.data_loaders import ImageLoader
5
- from chromadb.utils.embedding_functions import OpenCLIPEmbeddingFunction # type: ignore
6
-
7
- def extract_and_store_images(pdf_path, db_path='image_vdb', images_dir='extracted_images'):
8
- # Step 1: Extract images from PDF
9
- pdf_document = fitz.open(pdf_path)
10
- os.makedirs(images_dir, exist_ok=True)
11
-
12
- for page_num in range(len(pdf_document)):
13
- page = pdf_document.load_page(page_num)
14
- image_list = page.get_images(full=True)
15
-
16
- for image_index, img in enumerate(image_list):
17
- xref = img[0]
18
- base_image = pdf_document.extract_image(xref)
19
- image_bytes = base_image["image"]
20
- image_ext = base_image["ext"]
21
- image_filename = f"{images_dir}/page_{page_num+1}_img_{image_index+1}.{image_ext}"
22
-
23
- with open(image_filename, "wb") as image_file:
24
- image_file.write(image_bytes)
25
- print(f"Saved: {image_filename}")
26
-
27
- print("Image extraction complete.")
28
-
29
- # Step 2: Add extracted images to ChromaDB
30
- chroma_client = chromadb.PersistentClient(path=db_path)
31
- image_loader = ImageLoader()
32
- CLIP = OpenCLIPEmbeddingFunction()
33
- image_vdb = chroma_client.get_or_create_collection(name="image", embedding_function=CLIP, data_loader=image_loader)
34
-
35
- ids = []
36
- uris = []
37
-
38
- for i, filename in enumerate(sorted(os.listdir(images_dir))):
39
- if filename.endswith('.jpeg') or filename.endswith('.png'):
40
- file_path = os.path.join(images_dir, filename)
41
- ids.append(str(i))
42
- uris.append(file_path)
43
-
44
- image_vdb.add(ids=ids, uris=uris)
45
- print("Images added to the database.")
46
-
47
- return image_vdb
48
-
49
- # Example usage
 
1
+ import os
2
+ import fitz
3
+ import chromadb
4
+ from chromadb.utils.data_loaders import ImageLoader
5
+ from chromadb.utils.embedding_functions import OpenCLIPEmbeddingFunction # type: ignore
6
+
7
def extract_and_store_images(pdf_path, db_path=r'.\data\image_vdb', images_dir=r'.\data\extracted_images'):
    """Extract the embedded images of a PDF to disk and index them in ChromaDB.

    Step 1 writes every image referenced by every page of the PDF into
    ``images_dir`` as ``page_<page>_img_<index>.<ext>`` (both 1-based).
    Step 2 registers every ``.jpeg`` / ``.png`` file found in ``images_dir``
    (not only the ones written by this call) in the persistent ChromaDB
    collection named ``"image"``, using an OpenCLIP embedding function and
    an image data loader.

    Args:
        pdf_path: Path of the PDF file to read.
        db_path: Directory of the persistent ChromaDB store.
        images_dir: Directory the extracted images are written to; created
            if it does not exist.

    Returns:
        The ChromaDB collection holding the indexed images.

    NOTE(review): the default paths use Windows-style backslashes; on other
    platforms callers should pass explicit ``db_path`` / ``images_dir``.
    """
    # Step 1: Extract images from PDF
    # The context manager guarantees the PDF handle is released even if an
    # image fails to extract or a file cannot be written.
    with fitz.open(pdf_path) as pdf_document:
        os.makedirs(images_dir, exist_ok=True)

        for page_num, page in enumerate(pdf_document, start=1):
            image_list = page.get_images(full=True)

            for image_index, img in enumerate(image_list, start=1):
                xref = img[0]  # first item of each entry is the image's xref
                base_image = pdf_document.extract_image(xref)
                image_bytes = base_image["image"]
                image_ext = base_image["ext"]
                # os.path.join keeps the separator consistent with the
                # URIs built in step 2 below.
                image_filename = os.path.join(
                    images_dir,
                    f"page_{page_num}_img_{image_index}.{image_ext}",
                )

                with open(image_filename, "wb") as image_file:
                    image_file.write(image_bytes)
                print(f"Saved: {image_filename}")

    print("Image extraction complete.")

    # Step 2: Add extracted images to ChromaDB
    chroma_client = chromadb.PersistentClient(path=db_path)
    image_loader = ImageLoader()
    CLIP = OpenCLIPEmbeddingFunction()
    image_vdb = chroma_client.get_or_create_collection(
        name="image", embedding_function=CLIP, data_loader=image_loader
    )

    ids = []
    uris = []

    # Ids are the position of each file in the sorted directory listing
    # (counting skipped files too), so they stay identical to what earlier
    # runs of this function produced for the same directory contents.
    for i, filename in enumerate(sorted(os.listdir(images_dir))):
        if filename.endswith(('.jpeg', '.png')):
            ids.append(str(i))
            uris.append(os.path.join(images_dir, filename))

    # Adding an empty batch is rejected by ChromaDB, so skip the call when
    # the PDF (and the directory) yielded no indexable images.
    if ids:
        image_vdb.add(ids=ids, uris=uris)
        print("Images added to the database.")
    else:
        print("No images found to add to the database.")

    return image_vdb
48
+
49
+ # Example usage