omkar334 committed on
Commit
2468331
·
1 Parent(s): 37f14ac

quantization, reduce chunksize

Browse files
Files changed (3) hide show
  1. client.py +14 -1
  2. preprocessing.py +4 -5
  3. scraper.py +1 -2
client.py CHANGED
@@ -1,7 +1,7 @@
1
  import os
2
 
3
  from dotenv import load_dotenv
4
- from qdrant_client import QdrantClient
5
 
6
  load_dotenv()
7
 
@@ -24,6 +24,13 @@ class HybridClient:
24
  collection_name=collection,
25
  vectors_config=self.qdrant_client.get_fastembed_vector_params(),
26
  sparse_vectors_config=self.qdrant_client.get_fastembed_sparse_vector_params(),
 
 
 
 
 
 
 
27
  )
28
  print(f"--- {collection} collection created")
29
  return collection
@@ -33,6 +40,8 @@ class HybridClient:
33
  documents = []
34
  for chunk in chunks:
35
  documents.append(chunk.pop("text"))
 
 
36
 
37
  self.qdrant_client.add(
38
  collection_name=collection,
@@ -52,3 +61,7 @@ class HybridClient:
52
  # Select and return metadata
53
  # metadata = [hit.metadata for hit in search_result]
54
  return search_result
 
 
 
 
 
1
  import os
2
 
3
  from dotenv import load_dotenv
4
+ from qdrant_client import QdrantClient, models
5
 
6
  load_dotenv()
7
 
 
24
  collection_name=collection,
25
  vectors_config=self.qdrant_client.get_fastembed_vector_params(),
26
  sparse_vectors_config=self.qdrant_client.get_fastembed_sparse_vector_params(),
27
+ quantization_config=models.ScalarQuantization(
28
+ scalar=models.ScalarQuantizationConfig(
29
+ type=models.ScalarType.INT8,
30
+ quantile=0.99,
31
+ always_ram=False,
32
+ ),
33
+ ),
34
  )
35
  print(f"--- {collection} collection created")
36
  return collection
 
40
  documents = []
41
  for chunk in chunks:
42
  documents.append(chunk.pop("text"))
43
+ chunk.pop("color")
44
+ chunk.pop("size")
45
 
46
  self.qdrant_client.add(
47
  collection_name=collection,
 
61
  # Select and return metadata
62
  # metadata = [hit.metadata for hit in search_result]
63
  return search_result
64
+
65
+ def get_chapter_name(self, collection: str):
66
+ points = self.qdrant_client.retrieve(collection_name=collection, ids=[0])
67
+ return points[0]
preprocessing.py CHANGED
@@ -10,14 +10,14 @@ def sort_text(chunks):
10
  right_column = []
11
 
12
  for chunk in chunks:
13
- if chunk["x"] < x_threshold:
14
  left_column.append(chunk)
15
  else:
16
  right_column.append(chunk)
17
 
18
  # Sort the chunks within each column based on the y-coordinate
19
- left_column = sorted(left_column, key=lambda item: item["y"])
20
- right_column = sorted(right_column, key=lambda item: item["y"])
21
 
22
  sorted_text = left_column + right_column
23
  return sorted_text
@@ -75,8 +75,7 @@ def get_chunks(doc):
75
  {
76
  "text": clean_text(text.strip()),
77
  "page": page_num,
78
- "x": block["bbox"][0],
79
- "y": block["bbox"][1],
80
  "color": majority_element(spans, "color"),
81
  "size": majority_element(spans, "size"),
82
  }
 
10
  right_column = []
11
 
12
  for chunk in chunks:
13
+ if chunk["coordinates"][0] < x_threshold:
14
  left_column.append(chunk)
15
  else:
16
  right_column.append(chunk)
17
 
18
  # Sort the chunks within each column based on the y-coordinate
19
+ left_column = sorted(left_column, key=lambda item: item["coordinates"][1])
20
+ right_column = sorted(right_column, key=lambda item: item["coordinates"][1])
21
 
22
  sorted_text = left_column + right_column
23
  return sorted_text
 
75
  {
76
  "text": clean_text(text.strip()),
77
  "page": page_num,
78
+ "coordinates": [round(block["bbox"][0], 1), round(block["bbox"][1], 1)],
 
79
  "color": majority_element(spans, "color"),
80
  "size": majority_element(spans, "size"),
81
  }
scraper.py CHANGED
@@ -70,8 +70,7 @@ async def download(session: aiohttp.ClientSession, url: str, max_retries: int =
70
  async def upload_book(grade, subject, chapters=None):
71
  hclient = HybridClient()
72
 
73
- book = await get_book(grade, subject)
74
- print(type(book))
75
  for collection, pdf in book.items():
76
  print(collection)
77
  chunks = index_pdf(pdf, buffer=True)
 
70
  async def upload_book(grade, subject, chapters=None):
71
  hclient = HybridClient()
72
 
73
+ book = await get_book(grade, subject, chapters)
 
74
  for collection, pdf in book.items():
75
  print(collection)
76
  chunks = index_pdf(pdf, buffer=True)