adowu committed on
Commit
e0f90ab
verified
1 Parent(s): 9b7aea8

Update database.py

Browse files
Files changed (1) hide show
  1. database.py +13 -7
database.py CHANGED
@@ -27,12 +27,12 @@ class KodeksProcessor:
27
 
28
  def extract_metadata(self, text: str) -> Dict:
29
  metadata = {}
30
- dz_u_match = re.search(r'Dz\\.U\\.(\\d{4})\\.(\\d+)\\.(\\d+)', text)
31
  if dz_u_match:
32
  metadata['dz_u'] = f"Dz.U.{dz_u_match.group(1)}.{dz_u_match.group(2)}.{dz_u_match.group(3)}"
33
  metadata['rok'] = dz_u_match.group(1)
34
 
35
- nazwa_match = re.search(r'USTAWA\\s+z dnia(.*?)\\n(.*?)\\n', text)
36
  if nazwa_match:
37
  metadata['data_ustawy'] = nazwa_match.group(1).strip()
38
  metadata['nazwa'] = nazwa_match.group(2).strip()
@@ -47,10 +47,10 @@ class KodeksProcessor:
47
  return "", text
48
 
49
  def process_article(self, article_text: str) -> Dict:
50
- art_num_match = re.match(r'Art\\.\\s*(\\d+)', article_text)
51
  article_num = art_num_match.group(1) if art_num_match else ""
52
 
53
- paragraphs = re.findall(r'§\\s*(\\d+)[.\\s]+(.*?)(?=§\\s*\\d+|$)', article_text, re.DOTALL)
54
 
55
  if not paragraphs:
56
  return {
@@ -67,7 +67,7 @@ class KodeksProcessor:
67
 
68
  def split_into_chunks(self, text: str, metadata: Dict) -> List[Dict]:
69
  chunks = []
70
- chapters = re.split(r'(Rozdział \\d+\\n\\n[^\\n]+)\\n', text)
71
  current_chapter = ""
72
 
73
  for i, section in enumerate(chapters):
@@ -75,7 +75,7 @@ class KodeksProcessor:
75
  current_chapter = section.strip()
76
  continue
77
 
78
- articles = re.split(r'(Art\\.\\s*\\d+.*?)(?=Art\\.\\s*\\d+|$)', section)
79
 
80
  for article in articles:
81
  if not article.strip():
@@ -123,6 +123,7 @@ class KodeksProcessor:
123
  metadatas=[chunk["metadata"]],
124
  ids=[f"{metadata['filename']}_{chunk['metadata']['article']}_{i}"]
125
  )
 
126
 
127
  logging.info("Dodano %d chunków z pliku %s", len(chunks), metadata['filename'])
128
 
@@ -140,4 +141,9 @@ class KodeksProcessor:
140
  n_results=n_results
141
  )
142
  logging.info("Znaleziono %d wyników dla zapytania: %s", len(results['documents'][0]), query)
143
- return results
 
 
 
 
 
 
27
 
28
  def extract_metadata(self, text: str) -> Dict:
29
  metadata = {}
30
+ dz_u_match = re.search(r'Dz\.U\.(\d{4})\.(\d+)\.(\d+)', text)
31
  if dz_u_match:
32
  metadata['dz_u'] = f"Dz.U.{dz_u_match.group(1)}.{dz_u_match.group(2)}.{dz_u_match.group(3)}"
33
  metadata['rok'] = dz_u_match.group(1)
34
 
35
+ nazwa_match = re.search(r'USTAWA\s+z dnia(.*?)\n(.*?)\n', text)
36
  if nazwa_match:
37
  metadata['data_ustawy'] = nazwa_match.group(1).strip()
38
  metadata['nazwa'] = nazwa_match.group(2).strip()
 
47
  return "", text
48
 
49
  def process_article(self, article_text: str) -> Dict:
50
+ art_num_match = re.match(r'Art\.\s*(\d+)', article_text)
51
  article_num = art_num_match.group(1) if art_num_match else ""
52
 
53
+ paragraphs = re.findall(r'§\s*(\d+)[.\s]+(.*?)(?=§\s*\d+|$)', article_text, re.DOTALL)
54
 
55
  if not paragraphs:
56
  return {
 
67
 
68
  def split_into_chunks(self, text: str, metadata: Dict) -> List[Dict]:
69
  chunks = []
70
+ chapters = re.split(r'(Rozdział \d+\n\n[^\n]+)\n', text)
71
  current_chapter = ""
72
 
73
  for i, section in enumerate(chapters):
 
75
  current_chapter = section.strip()
76
  continue
77
 
78
+ articles = re.split(r'(Art\.\s*\d+.*?)(?=Art\.\s*\d+|$)', section)
79
 
80
  for article in articles:
81
  if not article.strip():
 
123
  metadatas=[chunk["metadata"]],
124
  ids=[f"{metadata['filename']}_{chunk['metadata']['article']}_{i}"]
125
  )
126
+ logging.info("Dodano chunk: %s", chunk["text"]) # Logowanie dodawanych chunków
127
 
128
  logging.info("Dodano %d chunków z pliku %s", len(chunks), metadata['filename'])
129
 
 
141
  n_results=n_results
142
  )
143
  logging.info("Znaleziono %d wyników dla zapytania: %s", len(results['documents'][0]), query)
144
+ return results
145
+
146
+ def list_all_documents(self) -> None:
147
+ all_docs = self.collection.query(query_texts=[""], n_results=1000) # Pobierz wszystkie dokumenty
148
+ for doc in all_docs['documents'][0]:
149
+ logging.info("Dokument: %s", doc)