Update database.py
Browse files- database.py +13 -7
database.py
CHANGED
@@ -27,12 +27,12 @@ class KodeksProcessor:
|
|
27 |
|
28 |
def extract_metadata(self, text: str) -> Dict:
|
29 |
metadata = {}
|
30 |
-
dz_u_match = re.search(r'Dz
|
31 |
if dz_u_match:
|
32 |
metadata['dz_u'] = f"Dz.U.{dz_u_match.group(1)}.{dz_u_match.group(2)}.{dz_u_match.group(3)}"
|
33 |
metadata['rok'] = dz_u_match.group(1)
|
34 |
|
35 |
-
nazwa_match = re.search(r'USTAWA
|
36 |
if nazwa_match:
|
37 |
metadata['data_ustawy'] = nazwa_match.group(1).strip()
|
38 |
metadata['nazwa'] = nazwa_match.group(2).strip()
|
@@ -47,10 +47,10 @@ class KodeksProcessor:
|
|
47 |
return "", text
|
48 |
|
49 |
def process_article(self, article_text: str) -> Dict:
|
50 |
-
art_num_match = re.match(r'Art
|
51 |
article_num = art_num_match.group(1) if art_num_match else ""
|
52 |
|
53 |
-
paragraphs = re.findall(r'
|
54 |
|
55 |
if not paragraphs:
|
56 |
return {
|
@@ -67,7 +67,7 @@ class KodeksProcessor:
|
|
67 |
|
68 |
def split_into_chunks(self, text: str, metadata: Dict) -> List[Dict]:
|
69 |
chunks = []
|
70 |
-
chapters = re.split(r'(Rozdział
|
71 |
current_chapter = ""
|
72 |
|
73 |
for i, section in enumerate(chapters):
|
@@ -75,7 +75,7 @@ class KodeksProcessor:
|
|
75 |
current_chapter = section.strip()
|
76 |
continue
|
77 |
|
78 |
-
articles = re.split(r'(Art
|
79 |
|
80 |
for article in articles:
|
81 |
if not article.strip():
|
@@ -123,6 +123,7 @@ class KodeksProcessor:
|
|
123 |
metadatas=[chunk["metadata"]],
|
124 |
ids=[f"{metadata['filename']}_{chunk['metadata']['article']}_{i}"]
|
125 |
)
|
|
|
126 |
|
127 |
logging.info("Dodano %d chunków z pliku %s", len(chunks), metadata['filename'])
|
128 |
|
@@ -140,4 +141,9 @@ class KodeksProcessor:
|
|
140 |
n_results=n_results
|
141 |
)
|
142 |
logging.info("Znaleziono %d wyników dla zapytania: %s", len(results['documents'][0]), query)
|
143 |
-
return results
|
|
|
|
|
|
|
|
|
|
|
|
27 |
|
28 |
def extract_metadata(self, text: str) -> Dict:
|
29 |
metadata = {}
|
30 |
+
dz_u_match = re.search(r'Dz\.U\.(\d{4})\.(\d+)\.(\d+)', text)
|
31 |
if dz_u_match:
|
32 |
metadata['dz_u'] = f"Dz.U.{dz_u_match.group(1)}.{dz_u_match.group(2)}.{dz_u_match.group(3)}"
|
33 |
metadata['rok'] = dz_u_match.group(1)
|
34 |
|
35 |
+
nazwa_match = re.search(r'USTAWA\s+z dnia(.*?)\n(.*?)\n', text)
|
36 |
if nazwa_match:
|
37 |
metadata['data_ustawy'] = nazwa_match.group(1).strip()
|
38 |
metadata['nazwa'] = nazwa_match.group(2).strip()
|
|
|
47 |
return "", text
|
48 |
|
49 |
def process_article(self, article_text: str) -> Dict:
|
50 |
+
art_num_match = re.match(r'Art\.\s*(\d+)', article_text)
|
51 |
article_num = art_num_match.group(1) if art_num_match else ""
|
52 |
|
53 |
+
paragraphs = re.findall(r'§\s*(\d+)[.\s]+(.*?)(?=§\s*\d+|$)', article_text, re.DOTALL)
|
54 |
|
55 |
if not paragraphs:
|
56 |
return {
|
|
|
67 |
|
68 |
def split_into_chunks(self, text: str, metadata: Dict) -> List[Dict]:
|
69 |
chunks = []
|
70 |
+
chapters = re.split(r'(Rozdział \d+\n\n[^\n]+)\n', text)
|
71 |
current_chapter = ""
|
72 |
|
73 |
for i, section in enumerate(chapters):
|
|
|
75 |
current_chapter = section.strip()
|
76 |
continue
|
77 |
|
78 |
+
articles = re.split(r'(Art\.\s*\d+.*?)(?=Art\.\s*\d+|$)', section)
|
79 |
|
80 |
for article in articles:
|
81 |
if not article.strip():
|
|
|
123 |
metadatas=[chunk["metadata"]],
|
124 |
ids=[f"{metadata['filename']}_{chunk['metadata']['article']}_{i}"]
|
125 |
)
|
126 |
+
logging.info("Dodano chunk: %s", chunk["text"]) # Logowanie dodawanych chunków
|
127 |
|
128 |
logging.info("Dodano %d chunków z pliku %s", len(chunks), metadata['filename'])
|
129 |
|
|
|
141 |
n_results=n_results
|
142 |
)
|
143 |
logging.info("Znaleziono %d wyników dla zapytania: %s", len(results['documents'][0]), query)
|
144 |
+
return results
|
145 |
+
|
146 |
+
def list_all_documents(self) -> None:
|
147 |
+
all_docs = self.collection.query(query_texts=[""], n_results=1000) # Pobierz wszystkie dokumenty
|
148 |
+
for doc in all_docs['documents'][0]:
|
149 |
+
logging.info("Dokument: %s", doc)
|