Update database.py
Browse files- database.py +25 -35
database.py
CHANGED
@@ -50,7 +50,7 @@ class KodeksProcessor:
|
|
50 |
art_num_match = re.match(r'Art\.\s*(\d+)', article_text)
|
51 |
article_num = art_num_match.group(1) if art_num_match else ""
|
52 |
|
53 |
-
paragraphs = re.findall(r'§\s*(\d+)
|
54 |
|
55 |
if not paragraphs:
|
56 |
return {
|
@@ -67,40 +67,30 @@ class KodeksProcessor:
|
|
67 |
|
68 |
def split_into_chunks(self, text: str, metadata: Dict) -> List[Dict]:
|
69 |
chunks = []
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
for par_num, par_content in processed_article["paragraphs"]:
|
95 |
-
chunks.append({
|
96 |
-
"text": f"Art. {processed_article['article_num']} § {par_num}. {par_content}",
|
97 |
-
"metadata": {**chunk_metadata, "paragraph": par_num}
|
98 |
-
})
|
99 |
-
else:
|
100 |
-
chunks.append({
|
101 |
-
"text": processed_article["content"],
|
102 |
-
"metadata": chunk_metadata
|
103 |
-
})
|
104 |
|
105 |
logging.info("Podzielono tekst na %d chunków.", len(chunks))
|
106 |
return chunks
|
|
|
50 |
art_num_match = re.match(r'Art\.\s*(\d+)', article_text)
|
51 |
article_num = art_num_match.group(1) if art_num_match else ""
|
52 |
|
53 |
+
paragraphs = re.findall(r'§\s*(\d+)\.\s*(.*?)(?=§\s*\d+|Art\.\s*\d+|$)', article_text, re.DOTALL)
|
54 |
|
55 |
if not paragraphs:
|
56 |
return {
|
|
|
67 |
|
68 |
def split_into_chunks(self, text: str, metadata: Dict) -> List[Dict]:
|
69 |
chunks = []
|
70 |
+
articles = re.split(r'(Art\.\s*\d+)', text) # Podział na artykuły
|
71 |
+
|
72 |
+
for i in range(1, len(articles), 2): # Przechodzimy przez artyku艂y
|
73 |
+
article_title = articles[i].strip()
|
74 |
+
article_content = articles[i + 1].strip() if i + 1 < len(articles) else ""
|
75 |
+
|
76 |
+
processed_article = self.process_article(article_title + " " + article_content)
|
77 |
+
|
78 |
+
chunk_metadata = {
|
79 |
+
**metadata,
|
80 |
+
"article": processed_article["article_num"]
|
81 |
+
}
|
82 |
+
|
83 |
+
if processed_article["has_paragraphs"]:
|
84 |
+
for par_num, par_content in processed_article["paragraphs"]:
|
85 |
+
chunks.append({
|
86 |
+
"text": f"{article_title} §{par_num}. {par_content.strip()}",
|
87 |
+
"metadata": {**chunk_metadata, "paragraph": par_num}
|
88 |
+
})
|
89 |
+
else:
|
90 |
+
chunks.append({
|
91 |
+
"text": processed_article["content"],
|
92 |
+
"metadata": chunk_metadata
|
93 |
+
})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
|
95 |
logging.info("Podzielono tekst na %d chunków.", len(chunks))
|
96 |
return chunks
|