adowu committed on
Commit
e6eebe9
verified
1 Parent(s): e0f90ab

Update database.py

Browse files
Files changed (1) hide show
  1. database.py +25 -35
database.py CHANGED
@@ -50,7 +50,7 @@ class KodeksProcessor:
50
  art_num_match = re.match(r'Art\.\s*(\d+)', article_text)
51
  article_num = art_num_match.group(1) if art_num_match else ""
52
 
53
- paragraphs = re.findall(r'§\s*(\d+)[.\s]+(.*?)(?=§\s*\d+|$)', article_text, re.DOTALL)
54
 
55
  if not paragraphs:
56
  return {
@@ -67,40 +67,30 @@ class KodeksProcessor:
67
 
68
  def split_into_chunks(self, text: str, metadata: Dict) -> List[Dict]:
69
  chunks = []
70
- chapters = re.split(r'(Rozdział \d+\n\n[^\n]+)\n', text)
71
- current_chapter = ""
72
-
73
- for i, section in enumerate(chapters):
74
- if section.startswith('Rozdział'):
75
- current_chapter = section.strip()
76
- continue
77
-
78
- articles = re.split(r'(Art\.\s*\d+.*?)(?=Art\.\s*\d+|$)', section)
79
-
80
- for article in articles:
81
- if not article.strip():
82
- continue
83
-
84
- if article.startswith('Art.'):
85
- processed_article = self.process_article(article)
86
-
87
- chunk_metadata = {
88
- **metadata,
89
- "chapter": current_chapter,
90
- "article": processed_article["article_num"]
91
- }
92
-
93
- if processed_article["has_paragraphs"]:
94
- for par_num, par_content in processed_article["paragraphs"]:
95
- chunks.append({
96
- "text": f"Art. {processed_article['article_num']} 搂 {par_num}. {par_content}",
97
- "metadata": {**chunk_metadata, "paragraph": par_num}
98
- })
99
- else:
100
- chunks.append({
101
- "text": processed_article["content"],
102
- "metadata": chunk_metadata
103
- })
104
 
105
  logging.info("Podzielono tekst na %d chunków.", len(chunks))
106
  return chunks
 
50
  art_num_match = re.match(r'Art\.\s*(\d+)', article_text)
51
  article_num = art_num_match.group(1) if art_num_match else ""
52
 
53
+ paragraphs = re.findall(r'§\s*(\d+)\.\s*(.*?)(?=§\s*\d+|Art\.\s*\d+|$)', article_text, re.DOTALL)
54
 
55
  if not paragraphs:
56
  return {
 
67
 
68
  def split_into_chunks(self, text: str, metadata: Dict) -> List[Dict]:
69
  chunks = []
70
+ articles = re.split(r'(Art\.\s*\d+)', text) # Podział na artykuły
71
+
72
+ for i in range(1, len(articles), 2): # Przechodzimy przez artyku艂y
73
+ article_title = articles[i].strip()
74
+ article_content = articles[i + 1].strip() if i + 1 < len(articles) else ""
75
+
76
+ processed_article = self.process_article(article_title + " " + article_content)
77
+
78
+ chunk_metadata = {
79
+ **metadata,
80
+ "article": processed_article["article_num"]
81
+ }
82
+
83
+ if processed_article["has_paragraphs"]:
84
+ for par_num, par_content in processed_article["paragraphs"]:
85
+ chunks.append({
86
+ "text": f"{article_title} 搂{par_num}. {par_content.strip()}",
87
+ "metadata": {**chunk_metadata, "paragraph": par_num}
88
+ })
89
+ else:
90
+ chunks.append({
91
+ "text": processed_article["content"],
92
+ "metadata": chunk_metadata
93
+ })
 
 
 
 
 
 
 
 
 
 
94
 
95
  logging.info("Podzielono tekst na %d chunków.", len(chunks))
96
  return chunks