adowu committed on
Commit
150d1ad
verified
1 Parent(s): 514204a

Update database.py

Browse files
Files changed (1) hide show
  1. database.py +99 -37
database.py CHANGED
@@ -11,19 +11,31 @@ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(
11
 
12
  class KodeksProcessor:
13
  def __init__(self):
14
- logging.info("Inicjalizacja klienta bazy danych...")
 
 
 
 
15
  self.client = chromadb.PersistentClient(path=DATABASE_DIR)
 
 
16
  try:
17
  self.collection = self.client.get_collection("kodeksy")
18
- logging.info("Pobrano istniej膮c膮 kolekcj臋 'kodeksy'.")
19
- except:
20
- self.collection = self.client.create_collection(
21
- name="kodeksy",
22
- embedding_function=embedding_functions.SentenceTransformerEmbeddingFunction(
23
- model_name=EMBEDDING_MODEL
 
 
 
 
24
  )
25
- )
26
- logging.info("Utworzono now膮 kolekcj臋 'kodeksy'.")
 
 
27
 
28
  def extract_metadata(self, text: str) -> Dict:
29
  metadata = {}
@@ -37,6 +49,19 @@ class KodeksProcessor:
37
  metadata['data_ustawy'] = nazwa_match.group(1).strip()
38
  metadata['nazwa'] = nazwa_match.group(2).strip()
39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  logging.info("Wydobyto metadane: %s", metadata)
41
  return metadata
42
 
@@ -47,7 +72,7 @@ class KodeksProcessor:
47
  return "", text
48
 
49
  def process_article(self, article_text: str) -> Dict:
50
- art_num_match = re.match(r'Art\.\s*(\d+)', article_text)
51
  article_num = art_num_match.group(1) if art_num_match else ""
52
 
53
  paragraphs = re.findall(r'搂\s*(\d+)\.\s*(.*?)(?=搂\s*\d+|Art\.\s*\d+|$)', article_text, re.DOTALL)
@@ -67,9 +92,9 @@ class KodeksProcessor:
67
 
68
  def split_into_chunks(self, text: str, metadata: Dict) -> List[Dict]:
69
  chunks = []
70
- articles = re.split(r'(Art\.\s*\d+)', text) # Podzia艂 na artyku艂y
71
 
72
- for i in range(1, len(articles), 2): # Przechodzimy przez artyku艂y
73
  article_title = articles[i].strip()
74
  article_content = articles[i + 1].strip() if i + 1 < len(articles) else ""
75
 
@@ -98,8 +123,12 @@ class KodeksProcessor:
98
  def process_file(self, filepath: str) -> None:
99
  logging.info("Przetwarzanie pliku: %s", filepath)
100
 
101
- with open(filepath, 'r', encoding='utf-8') as file:
102
- content = file.read()
 
 
 
 
103
 
104
  header, main_content = self.split_header_and_content(content)
105
  metadata = self.extract_metadata(main_content)
@@ -107,16 +136,19 @@ class KodeksProcessor:
107
 
108
  chunks = self.split_into_chunks(main_content, metadata)
109
 
110
- if chunks: # Sprawdzenie, czy s膮 jakie艣 chunk'i do dodania
111
- for i, chunk in enumerate(chunks):
112
- self.collection.add(
113
- documents=[chunk["text"]],
114
- metadatas=[chunk["metadata"]],
115
- ids=[f"{metadata['filename']}_{chunk['metadata']['article']}_{i}"]
116
- )
117
- logging.info("Dodano chunk: %s", chunk["text"]) # Logowanie dodawanych chunk贸w
 
 
 
118
  else:
119
- logging.warning("Brak chunk贸w do dodania z pliku: %s", filepath) # Logowanie braku chunk贸w
120
 
121
  logging.info("Dodano %d chunk贸w z pliku %s", len(chunks), metadata['filename'])
122
 
@@ -125,23 +157,53 @@ class KodeksProcessor:
125
  for filename in os.listdir(directory):
126
  if filename.endswith('.txt'):
127
  filepath = os.path.join(directory, filename)
128
- logging.info("Przetwarzanie pliku: %s", filepath) # Logowanie przetwarzania pliku
129
  self.process_file(filepath)
130
  logging.info("Zako艅czono przetwarzanie plik贸w.")
131
 
132
- def search(self, query: str, n_results: int = 3) -> Dict:
133
  logging.info("Wyszukiwanie w bazie danych dla zapytania: %s", query)
134
- results = self.collection.query(
135
- query_texts=[query],
136
- n_results=n_results
137
- )
138
- logging.info("Znaleziono %d wynik贸w dla zapytania: %s", len(results['documents'][0]), query)
139
- return results
 
 
 
 
 
140
 
141
  def list_all_documents(self) -> None:
142
- all_docs = self.collection.query(query_texts=[""], n_results=1000) # Pobierz wszystkie dokumenty
143
- if all_docs['documents']:
144
- for doc in all_docs['documents'][0]:
145
- logging.info("Dokument: %s", doc)
146
- else:
147
- logging.info("Brak dokument贸w w bazie.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
  class KodeksProcessor:
13
  def __init__(self):
14
+ logging.info(f"Inicjalizacja klienta bazy danych w katalogu: {DATABASE_DIR}")
15
+ if not os.path.exists(DATABASE_DIR):
16
+ os.makedirs(DATABASE_DIR)
17
+ logging.info(f"Utworzono katalog {DATABASE_DIR}")
18
+
19
  self.client = chromadb.PersistentClient(path=DATABASE_DIR)
20
+ logging.info("Klient bazy danych zainicjalizowany")
21
+
22
  try:
23
  self.collection = self.client.get_collection("kodeksy")
24
+ logging.info("Pobrano istniej膮c膮 kolekcj臋 'kodeksy'")
25
+ except Exception as e:
26
+ logging.error(f"B艂膮d podczas pobierania kolekcji: {e}")
27
+ logging.info("Pr贸ba utworzenia nowej kolekcji 'kodeksy'")
28
+ try:
29
+ self.collection = self.client.create_collection(
30
+ name="kodeksy",
31
+ embedding_function=embedding_functions.SentenceTransformerEmbeddingFunction(
32
+ model_name=EMBEDDING_MODEL
33
+ )
34
  )
35
+ logging.info("Utworzono now膮 kolekcj臋 'kodeksy'")
36
+ except Exception as e:
37
+ logging.error(f"B艂膮d podczas tworzenia kolekcji: {e}")
38
+ raise
39
 
40
  def extract_metadata(self, text: str) -> Dict:
41
  metadata = {}
 
49
  metadata['data_ustawy'] = nazwa_match.group(1).strip()
50
  metadata['nazwa'] = nazwa_match.group(2).strip()
51
 
52
+ # Dodanie przetwarzania historii zmian
53
+ zmiany = re.findall(r'(\d{4}-\d{2}-\d{2})\s+(zm\.\s+DZ\.U\.(\d{4})\.(\d+)\.(\d+)\s+art\.\s+(\d+)(?:\s+搂\s+(\d+))?)', text)
54
+ if zmiany:
55
+ metadata['historia_zmian'] = [
56
+ {
57
+ 'data': data,
58
+ 'dz_u': f"Dz.U.{rok}.{numer}.{pozycja}",
59
+ 'artykul': artykul,
60
+ 'paragraf': paragraf if paragraf else None
61
+ }
62
+ for data, _, rok, numer, pozycja, artykul, paragraf in zmiany
63
+ ]
64
+
65
  logging.info("Wydobyto metadane: %s", metadata)
66
  return metadata
67
 
 
72
  return "", text
73
 
74
  def process_article(self, article_text: str) -> Dict:
75
+ art_num_match = re.match(r'Art\.\s*(\d+[a-z]?)', article_text)
76
  article_num = art_num_match.group(1) if art_num_match else ""
77
 
78
  paragraphs = re.findall(r'搂\s*(\d+)\.\s*(.*?)(?=搂\s*\d+|Art\.\s*\d+|$)', article_text, re.DOTALL)
 
92
 
93
  def split_into_chunks(self, text: str, metadata: Dict) -> List[Dict]:
94
  chunks = []
95
+ articles = re.split(r'(Art\.\s*\d+[a-z]?)', text)
96
 
97
+ for i in range(1, len(articles), 2):
98
  article_title = articles[i].strip()
99
  article_content = articles[i + 1].strip() if i + 1 < len(articles) else ""
100
 
 
123
  def process_file(self, filepath: str) -> None:
124
  logging.info("Przetwarzanie pliku: %s", filepath)
125
 
126
+ try:
127
+ with open(filepath, 'r', encoding='utf-8') as file:
128
+ content = file.read()
129
+ except Exception as e:
130
+ logging.error(f"B艂膮d podczas odczytu pliku {filepath}: {e}")
131
+ return
132
 
133
  header, main_content = self.split_header_and_content(content)
134
  metadata = self.extract_metadata(main_content)
 
136
 
137
  chunks = self.split_into_chunks(main_content, metadata)
138
 
139
+ if chunks:
140
+ try:
141
+ for i, chunk in enumerate(chunks):
142
+ self.collection.add(
143
+ documents=[chunk["text"]],
144
+ metadatas=[chunk["metadata"]],
145
+ ids=[f"{metadata['filename']}_{chunk['metadata']['article']}_{i}"]
146
+ )
147
+ logging.info(f"Dodano chunk: {chunk['text'][:100]}...") # Logowanie pierwszych 100 znak贸w chunka
148
+ except Exception as e:
149
+ logging.error(f"B艂膮d podczas dodawania chunk贸w do kolekcji: {e}")
150
  else:
151
+ logging.warning(f"Brak chunk贸w do dodania z pliku: {filepath}")
152
 
153
  logging.info("Dodano %d chunk贸w z pliku %s", len(chunks), metadata['filename'])
154
 
 
157
  for filename in os.listdir(directory):
158
  if filename.endswith('.txt'):
159
  filepath = os.path.join(directory, filename)
 
160
  self.process_file(filepath)
161
  logging.info("Zako艅czono przetwarzanie plik贸w.")
162
 
163
+ def search(self, query: str, n_results: int = 3, filters: Dict = None) -> Dict:
164
  logging.info("Wyszukiwanie w bazie danych dla zapytania: %s", query)
165
+ try:
166
+ results = self.collection.query(
167
+ query_texts=[query],
168
+ n_results=n_results,
169
+ where=filters
170
+ )
171
+ logging.info("Znaleziono %d wynik贸w dla zapytania: %s", len(results['documents'][0]), query)
172
+ return results
173
+ except Exception as e:
174
+ logging.error(f"B艂膮d podczas wyszukiwania: {e}")
175
+ return {"documents": [[]], "metadatas": [[]], "distances": [[]]}
176
 
177
  def list_all_documents(self) -> None:
178
+ try:
179
+ all_docs = self.collection.get(include=['metadatas'])
180
+ if all_docs['metadatas']:
181
+ for metadata in all_docs['metadatas']:
182
+ logging.info("Dokument: %s", metadata)
183
+ else:
184
+ logging.info("Brak dokument贸w w bazie.")
185
+ except Exception as e:
186
+ logging.error(f"B艂膮d podczas listowania dokument贸w: {e}")
187
+
188
+ def update_document(self, id: str, new_text: str, new_metadata: Dict) -> None:
189
+ try:
190
+ self.collection.update(
191
+ ids=[id],
192
+ documents=[new_text],
193
+ metadatas=[new_metadata]
194
+ )
195
+ logging.info(f"Zaktualizowano dokument o id: {id}")
196
+ except Exception as e:
197
+ logging.error(f"B艂膮d podczas aktualizacji dokumentu {id}: {e}")
198
+
199
+ def delete_document(self, id: str) -> None:
200
+ try:
201
+ self.collection.delete(ids=[id])
202
+ logging.info(f"Usuni臋to dokument o id: {id}")
203
+ except Exception as e:
204
+ logging.error(f"B艂膮d podczas usuwania dokumentu {id}: {e}")
205
+
206
def _main() -> None:
    """Build the database from the bundled statute files and log what was stored."""
    processor = KodeksProcessor()
    processor.process_all_files("data/kodeksy")
    processor.list_all_documents()


if __name__ == "__main__":
    _main()
+ processor.list_all_documents()