Update database.py
Browse files- database.py +26 -24
database.py
CHANGED
@@ -118,44 +118,46 @@ class KodeksProcessor:
|
|
118 |
return chunks
|
119 |
|
120 |
def process_file(self, filepath: str) -> None:
|
121 |
-
logger.info(f"
|
122 |
-
|
123 |
try:
|
124 |
with open(filepath, 'r', encoding='utf-8') as file:
|
125 |
content = file.read()
|
126 |
-
|
127 |
-
logger.error(f"Błąd podczas odczytu pliku {filepath}: {e}")
|
128 |
-
return
|
129 |
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
|
134 |
-
|
|
|
135 |
|
136 |
-
|
137 |
-
try:
|
138 |
-
logger.debug(f"Próba dodania {len(chunks)} chunków do kolekcji")
|
139 |
self.collection.add(
|
140 |
documents=[chunk["text"] for chunk in chunks],
|
141 |
metadatas=[chunk["metadata"] for chunk in chunks],
|
142 |
ids=[f"{metadata['filename']}_{chunk['metadata']['article']}_{i}" for i, chunk in enumerate(chunks)]
|
143 |
)
|
144 |
-
logger.
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
logger.warning(f"Brak chunków do dodania z pliku: {filepath}")
|
150 |
|
151 |
def process_all_files(self, directory: str) -> None:
|
152 |
logger.info(f"Rozpoczęcie przetwarzania wszystkich plików w katalogu: {directory}")
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
159 |
|
160 |
def verify_data_loading(self):
|
161 |
count = self.collection.count()
|
|
|
118 |
return chunks
|
119 |
|
120 |
def process_file(self, filepath: str) -> None:
|
121 |
+
logger.info(f"Rozpoczęcie przetwarzania pliku: {filepath}")
|
|
|
122 |
try:
|
123 |
with open(filepath, 'r', encoding='utf-8') as file:
|
124 |
content = file.read()
|
125 |
+
logger.info(f"Odczytano zawartość pliku: {filepath}")
|
|
|
|
|
126 |
|
127 |
+
header, main_content = self.split_header_and_content(content)
|
128 |
+
metadata = self.extract_metadata(main_content)
|
129 |
+
metadata['filename'] = os.path.basename(filepath)
|
130 |
|
131 |
+
chunks = self.split_into_chunks(main_content, metadata)
|
132 |
+
logger.info(f"Podzielono plik na {len(chunks)} chunków")
|
133 |
|
134 |
+
if chunks:
|
|
|
|
|
135 |
self.collection.add(
|
136 |
documents=[chunk["text"] for chunk in chunks],
|
137 |
metadatas=[chunk["metadata"] for chunk in chunks],
|
138 |
ids=[f"{metadata['filename']}_{chunk['metadata']['article']}_{i}" for i, chunk in enumerate(chunks)]
|
139 |
)
|
140 |
+
logger.info(f"Dodano {len(chunks)} chunków do kolekcji z pliku {metadata['filename']}")
|
141 |
+
else:
|
142 |
+
logger.warning(f"Brak chunków do dodania z pliku: {filepath}")
|
143 |
+
except Exception as e:
|
144 |
+
logger.error(f"Błąd podczas przetwarzania pliku {filepath}: {e}")
|
|
|
145 |
|
146 |
def process_all_files(self, directory: str) -> None:
|
147 |
logger.info(f"Rozpoczęcie przetwarzania wszystkich plików w katalogu: {directory}")
|
148 |
+
if not os.path.exists(directory):
|
149 |
+
logger.error(f"Katalog {directory} nie istnieje!")
|
150 |
+
return
|
151 |
+
try:
|
152 |
+
files = [f for f in os.listdir(directory) if f.endswith('.txt')]
|
153 |
+
logger.info(f"Znaleziono {len(files)} plików .txt")
|
154 |
+
for filename in files:
|
155 |
+
filepath = os.path.join(directory, filename)
|
156 |
+
logger.info(f"Przetwarzanie pliku: {filepath}")
|
157 |
+
self.process_file(filepath)
|
158 |
+
logger.info("Zakończono przetwarzanie plików.")
|
159 |
+
except Exception as e:
|
160 |
+
logger.error(f"Błąd podczas przetwarzania plików: {e}")
|
161 |
|
162 |
def verify_data_loading(self):
|
163 |
count = self.collection.count()
|