adowu committed
Commit 514204a
1 Parent(s): e900f04

Update app.py

Files changed (1)
  1. app.py +200 -3
app.py CHANGED
@@ -1,7 +1,204 @@
- import streamlit as st
- from database import KodeksProcessor
- from chatbot import Chatbot
  import os
+ import re
+ import logging
+ import streamlit as st
+ import chromadb
+ from chromadb.utils import embedding_functions
+ from huggingface_hub import InferenceClient
+ from dotenv import load_dotenv
+ from typing import List, Dict, Tuple, Iterator
+
+ # Logging configuration
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+ # Load environment variables
+ load_dotenv()
+
+ # API configuration
+ HF_TOKEN = os.getenv('HF_TOKEN')
+ MODEL_NAME = "Qwen/Qwen2.5-72B-Instruct"
+
+ # Database configuration
+ DATABASE_DIR = "chroma_db"
+
+ # Embedding model configuration
+ EMBEDDING_MODEL = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
+
+ # System prompt
+ SYSTEM_PROMPT = """Jesteś asystentem prawniczym specjalizującym się w polskim prawie.
+ Twoje odpowiedzi opierają się na aktualnych przepisach prawnych.
+ Zawsze cytuj konkretne artykuły i paragrafy z odpowiednich ustaw."""
+
+ class KodeksProcessor:
+     def __init__(self):
+         logging.info("Inicjalizacja klienta bazy danych...")
+         self.client = chromadb.PersistentClient(path=DATABASE_DIR)
+         # Use the same embedding function whether the collection is fetched or created
+         embedding_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
+             model_name=EMBEDDING_MODEL
+         )
+         try:
+             self.collection = self.client.get_collection(
+                 name="kodeksy",
+                 embedding_function=embedding_fn
+             )
+             logging.info("Pobrano istniejącą kolekcję 'kodeksy'.")
+         except Exception:
+             self.collection = self.client.create_collection(
+                 name="kodeksy",
+                 embedding_function=embedding_fn
+             )
+             logging.info("Utworzono nową kolekcję 'kodeksy'.")
+
+     def extract_metadata(self, text: str) -> Dict:
+         metadata = {}
+         dz_u_match = re.search(r'Dz\.U\.(\d{4})\.(\d+)\.(\d+)', text)
+         if dz_u_match:
+             metadata['dz_u'] = f"Dz.U.{dz_u_match.group(1)}.{dz_u_match.group(2)}.{dz_u_match.group(3)}"
+             metadata['rok'] = dz_u_match.group(1)
+
+         nazwa_match = re.search(r'USTAWA\s+z dnia(.*?)\n(.*?)\n', text)
+         if nazwa_match:
+             metadata['data_ustawy'] = nazwa_match.group(1).strip()
+             metadata['nazwa'] = nazwa_match.group(2).strip()
+
+         logging.info("Wydobyto metadane: %s", metadata)
+         return metadata
+
+     def split_header_and_content(self, text: str) -> Tuple[str, str]:
+         parts = text.split("USTAWA", 1)
+         if len(parts) > 1:
+             return parts[0], "USTAWA" + parts[1]
+         return "", text
+
+     def process_article(self, article_text: str) -> Dict:
+         art_num_match = re.match(r'Art\.\s*(\d+)', article_text)
+         article_num = art_num_match.group(1) if art_num_match else ""
+
+         paragraphs = re.findall(r'§\s*(\d+)\.\s*(.*?)(?=§\s*\d+|Art\.\s*\d+|$)', article_text, re.DOTALL)
+
+         if not paragraphs:
+             return {
+                 "article_num": article_num,
+                 "content": article_text.strip(),
+                 "has_paragraphs": False
+             }
+
+         return {
+             "article_num": article_num,
+             "paragraphs": paragraphs,
+             "has_paragraphs": True
+         }
+
+     def split_into_chunks(self, text: str, metadata: Dict) -> List[Dict]:
+         chunks = []
+         articles = re.split(r'(Art\.\s*\d+)', text)  # split on article headers, keeping the headers
+
+         for i in range(1, len(articles), 2):  # iterate over (header, body) pairs
+             article_title = articles[i].strip()
+             article_content = articles[i + 1].strip() if i + 1 < len(articles) else ""
+
+             processed_article = self.process_article(article_title + " " + article_content)
+
+             chunk_metadata = {
+                 **metadata,
+                 "article": processed_article["article_num"]
+             }
+
+             if processed_article["has_paragraphs"]:
+                 for par_num, par_content in processed_article["paragraphs"]:
+                     chunks.append({
+                         "text": f"{article_title} §{par_num}. {par_content.strip()}",
+                         "metadata": {**chunk_metadata, "paragraph": par_num}
+                     })
+             else:
+                 chunks.append({
+                     "text": processed_article["content"],
+                     "metadata": chunk_metadata
+                 })
+
+         logging.info("Podzielono tekst na %d chunków.", len(chunks))
+         return chunks
+
+     def process_file(self, filepath: str) -> None:
+         logging.info("Przetwarzanie pliku: %s", filepath)
+
+         with open(filepath, 'r', encoding='utf-8') as file:
+             content = file.read()
+
+         header, main_content = self.split_header_and_content(content)
+         metadata = self.extract_metadata(main_content)
+         metadata['filename'] = os.path.basename(filepath)
+
+         chunks = self.split_into_chunks(main_content, metadata)
+
+         if chunks:  # only write to the collection when chunks were produced
+             for i, chunk in enumerate(chunks):
+                 self.collection.add(
+                     documents=[chunk["text"]],
+                     metadatas=[chunk["metadata"]],
+                     ids=[f"{metadata['filename']}_{chunk['metadata']['article']}_{i}"]
+                 )
+                 logging.info("Dodano chunk: %s", chunk["text"])  # log each added chunk
+         else:
+             logging.warning("Brak chunków do dodania z pliku: %s", filepath)  # log when nothing was added
+
+         logging.info("Dodano %d chunków z pliku %s", len(chunks), metadata['filename'])
+
+     def process_all_files(self, directory: str) -> None:
+         logging.info("Rozpoczęcie przetwarzania wszystkich plików w katalogu: %s", directory)
+         for filename in os.listdir(directory):
+             if filename.endswith('.txt'):
+                 filepath = os.path.join(directory, filename)
+                 logging.info("Przetwarzanie pliku: %s", filepath)  # log the file being processed
+                 self.process_file(filepath)
+         logging.info("Zakończono przetwarzanie plików.")
+
+     def search(self, query: str, n_results: int = 3) -> Dict:
+         logging.info("Wyszukiwanie w bazie danych dla zapytania: %s", query)
+         results = self.collection.query(
+             query_texts=[query],
+             n_results=n_results
+         )
+         logging.info("Znaleziono %d wyników dla zapytania: %s", len(results['documents'][0]), query)
+         return results
+
+ class Chatbot:
+     def __init__(self):
+         self.client = InferenceClient(api_key=HF_TOKEN)
+         self.conversation_history = [
+             {"role": "system", "content": SYSTEM_PROMPT}
+         ]
+
+     def generate_context(self, relevant_chunks: List[Dict]) -> str:
+         context = "Kontekst z przepisów prawnych:\n\n"
+         for chunk in relevant_chunks:
+             context += f"{chunk['text']}\n\n"
+         return context
+
+     def get_response(self, user_input: str, context: str) -> Iterator[str]:
+         messages = self.conversation_history + [
+             {"role": "user", "content": f"Kontekst: {context}\n\nPytanie: {user_input}"}
+         ]
+
+         response = ""
+         stream = self.client.chat.completions.create(
+             model=MODEL_NAME,
+             messages=messages,
+             temperature=0.5,
+             max_tokens=8192,
+             top_p=0.7,
+             stream=True
+         )
+
+         # Stream tokens to the caller while accumulating the full reply
+         for chunk in stream:
+             content = chunk.choices[0].delta.content
+             if content:
+                 response += content
+                 yield content
+
+         self.conversation_history.append({"role": "user", "content": user_input})
+         self.conversation_history.append({"role": "assistant", "content": response})
+
+     def clear_history(self):
+         self.conversation_history = [
+             {"role": "system", "content": SYSTEM_PROMPT}
+         ]
 
  def initialize_session_state():
      if 'chatbot' not in st.session_state:
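
For orientation only, not part of this commit: a minimal sketch of how the new KodeksProcessor and Chatbot classes could be exercised outside the Streamlit UI, assuming the statute .txt files sit in a local "kodeksy" directory and HF_TOKEN is provided via .env (both are assumptions, not taken from the diff).

# Illustrative sketch; the directory name and the queries are assumptions.
processor = KodeksProcessor()
processor.process_all_files("kodeksy")  # index every statute .txt file once

chatbot = Chatbot()
results = processor.search("okres wypowiedzenia umowy o pracę")
context = chatbot.generate_context(
    [{"text": doc} for doc in results["documents"][0]]
)

for token in chatbot.get_response("Jaki jest okres wypowiedzenia umowy o pracę?", context):
    print(token, end="")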