oceansweep committed on
Commit
8904b33
1 Parent(s): 904ea9e

Delete App_Function_Libraries/RAG_Library.py

Files changed (1)
  1. App_Function_Libraries/RAG_Library.py +0 -396
App_Function_Libraries/RAG_Library.py DELETED
@@ -1,396 +0,0 @@
- import numpy as np
- from typing import List, Tuple, Dict
- from sklearn.feature_extraction.text import TfidfVectorizer
- from sklearn.metrics.pairwise import cosine_similarity
- from sentence_transformers import SentenceTransformer
- import math
- from functools import lru_cache
- from concurrent.futures import ThreadPoolExecutor
- import openai
- from transformers import T5ForConditionalGeneration, T5Tokenizer
- import torch
- import re
- import psycopg2
- from psycopg2.extras import execute_values
- import sqlite3
- import logging
-
-
-
- ########################################################################################################################
- #
- # RAG Chunking
- # To fully integrate this chunking system, you'd need to:
- #
- # 1. Create the UnvectorizedMediaChunks table in your SQLite database (a schema sketch follows the imports below).
- # 2. Modify your document ingestion process to use chunk_and_store_unvectorized.
- # 3. Implement a background process that periodically calls vectorize_all_documents to process unvectorized chunks.
-
- # This chunking is fairly basic and needs improvement
- # See notes for improvements #FIXME
- import json
- from typing import List, Dict, Any
- from datetime import datetime
-
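# A minimal sketch of the UnvectorizedMediaChunks table that the functions below expect.
# The column names come from the INSERT/SELECT/UPDATE statements in this file; the exact
# types and defaults are assumptions, not a schema confirmed by the original project.
UNVECTORIZED_CHUNKS_SCHEMA = """
CREATE TABLE IF NOT EXISTS UnvectorizedMediaChunks (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    media_id INTEGER NOT NULL,
    chunk_text TEXT NOT NULL,
    chunk_index INTEGER NOT NULL,
    start_char INTEGER,
    end_char INTEGER,
    chunk_type TEXT,
    metadata TEXT,
    is_processed BOOLEAN DEFAULT FALSE,
    last_modified TIMESTAMP
)
"""


def ensure_unvectorized_chunks_table(db_connection):
    # Run once before chunk_and_store_unvectorized is first called.
    db_connection.execute(UNVECTORIZED_CHUNKS_SCHEMA)
    db_connection.commit()
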
- def chunk_and_store_unvectorized(
-     db_connection,
-     media_id: int,
-     text: str,
-     chunk_size: int = 1000,
-     overlap: int = 100,
-     chunk_type: str = 'fixed-length'
- ) -> List[int]:
-     chunks = create_chunks(text, chunk_size, overlap)
-     return store_unvectorized_chunks(db_connection, media_id, chunks, chunk_type)
-
-
- def create_chunks(text: str, chunk_size: int, overlap: int) -> List[Dict[str, Any]]:
-     words = text.split()
-     # Pre-compute each word's character offset so chunks map back to the correct
-     # position even when the same word appears multiple times in the text.
-     offsets = []
-     pos = 0
-     for word in words:
-         pos = text.find(word, pos)
-         offsets.append(pos)
-         pos += len(word)
-     chunks = []
-     for i in range(0, len(words), chunk_size - overlap):
-         chunk_text = ' '.join(words[i:i + chunk_size])
-         start_char = offsets[i]
-         end_char = start_char + len(chunk_text)  # approximate if the source uses non-single-space whitespace
-         chunks.append({
-             'text': chunk_text,
-             'start_char': start_char,
-             'end_char': end_char,
-             'index': len(chunks)
-         })
-     return chunks
-
-
- def store_unvectorized_chunks(
-     db_connection,
-     media_id: int,
-     chunks: List[Dict[str, Any]],
-     chunk_type: str
- ) -> List[int]:
-     cursor = db_connection.cursor()
-     chunk_ids = []
-     for chunk in chunks:
-         cursor.execute("""
-             INSERT INTO UnvectorizedMediaChunks
-             (media_id, chunk_text, chunk_index, start_char, end_char, chunk_type, metadata)
-             VALUES (?, ?, ?, ?, ?, ?, ?)
-         """, (
-             media_id,
-             chunk['text'],
-             chunk['index'],
-             chunk['start_char'],
-             chunk['end_char'],
-             chunk_type,
-             json.dumps({'length': len(chunk['text'])})  # Example metadata
-         ))
-         chunk_ids.append(cursor.lastrowid)
-     db_connection.commit()
-     return chunk_ids
-
-
- def get_unvectorized_chunks(
-     db_connection,
-     media_id: int,
-     limit: int = 100,
-     offset: int = 0
- ) -> List[Dict[str, Any]]:
-     cursor = db_connection.cursor()
-     cursor.execute("""
-         SELECT id, chunk_text, chunk_index, start_char, end_char, chunk_type, metadata
-         FROM UnvectorizedMediaChunks
-         WHERE media_id = ? AND is_processed = FALSE
-         ORDER BY chunk_index
-         LIMIT ? OFFSET ?
-     """, (media_id, limit, offset))
-     return [
-         {
-             'id': row[0],
-             'text': row[1],
-             'index': row[2],
-             'start_char': row[3],
-             'end_char': row[4],
-             'type': row[5],
-             'metadata': json.loads(row[6])
-         }
-         for row in cursor.fetchall()
-     ]
-
-
- def mark_chunks_as_processed(db_connection, chunk_ids: List[int]):
-     cursor = db_connection.cursor()
-     cursor.executemany("""
-         UPDATE UnvectorizedMediaChunks
-         SET is_processed = TRUE, last_modified = ?
-         WHERE id = ?
-     """, [(datetime.now(), chunk_id) for chunk_id in chunk_ids])
-     db_connection.commit()
-
-
- # Usage example
- def process_media_chunks(db_connection, media_id: int, text: str):
-     chunk_ids = chunk_and_store_unvectorized(db_connection, media_id, text)
-     print(f"Stored {len(chunk_ids)} unvectorized chunks for media_id {media_id}")
-
-     # Later, when you want to process these chunks:
-     unprocessed_chunks = get_unvectorized_chunks(db_connection, media_id)
-     # Process chunks (e.g., vectorize them)
-     # ...
-     # After processing, mark them as processed
-     mark_chunks_as_processed(db_connection, [chunk['id'] for chunk in unprocessed_chunks])
-
- ########################################################################################################################
- #
- # RAG System
-
- # To use this updated RAG system in your existing application:
- #
- # 1. Install the required packages:
- #    pip install sentence-transformers psycopg2-binary scikit-learn transformers sentencepiece torch
- # 2. Set up PostgreSQL with pgvector:
- #    - Install PostgreSQL and the pgvector extension.
- #    - Create a new database for vector storage (a setup sketch follows this comment block).
- # 3. Update your main application to use the RAG system:
- #    - Import the RAGSystem class from this new file.
- #    - Initialize the RAG system with your SQLite and PostgreSQL configurations.
- #    - Use the vectorize_all_documents method to initially vectorize your existing documents.
- # 4. Modify your existing PDF_Ingestion_Lib.py and Book_Ingestion_Lib.py:
- #    - After successfully ingesting a document into SQLite, call the vectorization method from the RAG system.
-
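# A minimal sketch of step 2 above: enable pgvector in an existing database. It assumes
# PostgreSQL is running and the pgvector extension is installed on the server; the
# connection settings shown in the example are placeholders, not project values.
def setup_pgvector(pg_config: Dict[str, str]) -> None:
    with psycopg2.connect(**pg_config) as conn:
        with conn.cursor() as cur:
            # Requires a role that is allowed to create extensions (often a superuser).
            cur.execute("CREATE EXTENSION IF NOT EXISTS vector;")
        conn.commit()

# Example call (placeholder credentials):
# setup_pgvector({"dbname": "your_db_name", "user": "your_username",
#                 "password": "your_password", "host": "localhost"})
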
- # Example modification for ingest_text_file in Book_Ingestion_Lib.py:
- # from RAG_Library import RAGSystem
- #
- # # Initialize RAG system (do this once in your main application)
- # rag_system = RAGSystem(sqlite_path, pg_config)
- #
- # def ingest_text_file(file_path, title=None, author=None, keywords=None):
- #     try:
- #         # ... (existing code)
- #
- #         # Add the text file to the database
- #         doc_id = add_media_with_keywords(
- #             url=file_path,
- #             title=title,
- #             media_type='document',
- #             content=content,
- #             keywords=keywords,
- #             prompt='No prompt for text files',
- #             summary='No summary for text files',
- #             transcription_model='None',
- #             author=author,
- #             ingestion_date=datetime.now().strftime('%Y-%m-%d')
- #         )
- #
- #         # Vectorize the newly added document
- #         rag_system.vectorize_document(doc_id, content)
- #
- #         return f"Text file '{title}' by {author} ingested and vectorized successfully."
- #     except Exception as e:
- #         logging.error(f"Error ingesting text file: {str(e)}")
- #         return f"Error ingesting text file: {str(e)}"
-
-
- # Setup logging
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
- logger = logging.getLogger(__name__)
-
- # Constants
- EMBEDDING_MODEL = 'all-MiniLM-L6-v2'
- VECTOR_DIM = 384  # Dimension of the chosen embedding model
-
-
- class RAGSystem:
-     def __init__(self, sqlite_path: str, pg_config: Dict[str, str], cache_size: int = 100):
-         self.sqlite_path = sqlite_path
-         self.pg_config = pg_config
-         self.model = SentenceTransformer(EMBEDDING_MODEL)
-         self.cache_size = cache_size
-
-         self._init_postgres()
-
-     def _init_postgres(self):
-         with psycopg2.connect(**self.pg_config) as conn:
-             with conn.cursor() as cur:
-                 # The vector column type comes from the pgvector extension.
-                 cur.execute("CREATE EXTENSION IF NOT EXISTS vector")
-                 # One row per (document, chunk); the UNIQUE constraint backs the
-                 # ON CONFLICT clause used in vectorize_document.
-                 cur.execute(f"""
-                     CREATE TABLE IF NOT EXISTS document_vectors (
-                         id SERIAL PRIMARY KEY,
-                         document_id INTEGER,
-                         chunk_index INTEGER,
-                         vector vector({VECTOR_DIM}),
-                         metadata JSONB,
-                         UNIQUE (document_id, chunk_index)
-                     )
-                 """)
-             conn.commit()
227
-
228
- @lru_cache(maxsize=100)
229
- def _get_embedding(self, text: str) -> np.ndarray:
230
- return self.model.encode([text])[0]
231
-
-     def vectorize_document(self, doc_id: int, content: str):
-         chunks = create_chunks(content, chunk_size=1000, overlap=100)
-         for chunk in chunks:
-             vector = self._get_embedding(chunk['text'])
-
-             with psycopg2.connect(**self.pg_config) as conn:
-                 with conn.cursor() as cur:
-                     # Pass the embedding as a pgvector literal string, e.g. '[0.1,0.2,...]'.
-                     vector_literal = '[' + ','.join(map(str, vector.tolist())) + ']'
-                     cur.execute("""
-                         INSERT INTO document_vectors (document_id, chunk_index, vector, metadata)
-                         VALUES (%s, %s, %s::vector, %s)
-                         ON CONFLICT (document_id, chunk_index) DO UPDATE SET vector = EXCLUDED.vector
-                     """, (doc_id, chunk['index'], vector_literal, json.dumps(chunk)))
-                 conn.commit()
245
-
246
- def vectorize_all_documents(self):
247
- with sqlite3.connect(self.sqlite_path) as sqlite_conn:
248
- unprocessed_chunks = get_unvectorized_chunks(sqlite_conn, limit=1000)
249
- for chunk in unprocessed_chunks:
250
- self.vectorize_document(chunk['id'], chunk['text'])
251
- mark_chunks_as_processed(sqlite_conn, [chunk['id'] for chunk in unprocessed_chunks])
252
-
-     def semantic_search(self, query: str, top_k: int = 5) -> List[Tuple[int, float]]:
-         query_vector = self._get_embedding(query)
-         query_literal = '[' + ','.join(map(str, query_vector.tolist())) + ']'
-
-         with psycopg2.connect(**self.pg_config) as conn:
-             with conn.cursor() as cur:
-                 # <=> is pgvector's cosine-distance operator, so 1 - distance is cosine similarity.
-                 cur.execute("""
-                     SELECT document_id, 1 - (vector <=> %s::vector) AS similarity
-                     FROM document_vectors
-                     ORDER BY vector <=> %s::vector ASC
-                     LIMIT %s
-                 """, (query_literal, query_literal, top_k))
-                 results = cur.fetchall()
-
-         return results
-
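    # Optional: on larger tables, an approximate-nearest-neighbour index keeps the
    # ORDER BY vector <=> ... query fast. This is a sketch, assuming pgvector >= 0.5.0
    # (older versions support only ivfflat); run it once against the vector database:
    #
    #   CREATE INDEX IF NOT EXISTS document_vectors_vector_idx
    #       ON document_vectors USING hnsw (vector vector_cosine_ops);
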
-     def get_document_content(self, doc_id: int) -> str:
-         with sqlite3.connect(self.sqlite_path) as conn:
-             cur = conn.cursor()
-             cur.execute("SELECT content FROM media WHERE id = ?", (doc_id,))
-             result = cur.fetchone()
-             return result[0] if result else ""
-
-     def bm25_search(self, query: str, top_k: int = 5) -> List[Tuple[int, float]]:
-         with sqlite3.connect(self.sqlite_path) as conn:
-             cur = conn.cursor()
-             cur.execute("SELECT id, content FROM media")
-             documents = cur.fetchall()
-
-         # BM25-style scoring, approximated with TF-IDF weights in place of raw term frequencies.
-         vectorizer = TfidfVectorizer(use_idf=True)
-         tfidf_matrix = vectorizer.fit_transform([doc[1] for doc in documents])
-
-         query_vector = vectorizer.transform([query])
-         doc_lengths = np.asarray(tfidf_matrix.sum(axis=1)).ravel()
-         avg_doc_length = np.mean(doc_lengths)
-
-         k1, b = 1.5, 0.75
-         scores = []
-         for i, doc_vector in enumerate(tfidf_matrix):
-             overlap = query_vector.multiply(doc_vector).toarray().ravel()
-             score = np.sum(
-                 ((k1 + 1) * overlap) /
-                 (k1 * (1 - b + b * doc_lengths[i] / avg_doc_length) + overlap)
-             )
-             scores.append((documents[i][0], score))
-
-         return sorted(scores, key=lambda x: x[1], reverse=True)[:top_k]
-
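    # For reference, classic BM25 scores a document D against a query q as
    #
    #     score(D, q) = sum over query terms t of
    #         IDF(t) * tf(t, D) * (k1 + 1) / (tf(t, D) + k1 * (1 - b + b * |D| / avgdl))
    #
    # using raw term frequencies rather than the TF-IDF weights above, so the method
    # above is only an approximation. A drop-in alternative would be the rank_bm25 package.
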
-     def combine_search_results(self, bm25_results: List[Tuple[int, float]], vector_results: List[Tuple[int, float]],
-                                alpha: float = 0.5) -> List[Tuple[int, float]]:
-         # Weight BM25 scores by alpha and vector scores by (1 - alpha), summing when a
-         # document appears in both lists. The two score types are on different scales,
-         # so normalizing them first may give better fusion.
-         combined_scores = {}
-         for idx, score in bm25_results:
-             combined_scores[idx] = combined_scores.get(idx, 0.0) + alpha * score
-         for idx, score in vector_results:
-             combined_scores[idx] = combined_scores.get(idx, 0.0) + (1 - alpha) * score
-         return sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)
-
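    # Illustrative example (made-up scores): with alpha = 0.5,
    #   combine_search_results([(1, 0.8), (2, 0.5)], [(1, 0.9), (3, 0.7)])
    # yields [(1, 0.85), (3, 0.35), (2, 0.25)] -- document 1 benefits from
    # appearing in both result lists.
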
-     def expand_query(self, query: str) -> str:
-         # NOTE: t5-small is reloaded on every call and "expand query:" is not one of
-         # T5's trained task prefixes, so treat this expansion as best-effort.
-         model = T5ForConditionalGeneration.from_pretrained("t5-small")
-         tokenizer = T5Tokenizer.from_pretrained("t5-small")
-
-         input_text = f"expand query: {query}"
-         input_ids = tokenizer.encode(input_text, return_tensors="pt")
-
-         outputs = model.generate(input_ids, max_length=50, num_return_sequences=1)
-         expanded_query = tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-         return f"{query} {expanded_query}"
-
-     def cross_encoder_rerank(self, query: str, initial_results: List[Tuple[int, float]],
-                              top_k: int = 5) -> List[Tuple[int, float]]:
-         from sentence_transformers import CrossEncoder
-         model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
-
-         # Re-score the top 2*top_k candidates with a cross-encoder and keep the best top_k.
-         candidate_docs = [self.get_document_content(doc_id) for doc_id, _ in initial_results[:top_k * 2]]
-         pairs = [[query, doc] for doc in candidate_docs]
-         scores = model.predict(pairs)
-
-         reranked = sorted(zip(initial_results[:top_k * 2], scores), key=lambda x: x[1], reverse=True)
-         return [(idx, score) for (idx, _), score in reranked[:top_k]]
-
-     def rag_query(self, query: str, search_type: str = 'combined', top_k: int = 5, use_hyde: bool = False,
-                   rerank: bool = False, expand: bool = False) -> List[Dict[str, Any]]:
-         try:
-             if expand:
-                 query = self.expand_query(query)
-
-             if use_hyde:
-                 # HyDE (hypothetical document embeddings) is not implemented yet;
-                 # fall back to the selected search type so results stays defined.
-                 logger.warning("HyDE is not implemented; using search_type=%s instead.", search_type)
-
-             if search_type == 'vector':
-                 results = self.semantic_search(query, top_k)
-             elif search_type == 'bm25':
-                 results = self.bm25_search(query, top_k)
-             elif search_type == 'combined':
-                 bm25_results = self.bm25_search(query, top_k)
-                 vector_results = self.semantic_search(query, top_k)
-                 results = self.combine_search_results(bm25_results, vector_results)
-             else:
-                 raise ValueError("Invalid search type. Choose 'vector', 'bm25', or 'combined'.")
-
-             if rerank:
-                 results = self.cross_encoder_rerank(query, results, top_k)
-
-             enriched_results = []
-             for doc_id, score in results:
-                 content = self.get_document_content(doc_id)
-                 enriched_results.append({
-                     "document_id": doc_id,
-                     "score": score,
-                     "content": content[:500]  # Truncate content for brevity
-                 })
-
-             return enriched_results
-         except Exception as e:
-             logger.error(f"An error occurred during RAG query: {str(e)}")
-             return []
-
-
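# A minimal sketch of the background process suggested in the chunking notes above:
# periodically call vectorize_all_documents so newly ingested chunks get embedded.
# The daemon thread and the 5-minute interval are assumptions; a cron job or task
# queue would work just as well.
import threading
import time


def start_background_vectorizer(rag_system: RAGSystem, interval_seconds: int = 300) -> threading.Thread:
    def _loop():
        while True:
            try:
                rag_system.vectorize_all_documents()
            except Exception as exc:
                logger.error(f"Background vectorization failed: {exc}")
            time.sleep(interval_seconds)

    thread = threading.Thread(target=_loop, daemon=True, name="background-vectorizer")
    thread.start()
    return thread
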
- # Example usage
- if __name__ == "__main__":
-     sqlite_path = "path/to/your/sqlite/database.db"
-     pg_config = {
-         "dbname": "your_db_name",
-         "user": "your_username",
-         "password": "your_password",
-         "host": "localhost"
-     }
-
-     rag_system = RAGSystem(sqlite_path, pg_config)
-
-     # Vectorize all documents (run this once or periodically)
-     rag_system.vectorize_all_documents()
-
-     # Example query
-     query = "programming concepts for beginners"
-     results = rag_system.rag_query(query, search_type='combined', expand=True, rerank=True)
-
-     print(f"Search results for query: '{query}'\n")
-     for i, result in enumerate(results, 1):
-         print(f"Result {i}:")
-         print(f"Document ID: {result['document_id']}")
-         print(f"Score: {result['score']:.4f}")
-         print(f"Content snippet: {result['content']}")
-         print("---")