oceansweep committed
Commit fddcafb
1 Parent(s): a15fa18

Update App_Function_Libraries/RAG_Libary_2.py

Files changed (1)
  1. App_Function_Libraries/RAG_Libary_2.py +721 -720
App_Function_Libraries/RAG_Libary_2.py CHANGED
@@ -1,720 +1,721 @@
- # Import necessary modules and functions
- import configparser
- from typing import Dict, Any
- # Local Imports
- from App_Function_Libraries.ChromaDB_Library import process_and_store_content, vector_search, chroma_client
- from Article_Extractor_Lib import scrape_article
- from SQLite_DB import search_db, db
- # 3rd-Party Imports
- import openai
- # Initialize OpenAI client (adjust this based on your API key management)
- openai.api_key = "your-openai-api-key"
-
-
- # Main RAG pipeline function
- def rag_pipeline(url: str, query: str, api_choice=None) -> Dict[str, Any]:
-     # Extract content
-     article_data = scrape_article(url)
-     content = article_data['content']
-
-     # Process and store content
-     collection_name = "article_" + str(hash(url))
-     process_and_store_content(content, collection_name)
-
-     # Perform searches
-     vector_results = vector_search(collection_name, query, k=5)
-     fts_results = search_db(query, ["content"], "", page=1, results_per_page=5)
-
-     # Combine results
-     all_results = vector_results + [result['content'] for result in fts_results]
-     context = "\n".join(all_results)
-
-     # Generate answer using the selected API
-     answer = generate_answer(api_choice, context, query)
-
-     return {
-         "answer": answer,
-         "context": context
-     }
-
- config = configparser.ConfigParser()
- config.read('config.txt')
-
- def generate_answer(api_choice: str, context: str, query: str) -> str:
-     prompt = f"Context: {context}\n\nQuestion: {query}"
-     if api_choice == "OpenAI":
-         from App_Function_Libraries.Summarization_General_Lib import summarize_with_openai
-         return summarize_with_openai(config['API']['openai_api_key'], prompt, "")
-     elif api_choice == "Anthropic":
-         from App_Function_Libraries.Summarization_General_Lib import summarize_with_anthropic
-         return summarize_with_anthropic(config['API']['anthropic_api_key'], prompt, "")
-     elif api_choice == "Cohere":
-         from App_Function_Libraries.Summarization_General_Lib import summarize_with_cohere
-         return summarize_with_cohere(config['API']['cohere_api_key'], prompt, "")
-     elif api_choice == "Groq":
-         from App_Function_Libraries.Summarization_General_Lib import summarize_with_groq
-         return summarize_with_groq(config['API']['groq_api_key'], prompt, "")
-     elif api_choice == "OpenRouter":
-         from App_Function_Libraries.Summarization_General_Lib import summarize_with_openrouter
-         return summarize_with_openrouter(config['API']['openrouter_api_key'], prompt, "")
-     elif api_choice == "HuggingFace":
-         from App_Function_Libraries.Summarization_General_Lib import summarize_with_huggingface
-         return summarize_with_huggingface(config['API']['huggingface_api_key'], prompt, "")
-     elif api_choice == "DeepSeek":
-         from App_Function_Libraries.Summarization_General_Lib import summarize_with_deepseek
-         return summarize_with_deepseek(config['API']['deepseek_api_key'], prompt, "")
-     elif api_choice == "Mistral":
-         from App_Function_Libraries.Summarization_General_Lib import summarize_with_mistral
-         return summarize_with_mistral(config['API']['mistral_api_key'], prompt, "")
-     elif api_choice == "Local-LLM":
-         from App_Function_Libraries.Local_Summarization_Lib import summarize_with_local_llm
-         return summarize_with_local_llm(config['API']['local_llm_path'], prompt, "")
-     elif api_choice == "Llama.cpp":
-         from App_Function_Libraries.Local_Summarization_Lib import summarize_with_llama
-         return summarize_with_llama(config['API']['llama_api_key'], prompt, "")
-     elif api_choice == "Kobold":
-         from App_Function_Libraries.Local_Summarization_Lib import summarize_with_kobold
-         return summarize_with_kobold(config['API']['kobold_api_key'], prompt, "")
-     elif api_choice == "Ooba":
-         from App_Function_Libraries.Local_Summarization_Lib import summarize_with_oobabooga
-         return summarize_with_oobabooga(config['API']['ooba_api_key'], prompt, "")
-     elif api_choice == "TabbyAPI":
-         from App_Function_Libraries.Local_Summarization_Lib import summarize_with_tabbyapi
-         return summarize_with_tabbyapi(config['API']['tabby_api_key'], prompt, "")
-     elif api_choice == "vLLM":
-         from App_Function_Libraries.Local_Summarization_Lib import summarize_with_vllm
-         return summarize_with_vllm(config['API']['vllm_api_key'], prompt, "")
-     elif api_choice == "ollama":
-         from App_Function_Libraries.Local_Summarization_Lib import summarize_with_ollama
-         return summarize_with_ollama(config['API']['ollama_api_key'], prompt, "")
-     else:
-         raise ValueError(f"Unsupported API choice: {api_choice}")
-
- # Function to preprocess and store all existing content in the database
- def preprocess_all_content():
-     with db.get_connection() as conn:
-         cursor = conn.cursor()
-         cursor.execute("SELECT id, content FROM Media")
-         for row in cursor.fetchall():
-             process_and_store_content(row[1], f"media_{row[0]}")
-
-
- # Function to perform RAG search across all stored content
- def rag_search(query: str, api_choice: str) -> Dict[str, Any]:
-     # Perform vector search across all collections
-     all_collections = chroma_client.list_collections()
-     vector_results = []
-     for collection in all_collections:
-         vector_results.extend(vector_search(collection.name, query, k=2))
-
-     # Perform FTS search
-     fts_results = search_db(query, ["content"], "", page=1, results_per_page=10)
-
-     # Combine results
-     all_results = vector_results + [result['content'] for result in fts_results]
-     context = "\n".join(all_results[:10])  # Limit to top 10 results
-
-     # Generate answer using the selected API
-     answer = generate_answer(api_choice, context, query)
-
-     return {
-         "answer": answer,
-         "context": context
-     }
-
-
- # Example usage:
- # 1. Initialize the system:
- # create_tables(db)  # Ensure FTS tables are set up
- # preprocess_all_content()  # Process and store all existing content
-
- # 2. Perform RAG on a specific URL:
- # result = rag_pipeline("https://example.com/article", "What is the main topic of this article?")
- # print(result['answer'])
-
- # 3. Perform RAG search across all content:
- # result = rag_search("What are the key points about climate change?", "OpenAI")
- # print(result['answer'])
-
-
-
-
- ##################################################################################################################
- # RAG Pipeline 1
- # 0.62 0.61 0.75 63402.0
- # from langchain_openai import ChatOpenAI
- #
- # from langchain_community.document_loaders import WebBaseLoader
- # from langchain_openai import OpenAIEmbeddings
- # from langchain.text_splitter import RecursiveCharacterTextSplitter
- # from langchain_chroma import Chroma
- #
- # from langchain_community.retrievers import BM25Retriever
- # from langchain.retrievers import ParentDocumentRetriever
- # from langchain.storage import InMemoryStore
- # import os
- # from operator import itemgetter
- # from langchain import hub
- # from langchain_core.output_parsers import StrOutputParser
- # from langchain_core.runnables import RunnablePassthrough, RunnableParallel, RunnableLambda
- # from langchain.retrievers import MergerRetriever
- # from langchain.retrievers.document_compressors import DocumentCompressorPipeline
-
-
- # def rag_pipeline():
- #     try:
- #         def format_docs(docs):
- #             return "\n".join(doc.page_content for doc in docs)
- #
- #         llm = ChatOpenAI(model='gpt-4o-mini')
- #
- #         loader = WebBaseLoader('https://en.wikipedia.org/wiki/European_debt_crisis')
- #         docs = loader.load()
- #
- #         embedding = OpenAIEmbeddings(model='text-embedding-3-large')
- #
- #         splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=200)
- #         splits = splitter.split_documents(docs)
- #         c = Chroma.from_documents(documents=splits, embedding=embedding,
- #                                   collection_name='testindex-ragbuilder-1724657573', )
- #         retrievers = []
- #         retriever = c.as_retriever(search_type='mmr', search_kwargs={'k': 10})
- #         retrievers.append(retriever)
- #         retriever = BM25Retriever.from_documents(docs)
- #         retrievers.append(retriever)
- #
- #         parent_splitter = RecursiveCharacterTextSplitter(chunk_size=1200, chunk_overlap=600)
- #         splits = parent_splitter.split_documents(docs)
- #         store = InMemoryStore()
- #         retriever = ParentDocumentRetriever(vectorstore=c, docstore=store, child_splitter=splitter,
- #                                             parent_splitter=parent_splitter)
- #         retriever.add_documents(docs)
- #         retrievers.append(retriever)
- #         retriever = MergerRetriever(retrievers=retrievers)
- #         prompt = hub.pull("rlm/rag-prompt")
- #         rag_chain = (
- #             RunnableParallel(context=retriever, question=RunnablePassthrough())
- #             .assign(context=itemgetter("context") | RunnableLambda(format_docs))
- #             .assign(answer=prompt | llm | StrOutputParser())
- #             .pick(["answer", "context"]))
- #         return rag_chain
- #     except Exception as e:
- #         print(f"An error occurred: {e}")
-
-
- ## To get the answer and context, use the following code
- # res = rag_pipeline().invoke("your prompt here")
- # print(res["answer"])
- # print(res["context"])
-
- ############################################################################################################
-
-
-
- ############################################################################################################
- # RAG Pipeline 2
-
- # 0.6 0.73 0.68 3125.0
- # from langchain_openai import ChatOpenAI
- #
- # from langchain_community.document_loaders import WebBaseLoader
- # from langchain_openai import OpenAIEmbeddings
- # from langchain.text_splitter import RecursiveCharacterTextSplitter
- # from langchain_chroma import Chroma
- # from langchain.retrievers.multi_query import MultiQueryRetriever
- # from langchain.retrievers import ParentDocumentRetriever
- # from langchain.storage import InMemoryStore
- # from langchain_community.document_transformers import EmbeddingsRedundantFilter
- # from langchain.retrievers.document_compressors import LLMChainFilter
- # from langchain.retrievers.document_compressors import EmbeddingsFilter
- # from langchain.retrievers import ContextualCompressionRetriever
- # import os
- # from operator import itemgetter
- # from langchain import hub
- # from langchain_core.output_parsers import StrOutputParser
- # from langchain_core.runnables import RunnablePassthrough, RunnableParallel, RunnableLambda
- # from langchain.retrievers import MergerRetriever
- # from langchain.retrievers.document_compressors import DocumentCompressorPipeline
-
-
- # def rag_pipeline():
- #     try:
- #         def format_docs(docs):
- #             return "\n".join(doc.page_content for doc in docs)
- #
- #         llm = ChatOpenAI(model='gpt-4o-mini')
- #
- #         loader = WebBaseLoader('https://en.wikipedia.org/wiki/European_debt_crisis')
- #         docs = loader.load()
- #
- #         embedding = OpenAIEmbeddings(model='text-embedding-3-large')
- #
- #         splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=200)
- #         splits = splitter.split_documents(docs)
- #         c = Chroma.from_documents(documents=splits, embedding=embedding,
- #                                   collection_name='testindex-ragbuilder-1724650962', )
- #         retrievers = []
- #         retriever = MultiQueryRetriever.from_llm(c.as_retriever(search_type='similarity', search_kwargs={'k': 10}),
- #                                                  llm=llm)
- #         retrievers.append(retriever)
- #
- #         parent_splitter = RecursiveCharacterTextSplitter(chunk_size=1200, chunk_overlap=600)
- #         splits = parent_splitter.split_documents(docs)
- #         store = InMemoryStore()
- #         retriever = ParentDocumentRetriever(vectorstore=c, docstore=store, child_splitter=splitter,
- #                                             parent_splitter=parent_splitter)
- #         retriever.add_documents(docs)
- #         retrievers.append(retriever)
- #         retriever = MergerRetriever(retrievers=retrievers)
- #         arr_comp = []
- #         arr_comp.append(EmbeddingsRedundantFilter(embeddings=embedding))
- #         arr_comp.append(LLMChainFilter.from_llm(llm))
- #         pipeline_compressor = DocumentCompressorPipeline(transformers=arr_comp)
- #         retriever = ContextualCompressionRetriever(base_retriever=retriever, base_compressor=pipeline_compressor)
- #         prompt = hub.pull("rlm/rag-prompt")
- #         rag_chain = (
- #             RunnableParallel(context=retriever, question=RunnablePassthrough())
- #             .assign(context=itemgetter("context") | RunnableLambda(format_docs))
- #             .assign(answer=prompt | llm | StrOutputParser())
- #             .pick(["answer", "context"]))
- #         return rag_chain
- #     except Exception as e:
- #         print(f"An error occurred: {e}")
-
-
- ## To get the answer and context, use the following code
- # res = rag_pipeline().invoke("your prompt here")
- # print(res["answer"])
- # print(res["context"])
-
-
-
-
-
-
-
- ############################################################################################################
- # Plain bm25 retriever
- # class BM25Retriever(BaseRetriever):
- #     """`BM25` retriever without Elasticsearch."""
- #
- #     vectorizer: Any
- #     """ BM25 vectorizer."""
- #     docs: List[Document] = Field(repr=False)
- #     """ List of documents."""
- #     k: int = 4
- #     """ Number of documents to return."""
- #     preprocess_func: Callable[[str], List[str]] = default_preprocessing_func
- #     """ Preprocessing function to use on the text before BM25 vectorization."""
- #
- #     class Config:
- #         arbitrary_types_allowed = True
- #
- #     @classmethod
- #     def from_texts(
- #         cls,
- #         texts: Iterable[str],
- #         metadatas: Optional[Iterable[dict]] = None,
- #         bm25_params: Optional[Dict[str, Any]] = None,
- #         preprocess_func: Callable[[str], List[str]] = default_preprocessing_func,
- #         **kwargs: Any,
- #     ) -> BM25Retriever:
- #         """
- #         Create a BM25Retriever from a list of texts.
- #         Args:
- #             texts: A list of texts to vectorize.
- #             metadatas: A list of metadata dicts to associate with each text.
- #             bm25_params: Parameters to pass to the BM25 vectorizer.
- #             preprocess_func: A function to preprocess each text before vectorization.
- #             **kwargs: Any other arguments to pass to the retriever.
- #
- #         Returns:
- #             A BM25Retriever instance.
- #         """
- #         try:
- #             from rank_bm25 import BM25Okapi
- #         except ImportError:
- #             raise ImportError(
- #                 "Could not import rank_bm25, please install with `pip install "
- #                 "rank_bm25`."
- #             )
- #
- #         texts_processed = [preprocess_func(t) for t in texts]
- #         bm25_params = bm25_params or {}
- #         vectorizer = BM25Okapi(texts_processed, **bm25_params)
- #         metadatas = metadatas or ({} for _ in texts)
- #         docs = [Document(page_content=t, metadata=m) for t, m in zip(texts, metadatas)]
- #         return cls(
- #             vectorizer=vectorizer, docs=docs, preprocess_func=preprocess_func, **kwargs
- #         )
- #
- #     @classmethod
- #     def from_documents(
- #         cls,
- #         documents: Iterable[Document],
- #         *,
- #         bm25_params: Optional[Dict[str, Any]] = None,
- #         preprocess_func: Callable[[str], List[str]] = default_preprocessing_func,
- #         **kwargs: Any,
- #     ) -> BM25Retriever:
- #         """
- #         Create a BM25Retriever from a list of Documents.
- #         Args:
- #             documents: A list of Documents to vectorize.
- #             bm25_params: Parameters to pass to the BM25 vectorizer.
- #             preprocess_func: A function to preprocess each text before vectorization.
- #             **kwargs: Any other arguments to pass to the retriever.
- #
- #         Returns:
- #             A BM25Retriever instance.
- #         """
- #         texts, metadatas = zip(*((d.page_content, d.metadata) for d in documents))
- #         return cls.from_texts(
- #             texts=texts,
- #             bm25_params=bm25_params,
- #             metadatas=metadatas,
- #             preprocess_func=preprocess_func,
- #             **kwargs,
- #         )
- #
- #     def _get_relevant_documents(
- #         self, query: str, *, run_manager: CallbackManagerForRetrieverRun
- #     ) -> List[Document]:
- #         processed_query = self.preprocess_func(query)
- #         return_docs = self.vectorizer.get_top_n(processed_query, self.docs, n=self.k)
- #         return return_docs
- ############################################################################################################
-
- ############################################################################################################
- # ElasticSearch BM25 Retriever
- # class ElasticSearchBM25Retriever(BaseRetriever):
- #     """`Elasticsearch` retriever that uses `BM25`.
- #
- #     To connect to an Elasticsearch instance that requires login credentials,
- #     including Elastic Cloud, use the Elasticsearch URL format
- #     https://username:password@es_host:9243. For example, to connect to Elastic
- #     Cloud, create the Elasticsearch URL with the required authentication details and
- #     pass it to the ElasticVectorSearch constructor as the named parameter
- #     elasticsearch_url.
- #
- #     You can obtain your Elastic Cloud URL and login credentials by logging in to the
- #     Elastic Cloud console at https://cloud.elastic.co, selecting your deployment, and
- #     navigating to the "Deployments" page.
- #
- #     To obtain your Elastic Cloud password for the default "elastic" user:
- #
- #     1. Log in to the Elastic Cloud console at https://cloud.elastic.co
- #     2. Go to "Security" > "Users"
- #     3. Locate the "elastic" user and click "Edit"
- #     4. Click "Reset password"
- #     5. Follow the prompts to reset the password
- #
- #     The format for Elastic Cloud URLs is
- #     https://username:password@cluster_id.region_id.gcp.cloud.es.io:9243.
- #     """
- #
- #     client: Any
- #     """Elasticsearch client."""
- #     index_name: str
- #     """Name of the index to use in Elasticsearch."""
- #
- #     @classmethod
- #     def create(
- #         cls, elasticsearch_url: str, index_name: str, k1: float = 2.0, b: float = 0.75
- #     ) -> ElasticSearchBM25Retriever:
- #         """
- #         Create an ElasticSearchBM25Retriever from an Elasticsearch URL and index name.
- #
- #         Args:
- #             elasticsearch_url: URL of the Elasticsearch instance to connect to.
- #             index_name: Name of the index to use in Elasticsearch.
- #             k1: BM25 parameter k1.
- #             b: BM25 parameter b.
- #
- #         Returns:
- #             An ElasticSearchBM25Retriever instance.
- #         """
- #         from elasticsearch import Elasticsearch
- #
- #         # Create an Elasticsearch client instance
- #         es = Elasticsearch(elasticsearch_url)
- #
- #         # Define the index settings and mappings
- #         settings = {
- #             "analysis": {"analyzer": {"default": {"type": "standard"}}},
- #             "similarity": {
- #                 "custom_bm25": {
- #                     "type": "BM25",
- #                     "k1": k1,
- #                     "b": b,
- #                 }
- #             },
- #         }
- #         mappings = {
- #             "properties": {
- #                 "content": {
- #                     "type": "text",
- #                     "similarity": "custom_bm25",  # Use the custom BM25 similarity
- #                 }
- #             }
- #         }
- #
- #         # Create the index with the specified settings and mappings
- #         es.indices.create(index=index_name, mappings=mappings, settings=settings)
- #         return cls(client=es, index_name=index_name)
- #
- #     def add_texts(
- #         self,
- #         texts: Iterable[str],
- #         refresh_indices: bool = True,
- #     ) -> List[str]:
- #         """Run more texts through the embeddings and add to the retriever.
- #
- #         Args:
- #             texts: Iterable of strings to add to the retriever.
- #             refresh_indices: bool to refresh ElasticSearch indices
- #
- #         Returns:
- #             List of ids from adding the texts into the retriever.
- #         """
- #         try:
- #             from elasticsearch.helpers import bulk
- #         except ImportError:
- #             raise ImportError(
- #                 "Could not import elasticsearch python package. "
- #                 "Please install it with `pip install elasticsearch`."
- #             )
- #         requests = []
- #         ids = []
- #         for i, text in enumerate(texts):
- #             _id = str(uuid.uuid4())
- #             request = {
- #                 "_op_type": "index",
- #                 "_index": self.index_name,
- #                 "content": text,
- #                 "_id": _id,
- #             }
- #             ids.append(_id)
- #             requests.append(request)
- #         bulk(self.client, requests)
- #
- #         if refresh_indices:
- #             self.client.indices.refresh(index=self.index_name)
- #         return ids
- #
- #     def _get_relevant_documents(
- #         self, query: str, *, run_manager: CallbackManagerForRetrieverRun
- #     ) -> List[Document]:
- #         query_dict = {"query": {"match": {"content": query}}}
- #         res = self.client.search(index=self.index_name, body=query_dict)
- #
- #         docs = []
- #         for r in res["hits"]["hits"]:
- #             docs.append(Document(page_content=r["_source"]["content"]))
- #         return docs
- ############################################################################################################
-
-
- ############################################################################################################
- # Multi Query Retriever
- # class MultiQueryRetriever(BaseRetriever):
- #     """Given a query, use an LLM to write a set of queries.
- #
- #     Retrieve docs for each query. Return the unique union of all retrieved docs.
- #     """
- #
- #     retriever: BaseRetriever
- #     llm_chain: Runnable
- #     verbose: bool = True
- #     parser_key: str = "lines"
- #     """DEPRECATED. parser_key is no longer used and should not be specified."""
- #     include_original: bool = False
- #     """Whether to include the original query in the list of generated queries."""
- #
- #     @classmethod
- #     def from_llm(
- #         cls,
- #         retriever: BaseRetriever,
- #         llm: BaseLanguageModel,
- #         prompt: BasePromptTemplate = DEFAULT_QUERY_PROMPT,
- #         parser_key: Optional[str] = None,
- #         include_original: bool = False,
- #     ) -> "MultiQueryRetriever":
- #         """Initialize from llm using default template.
- #
- #         Args:
- #             retriever: retriever to query documents from
- #             llm: llm for query generation using DEFAULT_QUERY_PROMPT
- #             prompt: The prompt which aims to generate several different versions
- #                 of the given user query
- #             include_original: Whether to include the original query in the list of
- #                 generated queries.
- #
- #         Returns:
- #             MultiQueryRetriever
- #         """
- #         output_parser = LineListOutputParser()
- #         llm_chain = prompt | llm | output_parser
- #         return cls(
- #             retriever=retriever,
- #             llm_chain=llm_chain,
- #             include_original=include_original,
- #         )
- #
- #     async def _aget_relevant_documents(
- #         self,
- #         query: str,
- #         *,
- #         run_manager: AsyncCallbackManagerForRetrieverRun,
- #     ) -> List[Document]:
- #         """Get relevant documents given a user query.
- #
- #         Args:
- #             query: user query
- #
- #         Returns:
- #             Unique union of relevant documents from all generated queries
- #         """
- #         queries = await self.agenerate_queries(query, run_manager)
- #         if self.include_original:
- #             queries.append(query)
- #         documents = await self.aretrieve_documents(queries, run_manager)
- #         return self.unique_union(documents)
- #
- #     async def agenerate_queries(
- #         self, question: str, run_manager: AsyncCallbackManagerForRetrieverRun
- #     ) -> List[str]:
- #         """Generate queries based upon user input.
- #
- #         Args:
- #             question: user query
- #
- #         Returns:
- #             List of LLM generated queries that are similar to the user input
- #         """
- #         response = await self.llm_chain.ainvoke(
- #             {"question": question}, config={"callbacks": run_manager.get_child()}
- #         )
- #         if isinstance(self.llm_chain, LLMChain):
- #             lines = response["text"]
- #         else:
- #             lines = response
- #         if self.verbose:
- #             logger.info(f"Generated queries: {lines}")
- #         return lines
- #
- #     async def aretrieve_documents(
- #         self, queries: List[str], run_manager: AsyncCallbackManagerForRetrieverRun
- #     ) -> List[Document]:
- #         """Run all LLM generated queries.
- #
- #         Args:
- #             queries: query list
- #
- #         Returns:
- #             List of retrieved Documents
- #         """
- #         document_lists = await asyncio.gather(
- #             *(
- #                 self.retriever.ainvoke(
- #                     query, config={"callbacks": run_manager.get_child()}
- #                 )
- #                 for query in queries
- #             )
- #         )
- #         return [doc for docs in document_lists for doc in docs]
- #
- #     def _get_relevant_documents(
- #         self,
- #         query: str,
- #         *,
- #         run_manager: CallbackManagerForRetrieverRun,
- #     ) -> List[Document]:
- #         """Get relevant documents given a user query.
- #
- #         Args:
- #             query: user query
- #
- #         Returns:
- #             Unique union of relevant documents from all generated queries
- #         """
- #         queries = self.generate_queries(query, run_manager)
- #         if self.include_original:
- #             queries.append(query)
- #         documents = self.retrieve_documents(queries, run_manager)
- #         return self.unique_union(documents)
- #
- #     def generate_queries(
- #         self, question: str, run_manager: CallbackManagerForRetrieverRun
- #     ) -> List[str]:
- #         """Generate queries based upon user input.
- #
- #         Args:
- #             question: user query
- #
- #         Returns:
- #             List of LLM generated queries that are similar to the user input
- #         """
- #         response = self.llm_chain.invoke(
- #             {"question": question}, config={"callbacks": run_manager.get_child()}
- #         )
- #         if isinstance(self.llm_chain, LLMChain):
- #             lines = response["text"]
- #         else:
- #             lines = response
- #         if self.verbose:
- #             logger.info(f"Generated queries: {lines}")
- #         return lines
- #
- #     def retrieve_documents(
- #         self, queries: List[str], run_manager: CallbackManagerForRetrieverRun
- #     ) -> List[Document]:
- #         """Run all LLM generated queries.
- #
- #         Args:
- #             queries: query list
- #
- #         Returns:
- #             List of retrieved Documents
- #         """
- #         documents = []
- #         for query in queries:
- #             docs = self.retriever.invoke(
- #                 query, config={"callbacks": run_manager.get_child()}
- #             )
- #             documents.extend(docs)
- #         return documents
- #
- #     def unique_union(self, documents: List[Document]) -> List[Document]:
- #         """Get unique Documents.
- #
- #         Args:
- #             documents: List of retrieved Documents
- #
- #         Returns:
- #             List of unique retrieved Documents
- #         """
- #         return _unique_documents(documents)
- ############################################################################################################
-
-
-
-
-
-
-
- ############################################################################################################
- # ElasticSearch Retriever
-
- # https://github.com/langchain-ai/langchain/tree/44e3e2391c48bfd0a8e6a20adde0b6567f4f43c3/templates/rag-elasticsearch
- #
- # https://github.com/langchain-ai/langchain/tree/44e3e2391c48bfd0a8e6a20adde0b6567f4f43c3/templates/rag-self-query
-
-
-
-
-
-
-
-
 
 
+ # Import necessary modules and functions
+ import configparser
+ from typing import Dict, Any
+ # Local Imports
+ # from App_Function_Libraries.ChromaDB_Library import process_and_store_content, vector_search, chroma_client
+ from Article_Extractor_Lib import scrape_article
+ from SQLite_DB import search_db, db
+ # 3rd-Party Imports
+ # import openai
+ # Initialize OpenAI client (adjust this based on your API key management)
+ # openai.api_key = "your-openai-api-key"
+
+
+ # Main RAG pipeline function
+ def rag_pipeline(url: str, query: str, api_choice=None) -> Dict[str, Any]:
+     # Extract content
+     # article_data = scrape_article(url)
+     # content = article_data['content']
+
+     # Process and store content
+     # collection_name = "article_" + str(hash(url))
+     # process_and_store_content(content, collection_name)
+
+     # Perform searches
+     # vector_results = vector_search(collection_name, query, k=5)
+     # fts_results = search_db(query, ["content"], "", page=1, results_per_page=5)
+
+     # Combine results
+     # all_results = vector_results + [result['content'] for result in fts_results]
+     # context = "\n".join(all_results)
+
+     # Generate answer using the selected API
+     # answer = generate_answer(api_choice, context, query)
+
+     # return {
+     #     "answer": answer,
+     #     "context": context
+     # }
+     pass
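# NOTE: a side sketch on the collection naming commented out above. Python's
# built-in hash() is salted per interpreter run (PYTHONHASHSEED), so
# "article_" + str(hash(url)) would point at a different collection on every
# restart. If this path is re-enabled, a digest gives stable names; a minimal
# sketch, assuming the same collection-per-URL scheme:
#
# import hashlib
#
# def stable_collection_name(url: str) -> str:
#     # sha256 of the URL is deterministic across processes, unlike hash() on str.
#     return "article_" + hashlib.sha256(url.encode("utf-8")).hexdigest()[:16]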
+
+ config = configparser.ConfigParser()
+ config.read('config.txt')
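# For reference, the generate_answer() lookups below assume config.txt has an
# [API] section. A minimal sketch of a matching file, with placeholder values
# (only a few of the keys read below are shown; the rest follow the same pattern):
#
# [API]
# openai_api_key = your-openai-key-here
# anthropic_api_key = your-anthropic-key-here
# groq_api_key = your-groq-key-here
# local_llm_path = /path/to/local/model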
+
+ def generate_answer(api_choice: str, context: str, query: str) -> str:
+     prompt = f"Context: {context}\n\nQuestion: {query}"
+     if api_choice == "OpenAI":
+         from App_Function_Libraries.Summarization_General_Lib import summarize_with_openai
+         return summarize_with_openai(config['API']['openai_api_key'], prompt, "")
+     elif api_choice == "Anthropic":
+         from App_Function_Libraries.Summarization_General_Lib import summarize_with_anthropic
+         return summarize_with_anthropic(config['API']['anthropic_api_key'], prompt, "")
+     elif api_choice == "Cohere":
+         from App_Function_Libraries.Summarization_General_Lib import summarize_with_cohere
+         return summarize_with_cohere(config['API']['cohere_api_key'], prompt, "")
+     elif api_choice == "Groq":
+         from App_Function_Libraries.Summarization_General_Lib import summarize_with_groq
+         return summarize_with_groq(config['API']['groq_api_key'], prompt, "")
+     elif api_choice == "OpenRouter":
+         from App_Function_Libraries.Summarization_General_Lib import summarize_with_openrouter
+         return summarize_with_openrouter(config['API']['openrouter_api_key'], prompt, "")
+     elif api_choice == "HuggingFace":
+         from App_Function_Libraries.Summarization_General_Lib import summarize_with_huggingface
+         return summarize_with_huggingface(config['API']['huggingface_api_key'], prompt, "")
+     elif api_choice == "DeepSeek":
+         from App_Function_Libraries.Summarization_General_Lib import summarize_with_deepseek
+         return summarize_with_deepseek(config['API']['deepseek_api_key'], prompt, "")
+     elif api_choice == "Mistral":
+         from App_Function_Libraries.Summarization_General_Lib import summarize_with_mistral
+         return summarize_with_mistral(config['API']['mistral_api_key'], prompt, "")
+     elif api_choice == "Local-LLM":
+         from App_Function_Libraries.Local_Summarization_Lib import summarize_with_local_llm
+         return summarize_with_local_llm(config['API']['local_llm_path'], prompt, "")
+     elif api_choice == "Llama.cpp":
+         from App_Function_Libraries.Local_Summarization_Lib import summarize_with_llama
+         return summarize_with_llama(config['API']['llama_api_key'], prompt, "")
+     elif api_choice == "Kobold":
+         from App_Function_Libraries.Local_Summarization_Lib import summarize_with_kobold
+         return summarize_with_kobold(config['API']['kobold_api_key'], prompt, "")
+     elif api_choice == "Ooba":
+         from App_Function_Libraries.Local_Summarization_Lib import summarize_with_oobabooga
+         return summarize_with_oobabooga(config['API']['ooba_api_key'], prompt, "")
+     elif api_choice == "TabbyAPI":
+         from App_Function_Libraries.Local_Summarization_Lib import summarize_with_tabbyapi
+         return summarize_with_tabbyapi(config['API']['tabby_api_key'], prompt, "")
+     elif api_choice == "vLLM":
+         from App_Function_Libraries.Local_Summarization_Lib import summarize_with_vllm
+         return summarize_with_vllm(config['API']['vllm_api_key'], prompt, "")
+     elif api_choice == "ollama":
+         from App_Function_Libraries.Local_Summarization_Lib import summarize_with_ollama
+         return summarize_with_ollama(config['API']['ollama_api_key'], prompt, "")
+     else:
+         raise ValueError(f"Unsupported API choice: {api_choice}")
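# The if/elif ladder above works but grows with every backend. A table-driven
# dispatch is one possible refactor; a sketch, assuming the same
# summarize_with_*(api_key_or_path, prompt, custom_prompt) signatures used
# throughout this file (names here are illustrative, not part of the codebase):
#
# import importlib
#
# API_DISPATCH = {
#     "OpenAI": ("App_Function_Libraries.Summarization_General_Lib", "summarize_with_openai", "openai_api_key"),
#     "Anthropic": ("App_Function_Libraries.Summarization_General_Lib", "summarize_with_anthropic", "anthropic_api_key"),
#     "ollama": ("App_Function_Libraries.Local_Summarization_Lib", "summarize_with_ollama", "ollama_api_key"),
#     # ... remaining backends follow the same (module, function, config key) pattern
# }
#
# def generate_answer_dispatch(api_choice: str, context: str, query: str) -> str:
#     prompt = f"Context: {context}\n\nQuestion: {query}"
#     if api_choice not in API_DISPATCH:
#         raise ValueError(f"Unsupported API choice: {api_choice}")
#     module_name, func_name, key_name = API_DISPATCH[api_choice]
#     func = getattr(importlib.import_module(module_name), func_name)
#     return func(config['API'][key_name], prompt, "")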
+
+ # Function to preprocess and store all existing content in the database
+ # def preprocess_all_content():
+ #     with db.get_connection() as conn:
+ #         cursor = conn.cursor()
+ #         cursor.execute("SELECT id, content FROM Media")
+ #         for row in cursor.fetchall():
+ #             process_and_store_content(row[1], f"media_{row[0]}")
+
+
+ # Function to perform RAG search across all stored content
+ def rag_search(query: str, api_choice: str) -> Dict[str, Any]:
+     # Perform vector search across all collections
+     # all_collections = chroma_client.list_collections()
+     # vector_results = []
+     # for collection in all_collections:
+     #     vector_results.extend(vector_search(collection.name, query, k=2))
+
+     # Perform FTS search
+     # fts_results = search_db(query, ["content"], "", page=1, results_per_page=10)
+
+     # Combine results
+     # all_results = vector_results + [result['content'] for result in fts_results]
+     # context = "\n".join(all_results[:10])  # Limit to top 10 results
+
+     # Generate answer using the selected API
+     # answer = generate_answer(api_choice, context, query)
+
+     # return {
+     #     "answer": answer,
+     #     "context": context
+     # }
+     pass
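# When the body above is re-enabled, plain concatenation can hand near-duplicate
# chunks to the model (the same text found by both the vector and FTS paths).
# A small order-preserving de-duplication before the top-10 cut is sketched
# below (dedupe_results is a hypothetical helper, not part of this codebase):
#
# def dedupe_results(results, limit=10):
#     seen = set()
#     unique = []
#     for chunk in results:
#         if chunk not in seen:  # keep first occurrence, preserving rank order
#             seen.add(chunk)
#             unique.append(chunk)
#     return unique[:limit]
#
# # e.g. context = "\n".join(dedupe_results(all_results))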
+
+ # Example usage:
+ # 1. Initialize the system:
+ # create_tables(db)  # Ensure FTS tables are set up
+ # preprocess_all_content()  # Process and store all existing content
+
+ # 2. Perform RAG on a specific URL:
+ # result = rag_pipeline("https://example.com/article", "What is the main topic of this article?")
+ # print(result['answer'])
+
+ # 3. Perform RAG search across all content:
+ # result = rag_search("What are the key points about climate change?", "OpenAI")
+ # print(result['answer'])
+
+
+
+
+ ##################################################################################################################
+ # RAG Pipeline 1
+ # 0.62 0.61 0.75 63402.0
+ # from langchain_openai import ChatOpenAI
+ #
+ # from langchain_community.document_loaders import WebBaseLoader
+ # from langchain_openai import OpenAIEmbeddings
+ # from langchain.text_splitter import RecursiveCharacterTextSplitter
+ # from langchain_chroma import Chroma
+ #
+ # from langchain_community.retrievers import BM25Retriever
+ # from langchain.retrievers import ParentDocumentRetriever
+ # from langchain.storage import InMemoryStore
+ # import os
+ # from operator import itemgetter
+ # from langchain import hub
+ # from langchain_core.output_parsers import StrOutputParser
+ # from langchain_core.runnables import RunnablePassthrough, RunnableParallel, RunnableLambda
+ # from langchain.retrievers import MergerRetriever
+ # from langchain.retrievers.document_compressors import DocumentCompressorPipeline
+
+
+ # def rag_pipeline():
+ #     try:
+ #         def format_docs(docs):
+ #             return "\n".join(doc.page_content for doc in docs)
+ #
+ #         llm = ChatOpenAI(model='gpt-4o-mini')
+ #
+ #         loader = WebBaseLoader('https://en.wikipedia.org/wiki/European_debt_crisis')
+ #         docs = loader.load()
+ #
+ #         embedding = OpenAIEmbeddings(model='text-embedding-3-large')
+ #
+ #         splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=200)
+ #         splits = splitter.split_documents(docs)
+ #         c = Chroma.from_documents(documents=splits, embedding=embedding,
+ #                                   collection_name='testindex-ragbuilder-1724657573', )
+ #         retrievers = []
+ #         retriever = c.as_retriever(search_type='mmr', search_kwargs={'k': 10})
+ #         retrievers.append(retriever)
+ #         retriever = BM25Retriever.from_documents(docs)
+ #         retrievers.append(retriever)
+ #
+ #         parent_splitter = RecursiveCharacterTextSplitter(chunk_size=1200, chunk_overlap=600)
+ #         splits = parent_splitter.split_documents(docs)
+ #         store = InMemoryStore()
+ #         retriever = ParentDocumentRetriever(vectorstore=c, docstore=store, child_splitter=splitter,
+ #                                             parent_splitter=parent_splitter)
+ #         retriever.add_documents(docs)
+ #         retrievers.append(retriever)
+ #         retriever = MergerRetriever(retrievers=retrievers)
+ #         prompt = hub.pull("rlm/rag-prompt")
+ #         rag_chain = (
+ #             RunnableParallel(context=retriever, question=RunnablePassthrough())
+ #             .assign(context=itemgetter("context") | RunnableLambda(format_docs))
+ #             .assign(answer=prompt | llm | StrOutputParser())
+ #             .pick(["answer", "context"]))
+ #         return rag_chain
+ #     except Exception as e:
+ #         print(f"An error occurred: {e}")
+
+
+ ## To get the answer and context, use the following code
+ # res = rag_pipeline().invoke("your prompt here")
+ # print(res["answer"])
+ # print(res["context"])
+
+ ############################################################################################################
+
+
+
+ ############################################################################################################
+ # RAG Pipeline 2
+
+ # 0.6 0.73 0.68 3125.0
+ # from langchain_openai import ChatOpenAI
+ #
+ # from langchain_community.document_loaders import WebBaseLoader
+ # from langchain_openai import OpenAIEmbeddings
+ # from langchain.text_splitter import RecursiveCharacterTextSplitter
+ # from langchain_chroma import Chroma
+ # from langchain.retrievers.multi_query import MultiQueryRetriever
+ # from langchain.retrievers import ParentDocumentRetriever
+ # from langchain.storage import InMemoryStore
+ # from langchain_community.document_transformers import EmbeddingsRedundantFilter
+ # from langchain.retrievers.document_compressors import LLMChainFilter
+ # from langchain.retrievers.document_compressors import EmbeddingsFilter
+ # from langchain.retrievers import ContextualCompressionRetriever
+ # import os
+ # from operator import itemgetter
+ # from langchain import hub
+ # from langchain_core.output_parsers import StrOutputParser
+ # from langchain_core.runnables import RunnablePassthrough, RunnableParallel, RunnableLambda
+ # from langchain.retrievers import MergerRetriever
+ # from langchain.retrievers.document_compressors import DocumentCompressorPipeline
+
+
+ # def rag_pipeline():
+ #     try:
+ #         def format_docs(docs):
+ #             return "\n".join(doc.page_content for doc in docs)
+ #
+ #         llm = ChatOpenAI(model='gpt-4o-mini')
+ #
+ #         loader = WebBaseLoader('https://en.wikipedia.org/wiki/European_debt_crisis')
+ #         docs = loader.load()
+ #
+ #         embedding = OpenAIEmbeddings(model='text-embedding-3-large')
+ #
+ #         splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=200)
+ #         splits = splitter.split_documents(docs)
+ #         c = Chroma.from_documents(documents=splits, embedding=embedding,
+ #                                   collection_name='testindex-ragbuilder-1724650962', )
+ #         retrievers = []
+ #         retriever = MultiQueryRetriever.from_llm(c.as_retriever(search_type='similarity', search_kwargs={'k': 10}),
+ #                                                  llm=llm)
+ #         retrievers.append(retriever)
+ #
+ #         parent_splitter = RecursiveCharacterTextSplitter(chunk_size=1200, chunk_overlap=600)
+ #         splits = parent_splitter.split_documents(docs)
+ #         store = InMemoryStore()
+ #         retriever = ParentDocumentRetriever(vectorstore=c, docstore=store, child_splitter=splitter,
+ #                                             parent_splitter=parent_splitter)
+ #         retriever.add_documents(docs)
+ #         retrievers.append(retriever)
+ #         retriever = MergerRetriever(retrievers=retrievers)
+ #         arr_comp = []
+ #         arr_comp.append(EmbeddingsRedundantFilter(embeddings=embedding))
+ #         arr_comp.append(LLMChainFilter.from_llm(llm))
+ #         pipeline_compressor = DocumentCompressorPipeline(transformers=arr_comp)
+ #         retriever = ContextualCompressionRetriever(base_retriever=retriever, base_compressor=pipeline_compressor)
+ #         prompt = hub.pull("rlm/rag-prompt")
+ #         rag_chain = (
+ #             RunnableParallel(context=retriever, question=RunnablePassthrough())
+ #             .assign(context=itemgetter("context") | RunnableLambda(format_docs))
+ #             .assign(answer=prompt | llm | StrOutputParser())
+ #             .pick(["answer", "context"]))
+ #         return rag_chain
+ #     except Exception as e:
+ #         print(f"An error occurred: {e}")
+
+
+ ## To get the answer and context, use the following code
+ # res = rag_pipeline().invoke("your prompt here")
+ # print(res["answer"])
+ # print(res["context"])
+
+
+
+
+
+
+
+ ############################################################################################################
+ # Plain bm25 retriever
+ # class BM25Retriever(BaseRetriever):
+ #     """`BM25` retriever without Elasticsearch."""
+ #
+ #     vectorizer: Any
+ #     """ BM25 vectorizer."""
+ #     docs: List[Document] = Field(repr=False)
+ #     """ List of documents."""
+ #     k: int = 4
+ #     """ Number of documents to return."""
+ #     preprocess_func: Callable[[str], List[str]] = default_preprocessing_func
+ #     """ Preprocessing function to use on the text before BM25 vectorization."""
+ #
+ #     class Config:
+ #         arbitrary_types_allowed = True
+ #
+ #     @classmethod
+ #     def from_texts(
+ #         cls,
+ #         texts: Iterable[str],
+ #         metadatas: Optional[Iterable[dict]] = None,
+ #         bm25_params: Optional[Dict[str, Any]] = None,
+ #         preprocess_func: Callable[[str], List[str]] = default_preprocessing_func,
+ #         **kwargs: Any,
+ #     ) -> BM25Retriever:
+ #         """
+ #         Create a BM25Retriever from a list of texts.
+ #         Args:
+ #             texts: A list of texts to vectorize.
+ #             metadatas: A list of metadata dicts to associate with each text.
+ #             bm25_params: Parameters to pass to the BM25 vectorizer.
+ #             preprocess_func: A function to preprocess each text before vectorization.
+ #             **kwargs: Any other arguments to pass to the retriever.
+ #
+ #         Returns:
+ #             A BM25Retriever instance.
+ #         """
+ #         try:
+ #             from rank_bm25 import BM25Okapi
+ #         except ImportError:
+ #             raise ImportError(
+ #                 "Could not import rank_bm25, please install with `pip install "
+ #                 "rank_bm25`."
+ #             )
+ #
+ #         texts_processed = [preprocess_func(t) for t in texts]
+ #         bm25_params = bm25_params or {}
+ #         vectorizer = BM25Okapi(texts_processed, **bm25_params)
+ #         metadatas = metadatas or ({} for _ in texts)
+ #         docs = [Document(page_content=t, metadata=m) for t, m in zip(texts, metadatas)]
+ #         return cls(
+ #             vectorizer=vectorizer, docs=docs, preprocess_func=preprocess_func, **kwargs
+ #         )
+ #
+ #     @classmethod
+ #     def from_documents(
+ #         cls,
+ #         documents: Iterable[Document],
+ #         *,
+ #         bm25_params: Optional[Dict[str, Any]] = None,
+ #         preprocess_func: Callable[[str], List[str]] = default_preprocessing_func,
+ #         **kwargs: Any,
+ #     ) -> BM25Retriever:
+ #         """
+ #         Create a BM25Retriever from a list of Documents.
+ #         Args:
+ #             documents: A list of Documents to vectorize.
+ #             bm25_params: Parameters to pass to the BM25 vectorizer.
+ #             preprocess_func: A function to preprocess each text before vectorization.
+ #             **kwargs: Any other arguments to pass to the retriever.
+ #
+ #         Returns:
+ #             A BM25Retriever instance.
+ #         """
+ #         texts, metadatas = zip(*((d.page_content, d.metadata) for d in documents))
+ #         return cls.from_texts(
+ #             texts=texts,
+ #             bm25_params=bm25_params,
+ #             metadatas=metadatas,
+ #             preprocess_func=preprocess_func,
+ #             **kwargs,
+ #         )
+ #
+ #     def _get_relevant_documents(
+ #         self, query: str, *, run_manager: CallbackManagerForRetrieverRun
+ #     ) -> List[Document]:
+ #         processed_query = self.preprocess_func(query)
+ #         return_docs = self.vectorizer.get_top_n(processed_query, self.docs, n=self.k)
+ #         return return_docs
+ ############################################################################################################
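# For a quick feel for the vectorizer this retriever wraps, rank_bm25 can be
# used directly. A minimal, self-contained sketch (plain lower/split stands in
# for default_preprocessing_func):
#
# from rank_bm25 import BM25Okapi
#
# corpus = [
#     "BM25 is a ranking function used by search engines.",
#     "ChromaDB stores dense vector embeddings.",
#     "Full-text search uses an inverted index.",
# ]
# tokenized_corpus = [doc.lower().split() for doc in corpus]
# bm25 = BM25Okapi(tokenized_corpus)
#
# query = "vector embeddings".lower().split()
# print(bm25.get_top_n(query, corpus, n=2))  # two best-matching documents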
+
+ ############################################################################################################
+ # ElasticSearch BM25 Retriever
+ # class ElasticSearchBM25Retriever(BaseRetriever):
+ #     """`Elasticsearch` retriever that uses `BM25`.
+ #
+ #     To connect to an Elasticsearch instance that requires login credentials,
+ #     including Elastic Cloud, use the Elasticsearch URL format
+ #     https://username:password@es_host:9243. For example, to connect to Elastic
+ #     Cloud, create the Elasticsearch URL with the required authentication details and
+ #     pass it to the ElasticVectorSearch constructor as the named parameter
+ #     elasticsearch_url.
+ #
+ #     You can obtain your Elastic Cloud URL and login credentials by logging in to the
+ #     Elastic Cloud console at https://cloud.elastic.co, selecting your deployment, and
+ #     navigating to the "Deployments" page.
+ #
+ #     To obtain your Elastic Cloud password for the default "elastic" user:
+ #
+ #     1. Log in to the Elastic Cloud console at https://cloud.elastic.co
+ #     2. Go to "Security" > "Users"
+ #     3. Locate the "elastic" user and click "Edit"
+ #     4. Click "Reset password"
+ #     5. Follow the prompts to reset the password
+ #
+ #     The format for Elastic Cloud URLs is
+ #     https://username:password@cluster_id.region_id.gcp.cloud.es.io:9243.
+ #     """
+ #
+ #     client: Any
+ #     """Elasticsearch client."""
+ #     index_name: str
+ #     """Name of the index to use in Elasticsearch."""
+ #
+ #     @classmethod
+ #     def create(
+ #         cls, elasticsearch_url: str, index_name: str, k1: float = 2.0, b: float = 0.75
+ #     ) -> ElasticSearchBM25Retriever:
+ #         """
+ #         Create an ElasticSearchBM25Retriever from an Elasticsearch URL and index name.
+ #
+ #         Args:
+ #             elasticsearch_url: URL of the Elasticsearch instance to connect to.
+ #             index_name: Name of the index to use in Elasticsearch.
+ #             k1: BM25 parameter k1.
+ #             b: BM25 parameter b.
+ #
+ #         Returns:
+ #             An ElasticSearchBM25Retriever instance.
+ #         """
+ #         from elasticsearch import Elasticsearch
+ #
+ #         # Create an Elasticsearch client instance
+ #         es = Elasticsearch(elasticsearch_url)
+ #
+ #         # Define the index settings and mappings
+ #         settings = {
+ #             "analysis": {"analyzer": {"default": {"type": "standard"}}},
+ #             "similarity": {
+ #                 "custom_bm25": {
+ #                     "type": "BM25",
+ #                     "k1": k1,
+ #                     "b": b,
+ #                 }
+ #             },
+ #         }
+ #         mappings = {
+ #             "properties": {
+ #                 "content": {
+ #                     "type": "text",
+ #                     "similarity": "custom_bm25",  # Use the custom BM25 similarity
+ #                 }
+ #             }
+ #         }
+ #
+ #         # Create the index with the specified settings and mappings
+ #         es.indices.create(index=index_name, mappings=mappings, settings=settings)
+ #         return cls(client=es, index_name=index_name)
+ #
+ #     def add_texts(
+ #         self,
+ #         texts: Iterable[str],
+ #         refresh_indices: bool = True,
+ #     ) -> List[str]:
+ #         """Run more texts through the embeddings and add to the retriever.
+ #
+ #         Args:
+ #             texts: Iterable of strings to add to the retriever.
+ #             refresh_indices: bool to refresh ElasticSearch indices
+ #
+ #         Returns:
+ #             List of ids from adding the texts into the retriever.
+ #         """
+ #         try:
+ #             from elasticsearch.helpers import bulk
+ #         except ImportError:
+ #             raise ImportError(
+ #                 "Could not import elasticsearch python package. "
+ #                 "Please install it with `pip install elasticsearch`."
+ #             )
+ #         requests = []
+ #         ids = []
+ #         for i, text in enumerate(texts):
+ #             _id = str(uuid.uuid4())
+ #             request = {
+ #                 "_op_type": "index",
+ #                 "_index": self.index_name,
+ #                 "content": text,
+ #                 "_id": _id,
+ #             }
+ #             ids.append(_id)
+ #             requests.append(request)
+ #         bulk(self.client, requests)
+ #
+ #         if refresh_indices:
+ #             self.client.indices.refresh(index=self.index_name)
+ #         return ids
+ #
+ #     def _get_relevant_documents(
+ #         self, query: str, *, run_manager: CallbackManagerForRetrieverRun
+ #     ) -> List[Document]:
+ #         query_dict = {"query": {"match": {"content": query}}}
+ #         res = self.client.search(index=self.index_name, body=query_dict)
+ #
+ #         docs = []
+ #         for r in res["hits"]["hits"]:
+ #             docs.append(Document(page_content=r["_source"]["content"]))
+ #         return docs
+ ############################################################################################################
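# Assuming the class above is uncommented and its elided imports (uuid, Document,
# BaseRetriever, CallbackManagerForRetrieverRun) are restored, usage against a
# local Elasticsearch node might look roughly like this (URL and index name are
# placeholders):
#
# retriever = ElasticSearchBM25Retriever.create(
#     elasticsearch_url="http://localhost:9200",
#     index_name="rag_content",
# )
# retriever.add_texts(["first chunk of text", "second chunk of text"])
# docs = retriever.invoke("first chunk")  # invoke() is inherited from BaseRetriever
# print([d.page_content for d in docs])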
+
+
+ ############################################################################################################
+ # Multi Query Retriever
+ # class MultiQueryRetriever(BaseRetriever):
+ #     """Given a query, use an LLM to write a set of queries.
+ #
+ #     Retrieve docs for each query. Return the unique union of all retrieved docs.
+ #     """
+ #
+ #     retriever: BaseRetriever
+ #     llm_chain: Runnable
+ #     verbose: bool = True
+ #     parser_key: str = "lines"
+ #     """DEPRECATED. parser_key is no longer used and should not be specified."""
+ #     include_original: bool = False
+ #     """Whether to include the original query in the list of generated queries."""
+ #
+ #     @classmethod
+ #     def from_llm(
+ #         cls,
+ #         retriever: BaseRetriever,
+ #         llm: BaseLanguageModel,
+ #         prompt: BasePromptTemplate = DEFAULT_QUERY_PROMPT,
+ #         parser_key: Optional[str] = None,
+ #         include_original: bool = False,
+ #     ) -> "MultiQueryRetriever":
+ #         """Initialize from llm using default template.
+ #
+ #         Args:
+ #             retriever: retriever to query documents from
+ #             llm: llm for query generation using DEFAULT_QUERY_PROMPT
+ #             prompt: The prompt which aims to generate several different versions
+ #                 of the given user query
+ #             include_original: Whether to include the original query in the list of
+ #                 generated queries.
+ #
+ #         Returns:
+ #             MultiQueryRetriever
+ #         """
+ #         output_parser = LineListOutputParser()
+ #         llm_chain = prompt | llm | output_parser
+ #         return cls(
+ #             retriever=retriever,
+ #             llm_chain=llm_chain,
+ #             include_original=include_original,
+ #         )
+ #
+ #     async def _aget_relevant_documents(
+ #         self,
+ #         query: str,
+ #         *,
+ #         run_manager: AsyncCallbackManagerForRetrieverRun,
+ #     ) -> List[Document]:
+ #         """Get relevant documents given a user query.
+ #
+ #         Args:
+ #             query: user query
+ #
+ #         Returns:
+ #             Unique union of relevant documents from all generated queries
+ #         """
+ #         queries = await self.agenerate_queries(query, run_manager)
+ #         if self.include_original:
+ #             queries.append(query)
+ #         documents = await self.aretrieve_documents(queries, run_manager)
+ #         return self.unique_union(documents)
+ #
+ #     async def agenerate_queries(
+ #         self, question: str, run_manager: AsyncCallbackManagerForRetrieverRun
+ #     ) -> List[str]:
+ #         """Generate queries based upon user input.
+ #
+ #         Args:
+ #             question: user query
+ #
+ #         Returns:
+ #             List of LLM generated queries that are similar to the user input
+ #         """
+ #         response = await self.llm_chain.ainvoke(
+ #             {"question": question}, config={"callbacks": run_manager.get_child()}
+ #         )
+ #         if isinstance(self.llm_chain, LLMChain):
+ #             lines = response["text"]
+ #         else:
+ #             lines = response
+ #         if self.verbose:
+ #             logger.info(f"Generated queries: {lines}")
+ #         return lines
+ #
+ #     async def aretrieve_documents(
+ #         self, queries: List[str], run_manager: AsyncCallbackManagerForRetrieverRun
+ #     ) -> List[Document]:
+ #         """Run all LLM generated queries.
+ #
+ #         Args:
+ #             queries: query list
+ #
+ #         Returns:
+ #             List of retrieved Documents
+ #         """
+ #         document_lists = await asyncio.gather(
+ #             *(
+ #                 self.retriever.ainvoke(
+ #                     query, config={"callbacks": run_manager.get_child()}
+ #                 )
+ #                 for query in queries
+ #             )
+ #         )
+ #         return [doc for docs in document_lists for doc in docs]
+ #
+ #     def _get_relevant_documents(
+ #         self,
+ #         query: str,
+ #         *,
+ #         run_manager: CallbackManagerForRetrieverRun,
+ #     ) -> List[Document]:
+ #         """Get relevant documents given a user query.
+ #
+ #         Args:
+ #             query: user query
+ #
+ #         Returns:
+ #             Unique union of relevant documents from all generated queries
+ #         """
+ #         queries = self.generate_queries(query, run_manager)
+ #         if self.include_original:
+ #             queries.append(query)
+ #         documents = self.retrieve_documents(queries, run_manager)
+ #         return self.unique_union(documents)
+ #
+ #     def generate_queries(
+ #         self, question: str, run_manager: CallbackManagerForRetrieverRun
+ #     ) -> List[str]:
+ #         """Generate queries based upon user input.
+ #
+ #         Args:
+ #             question: user query
+ #
+ #         Returns:
+ #             List of LLM generated queries that are similar to the user input
+ #         """
+ #         response = self.llm_chain.invoke(
+ #             {"question": question}, config={"callbacks": run_manager.get_child()}
+ #         )
+ #         if isinstance(self.llm_chain, LLMChain):
+ #             lines = response["text"]
+ #         else:
+ #             lines = response
+ #         if self.verbose:
+ #             logger.info(f"Generated queries: {lines}")
+ #         return lines
+ #
+ #     def retrieve_documents(
+ #         self, queries: List[str], run_manager: CallbackManagerForRetrieverRun
+ #     ) -> List[Document]:
+ #         """Run all LLM generated queries.
+ #
+ #         Args:
+ #             queries: query list
+ #
+ #         Returns:
+ #             List of retrieved Documents
+ #         """
+ #         documents = []
+ #         for query in queries:
+ #             docs = self.retriever.invoke(
+ #                 query, config={"callbacks": run_manager.get_child()}
+ #             )
+ #             documents.extend(docs)
+ #         return documents
+ #
+ #     def unique_union(self, documents: List[Document]) -> List[Document]:
+ #         """Get unique Documents.
+ #
+ #         Args:
+ #             documents: List of retrieved Documents
+ #
+ #         Returns:
+ #             List of unique retrieved Documents
+ #         """
+ #         return _unique_documents(documents)
+ ############################################################################################################
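# This is the LangChain implementation that RAG Pipeline 2 above pulls in via
# MultiQueryRetriever.from_llm. A minimal usage sketch with the same imports
# that pipeline uses (vectorstore is assumed to be any populated vector store,
# e.g. the Chroma instance built there):
#
# from langchain.retrievers.multi_query import MultiQueryRetriever
# from langchain_openai import ChatOpenAI
#
# llm = ChatOpenAI(model='gpt-4o-mini')
# retriever = MultiQueryRetriever.from_llm(
#     retriever=vectorstore.as_retriever(),
#     llm=llm,
# )
# docs = retriever.invoke("How did the European debt crisis start?")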
+
+
+
+
+
+
+
+ ############################################################################################################
+ # ElasticSearch Retriever
+
+ # https://github.com/langchain-ai/langchain/tree/44e3e2391c48bfd0a8e6a20adde0b6567f4f43c3/templates/rag-elasticsearch
+ #
+ # https://github.com/langchain-ai/langchain/tree/44e3e2391c48bfd0a8e6a20adde0b6567f4f43c3/templates/rag-self-query
+
+
+
+
+
+
+
+