OI

Build error

App Files Files Community

JPBianchi committed on Jun 24, 2024

Commit

f85a680

1 Parent(s): 94263d8

multiRAG implementation

Browse files

Files changed (22) hide show

app/engine/chunk_embed.py +1 -3
app/engine/loaders/file.py +49 -6
app/engine/logger.py +14 -8
app/engine/post_process.py +74 -0
app/engine/processing.py +116 -18
app/engine/summary.py +53 -0
app/engine/vectorstore.py +139 -38
app/engine/weaviate_interface_v4.py +4 -0
app/main.py +101 -53
app/main_reflex.py +231 -0
app/notebooks/__init__.py +0 -0
app/requirements.txt +7 -3
app/settings.py +2 -1
app/tests/test_main.py +1 -1
assets/IO_logo.webp +0 -0
assets/OI_logo.jpg +0 -0
assets/amazon_forecast.jpg +0 -0
assets/amazon_idiot.jpg +0 -0
assets/favicon.ico +0 -0
assets/homepage.jpg +0 -0
assets/irrelevant_amazon.jpg +0 -0
assets/multirag_good.jpeg +0 -0

app/engine/chunk_embed.py CHANGED Viewed

@@ -1,10 +1,8 @@
 import os
 import pandas as pd
 import torch
-from settings import parquet_file
 import tiktoken  # tokenizer library for use with OpenAI LLMs
 from llama_index.legacy.text_splitter import SentenceSplitter

 import os
 import pandas as pd
 import torch
+from app.settings import parquet_file
 import tiktoken  # tokenizer library for use with OpenAI LLMs
 from llama_index.legacy.text_splitter import SentenceSplitter

app/engine/loaders/file.py CHANGED Viewed

@@ -3,13 +3,15 @@ import os
 # from langchain.document_loaders import PyPDFLoader  # deprecated
 from langchain_community.document_loaders import PyPDFLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from llama_parse import LlamaParse
 from typing import Union, List, Dict
 from abc import ABC, abstractmethod
-class PDFExtractor(ABC):
     def __init__(self, file_or_list: Union[str, List[str]], num_workers: int = 1, verbose: bool = False):
         """ We can provide a list of files or a single file """
@@ -40,7 +42,7 @@ class PDFExtractor(ABC):
         """
         pass
-class _PyPDFLoader(PDFExtractor):
     def extract_text(self):
         output_dict = {}
@@ -58,7 +60,7 @@ class _PyPDFLoader(PDFExtractor):
         return
-class _LlamaParse(PDFExtractor):
     def extract_text(self):
         # https://github.com/run-llama/llama_parse
@@ -88,18 +90,59 @@ class _LlamaParse(PDFExtractor):
         raise NotImplementedError("Not implemented or LlamaParse does not support table extraction")
         return
-def pdf_extractor(extractor_type: str, *args, **kwargs) -> PDFExtractor:
-    """ Factory function to return the appropriate PDF extractor instance, properly initialized """
     if extractor_type == 'PyPDFLoader':
         return _PyPDFLoader(*args, **kwargs)
     elif extractor_type == 'LlamaParse':
         return _LlamaParse(*args, **kwargs)
     else:
         raise ValueError(f"Unsupported PDF extractor type: {extractor_type}")

 # from langchain.document_loaders import PyPDFLoader  # deprecated
 from langchain_community.document_loaders import PyPDFLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.document_loaders.csv_loader import CSVLoader
+# ^ if we want to add CSV support, it will transform every row into a k:v pair
 from llama_parse import LlamaParse
 from typing import Union, List, Dict
 from abc import ABC, abstractmethod
+class Extractor(ABC):
     def __init__(self, file_or_list: Union[str, List[str]], num_workers: int = 1, verbose: bool = False):
         """ We can provide a list of files or a single file """
         """
         pass
+class _PyPDFLoader(Extractor):
     def extract_text(self):
         output_dict = {}
         return
+class _LlamaParse(Extractor):
     def extract_text(self):
         # https://github.com/run-llama/llama_parse
         raise NotImplementedError("Not implemented or LlamaParse does not support table extraction")
         return
class _TXTLoader(Extractor):
    """Extractor for plain-text files.

    Iterates over ``self.filelist``, which is expected to be set up by the
    ``Extractor`` base class (its ``__init__`` body is not visible here).
    """

    def extract_text(self):
        """Return ``{file name: [file content]}`` for every file in ``self.filelist``.

        With PDFs the loaders return a list of strings, one per page, so the
        whole text file is wrapped in a one-element list to keep that shape.
        """
        output_dict = {}
        for fpath in self.filelist:
            fname = os.path.basename(fpath)
            # Context manager guarantees the handle is closed; the previous
            # bare open(...).read() left that to the garbage collector.
            with open(fpath, 'r') as fh:
                output_dict[fname] = [fh.read()]
        return output_dict

    def extract_images(self):
        """Not supported for text files; always raises NotImplementedError."""
        raise NotImplementedError("Not implemented or TXTLoader does not support image extraction")

    def extract_tables(self):
        """Not supported for text files; always raises NotImplementedError."""
        raise NotImplementedError("Not implemented or TXTLoader does not support table extraction")
class _CSVLoader(Extractor):
    """Extractor for CSV files, backed by LangChain's ``CSVLoader``.

    Mock code for now, as a reminder of what we could do if time allows (TODO).
    Still untested, and not registered in the ``extractor`` factory.
    """

    def extract_text(self):
        """Return ``{file name: [row text, ...]}`` for every file in ``self.filelist``.

        ``CSVLoader.load()`` yields one Document per CSV row; only the text
        (``page_content``) is kept so the value is a flat list of strings,
        the same shape the PDF/TXT loaders produce.
        """
        output_dict = {}
        for fpath in self.filelist:
            fname = os.path.basename(fpath)
            row_docs = CSVLoader(fpath).load()  # <<  untested!
            output_dict[fname] = [doc.page_content for doc in row_docs]
        return output_dict

    def extract_images(self):
        """Not supported for CSV files; always raises NotImplementedError."""
        raise NotImplementedError("Not implemented or CSVLoader does not support image extraction")

    def extract_tables(self):
        """Not supported for CSV files; always raises NotImplementedError."""
        raise NotImplementedError("Not implemented or CSVLoader does not support table extraction")
def extractor(extractor_type: str, *args, **kwargs) -> "Extractor":
    """Factory returning the extractor instance matching ``extractor_type``.

    Args:
        extractor_type: one of ``'PyPDFLoader'``, ``'LlamaParse'`` or ``'txt'``.
        *args, **kwargs: forwarded unchanged to the selected extractor class.

    Returns:
        The initialized extractor.

    Raises:
        ValueError: if ``extractor_type`` is not one of the supported names.
    """
    if extractor_type == 'PyPDFLoader':
        return _PyPDFLoader(*args, **kwargs)
    if extractor_type == 'LlamaParse':
        return _LlamaParse(*args, **kwargs)
    if extractor_type == 'txt':
        return _TXTLoader(*args, **kwargs)
    # The factory is no longer PDF-only, so the message does not say "PDF".
    raise ValueError(f"Unsupported extractor type: {extractor_type}")
+#/usr/bin/env /Users/jpb2/Library/Caches/pypoetry/virtualenvs/reflex-Y1r5RCNB-py3.10/bin/python /Users/jpb2/.vscode/extensions/ms-python.debugpy-2024.6.0-darwin-x64/bundled/libs/debugpy/adapter/../../debugpy/launcher 51572 -- -m reflex run --frontend-port 3000 --loglevel debug
+#/usr/bin/env /Volumes/DATA/Dropbox/IMAC_BACKUP/WORK/PROJECTS/INNOVATION/venv/bin/python /Users/jpb2/.vscode/extensions/ms-python.debugpy-2024.6.0-darwin-x64/bundled/libs/debugpy/adapter/../../debugpy/launcher 53961 -- -m reflex run --frontend-port 3001 --loglevel debug --env dev

app/engine/logger.py CHANGED Viewed

@@ -1,10 +1,16 @@
 import os, logging
-environment = os.getenv("ENVIRONMENT", "dev")
-if environment == "dev":
-    logger = logging.getLogger("uvicorn")
-else:
-    logger = lambda x: _
-    # we should log also in production  TODO
-    # check how it works on HuggingFace, if possible
-    # because we don't have access to the container's file system

 import os, logging
+# import reflex as rx
+logger = logging.getLogger("uvicorn").info
+# logger = lambda x: rx.console_log(x)
+# let's use reflex's logger, but doesn't show in the console??
+# environment = os.getenv("ENVIRONMENT", "dev")
+# if environment == "dev":
+#     logger = logging.getLogger("uvicorn").info
+# else:
+#     logger = lambda x: print(x)
+#     # we should log also in production  TODO
+#     # check how it works on HuggingFace, if possible
+#     # because we don't have access to the container's file system unless in pro mode

app/engine/post_process.py ADDED Viewed

	@@ -0,0 +1,74 @@

+import reflex as rx
+import json
+import requests
+from typing import Optional, List
+from pydantic import BaseModel, Field
+# from rerank import ReRanker
+# https://hub.guardrailsai.com/validator/guardrails/toxic_language
+from guardrails.hub import ToxicLanguage
+from guardrails import Guard
+# guardrails hub install hub://guardrails/detect_pii
+from guardrails.hub import DetectPII
+# https://hub.guardrailsai.com/validator/guardrails/qa_relevance_llm_eval
+from guardrails.hub import QARelevanceLLMEval
+import logging
+logger = logging.getLogger("uvicorn").info
+from .summary import summarize_it
def IsPii(answer: str) -> bool:
    """Return True when ``answer`` appears to contain PII.

    Runs the guardrails ``DetectPII`` validator for e-mail addresses and phone
    numbers with ``on_fail="exception"``: a clean validation means no PII was
    found, a raised exception means the validator rejected the text.

    The previous version returned the opposite (True on a clean validation),
    which made callers tag clean answers as "Pii detected".
    """
    guard = Guard().use(DetectPII,
                        ["EMAIL_ADDRESS", "PHONE_NUMBER"],
                        "exception",
                        )
    try:
        guard.validate(answer)
    except Exception as e:
        # NOTE(review): any failure (not only a PII hit) lands here and is
        # reported as PII, mirroring how IsToxic treats exceptions.
        print(e)
        return True
    return False
def IsToxic(query: str, threshold=0.5) -> bool:
    """Return True when the guardrails ToxicLanguage validator rejects ``query``.

    ``threshold`` is forwarded to the validator (high = flag highly toxic text
    only). Validation is done per sentence and failures surface as exceptions.
    See https://hub.guardrailsai.com/validator/guardrails/toxic_language
    """
    toxicity_guard = Guard().use(
        ToxicLanguage,
        threshold=threshold,
        validation_method="sentence",
        on_fail="exception",
    )
    try:
        toxicity_guard.validate(query)
    except Exception as err:
        print(err)  # will output the toxic question
        return True
    else:
        return False
def IsRelevant(answer: str, query: str, model: str="gpt-3.5-turbo") -> bool:
    """Return True when the QARelevanceLLMEval validator accepts ``answer`` for ``query``.

    ``model`` is passed as the validator's ``llm_callable``; the query is
    supplied through the ``original_prompt`` metadata entry. A raised
    exception is printed and reported as "not relevant".
    """
    relevance_guard = Guard().use(
        QARelevanceLLMEval,
        llm_callable=model,
        on_fail="exception",
    )
    prompt_metadata = {"original_prompt": query}
    try:
        relevance_guard.validate(answer, metadata=prompt_metadata)
    except Exception as err:
        print(err)
        return False
    return True

app/engine/processing.py CHANGED Viewed

@@ -1,48 +1,146 @@
 import os, pickle
 from typing import List
-from engine.loaders.file import pdf_extractor
-from engine.chunk_embed import chunk_vectorize
-from settings import parquet_file
 from .logger import logger
 from .vectorstore import VectorStore
-# I allow relative imports inside the engine package
-# I could have created a module but things are still changing
-finrag_vectorstore = VectorStore(model_path='sentence-transformers/all-mpnet-base-v2')
 def empty_collection():
-    """ Deletes the Finrag collection if it exists """
-    status = finrag_vectorstore.empty_collection()
     return status
 def index_data():
     if not os.path.exists(parquet_file):
-        logger.info(f"Parquet file {parquet_file} does not exists")
         return 'no data to index'
     # load the parquet file into the vectorstore
-    finrag_vectorstore.index_data()
     os.remove(parquet_file)
     # delete the files so we can load several files and index them when we want
     # without having to keep track of those that have been indexed already
     # this is a simple solution for now, but we can do better
     return "Index creation successful"
-def process_pdf(filepath:str) -> dict:
-    new_content = pdf_extractor('PyPDFLoader', filepath).extract_text()
-    logger.info(f"Successfully extracted text from PDF")
     chunk_vectorize(new_content)
-    logger.info(f"Successfully vectorized PDF content")
     return new_content
-def vector_search(question:str) -> List[str]:
-    ans = finrag_vectorstore.hybrid_search(query=question, limit=3, alpha=0.8)
     return ans

 import os, pickle
 from typing import List
+from .loaders.file import extractor
+from .chunk_embed import chunk_vectorize
+from ..settings import parquet_file
 from .logger import logger
 from .vectorstore import VectorStore
+from .post_process import IsPii, IsToxic, IsRelevant
+from .summary import summarize_it
+from .post_process import IsPii, IsToxic, IsRelevant
+multirag_vectorstore = VectorStore(model_path='sentence-transformers/all-mpnet-base-v2')
 def empty_collection():
+    """ Deletes the MultiRAG collection if it exists """
+    status = multirag_vectorstore.empty_collection()
     return status
 def index_data():
     if not os.path.exists(parquet_file):
+        logger(f"Parquet file {parquet_file} does not exists")
         return 'no data to index'
     # load the parquet file into the vectorstore
+    multirag_vectorstore.index_data()
     os.remove(parquet_file)
     # delete the files so we can load several files and index them when we want
     # without having to keep track of those that have been indexed already
     # this is a simple solution for now, but we can do better
     return "Index creation successful"
def process_pdf(filepath: str) -> dict:
    """Extract the text of the PDF at ``filepath``, chunk/vectorize it, and return the extracted content."""
    pdf_loader = extractor('PyPDFLoader', filepath)
    new_content = pdf_loader.extract_text()
    logger("Successfully extracted text from PDF")

    chunk_vectorize(new_content)
    logger(f"Successfully vectorized PDF content of {filepath}")

    return new_content
def process_txt(filepath: str) -> dict:
    """Extract the text of the TXT file at ``filepath``, chunk/vectorize it, and return the extracted content."""
    txt_loader = extractor('txt', filepath)
    new_content = txt_loader.extract_text()
    logger("Successfully extracted text from TXT")

    chunk_vectorize(new_content)
    logger("Successfully vectorized TXT content")

    return new_content
def vector_search_raw(question: str) -> List[str]:
    """Run a plain hybrid search (limit 6, alpha 0.8) and return its result untouched."""
    print("WE are in vector_search_raw")
    search_args = dict(query=question, limit=6, alpha=0.8)
    return multirag_vectorstore.hybrid_search(**search_args)
def vector_search(question: str, relevance_thr=0.3) -> List[str]:
    """Hybrid search with pre- and post-processing.

    Args:
        question: the user query.
        relevance_thr: minimum score a hit needs to be reported; if no hit
            reaches it, the whole question is reported as irrelevant.

    Returns:
        A list of human-readable strings: either a single rejection message
        (toxic or irrelevant question), or a header line followed by one
        line per retained hit (source file, score, relevance verdict, text
        and a one-line summary).
    """
    # PRE PROCESSING: refuse toxic questions before touching the vector store
    if IsToxic(question):
        return [f"\"{question}\" is toxic, try again"]

    # `or []` guards against a backend that returns None instead of a list
    results = multirag_vectorstore.hybrid_search(query=question,
                                                 limit=5,
                                                 alpha=0.8) or []

    # If no answer has a score high enough, we consider the question irrelevant.
    # We could do better with reranking but here the question is trivial, y/n;
    # it's not like reranking 100 answers to pick the best 5 for RAGing.
    # `default=0.0` keeps an empty result set from raising ValueError in max().
    max_score = max((score for _, _, score in results), default=0.0)
    if not results or max_score < relevance_thr:
        return [f"{question} is IRRELEVANT with max score: {max_score:.2f}, try again"]

    answers = [f"{question} is deemed RELEVANT with max score: {max_score:.2f}"]

    # POST PROCESSING: annotate every hit that clears the threshold
    for i, (fname, content, score) in enumerate(results, 1):
        if score < relevance_thr:
            continue

        if IsPii(content):
            content = " Pii detected -" + content

        relevant = 'RELEVANT' if IsRelevant(content, question) else 'IRRELEVANT'

        # one LLM call per hit, so this is the slow part of the function
        summary = summarize_it(question, [content])
        content = f"{content}\n SUMMARY: {summary}"
        answers.append(f"{i}: from {fname} - score:{score:.2f} - {relevant} answer - {content}")

    return answers

app/engine/summary.py ADDED Viewed

	@@ -0,0 +1,53 @@

+from typing import List
+from app.rag.llm import LLM
+#the LLM Class uses the OPENAI_API_KEY env var as the default api_key
def summarize_it(question: str,
           search_results: List[str],
           model: str = 'gpt-3.5-turbo-0125',
           ) -> str:
    """Ask the LLM for a one-line summary of ``search_results`` with respect to ``question``.

    Args:
        question: the user query the search results were retrieved for.
        search_results: text snippets returned by the vector search.
        model: model name handed to the ``LLM`` wrapper.

    Returns:
        Whatever ``LLM.chat_completion`` returns with ``raw_response=False``
        (presumably the answer text - confirm against the LLM class).
    """
    # TODO turn this into a class if time allows

    llm = LLM(model)
    system_message = """
    You are able to quickly understand a few paragraphs, or quips even, generated by vector search system
    and generate a one-line summary.
    """
    searches = "\n".join(f"Search result {i}: {v}" for i, v in enumerate(search_results, 1))

    # The f-string below already interpolates `searches` and `question`.
    # The previous code additionally called .format(...) on the result, which
    # re-parsed the interpolated text and raised KeyError/ValueError/IndexError
    # whenever a question or search result contained '{' or '}'.
    user_prompt = f"""
    Use the below context enclosed in triple back ticks to answer the question. \n
    The context is given by a vector search into a vector database made from the company's documents,
    so you can assume the context is accurate. \n
    ```
    Context:
    ```
    {searches}
    ```
    Question:\n
    {question}\n
    ------------------------
    1. If the context is not relevant to the question, simply say 'Irrelevant content' and nothing else.
    Pay great attention to making sure your answer is relevant to the question and the context.
    (for instance, never answer a question about a topic that is not explicitely mentioned in the question)
    2. Using any external knowledge or resources to answer the question is forbidden.
    3. Generate a ONE-LINE ONE-LINE summary within the limits of the context and the question.
    4. Avoid mentioning 'search results' in the answer.
       Instead, incorporate the information from the search results into the answer.
    5. Create a clean answer, without backticks, or starting with a new line for instance.
    ------------------------
    Answer:\n
    """

    response = llm.chat_completion(system_message=system_message,
                                   user_message=user_prompt,
                                   temperature=0.01,  # let's not allow the model to be creative
                                   stream=False,
                                   raw_response=False)
    return response

app/engine/vectorstore.py CHANGED Viewed

@@ -1,19 +1,109 @@
 import os, logging
 from typing import List, Any
 import pandas as pd
 from weaviate.classes.config import Property, DataType
 from .weaviate_interface_v4 import WeaviateWCS, WeaviateIndexer
-from .logger import logger
-from settings import parquet_file
 class VectorStore:
-    def __init__(self, model_path:str = 'sentence-transformers/all-mpnet-base-v2'):
         # we can create several instances to test various models, especially if we finetune one
-        self.finrag_properties = [
-                Property(name='filename',
                          data_type=DataType.TEXT,
                          description='Name of the file',
                          index_filterable=True,
@@ -30,45 +120,54 @@ class VectorStore:
                          index_searchable=True),
               ]
-        self.class_name = "FinRag_all-mpnet-base-v2"
         self.class_config = {'classes': [
                             {"class": self.class_name,
-                            "description": "Financial reports",
                             "vectorIndexType": "hnsw",
-                            # Vector index specific settings for HSNW
                             "vectorIndexConfig": {
                                     "ef": 64,  # higher is better quality vs slower search
                                     "efConstruction": 128, # higher = better index but slower build
                                     "maxConnections": 32,  # max conn per layer - higher = more memory
                             },
                             "vectorizer": "none",
-                            "properties": self.finrag_properties }
                             ]
         }
         self.model_path = model_path
         try:
             self.api_key = os.environ.get('FINRAG_WEAVIATE_API_KEY')
-            self.url =  os.environ.get('FINRAG_WEAVIATE_ENDPOINT')
-            self.client = WeaviateWCS(endpoint=self.url,
-                                      api_key=self.api_key,
-                                      model_name_or_path=self.model_path)
         except Exception as e:
             # raise Exception(f"Could not create Weaviate client: {e}")
-            print(f"Could not create Weaviate client: {e}")
-        assert self.client._client.is_live(), "Weaviate is not live"
-        assert self.client._client.is_ready(), "Weaviate is not ready"
         # careful with accessing '_client' since the weaviate helper usually closes the connection every time
         self.indexer = None
@@ -80,19 +179,21 @@ class VectorStore:
         return self.client.show_all_collections()
-    def create_collection(self, collection_name: str='Finrag', description: str='Financial reports'):
         self.collection_name = collection_name
         if collection_name not in self.collections:
             self.client.create_collection(collection_name=collection_name,
-                                          properties=self.finrag_properties,
                                           description=description)
-            self.collection_name = collection_name
         else:
-            logging.warning(f"Collection {collection_name} already exists")
-    def empty_collection(self, collection_name: str='Finrag') -> bool:
         # not in the library yet, so I simply delete and recreate it
         if collection_name in self.collections:
@@ -100,11 +201,11 @@ class VectorStore:
             self.create_collection()
             return True
         else:
-            logging.warning(f"Collection {collection_name} doesn't exist")
             return False
-    def index_data(self, data: List[dict]= None, collection_name: str='Finrag'):
         if self.indexer is None:
             self.indexer = WeaviateIndexer(self.client)
@@ -127,25 +228,25 @@ class VectorStore:
     def keyword_search(self,
                        query: str,
                        limit: int=5,
-                       return_properties: List[str]=['filename', 'content'],
                        alpha=None  # dummy parameter to match the hybrid_search signature
                        ) -> List[str]:
         response = self.client.keyword_search(
                                 request=query,
                                 collection_name=self.collection_name,
-                                query_properties=['content'],
                                 limit=limit,
                                 filter=None,
                                 return_properties=return_properties,
                                 return_raw=False)
-        return [res['content'] for res in response]
     def vector_search(self,
                       query: str,
                       limit: int=5,
-                      return_properties: List[str]=['filename', 'content'],
                       alpha=None  # dummy parameter to match the hybrid_search signature
                       ) -> List[str]:
@@ -157,24 +258,24 @@ class VectorStore:
                                 return_properties=return_properties,
                                 return_raw=False)
-        return [res['content'] for res in response]
     def hybrid_search(self,
                       query: str,
-                      limit: int=5,
                       alpha=0.5,  # higher = more vector search
-                      return_properties: List[str]=['filename', 'content']
                       ) -> List[str]:
         response = self.client.hybrid_search(
                                 request=query,
                                 collection_name=self.collection_name,
-                                query_properties=['content'],
                                 alpha=alpha,
                                 limit=limit,
                                 filter=None,
                                 return_properties=return_properties,
                                 return_raw=False)
-        return [res['content'] for res in response]

 import os, logging
+from app.engine.logger import logger
 from typing import List, Any
 import pandas as pd
 from weaviate.classes.config import Property, DataType
 from .weaviate_interface_v4 import WeaviateWCS, WeaviateIndexer
+from app.settings import parquet_file
+from weaviate.classes.query import Filter
+from torch import cuda
+if os.path.exists('.we_are_local'):
+    COLLECTION = 'MultiRAG_local'
+else:
+    COLLECTION = 'MultiRAG'
+class dummyWeaviate:
+    """ Created to pass on HF since I had again the client creation issue
+        Temporary solution
+    """
+    def __init__(self,
+                 endpoint: str=None,
+                 api_key: str=None,
+                 model_name_or_path: str='sentence-transformers/all-MiniLM-L6-v2',
+                 embedded: bool=False,
+                 openai_api_key: str=None,
+                 skip_init_checks: bool=False,
+                 **kwargs
+                ):
+        return
+    def _connect(self) -> None:
+        return
+    def _client(self):
+        return
+    def create_collection(self,
+                          collection_name: str,
+                          properties: list[Property],
+                          description: str=None,
+                          **kwargs
+                          ) -> None:
+        return
+    def show_all_collections(self,
+                             detailed: bool=False,
+                             max_details: bool=False
+                             ) -> list[str] | dict:
+        return ['abc', 'def']
+    def show_collection_config(self, collection_name: str):
+        return
+    def show_collection_properties(self, collection_name: str):
+        return
+    def delete_collection(self, collection_name: str):
+        return
+    def get_doc_count(self, collection_name: str):
+        return
+    def keyword_search(self,
+                       request: str,
+                       collection_name: str,
+                       query_properties: list[str]=['content'],
+                       limit: int=10,
+                       filter: Filter=None,
+                       return_properties: list[str]=None,
+                       return_raw: bool=False
+                       ):
+        return
+    def vector_search(self,
+                      request: str,
+                      collection_name: str,
+                      limit: int=10,
+                      return_properties: list[str]=None,
+                      filter: Filter=None,
+                      return_raw: bool=False,
+                      device: str='cuda:0' if cuda.is_available() else 'cpu'
+                      ):
+        return
+    def hybrid_search(self,
+                      request: str,
+                      collection_name: str,
+                      query_properties: list[str]=['content'],
+                      alpha: float=0.5,
+                      limit: int=10,
+                      filter: Filter=None,
+                      return_properties: list[str]=None,
+                      return_raw: bool=False,
+                      device: str='cuda:0' if cuda.is_available() else 'cpu'
+                     ):
+        return
 class VectorStore:
+    def __init__(self, model_path: str = 'sentence-transformers/all-mpnet-base-v2'):
         # we can create several instances to test various models, especially if we finetune one
+        self.MultiRAG_properties = [
+                Property(name='file',
                          data_type=DataType.TEXT,
                          description='Name of the file',
                          index_filterable=True,
                          index_searchable=True),
               ]
+        self.class_name = "MultiRAG_all-mpnet-base-v2"
         self.class_config = {'classes': [
                             {"class": self.class_name,
+                            "description": "multiple types of docs",
                             "vectorIndexType": "hnsw",
+                            # Vector index specific settings for HNSW
                             "vectorIndexConfig": {
                                     "ef": 64,  # higher is better quality vs slower search
                                     "efConstruction": 128, # higher = better index but slower build
                                     "maxConnections": 32,  # max conn per layer - higher = more memory
                             },
                             "vectorizer": "none",
+                            "properties": self.MultiRAG_properties}
                             ]
         }
         self.model_path = model_path
         try:
             self.api_key = os.environ.get('FINRAG_WEAVIATE_API_KEY')
+            logger(f"API key: {self.api_key[:5]}")
+            self.url = os.environ.get('FINRAG_WEAVIATE_ENDPOINT')
+            logger(f"URL: {self.url[8:15]}")
+            self.client = WeaviateWCS(
+                    endpoint=self.url,
+                    api_key=self.api_key,
+                    model_name_or_path=self.model_path,
+                    )
+            assert self.client._client.is_live(), "Weaviate is not live"
+            assert self.client._client.is_ready(), "Weaviate is not ready"
+            logger(f"Weaviate client created")
         except Exception as e:
             # raise Exception(f"Could not create Weaviate client: {e}")
+            self.client = dummyWeaviate()  # used when issue with HF client creation, to continue on HF
+            logger(f"Could not create Weaviate client: {e}")
+        # if we fail these tests 'VectorStore' object has no attribute 'client'
+        # it's prob not the env var but the model missing
+        # assert self.client._client.is_live(), "Weaviate is not live"
+        # assert self.client._client.is_ready(), "Weaviate is not ready"
         # careful with accessing '_client' since the weaviate helper usually closes the connection every time
         self.indexer = None
         return self.client.show_all_collections()
+    def create_collection(self,
+                          collection_name: str=COLLECTION,
+                          description: str='Documents'):
         self.collection_name = collection_name
         if collection_name not in self.collections:
             self.client.create_collection(collection_name=collection_name,
+                                          properties=self.MultiRAG_properties,
                                           description=description)
+            # self.collection_name = collection_name
         else:
+            logger(f"Collection {collection_name} already exists")
+    def empty_collection(self, collection_name: str=COLLECTION) -> bool:
         # not in the library yet, so I simply delete and recreate it
         if collection_name in self.collections:
             self.create_collection()
             return True
         else:
+            logger(f"Collection {collection_name} doesn't exist")
             return False
+    def index_data(self, data: List[dict]= None, collection_name: str=COLLECTION):
         if self.indexer is None:
             self.indexer = WeaviateIndexer(self.client)
     def keyword_search(self,
                        query: str,
                        limit: int=5,
+                       return_properties: List[str]=['file', 'content'],
                        alpha=None  # dummy parameter to match the hybrid_search signature
                        ) -> List[str]:
         response = self.client.keyword_search(
                                 request=query,
                                 collection_name=self.collection_name,
+                                query_properties=['file', 'content'],
                                 limit=limit,
                                 filter=None,
                                 return_properties=return_properties,
                                 return_raw=False)
+        return [(res['file'], res['content'], res['score']) for res in response]
     def vector_search(self,
                       query: str,
                       limit: int=5,
+                      return_properties: List[str]=['file', 'content'],
                       alpha=None  # dummy parameter to match the hybrid_search signature
                       ) -> List[str]:
                                 return_properties=return_properties,
                                 return_raw=False)
+        return [(res['file'], res['content'], res['score']) for res in response]
     def hybrid_search(self,
                       query: str,
+                      limit: int=10,
                       alpha=0.5,  # higher = more vector search
+                      return_properties: List[str]=['file', 'content']
                       ) -> List[str]:
+        print("We are in hybrid_search")
         response = self.client.hybrid_search(
                                 request=query,
                                 collection_name=self.collection_name,
+                                query_properties=['file', 'content'],
                                 alpha=alpha,
                                 limit=limit,
                                 filter=None,
                                 return_properties=return_properties,
                                 return_raw=False)
+        return [(res['file'], res['content'], res['score']) for res in response]

app/engine/weaviate_interface_v4.py CHANGED Viewed

@@ -343,9 +343,12 @@ class WeaviateWCS:
             If True, returns raw response from Weaviate.
         '''
         self._connect()
         return_properties = return_properties if return_properties else self.return_properties
         query_vector = self._create_query_vector(request, device=device)
         collection = self._client.collections.get(collection_name)
         response = collection.query.hybrid(query=request,
                                            query_properties=query_properties,
                                            filters=filter,
@@ -354,6 +357,7 @@ class WeaviateWCS:
                                            limit=limit,
                                            return_metadata=MetadataQuery(score=True, distance=True),
                                            return_properties=return_properties)
         if return_raw:
             return response
         else:

             If True, returns raw response from Weaviate.
         '''
         self._connect()
+        print("We are connected to Weaviate")
         return_properties = return_properties if return_properties else self.return_properties
         query_vector = self._create_query_vector(request, device=device)
+        print("After query vector")
         collection = self._client.collections.get(collection_name)
+        print("Just before query")
         response = collection.query.hybrid(query=request,
                                            query_properties=query_properties,
                                            filters=filter,
                                            limit=limit,
                                            return_metadata=MetadataQuery(score=True, distance=True),
                                            return_properties=return_properties)
+        print("After Weaviate response")
         if return_raw:
             return response
         else:

app/main.py CHANGED Viewed

@@ -1,3 +1,5 @@
 import os, random, logging, pickle, shutil
 from dotenv import load_dotenv, find_dotenv
@@ -8,65 +10,83 @@ from fastapi import FastAPI, HTTPException, File, UploadFile, status
 from fastapi.responses import HTMLResponse
 from fastapi.middleware.cors import CORSMiddleware
-from engine.processing import process_pdf, index_data, empty_collection, vector_search
-from rag.rag import rag_it
-from engine.logger import logger
-from settings import datadir
-os.makedirs(datadir, exist_ok=True)
 app = FastAPI()
 environment = os.getenv("ENVIRONMENT", "dev")  # created by dockerfile
-if environment == "dev":
-    logger.warning("Running in development mode - allowing CORS for all origins")
-    app.add_middleware(
-        CORSMiddleware,
-        allow_origins=["*"],
-        allow_credentials=True,
-        allow_methods=["*"],
-        allow_headers=["*"],
-    )
-try:
-    load_dotenv(find_dotenv('env'))
-except Exception as e:
-    pass
 @app.get("/", response_class=HTMLResponse)
 def read_root():
-    logger.info("Title displayed on home page")
     return """
     <html>
         <body>
-            <h1>Welcome to FinExpert, a RAG system designed by JP Bianchi!</h1>
         </body>
     </html>
     """
 @app.get("/ping/")
 def ping():
     """ Testing """
-    logger.info("Someone is pinging the server")
     return {"answer": str(int(random.random() * 100))}
 @app.delete("/erase_data/")
 def erase_data():
-    """ Erase all files in the data directory, but not the vector store """
     if len(os.listdir(datadir)) == 0:
-        logger.info("No data to erase")
         return {"message": "No data to erase"}
-    shutil.rmtree(datadir, ignore_errors=True)
-    os.mkdir(datadir)
-    logger.warning("All data has been erased")
     return {"message": "All data has been erased"}
@@ -75,15 +95,17 @@ def delete_vectors():
     """ Empty the collection in the vector store """
     try:
         status = empty_collection()
-        return {f"""message": "Collection{'' if status else ' NOT'} erased!"""}
     except Exception as e:
         raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e))
 @app.get("/list_files/")
 def list_files():
     """ List all files in the data directory """
     files = os.listdir(datadir)
-    logger.info(f"Files in data directory: {files}")
     return {"files": files}
@@ -93,18 +115,18 @@ async def upload_file(file: UploadFile = File(...)):
     """  Uploads a file in data directory, for later indexing """
     try:
         filepath = os.path.join(datadir, file.filename)
-        logger.info(f"Fiename detected: {file.filename}")
         if os.path.exists(filepath):
-            logger.warning(f"File {file.filename} already exists: no processing done")
             return {"message": f"File {file.filename} already exists: no processing done"}
         else:
-            logger.info(f"Receiving file: {file.filename}")
             contents = await file.read()
-            logger.info(f"File reception complete!")
     except Exception as e:
-        logger.error(f"Error during file upload: {str(e)}")
         return {"message": f"Error during file upload:  {str(e)}"}
     if file.filename.endswith('.pdf'):
@@ -112,9 +134,14 @@ async def upload_file(file: UploadFile = File(...)):
         # let's save the file in /data even if it's temp storage on HF
         with open(filepath, 'wb') as f:
             f.write(contents)
         try:
-            logger.info(f"Starting to process {file.filename}")
             new_content = process_pdf(filepath)
             success = {"message": f"Successfully uploaded {file.filename}"}
             success.update(new_content)
@@ -122,15 +149,35 @@ async def upload_file(file: UploadFile = File(...)):
         except Exception as e:
             return {"message": f"Failed to extract text from PDF: {str(e)}"}
     else:
-        return {"message": "Only PDF files are accepted"}
 @app.post("/create_index/")
 async def create_index():
     """ Create an index for the uploaded files """
-    logger.info("Creating index for uploaded files")
     try:
         msg = index_data()
         return {"message": msg}
@@ -143,29 +190,30 @@ class Question(BaseModel):
 @app.post("/ask/")
 async def hybrid_search(question: Question):
-    logger.info(f"Processing question: {question.question}")
     try:
         search_results = vector_search(question.question)
-        logger.info(f"Answer: {search_results}")
         return {"answer": search_results}
     except Exception as e:
         raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e))
 @app.post("/ragit/")
 async def ragit(question: Question):
-    logger.info(f"Processing question: {question.question}")
     try:
-        search_results = vector_search(question.question)
-        logger.info(f"Search results generated: {search_results}")
         answer = rag_it(question.question, search_results)
-        logger.info(f"Answer: {answer}")
         return {"answer": answer}
     except Exception as e:
         raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e))
 if __name__ == '__main__':
     import uvicorn
     from os import getenv
@@ -175,16 +223,16 @@ if __name__ == '__main__':
     uvicorn.run("main:app", host="0.0.0.0", port=port, reload=reload)
 # Examples:
-# curl -X POST "http://localhost:80/upload" -F "file=@test.pdf"
-# curl -X DELETE "http://localhost:80/erase_data/"
-# curl -X GET "http://localhost:80/list_files/"
-# hf space is at https://jpbianchi-finrag.hf.space/
-# code given by https://jpbianchi-finrag.hf.space/docs
 # Space must be public
-# curl -X POST "https://jpbianchi-finrag.hf.space/upload/" -F "file=@test.pdf"
 # curl -X POST http://localhost:80/ask/ -H "Content-Type: application/json" -d '{"question": "what is Amazon loss"}'
 # curl -X POST http://localhost:80/ragit/ -H "Content-Type: application/json" -d '{"question": "Does ATT have postpaid phone customers?"}'

+# this is the original main.py file, but without the call to fastapi
+# since it is done by reflex's own fast api server
 import os, random, logging, pickle, shutil
 from dotenv import load_dotenv, find_dotenv
 from fastapi.responses import HTMLResponse
 from fastapi.middleware.cors import CORSMiddleware
+try:
+    load_dotenv(find_dotenv('env'))
+except Exception as e:
+    pass
+from app.engine.processing import (  # << creates the collection already
+    process_pdf,
+    process_txt,
+    index_data,
+    empty_collection,
+    vector_search,
+    vector_search_raw
+)
+from app.rag.rag import rag_it
+from app.engine.logger import logger
+from app.settings import datadir, datadir2
+EXTENSIONS = ["pdf", "txt"]
 app = FastAPI()
 environment = os.getenv("ENVIRONMENT", "dev")  # created by dockerfile
+# replaced by cors_allowed_origins=['*'] in rxconfig.py when using Reflex endpoint
+# if environment == "dev":
+#     logger("Running in development mode - allowing CORS for all origins")
+#     app.add_middleware(
+#         CORSMiddleware,
+#         allow_origins=["*"],
+#         allow_credentials=True,
+#         allow_methods=["*"],
+#         allow_headers=["*"],
+#     )
+# not used when using Reflex endpoint
 @app.get("/", response_class=HTMLResponse)
 def read_root():
+    logger("Title displayed on home page")
     return """
     <html>
         <body>
+            <h1>Welcome to MultiRAG, a RAG system designed by JP Bianchi!</h1>
         </body>
     </html>
     """
+# already provided by Reflex
 @app.get("/ping/")
 def ping():
     """ Testing """
+    logger("Someone is pinging the server")
     return {"answer": str(int(random.random() * 100))}
 @app.delete("/erase_data/")
 def erase_data():
+    """ Erase all files in the data directory at the first level only,
+        (in case we would like to use it for something else)
+        but not the vector store or the parquet file.
+        We can do it since the embeddings are in the parquet file already.
+    """
     if len(os.listdir(datadir)) == 0:
+        logger("No data to erase")
         return {"message": "No data to erase"}
+    # if we try to rmtree datadir, it looks like /data can't be deleted on HF
+    for f in os.listdir(datadir):
+        if f == '.DS_Store' or f.split('.')[-1].lower() in EXTENSIONS:
+            print(f"Removing {f}")
+            os.remove(os.path.join(datadir, f))
+            # we don't remove the parquet file, create_index does that
+    logger("All data has been erased")
     return {"message": "All data has been erased"}
     """ Empty the collection in the vector store """
     try:
         status = empty_collection()
+        return {"message": f"Collection{'' if status else ' NOT'} erased!"}
     except Exception as e:
         raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e))
 @app.get("/list_files/")
 def list_files():
     """ List all files in the data directory """
+    print("Listing files")
     files = os.listdir(datadir)
+    logger(f"Files in data directory: {files}")
     return {"files": files}
     """  Uploads a file in data directory, for later indexing """
     try:
         filepath = os.path.join(datadir, file.filename)
+        logger(f"Fiename detected: {file.filename}")
         if os.path.exists(filepath):
+            logger(f"File {file.filename} already exists: no processing done")
             return {"message": f"File {file.filename} already exists: no processing done"}
         else:
+            logger(f"Receiving file: {file.filename}")
             contents = await file.read()
+            logger(f"File reception complete!")
     except Exception as e:
+        logger(f"Error during file upload: {str(e)}")
         return {"message": f"Error during file upload:  {str(e)}"}
     if file.filename.endswith('.pdf'):
         # let's save the file in /data even if it's temp storage on HF
         with open(filepath, 'wb') as f:
             f.write(contents)
+        # save it also in assets/data because data can be cleared
+        filepath2 = os.path.join(datadir2, file.filename)
+        with open(filepath2, 'wb') as f:
+            f.write(contents)
         try:
+            logger(f"Starting to process {file.filename}")
             new_content = process_pdf(filepath)
             success = {"message": f"Successfully uploaded {file.filename}"}
             success.update(new_content)
         except Exception as e:
             return {"message": f"Failed to extract text from PDF: {str(e)}"}
+    elif file.filename.endswith('.txt'):
+        with open(filepath, 'wb') as f:
+            f.write(contents)
+        filepath2 = os.path.join(datadir2, file.filename)
+        with open(filepath2, 'wb') as f:
+            f.write(contents)
+        try:
+            logger(f"Reading {file.filename}")
+            new_content = process_txt(filepath)
+            success = {"message": f"Successfully uploaded {file.filename}"}
+            success.update(new_content)
+            return success
+        except Exception as e:
+            return {"message": f"Failed to extract text from TXT: {str(e)}"}
     else:
+        return {"message": "Only PDF & txt files are accepted"}
 @app.post("/create_index/")
 async def create_index():
     """ Create an index for the uploaded files """
+    logger("Creating index for uploaded files")
     try:
         msg = index_data()
         return {"message": msg}
 @app.post("/ask/")
 async def hybrid_search(question: Question):
+    logger(f"Processing question: {question.question}")
     try:
         search_results = vector_search(question.question)
+        logger(f"Answer: {search_results}")
         return {"answer": search_results}
     except Exception as e:
         raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e))
 @app.post("/ragit/")
 async def ragit(question: Question):
+    logger(f"Processing question: {question.question}")
     try:
+        search_results = vector_search_raw(question.question)
+        logger(f"Search results generated: {search_results}")
         answer = rag_it(question.question, search_results)
+        logger(f"Answer: {answer}")
         return {"answer": answer}
     except Exception as e:
         raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e))
 if __name__ == '__main__':
     import uvicorn
     from os import getenv
     uvicorn.run("main:app", host="0.0.0.0", port=port, reload=reload)
 # Examples:
+# curl -X POST "http://localhost:8001/upload" -F "file=@test.pdf"
+# curl -X DELETE "http://localhost:8001/erase_data/"
+# curl -X GET "http://localhost:8001/list_files/"
+# hf space is at https://jpbianchi-multirag.hf.space/
+# code given by https://jpbianchi-multirag.hf.space/docs
 # Space must be public
+# curl -X POST "https://jpbianchi-multirag.hf.space/upload/" -F "file=@test.pdf"
 # curl -X POST http://localhost:80/ask/ -H "Content-Type: application/json" -d '{"question": "what is Amazon loss"}'
 # curl -X POST http://localhost:80/ragit/ -H "Content-Type: application/json" -d '{"question": "Does ATT have postpaid phone customers?"}'
+# see more in notebook upload_index.ipynb

app/main_reflex.py ADDED Viewed

	@@ -0,0 +1,231 @@

+# this is the original main.py file, with the FastAPI app and its route decorators
+# commented out, since routing is done by reflex's own fast api server
+import os, random, logging, pickle, shutil
+from dotenv import load_dotenv, find_dotenv
+from typing import Optional
+from pydantic import BaseModel, Field
+from fastapi import FastAPI, HTTPException, File, UploadFile, status
+from fastapi.responses import HTMLResponse
+from fastapi.middleware.cors import CORSMiddleware
+from app.engine.processing import ( # << creates the collection already
+    process_pdf,
+    process_txt,
+    index_data,
+    empty_collection,
+    vector_search,
+    vector_search_raw,
+)
+from app.rag.rag import rag_it
+from app.engine.logger import logger
+from app.settings import datadir, datadir2
+# lowercase file extensions (without the dot) that erase_data is allowed to delete
+EXTENSIONS = ["pdf", "txt"]
+# app = FastAPI()
+# deployment mode; falls back to "dev" when the ENVIRONMENT variable is not set
+environment = os.getenv("ENVIRONMENT", "dev")  # created by dockerfile
+# replaced by cors_allowed_origins=['*'] in rxconfig.py when using Reflex endpoint
+# if environment == "dev":
+#     logger("Running in development mode - allowing CORS for all origins")
+#     app.add_middleware(
+#         CORSMiddleware,
+#         allow_origins=["*"],
+#         allow_credentials=True,
+#         allow_methods=["*"],
+#         allow_headers=["*"],
+#     )
+# not used when using Reflex endpoint
+# @app.get("/", response_class=HTMLResponse)
+def read_root():
+    logger("Title displayed on home page")
+    return """
+    <html>
+        <body>
+            <h1>Welcome to MultiRAG, a RAG system designed by JP Bianchi!</h1>
+        </body>
+    </html>
+    """
+# already provided by Reflex
+# @app.get("/ping/")
+def ping():
+    """ Testing """
+    logger("Someone is pinging the server")
+    return {"answer": str(int(random.random() * 100))}
+# @app.delete("/erase_data/")
+def erase_data():
+    """ Erase all files in the data directory at the first level only,
+        (in case we would like to use it for something else)
+        but not the vector store or the parquet file.
+        We can do it since the embeddings are in the parquet file already.
+    """
+    if len(os.listdir(datadir)) == 0:
+        logger("No data to erase")
+        return {"message": "No data to erase"}
+    # if we try to rmtree datadir, it looks like /data can't be deleted on HF
+    for f in os.listdir(datadir):
+        if f == '.DS_Store' or f.split('.')[-1].lower() in EXTENSIONS:
+            print(f"Removing {f}")
+            os.remove(os.path.join(datadir, f))
+            # we don't remove the parquet file, create_index does that
+    logger("All data has been erased")
+    return {"message": "All data has been erased"}
+# @app.delete("/empty_collection/")
+def delete_vectors():
+    """ Empty the collection in the vector store """
+    try:
+        status = empty_collection()
+        return {"message": f"Collection{'' if status else ' NOT'} erased!"}
+    except Exception as e:
+        raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e))
+# @app.get("/list_files/")
+def list_files():
+    """ List all files in the data directory """
+    print("Listing files")
+    files = os.listdir(datadir)
+    logger(f"Files in data directory: {files}")
+    return {"files": files}
+# @app.post("/upload/")
+# @limiter.limit("5/minute") see 'slowapi' for rate limiting
+async def upload_file(file: UploadFile = File(...)):
+    """  Uploads a file in data directory, for later indexing """
+    try:
+        filepath = os.path.join(datadir, file.filename)
+        logger(f"Fiename detected: {file.filename}")
+        if os.path.exists(filepath):
+            logger(f"File {file.filename} already exists: no processing done")
+            return {"message": f"File {file.filename} already exists: no processing done"}
+        else:
+            logger(f"Receiving file: {file.filename}")
+            contents = await file.read()
+            logger(f"File reception complete!")
+    except Exception as e:
+        logger(f"Error during file upload: {str(e)}")
+        return {"message": f"Error during file upload:  {str(e)}"}
+    if file.filename.endswith('.pdf'):
+        # let's save the file in /data even if it's temp storage on HF
+        with open(filepath, 'wb') as f:
+            f.write(contents)
+        # save it also in assets/data because data can be cleared
+        filepath2 = os.path.join(datadir2, file.filename)
+        with open(filepath2, 'wb') as f:
+            f.write(contents)
+        try:
+            logger(f"Starting to process {file.filename}")
+            new_content = process_pdf(filepath)
+            success = {"message": f"Successfully uploaded {file.filename}"}
+            success.update(new_content)
+            return success
+        except Exception as e:
+            return {"message": f"Failed to extract text from PDF: {str(e)}"}
+    elif file.filename.endswith('.txt'):
+        with open(filepath, 'wb') as f:
+            f.write(contents)
+        filepath2 = os.path.join(datadir2, file.filename)
+        with open(filepath2, 'wb') as f:
+            f.write(contents)
+        try:
+            logger(f"Reading {file.filename}")
+            new_content = process_txt(filepath)
+            success = {"message": f"Successfully uploaded {file.filename}"}
+            success.update(new_content)
+            return success
+        except Exception as e:
+            return {"message": f"Failed to extract text from TXT: {str(e)}"}
+    else:
+        return {"message": "Only PDF & txt files are accepted"}
+# @app.post("/create_index/")
+async def create_index():
+    """ Create an index for the uploaded files """
+    logger("Creating index for uploaded files")
+    try:
+        msg = index_data()
+        return {"message": msg}
+    except Exception as e:
+        raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e))
+# request body of hybrid_search and ragit: the question as a plain string
+class Question(BaseModel):
+    question: str
+# @app.post("/ask/")
+async def hybrid_search(question: Question):
+    logger(f"Processing question: {question.question}")
+    try:
+        search_results = vector_search(question.question)
+        logger(f"Answer: {search_results}")
+        return {"answer": search_results}
+    except Exception as e:
+        raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e))
+# @app.post("/ragit/")
+async def ragit(question: Question):
+    logger(f"Processing question: {question.question}")
+    try:
+        search_results = vector_search_raw(question.question)
+        logger(f"Search results generated: {search_results}")
+        answer = rag_it(question.question, search_results)
+        logger(f"Answer: {answer}")
+        return {"answer": answer}
+    except Exception as e:
+        raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e))
+if __name__ == '__main__':
+    import uvicorn
+    from os import getenv
+    port = int(getenv("PORT", 80))
+    print(f"Starting server on port {port}")
+    reload = True if environment == "dev" else False
+    uvicorn.run("main:app", host="0.0.0.0", port=port, reload=reload)
+# Examples:
+# curl -X POST "http://localhost:8001/upload" -F "file=@test.pdf"
+# curl -X DELETE "http://localhost:8001/erase_data/"
+# curl -X GET "http://localhost:8001/list_files/"
+# hf space is at https://jpbianchi-multirag.hf.space/
+# code given by https://jpbianchi-multirag.hf.space/docs
+# Space must be public
+# curl -X POST "https://jpbianchi-multirag.hf.space/upload/" -F "file=@test.pdf"
+# curl -X POST http://localhost:80/ask/ -H "Content-Type: application/json" -d '{"question": "what is Amazon loss"}'
+# curl -X POST http://localhost:80/ragit/ -H "Content-Type: application/json" -d '{"question": "Does ATT have postpaid phone customers?"}'
+# see more in notebook upload_index.ipynb

app/notebooks/__init__.py ADDED Viewed

File without changes

app/requirements.txt CHANGED Viewed

@@ -1,8 +1,8 @@
 requests==2.31.0
 pydantic==2.7.1
 pydantic_core==2.18.2
-fastapi
-uvicorn[standard]
 pdfplumber==0.11.0
 weaviate-client==4.5.4
 PyPDF2==3.0.1
@@ -21,4 +21,8 @@ langchain-community==0.0.38
 langchain-core==0.1.52
 langchain-text-splitters==0.0.1
 python-multipart==0.0.9
-tenacity==8.2.3

 requests==2.31.0
 pydantic==2.7.1
 pydantic_core==2.18.2
+fastapi==0.110.0
+uvicorn==0.20.0
 pdfplumber==0.11.0
 weaviate-client==4.5.4
 PyPDF2==3.0.1
 langchain-core==0.1.52
 langchain-text-splitters==0.0.1
 python-multipart==0.0.9
+tenacity==8.2.3
+typer
+# https://hub.guardrailsai.com/tokens
+guardrails-ai<=0.4.2   # API KEY does not work above that version
+loguru==0.7.2    # used in reranker

app/settings.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import os
-datadir = '../data'  # will be used in main.py
 parquet_file = os.path.join(datadir, 'text_vectors.parquet') # used by the files in 'engine'

 import os
+datadir = 'data'   # will be used in main.py
+datadir2 = 'assets/data'  # backup since data can be emptied
 parquet_file = os.path.join(datadir, 'text_vectors.parquet') # used by the files in 'engine'