cd@bziiit.com committed on
Commit
a3d26e6
·
1 Parent(s): 4e6d9da

First commit

Browse files
.gitignore ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ */__pycache__/*
2
+ __pycache__
3
+
4
+ .env
app.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import dotenv
3
+ import os
4
+
5
+ from rag import Rag
6
+ from vectore_store.PineconeConnector import PineconeConnector
7
+ from vectore_store.VectoreStoreManager import VectoreStoreManager
8
+
9
+ GROUP_NAME = "Groupe 1"
10
+
11
def main():
    """Build the per-session state, configure the page and run the router.

    The chat history and the ``Rag`` assistant (wired to a Pinecone-backed
    ``VectoreStoreManager``) are stored in ``st.session_state`` so they are
    created once per browser session and reused on every rerun.
    """
    # Initialise each entry on its own key instead of testing
    # ``len(st.session_state) == 0``: with the length test, a failure while
    # building the assistant (after "messages" was stored) left the session
    # non-empty, so the assistant was never created on later reruns.
    if "messages" not in st.session_state:
        st.session_state["messages"] = []

    if "assistant" not in st.session_state:
        # Define Vectore store strategy
        pinecone_connector = PineconeConnector()
        vs_manager = VectoreStoreManager(pinecone_connector)
        st.session_state["assistant"] = Rag(vectore_store=vs_manager)

    st.set_page_config(page_title=GROUP_NAME)

    st.title(GROUP_NAME)

    prompt_system = st.Page("pages/prompt_system.py", title="Prompt système", icon="📋", default=True)
    saved_documents = st.Page("pages/persistent_documents.py", title="Documents Communs", icon="📋")
    documents = st.Page("pages/documents.py", title="Documents", icon="📋")
    form = st.Page("pages/form.py", title="Formulaire", icon="📋")
    chatbot = st.Page("pages/chatbot.py", title="Chatbot", icon="📋")

    # The list order is the order of the entries in the navigation menu.
    pg = st.navigation(
        [
            saved_documents,
            prompt_system,
            documents,
            form,
            chatbot,
        ]
    )

    pg.run()


if __name__ == "__main__":
    main()
pages/chatbot.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from streamlit_chat import message
3
+
4
def display_messages():
    """Render the stored chat history, then reserve a slot for the spinner."""
    history = st.session_state["messages"]
    for position, entry in enumerate(history):
        text, from_user = entry
        # The position in the history doubles as the unique widget key.
        message(text, is_user=from_user, key=str(position))

    # Empty placeholder stored in the session under "thinking_spinner".
    spinner_slot = st.empty()
    st.session_state["thinking_spinner"] = spinner_slot
8
+
9
+
10
def process_input():
    """Send the text typed by the user to the assistant and record the exchange.

    Used as the ``on_change`` callback of the "user_input" text field. Blank
    input is ignored. The question and the assistant's answer are appended to
    ``st.session_state["messages"]`` as ``(text, is_user)`` tuples.
    """
    raw_text = st.session_state["user_input"]
    user_text = raw_text.strip() if raw_text else ""
    if not user_text:
        return

    # The original guarded the read of "messages" but appended to it
    # unconditionally; make sure the list exists so both paths agree.
    if "messages" not in st.session_state:
        st.session_state["messages"] = []

    with st.session_state["thinking_spinner"], st.spinner("Je réfléchis"):
        agent_text = st.session_state["assistant"].ask(user_text, st.session_state["messages"])

    st.session_state["messages"].append((user_text, True))
    st.session_state["messages"].append((agent_text, False))
20
+
21
+
22
def page():
    """Render the chatbot page: current system prompt, history and input box."""
    st.subheader("Posez vos questions")

    if "assistant" not in st.session_state:
        st.text("Assistant non initialisé")
        # Stop here: the input callback reads st.session_state["assistant"]
        # and would raise KeyError if the chat box were still rendered.
        return

    has_prompt = 'prompt_system' in st.session_state and st.session_state.prompt_system != ''
    prompt_sys = st.session_state.prompt_system if has_prompt else "Renseignez votre prompt system"

    st.text("Prompt system : " + prompt_sys)

    display_messages()
    st.text_input("Message", key="user_input", on_change=process_input)


page()
pages/documents.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import tempfile
3
+ import streamlit as st
4
+
5
def read_and_save_file():
    """Ingest every uploaded PDF into the assistant and reset the chat.

    Used as the ``on_change`` callback of the "file_uploader" widget. Each
    upload is copied to a temporary file because the assistant's ``ingest``
    takes a file path; the temporary file is removed afterwards.
    """
    st.session_state["messages"] = []
    st.session_state["user_input"] = ""

    for file in st.session_state["file_uploader"]:
        # delete=False: the file must outlive this ``with`` so it can be
        # reopened by path during ingestion.
        with tempfile.NamedTemporaryFile(delete=False) as tf:
            tf.write(file.getbuffer())
            file_path = tf.name

        try:
            with st.session_state["ingestion_spinner"], st.spinner(f"Chargement {file.name}"):
                st.session_state["assistant"].ingest(file_path)
        finally:
            # Remove the temporary copy even when ingestion fails, otherwise
            # every failed upload leaks a file in the temp directory.
            os.remove(file_path)
17
+
18
+
19
+
20
def page():
    """Render the document upload page."""
    st.subheader("Charger vos documents")

    # File uploader: PDF only, several files at once; every change of the
    # selection triggers read_and_save_file.
    uploader_options = {
        "type": ["pdf"],
        "key": "file_uploader",
        "accept_multiple_files": True,
        "on_change": read_and_save_file,
    }
    st.file_uploader("Télécharger un ou plusieurs documents", **uploader_options)

    # Empty placeholder stored in the session under "ingestion_spinner".
    spinner_slot = st.empty()
    st.session_state["ingestion_spinner"] = spinner_slot


page()
pages/form.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
def page():
    """Render the settings form page (currently only its heading)."""
    heading = "Définissez vos paramètres"
    st.subheader(heading)


page()
pages/persistent_documents.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import tempfile
3
+ import streamlit as st
4
+
5
def uploadToDb():
    """Push every uploaded PDF to the shared document database.

    Used as the ``on_change`` callback of the "file_uploader_commun" widget.
    Each upload is copied to a temporary file because the assistant's
    ``ingestToDb`` takes a file path; the original file name is passed along
    separately.
    """
    for file in st.session_state["file_uploader_commun"]:
        # delete=False: the file must outlive this ``with`` so it can be
        # reopened by path during ingestion.
        with tempfile.NamedTemporaryFile(delete=False) as tf:
            tf.write(file.getbuffer())
            file_path = tf.name

        try:
            with st.session_state["ingestion_spinner"], st.spinner(f"Chargement {file.name}"):
                st.session_state["assistant"].ingestToDb(file_path, filename=file.name)
        finally:
            # Remove the temporary copy even when ingestion fails, otherwise
            # every failed upload leaks a file in the temp directory.
            os.remove(file_path)
15
+
16
def page():
    """Render the shared-documents page: uploader plus list of stored documents."""
    st.subheader("Montez des documents communs")

    st.file_uploader(
        "Télécharger un documents",
        type=["pdf"],
        key="file_uploader_commun",
        accept_multiple_files=True,
        on_change=uploadToDb,
    )

    st.session_state["ingestion_spinner"] = st.empty()

    st.divider()
    # st.write has no ``bold`` parameter (unknown keyword arguments are not
    # used by it), so the heading is emphasised with Markdown instead.
    st.markdown("**Documents dans la base de données**")

    for doc in st.session_state["assistant"].vector_store.getDocs():
        st.write(" - " + doc)


page()
pages/prompt_system.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
def page():
    """Render the system prompt editor and keep its value in the session."""
    st.subheader("Renseignez votre prompt system")

    # Start from the previously stored prompt, or from an empty text area.
    if 'prompt_system' in st.session_state:
        current_prompt = st.session_state.prompt_system
    else:
        current_prompt = ""

    # Store whatever the text area currently holds back into the session.
    st.session_state['prompt_system'] = st.text_area("Prompt system", current_prompt)


page()
prompt_template.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
# Prompt template text. It exposes four placeholders that must be supplied
# when the template is formatted: {commonContext}, {documentContext},
# {messages} and {query}.
base_template = '''

Documents partagées : {commonContext}
Document de référence : {documentContext}

Voici l'historique des messages : {messages}
Les attentes de l'utilisateur sont : {query}
'''
rag.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ from dotenv import load_dotenv
4
+ from langchain_community.vectorstores import FAISS
5
+ from langchain_mistralai.chat_models import ChatMistralAI
6
+ from langchain_mistralai.embeddings import MistralAIEmbeddings
7
+ from langchain.schema.output_parser import StrOutputParser
8
+ from langchain_community.document_loaders import PyPDFLoader
9
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
10
+ from langchain.schema.runnable import RunnablePassthrough
11
+ from langchain.prompts import PromptTemplate
12
+ from langchain_community.vectorstores.utils import filter_complex_metadata
13
+ #add new import
14
+ from langchain_community.document_loaders.csv_loader import CSVLoader
15
+
16
+ from prompt_template import base_template
17
+
18
+
19
+ # load .env in local dev
20
+ load_dotenv()
21
+ env_api_key = os.environ.get("MISTRAL_API_KEY")
22
+ llm_model = "open-mixtral-8x7b"
23
+
24
class Rag:
    """Question answering assistant over an uploaded PDF and a shared store.

    ``ingest`` indexes one PDF in an in-memory FAISS store and builds the
    prompt | model | parser chain used by ``ask``. ``ingestToDb`` and
    ``getDbFiles`` delegate to the vector store object given at construction.
    """

    # Class-level defaults so these attributes exist before ingest() runs.
    document_vector_store = None
    retriever = None
    chain = None

    def __init__(self, vectore_store=None):
        """Create the model, the embedding client, the splitter and the prompt.

        Args:
            vectore_store: Object exposing ``addDoc(filename=, text_chunks=,
                embedding=)`` and ``getDocs()``; kept as ``self.vector_store``.
        """
        self.model = ChatMistralAI(model=llm_model)
        self.embedding = MistralAIEmbeddings(model="mistral-embed", mistral_api_key=env_api_key)

        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100, length_function=len)
        self.prompt = PromptTemplate.from_template(base_template)

        self.vector_store = vectore_store

    def setModel(self, model):
        """Replace the chat model used to answer questions."""
        self.model = model
        # A chain built by ingest() holds the previous model; rebuild it so
        # the new model is actually the one used by ask().
        if self.chain is not None:
            self.chain = self.prompt | self.model | StrOutputParser()

    def ingestToDb(self, file_path: str, filename: str):
        """Split a PDF into text chunks and add them to the shared vector store.

        Args:
            file_path: Path of the PDF file to read.
            filename: Name under which the document is registered.

        Returns:
            Whatever ``self.vector_store.addDoc`` returns.
        """
        docs = PyPDFLoader(file_path=file_path).load()

        # Extract all text from the document
        text = "".join(page.page_content for page in docs)

        # Split the text into chunks
        chunks = self.text_splitter.split_text(text)

        return self.vector_store.addDoc(filename=filename, text_chunks=chunks, embedding=self.embedding)

    def getDbFiles(self):
        """Return the documents reported by the shared vector store."""
        return self.vector_store.getDocs()

    def ingest(self, pdf_file_path: str):
        """Index one PDF in memory and prepare the retriever and the chain.

        Each call builds a fresh FAISS index, so the retriever only covers
        the most recently ingested file.
        """
        docs = PyPDFLoader(file_path=pdf_file_path).load()

        chunks = self.text_splitter.split_documents(docs)
        chunks = filter_complex_metadata(chunks)

        # Keep the index on the instance: the original bound it to a local
        # name, so the ``document_vector_store`` attribute stayed None.
        self.document_vector_store = FAISS.from_documents(chunks, self.embedding)

        self.retriever = self.document_vector_store.as_retriever(
            search_type="similarity_score_threshold",
            search_kwargs={
                "k": 3,
                "score_threshold": 0.5,
            },
        )

        self.chain = self.prompt | self.model | StrOutputParser()

    def ask(self, query: str, messages: list):
        """Answer ``query`` using the ingested document and the chat history.

        Args:
            query: The user's question.
            messages: Conversation history, passed to the prompt as is.

        Returns:
            The chain's output, or a fixed French message asking for a PDF
            when no document has been ingested yet.
        """
        if not self.chain:
            return "Ajouter un document PDF d'abord."

        # Retrieve the context document
        documentContext = self.retriever.invoke(query)

        # Retrieve the VectoreStore
        # NOTE(review): the shared store is not queried yet, so the prompt
        # always receives None for the common context.
        contextCommon = None

        return self.chain.invoke({
            "query": query,
            "documentContext": documentContext,
            "commonContext": contextCommon,
            "messages": messages,
        })

    def clear(self):
        """Drop the in-memory index, the shared store, the retriever and the chain."""
        self.document_vector_store = None
        # NOTE(review): this also detaches the shared store, so ingestToDb()
        # and getDbFiles() fail afterwards — confirm this is intended.
        self.vector_store = None
        self.retriever = None
        self.chain = None
requirements.txt ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit==1.37.0
2
+ streamlit_chat
3
+ # abc
4
+ python-dotenv
5
+ pymupdf
6
+ python-multipart
7
+ pydantic
8
+ langchain-pinecone
9
+ pinecone-notebooks
10
+ pinecone-client[grpc]
11
+ async-timeout
12
+ pymupdf
13
+ python-dotenv
14
+ typing-extensions
15
+ langchain
16
+ langchain-openai
17
+ langchain-community
18
+ langchain-pinecone
19
+ langchain_mistralai
vectore_store/ConnectorStrategy.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import ABC, abstractmethod
2
+
3
class ConnectorStrategy(ABC):
    """Abstract interface that every vector store connector must implement."""

    @abstractmethod
    def getDocs(self):
        """List the documents held by the store."""

    @abstractmethod
    def addDoc(self, filename, text_chunks, embedding):
        """Store the text chunks of the document named ``filename``."""

    @abstractmethod
    def retriever(self, query, embedding):
        """Look up stored content for ``query``."""
vectore_store/PineconeConnector.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dotenv import load_dotenv
3
+
4
+ from .ConnectorStrategy import ConnectorStrategy
5
+
6
+ from pinecone import Pinecone, ServerlessSpec
7
+ from langchain_openai import OpenAIEmbeddings
8
+ from langchain_pinecone import PineconeVectorStore
9
+ from langchain_core.documents import Document
10
+
11
+ import unicodedata
12
+ import time
13
+
14
class PineconeConnector(ConnectorStrategy):
    """Connector storing document chunks in a Pinecone index.

    Configuration is read from the environment (after loading ``.env``):
    ``PINECONE_API_KEY``, ``PINECONE_INDEX_NAME`` and ``PINECONE_NAMESPACE``.
    Chunks are stored under ids of the form ``<clean file name>_<position>``.
    """

    def __init__(self):
        """Connect to Pinecone and create the index if it does not exist yet."""
        load_dotenv()

        pinecone_api_key = os.environ.get("PINECONE_API_KEY")

        self.index_name = os.environ.get("PINECONE_INDEX_NAME")
        self.namespace = os.environ.get("PINECONE_NAMESPACE")

        print(f"Index name: {self.index_name}")
        print(f"Namespace: {self.namespace}")
        # The API key is deliberately NOT printed: it is a secret and stdout
        # usually ends up in logs.

        pc = Pinecone(api_key=pinecone_api_key)

        existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

        if self.index_name not in existing_indexes:
            # NOTE(review): the dimension must equal the output size of the
            # embedding model passed to addDoc()/retriever() — confirm that
            # 3072 matches the model actually used.
            pc.create_index(
                name=self.index_name,
                dimension=3072,
                metric="cosine",
                spec=ServerlessSpec(cloud="aws", region="us-east-1"),
            )
            # Poll until the index reports itself ready.
            while not pc.describe_index(self.index_name).status["ready"]:
                time.sleep(1)

        self.index = pc.Index(self.index_name)

    def getDocs(self):
        """Return the distinct document names found in the namespace.

        A name is the vector id without its trailing ``_<position>`` part.
        Names are returned in first-seen order.
        """
        print("Fetching documents from Pinecone")

        docs_names = []
        seen = set()
        for ids in self.index.list(namespace=self.namespace):
            for chunk_id in ids:
                # Everything before the last "_" ("" when there is none).
                name_doc = chunk_id.rpartition("_")[0]
                if name_doc not in seen:
                    seen.add(name_doc)
                    docs_names.append(name_doc)

        return docs_names

    def addDoc(self, filename, text_chunks, embedding):
        """Embed and store the chunks of one document.

        Args:
            filename: Original file name; kept in metadata and used, once
                cleaned, as the id prefix.
            text_chunks: Text pieces to store, one vector per piece.
            embedding: Embedding object handed to ``PineconeVectorStore``.

        Returns:
            ``{"filename_id": <clean name>}`` on success, ``False`` when there
            is nothing to store or when an error occurred (the error is
            printed).
        """
        try:
            # Drop only the final extension. Splitting on the first "." made
            # names such as "report.v1.pdf" and "report.v2.pdf" share one id
            # prefix and overwrite each other's chunks.
            file_name = os.path.splitext(filename)[0]
            for separator in (" ", "-", ".", "/", "\\"):
                file_name = file_name.replace(separator, "_")
            file_name = file_name.strip()

            print(file_name)

            # Computed once, before the loop: it does not depend on the chunk
            # and is needed for the return value even for a single chunk.
            clean_filename = remove_non_standard_ascii(file_name)

            if not text_chunks:
                # Previously this case ended in a swallowed NameError.
                print(f"No text chunks to add for {filename}")
                return False

            vector_store = PineconeVectorStore(index=self.index, embedding=embedding, namespace=self.namespace)

            documents = []
            uuids = []

            for i, chunk in enumerate(text_chunks):
                uuid = f"{clean_filename}_{i}"

                print(f"Adding document with ID {uuid}")

                documents.append(
                    Document(
                        page_content=chunk,
                        metadata={"filename": filename, "chunk_id": uuid},
                    )
                )
                uuids.append(uuid)

            vector_store.add_documents(documents=documents, ids=uuids)

            return {"filename_id": clean_filename}

        except Exception as e:
            # Best-effort boundary: callers receive False instead of an
            # exception; the error is only printed.
            print(e)
            return False

    def retriever(self, query, embedding):
        """Return the stored documents matching ``query``.

        Runs a "similarity_score_threshold" search with ``k=3`` and a score
        threshold of 0.6 in the configured namespace.
        """
        print(f"Retrieving documents from Pinecone for query '{query}'")

        vector_store = PineconeVectorStore(index=self.index, embedding=embedding, namespace=self.namespace)

        retriever = vector_store.as_retriever(
            search_type="similarity_score_threshold",
            search_kwargs={"k": 3, "score_threshold": 0.6},
        )

        return retriever.invoke(query)
105
+
106
+
107
def remove_non_standard_ascii(input_string: str) -> str:
    """Return ``input_string`` reduced to plain ASCII letters, digits and `` .,!?``.

    The text is first NFKD-normalized so that accented letters decompose into
    a base letter plus combining marks; the marks, like every other character
    outside the allowed set, are then dropped (``"é"`` becomes ``"e"``).
    """
    normalized_string = unicodedata.normalize('NFKD', input_string)
    return ''.join(
        char
        for char in normalized_string
        if 'a' <= char <= 'z'
        or 'A' <= char <= 'Z'
        # Explicit range instead of str.isdigit(), which also accepts
        # non-ASCII digits (e.g. Arabic-Indic ones).
        or '0' <= char <= '9'
        or char in ' .,!?'
    )
110
+
vectore_store/VectoreStoreManager.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from vectore_store import ConnectorStrategy
2
+
3
+
4
class VectoreStoreManager:
    """Thin facade forwarding every call to the configured connector strategy."""

    def __init__(self, strategy: "ConnectorStrategy"):
        """Keep ``strategy``, the connector all operations are delegated to."""
        self.strategy = strategy

    def getDocs(self):
        """Return the strategy's list of stored documents."""
        return self.strategy.getDocs()

    def addDoc(self, filename, text_chunks, embedding):
        """Store a document through the strategy and return its result.

        The result was previously dropped, so callers always received None
        and could not tell success from failure.
        """
        return self.strategy.addDoc(filename, text_chunks, embedding)

    def retriever(self, query, embedding):
        """Return the strategy's search result for ``query``."""
        return self.strategy.retriever(query, embedding)
vectore_store/__init__.py ADDED
File without changes