jeevan committed on
Commit 4a0c158
0 Parent(s):

Locally working LCEL RAG

Files changed (7):
  1. .chainlit/config.toml +84 -0
  2. .gitignore +8 -0
  3. Chunking.py +68 -0
  4. Dockerfile +9 -0
  5. app.py +125 -0
  6. chainlit.md +3 -0
  7. requirements.txt +15 -0
.chainlit/config.toml ADDED
@@ -0,0 +1,84 @@
+ [project]
+ # Whether to enable telemetry (default: true). No personal data is collected.
+ enable_telemetry = true
+
+ # List of environment variables to be provided by each user to use the app.
+ user_env = []
+
+ # Duration (in seconds) during which the session is saved when the connection is lost
+ session_timeout = 3600
+
+ # Enable third parties caching (e.g LangChain cache)
+ cache = false
+
+ # Follow symlink for asset mount (see https://github.com/Chainlit/chainlit/issues/317)
+ # follow_symlink = false
+
+ [features]
+ # Show the prompt playground
+ prompt_playground = true
+
+ # Process and display HTML in messages. This can be a security risk (see https://stackoverflow.com/questions/19603097/why-is-it-dangerous-to-render-user-generated-html-or-javascript)
+ unsafe_allow_html = false
+
+ # Process and display mathematical expressions. This can clash with "$" characters in messages.
+ latex = false
+
+ # Authorize users to upload files with messages
+ multi_modal = true
+
+ # Allows user to use speech to text
+ [features.speech_to_text]
+     enabled = false
+     # See all languages here https://github.com/JamesBrill/react-speech-recognition/blob/HEAD/docs/API.md#language-string
+     # language = "en-US"
+
+ [UI]
+ # Name of the app and chatbot.
+ name = "Chatbot"
+
+ # Show the readme while the conversation is empty.
+ show_readme_as_default = true
+
+ # Description of the app and chatbot. This is used for HTML tags.
+ # description = ""
+
+ # Large size content are by default collapsed for a cleaner ui
+ default_collapse_content = true
+
+ # The default value for the expand messages settings.
+ default_expand_messages = false
+
+ # Hide the chain of thought details from the user in the UI.
+ hide_cot = false
+
+ # Link to your github repo. This will add a github button in the UI's header.
+ # github = ""
+
+ # Specify a CSS file that can be used to customize the user interface.
+ # The CSS file can be served from the public directory or via an external link.
+ # custom_css = "/public/test.css"
+
+ # Override default MUI light theme. (Check theme.ts)
+ [UI.theme.light]
+     #background = "#FAFAFA"
+     #paper = "#FFFFFF"
+
+     [UI.theme.light.primary]
+         #main = "#F80061"
+         #dark = "#980039"
+         #light = "#FFE7EB"
+
+ # Override default MUI dark theme. (Check theme.ts)
+ [UI.theme.dark]
+     #background = "#FAFAFA"
+     #paper = "#FFFFFF"
+
+     [UI.theme.dark.primary]
+         #main = "#F80061"
+         #dark = "#980039"
+         #light = "#FFE7EB"
+
+
+ [meta]
+ generated_by = "0.7.700"
.gitignore ADDED
@@ -0,0 +1,8 @@
+ venv/*
+ .env
+ __pycache__/app.cpython-39.pyc
+ __pycache__/app.cpython-311.pyc
+ __pycache__/Chunking.cpython-39.pyc
+ __pycache__/Chunking.cpython-311.pyc
+ .vscode/launch.json
+ .vscode/settings.json
Chunking.py ADDED
@@ -0,0 +1,68 @@
+ from enum import Enum
+ from langchain_community.document_loaders import PyPDFLoader, TextLoader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter, NLTKTextSplitter, SpacyTextSplitter
+ separators = [
+     "\n\n",
+     "\n",
+     " ",
+     ".",
+     ",",
+     "\u200b",  # Zero-width space
+     "\uff0c",  # Fullwidth comma
+     "\u3001",  # Ideographic comma
+     "\uff0e",  # Fullwidth full stop
+     "\u3002",  # Ideographic full stop
+     "",
+ ]
+
+ class ChunkingStrategy(Enum):
+     RECURSIVE_CHARACTER_CHAR_SPLITTER = "recursive_character_char_splitter"
+     NLTK_TEXT_SPLITTER = "nltk_text_splitter"
+     SPACY_TEXT_SPLITTER = "spacy_text_splitter"
+
+ class TextLoaderAndSplitterWrapper:
+     def __init__(self, strategy: ChunkingStrategy, file_path: str):
+         # Defaults
+         self.splitter = None
+         self.documents = []
+
+         # Determine which splitter strategy to use from the parameter
+         if strategy == ChunkingStrategy.RECURSIVE_CHARACTER_CHAR_SPLITTER:
+             self.splitter = RecursiveCharacterTextSplitter(separators=separators)
+         elif strategy == ChunkingStrategy.NLTK_TEXT_SPLITTER:
+             self.splitter = NLTKTextSplitter()
+         elif strategy == ChunkingStrategy.SPACY_TEXT_SPLITTER:
+             self.splitter = SpacyTextSplitter()
+         else:
+             raise ValueError(f"Unknown strategy: {strategy}")
+
+         # Remember the file to load and chunk
+         self.file_path = file_path
+
+
+     def load_documents(self):
+         if self.file_path.endswith(".pdf"):
+             # Use the PDF loader
+             pdf_loader = PyPDFLoader(self.file_path)
+             self.documents = pdf_loader.load_and_split(text_splitter=self.splitter)  # load_and_split defaults to RecursiveCharacterTextSplitter when no splitter is given
+             return self.documents
+         elif self.file_path.endswith(".txt"):
+             # Use the plain-text loader
+             text_loader = TextLoader(self.file_path)
+             self.documents = text_loader.load_and_split(text_splitter=self.splitter)
+             return self.documents
+         else:
+             raise ValueError(f"Unknown file type: {self.file_path}")
+
+
+     def split(self, text: str):
+         return self.splitter.split_text(text)  # split_text is the splitter API; .split does not exist
+
+     def join(self, chunks: list):
+         return "\n".join(chunks)  # splitters expose no join; rejoin chunk strings with newlines
+
+     def __str__(self):
+         return f"TextLoaderAndSplitterWrapper(splitter={self.splitter})"
+
+     def __repr__(self):
+         return str(self)
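
A quick way to sanity-check the wrapper in isolation: a minimal sketch that loads and chunks a local file and prints the first chunk. The file name sample.pdf is an assumed placeholder, not part of this commit.

from Chunking import ChunkingStrategy, TextLoaderAndSplitterWrapper

# Load and chunk an assumed local file with the recursive character strategy
wrapper = TextLoaderAndSplitterWrapper(
    ChunkingStrategy.RECURSIVE_CHARACTER_CHAR_SPLITTER, "sample.pdf"  # assumed file
)
docs = wrapper.load_documents()  # list of LangChain Document chunks
print(len(docs), docs[0].page_content[:80])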
Dockerfile ADDED
@@ -0,0 +1,9 @@
+ FROM python:3.9
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH
+ WORKDIR $HOME/app
+ COPY --chown=user . $HOME/app
+ RUN pip install -r requirements.txt
+ CMD ["chainlit", "run", "app.py", "--port", "7860"]
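
For local use outside the Space, the image can presumably be built and run with the standard Docker commands, e.g. docker build -t lcel-rag . followed by docker run -p 7860:7860 -e OPENAI_API_KEY=... lcel-rag (the lcel-rag tag is an assumption, not part of this commit). OPENAI_API_KEY must be passed in because app.py reads it from the environment at import time.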
app.py ADDED
@@ -0,0 +1,125 @@
+ import os
+ from typing import List
+ from operator import itemgetter
+ from Chunking import ChunkingStrategy, TextLoaderAndSplitterWrapper
+
+ from langchain.schema.runnable import RunnablePassthrough
+ from langchain_openai import ChatOpenAI
+ from langchain_openai.embeddings import OpenAIEmbeddings
+ from langchain_core.prompts import ChatPromptTemplate
+ from langchain_community.vectorstores import Qdrant
+
+ import chainlit as cl
+ from chainlit.types import AskFileResponse
+ from chainlit.cli import run_chainlit
+
+ import tempfile
+
+ OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
+ GPT_MODEL = "gpt-4o-mini"
+
+ # Utility functions
+ def save_file(file: AskFileResponse, file_ext: str) -> str:
+     if file_ext == "application/pdf":
+         file_ext = ".pdf"
+     elif file_ext == "text/plain":
+         file_ext = ".txt"
+     else:
+         raise ValueError(f"Unknown file type: {file_ext}")
+
+     with tempfile.NamedTemporaryFile(
+         mode="wb", delete=False, suffix=file_ext
+     ) as temp_file:
+         temp_file_path = temp_file.name
+         temp_file.write(file.content)
+         return temp_file_path
+
+
+ # Prepare the components that will form the chain
+
+ ## Step 1: Create a prompt template
+ base_rag_prompt_template = """\
+ You are a helpful assistant that can answer questions related to the provided context. Respond "I don't have that information" if the question falls outside the context.
+
+ Context:
+ {context}
+
+ Question:
+ {question}
+ """
+
+ base_rag_prompt = ChatPromptTemplate.from_template(base_rag_prompt_template)
+
+ ## Step 2: Create the embeddings model instance
+ embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
+
+ ## Step 3: Create the OpenAI chat model
+ base_llm = ChatOpenAI(model=GPT_MODEL, tags=["base_llm"])
+
+
+ @cl.on_chat_start
+ async def on_chat_start():
+
+     msg = cl.Message(content="Welcome to the Chat with Files app powered by LCEL and OpenAI - RAG!")
+     await msg.send()
+
+     files = None
+     documents = None
+     # Wait for the user to upload a file
+     while files is None:
+         files = await cl.AskFileMessage(
+             content="Please upload a text or a PDF file to begin!",
+             accept=["text/plain", "application/pdf"],
+             max_size_mb=10,
+             max_files=1,
+             timeout=180,
+         ).send()
+
+     ## Load the file and split it into chunks
+     msg = cl.Message(content=f"Processing `{files[0].name}`...")
+     await msg.send()
+     current_file_path = save_file(files[0], files[0].type)
+     loader_splitter = TextLoaderAndSplitterWrapper(ChunkingStrategy.RECURSIVE_CHARACTER_CHAR_SPLITTER, current_file_path)
+     documents = loader_splitter.load_documents()
+
+     ## Vectorise the documents
+     qdrant_vectorstore = Qdrant.from_documents(
+         documents=documents,
+         embedding=embedding_model,
+         location=":memory:"
+     )
+     qdrant_retriever = qdrant_vectorstore.as_retriever()
+
+     # Create the chain for the new chat session
+     retrieval_augmented_qa_chain = (
+         # INVOKE CHAIN WITH: {"question" : "<<SOME USER QUESTION>>"}
+         # "question" : populated by getting the value of the "question" key
+         # "context"  : populated by getting the value of the "question" key and chaining it into qdrant_retriever
+         {"context": itemgetter("question") | qdrant_retriever, "question": itemgetter("question")}
+         # "context" : carried through unchanged via RunnablePassthrough.assign,
+         #             by getting the value of the "context" key from the previous step
+         | RunnablePassthrough.assign(context=itemgetter("context"))
+         # "response" : the "context" and "question" values are used to format the prompt object and then piped
+         #              into the LLM and stored in a key called "response"
+         # "context"  : populated by getting the value of the "context" key from the previous step
+         | {"response": base_rag_prompt | base_llm, "context": itemgetter("context")}
+     )
+
+     # Let the user know that the system is ready
+     msg = cl.Message(content=f"Processing `{files[0].name}` done. You can now ask questions!")
+     await msg.send()
+
+     cl.user_session.set("chain", retrieval_augmented_qa_chain)
+
+
+ @cl.on_message
+ async def main(message: cl.Message):
+     chain = cl.user_session.get("chain")
+     msg = cl.Message(content="")
+     response = chain.invoke({"question": message.content})
+     msg.content = response["response"].content
+     await msg.send()
+     cl.user_session.set("chain", chain)
+
+ if __name__ == "__main__":
+     run_chainlit(__file__)
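
For reference, the same LCEL pattern can be exercised outside Chainlit. The sketch below mirrors the chain built in on_chat_start with a one-document toy corpus; the sample text and question are illustrative only, and OPENAI_API_KEY is assumed to be set in the environment.

from operator import itemgetter
from langchain.schema.runnable import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.documents import Document
from langchain_community.vectorstores import Qdrant

# Toy corpus standing in for the uploaded file's chunks (illustrative only)
docs = [Document(page_content="Qdrant is an open-source vector database.")]
retriever = Qdrant.from_documents(
    docs, OpenAIEmbeddings(model="text-embedding-3-small"), location=":memory:"
).as_retriever()
prompt = ChatPromptTemplate.from_template("Context:\n{context}\n\nQuestion:\n{question}")
llm = ChatOpenAI(model="gpt-4o-mini")

# Same three-step LCEL composition as app.py
chain = (
    {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": prompt | llm, "context": itemgetter("context")}
)

out = chain.invoke({"question": "What is Qdrant?"})
print(out["response"].content)  # the model's answer
print(out["context"])           # the retrieved Documents that grounded it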
chainlit.md ADDED
@@ -0,0 +1,3 @@
+ # Welcome to Chat with Your Text File
+
+ With this application, you can chat with an uploaded text or PDF file that is smaller than 10MB!
requirements.txt ADDED
@@ -0,0 +1,15 @@
+ langsmith
+ langchain_core
+ langchain_openai
+ langchain_community
+ langchain-text-splitters
+ langchain-qdrant
+ qdrant-client
+ openai
+ tiktoken
+ cohere
+ lxml
+ pymupdf
+ pypdf
+
+ chainlit==0.7.700