souravmighty committed on
Commit
86e0637
1 Parent(s): faa5b0a

add app files

Browse files
Files changed (4) hide show
  1. .gitignore +3 -0
  2. Dockerfile +14 -0
  3. app.py +156 -0
  4. requirements.txt +8 -0
.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ .venv/
2
+ .env
3
+ __pycache__
Dockerfile ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
2
+ # you will also find guides on how best to write your Dockerfile
3
+
4
+ FROM python:3.11
5
+
6
+ WORKDIR /code
7
+
8
+ COPY ./requirements.txt /code/requirements.txt
9
+
10
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
11
+
12
+ COPY . .
13
+
14
+ CMD ["chainlit", "run", "app.py", "--address", "0.0.0.0", "--port", "7860", "--allow-websocket-origin", "souravmighty-groq_doc.hf.space"]
app.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import PyPDF2
2
+ from langchain_community.embeddings import OllamaEmbeddings
3
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
4
+ from langchain_community.vectorstores import Chroma
5
+ from langchain.chains import ConversationalRetrievalChain
6
+ from langchain_groq import ChatGroq
7
+ from langchain.memory import ChatMessageHistory, ConversationBufferMemory
8
+ import chainlit as cl
9
+ from chainlit.input_widget import Select
10
+ import os
11
+
12
+
13
def get_memory():
    """Return the conversation memory belonging to the current chat session.

    The memory is created on first use and stored in the Chainlit user
    session, so it survives model/settings changes within one chat but is
    never shared between different users. (A process-wide cache would hand
    the same chat history to every session.)

    Returns:
        ConversationBufferMemory: memory keyed on ``chat_history`` that
        records the chain's ``answer`` output and returns message objects.
    """
    memory = cl.user_session.get("memory")
    if memory is None:
        memory = ConversationBufferMemory(
            memory_key="chat_history",
            output_key="answer",
            # Fresh, empty message history for this conversation.
            chat_memory=ChatMessageHistory(),
            return_messages=True,
        )
        cl.user_session.set("memory", memory)
    return memory
26
+
27
def _resolve_groq_api_key():
    """Return the Groq API key for this session, or ``None`` if none is set.

    The key supplied through Chainlit's per-user environment wins; a
    ``GROQ_API_KEY`` already present in the process environment is used as a
    fallback. The process environment is never modified, because it is
    shared by every chat session.
    """
    user_env = cl.user_session.get("env") or {}
    return user_env.get("GROQ_API_KEY") or os.environ.get("GROQ_API_KEY")


def _extract_pdf_text(path):
    """Return the concatenated text of every page of the PDF at *path*."""
    reader = PyPDF2.PdfReader(path)
    # Pages without extractable text (e.g. scanned images) contribute "".
    return "".join(page.extract_text() or "" for page in reader.pages)


@cl.on_chat_start
async def on_chat_start():
    """Ask for a PDF, index it for this session, then build the QA chain.

    Steps: publish the model selector, wait for a PDF upload, extract and
    chunk its text, embed the chunks into a Chroma collection that is unique
    to this chat session, and finally call ``setup_agent`` to create the
    retrieval chain.
    """
    import uuid

    settings = await cl.ChatSettings(
        [
            Select(
                id="Model",
                label="Open Source Model",
                values=["llama3-8b-8192", "llama3-70b-8192", "mixtral-8x7b-32768", "gemma-7b-it"],
                initial_index=0,
            )
        ]
    ).send()

    # AskFileMessage returns None on timeout, so keep asking until a file arrives.
    files = None
    while files is None:
        files = await cl.AskFileMessage(
            content="Please upload a pdf file to begin!",
            accept=["application/pdf"],
            max_size_mb=100,
            timeout=180,
        ).send()

    file = files[0]  # Only the first uploaded file is used.

    # Inform the user that processing has started.
    msg = cl.Message(content=f"Processing `{file.name}`...")
    await msg.send()

    # Parsing a large PDF is blocking work; run it off the event loop.
    pdf_text = await cl.make_async(_extract_pdf_text)(file.path)

    # Split the text into overlapping chunks for retrieval.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    texts = text_splitter.split_text(pdf_text)
    if not texts:
        # Nothing to index (e.g. a scanned PDF); embedding an empty list would fail.
        msg.content = (
            f"No extractable text was found in `{file.name}`. "
            "Please start a new chat and upload a text-based pdf."
        )
        await msg.update()
        return

    # One metadata entry per chunk, identifying it by position.
    metadatas = [{"source": f"{i}-pl"} for i in range(len(texts))]

    # Index into a collection unique to this session so that documents from
    # other users or earlier uploads never leak into this chat's answers.
    embeddings = OllamaEmbeddings(model="nomic-embed-text")
    docsearch = await cl.make_async(Chroma.from_texts)(
        texts,
        embeddings,
        metadatas=metadatas,
        collection_name=f"pdf-{uuid.uuid4().hex}",
    )
    cl.user_session.set("docsearch", docsearch)

    # Let the user know that the system is ready.
    msg.content = f"Processing `{file.name}` done. You can now ask questions!"
    await msg.update()

    # Honour a settings change made while the file was being processed.
    await setup_agent(cl.user_session.get("settings") or settings)


@cl.on_settings_update
async def setup_agent(settings):
    """(Re)build the retrieval chain for this session from *settings*.

    Args:
        settings: mapping of chat settings; ``settings["Model"]`` names the
            Groq model to use.

    The chain is stored in the user session under ``"chain"``. If no document
    has been indexed yet, the settings are only remembered and the chain is
    built later by ``on_chat_start``.
    """
    print("Setup agent with settings:", settings)

    # Remember the latest choice so on_chat_start can apply it once indexing is done.
    cl.user_session.set("settings", settings)

    docsearch = cl.user_session.get("docsearch")
    if docsearch is None:
        return

    api_key = _resolve_groq_api_key()
    if not api_key:
        await cl.Message(
            content="No GROQ_API_KEY was provided, so questions cannot be answered. "
            "Please set your Groq API key and start a new chat."
        ).send()
        return

    # Create a chain that answers from this session's Chroma vector store.
    chain = ConversationalRetrievalChain.from_llm(
        # Pass the key explicitly instead of via the shared process environment.
        llm=ChatGroq(model=settings["Model"], groq_api_key=api_key),
        chain_type="stuff",
        retriever=docsearch.as_retriever(),
        memory=get_memory(),
        return_source_documents=True,
    )

    # Store the chain in the user session for the message handler.
    cl.user_session.set("chain", chain)
117
+
118
+
119
@cl.on_message
async def main(message: cl.Message):
    """Answer a user message with the session's retrieval chain.

    Args:
        message: the incoming Chainlit message; its ``content`` is the question.

    Sends one reply containing the chain's answer, followed by the names of
    the retrieved source chunks (attached as text elements) or a note that no
    sources were found.
    """
    # Retrieve the chain built for this session.
    chain = cl.user_session.get("chain")
    if chain is None:
        # The user wrote before a document was indexed or the chain could be built.
        await cl.Message(
            content="The document is not ready yet. "
            "Please upload a pdf file and wait for processing to finish."
        ).send()
        return

    # Callbacks must travel in the runnable config; a bare ``callbacks=``
    # keyword is not picked up by ``ainvoke``.
    cb = cl.AsyncLangchainCallbackHandler()
    res = await chain.ainvoke(message.content, config={"callbacks": [cb]})

    answer = res["answer"]
    source_documents = res["source_documents"] or []

    # One text element per retrieved chunk, named so the answer can reference it.
    text_elements = [
        cl.Text(content=source_doc.page_content, name=f"source_{source_idx}")
        for source_idx, source_doc in enumerate(source_documents)
    ]
    source_names = [text_el.name for text_el in text_elements]

    # Add source references to the answer.
    if source_names:
        answer += f"\nSources: {', '.join(source_names)}"
    else:
        answer += "\nNo sources found"

    await cl.Message(content=answer, elements=text_elements).send()
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ chainlit
2
+ langchain
3
+ langchain-community
4
+ PyPDF2
5
+ chromadb
6
+ groq
7
+ langchain-groq
8
+ ollama