souravmighty committed on
Commit
811a1e3
·
1 Parent(s): 1187c2e

Changed to snowflake embedding

Browse files
.chainlit/config.toml CHANGED
@@ -49,7 +49,7 @@ auto_tag_thread = true
49
  name = "Chatbot"
50
 
51
  # Show the readme while the thread is empty.
52
- show_readme_as_default = true
53
 
54
  # Description of the app and chatbot. This is used for HTML tags.
55
  # description = ""
 
49
  name = "Chatbot"
50
 
51
  # Show the readme while the thread is empty.
52
+ show_readme_as_default = false
53
 
54
  # Description of the app and chatbot. This is used for HTML tags.
55
  # description = ""
Dockerfile CHANGED
@@ -7,5 +7,6 @@ WORKDIR $HOME/app
7
  COPY --chown=user . $HOME/app
8
  COPY ./requirements.txt ~/app/requirements.txt
9
  RUN pip install -r requirements.txt
 
10
  COPY . .
11
  CMD ["chainlit", "run", "app.py", "--port", "7860"]
 
7
  COPY --chown=user . $HOME/app
8
  COPY ./requirements.txt ~/app/requirements.txt
9
  RUN pip install -r requirements.txt
10
+ RUN pip install git+https://github.com/UKPLab/sentence-transformers.git
11
  COPY . .
12
  CMD ["chainlit", "run", "app.py", "--port", "7860"]
app.py CHANGED
@@ -10,6 +10,7 @@ from chainlit.input_widget import Select
10
  import os
11
 
12
 
 
13
  @cl.cache
14
  def get_memory():
15
  # Initialize message history for conversation
@@ -41,26 +42,6 @@ async def on_chat_start():
41
  ]
42
  ).send()
43
 
44
- await setup_agent(settings)
45
-
46
-
47
- @cl.on_settings_update
48
- async def setup_agent(settings):
49
-
50
- user_env = cl.user_session.get("env")
51
- os.environ["GROQ_API_KEY"] = user_env.get("GROQ_API_KEY")
52
-
53
- # embeddings = OllamaEmbeddings(model="nomic-embed-text")
54
- # embeddings = SentenceTransformerEmbeddings(model_name = "sentence-transformers/all-MiniLM-L6-v2")
55
- # memory=get_memory()
56
-
57
- # docsearch = await cl.make_async(Chroma)(
58
- # persist_directory="./chroma_db",
59
- # embedding_function=embeddings
60
- # )
61
-
62
- msg = cl.Message(content = f"You are using '{settings['Model']}' as LLM.")
63
- await msg.send()
64
 
65
  files = None #Initialize variable to store uploaded files
66
 
@@ -71,23 +52,25 @@ async def setup_agent(settings):
71
  accept=["application/pdf"],
72
  max_size_mb=100,
73
  timeout=180,
 
74
  ).send()
75
 
76
- file = files[0] # Get the first uploaded file
77
-
78
- # Inform the user that processing has started
79
- msg = cl.Message(content=f"Processing `{file.name}`...")
80
- await msg.send()
81
 
82
- # Read the PDF file
83
- pdf = PyPDF2.PdfReader(file.path)
84
  pdf_text = ""
85
- for page in pdf.pages:
86
- pdf_text += page.extract_text()
 
 
 
 
 
 
 
 
87
 
88
 
89
  # Split the text into chunks
90
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
91
  texts = text_splitter.split_text(pdf_text)
92
 
93
  # Create a metadata for each chunk
@@ -95,20 +78,40 @@ async def setup_agent(settings):
95
 
96
  # Create a Chroma vector store
97
  # embeddings = OllamaEmbeddings(model="nomic-embed-text")
98
- embeddings = SentenceTransformerEmbeddings(model_name = "sentence-transformers/all-MiniLM-L6-v2")
 
 
 
99
  #embeddings = OllamaEmbeddings(model="llama2:7b")
100
  docsearch = await cl.make_async(Chroma.from_texts)(
101
  texts, embeddings, metadatas=metadatas
102
  )
 
103
 
104
  # Let the user know that the system is ready
105
  msg.content = f"Processing `{file.name}` done. You can now ask questions!"
106
  await msg.update()
107
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
  memory=get_memory()
109
 
110
 
111
- # Create a chain that uses the Chroma vector store
112
  chain = ConversationalRetrievalChain.from_llm(
113
  llm = ChatGroq(model=settings["Model"]),
114
  chain_type="stuff",
@@ -158,4 +161,18 @@ async def main(message: cl.Message):
158
  else:
159
  answer += "\nNo sources found"
160
  #return results
161
- await cl.Message(content=answer, elements=text_elements).send()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  import os
11
 
12
 
13
+
14
  @cl.cache
15
  def get_memory():
16
  # Initialize message history for conversation
 
42
  ]
43
  ).send()
44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
  files = None #Initialize variable to store uploaded files
47
 
 
52
  accept=["application/pdf"],
53
  max_size_mb=100,
54
  timeout=180,
55
+ max_files = 10,
56
  ).send()
57
 
 
 
 
 
 
58
 
 
 
59
  pdf_text = ""
60
+ for file in files:
61
+ # Inform the user that processing has started
62
+ msg = cl.Message(content=f"Processing `{file.name}`...")
63
+ await msg.send()
64
+
65
+ # Read the PDF file
66
+ pdf = PyPDF2.PdfReader(file.path)
67
+ for page in pdf.pages:
68
+ pdf_text += page.extract_text()
69
+
70
 
71
 
72
  # Split the text into chunks
73
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
74
  texts = text_splitter.split_text(pdf_text)
75
 
76
  # Create a metadata for each chunk
 
78
 
79
  # Create a Chroma vector store
80
  # embeddings = OllamaEmbeddings(model="nomic-embed-text")
81
+ # embeddings = SentenceTransformerEmbeddings(model_name = "sentence-transformers/all-MiniLM-L6-v2")
82
+ embeddings = SentenceTransformerEmbeddings(model_name = "Snowflake/snowflake-arctic-embed-m")
83
+
84
+
85
  #embeddings = OllamaEmbeddings(model="llama2:7b")
86
  docsearch = await cl.make_async(Chroma.from_texts)(
87
  texts, embeddings, metadatas=metadatas
88
  )
89
+ cl.user_session.set("docsearch", docsearch)
90
 
91
  # Let the user know that the system is ready
92
  msg.content = f"Processing `{file.name}` done. You can now ask questions!"
93
  await msg.update()
94
 
95
+ await setup_agent(settings)
96
+
97
+
98
+ @cl.on_settings_update
99
+ async def setup_agent(settings):
100
+
101
+ user_env = cl.user_session.get("env")
102
+ os.environ["GROQ_API_KEY"] = user_env.get("GROQ_API_KEY")
103
+
104
+ memory=get_memory()
105
+ docsearch = cl.user_session.get("docsearch")
106
+
107
+ msg = cl.Message(content = f"You are using `{settings['Model']}` as LLM. You can change model in `Settings Panel` in the chat box.")
108
+ await msg.send()
109
+
110
+
111
  memory=get_memory()
112
 
113
 
114
+ # Create a chain that uses the Chroma vector stores
115
  chain = ConversationalRetrievalChain.from_llm(
116
  llm = ChatGroq(model=settings["Model"]),
117
  chain_type="stuff",
 
161
  else:
162
  answer += "\nNo sources found"
163
  #return results
164
+ await cl.Message(content=answer, elements=text_elements).send()
165
+
166
+
167
+ @cl.on_stop
168
+ def on_stop():
169
+ print("The user wants to stop the task!")
170
+ docsearch = cl.user_session.get("docsearch")
171
+ docsearch.delete_collection()
172
+
173
+
174
+ @cl.on_chat_end
175
+ def on_chat_end():
176
+ print("The user disconnected!")
177
+ docsearch = cl.user_session.get("docsearch")
178
+ docsearch.delete_collection()
assets/conversational_rag_architecture.gif ADDED
chainlit.md ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ # Welcome to GroqDoc!
2
+
3
+ ## Useful Links 🔗
4
+
5
+ - **Groq API KEY:** Generate Groq API Key for free [Groq API Key](https://console.groq.com/keys) 📚
6
+
requirements.txt CHANGED
@@ -5,4 +5,3 @@ PyPDF2
5
  chromadb
6
  groq
7
  langchain-groq
8
- sentence-transformers
 
5
  chromadb
6
  groq
7
  langchain-groq