Rohan Kataria committed
Commit f143a79 · 1 Parent(s): 5d1b722

more changes

.history/.streamlit/config_20240310222951.toml ADDED
File without changes
.history/.streamlit/config_20240310223008.toml ADDED
File without changes
.history/src/main_20240310222705.py ADDED
@@ -0,0 +1,144 @@
+ import os
+ import openai
+ import sys
+ import docarray
+ sys.path.append('../..')
+ from langchain.embeddings.openai import OpenAIEmbeddings
+ from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
+ from langchain.vectorstores import DocArrayInMemorySearch
+ from langchain.document_loaders import TextLoader
+ from langchain.chains import RetrievalQA, ConversationalRetrievalChain
+ from langchain.memory import ConversationBufferMemory
+ from langchain.chat_models import ChatOpenAI
+ from langchain.document_loaders import TextLoader
+ from langchain.document_loaders import GitLoader
+ from langchain.llms import OpenAI
+ from langchain.memory import ConversationBufferMemory, ConversationBufferWindowMemory
+ from langchain.vectorstores import Chroma
+ from langchain.embeddings.openai import OpenAIEmbeddings
+ from langchain.prompts import PromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate, AIMessagePromptTemplate, ChatPromptTemplate
+ import datetime
+ import shutil
+
+ os.environ["OPENAI_API_KEY"] = "nothing"
+
+ # Function to load the data from GitHub using langchain, with string type url, string type branch, string type file_filter
+ def loader(url: str, branch: str, file_filter: str):
+     repo_path = "./github_repo"
+     if os.path.exists(repo_path):
+         shutil.rmtree(repo_path)
+
+     loader = GitLoader(
+         clone_url=url,
+         repo_path="./github_repo/",
+         branch=branch,
+         file_filter=lambda file_path: file_path.endswith(tuple(file_filter.split(',')))  # Filter the loaded files by extension, but the whole repo is cloned
+     )
+
+     data = loader.load()
+     return data
+
+
+ # Function to split the data into chunks using recursive character text splitter
+ def split_data(data):
+     splitter = RecursiveCharacterTextSplitter(
+         chunk_size=1000,
+         chunk_overlap=150,
+         length_function=len,  # Function to measure the length of chunks while splitting
+         add_start_index=True  # Include the starting position of each chunk in metadata
+     )
+     chunks = splitter.split_documents(data)
+     return chunks
+
+ # Function to ingest the chunks into a vectorstore of docs
+ def ingest_chunks(chunks):
+     embedding = OpenAIEmbeddings(
+         # deployment="your-embeddings-deployment-name",
+         model="nomic-embed-text",
+         openai_api_base="https://thewise-ollama-server.hf.space",
+         # openai_api_type="azure",
+     )
+     vector_store = DocArrayInMemorySearch.from_documents(chunks, embedding)
+
+     repo_path = "./github_repo"
+     if os.path.exists(repo_path):
+         shutil.rmtree(repo_path)
+
+     return vector_store
+
+ # Retrieval function to get the data from the database and reply to the user
+ def retreival(vector_store, k):
+     # Creating LLM
+     llm = ChatOpenAI(model='codellama', temperature=0, openai_api_base='https://thewise-ollama-server.hf.space', openai_api_key='nothing')
+
+     # Define the system message template
+     # Adding CHAT HISTORY to the System template explicitly because chat history mainly goes to condense the Human question with background (not the template), while the System template goes straight to the LLM chain
+     # Explicitly adding chat history to access previous chats and answer "what is my previous question?"
+     # This also sends the chat history to the LLM model along with the context and question
+     system_template = """You're a code summarisation assistant. Given the following extracted parts of a long document as "CONTEXT" create a final answer.
+ If you don't know the answer, just say that you don't know. Don't try to make up an answer.
+ Only If asked to create a "DIAGRAM" for code use "MERMAID SYNTAX LANGUAGE" in your answer from "CONTEXT" and "CHAT HISTORY" with a short explanation of diagram.
+ CONTEXT: {context}
+ =======
+ CHAT HISTORY: {chat_history}
+ =======
+ FINAL ANSWER:"""
+
+     human_template = """{question}"""
+
+     # ai_template = """
+     # FINAL ANSWER:"""
+
+     # Create the chat prompt templates
+     messages = [
+         SystemMessagePromptTemplate.from_template(system_template),
+         HumanMessagePromptTemplate.from_template(human_template)
+         # AIMessagePromptTemplate.from_template(ai_template)
+     ]
+
+     PROMPT = ChatPromptTemplate.from_messages(messages)
+
+     # Creating memory
+     # memory = ConversationBufferMemory(
+     #     memory_key="chat_history",
+     #     input_key="question",
+     #     output_key="answer",
+     #     return_messages=True)
+
+     memory = ConversationBufferWindowMemory(
+         memory_key="chat_history",
+         input_key="question",
+         output_key="answer",
+         return_messages=True,
+         k=5)
+
+     # Creating the retriever; this could also be a contextual compression retriever
+     retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": k})  # search_type can be "similarity" or "mmr"
+
+     chain = ConversationalRetrievalChain.from_llm(
+         llm=llm,
+         chain_type="stuff",  # chain type can be refine, stuff, map_reduce
+         retriever=retriever,
+         memory=memory,
+         return_source_documents=True,  # With this enabled the output also carries source documents, so the input and output keys must be specified in memory for it to work
+         combine_docs_chain_kwargs=dict({"prompt": PROMPT})
+     )
+
+     return chain
+
+ # Class using all above components to create the QA system
+ class ConversationalResponse:
+     def __init__(self, url, branch, file_filter):
+         self.url = url
+         self.branch = branch
+         self.file_filter = file_filter
+         self.data = loader(self.url, self.branch, self.file_filter)
+         self.chunks = split_data(self.data)
+         self.vector_store = ingest_chunks(self.chunks)
+         self.chain_type = "stuff"
+         self.k = 10
+         self.chain = retreival(self.vector_store, self.k)
+
+     def __call__(self, question):
+         agent = self.chain(question)
+         return agent['answer']
.history/src/main_20240310222818.py ADDED
@@ -0,0 +1,144 @@
+ import os
+ import openai
+ import sys
+ import docarray
+ sys.path.append('../..')
+ from langchain.embeddings.openai import OpenAIEmbeddings
+ from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
+ from langchain.vectorstores import DocArrayInMemorySearch
+ from langchain.document_loaders import TextLoader
+ from langchain.chains import RetrievalQA, ConversationalRetrievalChain
+ from langchain.memory import ConversationBufferMemory
+ from langchain.chat_models import ChatOpenAI
+ from langchain.document_loaders import TextLoader
+ from langchain.document_loaders import GitLoader
+ from langchain.llms import OpenAI
+ from langchain.memory import ConversationBufferMemory, ConversationBufferWindowMemory
+ from langchain.vectorstores import Chroma
+ from langchain.embeddings.openai import OpenAIEmbeddings
+ from langchain.prompts import PromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate, AIMessagePromptTemplate, ChatPromptTemplate
+ import datetime
+ import shutil
+
+ os.environ["OPENAI_API_KEY"] = "nothing"
+
+ # Function to load the data from GitHub using langchain, with string type url, string type branch, string type file_filter
+ def loader(url: str, branch: str, file_filter: str):
+     repo_path = "./github_repo"
+     if os.path.exists(repo_path):
+         shutil.rmtree(repo_path)
+
+     loader = GitLoader(
+         clone_url=url,
+         repo_path="./github_repo/",
+         branch=branch,
+         file_filter=lambda file_path: file_path.endswith(tuple(file_filter.split(',')))  # Filter the loaded files by extension, but the whole repo is cloned
+     )
+
+     data = loader.load()
+     return data
+
+
+ # Function to split the data into chunks using recursive character text splitter
+ def split_data(data):
+     splitter = RecursiveCharacterTextSplitter(
+         chunk_size=1000,
+         chunk_overlap=150,
+         length_function=len,  # Function to measure the length of chunks while splitting
+         add_start_index=True  # Include the starting position of each chunk in metadata
+     )
+     chunks = splitter.split_documents(data)
+     return chunks
+
+ # Function to ingest the chunks into a vectorstore of docs
+ def ingest_chunks(chunks):
+     embedding = OpenAIEmbeddings(
+         model="nomic-embed-text",
+         openai_api_base="https://thewise-ollama-server.hf.space",
+         # openai_api_type="azure",
+     )
+     vector_store = DocArrayInMemorySearch.from_documents(chunks, embedding)
+
+     repo_path = "./github_repo"
+     if os.path.exists(repo_path):
+         shutil.rmtree(repo_path)
+
+     return vector_store
+
+ # Retrieval function to get the data from the database and reply to the user
+ def retreival(vector_store, k):
+     # Creating LLM
+     llm = ChatOpenAI(model='codellama'
+         , openai_api_base='https://thewise-ollama-server.hf.space')
+
+     # Define the system message template
+     # Adding CHAT HISTORY to the System template explicitly because chat history mainly goes to condense the Human question with background (not the template), while the System template goes straight to the LLM chain
+     # Explicitly adding chat history to access previous chats and answer "what is my previous question?"
+     # This also sends the chat history to the LLM model along with the context and question
+     system_template = """You're a code summarisation assistant. Given the following extracted parts of a long document as "CONTEXT" create a final answer.
+ If you don't know the answer, just say that you don't know. Don't try to make up an answer.
+ Only If asked to create a "DIAGRAM" for code use "MERMAID SYNTAX LANGUAGE" in your answer from "CONTEXT" and "CHAT HISTORY" with a short explanation of diagram.
+ CONTEXT: {context}
+ =======
+ CHAT HISTORY: {chat_history}
+ =======
+ FINAL ANSWER:"""
+
+     human_template = """{question}"""
+
+     # ai_template = """
+     # FINAL ANSWER:"""
+
+     # Create the chat prompt templates
+     messages = [
+         SystemMessagePromptTemplate.from_template(system_template),
+         HumanMessagePromptTemplate.from_template(human_template)
+         # AIMessagePromptTemplate.from_template(ai_template)
+     ]
+
+     PROMPT = ChatPromptTemplate.from_messages(messages)
+
+     # Creating memory
+     # memory = ConversationBufferMemory(
+     #     memory_key="chat_history",
+     #     input_key="question",
+     #     output_key="answer",
+     #     return_messages=True)
+
+     memory = ConversationBufferWindowMemory(
+         memory_key="chat_history",
+         input_key="question",
+         output_key="answer",
+         return_messages=True,
+         k=5)
+
+     # Creating the retriever; this could also be a contextual compression retriever
+     retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": k})  # search_type can be "similarity" or "mmr"
+
+     chain = ConversationalRetrievalChain.from_llm(
+         llm=llm,
+         chain_type="stuff",  # chain type can be refine, stuff, map_reduce
+         retriever=retriever,
+         memory=memory,
+         return_source_documents=True,  # With this enabled the output also carries source documents, so the input and output keys must be specified in memory for it to work
+         combine_docs_chain_kwargs=dict({"prompt": PROMPT})
+     )
+
+     return chain
+
+ # Class using all above components to create the QA system
+ class ConversationalResponse:
+     def __init__(self, url, branch, file_filter):
+         self.url = url
+         self.branch = branch
+         self.file_filter = file_filter
+         self.data = loader(self.url, self.branch, self.file_filter)
+         self.chunks = split_data(self.data)
+         self.vector_store = ingest_chunks(self.chunks)
+         self.chain_type = "stuff"
+         self.k = 10
+         self.chain = retreival(self.vector_store, self.k)
+
+     def __call__(self, question):
+         agent = self.chain(question)
+         return agent['answer']
.streamlit/config.toml ADDED
File without changes
src/main.py CHANGED
@@ -20,6 +20,7 @@ from langchain.prompts import PromptTemplate, SystemMessagePromptTemplate, Human
  import datetime
  import shutil
  
+ os.environ["OPENAI_API_KEY"] = "nothing"
  
  # Function to load the data from GitHub using langchain, with string type url, string type branch, string type file_filter
  def loader(url: str, branch: str, file_filter: str):
@@ -52,11 +53,9 @@ def split_data(data):
  # Function to ingest the chunks into a vectorstore of docs
  def ingest_chunks(chunks):
      embedding = OpenAIEmbeddings(
-         # deployment="your-embeddings-deployment-name",
          model="nomic-embed-text",
          openai_api_base="https://thewise-ollama-server.hf.space",
          # openai_api_type="azure",
-         openai_api_key='nothing'
      )
      vector_store = DocArrayInMemorySearch.from_documents(chunks, embedding)
  
@@ -69,7 +68,8 @@ def ingest_chunks(chunks):
  # Retrieval function to get the data from the database and reply to the user
  def retreival(vector_store, k):
      # Creating LLM
-     llm = ChatOpenAI(model='codellama', temperature=0, openai_api_base='https://thewise-ollama-server.hf.space', openai_api_key='nothing')
+     llm = ChatOpenAI(model='codellama'
+         , openai_api_base='https://thewise-ollama-server.hf.space')
  
      # Define the system message template
      # Adding CHAT HISTORY to the System template explicitly because chat history mainly goes to condense the Human question with background (not the template), while the System template goes straight to the LLM chain
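
A minimal usage sketch of the ConversationalResponse class touched by this commit. The import path, repository URL, branch, and file filter below are placeholder assumptions; no API key is passed because the module now sets OPENAI_API_KEY itself.

# Hypothetical usage example: the repo URL, branch, and filter are placeholders.
from src.main import ConversationalResponse  # assumes the project root is on sys.path

qa = ConversationalResponse(
    url="https://github.com/example-user/example-repo",  # placeholder repository to clone
    branch="main",                                        # placeholder branch name
    file_filter=".py,.md"                                 # comma-separated extensions, as loader() expects
)
print(qa("What does this repository do?"))  # returns the chain's 'answer' field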