Rohan Kataria committed
Commit · f143a79
Parent(s): 5d1b722
more changes
.history/.streamlit/config_20240310222951.toml
ADDED
File without changes
.history/.streamlit/config_20240310223008.toml
ADDED
File without changes
.history/src/main_20240310222705.py
ADDED
@@ -0,0 +1,144 @@
import os
import openai
import sys
import docarray
sys.path.append('../..')
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.vectorstores import DocArrayInMemorySearch
from langchain.document_loaders import TextLoader
from langchain.chains import RetrievalQA, ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import TextLoader
from langchain.document_loaders import GitLoader
from langchain.llms import OpenAI
from langchain.memory import ConversationBufferMemory, ConversationBufferWindowMemory
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.prompts import PromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate, AIMessagePromptTemplate, ChatPromptTemplate
import datetime
import shutil

os.environ["OPENAI_API_KEY"] = "nothing"

# Function to load the data from GitHub using langchain, given a string url, string branch and string file_filter
def loader(url: str, branch: str, file_filter: str):
    repo_path = "./github_repo"
    if os.path.exists(repo_path):
        shutil.rmtree(repo_path)

    loader = GitLoader(
        clone_url=url,
        repo_path="./github_repo/",
        branch=branch,
        file_filter=lambda file_path: file_path.endswith(tuple(file_filter.split(',')))  # Filter the loaded documents by extension; the whole repo is still cloned
    )

    data = loader.load()
    return data


# Function to split the data into chunks using the recursive character text splitter
def split_data(data):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=150,
        length_function=len,  # Function to measure the length of chunks while splitting
        add_start_index=True  # Include the starting position of each chunk in metadata
    )
    chunks = splitter.split_documents(data)
    return chunks

# Function to ingest the chunks into a vector store of documents
def ingest_chunks(chunks):
    embedding = OpenAIEmbeddings(
        # deployment="your-embeddings-deployment-name",
        model="nomic-embed-text",
        openai_api_base="https://thewise-ollama-server.hf.space",
        # openai_api_type="azure",
    )
    vector_store = DocArrayInMemorySearch.from_documents(chunks, embedding)

    repo_path = "./github_repo"
    if os.path.exists(repo_path):
        shutil.rmtree(repo_path)

    return vector_store

# Retrieval function to get the data from the database and reply to the user
def retreival(vector_store, k):
    # Creating the LLM
    llm = ChatOpenAI(model='codellama', temperature=0, openai_api_base='https://thewise-ollama-server.hf.space', openai_api_key='nothing')

    # Define the system message template.
    # CHAT HISTORY is added to the system template explicitly: normally chat history is only used to condense the human question with background, but the system template goes straight to the LLM chain.
    # Explicitly adding chat history lets the model access previous turns and answer "what is my previous question?"
    # This also sends the chat history to the LLM along with the context and question.
    system_template = """You're a code summarisation assistant. Given the following extracted parts of a long document as "CONTEXT" create a final answer.
    If you don't know the answer, just say that you don't know. Don't try to make up an answer.
    Only If asked to create a "DIAGRAM" for code use "MERMAID SYNTAX LANGUAGE" in your answer from "CONTEXT" and "CHAT HISTORY" with a short explanation of diagram.
    CONTEXT: {context}
    =======
    CHAT HISTORY: {chat_history}
    =======
    FINAL ANSWER:"""

    human_template = """{question}"""

    # ai_template = """
    # FINAL ANSWER:"""

    # Create the chat prompt templates
    messages = [
        SystemMessagePromptTemplate.from_template(system_template),
        HumanMessagePromptTemplate.from_template(human_template)
        # AIMessagePromptTemplate.from_template(ai_template)
    ]

    PROMPT = ChatPromptTemplate.from_messages(messages)

    # Creating memory
    # memory = ConversationBufferMemory(
    #     memory_key="chat_history",
    #     input_key="question",
    #     output_key="answer",
    #     return_messages=True)

    memory = ConversationBufferWindowMemory(
        memory_key="chat_history",
        input_key="question",
        output_key="answer",
        return_messages=True,
        k=5)

    # Creating the retriever; this could also be a contextual compression retriever
    retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": k})  # search_type can be "similarity" or "mmr"

    chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        chain_type="stuff",  # chain type can be refine, stuff, map_reduce
        retriever=retriever,
        memory=memory,
        return_source_documents=True,  # With this enabled the output gains source documents alongside the answer, so the memory needs explicit input and output keys to work
        combine_docs_chain_kwargs=dict({"prompt": PROMPT})
    )

    return chain

# Class using all the above components to create the QA system
class ConversationalResponse:
    def __init__(self, url, branch, file_filter):
        self.url = url
        self.branch = branch
        self.file_filter = file_filter
        self.data = loader(self.url, self.branch, self.file_filter)
        self.chunks = split_data(self.data)
        self.vector_store = ingest_chunks(self.chunks)
        self.chain_type = "stuff"
        self.k = 10
        self.chain = retreival(self.vector_store, self.k)

    def __call__(self, question):
        agent = self.chain(question)
        return agent['answer']
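A minimal usage sketch of the ConversationalResponse class defined in this snapshot; the repository URL, branch, and extension filter below are placeholder values and the `main` module name is assumed, none of this is part of the commit:

# Hypothetical example (not part of the commit): querying a public repo end to end.
from main import ConversationalResponse  # assumes this file is importable as `main`

qa = ConversationalResponse(
    url="https://github.com/example/example-repo",  # placeholder repository URL
    branch="main",                                  # placeholder branch name
    file_filter=".py,.md",                          # comma-separated extensions passed to GitLoader
)

print(qa("What does this repository do?"))          # __call__ returns the chain's 'answer'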
.history/src/main_20240310222818.py
ADDED
@@ -0,0 +1,144 @@
import os
import openai
import sys
import docarray
sys.path.append('../..')
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.vectorstores import DocArrayInMemorySearch
from langchain.document_loaders import TextLoader
from langchain.chains import RetrievalQA, ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import TextLoader
from langchain.document_loaders import GitLoader
from langchain.llms import OpenAI
from langchain.memory import ConversationBufferMemory, ConversationBufferWindowMemory
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.prompts import PromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate, AIMessagePromptTemplate, ChatPromptTemplate
import datetime
import shutil

os.environ["OPENAI_API_KEY"] = "nothing"

# Function to load the data from GitHub using langchain, given a string url, string branch and string file_filter
def loader(url: str, branch: str, file_filter: str):
    repo_path = "./github_repo"
    if os.path.exists(repo_path):
        shutil.rmtree(repo_path)

    loader = GitLoader(
        clone_url=url,
        repo_path="./github_repo/",
        branch=branch,
        file_filter=lambda file_path: file_path.endswith(tuple(file_filter.split(',')))  # Filter the loaded documents by extension; the whole repo is still cloned
    )

    data = loader.load()
    return data


# Function to split the data into chunks using the recursive character text splitter
def split_data(data):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=150,
        length_function=len,  # Function to measure the length of chunks while splitting
        add_start_index=True  # Include the starting position of each chunk in metadata
    )
    chunks = splitter.split_documents(data)
    return chunks

# Function to ingest the chunks into a vector store of documents
def ingest_chunks(chunks):
    embedding = OpenAIEmbeddings(
        model="nomic-embed-text",
        openai_api_base="https://thewise-ollama-server.hf.space",
        # openai_api_type="azure",
    )
    vector_store = DocArrayInMemorySearch.from_documents(chunks, embedding)

    repo_path = "./github_repo"
    if os.path.exists(repo_path):
        shutil.rmtree(repo_path)

    return vector_store

# Retrieval function to get the data from the database and reply to the user
def retreival(vector_store, k):
    # Creating the LLM
    llm = ChatOpenAI(model='codellama'
                     , openai_api_base='https://thewise-ollama-server.hf.space')

    # Define the system message template.
    # CHAT HISTORY is added to the system template explicitly: normally chat history is only used to condense the human question with background, but the system template goes straight to the LLM chain.
    # Explicitly adding chat history lets the model access previous turns and answer "what is my previous question?"
    # This also sends the chat history to the LLM along with the context and question.
    system_template = """You're a code summarisation assistant. Given the following extracted parts of a long document as "CONTEXT" create a final answer.
    If you don't know the answer, just say that you don't know. Don't try to make up an answer.
    Only If asked to create a "DIAGRAM" for code use "MERMAID SYNTAX LANGUAGE" in your answer from "CONTEXT" and "CHAT HISTORY" with a short explanation of diagram.
    CONTEXT: {context}
    =======
    CHAT HISTORY: {chat_history}
    =======
    FINAL ANSWER:"""

    human_template = """{question}"""

    # ai_template = """
    # FINAL ANSWER:"""

    # Create the chat prompt templates
    messages = [
        SystemMessagePromptTemplate.from_template(system_template),
        HumanMessagePromptTemplate.from_template(human_template)
        # AIMessagePromptTemplate.from_template(ai_template)
    ]

    PROMPT = ChatPromptTemplate.from_messages(messages)

    # Creating memory
    # memory = ConversationBufferMemory(
    #     memory_key="chat_history",
    #     input_key="question",
    #     output_key="answer",
    #     return_messages=True)

    memory = ConversationBufferWindowMemory(
        memory_key="chat_history",
        input_key="question",
        output_key="answer",
        return_messages=True,
        k=5)

    # Creating the retriever; this could also be a contextual compression retriever
    retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": k})  # search_type can be "similarity" or "mmr"

    chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        chain_type="stuff",  # chain type can be refine, stuff, map_reduce
        retriever=retriever,
        memory=memory,
        return_source_documents=True,  # With this enabled the output gains source documents alongside the answer, so the memory needs explicit input and output keys to work
        combine_docs_chain_kwargs=dict({"prompt": PROMPT})
    )

    return chain

# Class using all the above components to create the QA system
class ConversationalResponse:
    def __init__(self, url, branch, file_filter):
        self.url = url
        self.branch = branch
        self.file_filter = file_filter
        self.data = loader(self.url, self.branch, self.file_filter)
        self.chunks = split_data(self.data)
        self.vector_store = ingest_chunks(self.chunks)
        self.chain_type = "stuff"
        self.k = 10
        self.chain = retreival(self.vector_store, self.k)

    def __call__(self, question):
        agent = self.chain(question)
        return agent['answer']
.streamlit/config.toml
ADDED
File without changes
src/main.py
CHANGED
@@ -20,6 +20,7 @@ from langchain.prompts import PromptTemplate, SystemMessagePromptTemplate, Human
 import datetime
 import shutil
 
+os.environ["OPENAI_API_KEY"] = "nothing"
 
 # Function to load the data from GitHub using langchain, given a string url, string branch and string file_filter
 def loader(url: str, branch: str, file_filter: str):
@@ -52,11 +53,9 @@ def split_data(data):
 # Function to ingest the chunks into a vector store of documents
 def ingest_chunks(chunks):
     embedding = OpenAIEmbeddings(
-        # deployment="your-embeddings-deployment-name",
         model="nomic-embed-text",
         openai_api_base="https://thewise-ollama-server.hf.space",
         # openai_api_type="azure",
-        openai_api_key='nothing'
     )
     vector_store = DocArrayInMemorySearch.from_documents(chunks, embedding)
 
@@ -69,7 +68,8 @@ def ingest_chunks(chunks):
 # Retrieval function to get the data from the database and reply to the user
 def retreival(vector_store, k):
     # Creating the LLM
-    llm = ChatOpenAI(model='codellama'
+    llm = ChatOpenAI(model='codellama'
+                     , openai_api_base='https://thewise-ollama-server.hf.space')
 
     # Define the system message template.
     # CHAT HISTORY is added to the system template explicitly: normally chat history is only used to condense the human question with background, but the system template goes straight to the LLM chain.
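For orientation, a hedged sketch of what the client wiring looks like after this change: the dummy key now comes from the OPENAI_API_KEY environment variable set at module import, and both the chat model and the embeddings client point at the same Ollama-backed, OpenAI-compatible endpoint. Model names and the base URL are taken from the diff; everything else (variable names, layout) is illustrative only.

# Sketch only: post-commit configuration, assuming the langchain 0.0.x APIs used above.
import os
from langchain.chat_models import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings

os.environ["OPENAI_API_KEY"] = "nothing"  # dummy key; the Ollama-backed server does not check it

OLLAMA_BASE = "https://thewise-ollama-server.hf.space"  # OpenAI-compatible endpoint from the diff

llm = ChatOpenAI(model="codellama", openai_api_base=OLLAMA_BASE)
embedding = OpenAIEmbeddings(model="nomic-embed-text", openai_api_base=OLLAMA_BASE)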