Upload 4 files
- .env +3 -0
- app.py +123 -0
- constants.py +142 -0
- requirements.txt +17 -0
.env
ADDED
@@ -0,0 +1,3 @@
+HUGGINGFACE_API_TOKEN='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'
+REPLICATE_API_TOKEN='r8_f0yg1vSn32AAGDnqV6qErGJZeCcFFl30CJ46E'  # --> Org gmail account
+# REPLICATE_API_TOKEN='r8_L3BQN0zjnB1KwwkPjZD0RSLVrj9umPv0oRjFY'  # trial token, not working
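How these values are consumed: app.py calls load_dotenv() at import time, which copies the KEY=VALUE pairs above into os.environ, and langchain's Replicate wrapper then picks up REPLICATE_API_TOKEN from the environment, so the token is never passed explicitly. A minimal sketch of that pattern (the RuntimeError guard is an illustrative addition, not code from this commit):

    import os
    from dotenv import load_dotenv

    load_dotenv()  # copy .env entries into the process environment
    if os.environ.get("REPLICATE_API_TOKEN") is None:
        raise RuntimeError("REPLICATE_API_TOKEN is missing; check .env")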
app.py
ADDED
@@ -0,0 +1,123 @@
+import streamlit as st
+from streamlit_chat import message
+from langchain.chains import ConversationalRetrievalChain
+from langchain.embeddings import HuggingFaceEmbeddings
+from langchain.llms import CTransformers
+from langchain.llms import Replicate
+from langchain.text_splitter import CharacterTextSplitter
+from langchain.vectorstores import FAISS
+from langchain.memory import ConversationBufferMemory
+from langchain.document_loaders import PyPDFLoader, UnstructuredFileLoader
+from langchain.document_loaders import TextLoader
+from langchain.document_loaders import Docx2txtLoader
+from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
+from langchain.text_splitter import Language, RecursiveCharacterTextSplitter
+import os
+from dotenv import load_dotenv
+import tempfile
+from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
+from constants import (
+    CHROMA_SETTINGS,
+    DOCUMENT_MAP,
+    EMBEDDING_MODEL_NAME,
+    INGEST_THREADS,
+    PERSIST_DIRECTORY,
+    SOURCE_DIRECTORY,
+)
+from langchain.docstore.document import Document
+load_dotenv()
+
+
+def initialize_session_state():
+    if 'history' not in st.session_state:
+        st.session_state['history'] = []
+
+    if 'generated' not in st.session_state:
+        st.session_state['generated'] = ["Hello! Ask me anything about 🤗"]
+
+    if 'past' not in st.session_state:
+        st.session_state['past'] = ["Hey! 👋"]
+
+def conversation_chat(query, chain, history):
+    result = chain({"question": query, "chat_history": history})
+    history.append((query, result["answer"]))
+    return result["answer"]
+
+def display_chat_history(chain):
+    reply_container = st.container()
+    container = st.container()
+
+    with container:
+        with st.form(key='my_form', clear_on_submit=True):
+            user_input = st.text_input("Question:", placeholder="Ask about your Documents", key='input')
+            submit_button = st.form_submit_button(label='Send')
+
+        if submit_button and user_input:
+            with st.spinner('Generating response...'):
+                output = conversation_chat(user_input, chain, st.session_state['history'])
+
+            st.session_state['past'].append(user_input)
+            st.session_state['generated'].append(output)
+
+    if st.session_state['generated']:
+        with reply_container:
+            for i in range(len(st.session_state['generated'])):
+                message(st.session_state["past"][i], is_user=True, key=str(i) + '_user', avatar_style="thumbs")
+                message(st.session_state["generated"][i], key=str(i), avatar_style="fun-emoji")
+
+
+def create_conversational_chain(vector_store):
+    load_dotenv()
+    llm = Replicate(
+        streaming=True,
+        # model="replicate/llama-2-70b-chat:58d078176e02c219e11eb4da5a02a7830a283b14cf8f94537af893ccff5ee781",
+        model="meta/llama-2-7b-chat:8e6975e5ed6174911a6ff3d60540dfd4844201974602551e10e9e87ab143d81e",
+        callbacks=[StreamingStdOutCallbackHandler()],
+        input={"temperature": 0.01, "max_length": 500, "top_p": 1})
+    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
+
+    chain = ConversationalRetrievalChain.from_llm(llm=llm, chain_type='stuff',
+                                                  retriever=vector_store.as_retriever(search_kwargs={"k": 2}),
+                                                  memory=memory)
+    return chain
+
+file_paths = [
+    './SOURCE_DOCUMENTS/Freedom of Information and Protection of Privacy Act, R.S.O. 1990, c. F.31[462] - Copy.pdf',
+    './SOURCE_DOCUMENTS/Highway Traffic Act, R.S.O. 1990, c. H.8[465] - Copy.pdf',
+    './SOURCE_DOCUMENTS/Narcotics Safety and Awareness Act, 2010, S.O. 2010, c. 22[463].pdf',
+    './SOURCE_DOCUMENTS/Nutrient Management Act, 2002, S.O. 2002, c. 4[464].pdf',
+    # Add more file paths as needed
+]
+
+def main():
+    # load_dotenv()
+    os.environ.get("REPLICATE_API_TOKEN")  # no-op lookup; Replicate reads the token from the environment populated by load_dotenv()
+    # Initialize session state
+    initialize_session_state()
+    st.title("Multi-Docs ChatBot using llama-2-7b :books:")
+    # loader = UnstructuredFileLoader('./SOURCE_DOCUMENTS/Freedom of Information and Protection of Privacy Act, R.S.O. 1990, c. F.31[462] - Copy.pdf')
+    # documents = loader.load()
+    documents = []
+    for file_path in file_paths:
+        loader = UnstructuredFileLoader(file_path)
+        loaded_doc = loader.load()  # load() returns a list of Documents
+        documents.extend(loaded_doc)
+
+    text_splitter = CharacterTextSplitter(separator='\n',
+                                          chunk_size=1500,
+                                          chunk_overlap=300)
+    text_chunks = text_splitter.split_documents(documents)
+
+    embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2', model_kwargs={'device': 'cpu'})
+
+    vector_store = FAISS.from_documents(text_chunks, embeddings)
+
+    # Create the chain object
+    chain = create_conversational_chain(vector_store)
+
+    # Display chat history
+    display_chat_history(chain)
+
+
+if __name__ == "__main__":
+    main()
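A caveat on main() as written: Streamlit re-runs the whole script on every widget interaction, so the four PDFs are re-loaded, re-split, and re-embedded after every chat message. One possible refactor (not part of this commit; assumes Streamlit 1.18+ for st.cache_resource) builds the FAISS index once per process:

    @st.cache_resource
    def build_vector_store(paths: tuple):
        # ingest once: load, split, embed, index
        documents = []
        for file_path in paths:
            documents.extend(UnstructuredFileLoader(file_path).load())
        chunks = CharacterTextSplitter(separator='\n', chunk_size=1500,
                                       chunk_overlap=300).split_documents(documents)
        embeddings = HuggingFaceEmbeddings(
            model_name='sentence-transformers/all-MiniLM-L6-v2',
            model_kwargs={'device': 'cpu'})
        return FAISS.from_documents(chunks, embeddings)

    # in main(): vector_store = build_vector_store(tuple(file_paths))

The tuple argument keeps the cache key hashable across reruns.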
constants.py
ADDED
@@ -0,0 +1,142 @@
+import os
+
+# from dotenv import load_dotenv
+from chromadb.config import Settings
+
+# https://python.langchain.com/en/latest/modules/indexes/document_loaders/examples/excel.html?highlight=xlsx#microsoft-excel
+from langchain.document_loaders import CSVLoader, PDFMinerLoader, TextLoader, UnstructuredExcelLoader, Docx2txtLoader
+
+# load_dotenv()
+ROOT_DIRECTORY = os.path.dirname(os.path.realpath(__file__))
+
+# Define the folder for storing database
+SOURCE_DIRECTORY = f"{ROOT_DIRECTORY}/SOURCE_DOCUMENTS"
+
+PERSIST_DIRECTORY = f"{ROOT_DIRECTORY}/DB"
+
+# Can be changed to a specific number
+INGEST_THREADS = os.cpu_count() or 8
+
+# Define the Chroma settings
+CHROMA_SETTINGS = Settings(
+    anonymized_telemetry=False,
+    is_persistent=True,
+)
+
+
+# https://python.langchain.com/en/latest/_modules/langchain/document_loaders/excel.html#UnstructuredExcelLoader
+DOCUMENT_MAP = {
+    ".txt": TextLoader,
+    ".md": TextLoader,
+    ".py": TextLoader,
+    ".pdf": PDFMinerLoader,
+    ".csv": CSVLoader,
+    ".xls": UnstructuredExcelLoader,
+    ".xlsx": UnstructuredExcelLoader,
+    ".docx": Docx2txtLoader,
+    ".doc": Docx2txtLoader,
+}
+
+# Default Instructor Model
+EMBEDDING_MODEL_NAME = "hkunlp/instructor-large"  # Uses 1.5 GB of VRAM (High Accuracy with lower VRAM usage)
+
+####
+#### OTHER EMBEDDING MODEL OPTIONS
+####
+
+# EMBEDDING_MODEL_NAME = "hkunlp/instructor-xl"  # Uses 5 GB of VRAM (Most Accurate of all models)
+# EMBEDDING_MODEL_NAME = "intfloat/e5-large-v2"  # Uses 1.5 GB of VRAM (A little less accurate than instructor-large)
+# EMBEDDING_MODEL_NAME = "intfloat/e5-base-v2"  # Uses 0.5 GB of VRAM (A good model for lower VRAM GPUs)
+# EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"  # Uses 0.2 GB of VRAM (Less accurate but fastest - only requires 150 MB of VRAM)
+
+####
+#### MULTILINGUAL EMBEDDING MODELS
+####
+
+# EMBEDDING_MODEL_NAME = "intfloat/multilingual-e5-large"  # Uses 2.5 GB of VRAM
+# EMBEDDING_MODEL_NAME = "intfloat/multilingual-e5-base"  # Uses 1.2 GB of VRAM
+
+
+#### SELECT AN OPEN SOURCE LLM (LARGE LANGUAGE MODEL)
+# Select the Model ID and model_basename
+# load the LLM for generating Natural Language responses
+
+#### GPU VRAM Memory required for LLM Models (ONLY) by Billion Parameter value (B Model)
+#### Does not include VRAM used by Embedding Models - which use an additional 2GB-7GB of VRAM depending on the model.
+####
+####  (B Model)   (float32)   (float16)   (GPTQ 8bit)       (GPTQ 4bit)
+####     7b        28 GB       14 GB      7 GB - 9 GB       3.5 GB - 5 GB
+####    13b        52 GB       26 GB      13 GB - 15 GB     6.5 GB - 8 GB
+####    32b       130 GB       65 GB      32.5 GB - 35 GB   16.25 GB - 19 GB
+####    65b       260.8 GB    130.4 GB    65.2 GB - 67 GB   32.6 GB - 35 GB
+
+MODEL_ID = "TheBloke/Llama-2-7B-Chat-GGML"
+MODEL_BASENAME = "llama-2-7b-chat.ggmlv3.q4_0.bin"
+
+####
+#### (FOR HF MODELS)
+####
+
+# MODEL_ID = "TheBloke/vicuna-7B-1.1-HF"
+# MODEL_BASENAME = None
+# MODEL_ID = "TheBloke/Wizard-Vicuna-7B-Uncensored-HF"
+# MODEL_ID = "TheBloke/guanaco-7B-HF"
+# MODEL_ID = 'NousResearch/Nous-Hermes-13b'  # Requires ~ 23GB VRAM. Using STransformers
+#                                            # alongside will 100% create OOM on 24GB cards.
+# llm = load_model(device_type, model_id=model_id)
+
+####
+#### (FOR GPTQ QUANTIZED) Select a llm model based on your GPU and VRAM GB. Does not include Embedding Models VRAM usage.
+####
+
+##### 48GB VRAM Graphics Cards (RTX 6000, RTX A6000 and other 48GB VRAM GPUs) #####
+
+### 65b GPTQ LLM Models for 48GB GPUs (*** With best embedding model: hkunlp/instructor-xl ***)
+# model_id = "TheBloke/guanaco-65B-GPTQ"
+# model_basename = "model.safetensors"
+# model_id = "TheBloke/Airoboros-65B-GPT4-2.0-GPTQ"
+# model_basename = "model.safetensors"
+# model_id = "TheBloke/gpt4-alpaca-lora_mlp-65B-GPTQ"
+# model_basename = "model.safetensors"
+# model_id = "TheBloke/Upstage-Llama1-65B-Instruct-GPTQ"
+# model_basename = "model.safetensors"
+
+##### 24GB VRAM Graphics Cards (RTX 3090 - RTX 4090 (35% Faster) - RTX A5000 - RTX A5500) #####
+
+### 13b GPTQ Models for 24GB GPUs (*** With best embedding model: hkunlp/instructor-xl ***)
+# model_id = "TheBloke/Wizard-Vicuna-13B-Uncensored-GPTQ"
+# model_basename = "Wizard-Vicuna-13B-Uncensored-GPTQ-4bit-128g.compat.no-act-order.safetensors"
+# model_id = "TheBloke/vicuna-13B-v1.5-GPTQ"
+# model_basename = "model.safetensors"
+# model_id = "TheBloke/Nous-Hermes-13B-GPTQ"
+# model_basename = "nous-hermes-13b-GPTQ-4bit-128g.no-act.order"
+# model_id = "TheBloke/WizardLM-13B-V1.2-GPTQ"
+# model_basename = "gptq_model-4bit-128g.safetensors"
+
+### 30b GPTQ Models for 24GB GPUs (*** Requires using intfloat/e5-base-v2 instead of hkunlp/instructor-large as embedding model ***)
+# model_id = "TheBloke/Wizard-Vicuna-30B-Uncensored-GPTQ"
+# model_basename = "Wizard-Vicuna-30B-Uncensored-GPTQ-4bit--1g.act.order.safetensors"
+# model_id = "TheBloke/WizardLM-30B-Uncensored-GPTQ"
+# model_basename = "WizardLM-30B-Uncensored-GPTQ-4bit.act-order.safetensors"
+
+##### 8-10GB VRAM Graphics Cards (RTX 3080 - RTX 3080 Ti - RTX 3070 Ti - 3060 Ti - RTX 2000 Series, Quadro RTX 4000, 5000, 6000) #####
+### (*** Requires using intfloat/e5-small-v2 instead of hkunlp/instructor-large as embedding model ***)
+
+### 7b GPTQ Models for 8GB GPUs
+# model_id = "TheBloke/Wizard-Vicuna-7B-Uncensored-GPTQ"
+# model_basename = "Wizard-Vicuna-7B-Uncensored-GPTQ-4bit-128g.no-act.order.safetensors"
+# model_id = "TheBloke/WizardLM-7B-uncensored-GPTQ"
+# model_basename = "WizardLM-7B-uncensored-GPTQ-4bit-128g.compat.no-act-order.safetensors"
+# model_id = "TheBloke/wizardLM-7B-GPTQ"
+# model_basename = "wizardLM-7B-GPTQ-4bit.compat.no-act-order.safetensors"
+
+####
+#### (FOR GGML) (Quantized cpu+gpu+mps) models - check if they support llama.cpp
+####
+
+# MODEL_ID = "TheBloke/wizard-vicuna-13B-GGML"
+# MODEL_BASENAME = "wizard-vicuna-13B.ggmlv3.q4_0.bin"
+# MODEL_BASENAME = "wizard-vicuna-13B.ggmlv3.q6_K.bin"
+# MODEL_BASENAME = "wizard-vicuna-13B.ggmlv3.q2_K.bin"
+# MODEL_ID = "TheBloke/orca_mini_3B-GGML"
+# MODEL_BASENAME = "orca-mini-3b.ggmlv3.q4_0.bin"
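Note that app.py never imports DOCUMENT_MAP; it routes every file through UnstructuredFileLoader instead. The map is laid out for extension-based loader dispatch, which would look roughly like the sketch below (load_single_document is a hypothetical helper name, not defined anywhere in this commit):

    import os
    from constants import DOCUMENT_MAP

    def load_single_document(file_path: str):
        # pick the loader class registered for this file extension
        ext = os.path.splitext(file_path)[1].lower()
        loader_class = DOCUMENT_MAP.get(ext)
        if loader_class is None:
            raise ValueError(f"No loader registered for '{ext}' files")
        return loader_class(file_path).load()  # a list of Documents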
requirements.txt
ADDED
@@ -0,0 +1,17 @@
+langchain
+torch
+accelerate
+sentence_transformers
+streamlit_chat
+streamlit
+faiss-cpu
+tiktoken
+ctransformers
+huggingface-hub
+pypdf
+pypdf2
+python-dotenv
+replicate
+docx2txt
+chromadb       # imported by constants.py (chromadb.config.Settings)
+unstructured   # backs UnstructuredFileLoader in app.py
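To run the app locally, install the dependencies with "pip install -r requirements.txt" and start it with "streamlit run app.py"; on a Hugging Face Space using the Streamlit SDK, this requirements file is installed automatically at build time.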