Spaces:

jaynopponep
/

LinuxGPT

Runtime error

App Files Files Community

jaynopponep committed on Dec 16, 2024

Commit

fd5f784

1 Parent(s): 3197ae2

adding non large files

Browse files

Files changed (7) hide show

.gitignore +4 -0
Dockerfile +13 -0
app.py +55 -0
init_db.py +85 -0
llm.py +106 -0
requirements.txt +9 -0
setup.py +23 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,4 @@

+.venv
+.env
+.idea
+data

Dockerfile ADDED Viewed

	@@ -0,0 +1,13 @@

+FROM python:3.12-slim
+WORKDIR /app
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+COPY . .
+EXPOSE 8501
+ENV STREAMLIT_BROWSER_GATHERUSAGESTATS=false
+CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.enableCORS=false"]

app.py ADDED Viewed

	@@ -0,0 +1,55 @@

+import streamlit as st
+from llm import build_rag_chain
+from setup import setup
+st.set_page_config(layout="wide")
+st.title("LinuxGPT")
+l_col, r_col = st.columns((3, 1))
+if "trigger" not in st.session_state:
+    st.session_state["trigger"] = False
+def on_enter():
+    st.session_state["trigger"] = True
+with r_col:
+    submit_button, openai_models = setup()
+# chat input goes here:
+with l_col:
+    user_question = st.text_area(
+        "Ask about Linux fundamentals",
+        on_change=on_enter,
+    )
+    # BELOW IS TEMPORARY, JUST FOR DEMOS!
+    api_key = st.secrets["api_key"]
+    if (submit_button or st.session_state["trigger"]) and api_key and user_question:
+        rag_chain = build_rag_chain(api_key=api_key)
+        formatted_history = [
+            {"role": "user", "content": item["question"]} if idx % 2 == 0
+            else {"role": "assistant", "content": item["answer"]}
+            for idx, item in enumerate(st.session_state.get("history", []))
+        ]
+        with st.spinner("Thinking..."):
+            # invoke answer
+            result = rag_chain.invoke({"input": user_question, "chat_history": formatted_history})
+            answer = result['answer']
+            st.header("LinuxGPT says:")
+            st.write(answer)
+            # add to chat history automatically
+            if 'history' not in st.session_state:
+                st.session_state['history'] = []
+            st.session_state['history'].append({"question": user_question, "answer": answer})
+    else:
+        st.write("Please provide an input. No API key is needed for demoing")
+# clear history
+if st.button("Clear History"):
+    st.session_state['history'] = []
+    st.write("Chat history cleared")

init_db.py ADDED Viewed

	@@ -0,0 +1,85 @@

+from langchain_community.document_loaders import PyPDFLoader, PyPDFDirectoryLoader
+from langchain_chroma import Chroma
+from langchain.schema import Document
+from langchain_openai import OpenAIEmbeddings
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from dotenv import load_dotenv
+import os
+import shutil
+load_dotenv()
+OPEN_AI_KEY = os.getenv('OPEN_AI_KEY')
+CHROMA_PATH = "chroma"
+DATA_PATH = "data/"
+TEST_PATH = "data/theory_of_computation.pdf"
+embed = OpenAIEmbeddings(
+    api_key=OPEN_AI_KEY,
+    model="text-embedding-3-large"
+)
+def main():
+    generate_data_store()
+    # print(load_documents())
+def generate_data_store():
+    documents = load_documents()
+    chunks = split_text(documents)
+    save_to_chroma(chunks)
+def load_documents():
+    loader = PyPDFDirectoryLoader(DATA_PATH)
+    docs = loader.load()
+    print(docs[0].metadata)
+    return docs
+    # loader = PyPDFLoader(TEST_PATH)
+    # docs = []
+    # docs_lazy = loader.load()
+    # for doc in docs_lazy:
+    #     docs.append(doc)
+    # return docs_lazy
+def split_text(documents: list[Document]):
+    # chunk_size = 1000,
+    # chunk_overlap = 200,
+    # length_function = len,
+    # add_start_index = True,
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=1100,
+        chunk_overlap=100,
+        length_function=len,
+    )
+    chunks = text_splitter.split_documents(documents)
+    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")
+    document = chunks[10]
+    print(document.page_content)
+    print(document.metadata)
+    return chunks
+def save_to_chroma(chunks: list[Document]):
+    if os.path.exists(CHROMA_PATH):  # clear out the DB first
+        shutil.rmtree(CHROMA_PATH)
+    db = Chroma(
+        collection_name="linux_funds",
+        embedding_function=embed,
+        persist_directory=CHROMA_PATH
+    )
+    # below breaks text & metadata down to Chroma vector store
+    texts = [chunk.page_content for chunk in chunks]
+    metadatas = [chunk.metadata for chunk in chunks]
+    db.add_texts(texts=texts, metadatas=metadatas)
+    print(f"Saved {len(chunks)} chunks to CHROMA PATH {CHROMA_PATH}.")
+# Run the ingestion pipeline only when this file is executed as a script.
+if __name__ == "__main__":
+    main()

llm.py ADDED Viewed

	@@ -0,0 +1,106 @@

+from langchain_chroma import Chroma
+from langchain_openai import OpenAIEmbeddings, ChatOpenAI
+from langchain_core.messages import HumanMessage, SystemMessage
+from langchain_community.vectorstores import FAISS
+from langchain.chains import create_history_aware_retriever
+from langchain.chains import create_retrieval_chain
+from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
+from langchain.chains.combine_documents import create_stuff_documents_chain
+from langchain.chains import create_retrieval_chain
+from dotenv import load_dotenv
+import os
+CHROMA_PATH = "chroma"
+load_dotenv()
+API_KEY = os.getenv("OPEN_AI_KEY")
+def build_rag_chain(api_key):
+    embed = OpenAIEmbeddings(
+        api_key=api_key,
+        model="text-embedding-3-large"
+    )
+    db = Chroma(
+            collection_name="linux_funds",
+            embedding_function=embed,
+            persist_directory=CHROMA_PATH
+        )
+    retriever = db.as_retriever(
+        search_type="similarity_score_threshold",
+        search_kwargs={"k": 4, "score_threshold": 0.3},
+    )
+    model = ChatOpenAI(api_key=api_key, model="gpt-4o")
+    # docs = retriever.invoke(test_query)
+    # print("\n--- RELEVANT DOCUMENTS ---")
+    # for i, doc in enumerate(docs, 1):
+    #     print(f"Document {i}:\n{doc.page_content}\n")
+    #     if doc.metadata:
+    #         print(f"Source: {doc.metadata.get('source', 'Unknown')}\n")
+    context = (
+        "Given a chat history and the latest user question "
+        "which might reference context in the chat history, "
+        "formulate a standalone question which can be understood "
+        "without the chat history. Do NOT answer the question, just "
+        "reformulate it if needed and otherwise return it as is."
+    )
+    context_with_history = ChatPromptTemplate(
+        [
+            ("system", context),
+            MessagesPlaceholder("chat_history"),
+            ("human", "{input}"),
+        ]
+    )
+    history_aware_retriever = create_history_aware_retriever(
+        model, retriever, context_with_history
+    )
+    main_query = (
+        "You are an assistant for question-answering tasks. Use"
+        "the following pieces of retrieved context to answer the "
+        "question. If you don't know the answer, just say "
+        "you don't know. Use 10 sentences maximum and keep the answer "
+        "concise. You will most likely have to write bash scripts, so make"
+        " this presentable on HuggingFace in markdown if needed."
+        "\n\n"
+        "{context}"
+    )
+    prompt = ChatPromptTemplate(
+        [
+            ("system", main_query),
+            MessagesPlaceholder("chat_history"),
+            ("human", "{input}"),
+        ]
+    )
+    qna_chain = create_stuff_documents_chain(model, prompt)
+    rag_chain = create_retrieval_chain(history_aware_retriever, qna_chain)
+    return rag_chain
+def chat():
+    print("Start asking about the Theory of Computation. Type 'exit' to end the conversation.")
+    chat_history = []
+    while True:
+        query = input("You: ")
+        if query.lower() == "exit":
+            break
+        rag_chain = build_rag_chain(API_KEY)
+        result = rag_chain.invoke({"input": query, "chat_history": chat_history})
+        print(f"AI: {result['answer']}")
+        chat_history.append(HumanMessage(content=query))
+        chat_history.append(SystemMessage(content=result["answer"]))
+# ABOVE IS FOR LOCAL TESTING ONLY ^ ONLY KEEPING IT FOR FUTURE USE
+    # messages = [
+    #     SystemMessage(content="You are a helpful assistant."),
+    #     HumanMessage(content=query_input),
+    # ]
+    #
+    # result = model.invoke(messages)
+    #
+    # print("\n--- Generated Response ---")
+    # print("Result:")
+    # print(result)
+    # print("Content only:")
+    # print(result.content)
+# Start the console chat only when this file is executed as a script.
+if __name__ == "__main__":
+    chat()

requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+langchain~=0.3.7
+streamlit~=1.39.0
+langchain==0.3.7
+langchain-core==0.3.15
+langchain-chroma==0.1.4
+langchain-community==0.3.5
+langchain-openai==0.2.5
+python-dotenv~=1.0.1
+pypdf==5.1.0

setup.py ADDED Viewed

	@@ -0,0 +1,23 @@

+import streamlit as st
+def setup():
+    submit_button = st.button("Generate a response")
+    # "Enter your OpenAPI key above (make sure there is money in it). Learn more [here](https://platform.openai.com/api-keys)",
+    openai_models: str = st.selectbox(
+        label="Choose your OpenAI model:",
+        options=[
+            "gpt-3.5-turbo",
+            "gpt-4-turbo"
+        ]
+    )
+    st.subheader("LinuxGPT")
+    st.write(
+        "LinuxGPT is currently trained on the Linux Fundamentals textbook, and course materials will be added soon."
+    )
+    st.subheader("Examples for Demoing")
+    st.write("")
+    return (
+        submit_button,
+        openai_models,
+    )