jaynopponep committed on
Commit
fd5f784
·
1 Parent(s): 3197ae2

adding non large files

Browse files
Files changed (7) hide show
  1. .gitignore +4 -0
  2. Dockerfile +13 -0
  3. app.py +55 -0
  4. init_db.py +85 -0
  5. llm.py +106 -0
  6. requirements.txt +9 -0
  7. setup.py +23 -0
.gitignore ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ .venv
2
+ .env
3
+ .idea
4
+ data
Dockerfile ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.12-slim
2
+
3
+ WORKDIR /app
4
+
5
+ COPY requirements.txt .
6
+ RUN pip install --no-cache-dir -r requirements.txt
7
+
8
+ COPY . .
9
+ EXPOSE 8501
10
+
11
+ ENV STREAMLIT_BROWSER_GATHERUSAGESTATS=false
12
+
13
+ CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.enableCORS=false"]
app.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Streamlit front end for LinuxGPT: take a question, show the RAG answer."""
import streamlit as st
from llm import build_rag_chain
from setup import setup

st.set_page_config(layout="wide")

st.title("LinuxGPT")

l_col, r_col = st.columns((3, 1))

# "trigger" records that the text area content changed, so a question can be
# submitted without pressing the button.
if "trigger" not in st.session_state:
    st.session_state["trigger"] = False
if "history" not in st.session_state:
    st.session_state["history"] = []


def on_enter():
    """Flag the question as submitted when the text area content changes."""
    st.session_state["trigger"] = True


with r_col:
    # NOTE: the selected model is not used yet; build_rag_chain is called
    # with the API key only.
    submit_button, openai_models = setup()

# chat input goes here:
with l_col:
    user_question = st.text_area(
        "Ask about Linux fundamentals",
        on_change=on_enter,
    )

    # Consume the flag so it fires once per edit. Left set, every later rerun
    # (e.g. clicking "Clear History") would re-ask the same question and
    # append a duplicate turn to the history.
    triggered = st.session_state["trigger"]
    st.session_state["trigger"] = False

    # BELOW IS TEMPORARY, JUST FOR DEMOS!
    api_key = st.secrets["api_key"]
    if (submit_button or triggered) and api_key and user_question:
        rag_chain = build_rag_chain(api_key=api_key)

        # Each stored turn holds both the question and the answer, so it
        # expands into two chat messages: user first, then assistant.
        formatted_history = []
        for turn in st.session_state["history"]:
            formatted_history.append({"role": "user", "content": turn["question"]})
            formatted_history.append({"role": "assistant", "content": turn["answer"]})

        with st.spinner("Thinking..."):
            # invoke answer
            result = rag_chain.invoke(
                {"input": user_question, "chat_history": formatted_history}
            )
            answer = result['answer']

        st.header("LinuxGPT says:")
        st.write(answer)

        # add to chat history automatically
        st.session_state['history'].append(
            {"question": user_question, "answer": answer}
        )

    else:
        st.write("Please provide an input. No API key is needed for demoing")

    # clear history
    if st.button("Clear History"):
        st.session_state['history'] = []
        st.write("Chat history cleared")
init_db.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_community.document_loaders import PyPDFLoader, PyPDFDirectoryLoader
2
+ from langchain_chroma import Chroma
3
+ from langchain.schema import Document
4
+ from langchain_openai import OpenAIEmbeddings
5
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
6
+ from dotenv import load_dotenv
7
+ import os
8
+ import shutil
9
+
10
+ load_dotenv()
11
+ OPEN_AI_KEY = os.getenv('OPEN_AI_KEY')
12
+
13
+ CHROMA_PATH = "chroma"
14
+ DATA_PATH = "data/"
15
+ TEST_PATH = "data/theory_of_computation.pdf"
16
+
17
+ embed = OpenAIEmbeddings(
18
+ api_key=OPEN_AI_KEY,
19
+ model="text-embedding-3-large"
20
+ )
21
+
22
+
23
def main():
    """Script entry point: run the full data-store generation pipeline."""
    generate_data_store()
    # Debug aid: print(load_documents())
26
+
27
+
28
+ def generate_data_store():
29
+ documents = load_documents()
30
+ chunks = split_text(documents)
31
+ save_to_chroma(chunks)
32
+
33
+
34
def load_documents():
    """Load every PDF found under DATA_PATH.

    Returns:
        The list of documents produced by ``PyPDFDirectoryLoader.load()``;
        empty when the directory contains no loadable PDFs.
    """
    loader = PyPDFDirectoryLoader(DATA_PATH)
    docs = loader.load()
    # Only peek at the first document when there is one; indexing an empty
    # result would raise IndexError before the caller sees anything useful.
    if docs:
        print(docs[0].metadata)
    else:
        print(f"No documents were loaded from {DATA_PATH}.")
    return docs
39
+
40
+ # loader = PyPDFLoader(TEST_PATH)
41
+ # docs = []
42
+ # docs_lazy = loader.load()
43
+ # for doc in docs_lazy:
44
+ # docs.append(doc)
45
+ # return docs_lazy
46
+
47
+
48
def split_text(documents: list[Document]):
    """Split documents into overlapping character chunks.

    Uses ``RecursiveCharacterTextSplitter`` with a chunk size of 1100
    characters and an overlap of 100 (earlier experiments used 1000/200
    with ``add_start_index=True``).

    Args:
        documents: Documents to split.

    Returns:
        The list of chunk documents produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1100,
        chunk_overlap=100,
        length_function=len,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

    # Print one sample chunk for a sanity check. Index 10 is used when it
    # exists; smaller inputs fall back to the last chunk rather than raising
    # IndexError, and an empty result prints nothing.
    if chunks:
        sample = chunks[min(10, len(chunks) - 1)]
        print(sample.page_content)
        print(sample.metadata)
    return chunks
64
+
65
+
66
def save_to_chroma(chunks: list[Document]):
    """Rebuild the Chroma collection at CHROMA_PATH from the given chunks.

    Any existing store directory is deleted first, then the chunk texts and
    their metadata are embedded with the module-level ``embed`` and added to
    the "linux_funds" collection.

    Args:
        chunks: Chunk documents to store. When empty, nothing is deleted or
            written.
    """
    # Bail out before touching the disk: wiping the existing store and then
    # having nothing to insert would leave the app with no database at all.
    if not chunks:
        print("No chunks to save; leaving the existing Chroma store untouched.")
        return

    if os.path.exists(CHROMA_PATH):  # clear out the DB first
        shutil.rmtree(CHROMA_PATH)

    db = Chroma(
        collection_name="linux_funds",
        embedding_function=embed,
        persist_directory=CHROMA_PATH,
    )

    # below breaks text & metadata down to Chroma vector store
    texts = [chunk.page_content for chunk in chunks]
    metadatas = [chunk.metadata for chunk in chunks]
    db.add_texts(texts=texts, metadatas=metadatas)
    print(f"Saved {len(chunks)} chunks to CHROMA PATH {CHROMA_PATH}.")
81
+
82
+
83
+ if __name__ == "__main__":
84
+ main()
85
+
llm.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_chroma import Chroma
2
+ from langchain_openai import OpenAIEmbeddings, ChatOpenAI
3
+ from langchain_core.messages import HumanMessage, SystemMessage
4
+ from langchain_community.vectorstores import FAISS
5
+ from langchain.chains import create_history_aware_retriever
6
+ from langchain.chains import create_retrieval_chain
7
+ from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
8
+ from langchain.chains.combine_documents import create_stuff_documents_chain
9
+ from langchain.chains import create_retrieval_chain
10
+ from dotenv import load_dotenv
11
+ import os
12
+ CHROMA_PATH = "chroma"
13
+
14
+ load_dotenv()
15
+ API_KEY = os.getenv("OPEN_AI_KEY")
16
def build_rag_chain(api_key):
    """Build the history-aware retrieval-augmented question-answering chain.

    The chain first rewrites the latest user question into a standalone
    question using the chat history, retrieves matching chunks from the
    "linux_funds" Chroma collection at CHROMA_PATH, and then answers with
    the retrieved context stuffed into the system prompt.

    Args:
        api_key: OpenAI API key used for both the embeddings and the chat
            model.

    Returns:
        The chain created by ``create_retrieval_chain``. Callers in this
        project invoke it with ``{"input": ..., "chat_history": ...}`` and
        read the ``"answer"`` key of the result.
    """
    # Same embedding model name as the one init_db.py uses to build the store.
    embed = OpenAIEmbeddings(
        api_key=api_key,
        model="text-embedding-3-large",
    )
    db = Chroma(
        collection_name="linux_funds",
        embedding_function=embed,
        persist_directory=CHROMA_PATH,
    )
    retriever = db.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={"k": 4, "score_threshold": 0.3},
    )
    model = ChatOpenAI(api_key=api_key, model="gpt-4o")

    # Prompt for the question-rewriting step that feeds the retriever.
    context = (
        "Given a chat history and the latest user question "
        "which might reference context in the chat history, "
        "formulate a standalone question which can be understood "
        "without the chat history. Do NOT answer the question, just "
        "reformulate it if needed and otherwise return it as is."
    )
    context_with_history = ChatPromptTemplate(
        [
            ("system", context),
            MessagesPlaceholder("chat_history"),
            ("human", "{input}"),
        ]
    )
    history_aware_retriever = create_history_aware_retriever(
        model, retriever, context_with_history
    )

    # Prompt for the answering step. Adjacent string literals concatenate
    # with no separator, so each fragment carries its own trailing or
    # leading space ("Use " previously ran into "the following").
    main_query = (
        "You are an assistant for question-answering tasks. Use "
        "the following pieces of retrieved context to answer the "
        "question. If you don't know the answer, just say "
        "you don't know. Use 10 sentences maximum and keep the answer "
        "concise. You will most likely have to write bash scripts, so make"
        " this presentable on HuggingFace in markdown if needed."
        "\n\n"
        "{context}"
    )
    prompt = ChatPromptTemplate(
        [
            ("system", main_query),
            MessagesPlaceholder("chat_history"),
            ("human", "{input}"),
        ]
    )
    qna_chain = create_stuff_documents_chain(model, prompt)
    rag_chain = create_retrieval_chain(history_aware_retriever, qna_chain)
    return rag_chain
74
+
75
def chat():
    """Run an interactive terminal chat against the RAG chain (local testing).

    Reads questions from stdin until the user types 'exit', printing each
    answer and carrying the conversation forward as chat history.
    """
    # Imported here because the module's top-level import only brings in
    # HumanMessage and SystemMessage.
    from langchain_core.messages import AIMessage

    print("Start asking about the Theory of Computation. Type 'exit' to end the conversation.")
    chat_history = []
    # The chain does not depend on the query, so build it once instead of on
    # every loop iteration.
    rag_chain = build_rag_chain(API_KEY)

    while True:
        query = input("You: ")
        if query.strip().lower() == "exit":
            break
        result = rag_chain.invoke({"input": query, "chat_history": chat_history})
        print(f"AI: {result['answer']}")
        chat_history.append(HumanMessage(content=query))
        # Model answers belong in the history as assistant (AI) messages;
        # recording them as system messages would present past answers to
        # the model as instructions.
        chat_history.append(AIMessage(content=result["answer"]))
88
+
89
+ # ABOVE IS FOR LOCAL TESTING ONLY ^ ONLY KEEPING IT FOR FUTURE USE
90
+
91
+
92
+ # messages = [
93
+ # SystemMessage(content="You are a helpful assistant."),
94
+ # HumanMessage(content=query_input),
95
+ # ]
96
+ #
97
+ # result = model.invoke(messages)
98
+ #
99
+ # print("\n--- Generated Response ---")
100
+ # print("Result:")
101
+ # print(result)
102
+ # print("Content only:")
103
+ # print(result.content)
104
+
105
+ if __name__ == "__main__":
106
+ chat()
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
# langchain is pinned exactly once; the earlier duplicate "~=0.3.7" line
# was redundant with the exact pin below.
streamlit~=1.39.0
langchain==0.3.7
langchain-core==0.3.15
langchain-chroma==0.1.4
langchain-community==0.3.5
langchain-openai==0.2.5
python-dotenv~=1.0.1
pypdf==5.1.0
setup.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
+
4
def setup():
    """Render the controls and info text into the current Streamlit container.

    Returns:
        A ``(submit_button, openai_models)`` tuple: the value returned by the
        "Generate a response" button and the model name chosen in the
        select box.
    """
    clicked = st.button("Generate a response")
    # Disabled hint text: "Enter your OpenAPI key above (make sure there is money in it). Learn more [here](https://platform.openai.com/api-keys)"
    model_choice: str = st.selectbox(
        label="Choose your OpenAI model:",
        options=["gpt-3.5-turbo", "gpt-4-turbo"],
    )

    st.subheader("LinuxGPT")
    st.write(
        "LinuxGPT is currently trained on the Linux Fundamentals textbook, and course materials will be added soon."
    )

    st.subheader("Examples for Demoing")
    st.write("")

    return clicked, model_choice
+ )