Commit fd5f784
Parent: 3197ae2
adding non-large files

Files changed:
- .gitignore +4 -0
- Dockerfile +13 -0
- app.py +55 -0
- init_db.py +85 -0
- llm.py +106 -0
- requirements.txt +9 -0
- setup.py +23 -0
.gitignore
ADDED
.venv
.env
.idea
data
Dockerfile
ADDED
FROM python:3.12-slim

WORKDIR /app

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .
EXPOSE 8501

ENV STREAMLIT_BROWSER_GATHERUSAGESTATS=false

CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.enableCORS=false"]
app.py
ADDED
import streamlit as st
from llm import build_rag_chain
from setup import setup

st.set_page_config(layout="wide")

st.title("LinuxGPT")

l_col, r_col = st.columns((3, 1))

if "trigger" not in st.session_state:
    st.session_state["trigger"] = False

def on_enter():
    st.session_state["trigger"] = True

with r_col:
    submit_button, openai_models = setup()

# chat input goes here:
with l_col:
    user_question = st.text_area(
        "Ask about Linux fundamentals",
        on_change=on_enter,
    )

# BELOW IS TEMPORARY, JUST FOR DEMOS!
api_key = st.secrets["api_key"]
if (submit_button or st.session_state["trigger"]) and api_key and user_question:
    rag_chain = build_rag_chain(api_key=api_key)

    # each history item holds a full question/answer turn, so emit
    # both a user message and an assistant message per item
    formatted_history = []
    for item in st.session_state.get("history", []):
        formatted_history.append({"role": "user", "content": item["question"]})
        formatted_history.append({"role": "assistant", "content": item["answer"]})

    with st.spinner("Thinking..."):
        # invoke answer
        result = rag_chain.invoke({"input": user_question, "chat_history": formatted_history})
        answer = result['answer']

    st.header("LinuxGPT says:")
    st.write(answer)

    # add to chat history automatically
    if 'history' not in st.session_state:
        st.session_state['history'] = []
    st.session_state['history'].append({"question": user_question, "answer": answer})

else:
    st.write("Please provide an input. No API key is needed for demoing.")

# clear history
if st.button("Clear History"):
    st.session_state['history'] = []
    st.write("Chat history cleared")
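For reference, a minimal sketch of the payload app.py hands to the chain and what it reads back. The example question and history contents are invented for illustration; the shape itself mirrors the invoke call above, where chat_history is a list of alternating role/content dicts and the result dict carries the generation under "answer":

# illustrative only — mirrors the invoke() usage in app.py
payload = {
    "input": "How do I change file permissions?",
    "chat_history": [
        {"role": "user", "content": "What does chmod do?"},
        {"role": "assistant", "content": "chmod changes a file's mode bits..."},
    ],
}
# result = rag_chain.invoke(payload)
# st.write(result["answer"])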
init_db.py
ADDED
from langchain_community.document_loaders import PyPDFLoader, PyPDFDirectoryLoader
from langchain_chroma import Chroma
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
import os
import shutil

load_dotenv()
OPEN_AI_KEY = os.getenv('OPEN_AI_KEY')

CHROMA_PATH = "chroma"
DATA_PATH = "data/"
TEST_PATH = "data/theory_of_computation.pdf"

embed = OpenAIEmbeddings(
    api_key=OPEN_AI_KEY,
    model="text-embedding-3-large"
)


def main():
    generate_data_store()
    # print(load_documents())


def generate_data_store():
    documents = load_documents()
    chunks = split_text(documents)
    save_to_chroma(chunks)


def load_documents():
    loader = PyPDFDirectoryLoader(DATA_PATH)
    docs = loader.load()
    print(docs[0].metadata)
    return docs

    # loader = PyPDFLoader(TEST_PATH)
    # docs = []
    # docs_lazy = loader.load()
    # for doc in docs_lazy:
    #     docs.append(doc)
    # return docs_lazy


def split_text(documents: list[Document]):
    # chunk_size = 1000,
    # chunk_overlap = 200,
    # length_function = len,
    # add_start_index = True,
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1100,
        chunk_overlap=100,
        length_function=len,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")
    if len(chunks) > 10:  # guard the sample preview against small corpora
        document = chunks[10]
        print(document.page_content)
        print(document.metadata)
    return chunks


def save_to_chroma(chunks: list[Document]):
    if os.path.exists(CHROMA_PATH):  # clear out the DB first
        shutil.rmtree(CHROMA_PATH)

    db = Chroma(
        collection_name="linux_funds",
        embedding_function=embed,
        persist_directory=CHROMA_PATH
    )

    # below breaks text & metadata down to Chroma vector store
    texts = [chunk.page_content for chunk in chunks]
    metadatas = [chunk.metadata for chunk in chunks]
    db.add_texts(texts=texts, metadatas=metadatas)
    print(f"Saved {len(chunks)} chunks to CHROMA PATH {CHROMA_PATH}.")


if __name__ == "__main__":
    main()
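A quick way to sanity-check the persisted store after running init_db.py — a minimal sketch that reuses the collection name, path, and embedding model from above; the test query string is just an example:

# sanity check — reuses the constants defined in init_db.py
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv
import os

load_dotenv()
embed = OpenAIEmbeddings(api_key=os.getenv("OPEN_AI_KEY"), model="text-embedding-3-large")
db = Chroma(collection_name="linux_funds", embedding_function=embed, persist_directory="chroma")

# pull the top-2 nearest chunks for a test query and show their sources
for doc in db.similarity_search("file permissions", k=2):
    print(doc.metadata.get("source", "unknown"), "->", doc.page_content[:120])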
llm.py
ADDED
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
from langchain.chains import create_history_aware_retriever
from langchain.chains import create_retrieval_chain
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.chains.combine_documents import create_stuff_documents_chain
from dotenv import load_dotenv
import os

CHROMA_PATH = "chroma"

load_dotenv()
API_KEY = os.getenv("OPEN_AI_KEY")


def build_rag_chain(api_key):
    embed = OpenAIEmbeddings(
        api_key=api_key,
        model="text-embedding-3-large"
    )
    db = Chroma(
        collection_name="linux_funds",
        embedding_function=embed,
        persist_directory=CHROMA_PATH
    )
    retriever = db.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={"k": 4, "score_threshold": 0.3},
    )
    model = ChatOpenAI(api_key=api_key, model="gpt-4o")
    # docs = retriever.invoke(test_query)
    # print("\n--- RELEVANT DOCUMENTS ---")
    # for i, doc in enumerate(docs, 1):
    #     print(f"Document {i}:\n{doc.page_content}\n")
    #     if doc.metadata:
    #         print(f"Source: {doc.metadata.get('source', 'Unknown')}\n")
    context = (
        "Given a chat history and the latest user question "
        "which might reference context in the chat history, "
        "formulate a standalone question which can be understood "
        "without the chat history. Do NOT answer the question, just "
        "reformulate it if needed and otherwise return it as is."
    )
    context_with_history = ChatPromptTemplate(
        [
            ("system", context),
            MessagesPlaceholder("chat_history"),
            ("human", "{input}"),
        ]
    )
    history_aware_retriever = create_history_aware_retriever(
        model, retriever, context_with_history
    )
    main_query = (
        "You are an assistant for question-answering tasks. Use "
        "the following pieces of retrieved context to answer the "
        "question. If you don't know the answer, just say "
        "you don't know. Use 10 sentences maximum and keep the answer "
        "concise. You will most likely have to write bash scripts, so make"
        " this presentable on HuggingFace in markdown if needed."
        "\n\n"
        "{context}"
    )
    prompt = ChatPromptTemplate(
        [
            ("system", main_query),
            MessagesPlaceholder("chat_history"),
            ("human", "{input}"),
        ]
    )
    qna_chain = create_stuff_documents_chain(model, prompt)
    rag_chain = create_retrieval_chain(history_aware_retriever, qna_chain)
    return rag_chain


def chat():
    print("Start asking about the Theory of Computation. Type 'exit' to end the conversation.")
    chat_history = []

    while True:
        query = input("You: ")
        if query.lower() == "exit":
            break
        rag_chain = build_rag_chain(API_KEY)
        result = rag_chain.invoke({"input": query, "chat_history": chat_history})
        print(f"AI: {result['answer']}")
        chat_history.append(HumanMessage(content=query))
        chat_history.append(AIMessage(content=result["answer"]))

# ABOVE IS FOR LOCAL TESTING ONLY ^ ONLY KEEPING IT FOR FUTURE USE


# messages = [
#     SystemMessage(content="You are a helpful assistant."),
#     HumanMessage(content=query_input),
# ]
#
# result = model.invoke(messages)
#
# print("\n--- Generated Response ---")
# print("Result:")
# print(result)
# print("Content only:")
# print(result.content)

if __name__ == "__main__":
    chat()
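For a one-off scripted call outside the chat() loop, the chain can be driven the same way — a minimal sketch assuming OPEN_AI_KEY is set in .env and the Chroma store has already been built by init_db.py; the questions are invented examples:

from langchain_core.messages import HumanMessage, AIMessage
from llm import build_rag_chain
import os

rag_chain = build_rag_chain(api_key=os.getenv("OPEN_AI_KEY"))
history = [
    HumanMessage(content="What is a symbolic link?"),
    AIMessage(content="A symbolic link is a file that points to another path..."),
]
result = rag_chain.invoke({"input": "How do I create one?", "chat_history": history})
print(result["answer"])        # the generation
print(len(result["context"]))  # the retrieved Documents backing the answer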
requirements.txt
ADDED
langchain~=0.3.7
streamlit~=1.39.0
langchain-core==0.3.15
langchain-chroma==0.1.4
langchain-community==0.3.5
langchain-openai==0.2.5
python-dotenv~=1.0.1
pypdf==5.1.0
setup.py
ADDED
import streamlit as st


def setup():
    submit_button = st.button("Generate a response")
    # "Enter your OpenAI key above (make sure there is money in it). Learn more [here](https://platform.openai.com/api-keys)",
    openai_models: str = st.selectbox(
        label="Choose your OpenAI model:",
        options=[
            "gpt-3.5-turbo",
            "gpt-4-turbo"
        ]
    )
    st.subheader("LinuxGPT")
    st.write(
        "LinuxGPT is currently trained on the Linux Fundamentals textbook, and course materials will be added soon."
    )
    st.subheader("Examples for Demoing")
    st.write("")
    return (
        submit_button,
        openai_models,
    )