CazimirRoman committed on
Commit
d64a8e1
1 Parent(s): a63e5a1

initial commit

Browse files
Files changed (3) hide show
  1. Dockerfile +30 -0
  2. app/app.py +110 -0
  3. requirements.txt +11 -0
Dockerfile ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.10-slim
2
+
3
+ WORKDIR /code
4
+
5
+ COPY ./requirements.txt /code/requirements.txt
6
+
7
+ RUN pip3 install --no-cache-dir --upgrade -r /code/requirements.txt
8
+
9
+ # We call the OpenAI API here, so there is no need to download a model at build time.
10
+ # RUN python3 -c 'from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM;model=AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn");model.save_pretrained("models");tokenizer=AutoTokenizer.from_pretrained("facebook/bart-large-cnn");tokenizer.save_pretrained("models")'
11
+
12
+ # Set up a new user named "user" with user ID 1000
13
+ RUN useradd -m -u 1000 user
14
+ # Switch to the "user" user
15
+ USER user
16
+ # Set home to the user's home directory
17
+ ENV HOME=/home/user
18
+ ENV PATH=/home/user/.local/bin:$PATH
19
+
20
+ # Set the working directory to the app directory inside the user's home directory
21
+ WORKDIR $HOME/app
22
+
23
+ # Copy the current directory contents into the container at $HOME/app setting the owner to the user
24
+ COPY --chown=user . $HOME/app
25
+
26
+ COPY ./app /code/app
27
+
28
+ # CMD ["uvicorn", "app.app:main", "--host", "0.0.0.0", "--port", "7860"]
29
+
30
+ ENTRYPOINT ["streamlit", "run", "app/app.py", "--server.port=8501", "--server.address=0.0.0.0"]
app/app.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from dotenv import load_dotenv
3
+ import pickle
4
+ from PyPDF2 import PdfReader
5
+ from streamlit_extras.add_vertical_space import add_vertical_space
6
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
7
+ from langchain.embeddings.openai import OpenAIEmbeddings
8
+ from langchain.vectorstores import FAISS
9
+ from langchain.llms import OpenAI
10
+ from langchain.chains.question_answering import load_qa_chain
11
+ from langchain.callbacks import get_openai_callback
12
+
13
+ import os
14
+
15
+ with st.sidebar:
16
+ st.title('PDF Chat App')
17
+ st.markdown('''
18
+ ## About
19
+ This app is an LLM-powered PDF chatbot built using:
20
+ - [Streamlit](https://streamlit.io/)
21
+ - [LangChain](https://python.langchain.com/)
22
+ - [OpenAI](https://platform.openai.com/docs/models) LLM model
23
+
24
+ ## How it works
25
+ - Load up a PDF file
26
+ - Extract the text from the PDF file
27
+ - Split the text into chunks
28
+ - Create embeddings using OpenAI, which are vectors of floating-point numbers that measure the relatedness of text strings
29
+ - Save these embeddings as vectors in a vector store, such as FAISS
30
+ - Use a similarity search to ask a question
31
+ - Get the answer and tokens used from OpenAI
32
+
33
+ ''')
34
+ st.write('Made with 🤖 by [Cazimir Roman](https://cazimir.dev)')
35
+
36
def _extract_pdf_text(pdf):
    """Return the concatenated text of every page of the uploaded PDF."""
    reader = PdfReader(pdf)
    # A page without a text layer (e.g. a scanned image) may yield nothing;
    # treat that as an empty string instead of failing the concatenation.
    return "".join(page.extract_text() or "" for page in reader.pages)


def _embedding_cache_path(pdf):
    """Build the pickle file name used to cache the vector store for *pdf*."""
    import hashlib

    # basename() keeps a crafted upload name from pointing outside the
    # working directory; splitext() drops the extension whatever its length.
    stem = os.path.splitext(os.path.basename(pdf.name))[0]
    # Include a content digest so two different PDFs that share a file name
    # never reuse each other's embeddings.
    digest = hashlib.sha256(pdf.getvalue()).hexdigest()[:16]
    return f"{stem}-{digest}.pkl"


def _load_or_build_vector_store(chunks, cache_path):
    """Return a FAISS store for *chunks*, reusing *cache_path* when usable."""
    if os.path.exists(cache_path):
        try:
            # NOTE(security): pickle.load can execute arbitrary code. This is
            # only acceptable because the file is one this app wrote itself;
            # never point it at a file supplied by a user.
            with open(cache_path, "rb") as cache_file:
                vector_store = pickle.load(cache_file)
        except (pickle.UnpicklingError, EOFError):
            # Truncated or corrupt cache: fall through and rebuild it.
            pass
        else:
            st.success('Text embeddings loaded from disk')
            return vector_store

    with st.spinner("Creating vector store embeddings..."):
        vector_store = FAISS.from_texts(chunks, OpenAIEmbeddings())
        # Write to a temporary file and rename it afterwards, so an
        # interrupted dump never leaves a half-written cache behind.
        tmp_path = f"{cache_path}.tmp"
        with open(tmp_path, "wb") as cache_file:
            pickle.dump(vector_store, cache_file)
        os.replace(tmp_path, cache_path)
    st.success('Embeddings computation completed')
    return vector_store


def load_app():
    """Render the PDF upload / question-answering part of the page.

    Flow: upload a PDF, extract its text, split it into overlapping chunks,
    load or compute the FAISS vector store for those chunks, then answer the
    user's question with a "stuff" QA chain over the most similar chunks.
    Returns ``None``; all results are written to the Streamlit page.
    """
    # upload a PDF file
    pdf = st.file_uploader("Upload your PDF", type='pdf')
    if pdf is None:
        return

    text = _extract_pdf_text(pdf)

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    chunks = text_splitter.split_text(text=text)
    if not chunks:
        # Nothing to embed (e.g. an image-only PDF); stop before building an
        # index from an empty list.
        st.error("No text could be extracted from this PDF")
        return

    vector_store = _load_or_build_vector_store(
        chunks, _embedding_cache_path(pdf)
    )

    # Accept user question/query
    st.divider()
    query = st.text_input("Ask a question about your PDF file")
    if not query:
        return

    st.write(f"You asked: {query}")
    with st.spinner("Thinking..."):
        # Retrieve the chunks most similar to the query. No k is passed, so
        # the number of chunks is the library default (4 per the LangChain
        # API reference).
        docs = vector_store.similarity_search(query)
        llm = OpenAI(temperature=0)
        chain = load_qa_chain(llm=llm, chain_type="stuff")
        with get_openai_callback() as cb:
            response = chain.run(input_documents=docs, question=query)
    st.write(response)
    # The sidebar promises the token usage, so surface what the callback saw.
    st.caption(f"Tokens used: {cb.total_tokens}")
84
+
85
def main():
    """Render the page header and the API-key form, then start the PDF chat.

    The key field is pre-filled from the ``OPENAI_API_KEY`` environment
    variable when it is set. Pressing "Submit" with a plausible key stores it
    in ``os.environ``; ``load_app`` is then rendered exactly once per script
    run whenever a key is available.
    """
    print("Main called")
    st.header("Chat with your PDF")

    container = st.container()

    with container:
        open_ai_key = os.getenv("OPENAI_API_KEY")
        api_key = container.text_input(
            "Enter your OpenAI API key",
            type="password",
            value=open_ai_key or "",
        )
        # You can find your key at https://platform.openai.com/account/api-keys
        submit = container.button("Submit")

    # submit button is pressed
    if submit:
        candidate = api_key.strip()
        # Key lengths differ between key types, so check the "sk-" prefix
        # rather than one exact length (the old check was len == 51).
        if candidate.startswith("sk-") and len(candidate) > len("sk-"):
            # NOTE(review): os.environ is process-wide, so a key submitted
            # here is visible to every session served by this process.
            # Consider st.session_state plus passing the key explicitly.
            os.environ["OPENAI_API_KEY"] = candidate
            open_ai_key = candidate
        else:
            st.error("Api key is not correct")

    # Render the app once per run. The previous code called load_app() both
    # for the environment key and again on submit, which created every
    # widget twice in the same run.
    if open_ai_key:
        load_app()
108
+
109
+ if __name__ == '__main__':
110
+ main()
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ langchain
2
+ PyPDF2
3
+ python-dotenv
4
+ streamlit
5
+ faiss-cpu
6
+ streamlit-extras
7
+ openai
8
+ altair<5
9
+ tiktoken
10
+ sentence_transformers
11
+ torch