tony346 committed on
Commit
ffa3342
•
1 Parent(s): 2687e1f

Upload 4 files

Browse files
Files changed (3) hide show
  1. README.md +6 -5
  2. app.py +105 -0
  3. requirements.txt +8 -0
README.md CHANGED
@@ -1,12 +1,13 @@
1
  ---
2
- title: PDF Llama
3
- emoji: 📉
4
- colorFrom: green
5
- colorTo: yellow
6
  sdk: streamlit
7
- sdk_version: 1.27.2
8
  app_file: app.py
9
  pinned: false
 
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: LLama PDF
3
+ emoji: 🃏
4
+ colorFrom: purple
5
+ colorTo: blue
6
  sdk: streamlit
7
+ sdk_version: 1.27.1
8
  app_file: app.py
9
  pinned: false
10
+ license: llama2
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Import necessary modules for processing documents, embeddings, Q&A, etc. from 'langchain' library.
2
+ from dotenv import load_dotenv
3
+ load_dotenv() # Load environment variables from a .env file.
4
+ from langchain.document_loaders import PyPDFLoader # For loading and reading PDF documents.
5
+ from langchain.text_splitter import RecursiveCharacterTextSplitter # For splitting large texts into smaller chunks.
6
+ from langchain.vectorstores import Chroma # Vector storage system for embeddings.
7
+ from langchain.llms import CTransformers # For loading transformer models.
8
+ # from InstructorEmbedding import INSTRUCTOR # Not clear without context, possibly a custom embedding.
9
+ from langchain.embeddings import HuggingFaceInstructEmbeddings # Embeddings from HuggingFace models with instructions.
10
+ from langchain.embeddings import HuggingFaceEmbeddings # General embeddings from HuggingFace models.
11
+ from langchain.embeddings import LlamaCppEmbeddings # Embeddings using the Llama model.
12
+ from langchain.chains import RetrievalQA # Q&A retrieval system.
13
+ from langchain.embeddings import OpenAIEmbeddings # Embeddings from OpenAI models.
14
+ from langchain.vectorstores import FAISS # Another vector storage system for embeddings.
15
+
16
+ # Import Streamlit for creating a web application and other necessary modules for file handling.
17
+ import streamlit as st # Main library for creating the web application.
18
+ import tempfile # For creating temporary directories and files.
19
+ import os # For handling file and directory paths.
20
+
21
+ # Import a handler for streaming outputs.
22
+ from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler # For live updates in the Streamlit app.
23
+
24
+ st.title("ChatPDF")
25
+
26
+ st.markdown("""
27
+ ChatPDF is a web application that can answer questions based on a PDF document. To use the app, simply upload a PDF file and type your question in the input box. The app will then use a powerful language model to generate an answer to your question.
28
+ """)
29
+
30
+ # Create a visual separator in the app.
31
+ st.write("---")
32
+
33
+ # Add a file uploader widget for users to upload their PDF files.
34
+ uploaded_file = st.sidebar.file_uploader("Upload your PDF file!", type=['pdf'])
35
+ # Another visual separator after the file uploader.
36
+ st.write("---")
37
+
38
+ # Function to convert the uploaded PDF into a readable document format.
39
def pdf_to_document(uploaded_file):
    """Save an uploaded PDF to a temporary file and load it with PyPDFLoader.

    Args:
        uploaded_file: The object returned by the Streamlit file uploader.
            Only its ``name`` attribute and ``getvalue()`` method are used.

    Returns:
        The result of ``PyPDFLoader.load_and_split()`` for the saved file.
    """
    # Keep only the last path component so a crafted upload name cannot
    # resolve to a location outside the temporary directory.
    safe_name = os.path.basename(uploaded_file.name) or "upload.pdf"

    # The context manager removes the directory (and the copy of the PDF)
    # deterministically instead of relying on garbage collection.
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_filepath = os.path.join(temp_dir, safe_name)

        # Write the raw bytes of the upload to disk; the loader takes a path.
        with open(temp_filepath, "wb") as f:
            f.write(uploaded_file.getvalue())

        # Load inside the ``with`` block so the file still exists while it
        # is being read.
        loader = PyPDFLoader(temp_filepath)
        pages = loader.load_and_split()
    return pages
53
+
54
+ # Check if a user has uploaded a file.
55
+ if uploaded_file is not None:
56
+ # Turn the upload into loaded pages via pdf_to_document().
57
+ pages = pdf_to_document(uploaded_file)
58
+
59
+ # Configure the splitter that cuts the loaded pages into smaller chunks.
60
+ text_splitter = RecursiveCharacterTextSplitter(
61
+ chunk_size = 300, # Target size of each chunk, measured with length_function.
62
+ chunk_overlap = 20, # Overlap between neighbouring chunks, in the same units.
63
+ length_function = len # len() of a str counts characters, so sizes are in characters.
64
+ )
65
+ # Split the loaded pages into chunks.
66
+ texts = text_splitter.split_documents(pages)
67
+
68
+ ## Only the HuggingFace sentence-transformers embedding is used below; the other embedding classes imported at the top are not used in this section.
69
+
70
+ # Build the embedding model (all-MiniLM-L6-v2) with the device set to CPU.
71
+ embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2',
72
+ model_kwargs={'device': 'cpu'})
73
+
74
+ # Build a Chroma vector store from the chunks and the embedding model.
75
+ db = Chroma.from_documents(texts, embeddings)
76
+
77
+ # Custom handler to stream outputs live to the Streamlit application.
78
+ from langchain.callbacks.base import BaseCallbackHandler
79
class StreamHandler(BaseCallbackHandler):
    """Callback handler that accumulates LLM tokens and shows them in a container."""

    def __init__(self, container, initial_text=""):
        """Remember the target container and the text to start from.

        Args:
            container: Object exposing ``markdown(text)``; the accumulated
                text is re-rendered into it on every new token.
            initial_text: Text already present before the first token.
        """
        self.text = initial_text
        self.container = container

    def on_llm_new_token(self, token: str, **kwargs) -> None:
        """Append ``token`` to the accumulated text and refresh the display."""
        self.text += token
        self._render()

    def _render(self) -> None:
        """Write the full accumulated text into the container."""
        self.container.markdown(self.text)
86
+
87
# Header for the Q&A section of the web app.
st.header("Ask the PDF a question!")
# Input box for users to type their questions.
question = st.text_input('Type your question')

# Run the chain only on the rerun triggered by clicking the 'Ask' button.
if st.button('Ask'):
    if not question.strip():
        # Loading the model and querying the retriever for a blank question
        # is wasted work, so ask for input instead.
        st.warning('Please type a question first.')
    else:
        # Display a spinner while processing the question.
        with st.spinner('Processing...'):
            # Placeholder that the stream handler keeps overwriting with
            # the answer generated so far.
            chat_box = st.empty()
            stream_handler = StreamHandler(chat_box)

            # Initialize the Q&A model and chain; tokens reach the page
            # through the callback rather than through the return value.
            llm = CTransformers(
                model="llama-2-7b-chat.ggmlv3.q2_K.bin",
                model_type="llama",
                callbacks=[stream_handler],
            )
            qa_chain = RetrievalQA.from_chain_type(llm, retriever=db.as_retriever())
            # The result is intentionally unused: the answer has already
            # been rendered token by token into chat_box.
            qa_chain({"query": question})
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ langchain
2
+ pypdf
3
+ chromadb
4
+ tiktoken
5
+ pysqlite3-binary
6
+ streamlit-extras
7
+ InstructorEmbedding
8
+ sentence-transformers