import os

import pinecone
import streamlit as st
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Pinecone
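
# This Streamlit app implements a simple retrieval-augmented QA pipeline:
# load a PDF, split it into chunks, embed the chunks with OpenAI embeddings,
# index them in Pinecone, and answer questions with a RetrievalQA chain.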
# Directory where uploaded PDF files are saved
SAVE_DIR = "pdf"
os.makedirs(SAVE_DIR, exist_ok=True)  # make sure the directory exists before writing to it
st.header('Question Answering with your PDF file')
st.write("Are you interested in chatting with your own documents, whether it's a text file, a PDF, or a website? LangChain makes it easy to do question answering over your own documents.")
def qa(file, query, chain_type, k, pinecone_api_key, index_name, pinecone_environment):
    """Load a PDF, index it in Pinecone, and answer a question about it."""
    # Load the document (UnstructuredPDFLoader or OnlinePDFLoader would also work here)
    loader = PyPDFLoader(file)
    documents = loader.load()

    # Split the document into chunks of about 1,000 characters with no overlap
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    texts = text_splitter.split_documents(documents)

    # Select the embeddings to use (reads OPENAI_API_KEY from the environment)
    embeddings = OpenAIEmbeddings()

    # Initialize Pinecone; the API key and environment are shown at app.pinecone.io
    pinecone.init(api_key=pinecone_api_key, environment=pinecone_environment)

    # Create the vector store to use as the index. The Pinecone index must
    # already exist, with a dimension matching the embedding model (1536 for
    # the default OpenAI embeddings). Pinecone.from_documents would also keep
    # page metadata, and Chroma.from_documents(texts, embeddings) is a local
    # alternative that needs no external service.
    db = Pinecone.from_texts([t.page_content for t in texts], embeddings, index_name=index_name)

    # Expose the index through a retriever interface
    retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": k})

    # Create a chain that answers questions over the retrieved chunks
    qa_chain = RetrievalQA.from_chain_type(
        llm=OpenAI(),
        chain_type=chain_type,
        retriever=retriever,
        return_source_documents=True,
    )
    return qa_chain({"query": query})
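
# Example call (a sketch; the keys, index name, and environment below are
# placeholders, and the PDF is assumed to exist under the pdf/ directory):
#   result = qa("pdf/sample.pdf", "What is this document about?", "stuff", 3,
#               "YOUR_PINECONE_KEY", "openaiindex", "northamerica-northeast1-gcp")
#   print(result["result"])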
with st.sidebar:
    st.header('Configurations')
    st.write("Enter your OpenAI API key. Usage is billed; set up billing at [OpenAI](https://platform.openai.com/account).")
    openai_api_key = st.text_input("Enter your OpenAI API key here", type="password")
    if openai_api_key:
        os.environ["OPENAI_API_KEY"] = openai_api_key
    st.write("Enter your Pinecone API key. [Pinecone](https://www.pinecone.io/).")
    pinecone_api_key = st.text_input("Enter your Pinecone API key here", type="password")
    pinecone_environment = st.text_input("Enter your Pinecone environment (e.g. northamerica-northeast1-gcp)")
    index_name = st.text_input("Enter your Pinecone index name")
left_column, right_column = st.columns(2)

result = None  # populated once a question has been answered

with left_column:
    # Add a file uploader to the app
    uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
    file_path = None
    # Check whether a file has been uploaded
    if uploaded_file is not None:
        # Save the uploaded file to the configured directory
        file_path = os.path.join(SAVE_DIR, uploaded_file.name)
        with open(file_path, "wb") as f:
            f.write(uploaded_file.getbuffer())
        st.success(f"File saved to {file_path}")
    query = st.text_input("Enter your question")
    chain_type = st.selectbox(
        'Chain type',
        ('stuff', 'map_reduce', 'refine', 'map_rerank'))
    k = st.slider('Number of relevant chunks', 1, 5)
    if st.button('Get answer') and file_path and query:
        result = qa(file_path, query, chain_type, k,
                    pinecone_api_key, index_name, pinecone_environment)
with right_column:
    st.write("Output of your question")
    if result is not None:
        st.subheader("Result")
        st.write(result['result'])
        st.subheader("Source documents")
        st.write(result['source_documents'][0])
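
# To run this app locally (assuming the file is saved as app.py):
#   streamlit run app.py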