Spaces:
Sleeping
Sleeping
File size: 3,004 Bytes
8884a50 89c2788 8884a50 89c2788 a5db8b1 89c2788 fcae09a 89c2788 8884a50 89c2788 4cfa3e0 89c2788 4cfa3e0 89c2788 39f86fb 89c2788 39f86fb 89c2788 39f86fb 89c2788 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 |
import streamlit as st
from llama_index import VectorStoreIndex, ServiceContext
from llama_index.embeddings import HuggingFaceEmbedding
from llama_index.llms import HuggingFaceInferenceAPI
from llama_index.schema import Document
from PyPDF2 import PdfReader
class DocumentLoader:
    """Reads an uploaded PDF and wraps its full text in llama_index Documents."""

    @staticmethod
    def read_pdf(uploaded_file):
        """Extract and concatenate the text of every page of *uploaded_file*.

        Args:
            uploaded_file: a file-like object readable by PyPDF2
                (e.g. a Streamlit ``UploadedFile``).

        Returns:
            The text of all pages joined into one string.
        """
        pdf_reader = PdfReader(uploaded_file)
        # Iterate pages directly (no range(len(...)) indexing) and join once
        # instead of quadratic `+=`; `or ""` guards against extract_text()
        # returning None for pages with no extractable text.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)

    @staticmethod
    def load_documents(uploaded_pdf):
        """Return the PDF's extracted text as a single-element Document list."""
        file_contents = DocumentLoader.read_pdf(uploaded_pdf)
        return [Document(text=file_contents)]
class IndexCreator:
    """Builds a persisted vector index over documents and returns a query engine."""

    @staticmethod
    def create_index(documents, hf_token):
        """Index *documents* with the Zephyr LLM + UAE embeddings.

        Args:
            documents: list of llama_index ``Document`` objects to index.
            hf_token: Hugging Face API token used by the remote inference LLM.

        Returns:
            A query engine backed by the newly built ``VectorStoreIndex``.
        """
        inference_llm = HuggingFaceInferenceAPI(
            model_name="HuggingFaceH4/zephyr-7b-alpha",
            token=hf_token,
        )
        embedding_model = HuggingFaceEmbedding(
            model_name="WhereIsAI/UAE-Large-V1",
        )
        context = ServiceContext.from_defaults(
            llm=inference_llm,
            chunk_size=800,
            chunk_overlap=20,
            embed_model=embedding_model,
        )
        vector_index = VectorStoreIndex.from_documents(
            documents,
            service_context=context,
            show_progress=True,
        )
        # Persist to the default storage directory before handing back the engine.
        vector_index.storage_context.persist()
        return vector_index.as_query_engine()
class PDFQueryApp:
    """Streamlit UI: upload a PDF, build a vector index over it, answer queries."""

    def __init__(self):
        # Page header and usage notes (typos in the original copy fixed).
        st.title("Private LLM @Purbayan_Majumder")
        st.write("Base Model : **HuggingFaceH4/zephyr-7b-alpha (open-source from HuggingFace)**")
        st.write("Embedding Model : **WhereIsAI/UAE-Large-V1 (open-source from HuggingFace)**")
        st.write("Ask anything from the data that you upload")
        st.write("Note !! As it's running on a CPU it takes 5 to 8 mins for each response")
        # User-provided credentials and data.
        self.hf_token = st.text_input("Enter your Hugging Face token [Free]:")
        self.uploaded_pdf = st.file_uploader("Upload your data[PDF for now]", type=['pdf'])
        self.query_engine = None  # set once the index has been built

    def load_and_create_index(self):
        """Build the query engine from the uploaded PDF, warning on missing inputs."""
        if not self.uploaded_pdf:
            st.warning("You have to upload a PDF file first.")
            return
        if not self.hf_token:
            # The remote inference API cannot be called without a token;
            # fail early with a clear message instead of a downstream error.
            st.warning("Please enter your Hugging Face token first.")
            return
        documents = DocumentLoader.load_documents(self.uploaded_pdf)
        # Report success only after the document has actually been loaded
        # (the original announced success before doing any work).
        st.success("Dataset has been loaded into the model successfully")
        self.query_engine = IndexCreator.create_index(documents, self.hf_token)
        st.success("Vector embeddings have been successfully created and initiated")

    def run_query(self, user_query):
        """Answer *user_query* with the query engine, or warn if not ready."""
        if self.query_engine and user_query:
            with st.spinner('Fetching the response from the model Please wait !!!!...'):
                response = self.query_engine.query(user_query)
                st.markdown(f"**Response:** {response}")
        else:
            st.warning("Please load documents and create vector embeddings before querying.")
if __name__ == "__main__":
    # Build the UI, index the uploaded PDF (if any), then answer the query.
    app = PDFQueryApp()
    app.load_and_create_index()
    user_query = st.text_input("Enter your query from the dataset:")
    app.run_query(user_query)