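"""Streamlit app for querying an uploaded PDF with a private LLM.

Pipeline: PyPDF2 extracts the PDF text, LlamaIndex builds a vector index
over it (WhereIsAI/UAE-Large-V1 embeddings), and queries are answered by
HuggingFaceH4/zephyr-7b-alpha via the Hugging Face Inference API.

Note: this assumes the legacy (pre-0.10) llama_index package layout; with
newer releases the imports and the ServiceContext API below differ.
"""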
import streamlit as st
from llama_index import VectorStoreIndex, ServiceContext
from llama_index.embeddings import HuggingFaceEmbedding
from llama_index.llms import HuggingFaceInferenceAPI
from llama_index.schema import Document
from PyPDF2 import PdfReader

class DocumentLoader:
    @staticmethod
    def read_pdf(uploaded_file):
        """Extract and concatenate the text of every page in the PDF."""
        pdf_reader = PdfReader(uploaded_file)
        # extract_text() may yield None for pages without extractable text.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)

    @staticmethod
    def load_documents(uploaded_pdf):
        """Wrap the extracted PDF text in a single LlamaIndex Document."""
        file_contents = DocumentLoader.read_pdf(uploaded_pdf)
        return [Document(text=file_contents)]

class IndexCreator:
    @staticmethod
    def create_index(documents, hf_token):
        """Build a vector index over the documents and return a query engine."""
        # Remote LLM served through the Hugging Face Inference API.
        llm = HuggingFaceInferenceAPI(model_name="HuggingFaceH4/zephyr-7b-alpha", token=hf_token)
        # Local embedding model used to vectorise the document chunks.
        embed_model_uae = HuggingFaceEmbedding(model_name="WhereIsAI/UAE-Large-V1")

        # Documents are split into 800-token chunks with a 20-token overlap
        # before being embedded.
        service_context = ServiceContext.from_defaults(
            llm=llm, chunk_size=800, chunk_overlap=20, embed_model=embed_model_uae
        )
        index = VectorStoreIndex.from_documents(documents, service_context=service_context, show_progress=True)
        # Persist the index (to the default ./storage directory) for reuse.
        index.storage_context.persist()
        return index.as_query_engine()

class PDFQueryApp:
    def __init__(self):
        st.title("Private LLM @Purbayan_Majumder")
        st.write("Base Model: **HuggingFaceH4/zephyr-7b-alpha (open source, from Hugging Face)**")
        st.write("Embedding Model: **WhereIsAI/UAE-Large-V1 (open source, from Hugging Face)**")
        st.write("Ask anything about the data that you upload")
        st.write("Note: since this runs on a CPU, each response takes 5 to 8 minutes")

        # Mask the token in the UI; it is only held in memory for API calls.
        self.hf_token = st.text_input("Enter your Hugging Face token [free]:", type="password")
        self.uploaded_pdf = st.file_uploader("Upload your data [PDF only for now]", type=['pdf'])
        self.query_engine = None

    def load_and_create_index(self):
        # Only build the index once both a PDF and a token are available.
        if self.uploaded_pdf and self.hf_token:
            documents = DocumentLoader.load_documents(self.uploaded_pdf)
            st.success("Dataset has been loaded into the model successfully")
            self.query_engine = IndexCreator.create_index(documents, self.hf_token)
            st.success("Vector embeddings have been successfully created and initiated")
        elif self.uploaded_pdf:
            st.warning("You have to enter your Hugging Face token first.")
        else:
            st.warning("You have to upload a PDF file first.")

    def run_query(self, user_query):
        if self.query_engine and user_query:
            with st.spinner('Fetching the response from the model, please wait...'):
                response = self.query_engine.query(user_query)
            st.markdown(f"**Response:** {response}")
        else:
            st.warning("Please load documents and create vector embeddings before querying.")

if __name__ == "__main__":
    app = PDFQueryApp()

    # Build the index as soon as a PDF and token have been provided.
    app.load_and_create_index()

    user_query = st.text_input("Enter your query from the dataset:")

    app.run_query(user_query)
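
# Streamlit reruns this whole script on every widget interaction, so the
# index is rebuilt each time; caching the query engine in st.session_state
# would avoid the repeated work.
#
# Run locally (assuming this file is saved as app.py):
#   streamlit run app.py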