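"""Streamlit app for chatting with multiple PDFs.

Uploaded PDFs are split into chunks, embedded with a HuggingFace model,
indexed in FAISS, and queried through a LangChain ConversationalRetrievalChain
backed by GigaChat. An optional transformers summarization pipeline condenses
the retrieved passages shown alongside each answer.

Run with (file name assumed): streamlit run app.py
"""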
import streamlit as st
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings, HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models.gigachat import GigaChat 
from htmlTemplates import css, bot_template, user_template
from langchain.llms import HuggingFaceHub, LlamaCpp
from huggingface_hub import snapshot_download, hf_hub_download



# Load environment variables (e.g. GigaChat credentials) from a local .env file.
load_dotenv()

# Local GGUF model that could back an offline LlamaCpp LLM; the download is
# left commented out and the GigaChat API is used instead (see
# get_conversation_chain below).
repo_name = "IlyaGusev/saiga_mistral_7b_gguf"
model_name = "model-q4_K.gguf"

# snapshot_download(repo_id=repo_name, local_dir=".", allow_patterns=model_name)
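# A minimal sketch (not used below) of how the downloaded GGUF model could be
# wired in through the imported LlamaCpp class instead of GigaChat; the
# context size and temperature values are assumptions, not project settings:
#
#   local_llm = LlamaCpp(model_path=model_name, n_ctx=2048, temperature=0.1)
#   # local_llm could then replace the GigaChat instance created in
#   # get_conversation_chain().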

from transformers import pipeline

# Initialize the summarization pipeline; with no model specified, the
# library's default English summarization model is downloaded and loaded
# once at startup.
summarizer = pipeline("summarization")



def get_pdf_text(pdf_docs):
    """Concatenate the extracted text of every page of every uploaded PDF."""
    text = ""
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # extract_text() can return None for pages without a text layer.
            text += page.extract_text() or ""

    return text


def get_text_chunks(text):
    """Split raw text into overlapping chunks for embedding."""
    text_splitter = CharacterTextSplitter(separator="\n",
                                          chunk_size=1000,
                                          chunk_overlap=200,
                                          length_function=len)
    chunks = text_splitter.split_text(text)

    return chunks


# Alternative embedding backends kept from an earlier version of get_vectorstore:
#   OpenAIEmbeddings()
#   HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
#   HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-large")
#   HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2")


def get_vectorstore(text_chunks, embedding_model_name="intfloat/multilingual-e5-large"):
    """Embed the text chunks with the selected HuggingFace model and index them in FAISS."""
    embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)
    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
    return vectorstore



def get_conversation_chain(vectorstore, model_name):
    """Build a conversational retrieval chain over the vector store.

    model_name is currently unused: the chain is backed by the GigaChat API,
    whose credentials are expected to come from the environment (loaded via
    load_dotenv above), rather than by the local GGUF model referenced earlier.
    """
    llm = GigaChat(profanity=False,
                   verify_ssl_certs=False)

    memory = ConversationBufferMemory(memory_key='chat_history',
                                      input_key='question',
                                      output_key='answer',
                                      return_messages=True)

    conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm,
                                                               retriever=vectorstore.as_retriever(),
                                                               memory=memory,
                                                               return_source_documents=True)

    return conversation_chain
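
# How the chain is invoked (this mirrors handle_userinput below; the question
# string is only an illustrative placeholder):
#
#   chain = get_conversation_chain(vectorstore, model_name)
#   result = chain({'question': "What is this document about?"})
#   answer, sources = result['answer'], result['source_documents']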


def summarize_text(text):
    """Summarize a retrieved passage; truncate inputs that exceed the model's context."""
    summary = summarizer(text, max_length=130, min_length=30, do_sample=False, truncation=True)
    return summary[0]['summary_text']


def handle_userinput(user_question):
    response = st.session_state.conversation({'question': user_question})

    st.session_state.chat_history = response['chat_history']
    st.session_state.retrieved_text = response['source_documents']

    # Read the sidebar setting from session_state: the checkbox widget is
    # created later in the script, so a plain global would not exist yet here.
    summarize_option = st.session_state.get("summarize_option", False)

    for i, (message, text) in enumerate(zip(st.session_state.chat_history, st.session_state.retrieved_text)):
        if i % 2 == 0:  # User messages
            st.write(user_template.replace("{{MSG}}", message.content), unsafe_allow_html=True)
        else:  # Bot messages
            st.write(bot_template.replace("{{MSG}}", message.content), unsafe_allow_html=True)
            if summarize_option and text.page_content:  # Summarize the retrieved passage if enabled
                summarized_text = summarize_text(text.page_content)
                st.write(bot_template.replace("{{MSG}}", summarized_text), unsafe_allow_html=True)
            else:
                st.write(bot_template.replace("{{MSG}}", text.page_content), unsafe_allow_html=True)



st.set_page_config(page_title="Chat with multiple PDFs",
                   page_icon=":books:")
st.write(css, unsafe_allow_html=True)

if "conversation" not in st.session_state:
    st.session_state.conversation = None
if "chat_history" not in st.session_state:
    st.session_state.chat_history = None

st.header("Chat with multiple PDFs :books:")
user_question = st.text_input("Ask a question about your documents: ")

if user_question and st.session_state.conversation is not None:
    # Ignore questions asked before any PDFs have been processed.
    handle_userinput(user_question)

with st.sidebar:
    st.subheader("Your documents")
    embedding_model_name = st.selectbox("Select embedding model", ["intfloat/multilingual-e5-large", "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"])
    # Stored under a session_state key so handle_userinput (which runs earlier
    # in the script) can read the latest value.
    summarize_option = st.checkbox("Enable Summarization", value=False, key="summarize_option")
    pdf_docs = st.file_uploader(
        "Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
    if st.button("Process"):
        with st.spinner("Processing"):
            # get pdf text
            raw_text = get_pdf_text(pdf_docs)

            # get the text chunks
            text_chunks = get_text_chunks(raw_text)

            # create vector store
            vectorstore = get_vectorstore(text_chunks, embedding_model_name)

            # create conversation chain
            st.session_state.conversation = get_conversation_chain(vectorstore, model_name)