import os
import tempfile

from dotenv import load_dotenv
import streamlit as st
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader, CSVLoader
# Load environment variables
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
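# Guard against a missing key so the failure surfaces at startup rather than on
# the first API call (a small defensive sketch; the app otherwise assumes the
# key is always present in .env):
if not api_key:
    raise RuntimeError("OPENAI_API_KEY not found; add it to your .env file.")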
# Custom template to guide the LLM
custom_template = """
<s>[INST] You will start the conversation by greeting the user and introducing yourself as an expert PDF document analyst and assistant,
stating your availability for assistance. Your next step will depend on the user's response.
If the user asks for help with a PDF, DOCX, TXT, or CSV file, ask them to describe their question.
However, if the user asks questions outside the context of the knowledge base, immediately thank them,
say goodbye, and end the conversation. Base your responses on the user's needs, providing accurate and
concise information about the data in the knowledge base. Keep your interactions professional and
focused, ensuring the user's queries are addressed efficiently without deviating from the set flows.
CHAT HISTORY: {chat_history}
QUESTION: {question}
ANSWER:
[/INST]
"""
CUSTOM_QUESTION_PROMPT = PromptTemplate.from_template(custom_template)
prompt_template = """<s>[INST]
Answer using the provided files stored in the knowledge base.
CONTEXT: {context}
CHAT HISTORY: {chat_history}
QUESTION: {question}
ANSWER:
[/INST]
"""
prompt = PromptTemplate(template=prompt_template,
                        input_variables=['context', 'question', 'chat_history'])
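# How the two templates are used: ConversationalRetrievalChain first runs
# CUSTOM_QUESTION_PROMPT to condense the follow-up question plus chat history
# into a standalone query, then runs `prompt` over the retrieved chunks to
# produce the final answer. Each template must expose the variables the chain
# injects at that stage ({chat_history}/{question}, and {context}/{chat_history}/{question}).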
# Function to extract text from documents
def get_document_text(uploaded_files):
    documents = []
    for uploaded_file in uploaded_files:
        # Write the upload to a temporary file so the loaders can read it from disk
        with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(uploaded_file.name)[-1]) as temp_file:
            temp_file.write(uploaded_file.read())
            temp_file_path = temp_file.name
        # Load the document based on its type
        if uploaded_file.name.endswith(".pdf"):
            loader = PyPDFLoader(temp_file_path)
            documents.extend(loader.load())
        elif uploaded_file.name.endswith(".docx"):
            # Docx2txtLoader handles .docx only; legacy .doc files are not supported
            loader = Docx2txtLoader(temp_file_path)
            documents.extend(loader.load())
        elif uploaded_file.name.endswith(".txt"):
            loader = TextLoader(temp_file_path)
            documents.extend(loader.load())
        elif uploaded_file.name.endswith(".csv"):
            loader = CSVLoader(temp_file_path)
            documents.extend(loader.load())
        else:
            st.warning(f"Unsupported file type: {uploaded_file.name}")
        # Remove the temporary file once the loader has consumed it
        os.remove(temp_file_path)
    return documents
# Split text into chunks
def get_chunks(documents):
    text_splitter = CharacterTextSplitter(separator="\n", chunk_size=600, chunk_overlap=200, length_function=len)
    chunks = [chunk for doc in documents for chunk in text_splitter.split_text(doc.page_content)]
    return chunks
# Create vectorstore
def get_vectorstore(chunks):
    embeddings = OpenAIEmbeddings()
    vectorstore = FAISS.from_texts(texts=chunks, embedding=embeddings)
    return vectorstore
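# Optional: persist the index between runs so documents need not be re-embedded
# (a sketch using LangChain's FAISS wrapper; "faiss_index" is an assumed path,
# not part of the original app):
#
#     vectorstore.save_local("faiss_index")
#     vectorstore = FAISS.load_local("faiss_index", OpenAIEmbeddings())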
# Create a conversational chain
def get_conversationchain(vectorstore):
    llm = ChatOpenAI(temperature=0.1, model_name='gpt-4o-mini')
    memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
    conversation_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 10}),
        condense_question_prompt=CUSTOM_QUESTION_PROMPT,
        memory=memory,
        combine_docs_chain_kwargs={'prompt': prompt}
    )
    return conversation_chain
# Handle user questions and update chat history
def handle_question(question):
    if not st.session_state.conversation:
        st.warning("Please process your documents first.")
        return
    # Get the response from the conversation chain
    response = st.session_state.conversation({'question': question})
    # Update chat history
    st.session_state.chat_history = response['chat_history']
    # Display chat history (memory alternates user and assistant messages)
    for i, msg in enumerate(st.session_state.chat_history):
        if i % 2 == 0:
            st.markdown(f"**You:** {msg.content}")
        else:
            st.markdown(f"**Bot:** {msg.content}")
# Main Streamlit app
def main():
    st.set_page_config(page_title="Chat with Documents", page_icon="π")
    st.title("π Chat with Your Documents")
    st.sidebar.title("Upload Your Files")
    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None
    # File uploader, restricted to the types the loaders support
    uploaded_files = st.sidebar.file_uploader("Upload your files (PDF, DOCX, TXT, CSV):",
                                              type=["pdf", "docx", "txt", "csv"],
                                              accept_multiple_files=True)
    # Process button
    if st.sidebar.button("Process Documents"):
        if uploaded_files:
            with st.spinner("Processing documents..."):
                # Extract text, chunk it, embed it, and build the conversation chain
                raw_documents = get_document_text(uploaded_files)
                text_chunks = get_chunks(raw_documents)
                vectorstore = get_vectorstore(text_chunks)
                st.session_state.conversation = get_conversationchain(vectorstore)
            st.success("Documents processed successfully!")
        else:
            st.warning("Please upload at least one document.")
    # User input
    question = st.text_input("Ask a question about the uploaded documents:")
    if question:
        handle_question(question)

if __name__ == '__main__':
    main()
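# To run locally (assuming this file is saved as app.py and OPENAI_API_KEY is
# set in a .env file alongside it; package names are a best guess for the
# LangChain version these imports target):
#
#     pip install streamlit python-dotenv langchain openai faiss-cpu pypdf docx2txt tiktoken
#     streamlit run app.py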