# app.py 12-04-2024, 19:45 CET
#
# ChatGPT
# To convert the code from a Chainlit app to a Streamlit app, you'll need to make several modifications. 
# Here's the modified code for a Streamlit app:
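#
# Assumed prerequisites (inferred from the imports below): streamlit, langchain,
# langchain-community, langchain-groq, chromadb, fastembed and pypdf installed,
# plus a GROQ_API_KEY environment variable for the Groq-hosted model.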

import os
import streamlit as st
from langchain_community.embeddings import FastEmbedEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.chains import ConversationalRetrievalChain
from langchain_community.document_loaders import PyPDFLoader
from langchain_groq import ChatGroq
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.docstore.document import Document
from langchain.memory import ChatMessageHistory, ConversationBufferMemory

st.title("Chat App")
st.write("Upload a PDF file to begin!")

# The Groq API key is read from the environment; the chain below needs it.
groq_api_key = os.environ.get("GROQ_API_KEY")

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

system_template = """Use the following pieces of context to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
ALWAYS return a "SOURCES" part in your answer.
The "SOURCES" part should be a reference to the source of the document from which you got your answer.
And if the user greets with greetings like Hi, hello, How are you, etc reply accordingly as well.
Example of your response should be:
The answer is foo
SOURCES: xyz
Begin!
----------------
{summaries}"""

messages = [
    SystemMessagePromptTemplate.from_template(system_template),
    HumanMessagePromptTemplate.from_template("{question}"),
]

prompt = ChatPromptTemplate.from_messages(messages)
# Passed to the combine-documents ("stuff") chain; document_variable_name must
# match the {summaries} placeholder used in the system prompt above.
chain_type_kwargs = {"prompt": prompt, "document_variable_name": "summaries"}


def process_file(file):
    # Persist the upload to disk so PyPDFLoader can read it from a path.
    with open(file.name, "wb") as f:
        f.write(file.read())

    pypdf_loader = PyPDFLoader(file.name)
    texts = pypdf_loader.load_and_split()
    texts = [text.page_content for text in texts]
    return texts


def main():
    # st.file_uploader returns a single UploadedFile (or None) unless
    # accept_multiple_files=True is passed.
    file = st.file_uploader("Upload PDF File", type="pdf", key="pdf_upload")

    if not file:
        return

    st.write(f"Processing `{file.name}`...")

    texts = process_file(file)

    # Create metadata for each chunk
    metadatas = [{"source": f"{i}-pl"} for i in range(len(texts))]

    embeddings = FastEmbedEmbeddings()
    docsearch = Chroma.from_texts(texts, embeddings, metadatas=metadatas)
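    # Note: Streamlit re-executes the whole script on every interaction, so this
    # index is rebuilt on each run; st.cache_resource or st.session_state could
    # be used to cache it.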

    message_history = ChatMessageHistory()

    memory = ConversationBufferMemory(
        memory_key="chat_history",
        output_key="answer",
        chat_memory=message_history,
        return_messages=True,
    )

    chain = ConversationalRetrievalChain.from_llm(
        ChatGroq(temperature=0.2, groq_api_key=groq_api_key, model_name='mixtral-8x7b-32768', streaming=True),
        chain_type="stuff",
        retriever=docsearch.as_retriever(),
        memory=memory,
        combine_docs_chain_kwargs=chain_type_kwargs,
        return_source_documents=True,
    )

    st.write(f"Processing `{file.name}` done. You can now ask questions!")

    # Streamlit drives the interaction itself, so the widgets are rendered once
    # per script run rather than inside an explicit loop.
    user_input = st.text_input("User Input")
    if st.button("Send") and user_input:
        res = chain.invoke({"question": user_input})
        answer = res["answer"]
        source_documents = res["source_documents"]

        text_elements = []

        if source_documents:
            for source_idx, source_doc in enumerate(source_documents):
                source_name = f"source_{source_idx}"
                text_elements.append(
                    Document(page_content=source_doc.page_content, metadata={"source": source_name})
                )
            source_names = [text_el.metadata["source"] for text_el in text_elements]

            if source_names:
                answer += f"\nSources: {', '.join(source_names)}"
            else:
                answer += "\nNo sources found"

        st.write(answer)
        for source_doc in source_documents:
            st.write(source_doc.page_content)
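

# Entry point: build the UI when the script is executed, e.g. via
# `streamlit run app.py`.
if __name__ == "__main__":
    main()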