JBHF committed on
Commit
5d0d8f1
·
verified ·
1 Parent(s): c44cb90

Create app_BACKUP_09032024.py

Browse files
Files changed (1) hide show
  1. app_BACKUP_09032024.py +159 -0
app_BACKUP_09032024.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # JB:
2
+ # LangChainDeprecationWarning: Importing embeddings from langchain is deprecated.
3
+ # Importing from langchain will no longer be supported as of langchain==0.2.0.
4
+ # Please import from langchain-community instead:
5
+ # `from langchain_community.embeddings import FastEmbedEmbeddings`.
6
+ # To install langchain-community run `pip install -U langchain-community`.
7
+ from langchain_community.embeddings import FastEmbedEmbeddings
8
+
9
+ import os
10
+ import streamlit as st
11
+ from langchain_groq import ChatGroq
12
+ from langchain_community.document_loaders import WebBaseLoader
13
+ # JB:
14
+ from langchain_community.document_loaders import PyPDFLoader
15
+ from langchain_community.embeddings import OllamaEmbeddings
16
+
17
+ # JB:
18
+ from langchain.embeddings import FastEmbedEmbeddings
19
+ from langchain_community.document_loaders import PyPDFDirectoryLoader
20
+
21
+ # JB:
22
+ # File Directory
23
+ # This covers how to load all documents in a directory.
24
+ # Under the hood, by default this uses the UnstructuredLoader.
25
+ from langchain_community.document_loaders import DirectoryLoader
26
+ from langchain_community.document_loaders import TextLoader
27
+ import chardet
28
+
29
+ from langchain_community.vectorstores import FAISS
30
+ # from langchain.vectorstores import Chroma
31
+ # from langchain_community.vectorstores import Chroma
32
+
33
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
34
+ from langchain.chains.combine_documents import create_stuff_documents_chain
35
+ from langchain_core.prompts import ChatPromptTemplate
36
+ from langchain.chains import create_retrieval_chain
37
+ import time
38
+ from dotenv import load_dotenv
39
+
40
+ load_dotenv() #
41
+
42
+ # groq_api_key = os.environ['GROQ_API_KEY']
43
+ groq_api_key = "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" # os.environ['GROQ_API_KEY']
44
+ print("groq_api_key: ", groq_api_key)
45
+
46
+
47
+ if "vector" not in st.session_state:
48
+ # One-time setup guard: the index is built only while "vector" is absent from the session state.
49
+ # st.session_state.embeddings = OllamaEmbeddings() # ORIGINAL
50
+ st.session_state.embeddings = FastEmbedEmbeddings() # JB
51
+
52
+
53
+ # st.session_state.loader = WebBaseLoader("https://paulgraham.com/greatwork.html") # ORIGINAL
54
+ # st.session_state.docs = st.session_state.loader.load() # ORIGINAL
55
+ # https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyPDFLoader.html
56
+ # https://python.langchain.com/docs/integrations/document_loaders/merge_doc
57
+ # from langchain_community.document_loaders import PyPDFLoader
58
+ # loader_pdf = PyPDFLoader("../MachineLearning-Lecture01.pdf")
59
+ #
60
+ # https://stackoverflow.com/questions/60215731/pypdf-to-read-each-pdf-in-a-folder
61
+ #
62
+ # https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyPDFDirectoryLoader.html
63
+ # https://python.langchain.com/docs/modules/data_connection/document_loaders/pdf#pypdf-directory
64
+ # !!!!!
65
+ # PyPDF Directory
66
+ # Load PDFs from directory
67
+ # from langchain_community.document_loaders import PyPDFDirectoryLoader
68
+ # loader = PyPDFDirectoryLoader("example_data/")
69
+ # docs = loader.load()
70
+ #
71
+ # SEE ALSO:
72
+ # https://python.langchain.com/docs/modules/data_connection/document_loaders/pdf#using-pypdf
73
+ # Using MathPix
74
+ # Inspired by Daniel Gross's https://gist.github.com/danielgross/3ab4104e14faccc12b49200843adab21
75
+ # from langchain_community.document_loaders import MathpixPDFLoader
76
+ # loader = MathpixPDFLoader("example_data/layout-parser-paper.pdf")
77
+ # data = loader.load()
78
+ # pdf_file_path = "*.pdf" # JB
79
+ # st.session_state.loader = PyPDFLoader(file_path=pdf_file_path).load() # JB
80
+ # st.session_state.loader = PyPDFLoader(*.pdf).load() # JB syntax error *.pdf !
81
+ # st.session_state.loader = PyPDFDirectoryLoader("*.pdf") # JB PyPDFDirectoryLoader("example_data/")
82
+ # chunks = self.text_splitter.split_documents(docs)
83
+ # chunks = filter_complex_metadata(chunks)
84
+
85
+ # JB:
86
+ # https://python.langchain.com/docs/modules/data_connection/document_loaders/pdf#pypdf-directory
87
+ # st.session_state.docs = st.session_state.loader.load()
88
+ # loader = PyPDFDirectoryLoader(".")
89
+ # docs = loader.load()
90
+ # st.session_state.docs = docs
91
+
92
+ # JB:
93
+ # https://python.langchain.com/docs/modules/data_connection/document_loaders/file_directory
94
+ # text_loader_kwargs={'autodetect_encoding': True}
95
+ text_loader_kwargs={'autodetect_encoding': False}
96
+ path = '../'
97
+ # loader = DirectoryLoader(path, glob="**/*.pdf", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)
98
+ # PyPDFDirectoryLoader (TEST):
99
+ # loader = PyPDFDirectoryLoader(path, glob="**/*.pdf", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)
100
+ # loader = PyPDFDirectoryLoader(path, glob="**/*.pdf", loader_kwargs=text_loader_kwargs)
101
+ loader = PyPDFDirectoryLoader(path, glob="**/*.pdf")
102
+ docs = loader.load()
103
+ st.session_state.docs = docs
104
+
105
+ st.session_state.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
106
+ st.session_state.documents = st.session_state.text_splitter.split_documents(st.session_state.docs)
107
+ # st.session_state.vector = FAISS.from_documents(st.session_state.documents, st.session_state.embeddings) # ORIGINAL
108
+ st.session_state.vector = FAISS.from_documents(st.session_state.documents, st.session_state.embeddings) # ORIGINAL
109
+ # SEE:
110
+ # SEE THIS FOR AN APP USING CHROMADB:
111
+ # https://github.com/vndee/local-rag-example/blob/main/rag.py
112
+ # https://raw.githubusercontent.com/vndee/local-rag-example/main/rag.py
113
+ # Chroma.from_documents(documents=chunks, embedding=FastEmbedEmbeddings())
114
+ # st.session_state.vector = Chroma.from_documents(st.session_state.documents, st.session_state.embeddings) # JB
115
+
116
+
117
+
118
+ # st.title("Chat with Docs - Groq Edition :) ")
119
+ st.title("Literature Based Research (LBR) - A. Unzicker and J. Bours - Chat with Docs - Groq Edition (Very Fast!) - VERSION 3 - March 8 2024")
120
+
121
+ llm = ChatGroq(
122
+ groq_api_key=groq_api_key,
123
+ model_name='mixtral-8x7b-32768'
124
+ )
125
+
126
+ prompt = ChatPromptTemplate.from_template("""
127
+ Answer the following question based only on the provided context.
128
+ Think step by step before providing a detailed answer.
129
+ I will tip you $200 if the user finds the answer helpful.
130
+ <context>
131
+ {context}
132
+ </context>
133
+ Question: {input}""")
134
+
135
+ document_chain = create_stuff_documents_chain(llm, prompt)
136
+
137
+ retriever = st.session_state.vector.as_retriever()
138
+ retrieval_chain = create_retrieval_chain(retriever, document_chain)
139
+
140
+ prompt = st.text_input("Input your prompt here")
141
+
142
+
143
+ # If the user hits enter
144
+ if prompt:
145
+ # Then pass the prompt to the LLM
146
+ start = time.process_time()
147
+ response = retrieval_chain.invoke({"input": prompt})
148
+ print(f"Response time: {time.process_time() - start}")
149
+
150
+ st.write(response["answer"])
151
+
152
+ # With a streamlit expander
153
+ with st.expander("Document Similarity Search"):
154
+ # Find the relevant chunks
155
+ for i, doc in enumerate(response["context"]):
156
+ # print(doc)
157
+ # st.write(f"Source Document # {i+1} : {doc.metadata['source'].split('/')[-1]}")
158
+ st.write(doc.page_content)
159
+ st.write("--------------------------------")