# JB:
# LangChainDeprecationWarning: Importing embeddings from langchain is deprecated.
# Importing from langchain will no longer be supported as of langchain==0.2.0.
# Please import from langchain-community instead:
# `from langchain_community.embeddings import FastEmbedEmbeddings`.
# To install langchain-community run `pip install -U langchain-community`.
from langchain_community.embeddings import FastEmbedEmbeddings | |
import os | |
import streamlit as st | |
from langchain_groq import ChatGroq | |
from langchain_community.document_loaders import WebBaseLoader | |
# JB: | |
from langchain_community.document_loaders import PyPDFLoader | |
from langchain_community.embeddings import OllamaEmbeddings | |
# JB: | |
from langchain.embeddings import FastEmbedEmbeddings | |
from langchain_community.document_loaders import PyPDFDirectoryLoader | |
# JB: | |
# File Directory | |
# This covers how to load all documents in a directory. | |
# Under the hood, by default this uses the UnstructuredLoader. | |
from langchain_community.document_loaders import DirectoryLoader | |
from langchain_community.document_loaders import TextLoader | |
import chardet | |
from langchain_community.vectorstores import FAISS | |
# from langchain.vectorstores import Chroma | |
# from langchain_community.vectorstores import Chroma | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain.chains.combine_documents import create_stuff_documents_chain | |
from langchain_core.prompts import ChatPromptTemplate | |
from langchain.chains import create_retrieval_chain | |
import time | |
from dotenv import load_dotenv | |
# Pull variables from a local .env file (if one exists) into the environment.
load_dotenv()

# The Groq API key is a secret: it must come from the environment (or .env),
# never from a literal in source control, and it must never be echoed to
# stdout or logs.
# SECURITY: any key that was previously committed in this file should be
# treated as compromised and rotated in the Groq console.
groq_api_key = os.environ.get("GROQ_API_KEY")
if not groq_api_key:
    # Fail early with a visible message instead of letting the Groq client
    # fail later with a less obvious authentication error.
    st.error("GROQ_API_KEY is not set. Define it in the environment or in a .env file.")
    st.stop()
# Build the embeddings, load and chunk the PDFs, and create the FAISS index
# only when this session has no "vector" entry yet; the results are kept in
# st.session_state so later runs of the script can reuse them.
#
# References used while choosing the loader:
#   https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyPDFDirectoryLoader.html
#   https://python.langchain.com/docs/modules/data_connection/document_loaders/pdf#pypdf-directory
#   https://python.langchain.com/docs/modules/data_connection/document_loaders/file_directory
# See also, for an app that uses ChromaDB instead of FAISS:
#   https://github.com/vndee/local-rag-example/blob/main/rag.py
#   (Chroma.from_documents(documents=chunks, embedding=FastEmbedEmbeddings()))
if "vector" not in st.session_state:
    # JB: FastEmbedEmbeddings replaces the original OllamaEmbeddings().
    st.session_state.embeddings = FastEmbedEmbeddings()

    # JB: load every PDF matching the glob below the parent directory
    # (the original demo loaded a single web page with WebBaseLoader).
    path = '../'
    loader = PyPDFDirectoryLoader(path, glob="**/*.pdf")
    docs = loader.load()

    # Split the loaded pages into overlapping chunks for retrieval.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    documents = text_splitter.split_documents(docs)

    # FAISS.from_documents fails on an empty list (there is no embedding to
    # size the index from), so stop with a readable message instead of a
    # traceback. "vector" stays unset, so the next run retries the load.
    if not documents:
        st.error(f"No PDF text could be loaded from {path!r}; there is nothing to index.")
        st.stop()

    st.session_state.docs = docs
    st.session_state.text_splitter = text_splitter
    st.session_state.documents = documents
    st.session_state.vector = FAISS.from_documents(documents, st.session_state.embeddings)
# Previous page title, kept for reference:
# st.title("Chat with Docs - Groq Edition :) ")
st.title("Literature Based Research (LBR) - A. Unzicker and J. Bours - Chat with Docs - Groq Edition (Very Fast!) - VERSION 3 - March 8 2024")

# Groq-hosted chat model used to generate the answers.
llm = ChatGroq(
    model_name='mixtral-8x7b-32768',
    groq_api_key=groq_api_key,
    temperature=0.2,
)

# Instruction template with two placeholders: {context} and {input}.
# {input} is supplied as the "input" key when the retrieval chain is invoked.
answer_template = """
Answer the following question based only on the provided context.
Think step by step before providing a detailed answer.
I will tip you $200 if the user finds the answer helpful.
<context>
{context}
</context>
Question: {input}"""
prompt = ChatPromptTemplate.from_template(answer_template)

# Retriever over the FAISS index stored in the session state.
retriever = st.session_state.vector.as_retriever()

# Combine the LLM and prompt into a documents chain, then put the retriever in front of it.
document_chain = create_stuff_documents_chain(llm, prompt)
retrieval_chain = create_retrieval_chain(retriever, document_chain)
# Free-text question box. NOTE: this rebinds the module-level name `prompt`
# to the text the user entered.
prompt = st.text_input("Input your prompt here")

# Only query the chain once the user has entered a non-empty question.
if prompt:
    # Use perf_counter() for elapsed wall-clock time. process_time() counts
    # CPU time only and excludes time spent waiting, so it under-reports a
    # call that mostly waits on a remote LLM.
    start = time.perf_counter()
    response = retrieval_chain.invoke({"input": prompt})
    print(f"Response time: {time.perf_counter() - start}")

    # Show the generated answer.
    st.write(response["answer"])

    # Show the retrieved chunks used as context, collapsed by default.
    with st.expander("Document Similarity Search"):
        for doc in response["context"]:
            st.write(doc.page_content)
            st.write("--------------------------------")