Spaces:
Running
Running
# Parts of this code were adapted from the following references:
# https://diptimanrc.medium.com/rapid-q-a-on-multiple-pdfs-using-langchain-and-chromadb-as-local-disk-vector-store-60678328c0df
# https://stackoverflow.com/questions/76482987/chroma-database-embeddings-none-when-using-get
# https://docs.trychroma.com/embeddings/hugging-face?lang=py
# https://www.datacamp.com/tutorial/chromadb-tutorial-step-by-step-guide
# https://python.langchain.com/docs/modules/data_connection/retrievers/self_query
# https://python.langchain.com/docs/integrations/vectorstores/chroma#update-and-delete
# https://python.langchain.com/docs/modules/data_connection/retrievers/vectorstore
import chromadb | |
from langchain.text_splitter import CharacterTextSplitter | |
from langchain_text_splitters import MarkdownHeaderTextSplitter | |
from langchain_text_splitters import RecursiveCharacterTextSplitter | |
from langchain_core.documents import Document | |
from langchain_community.document_loaders import PyPDFLoader | |
from langchain_community.vectorstores import Chroma | |
from langchain_community.embeddings.sentence_transformer import ( | |
SentenceTransformerEmbeddings, | |
) | |
from langchain_community.embeddings import GPT4AllEmbeddings | |
from app.utils.utils import ( | |
generate_uuid | |
) | |
from app.core.config import settings | |
# Directory handed to chromadb.PersistentClient(path=...) by every function in
# this module. It is read from the application settings; an earlier revision
# loaded it from the environment instead:
#   dotenv.load_dotenv(); os.getenv('VECTOR_DATABASE_LOCATION')
persist_directory = settings.VECTOR_DATABASE_LOCATION
def read_markdown_file(file_path: str) -> str:
    """
    Load the complete text of a Markdown file.

    Args:
        file_path (str): Path of the Markdown file to read.

    Returns:
        str: The whole file, decoded as UTF-8, as one string.
    """
    with open(file_path, mode='r', encoding='utf-8') as markdown_file:
        return markdown_file.read()
def add_markdown_to_collection(
    markdown_file_location: str,
    collection_name: str,
    chunk_size: int,
    chunk_overlap: int,
) -> None:
    """
    Embed the contents of a Markdown file into a Chroma collection.

    The file is first split on its level 1-3 headers, then every section is
    split again into character chunks. All chunks are embedded with
    GPT4AllEmbeddings and added, in one batch, to the collection stored
    under ``persist_directory``.

    Args:
        markdown_file_location (str): location of the markdown file
        collection_name (str): the collection the chunks are added to; it is
            created if it does not exist yet
        chunk_size (int): maximum size of each chunk to be embedded
        chunk_overlap (int): amount of overlap between consecutive chunks

    Returns:
        None
    """
    markdown_document = read_markdown_file(markdown_file_location)

    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
    ]

    # First pass: one document per header section, headers kept as metadata.
    markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
    md_header_splits = markdown_splitter.split_text(markdown_document)

    # Second pass: cut each section down to the requested chunk size.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    splits = text_splitter.split_documents(md_header_splits)

    client = chromadb.PersistentClient(
        path=persist_directory,
    )

    # If the collection already exists, we just return it. This allows us to add more
    # data to an existing collection.
    collection = client.get_or_create_collection(
        name=collection_name,
    )

    if not splits:
        # Nothing to embed (e.g. an empty file); an empty batch must not be added.
        return

    # embedding_function = SentenceTransformerEmbeddings(
    #     model_name=settings.EMBEDDING_MODEL
    # )
    embedding_function = GPT4AllEmbeddings()

    chunk_texts = [chunk.page_content for chunk in splits]

    collection.add(
        ids=[generate_uuid() for _ in splits],  # give each chunk a uuid
        documents=chunk_texts,  # contents of the chunks
        # LangChain embedding objects are not callable; embed_documents()
        # returns one vector per input text.
        embeddings=embedding_function.embed_documents(chunk_texts),
        # Text before the first header has empty metadata, which Chroma
        # rejects as an empty dict; pass None for those chunks instead.
        metadatas=[chunk.metadata or None for chunk in splits],  # type: ignore
    )
def split_by_intervals(s: str, interval: int, overlapped: int = 0) -> list:
    """
    Split a string into intervals of a given length, with optional overlapping.

    Args:
        s: The input string.
        interval: The maximum length of each interval. Must be positive.
        overlapped: The number of characters shared by consecutive intervals.
            Must satisfy ``0 <= overlapped < interval``. Default is 0.

    Returns:
        A list of substrings of at most 'interval' characters each; the
        trailing substrings may be shorter. An empty input yields an empty list.

    Raises:
        ValueError: If 'interval' is not positive, or 'overlapped' is negative
            or not smaller than 'interval'.
    """
    if interval <= 0:
        raise ValueError(f"interval must be positive, got {interval}")
    if not 0 <= overlapped < interval:
        # A zero or negative step would either crash range() with an unclear
        # message or silently yield no chunks; a negative overlap would skip
        # characters between chunks.
        raise ValueError(
            f"overlapped must satisfy 0 <= overlapped < interval, "
            f"got overlapped={overlapped}, interval={interval}"
        )

    step = interval - overlapped
    return [s[start:start + interval] for start in range(0, len(s), step)]
def add_pdf_to_vector_store(
    # vector_store:Chroma.from_documents,
    collection_name: str,
    pdf_file_location: str,
    text_chunk_size=1000,
    text_chunk_overlap=10,
) -> None:
    """
    ## Summary
    Given the location of a PDF file, chunk its contents, embed the chunks
    with GPT4AllEmbeddings and store them in the given Chroma collection.

    ## Arguments
    collection_name (str) : name of the collection to store the chunks in;
        it is created if it does not exist yet
    pdf_file_location (str) : location of the PDF file
    text_chunk_size (int) : number of characters per chunk
    text_chunk_overlap (int) : number of characters shared by consecutive chunks

    ## Return
    None
    """
    # PyPDFLoader yields one Document per page.
    pages = PyPDFLoader(pdf_file_location).load()

    split_docs: list[Document] = []
    for page in pages:
        page_chunks = split_by_intervals(
            page.page_content,
            text_chunk_size,
            text_chunk_overlap,
        )
        # Every chunk keeps the metadata of the page it was cut from.
        split_docs.extend(
            Document(page_content=chunk, metadata=page.metadata)
            for chunk in page_chunks
        )

    client = chromadb.PersistentClient(
        path=persist_directory,
    )

    collection = client.get_or_create_collection(
        name=collection_name,
    )

    if not split_docs:
        # No extractable text (e.g. a scanned PDF); an empty batch must not be added.
        return

    # embedding_function = SentenceTransformerEmbeddings(
    #     model_name=settings.EMBEDDING_MODEL
    # )
    embedding_function = GPT4AllEmbeddings()

    chunk_texts = [doc.page_content for doc in split_docs]

    collection.add(
        ids=[generate_uuid() for _ in split_docs],  # give each chunk a uuid
        documents=chunk_texts,  # contents of the chunks
        # LangChain embedding objects are not callable; embed_documents()
        # returns one vector per input text.
        embeddings=embedding_function.embed_documents(chunk_texts),
        # Chroma rejects empty metadata dicts, so pass None in that case.
        metadatas=[doc.metadata or None for doc in split_docs],  # type: ignore
    )
if __name__ == "__main__":
    # Demo run: load two arXiv PDFs into a collection, then query the
    # collection through LangChain's Chroma wrapper.
    collection_name = "ArxivPapers"

    client = chromadb.PersistentClient(path=persist_directory)

    # To start from an empty collection, delete the existing one first:
    # client.delete_collection(name=collection_name)

    collection = client.get_or_create_collection(name=collection_name)

    for pdf_file_location in (
        "/workspaces/InnovationPathfinderAI/2212.02623.pdf",
        "/workspaces/InnovationPathfinderAI/2402.17764.pdf",
    ):
        add_pdf_to_vector_store(
            collection_name="ArxivPapers",
            pdf_file_location=pdf_file_location,
        )

    # Create the client using Chroma's library.
    client = chromadb.PersistentClient(path=persist_directory)

    # This is an example collection name.
    collection_name = "ArxivPapers"

    # Open-source embedding function; the SentenceTransformerEmbeddings
    # alternative would be:
    # embedding_function = SentenceTransformerEmbeddings(
    #     model_name=settings.EMBEDDING_MODEL
    # )
    embedding_function = GPT4AllEmbeddings()

    # Integrate Chroma and LangChain: wrap the client as a vector store.
    vector_db = Chroma(
        client=client,
        collection_name=collection_name,
        embedding_function=embedding_function,
    )

    query = "ai"  # your query

    # Use the Chroma store as a LangChain retriever and fetch a list of documents.
    retriever = vector_db.as_retriever()
    docs = retriever.get_relevant_documents(query)

    # Other example inputs:
    # pdf_file_location = "mydir/181000551.pdf"
    # pdf_file_location = "/workspaces/InnovationPathfinderAI/2402.17764.pdf"

    # Example query using Chroma directly:
    # results = collection.query(
    #     query_texts=["benchmark"],
    #     n_results=3,
    #     include=['embeddings', 'documents', 'metadatas'],
    # )