Spaces:
Running
Running
File size: 8,352 Bytes
a4dffa4 1a930c9 26a0cad 63a91c0 5660f01 63a91c0 cc8fd4e 57e87b0 cc8fd4e 1a930c9 9237552 a0df48e 6e8591f fa99d8f 26a0cad fa99d8f a4dffa4 5660f01 1314610 5660f01 9237552 5660f01 cd0ca96 5660f01 1a930c9 5660f01 26a0cad a4dffa4 26a0cad a4dffa4 26a0cad 1a930c9 cc8fd4e 1a930c9 cc8fd4e 1a930c9 cc8fd4e 1a930c9 a4dffa4 1314610 a4dffa4 26a0cad a4dffa4 9237552 1a930c9 cc8fd4e a4dffa4 1a930c9 cc8fd4e a4dffa4 cd0ca96 a4dffa4 26a0cad 465bb25 1314610 465bb25 1a930c9 465bb25 a4dffa4 26a0cad a4dffa4 26a0cad 465bb25 cc8fd4e 1314610 cc8fd4e 9237552 cc8fd4e 465bb25 1a930c9 465bb25 1a930c9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 |
# Parts of this code were adapted from the following sources:
# https://diptimanrc.medium.com/rapid-q-a-on-multiple-pdfs-using-langchain-and-chromadb-as-local-disk-vector-store-60678328c0df
# https://stackoverflow.com/questions/76482987/chroma-database-embeddings-none-when-using-get
# https://docs.trychroma.com/embeddings/hugging-face?lang=py
# https://www.datacamp.com/tutorial/chromadb-tutorial-step-by-step-guide
# https://python.langchain.com/docs/modules/data_connection/retrievers/self_query
# https://python.langchain.com/docs/integrations/vectorstores/chroma#update-and-delete
# https://python.langchain.com/docs/modules/data_connection/retrievers/vectorstore
import chromadb
from langchain.text_splitter import CharacterTextSplitter
from langchain_text_splitters import MarkdownHeaderTextSplitter
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings.sentence_transformer import (
SentenceTransformerEmbeddings,
)
from langchain_community.embeddings import GPT4AllEmbeddings
from app.utils.utils import (
generate_uuid
)
from app.core.config import settings
# import dotenv
# import os
# dotenv.load_dotenv()
# persist_directory = os.getenv('VECTOR_DATABASE_LOCATION')
# Directory handed as `path` to every chromadb.PersistentClient created in
# this module. It is read from the project settings object; the commented-out
# lines above show an alternative lookup through dotenv / os.getenv.
persist_directory = settings.VECTOR_DATABASE_LOCATION
def read_markdown_file(file_path: str) -> str:
    """
    Read a Markdown file and return its content as a single string.

    The file is decoded as UTF-8. A leading byte-order mark (written by some
    editors) is dropped so that it cannot end up glued to the first character
    of the document, e.g. in front of a leading ``#`` header.

    Args:
        file_path (str): The path to the Markdown file.

    Returns:
        str: The content of the Markdown file as a single string.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # "utf-8-sig" decodes plain UTF-8 exactly like "utf-8" and only strips a
    # BOM when one is actually present.
    with open(file_path, "r", encoding="utf-8-sig") as file:
        return file.read()
def add_markdown_to_collection(
    markdown_file_location: str,
    collection_name: str,
    chunk_size: int,
    chunk_overlap: int,
) -> None:
    """
    Embed a Markdown file into a Chroma collection.

    The file is split on its level 1-3 headers first, then every section is
    split again with ``RecursiveCharacterTextSplitter``. Each resulting chunk
    is embedded with ``GPT4AllEmbeddings`` and stored under a fresh id from
    ``generate_uuid`` together with the chunk's metadata.

    Args:
        markdown_file_location (str): location of the markdown file
        collection_name (str): the collection the documents are added to;
            it is created when it does not exist yet
        chunk_size (int): size of the chunks to be embedded
        chunk_overlap (int): the amount of overlap between consecutive chunks
    """
    markdown_document = read_markdown_file(markdown_file_location)

    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
    ]
    markdown_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=headers_to_split_on,
    )
    md_header_splits = markdown_splitter.split_text(markdown_document)

    # Second pass: cut the header sections down to the requested chunk size.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    splits = text_splitter.split_documents(md_header_splits)

    client = chromadb.PersistentClient(
        path=persist_directory,
    )
    # If the collection already exists, we just get it back. This allows us
    # to add more data to an existing collection.
    collection = client.get_or_create_collection(
        name=collection_name,
    )

    if not splits:
        # Nothing to store; a batched add with empty lists would be an error.
        return

    # Alternative embedding backend:
    # embedding_function = SentenceTransformerEmbeddings(
    #     model_name=settings.EMBEDDING_MODEL
    # )
    embedding_function = GPT4AllEmbeddings()

    texts = [split.page_content for split in splits]
    collection.add(
        ids=[generate_uuid() for _ in splits],  # give each document a uuid
        documents=texts,  # contents of the documents
        # LangChain embedding objects are not callable; embed_documents is
        # the batch entry point and returns one vector per text.
        embeddings=embedding_function.embed_documents(texts),
        # Sections without any header carry an empty metadata dict.
        # NOTE(review): some Chroma releases reject empty metadata dicts, so
        # those entries are passed as None - confirm against the pinned
        # chromadb version.
        metadatas=[split.metadata or None for split in splits],  # type: ignore
    )
def split_by_intervals(s: str, interval: int, overlapped: int = 0) -> list:
    """
    Split a string into intervals of a given length, with optional overlapping.

    Consecutive chunks start ``interval - overlapped`` characters apart.
    Splitting stops as soon as a chunk reaches the end of the string, so no
    trailing chunk is emitted that is already fully contained in the one
    before it.

    Args:
        s: The input string.
        interval: The length of each interval. Must be positive.
        overlapped: The number of characters to overlap between intervals.
            Must satisfy ``0 <= overlapped < interval``. Default is 0.

    Returns:
        A list of substrings of 'interval' characters each; only the last one
        may be shorter. An empty input string yields an empty list.

    Raises:
        ValueError: If ``interval`` is not positive or ``overlapped`` is
            outside ``[0, interval)``.
    """
    if interval <= 0:
        raise ValueError("interval must be a positive integer")
    if not 0 <= overlapped < interval:
        # A zero or negative step would either crash range() or silently
        # return no chunks at all; a negative overlap would skip characters.
        raise ValueError("overlapped must satisfy 0 <= overlapped < interval")

    step = interval - overlapped
    length = len(s)
    result = []
    for start in range(0, length, step):
        result.append(s[start:start + interval])
        if start + interval >= length:
            # This chunk already covers the tail of the string; any further
            # chunk would only repeat characters from the overlap region.
            break
    return result
def add_pdf_to_vector_store(
    # vector_store:Chroma.from_documents,
    collection_name: str,
    pdf_file_location: str,
    text_chunk_size: int = 1000,
    text_chunk_overlap: int = 10,
) -> None:
    """
    ## Summary
    Given the location of a pdf file, chunk its contents and store the
    chunks in the given Chroma collection.

    Every page returned by ``PyPDFLoader`` is cut into pieces with
    ``split_by_intervals``; each piece keeps the metadata of the page it came
    from, is embedded with ``GPT4AllEmbeddings`` and stored under a fresh id
    from ``generate_uuid``.

    ## Arguments
    collection_name (str) : name of collection to store documents; it is
        created when it does not exist yet
    pdf_file_location (str) : location of pdf file
    text_chunk_size (int) : number of characters per chunk
    text_chunk_overlap (int) : number of characters shared by consecutive
        chunks

    ## Return
    None
    """
    pages = PyPDFLoader(pdf_file_location).load()

    split_docs: list = []
    for page in pages:
        chunks = split_by_intervals(
            page.page_content,
            text_chunk_size,
            text_chunk_overlap,
        )
        for chunk in chunks:
            split_docs.append(Document(page_content=chunk, metadata=page.metadata))

    client = chromadb.PersistentClient(
        path=persist_directory,
    )
    collection = client.get_or_create_collection(
        name=collection_name,
    )

    if not split_docs:
        # Nothing to store; a batched add with empty lists would be an error.
        return

    # Alternative embedding backend:
    # embedding_function = SentenceTransformerEmbeddings(
    #     model_name=settings.EMBEDDING_MODEL
    # )
    embedding_function = GPT4AllEmbeddings()

    texts = [doc.page_content for doc in split_docs]
    collection.add(
        ids=[generate_uuid() for _ in split_docs],  # give each document a uuid
        documents=texts,  # contents of the documents
        # LangChain embedding objects are not callable; embed_documents is
        # the batch entry point and returns one vector per text.
        embeddings=embedding_function.embed_documents(texts),
        # NOTE(review): some Chroma releases reject empty metadata dicts, so
        # empty ones are passed as None - confirm against the pinned
        # chromadb version.
        metadatas=[doc.metadata or None for doc in split_docs],  # type: ignore
    )
if __name__ == "__main__":
    # Demo: index two arXiv PDFs, then query them through LangChain.

    # This is an example collection name
    collection_name = "ArxivPapers"

    # Create the client using Chroma's library. One client is enough for
    # both the collection handle below and the LangChain wrapper.
    client = chromadb.PersistentClient(
        path=persist_directory,
    )

    # delete existing collection
    # client.delete_collection(
    #     name=collection_name,
    # )

    collection = client.get_or_create_collection(
        name=collection_name,
    )

    pdf_file_locations = [
        "/workspaces/InnovationPathfinderAI/2212.02623.pdf",
        "/workspaces/InnovationPathfinderAI/2402.17764.pdf",
        # "mydir/181000551.pdf",
    ]
    for pdf_file_location in pdf_file_locations:
        add_pdf_to_vector_store(
            collection_name=collection_name,
            pdf_file_location=pdf_file_location,
        )

    # create the open-source embedding function
    # embedding_function = SentenceTransformerEmbeddings(
    #     model_name=settings.EMBEDDING_MODEL
    # )
    embedding_function = GPT4AllEmbeddings()

    # method of integrating Chroma and Langchain
    vector_db = Chroma(
        client=client,  # client for Chroma
        collection_name=collection_name,
        embedding_function=embedding_function,
    )

    query = "ai"  # your query

    # using your Chromadb as a retriever for langchain
    retriever = vector_db.as_retriever()

    # returning a list of documents; `invoke` is the current entry point,
    # `get_relevant_documents` is deprecated in langchain-core
    docs = retriever.invoke(query)

    # example query using Chroma
    # results = collection.query(
    #     query_texts=["benchmark"],
    #     n_results=3,
    #     include=['embeddings', 'documents', 'metadatas'],
    # )