DataBob committed on
Commit
4ae7865
·
1 Parent(s): 0f7d64f

create database utils

Browse files
Files changed (1) hide show
  1. create_database.py +72 -0
create_database.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # from langchain.document_loaders import DirectoryLoader
2
+ from langchain_community.document_loaders import DirectoryLoader
3
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
4
+ from langchain.schema import Document
5
+ # from langchain.embeddings import OpenAIEmbeddings
6
+ from langchain_huggingface import HuggingFaceEmbeddings
7
+ from langchain_community.vectorstores import Chroma
8
+ import openai
9
+ from dotenv import load_dotenv
10
+ import os
11
+ import shutil
12
+ import logging
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+ # Load environment variables. Assumes that project contains .env file with API keys
17
+ load_dotenv()
18
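+ #---- OpenAI API key (not set by this script; embeddings use HuggingFaceEmbeddings)
19
+ # If you switch to OpenAI embeddings, read the key from the environment variable
20
+ # named in your .env file (for example "OPENAI_API_KEY") and assign it here.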
21
+
22
+ CHROMA_PATH = "chroma"
23
+ DATA_PATH = "data/"
24
+
25
+
26
def main():
    """Entry point: delegate to ``generate_data_store()``."""
    generate_data_store()
28
+
29
+
30
def generate_data_store():
    """Run the ingestion pipeline end to end.

    Logs a start message, then feeds the result of ``load_documents()``
    through ``split_text()`` and hands the chunks to ``save_to_chroma()``.
    """
    logger.info("Loading documents..")
    # Calls are evaluated innermost-first: load, then split, then save.
    save_to_chroma(split_text(load_documents()))
35
+
36
+
37
def load_documents():
    """Load the files in ``DATA_PATH`` that match the glob ``*.pdf``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns; its length is logged.
    """
    pdf_docs = DirectoryLoader(DATA_PATH, glob="*.pdf").load()
    found_message = "Found {:d} documents..".format(len(pdf_docs))
    logger.info(found_message)
    return pdf_docs
43
+
44
+
45
def split_text(documents: list[Document]):
    """Split documents into overlapping character chunks.

    Uses ``RecursiveCharacterTextSplitter`` with a chunk size of 1800
    characters, an overlap of 100 characters, and ``add_start_index=True``.
    Prints a summary line and one sample chunk as a sanity check.

    Args:
        documents: Documents to split.

    Returns:
        The chunks produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1800,
        chunk_overlap=100,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

    # Show one sample chunk. Small corpora can yield fewer than 11 chunks
    # (or none at all), so clamp the index instead of assuming chunks[10]
    # exists; with 11 or more chunks the sample is still chunks[10].
    if chunks:
        sample = chunks[min(10, len(chunks) - 1)]
        print(sample.page_content)
        print(sample.metadata)

    return chunks
60
+
61
+
62
def save_to_chroma(chunks: list[Document]):
    """Rebuild the Chroma store at ``CHROMA_PATH`` from ``chunks``.

    Anything already present at ``CHROMA_PATH`` is removed with
    ``shutil.rmtree`` before the new store is created.

    Args:
        chunks: Document chunks to embed and store.
    """
    # Start from a clean slate: drop whatever an earlier run left behind.
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

    # Embed the chunks and write them into a fresh store.
    embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    vector_store = Chroma.from_documents(
        chunks,
        embedding_model,
        persist_directory=CHROMA_PATH,
    )
    vector_store.persist()
    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")