isayahc committed on
Commit
6193b2c
1 Parent(s): 5e2ffa7

Create ingest.py

Browse files
Files changed (1) hide show
  1. ingest.py +30 -0
ingest.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Ingest ``dino-types.pdf`` into a Chroma vector store.

Executed top to bottom as a script: build a BGE embedding model, load the
PDF, split it into overlapping chunks, then embed the chunks into a Chroma
collection configured with cosine distance and a persist directory of
``stores/dino_cosine``.
"""
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader

# Embedding model configuration: run on the CPU and leave the output
# vectors un-normalized.
model_name = "BAAI/bge-large-en"
model_kwargs = dict(device="cpu")
encode_kwargs = dict(normalize_embeddings=False)

embeddings = HuggingFaceBgeEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=model_name,
)

# Read the source PDF (path is relative to the working directory).
loader = PyPDFLoader(file_path="dino-types.pdf")
documents = loader.load()

# Split into chunks of at most 1000 characters, with a 100-character
# overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=100,
    chunk_size=1000,
)
texts = text_splitter.split_documents(documents)

# Embed every chunk and store it in a collection that uses cosine distance
# for its HNSW index.
vector_store = Chroma.from_documents(
    documents=texts,
    embedding=embeddings,
    persist_directory="stores/dino_cosine",
    collection_metadata={"hnsw:space": "cosine"},
)