dh-mc committed
Commit 4ae9830 · 1 Parent(s): b7ee4c3

added index for lib_search
.gitattributes CHANGED
@@ -45,3 +45,15 @@ data/smu_lib_index/2.pdf filter=lfs diff=lfs merge=lfs -text
 data/smu_lib_index/3.pdf filter=lfs diff=lfs merge=lfs -text
 data/smu_lib_index/4.pdf filter=lfs diff=lfs merge=lfs -text
 data/smu_lib_index/5.pdf filter=lfs diff=lfs merge=lfs -text
+data/smu_lib_index/1.pdf/index.faiss filter=lfs diff=lfs merge=lfs -text
+data/smu_lib_index/1.pdf/index.pkl filter=lfs diff=lfs merge=lfs -text
+data/smu_lib_index/4.pdf/index.pkl filter=lfs diff=lfs merge=lfs -text
+data/smu_lib_index/0.pdf/index.faiss filter=lfs diff=lfs merge=lfs -text
+data/smu_lib_index/2.pdf/index.faiss filter=lfs diff=lfs merge=lfs -text
+data/smu_lib_index/2.pdf/index.pkl filter=lfs diff=lfs merge=lfs -text
+data/smu_lib_index/3.pdf/index.faiss filter=lfs diff=lfs merge=lfs -text
+data/smu_lib_index/3.pdf/index.pkl filter=lfs diff=lfs merge=lfs -text
+data/smu_lib_index/4.pdf/index.faiss filter=lfs diff=lfs merge=lfs -text
+data/smu_lib_index/5.pdf/index.faiss filter=lfs diff=lfs merge=lfs -text
+data/smu_lib_index/5.pdf/index.pkl filter=lfs diff=lfs merge=lfs -text
+data/smu_lib_index/0.pdf/index.pkl filter=lfs diff=lfs merge=lfs -text
app_modules/init.py CHANGED
@@ -26,6 +26,26 @@ llm_loader = None
 qa_chain = None
 
 
+def load_vectorstore(using_faiss, index_path, embeddings):
+    start = timer()
+
+    print(f"Load index from {index_path} with {'FAISS' if using_faiss else 'Chroma'}")
+
+    if not os.path.isdir(index_path):
+        raise ValueError(f"{index_path} does not exist!")
+    elif using_faiss:
+        vectorstore = FAISS.load_local(index_path, embeddings)
+    else:
+        vectorstore = Chroma(
+            embedding_function=embeddings, persist_directory=index_path
+        )
+
+    end = timer()
+
+    print(f"Completed in {end - start:.3f}s")
+    return vectorstore
+
+
 def app_init(initQAChain: bool = True):
     global llm_loader
     global qa_chain
@@ -59,24 +79,7 @@ def app_init(initQAChain: bool = True):
 
     print(f"Completed in {end - start:.3f}s")
 
-    start = timer()
-
-    print(
-        f"Load index from {index_path} with {'FAISS' if using_faiss else 'Chroma'}"
-    )
-
-    if not os.path.isdir(index_path):
-        raise ValueError(f"{index_path} does not exist!")
-    elif using_faiss:
-        vectorstore = FAISS.load_local(index_path, embeddings)
-    else:
-        vectorstore = Chroma(
-            embedding_function=embeddings, persist_directory=index_path
-        )
-
-    end = timer()
-
-    print(f"Completed in {end - start:.3f}s")
+    vectorstore = load_vectorstore(using_faiss, index_path, embeddings)
 
     start = timer()
     llm_loader = LLMLoader(llm_model_type)
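
The refactor above only extracts the existing FAISS/Chroma loading code into a reusable helper. Below is a minimal sketch of calling that helper outside app_init; the index path, embedding model, and query are illustrative assumptions, not part of this commit.

# Sketch: load one of the per-PDF FAISS indexes committed here and query it.
# Path and query strings are assumptions for illustration only.
from langchain.embeddings import HuggingFaceInstructEmbeddings

from app_modules.init import load_vectorstore

embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")

# using_faiss=True selects the FAISS branch; the directory must contain index.faiss and index.pkl
vectorstore = load_vectorstore(True, "data/smu_lib_index/0.pdf", embeddings)

for doc in vectorstore.similarity_search("library opening hours", k=4):
    print(doc.metadata, doc.page_content[:80])
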
data/smu_lib_index/0.pdf/index.faiss ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2b3d8640d878840dd03569beb3de62a0e7db3c69e602c4af94e7f2b4aa7437f1
+size 1665069
data/smu_lib_index/0.pdf/index.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc7901a5deeb83aa9042c9d530af0dd4a71d8957ab6967c077fb85afb7e99ed1
+size 495066
data/smu_lib_index/1.pdf/index.faiss ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e033249266139624933e8caaa3e477aba0ab95246e8185099b8d6424cfd760e3
+size 2623533
data/smu_lib_index/1.pdf/index.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3dd94b10a724cd057b022ff1f1cc986138912e87f285b9f20295d0a4c6fa66f8
+size 754144
data/smu_lib_index/2.pdf/index.faiss ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:add4ca6be98c0cced1807a6f8857273aa603af709688903082bac8a135f176bb
+size 113709
data/smu_lib_index/2.pdf/index.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:00e5b92d42a46ffd810a6700e31ddb6d25e02d634023b6d098172c9580eecb20
+size 35980
data/smu_lib_index/3.pdf/index.faiss ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:32c40ae8cf525f67b006d0086904bc59620c744ae69f36db90fdd3c4ae01c892
+size 110637
data/smu_lib_index/3.pdf/index.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0209fd9dfe7e368a6554f9acab7b66d1c56f54101ed135b91d001211a551d4a2
+size 35875
data/smu_lib_index/4.pdf/index.faiss ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4c12431becadaed99d1b5555540cb0a721838dd0d9d52dfb200fda2ee3eaf280
+size 76845
data/smu_lib_index/4.pdf/index.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:430b96fb402101066564b1c8693cb92c18b4f1d42bd8c8e6d90d9c374a5a1b3d
+size 24951
data/smu_lib_index/5.pdf/index.faiss ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bbd32f3e66f36b4bbedd2bda520375c7ae37704b49fc65746ebba0d793cecde9
+size 119853
data/smu_lib_index/5.pdf/index.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d1143d4ced259bbc09282f12f39e78b266ae06545b7bfb1fd0758df7f20ff0e0
+size 39268
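
The index.faiss and index.pkl entries above are Git LFS pointer files (version, oid, size), so the real binaries only appear after an LFS checkout. The sketch below, with a hypothetical is_lfs_pointer helper, checks whether the files under data/smu_lib_index/ are still pointers or have been materialized.

# Sketch: flag index files that are still LFS pointers rather than real binaries.
# is_lfs_pointer is a hypothetical helper; the pointer format matches the entries above.
import os

INDEX_ROOT = "data/smu_lib_index"


def is_lfs_pointer(path: str) -> bool:
    # Pointer files are ~130 bytes and start with the "version https://git-lfs..." line.
    if os.path.getsize(path) > 1024:
        return False
    with open(path, "rb") as f:
        return f.read(7) == b"version"


for dirpath, _dirs, files in os.walk(INDEX_ROOT):
    for name in files:
        if name in ("index.faiss", "index.pkl"):
            full = os.path.join(dirpath, name)
            print(full, "LFS pointer, run `git lfs pull`" if is_lfs_pointer(full) else "ok")
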
ingest-lib-search.py ADDED
@@ -0,0 +1,119 @@
+# setting device on GPU if available, else CPU
+import os
+from timeit import default_timer as timer
+from typing import List
+
+from langchain.document_loaders import DirectoryLoader
+from langchain.document_loaders import PyPDFDirectoryLoader
+
+from langchain.embeddings import HuggingFaceInstructEmbeddings
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.vectorstores.base import VectorStore
+from langchain.vectorstores.chroma import Chroma
+from langchain.vectorstores.faiss import FAISS
+
+from app_modules.init import *
+
+
+def load_documents(source_path) -> List:
+    loader = PyPDFDirectoryLoader(source_path, silent_errors=True)
+    documents = loader.load()
+
+    loader = DirectoryLoader(
+        source_path, glob="**/*.html", silent_errors=True, show_progress=True
+    )
+    documents.extend(loader.load())
+    return documents
+
+
+def split_chunks(documents: List, chunk_size, chunk_overlap) -> List:
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=chunk_size, chunk_overlap=chunk_overlap
+    )
+    return text_splitter.split_documents(documents)
+
+
+def generate_index(
+    filename: str, chunks: List, embeddings: HuggingFaceInstructEmbeddings
+) -> VectorStore:
+    full_path = index_path + filename + "/"
+    os.mkdir(full_path)
+
+    if using_faiss:
+        faiss_instructor_embeddings = FAISS.from_documents(
+            documents=chunks, embedding=embeddings
+        )
+
+        faiss_instructor_embeddings.save_local(full_path)
+        return faiss_instructor_embeddings
+    else:
+        chromadb_instructor_embeddings = Chroma.from_documents(
+            documents=chunks, embedding=embeddings, persist_directory=full_path
+        )
+
+        chromadb_instructor_embeddings.persist()
+        return chromadb_instructor_embeddings
+
+
+# Constants
+device_type, hf_pipeline_device_type = get_device_types()
+hf_embeddings_model_name = (
+    os.environ.get("HF_EMBEDDINGS_MODEL_NAME") or "hkunlp/instructor-xl"
+)
+index_path = os.environ.get("FAISS_INDEX_PATH_PDFS") or os.environ.get(
+    "CHROMADB_INDEX_PATH_PDFS"
+)
+using_faiss = os.environ.get("FAISS_INDEX_PATH_PDFS") is not None
+source_path = os.environ.get("SOURCE_PDFS_PATH")
+chunk_size = os.environ.get("CHUNCK_SIZE")
+chunk_overlap = os.environ.get("CHUNK_OVERLAP")
+
+start = timer()
+embeddings = HuggingFaceInstructEmbeddings(
+    model_name=hf_embeddings_model_name, model_kwargs={"device": device_type}
+)
+end = timer()
+
+print(f"Completed in {end - start:.3f}s")
+
+start = timer()
+
+if not os.path.isdir(index_path):
+    print(
+        f"The index persist directory {index_path} is not present. Creating a new one."
+    )
+    os.mkdir(index_path)
+
+    print(f"Loading PDF & HTML files from {source_path}")
+    sources = load_documents(source_path)
+    print(sources[2])
+
+    print(f"Splitting {len(sources)} PDF pages into chunks ...")
+
+    current_file = None
+    docs = []
+    index = 0
+    for index, doc in enumerate(sources):
+        filename = doc.metadata["source"].split("/")[-1]
+        # print(filename)
+        if (
+            filename != current_file
+            and current_file is not None
+            or index == len(sources) - 1
+        ):
+            chunks = split_chunks(
+                docs, chunk_size=int(chunk_size), chunk_overlap=int(chunk_overlap)
+            )
+            print(f"Generating index for {current_file} with {len(chunks)} chunks ...")
+            generate_index(current_file, chunks, embeddings)
+            docs = [doc]
+        else:
+            docs.append(doc)
+
+        current_file = filename
+else:
+    print(f"The index persist directory {index_path} is present. Quitting ...")
+
+end = timer()
+
+print(f"Completed in {end - start:.3f}s")