Improve search results by measuring chunk length with a tokenizer and by removing the table of contents (TOC) from the PDF.
Browse files
app.py
CHANGED
@@ -6,27 +6,30 @@ from langchain.embeddings import HuggingFaceEmbeddings
|
|
6 |
from langchain.llms import HuggingFacePipeline
|
7 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
8 |
from langchain.vectorstores import FAISS
|
|
|
9 |
|
10 |
|
11 |
# Number of search results to query from the vector database.
|
12 |
-
SIMILARITY_SEARCH_COUNT =
|
13 |
|
14 |
-
# Size of each document chunk in number of
|
15 |
-
CHUNK_SIZE =
|
16 |
|
17 |
-
# Chunk overlap in number of
|
18 |
-
CHUNK_OVERLAP =
|
19 |
|
20 |
# Maximum number of output tokens.
|
21 |
MODEL_MAX_LENGTH = 500
|
22 |
|
23 |
|
24 |
print("Loading documents")
|
25 |
-
loader = PyMuPDFLoader("rdna3
|
26 |
documents = loader.load()
|
27 |
|
28 |
print("Creating chunks")
|
29 |
-
|
|
|
|
|
30 |
chunks = splitter.split_documents(documents)
|
31 |
|
32 |
print("Creating database")
|
|
|
6 |
from langchain.llms import HuggingFacePipeline
|
7 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
8 |
from langchain.vectorstores import FAISS
|
9 |
+
from transformers import GPT2TokenizerFast
|
10 |
|
11 |
|
12 |
# Number of search results to query from the vector database.
|
13 |
+
SIMILARITY_SEARCH_COUNT = 8
|
14 |
|
15 |
+
# Size of each document chunk in number of tokens.
|
16 |
+
CHUNK_SIZE = 100
|
17 |
|
18 |
+
# Chunk overlap in number of tokens.
|
19 |
+
CHUNK_OVERLAP = 10
|
20 |
|
21 |
# Maximum number of output tokens.
|
22 |
MODEL_MAX_LENGTH = 500
|
23 |
|
24 |
|
25 |
print("Loading documents")
|
26 |
+
loader = PyMuPDFLoader("rdna3.pdf")
|
27 |
documents = loader.load()
|
28 |
|
29 |
print("Creating chunks")
|
30 |
+
tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
|
31 |
+
splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
|
32 |
+
tokenizer, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
|
33 |
chunks = splitter.split_documents(documents)
|
34 |
|
35 |
print("Creating database")
|
rdna3-shader-instruction-set-architecture-feb-2023_0.pdf → rdna3.pdf
RENAMED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:81348dff61fc7b4912d86102a709a5a620819ee6d3e773a1da7a8d4c433fa45c
|
3 |
+
size 3053930
|
requirements.txt
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
langchain==0.0.162
|
2 |
faiss-cpu==1.7.4
|
3 |
huggingface-hub==0.14.1
|
4 |
-
sentence-transformers==2.2.2
|
5 |
protobuf==3.20.1
|
6 |
pymupdf==1.22.2
|
|
|
|
|
|
1 |
langchain==0.0.162
|
2 |
faiss-cpu==1.7.4
|
3 |
huggingface-hub==0.14.1
|
|
|
4 |
protobuf==3.20.1
|
5 |
pymupdf==1.22.2
|
6 |
+
sentence-transformers==2.2.2
|
7 |
+
transformers==4.28.1
|