ppsingh committed on
Commit
8edd3eb
1 Parent(s): 8424008

Update auditqa/doc_process.py

Browse files
Files changed (1) hide show
  1. auditqa/doc_process.py +14 -0
auditqa/doc_process.py CHANGED
@@ -15,4 +15,18 @@ def process_pdf():
15
  docs[file] = PyMuPDFLoader(value).load()
16
  except Exception as e:
17
  print("Exception: ", e)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
 
15
  docs[file] = PyMuPDFLoader(value).load()
16
  except Exception as e:
17
  print("Exception: ", e)
18
+
19
+
20
+ # text splitter based on the tokenizer of a model of your choosing
21
+ # to make texts fit exactly a transformer's context window size
22
+ # langchain text splitters: https://python.langchain.com/docs/modules/data_connection/document_transformers/
23
+ chunk_size = 256
24
+ text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
25
+ AutoTokenizer.from_pretrained("BAAI/bge-small-en-v1.5"),
26
+ chunk_size=chunk_size,
27
+ chunk_overlap=int(chunk_size / 10),
28
+ add_start_index=True,
29
+ strip_whitespace=True,
30
+ separators=["\n\n", "\n", ".", " ", ""],
31
+ )
32