ppsingh committed on
Commit
1f81843
1 Parent(s): 9ca031b

Update auditqa/doc_process.py

Browse files
Files changed (1) hide show
  1. auditqa/doc_process.py +1 -1
auditqa/doc_process.py CHANGED
@@ -27,7 +27,7 @@ def process_pdf():
27
  # text splitter based on the tokenizer of a model of your choosing
28
  # to make texts fit exactly a transformer's context window size
29
  # langchain text splitters: https://python.langchain.com/docs/modules/data_connection/document_transformers/
30
- chunk_size = 512
31
  text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
32
  AutoTokenizer.from_pretrained("BAAI/bge-small-en-v1.5"),
33
  chunk_size=chunk_size,
 
27
  # text splitter based on the tokenizer of a model of your choosing
28
  # to make texts fit exactly a transformer's context window size
29
  # langchain text splitters: https://python.langchain.com/docs/modules/data_connection/document_transformers/
30
+ chunk_size = 256
31
  text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
32
  AutoTokenizer.from_pretrained("BAAI/bge-small-en-v1.5"),
33
  chunk_size=chunk_size,