ppsingh committed on
Commit
e8fe387
1 Parent(s): aaf6c15

Update auditqa/doc_process.py

Browse files
Files changed (1) hide show
  1. auditqa/doc_process.py +3 -3
auditqa/doc_process.py CHANGED
@@ -31,7 +31,7 @@ def process_pdf():
31
  # langchain text splitters: https://python.langchain.com/docs/modules/data_connection/document_transformers/
32
  chunk_size = 256
33
  text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
34
- AutoTokenizer.from_pretrained("BAAI/bge-small-en-v1.5"),
35
  chunk_size=chunk_size,
36
  chunk_overlap=10,
37
  add_start_index=True,
@@ -78,7 +78,7 @@ def process_pdf():
78
  embeddings = HuggingFaceEmbeddings(
79
  model_kwargs = {'device': device},
80
  encode_kwargs = {'normalize_embeddings': True},
81
- model_name="BAAI/bge-small-en-v1.5"
82
  )
83
  # placeholder for collection
84
  qdrant_collections = {}
@@ -102,7 +102,7 @@ def get_local_qdrant():
102
  embeddings = HuggingFaceEmbeddings(
103
  model_kwargs = {'device': device},
104
  encode_kwargs = {'normalize_embeddings': True},
105
- model_name="BAAI/bge-small-en-v1.5")
106
  list_ = ['Consolidated','District','Ministry','allreports']
107
  for val in list_:
108
  client = QdrantClient(path=f"./data/{val}")
 
31
  # langchain text splitters: https://python.langchain.com/docs/modules/data_connection/document_transformers/
32
  chunk_size = 256
33
  text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
34
+ AutoTokenizer.from_pretrained("BAAI/bge-en-icl"),
35
  chunk_size=chunk_size,
36
  chunk_overlap=10,
37
  add_start_index=True,
 
78
  embeddings = HuggingFaceEmbeddings(
79
  model_kwargs = {'device': device},
80
  encode_kwargs = {'normalize_embeddings': True},
81
+ model_name="BAAI/bge-en-icl"
82
  )
83
  # placeholder for collection
84
  qdrant_collections = {}
 
102
  embeddings = HuggingFaceEmbeddings(
103
  model_kwargs = {'device': device},
104
  encode_kwargs = {'normalize_embeddings': True},
105
+ model_name="BAAI/bge-en-icl")
106
  list_ = ['Consolidated','District','Ministry','allreports']
107
  for val in list_:
108
  client = QdrantClient(path=f"./data/{val}")