ppsingh committed on
Commit
3ab64ac
1 Parent(s): be6235f

Create doc_process.py

Browse files
Files changed (1) hide show
  1. auditqa/doc_process.py +18 -0
auditqa/doc_process.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import glob
2
+ import os
3
+ from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
4
+ from transformers import AutoTokenizer
5
+ from langchain_community.document_loaders import PyMuPDFLoader
6
+ path_to_data = "./data/"
7
+
8
+
9
def process_pdf():
    """Load the hard-coded report PDFs and return what was loaded.

    Each file is opened with ``PyMuPDFLoader`` and the result of its
    ``load()`` call is stored under that file's short key. A file that
    fails to load is reported on stdout and left out of the result, so
    the remaining files are still processed.

    Returns:
        dict: Maps each key ('ABC', 'XYZ') whose PDF loaded successfully
        to the value returned by ``PyMuPDFLoader(path).load()``. Empty
        when no file could be loaded.
    """
    files = {
        'ABC': './data/MWTS2021.pdf',
        'XYZ': './data/Consolidated2021.pdf',
    }
    docs = {}
    for name, pdf_path in files.items():
        try:
            docs[name] = PyMuPDFLoader(pdf_path).load()
        except Exception as e:
            # Deliberately best-effort: report the failure and keep going
            # so one unreadable file does not abort the others.
            print("Exception: ", e)
    # Hand the loaded documents back to the caller; previously they were
    # built and then discarded when the function returned None.
    return docs