ppsingh committed on
Commit
376f540
1 Parent(s): ccf8ca1

Delete auditqa/doc_process_0.py

Browse files
Files changed (1) hide show
  1. auditqa/doc_process_0.py +0 -76
auditqa/doc_process_0.py DELETED
@@ -1,76 +0,0 @@
1
- import glob
2
- import os
3
- from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
4
- from transformers import AutoTokenizer
5
- from torch import cuda
6
- from langchain_community.document_loaders import PyMuPDFLoader
7
- from langchain_community.embeddings import HuggingFaceEmbeddings, HuggingFaceInferenceAPIEmbeddings
8
- from langchain_community.vectorstores import Qdrant
9
# Run the embedding model on the GPU when torch reports one, otherwise on CPU.
device = 'cpu'
if cuda.is_available():
    device = 'cuda'

# Optional Hugging Face token loading from a .env file (currently disabled):
#from dotenv import load_dotenv
#load_dotenv()
#HF_token = os.environ["HF_TOKEN"]

# Directory that holds the source PDF reports.
path_to_data = "./data/"
15
-
16
-
17
def process_pdf() -> dict:
    """Load the report PDFs, chunk them, and index them in in-memory Qdrant.

    Three hard-coded PDFs under ``./data/`` are read with ``PyMuPDFLoader``
    and split with a ``RecursiveCharacterTextSplitter`` whose length function
    is the ``BAAI/bge-small-en-v1.5`` tokenizer (target chunk size 256,
    overlap 10). Every chunk is tagged with ``source`` (the file key) and
    ``year`` (the last four characters of that key), then grouped into the
    ``'Consolidated'`` and ``'MWTS'`` report families. Each non-empty group
    is embedded with the same model and stored in its own in-memory Qdrant
    collection.

    Returns:
        A dict mapping the group name (``'Consolidated'``, ``'MWTS'``) to
        its ``Qdrant`` vector store. A group that ended up with no chunks
        (for example because its PDFs could not be loaded) is left out.
    """
    files = {
        'MWTS2021': './data/MWTS2021.pdf',
        'MWTS2022': './data/MWTS2022.pdf',
        'Consolidated2021': './data/Consolidated2021.pdf',
    }

    # Best effort: a PDF that cannot be loaded is reported and skipped so the
    # remaining reports are still processed.
    docs = {}
    for file, value in files.items():
        try:
            docs[file] = PyMuPDFLoader(value).load()
        except Exception as e:
            print("Exception: ", e)

    # text splitter based on the tokenizer of a model of your choosing
    # to make texts fit exactly a transformer's context window size
    # langchain text splitters: https://python.langchain.com/docs/modules/data_connection/document_transformers/
    chunk_size = 256
    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        AutoTokenizer.from_pretrained("BAAI/bge-small-en-v1.5"),
        chunk_size=chunk_size,
        chunk_overlap=10,
        add_start_index=True,
        strip_whitespace=True,
        separators=["\n\n", "\n"],
    )

    all_documents = {'Consolidated': [], 'MWTS': []}

    for file, value in docs.items():
        doc_processed = text_splitter.split_documents(value)
        for doc in doc_processed:
            doc.metadata["source"] = file
            doc.metadata["year"] = file[-4:]
        # A file belongs to every group whose name occurs in its key; its
        # chunks are added straight to the flat per-group list.
        for key in all_documents:
            if key in file:
                print(key)
                all_documents[key].extend(doc_processed)

    embeddings = HuggingFaceEmbeddings(
        model_kwargs={'device': device},
        encode_kwargs={'normalize_embeddings': True},
        model_name="BAAI/bge-small-en-v1.5",
    )

    qdrant_collections = {}

    for file, value in all_documents.items():
        if not value:
            # Nothing to index for this group. Building a collection from
            # zero documents is expected to fail inside Qdrant.from_documents
            # (vector size is inferred from the first embedding).
            # NOTE(review): confirm against the installed langchain version.
            print("no documents for:", file)
            continue
        print("emebddings for:", file)
        qdrant_collections[file] = Qdrant.from_documents(
            value,
            embeddings,
            location=":memory:",
            collection_name=file,
        )
    print("done")
    return qdrant_collections