mitulagr2 committed on
Commit
f4f9ced
1 Parent(s): 5529817

Update rag.py

Browse files
Files changed (1) hide show
  1. app/rag.py +14 -12
app/rag.py CHANGED
@@ -24,6 +24,9 @@ logger = logging.getLogger(__name__)
24
 
25
  class ChatPDF:
26
  pdf_count = 0
 
 
 
27
 
28
  def __init__(self):
29
  self.text_parser = SentenceSplitter(chunk_size=1024, chunk_overlap=24)
@@ -63,28 +66,24 @@ class ChatPDF:
63
  Settings.transformations = [self.text_parser]
64
 
65
  def ingest(self, files_dir: str):
66
- text_chunks = []
67
- doc_ids = []
68
- nodes = []
69
-
70
  docs = SimpleDirectoryReader(input_dir=files_dir).load_data()
71
 
72
  logger.info("enumerating docs")
73
  for doc_idx, doc in enumerate(docs):
74
  self.pdf_count = self.pdf_count + 1
75
  curr_text_chunks = self.text_parser.split_text(doc.text)
76
- text_chunks.extend(curr_text_chunks)
77
- doc_ids.extend([doc_idx] * len(curr_text_chunks))
78
 
79
  logger.info("enumerating text_chunks")
80
- for idx, text_chunk in enumerate(text_chunks):
81
  node = TextNode(text=text_chunk)
82
- src_doc = docs[doc_ids[idx]]
83
  node.metadata = src_doc.metadata
84
- nodes.append(node)
85
 
86
  logger.info("enumerating nodes")
87
- for node in nodes:
88
  node_embedding = self.embed_model.get_text_embedding(
89
  node.get_content(metadata_mode=MetadataMode.ALL)
90
  )
@@ -94,7 +93,7 @@ class ChatPDF:
94
  storage_context = StorageContext.from_defaults(vector_store=self.vector_store)
95
  logger.info("indexing the nodes in VectorStoreIndex")
96
  index = VectorStoreIndex(
97
- nodes=nodes,
98
  storage_context=storage_context,
99
  transformations=Settings.transformations,
100
  )
@@ -110,5 +109,8 @@ class ChatPDF:
110
  return streaming_response
111
 
112
  def clear(self):
 
113
  self.pdf_count = 0
114
- self.client.delete(collection_name="rag_documents")
 
 
 
24
 
25
  class ChatPDF:
26
  pdf_count = 0
27
+ text_chunks = []
28
+ doc_ids = []
29
+ nodes = []
30
 
31
  def __init__(self):
32
  self.text_parser = SentenceSplitter(chunk_size=1024, chunk_overlap=24)
 
66
  Settings.transformations = [self.text_parser]
67
 
68
  def ingest(self, files_dir: str):
 
 
 
 
69
  docs = SimpleDirectoryReader(input_dir=files_dir).load_data()
70
 
71
  logger.info("enumerating docs")
72
  for doc_idx, doc in enumerate(docs):
73
  self.pdf_count = self.pdf_count + 1
74
  curr_text_chunks = self.text_parser.split_text(doc.text)
75
+ self.text_chunks.extend(curr_text_chunks)
76
+ self.doc_ids.extend([doc_idx] * len(curr_text_chunks))
77
 
78
  logger.info("enumerating text_chunks")
79
+ for idx, text_chunk in enumerate(self.text_chunks):
80
  node = TextNode(text=text_chunk)
81
+ self.src_doc = docs[doc_ids[idx]]
82
  node.metadata = src_doc.metadata
83
+ self.nodes.append(node)
84
 
85
  logger.info("enumerating nodes")
86
+ for node in self.nodes:
87
  node_embedding = self.embed_model.get_text_embedding(
88
  node.get_content(metadata_mode=MetadataMode.ALL)
89
  )
 
93
  storage_context = StorageContext.from_defaults(vector_store=self.vector_store)
94
  logger.info("indexing the nodes in VectorStoreIndex")
95
  index = VectorStoreIndex(
96
+ nodes=self.nodes,
97
  storage_context=storage_context,
98
  transformations=Settings.transformations,
99
  )
 
109
  return streaming_response
110
 
111
  def clear(self):
112
+ self.self.client.delete(collection_name="rag_documents", ids=doc_ids)
113
  self.pdf_count = 0
114
+ self.text_chunks = []
115
+ self.doc_ids = []
116
+ self.nodes = []