Spaces:
Runtime error
Runtime error
Update rag.py
Browse files- app/rag.py +14 -12
app/rag.py
CHANGED
@@ -24,6 +24,9 @@ logger = logging.getLogger(__name__)
|
|
24 |
|
25 |
class ChatPDF:
|
26 |
pdf_count = 0
|
|
|
|
|
|
|
27 |
|
28 |
def __init__(self):
|
29 |
self.text_parser = SentenceSplitter(chunk_size=1024, chunk_overlap=24)
|
@@ -63,28 +66,24 @@ class ChatPDF:
|
|
63 |
Settings.transformations = [self.text_parser]
|
64 |
|
65 |
def ingest(self, files_dir: str):
|
66 |
-
text_chunks = []
|
67 |
-
doc_ids = []
|
68 |
-
nodes = []
|
69 |
-
|
70 |
docs = SimpleDirectoryReader(input_dir=files_dir).load_data()
|
71 |
|
72 |
logger.info("enumerating docs")
|
73 |
for doc_idx, doc in enumerate(docs):
|
74 |
self.pdf_count = self.pdf_count + 1
|
75 |
curr_text_chunks = self.text_parser.split_text(doc.text)
|
76 |
-
text_chunks.extend(curr_text_chunks)
|
77 |
-
doc_ids.extend([doc_idx] * len(curr_text_chunks))
|
78 |
|
79 |
logger.info("enumerating text_chunks")
|
80 |
-
for idx, text_chunk in enumerate(text_chunks):
|
81 |
node = TextNode(text=text_chunk)
|
82 |
-
src_doc = docs[doc_ids[idx]]
|
83 |
node.metadata = src_doc.metadata
|
84 |
-
nodes.append(node)
|
85 |
|
86 |
logger.info("enumerating nodes")
|
87 |
-
for node in nodes:
|
88 |
node_embedding = self.embed_model.get_text_embedding(
|
89 |
node.get_content(metadata_mode=MetadataMode.ALL)
|
90 |
)
|
@@ -94,7 +93,7 @@ class ChatPDF:
|
|
94 |
storage_context = StorageContext.from_defaults(vector_store=self.vector_store)
|
95 |
logger.info("indexing the nodes in VectorStoreIndex")
|
96 |
index = VectorStoreIndex(
|
97 |
-
nodes=nodes,
|
98 |
storage_context=storage_context,
|
99 |
transformations=Settings.transformations,
|
100 |
)
|
@@ -110,5 +109,8 @@ class ChatPDF:
|
|
110 |
return streaming_response
|
111 |
|
112 |
def clear(self):
|
|
|
113 |
self.pdf_count = 0
|
114 |
-
self.
|
|
|
|
|
|
24 |
|
25 |
class ChatPDF:
|
26 |
pdf_count = 0
|
27 |
+
text_chunks = []
|
28 |
+
doc_ids = []
|
29 |
+
nodes = []
|
30 |
|
31 |
def __init__(self):
|
32 |
self.text_parser = SentenceSplitter(chunk_size=1024, chunk_overlap=24)
|
|
|
66 |
Settings.transformations = [self.text_parser]
|
67 |
|
68 |
def ingest(self, files_dir: str):
|
|
|
|
|
|
|
|
|
69 |
docs = SimpleDirectoryReader(input_dir=files_dir).load_data()
|
70 |
|
71 |
logger.info("enumerating docs")
|
72 |
for doc_idx, doc in enumerate(docs):
|
73 |
self.pdf_count = self.pdf_count + 1
|
74 |
curr_text_chunks = self.text_parser.split_text(doc.text)
|
75 |
+
self.text_chunks.extend(curr_text_chunks)
|
76 |
+
self.doc_ids.extend([doc_idx] * len(curr_text_chunks))
|
77 |
|
78 |
logger.info("enumerating text_chunks")
|
79 |
+
for idx, text_chunk in enumerate(self.text_chunks):
|
80 |
node = TextNode(text=text_chunk)
|
81 |
+
self.src_doc = docs[doc_ids[idx]]
|
82 |
node.metadata = src_doc.metadata
|
83 |
+
self.nodes.append(node)
|
84 |
|
85 |
logger.info("enumerating nodes")
|
86 |
+
for node in self.nodes:
|
87 |
node_embedding = self.embed_model.get_text_embedding(
|
88 |
node.get_content(metadata_mode=MetadataMode.ALL)
|
89 |
)
|
|
|
93 |
storage_context = StorageContext.from_defaults(vector_store=self.vector_store)
|
94 |
logger.info("indexing the nodes in VectorStoreIndex")
|
95 |
index = VectorStoreIndex(
|
96 |
+
nodes=self.nodes,
|
97 |
storage_context=storage_context,
|
98 |
transformations=Settings.transformations,
|
99 |
)
|
|
|
109 |
return streaming_response
|
110 |
|
111 |
def clear(self):
    """Remove the ingested entries from the store and reset ingestion state.

    Calls ``self.client.delete`` on the ``"rag_documents"`` collection with
    the ids accumulated in ``self.doc_ids``, then resets ``pdf_count``,
    ``text_chunks``, ``doc_ids`` and ``nodes`` to their empty values.
    """
    # Fixed: the original read ``self.self.client`` (AttributeError) and a
    # bare ``doc_ids`` (NameError -- class attributes are not in scope as
    # plain names inside a method).
    # NOTE(review): ``ingest`` fills ``doc_ids`` with one document index per
    # chunk; confirm these are the ids the store actually uses and that the
    # client's ``delete`` accepts an ``ids`` keyword.
    self.client.delete(collection_name="rag_documents", ids=self.doc_ids)

    # Rebind rather than mutate in place, so the list just handed to the
    # client is left untouched and the instance gets its own fresh lists
    # instead of sharing the class-level ones.
    self.pdf_count = 0
    self.text_chunks = []
    self.doc_ids = []
    self.nodes = []
|