danicafisher committed on
Commit
43888b6
·
verified ·
1 Parent(s): 4d4cc98

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +54 -20
app.py CHANGED
@@ -11,6 +11,10 @@ from langchain_core.prompts import PromptTemplate
11
  from langchain.schema.output_parser import StrOutputParser
12
  from langchain.schema.runnable import RunnablePassthrough
13
  from langchain.schema.runnable.config import RunnableConfig
 
 
 
 
14
 
15
  # GLOBAL SCOPE - ENTIRE APPLICATION HAS ACCESS TO VALUES SET IN THIS SCOPE #
16
  # ---- ENV VARIABLES ---- #
@@ -54,27 +58,57 @@ hf_embeddings = HuggingFaceEndpointEmbeddings(
54
  huggingfacehub_api_token=HF_TOKEN,
55
  )
56
 
57
# Load the FAISS index from disk when one exists; otherwise build it from
# `split_documents` in batches and persist it for later runs.
if not os.path.exists("./data/vectorstore"):
    print("Indexing Files")
    os.makedirs("./data/vectorstore", exist_ok=True)
    ### 4. INDEX FILES
    ### NOTE: REMEMBER TO BATCH THE DOCUMENTS WITH MAXIMUM BATCH SIZE = 32
    for start in range(0, len(split_documents), 32):
        chunk = split_documents[start:start + 32]
        if start:
            # Later batches are appended to the store created by the first one.
            vectorstore.add_documents(chunk)
        else:
            # The first batch creates the store.
            vectorstore = FAISS.from_documents(chunk, hf_embeddings)
    vectorstore.save_local("./data/vectorstore")
else:
    vectorstore = FAISS.load_local(
        "./data/vectorstore",
        hf_embeddings,
        # Required because the saved index includes a `.pkl` file; only load
        # indexes that this application wrote itself.
        allow_dangerous_deserialization=True,
    )
    hf_retriever = vectorstore.as_retriever()
    print("Loaded Vectorstore")

hf_retriever = vectorstore.as_retriever()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
 
79
  # -- AUGMENTED -- #
80
  """
 
11
  from langchain.schema.output_parser import StrOutputParser
12
  from langchain.schema.runnable import RunnablePassthrough
13
  from langchain.schema.runnable.config import RunnableConfig
14
+ from tqdm.asyncio import tqdm_asyncio
15
+ import asyncio
16
+ from tqdm.asyncio import tqdm
17
+
18
 
19
  # GLOBAL SCOPE - ENTIRE APPLICATION HAS ACCESS TO VALUES SET IN THIS SCOPE #
20
  # ---- ENV VARIABLES ---- #
 
58
  huggingfacehub_api_token=HF_TOKEN,
59
  )
60
 
61
async def add_documents_async(vectorstore, documents):
    """Add ``documents`` to an existing vector store without blocking.

    Args:
        vectorstore: Object exposing an awaitable ``aadd_documents`` method.
        documents: The batch of documents to add.

    Returns:
        Whatever ``vectorstore.aadd_documents`` returns. For LangChain vector
        stores this is presumably the list of inserted document IDs (verify
        against the installed LangChain version).
    """
    # Propagate the result instead of discarding it so callers can inspect it.
    return await vectorstore.aadd_documents(documents)
63
+
64
async def process_batch(vectorstore, batch, is_first_batch, pbar):
    """Index one batch of documents and advance its progress bar.

    Args:
        vectorstore: Existing store to append to; ignored for the first batch.
        batch: The documents to index.
        is_first_batch: When true, a new FAISS store is built from ``batch``
            using the module-level ``hf_embeddings``.
        pbar: Progress bar advanced by ``len(batch)`` once indexing succeeds.

    Returns:
        The newly created store for the first batch, otherwise ``vectorstore``.
    """
    if not is_first_batch:
        await add_documents_async(vectorstore, batch)
        store = vectorstore
    else:
        store = await FAISS.afrom_documents(batch, hf_embeddings)

    pbar.update(len(batch))
    return store
72
+
73
async def main():
    """Build the FAISS vector store from ``split_documents`` and return a retriever.

    The documents are cut into batches of 32. The first batch is awaited on its
    own because it creates the store; the remaining batches are then appended
    concurrently with ``asyncio.gather``. One progress bar is shown per batch.

    Returns:
        The retriever produced by ``vectorstore.as_retriever()``.

    Raises:
        ValueError: If ``split_documents`` is empty, so no store can be built.
    """
    print("Indexing Files")

    batch_size = 32
    batches = [
        split_documents[i:i + batch_size]
        for i in range(0, len(split_documents), batch_size)
    ]
    if not batches:
        # Without this guard the store stays None and the failure surfaces as
        # an opaque AttributeError on `.as_retriever()`.
        raise ValueError("No documents to index: split_documents is empty.")

    vectorstore = None
    pending = []
    pbars = []
    try:
        for i, batch in enumerate(batches):
            pbar = tqdm(total=len(batch), desc=f"Batch {i+1}/{len(batches)}", position=i)
            pbars.append(pbar)

            if i == 0:
                # The store must exist before any other batch can be added.
                vectorstore = await process_batch(None, batch, True, pbar)
            else:
                pending.append(process_batch(vectorstore, batch, False, pbar))

        if pending:
            await asyncio.gather(*pending)
    finally:
        # Close every bar even when a batch fails, so the terminal is left clean.
        for pbar in pbars:
            pbar.close()

    hf_retriever = vectorstore.as_retriever()
    print("\nIndexing complete. Vectorstore is ready for use.")
    return hf_retriever
106
+
107
async def run():
    """Await ``main()`` and hand back the retriever it builds."""
    return await main()


# Indexing happens at import time: asyncio.run blocks until the retriever is ready.
hf_retriever = asyncio.run(run())
112
 
113
  # -- AUGMENTED -- #
114
  """