kltn20133118 committed
Commit b2cbb03 · verified · 1 Parent(s): 73604f6

Update function/chatbot.py

Files changed (1): function/chatbot.py +19 -19
function/chatbot.py CHANGED
@@ -116,7 +116,7 @@ def text_preprocessing(text):
     text = ' '.join(words)
     return text
 def find_matching_files_in_docs_12_id(text, id):
-    folder_path = f"/code/user_file/{id}"
+    folder_path = f"/code/temp/{id}"
     search_terms = []
     search_terms_old = []
     matching_index = []
@@ -169,7 +169,7 @@ def save_list_CSV_id(file_list, id):
     text = ""
     for x in file_list:
         if x.endswith('.xlsx'):
-            old = f"/code/user_file/{id}/{x}"
+            old = f"/code/temp/{id}/{x}"
             new = old.replace(".xlsx", ".csv")
             convert_xlsx_to_csv(old, new)
             x = x.replace(".xlsx", ".csv")
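
Note: convert_xlsx_to_csv is defined elsewhere in chatbot.py and is not shown in this diff. A minimal sketch of what such a helper usually looks like, assuming pandas (with openpyxl available for .xlsx parsing); the function name matches the call above, the body is an assumption:

import pandas as pd

def convert_xlsx_to_csv(xlsx_path, csv_path):
    # Read the workbook and re-serialize it as CSV; index=False keeps
    # the pandas row index out of the output file.
    df = pd.read_excel(xlsx_path)  # assumption: single-sheet workbooks
    df.to_csv(csv_path, index=False)

Since x is rebound to the .csv name right after the conversion, the rest of save_list_CSV_id only ever deals with CSV files.
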
@@ -279,7 +279,7 @@ def question_answer(question):
     return answer
 
 def check_persist_directory(id, file_name):
-    directory_path = f"/code/vector_database/{id}/{file_name}"
+    directory_path = f"/code/temp/vector_database/{id}/{file_name}"
     return os.path.exists(directory_path)
 
 from langchain_community.vectorstores import FAISS
@@ -303,11 +303,11 @@ def aws1_all_id(new_dict, text_alls, id, thread_id):
     texts_data = text_splitter.split_text(data)
 
     if check_persist_directory(id, file_name):
-        vectordb_query = Chroma(persist_directory=f"./vector_database/{id}/{file_name}", embedding_function=embeddings)
+        vectordb_query = Chroma(persist_directory=f"/code/temp/vector_database/{id}/{file_name}", embedding_function=embeddings)
     else:
         vectordb_query = Chroma.from_texts(texts_data,
                                            embedding=embeddings,
-                                           persist_directory=f"./vector_database/{id}/{file_name}")
+                                           persist_directory=f"/code/temp/vector_database/{id}/{file_name}")
 
     k_1 = len(texts_data)
     retriever = vectordb_query.as_retriever(search_kwargs={f"k": k_1})
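
Note: this hunk fixes a real inconsistency, not just a rename. Before the commit, check_persist_directory tested /code/vector_database/{id}/{file_name} while aws1_all_id persisted the store under the relative path ./vector_database/{id}/{file_name}, so the guard and the data could point at different places depending on the working directory. Both now use /code/temp/vector_database. The load-or-build idiom, condensed (a sketch; load_or_build_chroma is a hypothetical name, embeddings is whatever Embeddings instance the file constructs):

import os
from langchain_community.vectorstores import Chroma

def load_or_build_chroma(texts_data, embeddings, persist_dir):
    if os.path.exists(persist_dir):
        # Reopen the collection persisted by an earlier call.
        return Chroma(persist_directory=persist_dir, embedding_function=embeddings)
    # First run for this file: embed the chunks and persist them to disk.
    return Chroma.from_texts(texts_data, embedding=embeddings,
                             persist_directory=persist_dir)
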
@@ -317,12 +317,12 @@ def aws1_all_id(new_dict, text_alls, id, thread_id):
                                            weights=[0.6, 0.4])
     docs = ensemble_retriever.get_relevant_documents(f"{query}")
 
-    path = f"/code/vector_database/FAISS/{id}/{file_name}"
+    path = f"/code/temp/vector_database/FAISS/{id}/{file_name}"
     if check_path_exists(path):
         docsearch = FAISS.load_local(path, embeddings, allow_dangerous_deserialization=True)
     else:
         docsearch = FAISS.from_documents(docs, embeddings)
-        docsearch.save_local(f"/code/vector_database/FAISS/{id}/{file_name}")
+        docsearch.save_local(f"/code/temp/vector_database/FAISS/{id}/{file_name}")
         docsearch = FAISS.load_local(path, embeddings, allow_dangerous_deserialization=True)
 
     k_2 = len(docs)
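
Note: the FAISS cache follows the same shape. save_local writes the index together with a pickled docstore, which is why load_local must pass allow_dangerous_deserialization=True; the flag is acceptable here only because the app reloads indexes it wrote itself. A sketch with a hypothetical helper name (the load_local immediately after save_local in the hunk just round-trips through disk; returning the in-memory index is equivalent):

import os
from langchain_community.vectorstores import FAISS

def load_or_build_faiss(docs, embeddings, path):
    if os.path.exists(path):
        return FAISS.load_local(path, embeddings,
                                allow_dangerous_deserialization=True)
    docsearch = FAISS.from_documents(docs, embeddings)
    docsearch.save_local(path)  # writes the index and its pickled docstore under path
    return docsearch
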
@@ -443,7 +443,7 @@ def load_file(loader):
 
 def extract_data2(id):
     documents = []
-    directory_path = f"/code/user_file/{id}"
+    directory_path = f"/code/temp/{id}"
     if not os.path.exists(directory_path) or not any(
             os.path.isfile(os.path.join(directory_path, f)) for f in os.listdir(directory_path)):
         return False
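
Note: extract_data2 here and generate_question in the next hunk share the same guard: return False when the per-user directory is missing or contains no regular files. An equivalent formulation with pathlib, as a sketch only (has_user_files is a hypothetical name; the file itself uses os.path):

from pathlib import Path

def has_user_files(directory_path):
    p = Path(directory_path)
    # True only when the directory exists and holds at least one regular file.
    return p.is_dir() and any(child.is_file() for child in p.iterdir())
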
@@ -490,11 +490,11 @@ def extract_data2(id):
     texts = text_splitter.split_documents(documents)
     Chroma.from_documents(documents=texts,
                           embedding=embeddings,
-                          persist_directory=f"/code/vector_database/{id}")
+                          persist_directory=f"/code/temp/vector_database/{id}")
     return texts
 
 def generate_question(id):
-    directory_path = f"/code/user_file/{id}"
+    directory_path = f"/code/temp/{id}"
     if not os.path.exists(directory_path) or not any(
             os.path.isfile(os.path.join(directory_path, f)) for f in os.listdir(directory_path)):
         return False
@@ -593,19 +593,19 @@ def handle_query(question, text_all, compression_retriever, id, thread_id):
     print(relevance_score_float)
     if relevance_score_float <= 0.12:
         documents1 = []
-        for file in os.listdir(f"/code/user_file/{id}"):
+        for file in os.listdir(f"/code/temp/{id}"):
             if file.endswith('.csv'):
-                csv_path = f"/code/user_file/{id}/" + file
+                csv_path = f"/code/temp/{id}/" + file
                 loader = UnstructuredCSVLoader(csv_path)
                 documents1.extend(loader.load())
             elif file.endswith('.xlsx'):
-                excel_path = f"/code/user_file/{id}/" + file
+                excel_path = f"/code/temp/{id}/" + file
                 loader = UnstructuredExcelLoader(excel_path)
                 documents1.extend(loader.load())
         text_splitter_csv = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=2200, chunk_overlap=1500)
         texts_csv = text_splitter_csv.split_documents(documents1)
         vectordb_csv = Chroma.from_documents(documents=texts_csv,
-                                             embedding=embeddings, persist_directory=f'/code/vector_database/csv/{thread_id}')
+                                             embedding=embeddings, persist_directory=f'/code/temp/vector_database/csv/{thread_id}')
         k = len(texts_csv)
         retriever_csv = vectordb_csv.as_retriever(search_kwargs={"k": k})
         llm = Cohere(temperature=0)
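
Note: this branch is the spreadsheet fallback: when the rerank relevance score comes in at 0.12 or below, the question is re-answered from the user's CSV/XLSX files alone. A self-contained sketch of the load-and-split step (folder stands in for the per-user directory; load_and_split_spreadsheets is a hypothetical name):

import os
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import UnstructuredCSVLoader, UnstructuredExcelLoader

def load_and_split_spreadsheets(folder):
    documents = []
    for file in os.listdir(folder):
        path = os.path.join(folder, file)
        if file.endswith('.csv'):
            documents.extend(UnstructuredCSVLoader(path).load())
        elif file.endswith('.xlsx'):
            documents.extend(UnstructuredExcelLoader(path).load())
    # chunk_overlap=1500 against chunk_size=2200 tokens means consecutive chunks
    # share most of their text: heavy overlap that favours recall over index size.
    splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=2200,
                                                           chunk_overlap=1500)
    return splitter.split_documents(documents)
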
@@ -660,12 +660,12 @@ def handle_query(question, text_all, compression_retriever, id, thread_id):
     k_1 = len(texts)
     file_name = os.path.basename(file_path)
     if check_persist_directory(id, file_name):
-        vectordb_file = Chroma(persist_directory=f"/code/vector_database/{id}/{file_name}",
+        vectordb_file = Chroma(persist_directory=f"/code/temp/vector_database/{id}/{file_name}",
                                embedding_function=embeddings)
     else:
         vectordb_file = Chroma.from_documents(texts,
                                               embedding=embeddings,
-                                              persist_directory=f"/code/vector_database/{id}/{file_name}")
+                                              persist_directory=f"/code/temp/vector_database/{id}/{file_name}")
     retriever_file = vectordb_file.as_retriever(search_kwargs={f"k": k_1})
     bm25_retriever = BM25Retriever.from_documents(texts)
     bm25_retriever.k = k_1
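
Note: the pieces assembled here feed a hybrid retriever: the dense Chroma retriever is fused with sparse BM25 at weights 0.6/0.4 (the weights line is visible as context in the next hunk). A sketch of the full wiring under that assumption (hybrid_retriever is a hypothetical name):

from langchain.retrievers import EnsembleRetriever
from langchain_community.retrievers import BM25Retriever

def hybrid_retriever(vectordb_file, texts):
    k_1 = len(texts)
    dense = vectordb_file.as_retriever(search_kwargs={"k": k_1})
    sparse = BM25Retriever.from_documents(texts)
    sparse.k = k_1
    # Fuse dense and sparse rankings, weighting the dense retriever higher.
    return EnsembleRetriever(retrievers=[dense, sparse], weights=[0.6, 0.4])
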
@@ -673,12 +673,12 @@ def handle_query(question, text_all, compression_retriever, id, thread_id):
                                            weights=[0.6, 0.4])
     docs = ensemble_retriever.get_relevant_documents(f"{query}")
 
-    path = f"/code/vector_database/FAISS/{id}/{file_name}"
+    path = f"/code/temp/vector_database/FAISS/{id}/{file_name}"
     if check_path_exists(path):
         docsearch = FAISS.load_local(path, embeddings, allow_dangerous_deserialization=True)
     else:
         docsearch = FAISS.from_documents(docs, embeddings)
-        docsearch.save_local(f"/code/vector_database/FAISS/{id}/{file_name}")
+        docsearch.save_local(f"/code/temp/vector_database/FAISS/{id}/{file_name}")
         docsearch = FAISS.load_local(path, embeddings, allow_dangerous_deserialization=True)
     k_2 = len(docs)
     retrieve3 = docsearch.as_retriever(search_kwargs={f"k": k_2})
@@ -701,7 +701,7 @@ def handle_query_upgrade_keyword_old(query_all, text_all, id,chat_history):
     test = query_analyzer(query_all)
     test_string = str(test)
     matches = re.findall(r"'([^']*)'", test_string)
-    vectordb = Chroma(persist_directory=f"/code/vector_database/{id}", embedding_function=embeddings)
+    vectordb = Chroma(persist_directory=f"/code/temp/vector_database/{id}", embedding_function=embeddings)
     k = len(text_all)
     retriever = vectordb.as_retriever(search_kwargs={"k": k})
     compressor = CohereRerank(top_n=5, model="rerank-english-v2.0")
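
Note: the CohereRerank compressor built on the last context line is normally wrapped in a ContextualCompressionRetriever; that step falls outside this hunk, so the sketch below assumes it. The import path also varies with the langchain version (older releases expose CohereRerank under langchain.retrievers.document_compressors, newer ones move it to the langchain_cohere package):

from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CohereRerank

def reranked_retriever(vectordb, text_all):
    base = vectordb.as_retriever(search_kwargs={"k": len(text_all)})
    # Rerank everything the wide retriever returns and keep only the top 5.
    compressor = CohereRerank(top_n=5, model="rerank-english-v2.0")
    return ContextualCompressionRetriever(base_compressor=compressor,
                                          base_retriever=base)
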
 