Update function/chatbot.py

function/chatbot.py  CHANGED  (+19 -19)
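All 19 changed lines move hard-coded storage paths from the /code/ prefix to /code/temp/: the per-id upload folders, the per-file Chroma persist directories, and the FAISS index directories.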
--- a/function/chatbot.py
+++ b/function/chatbot.py
@@ -116,7 +116,7 @@ def text_preprocessing(text):
     text = ' '.join(words)
     return text
 def find_matching_files_in_docs_12_id(text, id):
-    folder_path = f"/code/{id}"
+    folder_path = f"/code/temp/{id}"
     search_terms = []
     search_terms_old = []
     matching_index = []
@@ -169,7 +169,7 @@ def save_list_CSV_id(file_list, id):
     text = ""
     for x in file_list:
         if x.endswith('.xlsx'):
-            old = f"/code/{id}/{x}"
+            old = f"/code/temp/{id}/{x}"
             new = old.replace(".xlsx", ".csv")
             convert_xlsx_to_csv(old, new)
             x = x.replace(".xlsx", ".csv")
@@ -279,7 +279,7 @@ def question_answer(question):
     return answer

 def check_persist_directory(id, file_name):
-    directory_path = f"/code/vector_database/{id}/{file_name}"
+    directory_path = f"/code/temp/vector_database/{id}/{file_name}"
     return os.path.exists(directory_path)

 from langchain_community.vectorstores import FAISS
@@ -303,11 +303,11 @@ def aws1_all_id(new_dict, text_alls, id, thread_id):
         texts_data = text_splitter.split_text(data)

         if check_persist_directory(id, file_name):
-            vectordb_query = Chroma(persist_directory=f"/code/vector_database/{id}/{file_name}", embedding_function=embeddings)
+            vectordb_query = Chroma(persist_directory=f"/code/temp/vector_database/{id}/{file_name}", embedding_function=embeddings)
         else:
             vectordb_query = Chroma.from_texts(texts_data,
                                                embedding=embeddings,
-                                               persist_directory=f"/code/vector_database/{id}/{file_name}")
+                                               persist_directory=f"/code/temp/vector_database/{id}/{file_name}")

         k_1 = len(texts_data)
         retriever = vectordb_query.as_retriever(search_kwargs={f"k": k_1})
@@ -317,12 +317,12 @@ def aws1_all_id(new_dict, text_alls, id, thread_id):
                                                weights=[0.6, 0.4])
         docs = ensemble_retriever.get_relevant_documents(f"{query}")

-        path = f"/code/vector_database/FAISS/{id}/{file_name}"
+        path = f"/code/temp/vector_database/FAISS/{id}/{file_name}"
         if check_path_exists(path):
             docsearch = FAISS.load_local(path, embeddings, allow_dangerous_deserialization=True)
         else:
             docsearch = FAISS.from_documents(docs, embeddings)
-            docsearch.save_local(f"/code/vector_database/FAISS/{id}/{file_name}")
+            docsearch.save_local(f"/code/temp/vector_database/FAISS/{id}/{file_name}")
             docsearch = FAISS.load_local(path, embeddings, allow_dangerous_deserialization=True)

         k_2 = len(docs)
@@ -443,7 +443,7 @@ def load_file(loader):

 def extract_data2(id):
     documents = []
-    directory_path = f"/code/{id}"
+    directory_path = f"/code/temp/{id}"
     if not os.path.exists(directory_path) or not any(
             os.path.isfile(os.path.join(directory_path, f)) for f in os.listdir(directory_path)):
         return False
@@ -490,11 +490,11 @@ def extract_data2(id):
     texts = text_splitter.split_documents(documents)
     Chroma.from_documents(documents=texts,
                           embedding=embeddings,
-                          persist_directory=f"/code/vector_database/{id}")
+                          persist_directory=f"/code/temp/vector_database/{id}")
     return texts

 def generate_question(id):
-    directory_path = f"/code/{id}"
+    directory_path = f"/code/temp/{id}"
     if not os.path.exists(directory_path) or not any(
             os.path.isfile(os.path.join(directory_path, f)) for f in os.listdir(directory_path)):
         return False
@@ -593,19 +593,19 @@ def handle_query(question, text_all, compression_retriever, id, thread_id):
     print(relevance_score_float)
     if relevance_score_float <= 0.12:
         documents1 = []
-        for file in os.listdir(f"/code/{id}"):
+        for file in os.listdir(f"/code/temp/{id}"):
             if file.endswith('.csv'):
-                csv_path = f"/code/{id}/" + file
+                csv_path = f"/code/temp/{id}/" + file
                 loader = UnstructuredCSVLoader(csv_path)
                 documents1.extend(loader.load())
             elif file.endswith('.xlsx'):
-                excel_path = f"/code/{id}/" + file
+                excel_path = f"/code/temp/{id}/" + file
                 loader = UnstructuredExcelLoader(excel_path)
                 documents1.extend(loader.load())
         text_splitter_csv = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=2200, chunk_overlap=1500)
         texts_csv = text_splitter_csv.split_documents(documents1)
         vectordb_csv = Chroma.from_documents(documents=texts_csv,
-                                             embedding=embeddings, persist_directory=f'/code/vector_database/csv/{thread_id}')
+                                             embedding=embeddings, persist_directory=f'/code/temp/vector_database/csv/{thread_id}')
         k = len(texts_csv)
         retriever_csv = vectordb_csv.as_retriever(search_kwargs={"k": k})
         llm = Cohere(temperature=0)
@@ -660,12 +660,12 @@ def handle_query(question, text_all, compression_retriever, id, thread_id):
         k_1 = len(texts)
         file_name = os.path.basename(file_path)
         if check_persist_directory(id, file_name):
-            vectordb_file = Chroma(persist_directory=f"/code/vector_database/{id}/{file_name}",
+            vectordb_file = Chroma(persist_directory=f"/code/temp/vector_database/{id}/{file_name}",
                                    embedding_function=embeddings)
         else:
             vectordb_file = Chroma.from_documents(texts,
                                                   embedding=embeddings,
-                                                  persist_directory=f"/code/vector_database/{id}/{file_name}")
+                                                  persist_directory=f"/code/temp/vector_database/{id}/{file_name}")
         retriever_file = vectordb_file.as_retriever(search_kwargs={f"k": k_1})
         bm25_retriever = BM25Retriever.from_documents(texts)
         bm25_retriever.k = k_1
@@ -673,12 +673,12 @@ def handle_query(question, text_all, compression_retriever, id, thread_id):
                                                weights=[0.6, 0.4])
         docs = ensemble_retriever.get_relevant_documents(f"{query}")

-        path = f"/code/vector_database/FAISS/{id}/{file_name}"
+        path = f"/code/temp/vector_database/FAISS/{id}/{file_name}"
         if check_path_exists(path):
             docsearch = FAISS.load_local(path, embeddings, allow_dangerous_deserialization=True)
         else:
             docsearch = FAISS.from_documents(docs, embeddings)
-            docsearch.save_local(f"/code/vector_database/FAISS/{id}/{file_name}")
+            docsearch.save_local(f"/code/temp/vector_database/FAISS/{id}/{file_name}")
             docsearch = FAISS.load_local(path, embeddings, allow_dangerous_deserialization=True)
         k_2 = len(docs)
         retrieve3 = docsearch.as_retriever(search_kwargs={f"k": k_2})
@@ -701,7 +701,7 @@ def handle_query_upgrade_keyword_old(query_all, text_all, id,chat_history):
     test = query_analyzer(query_all)
     test_string = str(test)
     matches = re.findall(r"'([^']*)'", test_string)
-    vectordb = Chroma(persist_directory=f"/code/vector_database/{id}", embedding_function=embeddings)
+    vectordb = Chroma(persist_directory=f"/code/temp/vector_database/{id}", embedding_function=embeddings)
     k = len(text_all)
     retriever = vectordb.as_retriever(search_kwargs={"k": k})
     compressor = CohereRerank(top_n=5, model="rerank-english-v2.0")
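The same /code/temp root now appears in 19 separate f-strings. A follow-up refactor could centralize it; the sketch below is only an illustration, not part of this commit: BASE_DIR, docs_dir, chroma_dir, faiss_dir, and the CHATBOT_DATA_DIR environment variable are all hypothetical names.

# Hypothetical helper module (not in the commit): one place to change
# the storage root instead of nineteen scattered f-strings.
import os

# Assumed override hook; falls back to the root this commit introduces.
BASE_DIR = os.environ.get("CHATBOT_DATA_DIR", "/code/temp")

def docs_dir(id):
    # Per-id upload folder, e.g. /code/temp/<id>
    return os.path.join(BASE_DIR, str(id))

def chroma_dir(id, file_name):
    # Chroma persist directory for one (id, file) pair
    return os.path.join(BASE_DIR, "vector_database", str(id), file_name)

def faiss_dir(id, file_name):
    # FAISS index directory for one (id, file) pair
    return os.path.join(BASE_DIR, "vector_database", "FAISS", str(id), file_name)

With helpers like these, check_persist_directory(id, file_name) would reduce to os.path.exists(chroma_dir(id, file_name)), and the next relocation would be a one-line change or an environment variable.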