Update function/chatbot.py

function/chatbot.py  CHANGED  (+19 -19)
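All 19 changed lines move hard-coded storage paths from the /code/ prefix to /code/temp/: the per-id upload folders, the per-file Chroma persist directories, and the FAISS index directories.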
--- a/function/chatbot.py
+++ b/function/chatbot.py
@@ -116,7 +116,7 @@ def text_preprocessing(text):
     text = ' '.join(words)
     return text
 def find_matching_files_in_docs_12_id(text, id):
-    folder_path = f"/code/{id}"
+    folder_path = f"/code/temp/{id}"
     search_terms = []
     search_terms_old = []
     matching_index = []
@@ -169,7 +169,7 @@ def save_list_CSV_id(file_list, id):
     text = ""
     for x in file_list:
         if x.endswith('.xlsx'):
-            old = f"/code/{id}/{x}"
+            old = f"/code/temp/{id}/{x}"
             new = old.replace(".xlsx", ".csv")
             convert_xlsx_to_csv(old, new)
             x = x.replace(".xlsx", ".csv")
@@ -279,7 +279,7 @@ def question_answer(question):
     return answer

 def check_persist_directory(id, file_name):
-    directory_path = f"/code/vector_database/{id}/{file_name}"
+    directory_path = f"/code/temp/vector_database/{id}/{file_name}"
     return os.path.exists(directory_path)

 from langchain_community.vectorstores import FAISS
@@ -303,11 +303,11 @@ def aws1_all_id(new_dict, text_alls, id, thread_id):
         texts_data = text_splitter.split_text(data)

         if check_persist_directory(id, file_name):
-            vectordb_query = Chroma(persist_directory=f"/code/vector_database/{id}/{file_name}", embedding_function=embeddings)
+            vectordb_query = Chroma(persist_directory=f"/code/temp/vector_database/{id}/{file_name}", embedding_function=embeddings)
         else:
             vectordb_query = Chroma.from_texts(texts_data,
                                                embedding=embeddings,
-                                               persist_directory=f"/code/vector_database/{id}/{file_name}")
+                                               persist_directory=f"/code/temp/vector_database/{id}/{file_name}")

         k_1 = len(texts_data)
         retriever = vectordb_query.as_retriever(search_kwargs={f"k": k_1})
@@ -317,12 +317,12 @@ def aws1_all_id(new_dict, text_alls, id, thread_id):
                                                weights=[0.6, 0.4])
         docs = ensemble_retriever.get_relevant_documents(f"{query}")

-        path = f"/code/vector_database/FAISS/{id}/{file_name}"
+        path = f"/code/temp/vector_database/FAISS/{id}/{file_name}"
         if check_path_exists(path):
             docsearch = FAISS.load_local(path, embeddings, allow_dangerous_deserialization=True)
         else:
             docsearch = FAISS.from_documents(docs, embeddings)
-            docsearch.save_local(f"/code/vector_database/FAISS/{id}/{file_name}")
+            docsearch.save_local(f"/code/temp/vector_database/FAISS/{id}/{file_name}")
             docsearch = FAISS.load_local(path, embeddings, allow_dangerous_deserialization=True)

         k_2 = len(docs)
@@ -443,7 +443,7 @@ def load_file(loader):

 def extract_data2(id):
     documents = []
-    directory_path = f"/code/{id}"
+    directory_path = f"/code/temp/{id}"
     if not os.path.exists(directory_path) or not any(
             os.path.isfile(os.path.join(directory_path, f)) for f in os.listdir(directory_path)):
         return False
@@ -490,11 +490,11 @@ def extract_data2(id):
     texts = text_splitter.split_documents(documents)
     Chroma.from_documents(documents=texts,
                           embedding=embeddings,
-                          persist_directory=f"/code/vector_database/{id}")
+                          persist_directory=f"/code/temp/vector_database/{id}")
     return texts

 def generate_question(id):
-    directory_path = f"/code/{id}"
+    directory_path = f"/code/temp/{id}"
     if not os.path.exists(directory_path) or not any(
             os.path.isfile(os.path.join(directory_path, f)) for f in os.listdir(directory_path)):
         return False
@@ -593,19 +593,19 @@ def handle_query(question, text_all, compression_retriever, id, thread_id):
     print(relevance_score_float)
     if relevance_score_float <= 0.12:
         documents1 = []
-        for file in os.listdir(f"/code/{id}"):
+        for file in os.listdir(f"/code/temp/{id}"):
             if file.endswith('.csv'):
-                csv_path = f"/code/{id}/" + file
+                csv_path = f"/code/temp/{id}/" + file
                 loader = UnstructuredCSVLoader(csv_path)
                 documents1.extend(loader.load())
             elif file.endswith('.xlsx'):
-                excel_path = f"/code/{id}/" + file
+                excel_path = f"/code/temp/{id}/" + file
                 loader = UnstructuredExcelLoader(excel_path)
                 documents1.extend(loader.load())
         text_splitter_csv = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=2200, chunk_overlap=1500)
         texts_csv = text_splitter_csv.split_documents(documents1)
         vectordb_csv = Chroma.from_documents(documents=texts_csv,
-                                             embedding=embeddings, persist_directory=f'/code/vector_database/csv/{thread_id}')
+                                             embedding=embeddings, persist_directory=f'/code/temp/vector_database/csv/{thread_id}')
         k = len(texts_csv)
         retriever_csv = vectordb_csv.as_retriever(search_kwargs={"k": k})
         llm = Cohere(temperature=0)
@@ -660,12 +660,12 @@ def handle_query(question, text_all, compression_retriever, id, thread_id):
         k_1 = len(texts)
         file_name = os.path.basename(file_path)
         if check_persist_directory(id, file_name):
-            vectordb_file = Chroma(persist_directory=f"/code/vector_database/{id}/{file_name}",
+            vectordb_file = Chroma(persist_directory=f"/code/temp/vector_database/{id}/{file_name}",
                                    embedding_function=embeddings)
         else:
             vectordb_file = Chroma.from_documents(texts,
                                                   embedding=embeddings,
-                                                  persist_directory=f"/code/vector_database/{id}/{file_name}")
+                                                  persist_directory=f"/code/temp/vector_database/{id}/{file_name}")
         retriever_file = vectordb_file.as_retriever(search_kwargs={f"k": k_1})
         bm25_retriever = BM25Retriever.from_documents(texts)
         bm25_retriever.k = k_1
@@ -673,12 +673,12 @@ def handle_query(question, text_all, compression_retriever, id, thread_id):
                                                weights=[0.6, 0.4])
         docs = ensemble_retriever.get_relevant_documents(f"{query}")

-        path = f"/code/vector_database/FAISS/{id}/{file_name}"
+        path = f"/code/temp/vector_database/FAISS/{id}/{file_name}"
         if check_path_exists(path):
             docsearch = FAISS.load_local(path, embeddings, allow_dangerous_deserialization=True)
         else:
             docsearch = FAISS.from_documents(docs, embeddings)
-            docsearch.save_local(f"/code/vector_database/FAISS/{id}/{file_name}")
+            docsearch.save_local(f"/code/temp/vector_database/FAISS/{id}/{file_name}")
             docsearch = FAISS.load_local(path, embeddings, allow_dangerous_deserialization=True)
         k_2 = len(docs)
         retrieve3 = docsearch.as_retriever(search_kwargs={f"k": k_2})
@@ -701,7 +701,7 @@ def handle_query_upgrade_keyword_old(query_all, text_all, id,chat_history):
     test = query_analyzer(query_all)
     test_string = str(test)
     matches = re.findall(r"'([^']*)'", test_string)
-    vectordb = Chroma(persist_directory=f"/code/vector_database/{id}", embedding_function=embeddings)
+    vectordb = Chroma(persist_directory=f"/code/temp/vector_database/{id}", embedding_function=embeddings)
     k = len(text_all)
     retriever = vectordb.as_retriever(search_kwargs={"k": k})
     compressor = CohereRerank(top_n=5, model="rerank-english-v2.0")
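The same /code/temp root now appears in 19 separate f-strings. A follow-up refactor could centralize it; the sketch below is only an illustration, not part of this commit: BASE_DIR, docs_dir, chroma_dir, faiss_dir, and the CHATBOT_DATA_DIR environment variable are all hypothetical names.

# Hypothetical helper module (not in the commit): one place to change
# the storage root instead of nineteen scattered f-strings.
import os

# Assumed override hook; falls back to the root this commit introduces.
BASE_DIR = os.environ.get("CHATBOT_DATA_DIR", "/code/temp")

def docs_dir(id):
    # Per-id upload folder, e.g. /code/temp/<id>
    return os.path.join(BASE_DIR, str(id))

def chroma_dir(id, file_name):
    # Chroma persist directory for one (id, file) pair
    return os.path.join(BASE_DIR, "vector_database", str(id), file_name)

def faiss_dir(id, file_name):
    # FAISS index directory for one (id, file) pair
    return os.path.join(BASE_DIR, "vector_database", "FAISS", str(id), file_name)

With helpers like these, check_persist_directory(id, file_name) would reduce to os.path.exists(chroma_dir(id, file_name)), and the next relocation would be a one-line change or an environment variable.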