import json
import logging
import os

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

import chromadb
from chromadb.config import Settings
from chromadb.utils import embedding_functions
from InstructorEmbedding import INSTRUCTOR

'''
If there is a transformers install error:
    pip install transformers==4.29.2
Python 3.8 and above will need to upgrade transformers to 4.2x.xx
    https://github.com/huggingface/transformers/issues/11799

The goal is to create a domain knowledge database based on existing transcribed
labels. I modify the domain knowledge (an xlsx file) so that each row is
embedded in a way that most closely resembles the raw OCR output, since that
is what will be used to query against the db. Once the closest row is found,
I use the id to go back to the xlsx and take the whole row, converting it into
a dictionary similar to the desired output from the LLM. This dict is then
added to the prompt as a hint for the LLM.
'''

'''
pip uninstall protobuf
pip install protobuf==3.19.5
'''

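# --- Hedged sketch (illustration only, not part of the pipeline) -------------
# Shows the row-flattening idea described above: a spreadsheet row is joined
# into a single OCR-like text blob before embedding, mirroring what
# VoucherVisionEmbedding.create_db_from_xlsx() does below. The column names
# and values here are hypothetical.
def _example_row_to_document():
    row = pd.Series({
        'Catalog Number': '3539788',
        'Genus': 'Brookea',
        'Species': 'tomentosa',
        'Country': 'Malaysia',
    })
    # Same join used by create_db_from_xlsx(): id column included, order kept
    return ' '.join(row.astype(str))  # -> '3539788 Brookea tomentosa Malaysia'
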
class VoucherVisionEmbedding:
    def __init__(self, db_name, path_domain_knowledge, logger, build_new_db=False, model_name="hkunlp/instructor-xl", device="cuda"):
        DB_DIR = os.path.join(os.path.dirname(__file__), db_name)
        self.logger = logger
        self.path_domain_knowledge = path_domain_knowledge

        self.client = chromadb.PersistentClient(path=DB_DIR, settings=Settings(anonymized_telemetry=False))
        ef = embedding_functions.InstructorEmbeddingFunction(model_name=model_name, device=device)

        # Load the domain knowledge as all-string data so every cell can be
        # joined into an OCR-like text blob and looked back up by id later.
        self.domain_knowledge = pd.read_excel(path_domain_knowledge).fillna('').astype(str)

        if build_new_db:
            self.logger.info(f"Creating new DB from {self.path_domain_knowledge}")
            self.collection = self.client.create_collection(name=db_name, embedding_function=ef, metadata={"hnsw:space": "cosine"})
            self.create_db_from_xlsx()
        else:
            try:
                self.collection = self.client.get_collection(name=db_name, embedding_function=ef)
            except Exception:
                self.logger.error(f"Embedding database not found! Creating new DB from {self.path_domain_knowledge}")
                self.collection = self.client.create_collection(name=db_name, embedding_function=ef, metadata={"hnsw:space": "cosine"})
                self.create_db_from_xlsx()

    def add_document(self, document, metadata, id):
        id = str(id)
        # Skip ids that are already present so rebuilding is idempotent.
        existing_documents = self.collection.get()
        if id not in existing_documents['ids']:
            try:
                self.collection.add(documents=[document], ids=[id])
            except Exception as e:
                self.logger.error(f"Error while adding document {id}: {str(e)}")
        else:
            try:
                self.logger.info(f"[Embedding Add Doc] --- ID already exists in the collection: {id}")
            except Exception:
                print(f"ID already exists in the collection: {id}")

    def query_db(self, query_text, n_results):
        results = self.collection.query(query_texts=[query_text], n_results=n_results)

        # Cosine distance: 0.0 is an exact match, larger is less similar.
        self.similarity = round(results['distances'][0][0], 3)
        self.similarity_exact = results['distances'][0][0]
        try:
            self.logger.info(f"[Embedding Search] --- Similarity (close to zero is best) {self.similarity}")
        except Exception:
            print(f"Similarity (close to zero is best) --- {self.similarity}")

        self.domain_knowledge.iloc[:, 0] = self.domain_knowledge.iloc[:, 0].astype(str)

        # results['ids'] holds one list of ids per query text.
        row_dicts = None
        for ids in results['ids']:
            row_dicts = self._get_row_from_df(ids)
            if row_dicts:
                try:
                    self.logger.info(f"[Embedding Search] --- Similar Dictionary\n{row_dicts}")
                except Exception:
                    print(row_dicts)
            else:
                try:
                    self.logger.info(f"[Embedding Search] --- No row found for ids {ids}")
                except Exception:
                    print(f"No row found for ids {ids}")

        # Return the list of dictionaries if n_results > 1, else a single dictionary
        if n_results > 1:
            return row_dicts
        else:
            return row_dicts[0] if row_dicts else None

    def create_db_from_xlsx(self):
        total_rows = len(self.domain_knowledge)
        for index, row in self.domain_knowledge.iterrows():
            try:
                self.logger.info(f"[Creating New Embedding DB] --- Adding Row {index+1}/{total_rows}")
            except Exception:
                print(f"Row {index+1}/{total_rows}")
            # First column is the id; the whole row (id included) is joined
            # into one OCR-like text blob for embedding.
            id = str(row.iloc[0])
            document = ' '.join(row.astype(str))
            self.add_document(document, None, id)

    def get_similarity(self):
        return self.similarity_exact

    def _get_row_from_df(self, ids):
        row_dicts = []  # one dictionary per matched id
        for id in ids:
            row = self.domain_knowledge[self.domain_knowledge.iloc[:, 0] == id]
            if not row.empty:
                row_dict = row.iloc[0].to_dict()
                row_dict.pop('Catalog Number', None)
                for key in row_dict:
                    if pd.isna(row_dict[key]):
                        row_dict[key] = ''
                row_dicts.append(row_dict)
        return row_dicts if row_dicts else None  # None if nothing matched

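# --- Hedged sketch (illustration only, not part of the pipeline) -------------
# The module docstring says the retrieved row dict is "added to the prompt as
# a hint for the LLM", but that step is not shown in this file, so this is one
# plausible shape for it; the prompt wording is hypothetical.
def _example_prompt_with_hint(vve, raw_ocr_text):
    hint = vve.query_db(raw_ocr_text, 1)  # nearest row as a dict, or None
    return (
        "Transcribe the following OCR text into a JSON dictionary.\n"
        f"OCR text:\n{raw_ocr_text}\n\n"
        f"Hint from a similar, already-transcribed record:\n{hint}\n"
    )
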
class VoucherVisionEmbeddingTest:
    def __init__(self, ground_truth_dir, llm_output_dir, model_name="hkunlp/instructor-xl"):
        self.ground_truth_dir = ground_truth_dir
        self.llm_output_dir = llm_output_dir
        self.model_name = model_name
        self.model = INSTRUCTOR(model_name, device="cuda")
        self.instruction = "Represent the Science json dictionary document:"

    def compare_texts(self, ground_truth_text, predicted_text):
        # Convert the texts to embeddings using the given model
        ground_truth_embedding = self.model.encode([[self.instruction, ground_truth_text]])
        predicted_embedding = self.model.encode([[self.instruction, predicted_text]])

        # Compute the cosine similarity between the two embeddings
        similarity = cosine_similarity(ground_truth_embedding, predicted_embedding)
        return similarity[0][0]

    @staticmethod
    def json_to_text(json_dict):
        return str(json_dict)

    def get_max_difference(self, similarities):
        # Largest deviation from a perfect similarity of 1.0 across the batch
        differences = [abs(1 - sim) for sim in similarities]
        return max(differences)

    def evaluate(self):
        # Get a list of all ground truth and LLM output files
        ground_truth_files = os.listdir(self.ground_truth_dir)
        llm_output_files = os.listdir(self.llm_output_dir)

        # Ensure file lists are sorted so they match up correctly
        ground_truth_files.sort()
        llm_output_files.sort()

        similarities = []
        key_similarities = []  # similarity of the sorted key lists alone

        for ground_truth_file, llm_output_file in zip(ground_truth_files, llm_output_files):
            # Read the files and convert them to text
            with open(os.path.join(self.ground_truth_dir, ground_truth_file), 'r') as f:
                ground_truth_dict = json.load(f)
                ground_truth_text = self.json_to_text(ground_truth_dict)

            with open(os.path.join(self.llm_output_dir, llm_output_file), 'r') as ff:
                llm_output_dict = json.load(ff)
                llm_output_text = self.json_to_text(llm_output_dict)

            # Compute the similarity between the ground truth and the LLM output
            similarity = self.compare_texts(ground_truth_text, llm_output_text)

            # Clip and round to mitigate floating-point precision artifacts
            similarity = np.clip(similarity, -1.0, 1.0)
            similarity = np.round(similarity, 6)
            similarities.append(similarity)

            # Compare keys only, ignoring the values
            ground_truth_keys = ', '.join(sorted(ground_truth_dict.keys()))
            llm_output_keys = ', '.join(sorted(llm_output_dict.keys()))
            key_similarity = self.compare_texts(ground_truth_keys, llm_output_keys)
            key_similarity = np.clip(key_similarity, -1.0, 1.0)
            key_similarity = np.round(key_similarity, 6)
            key_similarities.append(key_similarity)

        # Compute the mean similarity
        mean_similarity = np.mean(similarities)
        mean_key_similarity = np.mean(key_similarities)
        max_diff = self.get_max_difference(similarities)
        max_diff_key = self.get_max_difference(key_similarities)

        return mean_similarity, max_diff, similarities, mean_key_similarity, max_diff_key, key_similarities


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    # db_name = "VV_all_asia_minimal"
    db_name = "all_asia_minimal"
    path_domain_knowledge = 'D:/Dropbox/LeafMachine2/leafmachine2/transcription/domain_knowledge/AllAsiaMinimalasof25May2023_2__FOR-EMBEDDING.xlsx'
    # path_domain_knowledge = 'D:/Dropbox/LeafMachine2/leafmachine2/transcription/domain_knowledge/AllAsiaMinimalasof25May2023_2__TRIMMEDtiny.xlsx'
    build_new_db = False

    VVE = VoucherVisionEmbedding(db_name, path_domain_knowledge, logger, build_new_db=build_new_db)

    test_query = "Golden Thread\nHerbaria of Michigan State University (MSC) and\nUniversiti Kebangsaan Malaysia, Sabah Campus (UKMS)\nUNITED STATES\n3539788\nNATIONAL HERBARIUM\nPLANTS OF BORNEO\nBrookea tomentosa Benth.\nMalaysia. Sabah. Beaufort District: Beaufort Hill. 5°22'N,\n115°45'E. Elev. 200 m. Burned logged dipterocarp forest.\nCrocker Formation. Small tree, corolla cream.\nDet. at K, 1986\n28 August 1983\nWith: Reed S. Beaman and Teofila E. Beamann\nJohn H. Beaman 6844"

    domain_knowledge_example = VVE.query_db(test_query, 1)
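# --- Hedged note (illustration only) ------------------------------------------
# For a single query like the one above, chromadb returns one nested list per
# field, e.g. (values hypothetical):
#     results = {'ids': [['3539788']], 'distances': [[0.12]], ...}
# so query_db() reads results['distances'][0][0] for the best match; a cosine
# distance near 0.0 means the OCR text closely matches a known record, and
# domain_knowledge_example is that record's row as a dict, ready to be used as
# the LLM prompt hint.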