# %%
# Build a table of MedGen disease definitions, embed each definition with a
# sentence-transformer model, and write the result to a CSV file.
import pandas as pd

# Load the disease descriptions from MGDEF.RRF (pipe-delimited, header on the
# first line).
# NOTE(review): the broken rows handled below may be caused by unescaped quote
# characters in the DEF field; passing quoting=csv.QUOTE_NONE might avoid them
# at the source -- confirm against the actual file before changing.
df_disease_descriptions = pd.read_csv("MGDEF.RRF", sep="|", header=0)

# Rename '#CUI' to 'CUI' and 'DEF' to 'definition'.
df_disease_descriptions = df_disease_descriptions.rename(
    columns={"#CUI": "CUI", "DEF": "definition"}
)

# Remove the last column, it's empty.
df_disease_descriptions = df_disease_descriptions.iloc[:, :-1]

# Keep a row only when all of the following hold:
#   * SUPPRESS is not 'Y';
#   * CUI starts with 'C' and contains no space (some rows include a "\n"
#     character, which leaves fragments in the CUI column);
#   * the definition is present.
# na=False makes a missing / non-string CUI count as "no match" instead of
# yielding NaN, which would break the `~` negation and the boolean indexing.
cui = df_disease_descriptions["CUI"]
keep_row = (
    (df_disease_descriptions["SUPPRESS"] != "Y")
    & cui.str.startswith("C", na=False)
    & ~cui.str.contains(" ", regex=False, na=False)
    & df_disease_descriptions["definition"].notnull()
)
# .copy() so the column assignment below operates on an independent frame.
df_disease_descriptions = df_disease_descriptions[keep_row].copy()

# Build the identifiers.org URI for every remaining CUI.
df_disease_descriptions["uri"] = (
    "http://identifiers.org/medgen/" + df_disease_descriptions["CUI"]
)

# Drop the columns that are not needed (source, SUPPRESS, CUI), drop the
# descriptions that are duplicates (first occurrence is kept), and reset the
# index so it runs 0..n-1.
df_disease_descriptions = (
    df_disease_descriptions.drop(columns=["source", "SUPPRESS", "CUI"])
    .drop_duplicates(subset=["definition"])
    .reset_index(drop=True)
)

# %%
from sentence_transformers import SentenceTransformer

encoder = SentenceTransformer("allenai-specter")
# Pass a plain list of strings rather than the pandas Series, so the encoder's
# internal positional indexing cannot be affected by the DataFrame index.
vectors = encoder.encode(
    df_disease_descriptions["definition"].tolist(),
    show_progress_bar=True,
    batch_size=64,
)
vectors.shape

# %%
import numpy as np

# Store one embedding per row as a list of floats; casting="same_kind" refuses
# any conversion to float32 that is not a float-to-float cast.
df_disease_descriptions["embeddings"] = vectors.astype(
    "float32", casting="same_kind"
).tolist()

# %%
# Write to a CSV file
df_disease_descriptions.to_csv(
    "disease_descriptions_with_embeddings.csv", index=False, header=True
)

# %%