Spaces:
Sleeping
Sleeping
# %%
import pandas as pd

# Load the disease descriptions from MGDEF.RRF (pipe-delimited, first row is
# the header).
df_disease_descriptions = pd.read_csv("MGDEF.RRF", sep="|", header=0)

# Normalise the column names: '#CUI' -> 'CUI' and 'DEF' -> 'definition'.
df_disease_descriptions = df_disease_descriptions.rename(
    columns={"#CUI": "CUI", "DEF": "definition"}
)

# Remove the last column, it's empty.
df_disease_descriptions = df_disease_descriptions.iloc[:, :-1]

# Build one boolean mask for every row-level filter:
#   * SUPPRESS must not be 'Y';
#   * some rows include a \n character, which leaves fragments of text in the
#     CUI field, so keep only rows whose CUI starts with 'C' and contains no
#     spaces;
#   * the definition must not be empty.
# `na=False` makes a missing / non-string CUI count as "no match" instead of
# putting NaN into the mask, which would break `~` and boolean indexing.
cui = df_disease_descriptions["CUI"]
keep_row = (
    (df_disease_descriptions["SUPPRESS"] != "Y")
    & cui.str.startswith("C", na=False)
    & ~cui.str.contains(" ", regex=False, na=False)
    & df_disease_descriptions["definition"].notna()
)

# Take an explicit copy so that adding the 'uri' column below modifies an
# independent frame rather than a slice of the original (avoids
# SettingWithCopyWarning).
df_disease_descriptions = df_disease_descriptions.loc[keep_row].copy()

# Build the identifiers.org URI for each concept (vectorised string concat).
df_disease_descriptions["uri"] = (
    "http://identifiers.org/medgen/" + df_disease_descriptions["CUI"]
)

# Drop the columns that are not needed (source, SUPPRESS, CUI), drop the
# descriptions that are duplicates (the first occurrence is kept), and reset
# the index so it runs 0..n-1 again.
df_disease_descriptions = (
    df_disease_descriptions.drop(columns=["source", "SUPPRESS", "CUI"])
    .drop_duplicates(subset=["definition"])
    .reset_index(drop=True)
)
# %%
from sentence_transformers import SentenceTransformer

# Embed every definition with the "allenai-specter" sentence-transformer model.
encoder = SentenceTransformer("allenai-specter")

# encode() is documented to take a string or a list of strings, so hand it a
# plain Python list rather than a pandas Series; this way the call does not
# depend on the DataFrame's index labels.
vectors = encoder.encode(
    df_disease_descriptions["definition"].tolist(),
    show_progress_bar=True,
    batch_size=64,
)

# Notebook cell output: display the shape of the embedding array.
vectors.shape
# %%
import numpy as np

# Store the embeddings next to their definitions. The array is converted to
# float32 first; casting="same_kind" only permits casts within the same kind
# (e.g. float64 -> float32), so an unexpected dtype raises instead of being
# converted silently. tolist() turns the array into nested Python lists
# (presumably one list per row, assuming `vectors` is 2-D) so each DataFrame
# cell holds one embedding.
df_disease_descriptions["embeddings"] = vectors.astype(
    np.float32, casting="same_kind"
).tolist()
# %%
# Persist the table to a CSV file: header row included, DataFrame index left
# out.
# NOTE: the 'embeddings' cells hold Python lists, which CSV stores as text, so
# a reader has to parse them back into numbers.
df_disease_descriptions.to_csv(
    path_or_buf="disease_descriptions_with_embeddings.csv",
    header=True,
    index=False,
)
# %% | |