klinic / get_embeddings_of_disease_descriptions.py
acmc
First commit in HuggingFace
93e1b64
raw
history blame contribute delete
No virus
2.05 kB
# %%
import pandas as pd
# Build the table of unique disease definitions from MGDEF.RRF.
#
# Result: `df_disease_descriptions` with exactly two columns, in this order:
#   definition - the free-text description (the file's DEF column)
#   uri        - http://identifiers.org/medgen/<CUI> for the row's concept
# and a fresh 0..n-1 index.
#
# The steps are written as one method chain so that no intermediate frame is
# mutated in place and every filter sees the output of the previous step.
#
# NOTE(review): pandas' default quote handling is left enabled, exactly as
# before. If definitions contain stray double quotes, that may contribute to
# the malformed rows filtered out below -- confirm before changing the parser
# options, since doing so would alter which rows survive.
df_disease_descriptions = (
    pd.read_csv("MGDEF.RRF", sep="|", header=0)
    # Normalise the two column names used below.
    .rename(columns={"#CUI": "CUI", "DEF": "definition"})
    # The last parsed column is empty (presumably produced by a trailing
    # delimiter on every line), so drop it.
    .iloc[:, :-1]
    # Keep only the rows that are not flagged as suppressed.
    .loc[lambda frame: frame["SUPPRESS"] != "Y"]
    # Some definitions contain a newline, which splits one record across
    # several lines and leaves fragments of text in the CUI column. A real
    # CUI starts with "C" and has no spaces. `na=False` makes a missing CUI
    # count as "not valid" instead of putting NaN into the boolean mask,
    # which would make the filter raise.
    .loc[
        lambda frame: frame["CUI"].str.startswith("C", na=False)
        & ~frame["CUI"].str.contains(" ", regex=False, na=False)
    ]
    # Rows without a definition have nothing to embed.
    .loc[lambda frame: frame["definition"].notnull()]
    # Vectorised string concatenation; every CUI is a string at this point.
    .assign(uri=lambda frame: "http://identifiers.org/medgen/" + frame["CUI"])
    # Only `definition` and `uri` are needed downstream.
    .drop(columns=["source", "SUPPRESS", "CUI"])
    # Keep the first occurrence of each distinct definition text.
    .drop_duplicates(subset=["definition"])
    # Renumber the rows 0..n-1 after all the filtering.
    .reset_index(drop=True)
)
# %%
from sentence_transformers import SentenceTransformer
# Embed every disease definition with the "allenai-specter" model.
encoder = SentenceTransformer("allenai-specter")
# `encode` is documented to take a string or a list of strings, so hand it a
# plain list. Passing the Series directly appears to work only while its
# index is the default 0..n-1 range, because the library looks items up by
# integer position when it batches them.
vectors = encoder.encode(
    df_disease_descriptions["definition"].tolist(),
    show_progress_bar=True,
    batch_size=64,
)
# Notebook-cell display of the result's shape (presumably
# (number of definitions, embedding size)); a no-op when run as a script.
vectors.shape
# %%
import numpy as np
# Attach the embeddings to the table as a new "embeddings" column: the array
# is cast to float32 and converted to nested Python lists, so each row holds
# one list of floats. casting="same_kind" only permits conversions within
# the same kind of dtype (e.g. float64 -> float32) and raises otherwise.
# Assumes `vectors` has one row per row of the DataFrame -- TODO confirm.
df_disease_descriptions["embeddings"] = (
    vectors.astype(np.float32, casting="same_kind").tolist()
)
# %%
# Save the final table as CSV, with a header row and without the index.
# NOTE(review): CSV cells are text, so any list-valued column is written as
# its string representation and has to be parsed back when the file is read.
df_disease_descriptions.to_csv(
    path_or_buf="disease_descriptions_with_embeddings.csv",
    header=True,
    index=False,
)
# %%