# klinic / graph.py
# %%
import rdflib
import pandas as pd
def get_graph():
    # File with the concept names (atoms): MGCONSO.RRF
    df_concepts = pd.read_csv("MGCONSO.RRF", sep="|", header=0)
    # Rename the column '#CUI' to 'CUI'
    df_concepts.rename(columns={"#CUI": "CUI"}, inplace=True)
    # Remove the last column, it's empty
    df_concepts = df_concepts.iloc[:, :-1]
    print(df_concepts.head())
    # Create a graph
    g = rdflib.Graph()
    # Bind the namespace
    g.bind("medgen", "http://identifiers.org/medgen/")
    # Iterate over the rows, keeping only non-suppressed, preferred names
    # (TS == "P", STT == "PF", ISPREF == "Y")
    for i, row in df_concepts.iterrows():
        if row.SUPPRESS == "Y":
            continue
        if row.ISPREF == "Y" and row.STT == "PF" and row.TS == "P":
            # Create the URI
            uri = rdflib.URIRef(f"http://identifiers.org/medgen/{row.CUI}")
            # Add the label triple
            g.add((uri, rdflib.RDFS.label, rdflib.Literal(row.STR)))
    # Now, load the relations file MGREL.RRF
    df_relations = pd.read_csv("MGREL.RRF", sep="|", header=0)
    # Rename the column '#CUI1' to 'CUI1'
    df_relations.rename(columns={"#CUI1": "CUI1"}, inplace=True)
    # Remove the last column, it's empty
    df_relations = df_relations.iloc[:, :-1]
    print(df_relations.head())
    # Iterate over the rows
    for i, row in df_relations.iterrows():
        if row.SUPPRESS == "Y":
            continue
        # Create the URIs of both concepts
        uri1 = rdflib.URIRef(f"http://identifiers.org/medgen/{row.CUI1}")
        uri2 = rdflib.URIRef(f"http://identifiers.org/medgen/{row.CUI2}")
        # Add the triple; "RL" (alike/similar) relations are mapped to the plain
        # "related" predicate, every other REL value keeps its own predicate
        if row.REL == "RL":
            g.add((uri1, rdflib.URIRef("related"), uri2))
            continue
        g.add((uri1, rdflib.URIRef(f"http://identifiers.org/medgen/{row.REL}"), uri2))
    return g
def apply_rules_to_graph(g):
    # Rule: if two nodes share the same parent (i.e. parent RN child1 and
    # parent RN child2), then child1 and child2 are related to each other.
    # Query the graph for all pairs of children with a common parent
    query = """
    PREFIX medgen: <http://identifiers.org/medgen/>
    SELECT DISTINCT ?parent ?child1 ?child2 WHERE {
        ?parent medgen:RN ?child1 .
        ?parent medgen:RN ?child2 .
        FILTER (?child1 != ?child2)
    }
    """
    res = g.query(query)
    for row in res:
        g.add((row.child1, rdflib.URIRef("related"), row.child2))
        g.add((row.child2, rdflib.URIRef("related"), row.child1))
    return g
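# Toy sketch of the sibling rule above: with A RN B and A RN C,
# apply_rules_to_graph should add B related C and C related B, giving four
# triples in total (the A/B/C URIs here are made up purely for illustration).
_toy = rdflib.Graph()
_rn = rdflib.URIRef("http://identifiers.org/medgen/RN")
_a, _b, _c = (rdflib.URIRef(f"http://identifiers.org/medgen/{x}") for x in ("A", "B", "C"))
_toy.add((_a, _rn, _b))
_toy.add((_a, _rn, _c))
assert len(apply_rules_to_graph(_toy)) == 4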
def get_labels_of_entities():
    """
    Returns a dictionary mapping entity URIs to their preferred labels.
    """
    # File with the concept names (atoms): MGCONSO.RRF
    df_concepts = pd.read_csv("MGCONSO.RRF", sep="|", header=0)
    # Rename the column '#CUI' to 'CUI'
    df_concepts.rename(columns={"#CUI": "CUI"}, inplace=True)
    # Remove the last column, it's empty
    df_concepts = df_concepts.iloc[:, :-1]
    # Create a dictionary
    labels_of_entities = {}
    # Iterate over the rows, keeping only non-suppressed, preferred names
    for i, row in df_concepts.iterrows():
        if row.SUPPRESS == "Y":
            continue
        if row.ISPREF == "Y" and row.STT == "PF" and row.TS == "P":
            labels_of_entities[f"http://identifiers.org/medgen/{row.CUI}"] = row.STR
    return labels_of_entities
def generate_triples_file(graph: rdflib.Graph):
    with open("triples_medgen.tsv", "w") as f:
        # Output the triples ?s ?p ?o, restricted to the "related", RN, RB,
        # PAR and CHD predicates
        for s, p, o in graph.triples((None, rdflib.URIRef("related"), None)):
            f.write(f"{s}\t{p}\t{o}\n")
        for s, p, o in graph.triples(
            (None, rdflib.URIRef("http://identifiers.org/medgen/RN"), None)
        ):
            f.write(f"{s}\t{p}\t{o}\n")
        for s, p, o in graph.triples(
            (None, rdflib.URIRef("http://identifiers.org/medgen/RB"), None)
        ):
            f.write(f"{s}\t{p}\t{o}\n")
        for s, p, o in graph.triples(
            (None, rdflib.URIRef("http://identifiers.org/medgen/PAR"), None)
        ):
            f.write(f"{s}\t{p}\t{o}\n")
        for s, p, o in graph.triples(
            (None, rdflib.URIRef("http://identifiers.org/medgen/CHD"), None)
        ):
            f.write(f"{s}\t{p}\t{o}\n")
def save_adjacency_matrix():
    # Load the triples file generated above
    df = pd.read_csv("triples_medgen.tsv", sep="\t", header=None)
    # Build the adjacency matrix: rows are subjects, columns are objects,
    # and a cell is 1 if any relation holds between the pair, 0 otherwise
    # Get the unique subjects and objects
    subjects = df[0].unique()
    objects = df[2].unique()
    # Create the adjacency matrix
    adj_matrix = pd.DataFrame(0, index=subjects, columns=objects)
    # Iterate over the rows
    for i, row in df.iterrows():
        adj_matrix.loc[row[0], row[2]] = 1
    # Save the adjacency matrix
    adj_matrix.to_csv("adjacency_matrix.mat", sep="\t")
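# Note: an equivalent one-liner (a sketch, assuming duplicate subject/object
# pairs should simply collapse to 1) would be pd.crosstab(df[0], df[2]).clip(upper=1).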
# %%
g = get_graph()
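# %%
# Optional checkpoint (a sketch; the Turtle file name is arbitrary): report the
# graph size and, if desired, persist it so the RRF files need not be re-parsed.
print(f"Graph has {len(g)} triples")
# g.serialize(destination="medgen_graph.ttl", format="turtle")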
# %%
g = apply_rules_to_graph(g)
# %%
labels_of_entities = get_labels_of_entities()
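# %%
# Quick sanity check: how many preferred labels were collected
print(f"Collected {len(labels_of_entities)} preferred labels")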
# %%
generate_triples_file(g)
# %%
from pykeen.triples import TriplesFactory
from pykeen.models import TuckER, TransE, TransH
from pykeen.pipeline import pipeline
tf = TriplesFactory.from_path("triples_medgen.tsv")
print(f"Triples count: {tf.num_triples}")
training, testing, validation = tf.split([0.8, 0.1, 0.1], random_state=42, randomize_cleanup=False)
result = pipeline(
    training=training,
    testing=testing,
    validation=validation,
    model=TransE,
    stopper="early",
    epochs=500,  # short epochs for testing - you should go
    # higher, especially with early stopper enabled
)
result.save_to_directory("doctests/test_unstratified_stopped_complex")
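# %%
# A minimal sketch of how the held-out evaluation could be inspected;
# metric_results.to_df() is standard PyKEEN API, but treat the exact metric
# name below as an assumption.
print(result.metric_results.to_df().head())
print(result.metric_results.get_metric("hits@10"))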
# %%
import torch
alzheimers = "http://identifiers.org/medgen/C1843013"
# What does the model predict for Alzheimer's disease?
model = result.model
alzheimers_id = tf.entity_to_id[alzheimers]
relation_id = tf.relation_to_id["related"]
batch_to_predict = torch.tensor([[alzheimers_id, relation_id]])
alzheimers_pred = model.predict_t(hr_batch=batch_to_predict)
print(alzheimers_pred.shape)
# Get the indices of the top 10 predictions
top10 = torch.topk(alzheimers_pred, 10, largest=True)
# Get the entities
entities = tf.entity_id_to_label
print(top10.indices)
for i in top10.indices[0]:
    # Ask the graph, what is the label for this entity?
    query = f"""
    PREFIX medgen: <http://identifiers.org/medgen/>
    SELECT ?label WHERE {{
        <{entities[i.item()]}> <http://www.w3.org/2000/01/rdf-schema#label> ?label
    }}
    """
    res = g.query(query)
    for j, row in enumerate(res):
        print(f"{j}: {row}")
# %%
# Get the embeddings of all the entities (calling the underlying
# torch.nn.Embedding with the ids returns a plain tensor)
entity_ids = torch.LongTensor(list(tf.entity_to_id.values())).cuda()
entity_embeddings: torch.Tensor = model.entity_representations[0]._embeddings(entity_ids)
# Get the embeddings of the relations
relation_ids = torch.LongTensor(list(tf.relation_to_id.values())).cuda()
relation_embeddings: torch.Tensor = model.relation_representations[0]._embeddings(
    relation_ids
)
print(f"Entity embeddings shape: {entity_embeddings.shape}")
print(f"Relation embeddings shape: {relation_embeddings.shape}")
# Store the entity embeddings in a DataFrame
df = pd.DataFrame(
    {
        "embedding": entity_embeddings.detach().cpu().tolist(),
        "label": [
            labels_of_entities.get(tf.entity_id_to_label[i], "")
            for i in range(len(tf.entity_id_to_label))
        ],
        "uri": [
            tf.entity_id_to_label[i] for i in range(len(tf.entity_id_to_label))
        ],
    },
    index=range(len(entity_embeddings)),
)
# Save the DataFrame
df.to_csv("entity_embeddings.csv")
# Store the relation embeddings in a DataFrame
df = pd.DataFrame(
    {
        "embedding": relation_embeddings.detach().cpu().tolist(),
        "label": [
            tf.relation_id_to_label[i] for i in range(len(tf.relation_id_to_label))
        ],
        "uri": [
            tf.relation_id_to_label[i] for i in range(len(tf.relation_id_to_label))
        ],
    },
    index=range(len(relation_embeddings)),
)
# Save the DataFrame
df.to_csv("relation_embeddings.csv")
# %%
import pyobo
# Resolve a MeSH identifier to its name with PyOBO
pyobo.get_name("mesh", "16793")
# %%