# %%
import rdflib
import pandas as pd


def get_graph():
    # File with the concept names: MGCONSO.RRF
    df_concepts = pd.read_csv("MGCONSO.RRF", sep="|", header=0)
    # Rename the column '#CUI' to 'CUI'
    df_concepts.rename(columns={"#CUI": "CUI"}, inplace=True)
    # Remove the last column, it's empty (artifact of the trailing '|' separator)
    df_concepts = df_concepts.iloc[:, :-1]
    print(df_concepts.head())
    # Create a graph
    g = rdflib.Graph()
    # Bind the namespace
    g.bind("medgen", "http://identifiers.org/medgen/")
    # Iterate over the rows, keeping only unsuppressed preferred names
    for i, row in df_concepts.iterrows():
        if row.SUPPRESS == "Y":
            continue
        if row.ISPREF == "Y" and row.STT == "PF" and row.TS == "P":
            # Create the URI
            uri = rdflib.URIRef(f"http://identifiers.org/medgen/{row.CUI}")
            # Add the label triple
            g.add((uri, rdflib.RDFS.label, rdflib.Literal(row.STR)))
    # Now, load the relations file: MGREL.RRF
    df_relations = pd.read_csv("MGREL.RRF", sep="|", header=0)
    # Rename the column '#CUI1' to 'CUI1'
    df_relations.rename(columns={"#CUI1": "CUI1"}, inplace=True)
    # Remove the last column, it's empty
    df_relations = df_relations.iloc[:, :-1]
    print(df_relations.head())
    # Iterate over the rows
    for i, row in df_relations.iterrows():
        if row.SUPPRESS == "Y":
            continue
        # Create the URIs
        uri1 = rdflib.URIRef(f"http://identifiers.org/medgen/{row.CUI1}")
        uri2 = rdflib.URIRef(f"http://identifiers.org/medgen/{row.CUI2}")
        # Add the triple: 'RL' relations are mapped to the generic 'related'
        # predicate, every other REL value keeps its own medgen predicate
        if row.REL == "RL":
            g.add((uri1, rdflib.URIRef("related"), uri2))
            continue
        g.add((uri1, rdflib.URIRef(f"http://identifiers.org/medgen/{row.REL}"), uri2))
    return g
def apply_rules_to_graph(g):
    # Now, apply this rule: if two nodes have the same parent
    # (i.e. parent RN child1 and parent RN child2), then child1 related child2
    # Query the graph to get the children of each parent
    query = """
    PREFIX medgen: <http://identifiers.org/medgen/>
    SELECT DISTINCT ?parent ?child1 ?child2 WHERE {
        ?parent medgen:RN ?child1 .
        ?parent medgen:RN ?child2 .
        FILTER (?child1 != ?child2)
    }
    """
    res = g.query(query)
    for row in res:
        # Add the 'related' edge in both directions
        g.add((row.child1, rdflib.URIRef("related"), row.child2))
        g.add((row.child2, rdflib.URIRef("related"), row.child1))
    return g
def get_labels_of_entities():
    """
    Returns a dictionary with the labels of the entities
    """
    # File with the concept names: MGCONSO.RRF
    df_concepts = pd.read_csv("MGCONSO.RRF", sep="|", header=0)
    # Rename the column '#CUI' to 'CUI'
    df_concepts.rename(columns={"#CUI": "CUI"}, inplace=True)
    # Remove the last column, it's empty
    df_concepts = df_concepts.iloc[:, :-1]
    # Create a dictionary
    labels_of_entities = {}
    # Iterate over the rows
    for i, row in df_concepts.iterrows():
        if row.SUPPRESS == "Y":
            continue
        if row.ISPREF == "Y" and row.STT == "PF" and row.TS == "P":
            labels_of_entities[f"http://identifiers.org/medgen/{row.CUI}"] = row.STR
    return labels_of_entities


def generate_triples_file(graph: rdflib.Graph):
    with open("triples_medgen.tsv", "w") as f:
        # Output the triples ?s ?p ?o
        for s, p, o in graph.triples((None, rdflib.URIRef("related"), None)):
            f.write(f"{s}\t{p}\t{o}\n")
        for s, p, o in graph.triples(
            (None, rdflib.URIRef("http://identifiers.org/medgen/RN"), None)
        ):
            f.write(f"{s}\t{p}\t{o}\n")
        for s, p, o in graph.triples(
            (None, rdflib.URIRef("http://identifiers.org/medgen/RB"), None)
        ):
            f.write(f"{s}\t{p}\t{o}\n")
        for s, p, o in graph.triples(
            (None, rdflib.URIRef("http://identifiers.org/medgen/PAR"), None)
        ):
            f.write(f"{s}\t{p}\t{o}\n")
        for s, p, o in graph.triples(
            (None, rdflib.URIRef("http://identifiers.org/medgen/CHD"), None)
        ):
            f.write(f"{s}\t{p}\t{o}\n")
def save_adjacency_matrix():
    # Load the triples file generated
    df = pd.read_csv("triples_medgen.tsv", sep="\t", header=None)
    # Now output the adjacency matrix, where the rows are the subjects and the columns
    # are the objects. The values are the relations (i.e. 0 if no relation and 1 if
    # there is a relation).
    # Get the unique subjects and objects
    subjects = df[0].unique()
    objects = df[2].unique()
    # Create the adjacency matrix
    adj_matrix = pd.DataFrame(0, index=subjects, columns=objects)
    # Iterate over the rows
    for i, row in df.iterrows():
        adj_matrix.loc[row[0], row[2]] = 1
    # Save the adjacency matrix
    adj_matrix.to_csv("adjacency_matrix.mat", sep="\t")
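

# A memory-friendlier variant (a sketch, not part of the original pipeline): the dense
# DataFrame above allocates |subjects| x |objects| cells, which can get large for the
# full MedGen relation set. Assuming scipy is available, the same 0/1 matrix can be
# built and saved in sparse form instead. The function name and output file below are
# hypothetical.
def save_adjacency_matrix_sparse():
    from scipy import sparse
    import numpy as np

    df = pd.read_csv("triples_medgen.tsv", sep="\t", header=None)
    # Drop duplicate (subject, object) pairs so every cell is 0 or 1
    df = df.drop_duplicates(subset=[0, 2])
    # Map subjects and objects to integer row/column indices
    row_idx, subjects = pd.factorize(df[0])
    col_idx, objects = pd.factorize(df[2])
    adj = sparse.coo_matrix(
        (np.ones(len(df), dtype=np.int8), (row_idx, col_idx)),
        shape=(len(subjects), len(objects)),
    )
    sparse.save_npz("adjacency_matrix.npz", adj.tocsr())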
# %%
g = get_graph()
# %%
g = apply_rules_to_graph(g)
# %%
labels_of_entities = get_labels_of_entities()
# %%
generate_triples_file(g)
# %%
from pykeen.triples import TriplesFactory
from pykeen.models import TuckER, TransE, TransH
from pykeen.pipeline import pipeline

tf = TriplesFactory.from_path("triples_medgen.tsv")
print(f"Triples count: {tf.num_triples}")
training, testing, validation = tf.split(
    [0.8, 0.1, 0.1], random_state=42, randomize_cleanup=False
)
result = pipeline(
    training=training,
    testing=testing,
    validation=validation,
    model=TransE,
    stopper="early",
    epochs=500,  # upper bound on training epochs; the early stopper usually halts sooner
)
result.save_to_directory("doctests/test_unstratified_stopped_complex")
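# %%
# Quick sanity check on the trained model (a sketch, assuming the default rank-based
# evaluator that `pipeline` runs on the test split): dump the evaluation metrics as a
# DataFrame so hits@k / mean rank can be inspected before using the embeddings.
print(result.metric_results.to_df())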
# %%
import torch

alzheimers = "http://identifiers.org/medgen/C1843013"
# What does the model predict for Alzheimer's disease?
model = result.model
alzheimers_id = tf.entity_to_id[alzheimers]
relation_id = tf.relation_to_id["related"]
batch_to_predict = torch.tensor([[alzheimers_id, relation_id]])
alzheimers_pred = model.predict_t(hr_batch=batch_to_predict)
print(alzheimers_pred.shape)
# Get the indices of the top 10 predictions
top10 = torch.topk(alzheimers_pred, 10, largest=True)
# Get the entities
entities = tf.entity_id_to_label
print(top10.indices)
for entity_idx in top10.indices[0]:
    # Ask the graph: what is the label for this entity?
    query = f"""
    PREFIX medgen: <http://identifiers.org/medgen/>
    SELECT ?label WHERE {{
        <{entities[entity_idx.item()]}> <http://www.w3.org/2000/01/rdf-schema#label> ?label
    }}
    """
    res = g.query(query)
    for i, row in enumerate(res):
        print(f"{i}: {row}")
# %%
from pykeen.nn.representation import Embedding

# Get the embeddings of all the entities. The model's first entity representation is an
# Embedding module; calling its underlying torch embedding with all entity ids returns a
# plain tensor. The ids are put on the GPU to match the model's device (the code assumes
# CUDA is available).
entity_ids = torch.LongTensor(list(tf.entity_to_id.values())).cuda()
entity_representation: Embedding = model.entity_representations[0]
entity_embeddings = entity_representation._embeddings(entity_ids)
# Get the embeddings of the relations
relation_ids = torch.LongTensor(list(tf.relation_to_id.values())).cuda()
relation_representation: Embedding = model.relation_representations[0]
relation_embeddings = relation_representation._embeddings(relation_ids)
print(f"Entity embeddings shape: {entity_embeddings.shape}")
print(f"Relation embeddings shape: {relation_embeddings.shape}")
# Store the entity embeddings in a DataFrame
df = pd.DataFrame(
    {
        "embedding": entity_embeddings.detach().cpu().tolist(),
        "label": [
            labels_of_entities.get(tf.entity_id_to_label[i], "")
            for i in range(len(tf.entity_id_to_label))
        ],
        "uri": [tf.entity_id_to_label[i] for i in range(len(tf.entity_id_to_label))],
    },
    index=range(len(entity_embeddings)),
)
# Save the DataFrame
df.to_csv("entity_embeddings.csv")
# Store the relation embeddings in a DataFrame
df = pd.DataFrame(
    {
        "embedding": relation_embeddings.detach().cpu().tolist(),
        "label": [
            tf.relation_id_to_label[i] for i in range(len(tf.relation_id_to_label))
        ],
        "uri": [
            tf.relation_id_to_label[i] for i in range(len(tf.relation_id_to_label))
        ],
    },
    index=range(len(relation_embeddings)),
)
# Save the DataFrame
df.to_csv("relation_embeddings.csv")
# %%
import pyobo

pyobo.get_name("mesh", "16793")
# %%