Tanguyvans committed on
Commit
7833461
1 Parent(s): f26b169

augment from similar

Browse files
Files changed (1) hide show
  1. utils.py +22 -0
utils.py CHANGED
@@ -123,6 +123,28 @@ def get_similarities_among_diseases_uris(
123
  return data
124
 
125
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  def get_embedding(string: str, encoder) -> List[float]:
127
  # Embed the string using sentence-transformers
128
  vector = encoder.encode(string, show_progress_bar=False)
 
123
  return data
124
 
125
 
126
def augment_the_set_of_diseaces(engine, diseases: List[str], target_size: int = 15) -> List[str]:
    """Grow ``diseases`` up to ``target_size`` entries using the closest embeddings.

    On each round the ``Test.EntityEmbeddings`` table is queried for the entity
    (not already in the set) with the highest summed ``VECTOR_COSINE`` score
    against the current set, divided by the current set size. The last path
    segment of the winning URI is appended, so every new pick influences the
    following rounds.

    Args:
        engine: Object exposing ``connect()``; the returned connection must
            support ``begin()`` and ``execute()`` (presumably a SQLAlchemy
            engine -- confirm against the caller).
        diseases: Identifiers that are interpolated into
            ``http://identifiers.org/medgen/<id>`` URIs. The list is extended
            **in place**.
        target_size: Size to grow the list to. Defaults to 15, the value that
            used to be hard-coded.

    Returns:
        The same ``diseases`` list object, possibly extended. It is returned
        unchanged when it is empty (there is nothing to compare against), when
        it already holds ``target_size`` or more entries, and growth stops
        early if the query yields no further candidate.
    """
    # An empty seed set would produce "IN ()" and a division by zero in SQL.
    if not diseases:
        return diseases

    missing = target_size - len(diseases)
    if missing <= 0:
        return diseases

    # One connection is reused for all rounds instead of reconnecting each time.
    with engine.connect() as conn:
        for _ in range(missing):
            # Rebuilt every round because ``diseases`` grows as we go.
            # NOTE(review): identifiers are interpolated straight into the SQL
            # text; if they can come from untrusted input, switch to bound
            # parameters.
            uri_list = ",".join(
                f"'http://identifiers.org/medgen/{disease}'" for disease in diseases
            )
            # NOTE(review): the query groups by e2.label but selects e2.uri;
            # confirm labels are unique per URI, otherwise group by e2.uri.
            sql = f"""
                SELECT TOP 1 e2.uri AS new_disease, (SUM(VECTOR_COSINE(e1.embedding, e2.embedding))/ {len(diseases)}) AS score
                FROM Test.EntityEmbeddings e1, Test.EntityEmbeddings e2
                WHERE e1.uri IN ({uri_list})
                AND e2.uri NOT IN ({uri_list})
                AND e2.label != 'nan'
                GROUP BY e2.label
                ORDER BY score DESC
            """

            with conn.begin():
                rows = conn.execute(text(sql)).fetchall()

            # No candidate left: stop instead of failing on rows[0].
            if not rows:
                break

            # Keep only the identifier, i.e. the last path segment of the URI.
            diseases.append(rows[0][0].split('/')[-1])

    return diseases
147
+
148
  def get_embedding(string: str, encoder) -> List[float]:
149
  # Embed the string using sentence-transformers
150
  vector = encoder.encode(string, show_progress_bar=False)