Spaces:

klinic-hackupc
/

klinic

Sleeping

File size: 9,734 Bytes

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Vector Search "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os, pandas as pd\n",
    "from sqlalchemy import create_engine, text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "username = 'demo'\n",
    "password = 'demo'\n",
    "hostname = os.getenv('IRIS_HOSTNAME', 'localhost')\n",
    "port = '1972' \n",
    "namespace = 'USER'\n",
    "CONNECTION_STRING = f\"iris://{username}:{password}@{hostname}:{port}/{namespace}\"\n",
    "\n",
    "engine = create_engine(CONNECTION_STRING)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load knowledge graph\n",
    "entity_embeddings = pd.read_csv('./entity_embeddings.csv', index_col=0)\n",
    "entity_embeddings[\"embedding\"] = entity_embeddings[\"embedding\"].apply(\n",
    "    lambda x: x[1:-1])\n",
    "\n",
    "len_label = entity_embeddings['label'].str.len().max()\n",
    "len_uri = entity_embeddings['uri'].str.len().max()\n",
    "# TODO: set varchar length dynamically as above\n",
    "with engine.connect() as conn:\n",
    "    with conn.begin(): \n",
    "        result = conn.execute(text('DROP TABLE IF EXISTS Test.EntityEmbeddings'))\n",
    "        sql = f\"\"\"\n",
    "                CREATE TABLE Test.EntityEmbeddings (\n",
    "                        embedding VECTOR(DOUBLE, 50),\n",
    "                        label VARCHAR(143),\n",
    "                        uri VARCHAR(38)\n",
    "                )\n",
    "                \"\"\"\n",
    "        result = conn.execute(text(sql))\n",
    "\n",
    "with engine.connect() as conn:\n",
    "    with conn.begin():\n",
    "        for index, row in entity_embeddings.iterrows():\n",
    "            sql = text(\"\"\"\n",
    "                INSERT INTO Test.EntityEmbeddings \n",
    "                (embedding, label, uri) \n",
    "                VALUES (TO_VECTOR(:embedding), :label, :uri)\n",
    "            \"\"\")\n",
    "            conn.execute(sql, {\n",
    "                'embedding': str(row['embedding']),\n",
    "                'label': row['label'], \n",
    "                'uri': row['uri']\n",
    "            })\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculate distance between entities\n",
    "with engine.connect() as conn:\n",
    "    with conn.begin():\n",
    "        sql = f\"\"\"\n",
    "                SELECT TOP 10 e1.uri AS uri1, e2.uri AS uri2, e1.label AS label1, e2.label AS label2,\n",
    "                VECTOR_COSINE(e1.embedding, e2.embedding) AS distance\n",
    "                FROM Test.EntityEmbeddings e1, Test.EntityEmbeddings e2\n",
    "                WHERE e1.uri = 'http://identifiers.org/medgen/C0002395'\n",
    "                ORDER BY distance DESC\n",
    "                \"\"\"\n",
    "        result = conn.execute(text(sql))\n",
    "        data = result.fetchall()\n",
    "        display(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load clinical trials\n",
    "\n",
    "relation_embeddings = pd.read_csv('./relation_embeddings.csv', index_col=0)\n",
    "relation_embeddings[\"embedding\"] = relation_embeddings[\"embedding\"].apply(\n",
    "    lambda x: x[1:-1])\n",
    "\n",
    "len_label = relation_embeddings['label'].str.len().max()\n",
    "len_uri = relation_embeddings['uri'].str.len().max()\n",
    "# TODO: set varchar length dynamically as above\n",
    "with engine.connect() as conn:\n",
    "    with conn.begin():# Load \n",
    "        result = conn.execute(text('DROP TABLE IF EXISTS Test.RelationEmbeddings'))\n",
    "        sql = f\"\"\"\n",
    "                CREATE TABLE Test.RelationEmbeddings (\n",
    "                        embedding VECTOR(DOUBLE, 50),\n",
    "                        label VARCHAR(10),\n",
    "                        uri VARCHAR(38)\n",
    "                )\n",
    "                \"\"\"\n",
    "        result = conn.execute(text(sql))\n",
    "\n",
    "with engine.connect() as conn:\n",
    "    with conn.begin():\n",
    "        for index, row in relation_embeddings.iterrows():\n",
    "            sql = text(\"\"\"\n",
    "                INSERT INTO Test.RelationEmbeddings \n",
    "                (embedding, label, uri) \n",
    "                VALUES (TO_VECTOR(:embedding), :label, :uri)\n",
    "            \"\"\")\n",
    "            conn.execute(sql, {\n",
    "                'embedding': str(row['embedding']),\n",
    "                'label': row['label'], \n",
    "                'uri': row['uri']\n",
    "            })"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load knowledge graph\n",
    "clinical_trials = pd.read_csv(\"clinical_trials_embeddings.csv\")\n",
    "clinical_trials[\"embeddings\"] = clinical_trials[\"embeddings\"].apply(lambda x: x[1:-1])\n",
    "display(clinical_trials.head())\n",
    "\n",
    "# TODO: set varchar length dynamically as above\n",
    "with engine.connect() as conn:\n",
    "    with conn.begin():\n",
    "        result = conn.execute(text(\"DROP TABLE IF EXISTS Test.ClinicalTrials\"))\n",
    "        sql = f\"\"\"\n",
    "                CREATE TABLE Test.ClinicalTrials (\n",
    "                        nct_id VARCHAR(11) PRIMARY KEY,\n",
    "                        diseases TEXT,\n",
    "                        embedding VECTOR(DOUBLE, 768)\n",
    "                )\n",
    "                \"\"\"\n",
    "        result = conn.execute(text(sql))\n",
    "\n",
    "with engine.connect() as conn:\n",
    "    with conn.begin():\n",
    "        for index, row in clinical_trials.iterrows():\n",
    "\n",
    "            sql = text(\n",
    "                \"\"\"\n",
    "                INSERT INTO Test.ClinicalTrials \n",
    "                (nct_id, diseases, embedding)\n",
    "                VALUES (:nct_id, :diseases, TO_VECTOR(:embedding))\n",
    "            \"\"\"\n",
    "            )\n",
    "            conn.execute(\n",
    "                sql,\n",
    "                {\n",
    "                    \"nct_id\": row[\"nct_id\"],\n",
    "                    \"diseases\": row[\"desease_condition\"],\n",
    "                    \"embedding\": str(row[\"embeddings\"]),\n",
    "                },\n",
    "            )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# %%\n",
    "import pandas as pd\n",
    "import rdflib\n",
    "\n",
    "# Load the disease descriptions from MGDEF.RRF\n",
    "df_disease_descriptions = pd.read_csv(\"MGDEF.RRF\", sep=\"|\", header=0)\n",
    "# Rename the column '#CUI' to 'CUI'\n",
    "df_disease_descriptions.rename(columns={\"#CUI\": \"CUI\"}, inplace=True)\n",
    "# Remove the last column, it's empty\n",
    "df_disease_descriptions = df_disease_descriptions.iloc[:, :-1]\n",
    "# Filter out the rows where the SUPPRESS field is equal to 'Y'\n",
    "df_disease_descriptions = df_disease_descriptions[df_disease_descriptions[\"SUPPRESS\"] != \"Y\"]\n",
    "# Some of the rows include a \\n character, so we need to remove the rows where the CUI field contains spaces or doesn't start with 'C'\n",
    "df_disease_descriptions = df_disease_descriptions[df_disease_descriptions[\"CUI\"].str.startswith(\"C\") & ~df_disease_descriptions[\"CUI\"].str.contains(\" \")]\n",
    "# Remove the rows where the DEF field is empty\n",
    "df_disease_descriptions = df_disease_descriptions[df_disease_descriptions[\"DEF\"].notnull()]\n",
    "df_disease_descriptions['uri'] = df_disease_descriptions['CUI'].apply(lambda x: f'http://identifiers.org/medgen/{x}')\n",
    "\n",
    "with engine.connect() as conn:\n",
    "    with conn.begin(): \n",
    "        result = conn.execute(text('DROP TABLE IF EXISTS Test.DiseaseDescriptions'))\n",
    "        sql = f\"\"\"\n",
    "                CREATE TABLE Test.DiseaseDescriptions (\n",
    "                        uri VARCHAR(50),\n",
    "                        description TEXT\n",
    "                )\n",
    "                \"\"\"\n",
    "        result = conn.execute(text(sql))\n",
    "\n",
    "with engine.connect() as conn:\n",
    "    with conn.begin():\n",
    "        for index, row in df_disease_descriptions.iterrows():\n",
    "            print(row['DEF'])\n",
    "            print(row['uri'])\n",
    "            sql = text(\"\"\"\n",
    "                INSERT INTO Test.DiseaseDescriptions \n",
    "                (uri, description) \n",
    "                VALUES ( :uri, :description)\n",
    "            \"\"\")\n",
    "            conn.execute(sql, {\n",
    "                'uri': row['uri'],\n",
    "                'description': row['DEF'], \n",
    "            })"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "treehacks",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}