ACMCMC committed
Commit: a6bd112 · Parent: b375334
WIP
Files changed:
- graph_analysis.m → MATLAB/get_metrics.m  +3 -11
- MATLAB/main.m  +1 -1
- MATLAB/visualize_app.mlapp  +0 -0
- app.py  +178 -113
- calculate_smilar_nodes.py  +9 -3
- llm_res.py  +16 -15
- main.ipynb  +33 -30
- utils.py  +138 -73
graph_analysis.m → MATLAB/get_metrics.m
RENAMED
@@ -1,7 +1,6 @@
 % Read the CSV file
-data = readtable('MGREL.RRF', Delimiter='|', FileType='text', NumHeaderLines=0, VariableNamingRule='preserve');
+data = readtable('../MGREL.RRF', Delimiter='|', FileType='text', NumHeaderLines=0, VariableNamingRule='preserve');
 data = renamevars(data,"#CUI1","CUI1");
-data = data(1:1000,:);
 ids_1 = data.CUI1;
 for k = 1 : length(ids_1)
     cellContents = ids_1{k};
@@ -10,7 +9,6 @@ for k = 1 : length(ids_1)
 end
 ids_1 = str2double(ids_1);
 ids_2 = data.CUI2;
-ids_2 = data.CUI1(2:end);
 for k = 1 : length(ids_2)
     cellContents = ids_2{k};
     % Truncate and stick back into the cell
@@ -18,11 +16,6 @@ for k = 1 : length(ids_2)
 end
 ids_2 = str2double(ids_2);
 
-
-ids_1 = ids_1(1:end-1);
-ids_2 = ids_2(2:end);
-
-
 % Get the number of unique nodes
 %nodes = unique([ids_1; ids_2]);
 %num_nodes = length(nodes);
@@ -36,8 +29,7 @@ ids_2 = ids_2(2:end);
 %G = digraph(A);
 G = digraph(ids_1, ids_2);
 [bin,binsize] = conncomp(G,'Type','weak');
-bin(1:
-size(unique(bin))
+bin(1:10)
 max(binsize)
 pg_ranks = centrality(G,'pagerank');
 G.Nodes.PageRank = pg_ranks;
@@ -46,4 +38,4 @@ G.Nodes.PageRank = pg_ranks;
 %G.Nodes.Hubs = hub_ranks;
 %G.Nodes.Authorities = auth_ranks;
 G.Nodes
-%plot(G);
+%plot(G);
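Note for readers without MATLAB: the metrics this script extracts (weak connected components, the largest component size, PageRank) correspond to standard graph-library calls. A minimal Python sketch, with networkx as an illustrative stand-in (it is not part of this repo):

import networkx as nx

G = nx.DiGraph(zip([1, 2, 4], [2, 3, 5]))             # digraph(ids_1, ids_2)
components = list(nx.weakly_connected_components(G))  # conncomp(G,'Type','weak')
print(max(len(c) for c in components))                # max(binsize)
print(nx.pagerank(G))                                 # centrality(G,'pagerank')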
MATLAB/main.m
CHANGED
@@ -17,7 +17,7 @@ end
 
 data = readtable('MGREL.RRF', Delimiter='|', FileType='text', NumHeaderLines=0, VariableNamingRule='preserve');
 data = renamevars(data,"#CUI1","CUI1");
-data = data(1:
+data = data(1:2000,:);
 
 % Create a Map to store connections
 connectionsMap = containers.Map('KeyType','char', 'ValueType','any');
MATLAB/visualize_app.mlapp
CHANGED
Binary files a/MATLAB/visualize_app.mlapp and b/MATLAB/visualize_app.mlapp differ
app.py
CHANGED
@@ -1,27 +1,30 @@
-import
-from streamlit_agraph import agraph, Node, Edge, Config
+import json
 import os
-from sqlalchemy import create_engine, text
-import pandas as pd
 import time
+
+import matplotlib
+import numpy as np
+import pandas as pd
+import streamlit as st
+from sentence_transformers import SentenceTransformer
+from sqlalchemy import create_engine, text
+from streamlit_agraph import Config, Edge, Node, agraph
+
+from llm_res import get_short_summary_out_of_json_files, tagging_insights_from_json
 from utils import (
+    augment_the_set_of_diseaces,
+    filter_out_less_promising_diseases,
     get_all_diseases_name,
-
-
+    get_clinical_records_by_ids,
+    get_clinical_trials_related_to_diseases,
     get_diseases_related_to_a_textual_description,
+    get_most_similar_diseases_from_uri,
     get_similarities_among_diseases_uris,
-
-
-    get_clinical_records_by_ids,
+    get_similarities_df,
+    get_uri_from_name,
     render_trial_details,
-
+    get_labels_of_diseases_from_uris,
 )
-from llm_res import get_short_summary_out_of_json_files, tagging_insights_from_json
-import json
-import numpy as np
-from sentence_transformers import SentenceTransformer
-import matplotlib
-
 
 # variables to reveal next steps
 show_graph = False
@@ -42,17 +45,22 @@ engine = create_engine(CONNECTION_STRING)
 
 st.image("img_klinic.jpeg", caption="(AI-generated image)", use_column_width=True)
 st.title("Klìnic", help="AI-powered clinical trial search engine")
-st.subheader(
+st.subheader(
+    "Find clinical trials in a scoped domain of biomedical research, guiding your research with AI-powered insights."
+)
 
-with st.container():
+with st.container():  # user input
     col1, col2 = st.columns((6, 1))
 
     with col1:
-        description_input = st.text_area(
+        description_input = st.text_area(
+            label="Enter a disease description 👇",
+            placeholder="A disorder manifested in memory loss and other cognitive impairments among elderly patients (60+ years old), especially women.",
+        )
    with col2:
-        st.text(
-        st.text(
-        st.text(
+        st.text("")  # dummy to center vertically
+        st.text("")  # dummy to center vertically
+        st.text("")  # dummy to center vertically
         show_analyze_status = st.button("Analyze 🔎")
 
 
@@ -64,45 +72,78 @@ with st.container():
         # 2. Get 5 diseases with the highest cosine silimarity from the DB
         status.write("Analyzing the description that you wrote...")
         encoder = SentenceTransformer("allenai-specter")
-        diseases_related_to_the_user_text =
-
+        diseases_related_to_the_user_text = (
+            get_diseases_related_to_a_textual_description(
+                description_input, encoder
+            )
+        )
+        status.info(
+            f"Selected {len(diseases_related_to_the_user_text)} diseases related to the description you entered."
         )
-        status.info(f'Selected {len(diseases_related_to_the_user_text)} diseases related to the description you entered.')
         status.json(diseases_related_to_the_user_text, expanded=False)
         status.divider()
         # 3. Get the similarities of the embeddings of those diseases (cosine similarity of the embeddings of the nodes of such diseases)
-        status.write(
-
+        status.write(
+            "Getting the similarities among the diseases to filter out less promising ones..."
+        )
+        diseases_uris = [
+            disease["uri"] for disease in diseases_related_to_the_user_text
+        ]
         similarities = get_similarities_among_diseases_uris(diseases_uris)
-        status.info(
+        status.info(
+            f"Obtained similarity information among the diseases by measuring the cosine similarity of their embeddings."
+        )
         status.json(similarities, expanded=False)
-        filtered_diseases_uris, df_similarities =
+        filtered_diseases_uris, df_similarities = (
+            filter_out_less_promising_diseases(similarities)
+        )
         # Apply a colormap to the table
-        status.table(
-
+        status.table(
+            df_similarities.style.background_gradient(cmap="viridis", axis=None)
+        )
+        status.info(
+            f"Filtered out less promising diseases, keeping {len(filtered_diseases_uris)} diseases."
+        )
         status.json(filtered_diseases_uris, expanded=False)
         status.divider()
         # 4. Potentially filter out the diseases that are not similar enough (e.g. similarity < 0.8)
         # 5. Augment the set of diseases: add new diseases that are similar to the ones that are already in the set, until we get 10-15 diseases
-        status.write(
+        status.write(
+            "Augmenting the set of diseases by finding others with related embeddings..."
+        )
         augmented_set_of_diseases = augment_the_set_of_diseaces(diseases_uris)
-
-
+        similarities_of_augmented_set_of_diseases = (
+            get_similarities_among_diseases_uris(augmented_set_of_diseases)
+        )
+        df_similarities_augmented_set = get_similarities_df(
+            similarities_of_augmented_set_of_diseases
+        )
+        status.table(
+            df_similarities_augmented_set.style.background_gradient(cmap="viridis", axis=None)
+        )
+        status.json(similarities_of_augmented_set_of_diseases, expanded=True)
+        status.info(
+            f"Augmented set of diseases: {len(augmented_set_of_diseases)} diseases."
+        )
         status.json(augmented_set_of_diseases, expanded=False)
         status.divider()
         # 6. Query the embeddings of the diseases related to each clinical trial (also in the DB), to get the most similar clinical trials to our set of diseases
         status.write("Getting the clinical trials related to the diseases found...")
-        clinical_trials_related_to_the_diseases =
-
+        clinical_trials_related_to_the_diseases = (
+            get_clinical_trials_related_to_diseases(
+                augmented_set_of_diseases, encoder
+            )
+        )
+        status.info(
+            f"Selected {len(clinical_trials_related_to_the_diseases)} clinical trials related to the diseases."
         )
-        status.info(f'Selected {len(clinical_trials_related_to_the_diseases)} clinical trials related to the diseases.')
         status.json(clinical_trials_related_to_the_diseases, expanded=False)
         status.divider()
         status.write("Getting the details of the clinical trials...")
         json_of_clinical_trials = get_clinical_records_by_ids(
             [trial["nct_id"] for trial in clinical_trials_related_to_the_diseases]
         )
-        status.success(f
+        status.success(f"Details of the clinical trials obtained.")
         status.json(json_of_clinical_trials, expanded=False)
         status.divider()
         # 7. Use an LLM to get a summary of the clinical trials, in plain text format.
@@ -112,22 +153,27 @@ with st.container():
             status.success("Summary of the clinical trials obtained.")
             disease_overview = response
         except Exception as e:
-            print(f
-            status.warning(
+            print(f"Error while getting a summary of the clinical trials: {e}")
+            status.warning(
+                f"Error while getting a summary of the clinical trials. This information will not be shown."
+            )
         try:
-
+            # 8. Use an LLM to extract numerical data from the clinical trials (e.g. number of patients, number of deaths, etc.). Get summary statistics out of that.
            status.write("Getting summary statistics of the clinical trials...")
            response = tagging_insights_from_json(json_of_clinical_trials)
            average_minimum_age = response["avg_min_age"]
            average_maximum_age = response["avg_max_age"]
-            most_common_gender = response[
+            most_common_gender = response["most_common_gender"]
 
-            print(f
-            status.success(f
+            print(f"Response from LLM tagging: {response}")
+            status.success(f"Summary statistics of the clinical trials obtained.")
        except Exception as e:
-
-
-
+            print(
+                f"Error while extracting numerical data from the clinical trials: {e}"
+            )
+            status.warning(
+                f"Error while extracting numerical data from the clinical trials. This information will not be shown."
+            )
         # 9. Show the results to the user: graph of the diseases chosen, summary of the clinical trials, summary statistics of the clinical trials, and list of the details of the clinical trials considered
         status.update(label="Done!", state="complete")
         status.balloons()
@@ -146,37 +192,55 @@ We use the embeddings of the diseases to determine the similarity between them.
 [TransH](https://ojs.aaai.org/index.php/AAAI/article/view/8870) utilizes hyperplanes to model relations between entities. It is a multi-relational model that can handle many-to-many relations between entities. The model is trained on the triples of the graph, where the triples are the subject, relation, and object of the graph. The model learns the embeddings of the entities and the relations, such that the embeddings of the subject and object are close to each other when the relation is true.
 
 Specifically, it optimizes the following cost function:
-
-
-    # TODO actual graph
-    graph_of_diseases = agraph(
-        nodes=[
-            Node(id="A", label="Node A", size=10),
-            Node(id="B", label="Node B", size=10),
-            Node(id="C", label="Node C", size=10),
-            Node(id="D", label="Node D", size=10),
-            Node(id="E", label="Node E", size=10),
-            Node(id="F", label="Node F", size=10),
-            Node(id="G", label="Node G", size=10),
-            Node(id="H", label="Node H", size=10),
-            Node(id="I", label="Node I", size=10),
-            Node(id="J", label="Node J", size=10),
-        ],
-        edges=[
-            Edge(source="A", target="B"),
-            Edge(source="B", target="C"),
-            Edge(source="C", target="D"),
-            Edge(source="D", target="E"),
-            Edge(source="E", target="F"),
-            Edge(source="F", target="G"),
-            Edge(source="G", target="H"),
-            Edge(source="H", target="I"),
-            Edge(source="I", target="J"),
-        ],
-        config=Config(height=500, width=500),
+$\\text{minimize} \\sum_{(h, r, t) \\in S} \\max(0, \\gamma + f(h, r, t) - f(h, r, t')) + \\sum_{(h, r, t) \\in S'} f(h, r, t)$
+"""
     )
-
-
+    try:
+        edges_to_show = []
+        labels_of_diseases = get_labels_of_diseases_from_uris(
+            df_similarities_augmented_set.index
+        )
+        uris_and_labels_of_diseases = dict(
+            zip(df_similarities_augmented_set.index, labels_of_diseases)
+        )
+        color_mapper = matplotlib.cm.get_cmap("viridis")
+        for source in df_similarities_augmented_set.index:
+            for target in df_similarities_augmented_set.columns:
+                if source != target:
+                    weight = df_similarities_augmented_set.loc[source, target]
+                    color = color_mapper(weight)
+                    # Convert from rgba to hex
+                    color = matplotlib.colors.to_hex(color)
+                    edges_to_show.append(
+                        Edge(
+                            source=source,
+                            target=target,
+                            # Dynamic color based on the weight
+                            color=color,
+                            weight=weight**10,
+                            type="CURVE_SMOOTH",
+                            label=f"{weight:.2f}",
+                        )
+                    )
+        graph_of_diseases = agraph(
+            nodes=[
+                Node(
+                    id=disease,
+                    label=disease,#uris_and_labels_of_diseases[disease],
+                    size=25,
+                    shape="circular",
+                )
+                for disease in df_similarities_augmented_set.index
+            ],
+            edges=edges_to_show,
+            config=Config(height=500, width=500),
+        )
+        time.sleep(2)
+    except Exception as e:
+        print(f"Error while showing the graph of the diseases: {e}")
+        st.error("Error while showing the graph of the diseases.")
+    finally:
+        show_overview = True
 
 
 # overview
@@ -187,7 +251,7 @@ with st.container():
             st.write(disease_overview)
             time.sleep(1)
         except Exception as e:
-            print(f
+            print(f"Error while showing the overview of the clinical trials: {e}")
         finally:
             show_metrics = True
 
@@ -196,7 +260,7 @@ with st.container():
     if show_metrics:
         try:
             st.write("## Metrics of the Clinical Trials")
-            col1, col2, col3
+            col1, col2, col3 = st.columns(3)
             with col1:
                 st.metric("Average Minimum Age", average_minimum_age)
             with col2:
@@ -205,7 +269,7 @@ with st.container():
                 st.metric("Most Common Gender", most_common_gender)
             time.sleep(2)
         except Exception as e:
-            print(f
+            print(f"Error while showing the metrics: {e}")
         finally:
             show_details = True
 
@@ -215,7 +279,10 @@ with st.container():
    if show_details:
        st.write("## Clinical Trials Details")
 
-        tab_titles = [
+        tab_titles = [
+            f"{trial['protocolSection']['identificationModule']['nctId']}"
+            for trial in trials
+        ]
 
        tabs = st.tabs(tab_titles)
 
@@ -231,7 +298,7 @@ if show_graph_of_all_diseases:
     chosen_disease_name = st.selectbox(
         "Choose a disease",
         st.session_state.disease_names,
-
+    )
 
     st.write("You selected:", chosen_disease_name)
     chosen_disease_uri = get_uri_from_name(engine, chosen_disease_name)
@@ -239,41 +306,39 @@ if show_graph_of_all_diseases:
     nodes = []
     edges = []
 
-
-
-
-
-            shape="circular")
+    nodes.append(
+        Node(
+            id=chosen_disease_uri, label=chosen_disease_name, size=25, shape="circular"
+        )
     )
 
-    similar_diseases = get_most_similar_diseases_from_uri(
+    similar_diseases = get_most_similar_diseases_from_uri(
+        engine, chosen_disease_uri, threshold=0.6
+    )
     print(similar_diseases)
     for uri, name, weight in similar_diseases:
-        nodes.append(
-            label=name,
-            size=25,
-            shape="circular")
-        )
+        nodes.append(Node(id=uri, label=name, size=25, shape="circular"))
 
         print(True if float(weight) > 0.7 else False)
-        edges.append(
-        [15 more old lines, truncated in the rendered diff]
+        edges.append(
+            Edge(
+                source=chosen_disease_uri,
+                target=uri,
+                color="red" if float(weight) > 0.7 else "blue",
+                weight=float(weight) ** 10,
+                type="CURVE_SMOOTH",
+                # type="STRAIGHT"
+            )
+        )
+
+    config = Config(
+        width=750,
+        height=950,
+        directed=False,
+        physics=True,
+        hierarchical=False,
+        collapsible=False,
+        # **kwargs
+    )
 
-    return_value = agraph(nodes=nodes,
-        edges=edges,
-        config=config)
+    return_value = agraph(nodes=nodes, edges=edges, config=config)
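Note on the cost function added to the explainer text above: the TransH objective it paraphrases is conventionally written as a margin-based ranking loss over golden triples Δ and corrupted triples Δ' (Wang et al., AAAI 2014). For reference, a cleaner LaTeX rendering of that standard form:

% Margin-based ranking loss; [x]_+ = max(0, x), gamma is the margin
\mathcal{L} = \sum_{(h,r,t) \in \Delta} \sum_{(h',r',t') \in \Delta'_{(h,r,t)}} \big[ f_r(h,t) + \gamma - f_{r'}(h',t') \big]_+

% Score: translation distance after projecting h and t onto the hyperplane w_r of relation r
f_r(h,t) = \big\lVert (\mathbf{h} - \mathbf{w}_r^{\top}\mathbf{h}\,\mathbf{w}_r) + \mathbf{d}_r - (\mathbf{t} - \mathbf{w}_r^{\top}\mathbf{t}\,\mathbf{w}_r) \big\rVert_2^2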
calculate_smilar_nodes.py
CHANGED
@@ -6,6 +6,7 @@ def transe_distance(head, tail, relation, entity_embeddings, relation_embeddings
     distance = head_embedding + relation_embeddings - tail_embedding
     return distance
 
+
 def calculate_similar_nodes(node, entity_embeddings, relation_embeddings, top_n=10):
     distances = []
     for i in range(len(entity_embeddings)):
@@ -14,6 +15,7 @@ def calculate_similar_nodes(node, entity_embeddings, relation_embeddings, top_n=
     distances.sort(key=lambda x: x[1].norm().item())
     return distances[:top_n]
 
+
 # %%
 import pandas as pd
 
@@ -55,9 +57,13 @@ print(
 )
 # %%
 # Calculate similar nodes to the head
-similar_nodes = calculate_similar_nodes(
+similar_nodes = calculate_similar_nodes(
+    head, entity_embeddings["embedding"], relation_embeddings["embedding"]
+)
 print(f"Similar nodes to {entity_embeddings['label'][head]} ({head}):")
 # Print the similar nodes
 for i, (node, distance) in enumerate(similar_nodes):
-    print(
-
+    print(
+        f"{i}: {entity_embeddings['label'][node]} ({node}) with distance {distance.norm().item()}"
+    )
+# %%
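For intuition, the ranking in calculate_similar_nodes follows the TransE assumption that head + relation ≈ tail for plausible triples, so candidates are sorted by the norm of the residual. A self-contained sketch of that scoring with toy torch tensors (names and data here are illustrative, not the script's):

import torch

# Toy embeddings: 5 entities and one relation in a 4-dimensional space.
entity_embeddings = torch.randn(5, 4)
relation_embedding = torch.randn(4)

def rank_tails(head_idx):
    # TransE residual: head + relation - tail; a smaller norm means a more plausible tail.
    residuals = entity_embeddings[head_idx] + relation_embedding - entity_embeddings
    norms = residuals.norm(dim=1)
    # Sort candidate tails by ascending distance, as calculate_similar_nodes does.
    return [(int(i), float(norms[i])) for i in norms.argsort()]

print(rank_tails(0)[:3])  # the three most plausible tails for entity 0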
llm_res.py
CHANGED
@@ -1,15 +1,19 @@
 import ast
 import json
 import os
+import statistics
+from collections import Counter
 from typing import Any, Dict, List
 
 import langchain
 import openai
 import pandas as pd
+import regex as re
 import requests
 from dotenv import load_dotenv
 from langchain import OpenAI
 from langchain.chains.combine_documents.stuff import StuffDocumentsChain
+from langchain.chains.llm import LLMChain
 from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain
 from langchain.document_loaders import UnstructuredURLLoader
 from langchain.embeddings import OpenAIEmbeddings
@@ -17,14 +21,9 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.vectorstores import FAISS
 from langchain_community.document_loaders import JSONLoader
 from langchain_community.document_loaders.csv_loader import CSVLoader
-from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
 from langchain_core.pydantic_v1 import BaseModel, Field
 from langchain_openai import ChatOpenAI
-from langchain.chains.llm import LLMChain
-from langchain_core.prompts import PromptTemplate
-from collections import Counter
-import statistics
-import regex as re
 
 load_dotenv()
 
@@ -245,7 +244,7 @@ General summary:"""
     prompt = PromptTemplate.from_template(prompt_template)
 
     llm = ChatOpenAI(
-        temperature=0.
+        temperature=0.5, model_name="gpt-4-turbo", api_key=os.environ["OPENAI_API_KEY"]
     )
     llm_chain = LLMChain(llm=llm, prompt=prompt)
 
@@ -279,8 +278,12 @@ General summary:"""
 def analyze_data(data):
     print(f"Data: {data}")
     # Extract minimum and maximum ages: Turn ['18 Years', '20 Years'] into [18, 20]
-    min_ages = [
-
+    min_ages = [
+        int(re.search(r"\d+", age).group()) for age in data["minimum_age"] if age
+    ]
+    max_ages = [
+        int(re.search(r"\d+", age).group()) for age in data["maximum_age"] if age
+    ]
     # primary_timeframe= [int(age.split()[0]) for age in data['[primary_outcome]'] if age]
 
     # Calculate average minimum and maximum ages
@@ -292,13 +295,13 @@ def analyze_data(data):
     most_common_gender = gender_counter.most_common(1)[0][0]
 
     # Flatten keywords list and find common keywords
-    #keywords = [keyword for sublist in data["keywords"] for keyword in sublist]
-    #common_keywords = [word for word, count in Counter(keywords).most_common()]
+    # keywords = [keyword for sublist in data["keywords"] for keyword in sublist]
+    # common_keywords = [word for word, count in Counter(keywords).most_common()]
 
     return {
         "avg_min_age": avg_min_age,
         "avg_max_age": avg_max_age,
-        "most_common_gender": most_common_gender
+        "most_common_gender": most_common_gender,
     }
 
 
@@ -379,9 +382,7 @@ def tagging_insights_from_json(data_json):
     res = tagging_chain.invoke({"input": processed_json})
     unprocessed_results_dict = res.get_dict()
 
-    results_dict = analyze_data(
-        unprocessed_results_dict
-    )
+    results_dict = analyze_data(unprocessed_results_dict)
 
     # stats_dict= {'Average Minimum age': avg_min_age,
     #              'Average Maximum age': avg_max_age,
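A small point on the age parsing added in analyze_data: re.search(r"\d+", ...) grabs the first digit run, so "18 Years" becomes 18, and the `if age` guard only skips empty values; an entry with no digits at all would return None and raise an AttributeError on .group(). A quick illustration (regex is the module the file imports; the stdlib re behaves identically here):

import regex as re

ages = ["18 Years", "65 Years", ""]
min_ages = [int(re.search(r"\d+", age).group()) for age in ages if age]
print(min_ages)  # [18, 65]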
main.ipynb
CHANGED
@@ -245,52 +245,55 @@
 }
 ],
 "source": [
-"df_summary = pd.read_csv(
-"df_summary = df_summary.rename(columns={
+"df_summary = pd.read_csv(\"file_db/brief_summaries.txt\", delimiter=\"|\")\n",
+"df_summary = df_summary.rename(columns={\"description\": \"summary\"})\n",
 "\n",
 "### create and merge intervention ###\n",
-"df_intervention = pd.read_csv(
+"df_intervention = pd.read_csv(\"file_db/interventions.txt\", delimiter=\"|\")\n",
 "\n",
-"intervention_grouped =
-"
+"intervention_grouped = (\n",
+"    df_intervention.groupby(\"nct_id\")[\"name\"].apply(list).reset_index()\n",
+")\n",
+"intervention_grouped = intervention_grouped.rename(\n",
+"    columns={\"name\": \"intervention_name\"}\n",
+")\n",
 "merged_df = pd.merge(\n",
-"    df_summary[[
-"    intervention_grouped[[
-"    on
+"    df_summary[[\"nct_id\", \"summary\"]],\n",
+"    intervention_grouped[[\"nct_id\", \"intervention_name\"]],\n",
+"    on=\"nct_id\",\n",
+")\n",
 "\n",
-"df_intervention = df_intervention.rename(
+"df_intervention = df_intervention.rename(\n",
+"    columns={\"description\": \"intervention_description\"}\n",
+")\n",
 "\n",
 "merged_df = pd.merge(\n",
 "    merged_df,\n",
-"    df_intervention[[
-"    on
+"    df_intervention[[\"nct_id\", \"intervention_type\", \"intervention_description\"]],\n",
+"    on=\"nct_id\",\n",
+")\n",
 "\n",
 "### create and merge keywords ###\n",
-"df_keyword = pd.read_csv(
-"keywords_grouped = df_keyword.groupby(
-"keywords_grouped = keywords_grouped.rename(columns={
+"df_keyword = pd.read_csv(\"file_db/keywords.txt\", delimiter=\"|\")\n",
+"keywords_grouped = df_keyword.groupby(\"nct_id\")[\"name\"].apply(list).reset_index()\n",
+"keywords_grouped = keywords_grouped.rename(columns={\"name\": \"keywords\"})\n",
 "\n",
-"merged_df = pd.merge(\n",
-"    merged_df,\n",
-"    keywords_grouped,\n",
-"    on='nct_id'\n",
-")\n",
+"merged_df = pd.merge(merged_df, keywords_grouped, on=\"nct_id\")\n",
 "\n",
 "### create and merge browse conditions\n",
-"df_condition = pd.read_csv(
-"conditions_grouped =
-"
-"\n",
-"
-"
-"    conditions_grouped,\n",
-"    on='nct_id'\n",
+"df_condition = pd.read_csv(\"file_db/browse_conditions.txt\", delimiter=\"|\")\n",
+"conditions_grouped = (\n",
+"    df_condition.groupby(\"nct_id\")[\"downcase_mesh_term\"].apply(list).reset_index()\n",
+")\n",
+"conditions_grouped = conditions_grouped.rename(\n",
+"    columns={\"downcase_mesh_term\": \"desease_condition\"}\n",
 ")\n",
 "\n",
-"merged_df =
+"merged_df = pd.merge(merged_df, conditions_grouped, on=\"nct_id\")\n",
 "\n",
-"merged_df.
-"\n"
+"merged_df = merged_df.drop_duplicates(subset=\"nct_id\")\n",
+"\n",
+"merged_df.head()"
 ]
 },
 {
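One behavior worth noting in this cell: merging the row-per-intervention df_intervention onto merged_df multiplies rows for trials with several interventions, which is why the cell now ends with drop_duplicates(subset="nct_id"). A toy reproduction (illustrative data only):

import pandas as pd

summaries = pd.DataFrame({"nct_id": ["NCT1"], "summary": ["..."]})
interventions = pd.DataFrame(
    {"nct_id": ["NCT1", "NCT1"], "intervention_type": ["Drug", "Device"]}
)

merged = pd.merge(summaries, interventions, on="nct_id")
print(len(merged))                                   # 2: one row per intervention
print(len(merged.drop_duplicates(subset="nct_id")))  # 1: back to one row per trial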
utils.py
CHANGED
@@ -1,11 +1,12 @@
 # %%
-from typing import List, Dict, Any
 import os
-from
+from typing import Any, Dict, List
+
+import pandas as pd
 import requests
-from sentence_transformers import SentenceTransformer
 import streamlit as st
-
+from sentence_transformers import SentenceTransformer
+from sqlalchemy import create_engine, text
 
 username = "demo"
 password = "demo"
@@ -124,16 +125,19 @@ def get_similarities_among_diseases_uris(
             result = conn.execute(text(sql))
             data = result.fetchall()
 
-    return [
-
-
-
-
+    return [
+        {
+            "uri1": row[0].split("/")[-1],
+            "uri2": row[1].split("/")[-1],
+            "distance": float(row[2]),
+        }
+        for row in data
+    ]
 
 
 def augment_the_set_of_diseaces(diseases: List[str]) -> str:
     augmented_diseases = diseases.copy()
-    for i in range(
+    for i in range(10 - len(augmented_diseases)):
         with engine.connect() as conn:
             with conn.begin():
                 sql = f"""
@@ -153,6 +157,7 @@ def augment_the_set_of_diseaces(diseases: List[str]) -> str:
 
     return augmented_diseases
 
+
 def get_embedding(string: str, encoder) -> List[float]:
     # Embed the string using sentence-transformers
     vector = encoder.encode(string, show_progress_bar=False)
@@ -176,11 +181,14 @@ def get_diseases_related_to_a_textual_description(
             result = conn.execute(text(sql))
             data = result.fetchall()
 
-    return [
+    return [
+        {"uri": row[0], "distance": float(row[1])}
+        for row in data
+        if float(row[1]) > 0.8
+    ]
 
-
-
-) -> List[str]:
+
+def get_clinical_trials_related_to_diseases(diseases: List[str], encoder) -> List[str]:
     # Embed the diseases using sentence-transformers
     diseases_string = ", ".join(diseases)
     disease_embedding = get_embedding(diseases_string, encoder)
@@ -189,7 +197,7 @@ def get_clinical_trials_related_to_diseases(
     with engine.connect() as conn:
         with conn.begin():
             sql = f"""
-            SELECT TOP
+            SELECT TOP 20 d.nct_id, VECTOR_COSINE(d.embedding, TO_VECTOR('{string_representation}', DOUBLE)) AS distance
             FROM Test.ClinicalTrials d
             ORDER BY distance DESC
             """
@@ -198,82 +206,139 @@ def get_clinical_trials_related_to_diseases(
 
     return [{"nct_id": row[0], "distance": row[1]} for row in data]
 
-
+
+def get_similarities_df(diseases: List[Dict[str, Any]]) -> pd.DataFrame:
     # Find out the score of each disease by averaging the cosine similarity of the embeddings of the diseases that include it as uri1 or uri2
-    df_diseases_similarities = pd.DataFrame(
+    df_diseases_similarities = pd.DataFrame(diseases)
     # Use uri1 as the index, and uri2 as the columns. The values are the distances.
-    df_diseases_similarities = df_diseases_similarities.pivot(
+    df_diseases_similarities = df_diseases_similarities.pivot(
+        index="uri1", columns="uri2", values="distance"
+    )
     # Fill the diagonal with 1.0
     df_diseases_similarities = df_diseases_similarities.fillna(1.0)
 
-
+    return df_diseases_similarities
+
+
+def filter_out_less_promising_diseases(info_dicts: List[Dict[str, Any]]) -> List[str]:
+    df_diseases_similarities = get_similarities_df(info_dicts)
+
+    # Filter out the diseases that are 0.2 standard deviations below the mean
     mean = df_diseases_similarities.mean().mean()
     std = df_diseases_similarities.mean().std()
-    filtered_diseases = df_diseases_similarities.mean()[
+    filtered_diseases = df_diseases_similarities.mean()[
+        df_diseases_similarities.mean() > mean - 0.2 * std
+    ].index.tolist()
     return filtered_diseases, df_diseases_similarities
 
+
+def get_labels_of_diseases_from_uris(uris: List[str]) -> List[str]:
+    with engine.connect() as conn:
+        with conn.begin():
+            joined_uris = ", ".join([f"'{uri}'" for uri in uris])
+            sql = f"""
+            SELECT label FROM Test.EntityEmbeddings
+            WHERE uri IN ({joined_uris})
+            """
+            result = conn.execute(text(sql))
+            data = result.fetchall()
+
+    return [row[0] for row in data]
+
+
 def to_capitalized_case(string: str) -> str:
     string = string.replace("_", " ")
     if string.isupper():
         return string[0] + string[1:].lower()
-
+
+
 def list_to_capitalized_case(strings: List[str]) -> str:
     strings = [to_capitalized_case(s) for s in strings]
     return ", ".join(strings)
 
+
 def render_trial_details(trial: dict) -> None:
-    [52 lines of the old render_trial_details body, truncated in the rendered diff]
+    # TODO: handle key errors for all cases (→ do not render)
+
+    official_title = trial["protocolSection"]["identificationModule"]["officialTitle"]
+    st.write(f"##### {official_title}")
+
+    try:
+        st.write(trial["protocolSection"]["descriptionModule"]["briefSummary"])
+    except KeyError:
+        try:
+            st.write(
+                trial["protocolSection"]["descriptionModule"]["detailedDescription"]
+            )
+        except KeyError:
+            st.error("No description available.")
+
+    st.write("###### Status")
+    try:
+        status_module = {
+            "Status": to_capitalized_case(
+                trial["protocolSection"]["statusModule"]["overallStatus"]
+            ),
+            "Status Date": trial["protocolSection"]["statusModule"][
+                "statusVerifiedDate"
+            ],
+            "Has Results": trial["hasResults"],
+        }
+        st.table(status_module)
+    except KeyError:
+        st.info("No status information available.")
+
+    st.write("###### Design")
+    try:
+        design_module = {
+            "Study Type": to_capitalized_case(
+                trial["protocolSection"]["designModule"]["studyType"]
+            ),
+            "Phases": list_to_capitalized_case(
+                trial["protocolSection"]["designModule"]["phases"]
+            ),
+            "Allocation": to_capitalized_case(
+                trial["protocolSection"]["designModule"]["designInfo"]["allocation"]
+            ),
+            "Primary Purpose": to_capitalized_case(
+                trial["protocolSection"]["designModule"]["designInfo"]["primaryPurpose"]
+            ),
+            "Participants": trial["protocolSection"]["designModule"]["enrollmentInfo"][
+                "count"
+            ],
+            "Masking": to_capitalized_case(
+                trial["protocolSection"]["designModule"]["designInfo"]["maskingInfo"][
+                    "masking"
+                ]
+            ),
+            "Who Masked": list_to_capitalized_case(
+                trial["protocolSection"]["designModule"]["designInfo"]["maskingInfo"][
+                    "whoMasked"
+                ]
+            ),
+        }
+        st.table(design_module)
+    except KeyError:
+        st.info("No design information available.")
+
+    st.write("###### Interventions")
+    try:
+        interventions_module = {}
+        for intervention in trial["protocolSection"]["armsInterventionsModule"][
+            "interventions"
+        ]:
+            name = intervention["name"]
+            desc = intervention["description"]
+            interventions_module[name] = desc
+        st.table(interventions_module)
+    except KeyError:
+        st.info("No interventions information available.")
+
+    # Button to go to ClinicalTrials.gov and see the trial. It takes the user to the official page of the trial.
+    st.markdown(
+        f"See more in [ClinicalTrials.gov](https://clinicaltrials.gov/study/{trial['protocolSection']['identificationModule']['nctId']})"
+    )
+
 
 if __name__ == "__main__":
     username = "demo"
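The cut in filter_out_less_promising_diseases is purely statistical: a disease survives if its mean similarity to the rest of the set is no more than 0.2 standard deviations below the overall mean. A standalone sketch of the same rule on a toy similarity matrix (illustrative data, not the project's):

import pandas as pd

# Toy symmetric similarity matrix, shaped like get_similarities_df's output.
sim = pd.DataFrame(
    [[1.0, 0.9, 0.3], [0.9, 1.0, 0.4], [0.3, 0.4, 1.0]],
    index=["d1", "d2", "d3"],
    columns=["d1", "d2", "d3"],
)

col_means = sim.mean()  # mean similarity of each disease to the set
mean, std = col_means.mean(), col_means.std()
kept = col_means[col_means > mean - 0.2 * std].index.tolist()
print(kept)  # ['d1', 'd2'] — d3's low similarity to the others drops it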