advanced_manufacturing

Sleeping

App Files Community

jannisborn committed on Jan 8, 2023

Commit

a4eba41

•

1 Parent(s): 09c907a

update

Browse files

Files changed (6) hide show

README.md +1 -1
app.py +29 -95
model_cards/article.md +22 -43
model_cards/description.md +1 -1
model_cards/examples.csv +2 -3
utils.py +7 -35

README.md CHANGED Viewed

@@ -1,5 +1,5 @@
 ---
-title: GT4SD - PaccMannGP
 emoji: 💡
 colorFrom: green
 colorTo: blue

 ---
+title: GT4SD - Advanced Manufacturing
 emoji: 💡
 colorFrom: green
 colorTo: blue

app.py CHANGED Viewed

@@ -1,17 +1,11 @@
 import logging
 import pathlib
-from typing import List
 import gradio as gr
 import pandas as pd
-from gt4sd.algorithms.controlled_sampling.paccmann_gp import (
-    PaccMannGPGenerator,
-    PaccMannGP,
-)
-from gt4sd.algorithms.controlled_sampling.paccmann_gp.implementation import (
-    MINIMIZATION_FUNCTIONS,
 )
 from gt4sd.algorithms.registry import ApplicationsRegistry
 from utils import draw_grid_generate
@@ -20,62 +14,27 @@ logger = logging.getLogger(__name__)
 logger.addHandler(logging.NullHandler())
-MINIMIZATION_FUNCTIONS.pop("callable", None)
-MINIMIZATION_FUNCTIONS.pop("molwt", None)
 def run_inference(
     algorithm_version: str,
-    targets: List[str],
-    protein_target: str,
-    temperature: float,
     length: float,
-    number_of_samples: int,
-    limit: int,
     number_of_steps: int,
-    number_of_initial_points: int,
-    number_of_optimization_rounds: int,
-    sampling_variance: float,
-    samples_for_evaluation: int,
-    maximum_number_of_sampling_steps: int,
-    seed: int,
 ):
-    config = PaccMannGPGenerator(
-        algorithm_version=algorithm_version.split("_")[-1],
-        batch_size=32,
-        temperature=temperature,
-        generated_length=length,
-        limit=limit,
-        acquisition_function="EI",
         number_of_steps=number_of_steps,
-        number_of_initial_points=number_of_initial_points,
-        initial_point_generator="random",
-        number_of_optimization_rounds=number_of_optimization_rounds,
-        sampling_variance=sampling_variance,
-        samples_for_evaluation=samples_for_evaluation,
-        maximum_number_of_sampling_steps=maximum_number_of_sampling_steps,
-        seed=seed,
     )
-    target = {i: {} for i in targets}
-    if "affinity" in targets:
-        if protein_target == "" or not isinstance(protein_target, str):
-            raise ValueError(
-                f"Protein target must be specified for affinity prediction, not ={protein_target}"
-            )
-        target["affinity"]["protein"] = protein_target
-    else:
-        protein_target = ""
-    model = PaccMannGP(config, target=target)
     samples = list(model.sample(number_of_samples))
-    return draw_grid_generate(
-        samples=samples,
-        n_cols=5,
-        properties=set(target.keys()),
-        protein_target=protein_target,
-    )
 if __name__ == "__main__":
@@ -84,16 +43,15 @@ if __name__ == "__main__":
     all_algos = ApplicationsRegistry.list_available()
     algos = [
         x["algorithm_version"]
-        for x in list(filter(lambda x: "PaccMannRL" in x["algorithm_name"], all_algos))
     ]
     # Load metadata
     metadata_root = pathlib.Path(__file__).parent.joinpath("model_cards")
-    examples = pd.read_csv(
-        metadata_root.joinpath("examples.csv"), header=None, sep="|"
-    ).fillna("")
-    examples[1] = examples[1].apply(eval)
     with open(metadata_root.joinpath("article.md"), "r") as f:
         article = f.read()
@@ -102,21 +60,19 @@ if __name__ == "__main__":
     demo = gr.Interface(
         fn=run_inference,
-        title="PaccMannGP",
         inputs=[
-            gr.Dropdown(algos, label="Algorithm version", value="v0"),
-            gr.CheckboxGroup(
-                choices=list(MINIMIZATION_FUNCTIONS.keys()),
-                value=["qed"],
-                multiselect=True,
-                label="Property goals",
             ),
             gr.Textbox(
-                label="Protein target",
-                placeholder="MVLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTT",
                 lines=1,
             ),
-            gr.Slider(minimum=0.5, maximum=2, value=1, label="Decoding temperature"),
             gr.Slider(
                 minimum=5,
                 maximum=400,
@@ -125,36 +81,14 @@ if __name__ == "__main__":
                 step=1,
             ),
             gr.Slider(
-                minimum=1, maximum=50, value=10, label="Number of samples", step=1
-            ),
-            gr.Slider(minimum=1, maximum=8, value=4.0, label="Limit"),
-            gr.Slider(minimum=1, maximum=32, value=8, label="Number of steps", step=1),
-            gr.Slider(
-                minimum=1, maximum=32, value=4, label="Number of initial points", step=1
-            ),
-            gr.Slider(
-                minimum=1,
-                maximum=4,
-                value=1,
-                label="Number of optimization rounds",
-                step=1,
             ),
-            gr.Slider(minimum=0.01, maximum=1, value=0.1, label="Sampling variance"),
             gr.Slider(
-                minimum=1,
-                maximum=10,
-                value=1,
-                label="Samples used for evaluation",
-                step=1,
             ),
             gr.Slider(
-                minimum=1,
-                maximum=64,
-                value=4,
-                label="Maximum number of sampling steps",
-                step=1,
             ),
-            gr.Number(value=42, label="Seed", precision=0),
         ],
         outputs=gr.HTML(label="Output"),
         article=article,

 import logging
 import pathlib
 import gradio as gr
 import pandas as pd
+from gt4sd.algorithms.controlled_sampling.advanced_manufacturing import (
+    CatalystGenerator,
+    AdvancedManufacturing,
 )
 from gt4sd.algorithms.registry import ApplicationsRegistry
 from utils import draw_grid_generate
 logger.addHandler(logging.NullHandler())
 def run_inference(
     algorithm_version: str,
+    target_binding_energy: float,
+    primer_smiles: str,
     length: float,
+    number_of_points: int,
     number_of_steps: int,
+    number_of_samples: int,
 ):
+    config = CatalystGenerator(
+        algorithm_version=algorithm_version,
+        number_of_points=number_of_points,
         number_of_steps=number_of_steps,
+        generated_length=length,
+        primer_smiles=primer_smiles,
     )
+    model = AdvancedManufacturing(config, target=target_binding_energy)
     samples = list(model.sample(number_of_samples))
+    return draw_grid_generate(samples=samples, n_cols=5, seeds=[])
 if __name__ == "__main__":
     all_algos = ApplicationsRegistry.list_available()
     algos = [
         x["algorithm_version"]
+        for x in list(filter(lambda x: "Advanced" in x["algorithm_name"], all_algos))
     ]
     # Load metadata
     metadata_root = pathlib.Path(__file__).parent.joinpath("model_cards")
+    examples = pd.read_csv(metadata_root.joinpath("examples.csv"), header=None).fillna(
+        ""
+    )
     with open(metadata_root.joinpath("article.md"), "r") as f:
         article = f.read()
     demo = gr.Interface(
         fn=run_inference,
+        title="Advanced Manufacturing",
         inputs=[
+            gr.Dropdown(
+                algos,
+                label="Algorithm version",
+                value="NCCR_rnn_suzuki_aug16_smiles",
             ),
+            gr.Slider(minimum=1, maximum=100, value=10, label="Target binding energy"),
             gr.Textbox(
+                label="Primer SMILES",
+                placeholder="FP(F)F.CP(C)c1ccccc1.[Au]",
                 lines=1,
             ),
             gr.Slider(
                 minimum=5,
                 maximum=400,
                 step=1,
             ),
             gr.Slider(
+                minimum=16, maximum=128, value=32, label="Number of points", step=1
             ),
             gr.Slider(
+                minimum=16, maximum=128, value=50, label="Number of steps", step=1
             ),
             gr.Slider(
+                minimum=1, maximum=50, value=10, label="Number of samples", step=1
             ),
         ],
         outputs=gr.HTML(label="Output"),
         article=article,

model_cards/article.md CHANGED Viewed

@@ -2,53 +2,39 @@
 **Algorithm Version**: Which model version to use.
-**Property goals**: One or multiple properties that will be optimized.
-**Protein target**: An amino acid sequence (AAS) of a protein target used for conditioning. Leave blank unless you use `affinity` as a `property goal`.
-**Decoding temperature**: The temperature parameter in the SMILES/SELFIES decoder. Higher values lead to more explorative choices, smaller values culminate in mode collapse.
 **Maximal sequence length**: The maximal number of SMILES tokens in the generated molecule.
-**Number of samples**: How many samples should be generated (between 1 and 50).
-**Limit**: Hypercube limits in the latent space.
-**Number of steps**: Number of steps for a GP optimization round. The longer the slower. Has to be at least `Number of initial points`.
-**Number of initial points**: Number of initial points evaluated. The longer the slower.
-**Number of optimization rounds**: Maximum number of optimization rounds.
-**Sampling variance**: Variance of the Gaussian noise applied during sampling from the optimal point.
-**Samples for evaluation**: Number of samples averaged for each minimization function evaluation.
-**Max. sampling steps**: Maximum number of sampling steps in an optimization round.
-**Seed**: The random seed used for initialization.
-# Model card -- PaccMannGP
-**Model Details**: [PaccMann<sup>GP</sup>](https://github.com/PaccMann/paccmann_gp) is a language-based Variational Autoencoder that is coupled with a Gaussian Process for controlled sampling. This model systematically explores the latent space of a trained molecular VAE.
-**Developers**: Jannis Born, Matteo Manica and colleagues from IBM Research.
-**Distributors**: Original authors' code wrapped and distributed by GT4SD Team (2023) from IBM Research.
-**Model date**: Published in 2022.
-**Model version**: A molecular VAE trained on 1.5M molecules from ChEMBL.
-**Model type**: A language-based molecular generative model that can be explored with Gaussian Processes to generate molecules with desired properties.
 **Information about training algorithms, parameters, fairness constraints or other applied approaches, and features**:
-Described in the [original paper](https://pubs.acs.org/doi/10.1021/acs.jcim.1c00889).
 **Paper or other resource for more information**:
-[Active Site Sequence Representations of Human Kinases Outperform Full Sequence Representations for Affinity Prediction and Inhibitor Generation: 3D Effects in a 1D Model (2022; *Journal of Chemical Information & Modeling*)](https://pubs.acs.org/doi/10.1021/acs.jcim.1c00889).
 **License**: MIT
@@ -60,11 +46,9 @@ Described in the [original paper](https://pubs.acs.org/doi/10.1021/acs.jcim.1c00
 **Out-of-scope use cases**: Production-level inference, producing molecules with harmful properties.
-**Factors**: Not applicable.
-**Metrics**: High reward on generating molecules with desired properties.
-**Datasets**: ChEMBL.
 **Ethical Considerations**: Unclear, please consult with original authors in case of questions.
@@ -73,17 +57,12 @@ Described in the [original paper](https://pubs.acs.org/doi/10.1021/acs.jcim.1c00
 Model card prototype inspired by [Mitchell et al. (2019)](https://dl.acm.org/doi/abs/10.1145/3287560.3287596?casa_token=XD4eHiE2cRUAAAAA:NL11gMa1hGPOUKTAbtXnbVQBDBbjxwcjGECF_i-WC_3g1aBgU1Hbz_f2b4kI_m1in-w__1ztGeHnwHs)
 ## Citation
 ```bib
-@article{born2022active,
-	author = {Born, Jannis and Huynh, Tien and Stroobants, Astrid and Cornell, Wendy D. and Manica, Matteo},
-	title = {Active Site Sequence Representations of Human Kinases Outperform Full Sequence Representations for Affinity Prediction and Inhibitor Generation: 3D Effects in a 1D Model},
-	journal = {Journal of Chemical Information and Modeling},
-	volume = {62},
-	number = {2},
-	pages = {240-257},
-	year = {2022},
-	doi = {10.1021/acs.jcim.1c00889},
-	note ={PMID: 34905358},
-	URL = {https://doi.org/10.1021/acs.jcim.1c00889}
 }
 ```

 **Algorithm Version**: Which model version to use.
+**Target binding energy**: The desired binding energy.
+**Primer SMILES**: A SMILES string used to prime the generation.
 **Maximal sequence length**: The maximal number of SMILES tokens in the generated molecule.
+**Number of points**: Number of points to sample with the Gaussian Process.
+**Number of steps**: Number of optimization steps in the Gaussian Process optimization.
+**Number of samples**: How many samples should be generated (between 1 and 50).
+# Model card -- AdvancedManufacturing
+**Model Details**: *AdvancedManufacturing* is a sequence-based molecular generator tuned to generate catalysts. The model relies on a recurrent Variational Autoencoder with a binding-energy predictor trained on the latent code. The framework uses Gaussian Processes for generating targeted molecules.
+**Developers**: Oliver Schilter and colleagues from IBM Research.
+**Distributors**: Original authors' code integrated into GT4SD.
+**Model date**: Not yet published.
+**Model version**: Different types of models trained on NCCR data using SMILES or SELFIES, potentially also with augmentation.
+**Model type**: A sequence-based molecular generator tuned to generate catalysts. The model relies on a recurrent Variational Autoencoder with a binding-energy predictor trained on the latent code. The framework uses Gaussian Processes for generating targeted molecules.
 **Information about training algorithms, parameters, fairness constraints or other applied approaches, and features**:
+N.A.
 **Paper or other resource for more information**:
+TBD
 **License**: MIT
 **Out-of-scope use cases**: Production-level inference, producing molecules with harmful properties.
+**Metrics**: N.A.
+**Datasets**: Data provided through NCCR.
 **Ethical Considerations**: Unclear, please consult with original authors in case of questions.
 Model card prototype inspired by [Mitchell et al. (2019)](https://dl.acm.org/doi/abs/10.1145/3287560.3287596?casa_token=XD4eHiE2cRUAAAAA:NL11gMa1hGPOUKTAbtXnbVQBDBbjxwcjGECF_i-WC_3g1aBgU1Hbz_f2b4kI_m1in-w__1ztGeHnwHs)
 ## Citation
+TBD. In the meantime, please cite:
 ```bib
+@article{manica2022gt4sd,
+  title={GT4SD: Generative Toolkit for Scientific Discovery},
+  author={Manica, Matteo and Cadow, Joris and Christofidellis, Dimitrios and Dave, Ashish and Born, Jannis and Clarke, Dean and Teukam, Yves Gaetan Nana and Hoffman, Samuel C and Buchan, Matthew and Chenthamarakshan, Vijil and others},
+  journal={arXiv preprint arXiv:2207.03928},
+  year={2022}
 }
 ```

model_cards/description.md CHANGED Viewed

@@ -1,6 +1,6 @@
 <img align="right" src="https://raw.githubusercontent.com/GT4SD/gt4sd-core/main/docs/_static/gt4sd_logo.png" alt="logo" width="120" >
-[PaccMann<sup>GP</sup>](https://github.com/PaccMann/paccmann_gp) is a language-based Variational Autoencoder that is coupled with a Gaussian Process for controlled sampling. For details of the methodology, please see [Born et al. (2022), *Journal of Chemical Information & Modeling*](https://pubs.acs.org/doi/10.1021/acs.jcim.1c00889).
 For **examples** and **documentation** of the model parameters, please see below.
 Moreover, we provide a **model card** ([Mitchell et al. (2019)](https://dl.acm.org/doi/abs/10.1145/3287560.3287596?casa_token=XD4eHiE2cRUAAAAA:NL11gMa1hGPOUKTAbtXnbVQBDBbjxwcjGECF_i-WC_3g1aBgU1Hbz_f2b4kI_m1in-w__1ztGeHnwHs)) at the bottom of this page.

 <img align="right" src="https://raw.githubusercontent.com/GT4SD/gt4sd-core/main/docs/_static/gt4sd_logo.png" alt="logo" width="120" >
+*AdvancedManufacturing* is a sequence-based molecular generator tuned to generate catalysts. The model relies on a Variational Autoencoder with a binding-energy predictor trained on the latent code. The framework uses Gaussian Processes for generating targeted molecules.
 For **examples** and **documentation** of the model parameters, please see below.
 Moreover, we provide a **model card** ([Mitchell et al. (2019)](https://dl.acm.org/doi/abs/10.1145/3287560.3287596?casa_token=XD4eHiE2cRUAAAAA:NL11gMa1hGPOUKTAbtXnbVQBDBbjxwcjGECF_i-WC_3g1aBgU1Hbz_f2b4kI_m1in-w__1ztGeHnwHs)) at the bottom of this page.

model_cards/examples.csv CHANGED Viewed

@@ -1,3 +1,2 @@
-v0|["qed"]||1.2|100|10|4|8|4|1|0.1|3|4|42
-v0|["qed","sa"]||1.2|100|10|4|8|4|1|0.1|3|4|42
-v0|["affinity"]|MVLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTT|1.2|100|10|4|8|4|1|0.1|3|4|42


+NCCR_rnn_suzuki_aug16_smiles,10,,100,32,50,10
+NCCR_rnn_suzuki_aug16_selfies,5,FP(F)F.CCOP(OCC)c1ccccc1.[Cu],200,32,50,5

utils.py CHANGED Viewed

@@ -1,9 +1,6 @@
 import logging
 from collections import defaultdict
-from typing import List, Callable
-from gt4sd.properties import PropertyPredictorRegistry
-from gt4sd.algorithms.prediction.paccmann.core import PaccMann, AffinityPredictor
-import torch
 import mols2grid
 import pandas as pd
@@ -12,26 +9,9 @@ logger = logging.getLogger(__name__)
 logger.addHandler(logging.NullHandler())
-def get_affinity_function(target: str) -> Callable:
-    return lambda mols: torch.stack(
-        list(
-            PaccMann(
-                AffinityPredictor(protein_targets=[target] * len(mols), ligands=mols)
-            ).sample(len(mols))
-        )
-    ).tolist()
-EVAL_DICT = {
-    "qed": PropertyPredictorRegistry.get_property_predictor("qed"),
-    "sa": PropertyPredictorRegistry.get_property_predictor("sas"),
-}
 def draw_grid_generate(
     samples: List[str],
-    properties: List[str],
-    protein_target: str,
     n_cols: int = 3,
     size=(140, 200),
 ) -> str:
@@ -47,22 +27,14 @@ def draw_grid_generate(
         HTML to display
     """
-    if protein_target != "":
-        EVAL_DICT.update({"affinity": get_affinity_function(protein_target)})
     result = defaultdict(list)
     result.update(
-        {"SMILES": samples, "Name": [f"Generated_{i}" for i in range(len(samples))]},
     )
-    if "affinity" in properties:
-        properties.remove("affinity")
-        vals = EVAL_DICT["affinity"](samples)
-        result["affinity"] = vals
-    # Fill properties
-    for sample in samples:
-        for prop in properties:
-            value = EVAL_DICT[prop](sample)
-            result[prop].append(f"{prop} = {value}")
     result_df = pd.DataFrame(result)
     obj = mols2grid.display(

 import logging
 from collections import defaultdict
+from typing import List
 import mols2grid
 import pandas as pd
 logger.addHandler(logging.NullHandler())
 def draw_grid_generate(
     samples: List[str],
+    seeds: List[str] = [],
     n_cols: int = 3,
     size=(140, 200),
 ) -> str:
         HTML to display
     """
     result = defaultdict(list)
     result.update(
+        {
+            "SMILES": seeds + samples,
+            "Name": [f"Seed_{i}" for i in range(len(seeds))]
+            + [f"Generated_{i}" for i in range(len(samples))],
+        },
     )
     result_df = pd.DataFrame(result)
     obj = mols2grid.display(