jannisborn committed on
Commit
a4eba41
1 Parent(s): 09c907a
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- title: GT4SD - PaccMannGP
3
  emoji: 💡
4
  colorFrom: green
5
  colorTo: blue
 
1
  ---
2
+ title: GT4SD - Advanced Manufacturing
3
  emoji: 💡
4
  colorFrom: green
5
  colorTo: blue
app.py CHANGED
@@ -1,17 +1,11 @@
1
  import logging
2
  import pathlib
3
- from typing import List
4
-
5
  import gradio as gr
6
  import pandas as pd
7
- from gt4sd.algorithms.controlled_sampling.paccmann_gp import (
8
- PaccMannGPGenerator,
9
- PaccMannGP,
10
- )
11
- from gt4sd.algorithms.controlled_sampling.paccmann_gp.implementation import (
12
- MINIMIZATION_FUNCTIONS,
13
  )
14
-
15
  from gt4sd.algorithms.registry import ApplicationsRegistry
16
 
17
  from utils import draw_grid_generate
@@ -20,62 +14,27 @@ logger = logging.getLogger(__name__)
20
  logger.addHandler(logging.NullHandler())
21
 
22
 
23
- MINIMIZATION_FUNCTIONS.pop("callable", None)
24
- MINIMIZATION_FUNCTIONS.pop("molwt", None)
25
-
26
-
27
  def run_inference(
28
  algorithm_version: str,
29
- targets: List[str],
30
- protein_target: str,
31
- temperature: float,
32
  length: float,
33
- number_of_samples: int,
34
- limit: int,
35
  number_of_steps: int,
36
- number_of_initial_points: int,
37
- number_of_optimization_rounds: int,
38
- sampling_variance: float,
39
- samples_for_evaluation: int,
40
- maximum_number_of_sampling_steps: int,
41
- seed: int,
42
  ):
43
 
44
- config = PaccMannGPGenerator(
45
- algorithm_version=algorithm_version.split("_")[-1],
46
- batch_size=32,
47
- temperature=temperature,
48
- generated_length=length,
49
- limit=limit,
50
- acquisition_function="EI",
51
  number_of_steps=number_of_steps,
52
- number_of_initial_points=number_of_initial_points,
53
- initial_point_generator="random",
54
- number_of_optimization_rounds=number_of_optimization_rounds,
55
- sampling_variance=sampling_variance,
56
- samples_for_evaluation=samples_for_evaluation,
57
- maximum_number_of_sampling_steps=maximum_number_of_sampling_steps,
58
- seed=seed,
59
  )
60
- target = {i: {} for i in targets}
61
- if "affinity" in targets:
62
- if protein_target == "" or not isinstance(protein_target, str):
63
- raise ValueError(
64
- f"Protein target must be specified for affinity prediction, not ={protein_target}"
65
- )
66
- target["affinity"]["protein"] = protein_target
67
- else:
68
- protein_target = ""
69
-
70
- model = PaccMannGP(config, target=target)
71
  samples = list(model.sample(number_of_samples))
72
 
73
- return draw_grid_generate(
74
- samples=samples,
75
- n_cols=5,
76
- properties=set(target.keys()),
77
- protein_target=protein_target,
78
- )
79
 
80
 
81
  if __name__ == "__main__":
@@ -84,16 +43,15 @@ if __name__ == "__main__":
84
  all_algos = ApplicationsRegistry.list_available()
85
  algos = [
86
  x["algorithm_version"]
87
- for x in list(filter(lambda x: "PaccMannRL" in x["algorithm_name"], all_algos))
88
  ]
89
 
90
  # Load metadata
91
  metadata_root = pathlib.Path(__file__).parent.joinpath("model_cards")
92
 
93
- examples = pd.read_csv(
94
- metadata_root.joinpath("examples.csv"), header=None, sep="|"
95
- ).fillna("")
96
- examples[1] = examples[1].apply(eval)
97
 
98
  with open(metadata_root.joinpath("article.md"), "r") as f:
99
  article = f.read()
@@ -102,21 +60,19 @@ if __name__ == "__main__":
102
 
103
  demo = gr.Interface(
104
  fn=run_inference,
105
- title="PaccMannGP",
106
  inputs=[
107
- gr.Dropdown(algos, label="Algorithm version", value="v0"),
108
- gr.CheckboxGroup(
109
- choices=list(MINIMIZATION_FUNCTIONS.keys()),
110
- value=["qed"],
111
- multiselect=True,
112
- label="Property goals",
113
  ),
 
114
  gr.Textbox(
115
- label="Protein target",
116
- placeholder="MVLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTT",
117
  lines=1,
118
  ),
119
- gr.Slider(minimum=0.5, maximum=2, value=1, label="Decoding temperature"),
120
  gr.Slider(
121
  minimum=5,
122
  maximum=400,
@@ -125,36 +81,14 @@ if __name__ == "__main__":
125
  step=1,
126
  ),
127
  gr.Slider(
128
- minimum=1, maximum=50, value=10, label="Number of samples", step=1
129
- ),
130
- gr.Slider(minimum=1, maximum=8, value=4.0, label="Limit"),
131
- gr.Slider(minimum=1, maximum=32, value=8, label="Number of steps", step=1),
132
- gr.Slider(
133
- minimum=1, maximum=32, value=4, label="Number of initial points", step=1
134
- ),
135
- gr.Slider(
136
- minimum=1,
137
- maximum=4,
138
- value=1,
139
- label="Number of optimization rounds",
140
- step=1,
141
  ),
142
- gr.Slider(minimum=0.01, maximum=1, value=0.1, label="Sampling variance"),
143
  gr.Slider(
144
- minimum=1,
145
- maximum=10,
146
- value=1,
147
- label="Samples used for evaluation",
148
- step=1,
149
  ),
150
  gr.Slider(
151
- minimum=1,
152
- maximum=64,
153
- value=4,
154
- label="Maximum number of sampling steps",
155
- step=1,
156
  ),
157
- gr.Number(value=42, label="Seed", precision=0),
158
  ],
159
  outputs=gr.HTML(label="Output"),
160
  article=article,
 
1
  import logging
2
  import pathlib
 
 
3
  import gradio as gr
4
  import pandas as pd
5
+ from gt4sd.algorithms.controlled_sampling.advanced_manufacturing import (
6
+ CatalystGenerator,
7
+ AdvancedManufacturing,
 
 
 
8
  )
 
9
  from gt4sd.algorithms.registry import ApplicationsRegistry
10
 
11
  from utils import draw_grid_generate
 
14
  logger.addHandler(logging.NullHandler())
15
 
16
 
 
 
 
 
17
  def run_inference(
18
  algorithm_version: str,
19
+ target_binding_energy: float,
20
+ primer_smiles: str,
 
21
  length: float,
22
+ number_of_points: int,
 
23
  number_of_steps: int,
24
+ number_of_samples: int,
 
 
 
 
 
25
  ):
26
 
27
+ config = CatalystGenerator(
28
+ algorithm_version=algorithm_version,
29
+ number_of_points=number_of_points,
 
 
 
 
30
  number_of_steps=number_of_steps,
31
+ generated_length=length,
32
+ primer_smiles=primer_smiles,
 
 
 
 
 
33
  )
34
+ model = AdvancedManufacturing(config, target=target_binding_energy)
 
 
 
 
 
 
 
 
 
 
35
  samples = list(model.sample(number_of_samples))
36
 
37
+ return draw_grid_generate(samples=samples, n_cols=5, seeds=[])
 
 
 
 
 
38
 
39
 
40
  if __name__ == "__main__":
 
43
  all_algos = ApplicationsRegistry.list_available()
44
  algos = [
45
  x["algorithm_version"]
46
+ for x in list(filter(lambda x: "Advanced" in x["algorithm_name"], all_algos))
47
  ]
48
 
49
  # Load metadata
50
  metadata_root = pathlib.Path(__file__).parent.joinpath("model_cards")
51
 
52
+ examples = pd.read_csv(metadata_root.joinpath("examples.csv"), header=None).fillna(
53
+ ""
54
+ )
 
55
 
56
  with open(metadata_root.joinpath("article.md"), "r") as f:
57
  article = f.read()
 
60
 
61
  demo = gr.Interface(
62
  fn=run_inference,
63
+ title="Advanced Manufacturing",
64
  inputs=[
65
+ gr.Dropdown(
66
+ algos,
67
+ label="Algorithm version",
68
+ value="NCCR_rnn_suzuki_aug16_smiles",
 
 
69
  ),
70
+ gr.Slider(minimum=1, maximum=100, value=10, label="Target binding energy"),
71
  gr.Textbox(
72
+ label="Primer SMILES",
73
+ placeholder="FP(F)F.CP(C)c1ccccc1.[Au]",
74
  lines=1,
75
  ),
 
76
  gr.Slider(
77
  minimum=5,
78
  maximum=400,
 
81
  step=1,
82
  ),
83
  gr.Slider(
84
+ minimum=16, maximum=128, value=32, label="Number of points", step=1
 
 
 
 
 
 
 
 
 
 
 
 
85
  ),
 
86
  gr.Slider(
87
+ minimum=16, maximum=128, value=50, label="Number of steps", step=1
 
 
 
 
88
  ),
89
  gr.Slider(
90
+ minimum=1, maximum=50, value=10, label="Number of samples", step=1
 
 
 
 
91
  ),
 
92
  ],
93
  outputs=gr.HTML(label="Output"),
94
  article=article,
model_cards/article.md CHANGED
@@ -2,53 +2,39 @@
2
 
3
  **Algorithm Version**: Which model version to use.
4
 
5
- **Property goals**: One or multiple properties that will be optimized.
6
 
7
- **Protein target**: An AAS of a protein target used for conditioning. Leave blank unless you use `affinity` as a `property goal`.
8
-
9
- **Decoding temperature**: The temperature parameter in the SMILES/SELFIES decoder. Higher values lead to more explorative choices, smaller values culminate in mode collapse.
10
 
11
  **Maximal sequence length**: The maximal number of SMILES tokens in the generated molecule.
12
 
13
- **Number of samples**: How many samples should be generated (between 1 and 50).
14
-
15
- **Limit**: Hypercube limits in the latent space.
16
-
17
- **Number of steps**: Number of steps for a GP optimization round. The longer the slower. Has to be at least `Number of initial points`.
18
-
19
- **Number of initial points**: Number of initial points evaluated. The longer the slower.
20
 
21
- **Number of optimization rounds**: Maximum number of optimization rounds.
22
 
23
- **Sampling variance**: Variance of the Gaussian noise applied during sampling from the optimal point.
24
-
25
- **Samples for evaluation**: Number of samples averaged for each minimization function evaluation.
26
-
27
- **Max. sampling steps**: Maximum number of sampling steps in an optimization round.
28
-
29
- **Seed**: The random seed used for initialization.
30
 
31
 
32
 
33
- # Model card -- PaccMannGP
34
 
35
- **Model Details**: [PaccMann<sup>GP</sup>](https://github.com/PaccMann/paccmann_gp) is a language-based Variational Autoencoder that is coupled with a GaussianProcess for controlled sampling. This model systematically explores the latent space of a trained molecular VAE.
36
 
37
- **Developers**: Jannis Born, Matteo Manica and colleagues from IBM Research.
38
 
39
- **Distributors**: Original authors' code wrapped and distributed by GT4SD Team (2023) from IBM Research.
40
 
41
- **Model date**: Published in 2022.
42
 
43
- **Model version**: A molecular VAE trained on 1.5M molecules from ChEMBL.
44
 
45
- **Model type**: A language-based molecular generative model that can be explored with Gaussian Processes to generate molecules with desired properties.
46
 
47
  **Information about training algorithms, parameters, fairness constraints or other applied approaches, and features**:
48
- Described in the [original paper](https://pubs.acs.org/doi/10.1021/acs.jcim.1c00889).
49
 
50
  **Paper or other resource for more information**:
51
- [Active Site Sequence Representations of Human Kinases Outperform Full Sequence Representations for Affinity Prediction and Inhibitor Generation: 3D Effects in a 1D Model (2022; *Journal of Chemical Information & Modeling*)](https://pubs.acs.org/doi/10.1021/acs.jcim.1c00889).
52
 
53
  **License**: MIT
54
 
@@ -60,11 +46,9 @@ Described in the [original paper](https://pubs.acs.org/doi/10.1021/acs.jcim.1c00
60
 
61
  **Out-of-scope use cases**: Production-level inference, producing molecules with harmful properties.
62
 
63
- **Factors**: Not applicable.
64
-
65
- **Metrics**: High reward on generating molecules with desired properties.
66
 
67
- **Datasets**: ChEMBL.
68
 
69
  **Ethical Considerations**: Unclear, please consult with original authors in case of questions.
70
 
@@ -73,17 +57,12 @@ Described in the [original paper](https://pubs.acs.org/doi/10.1021/acs.jcim.1c00
73
  Model card prototype inspired by [Mitchell et al. (2019)](https://dl.acm.org/doi/abs/10.1145/3287560.3287596?casa_token=XD4eHiE2cRUAAAAA:NL11gMa1hGPOUKTAbtXnbVQBDBbjxwcjGECF_i-WC_3g1aBgU1Hbz_f2b4kI_m1in-w__1ztGeHnwHs)
74
 
75
  ## Citation
 
76
  ```bib
77
- @article{born2022active,
78
- author = {Born, Jannis and Huynh, Tien and Stroobants, Astrid and Cornell, Wendy D. and Manica, Matteo},
79
- title = {Active Site Sequence Representations of Human Kinases Outperform Full Sequence Representations for Affinity Prediction and Inhibitor Generation: 3D Effects in a 1D Model},
80
- journal = {Journal of Chemical Information and Modeling},
81
- volume = {62},
82
- number = {2},
83
- pages = {240-257},
84
- year = {2022},
85
- doi = {10.1021/acs.jcim.1c00889},
86
- note ={PMID: 34905358},
87
- URL = {https://doi.org/10.1021/acs.jcim.1c00889}
88
  }
89
  ```
 
2
 
3
  **Algorithm Version**: Which model version to use.
4
 
5
+ **Target binding energy**: The desired binding energy.
6
 
7
+ **Primer SMILES**: A SMILES string used to prime the generation.
 
 
8
 
9
  **Maximal sequence length**: The maximal number of SMILES tokens in the generated molecule.
10
 
11
+ **Number of points**: Number of points to sample with the Gaussian Process.
 
 
 
 
 
 
12
 
13
+ **Number of steps**: Number of optimization steps in the Gaussian Process optimization.
14
 
15
+ **Number of samples**: How many samples should be generated (between 1 and 50).
 
 
 
 
 
 
16
 
17
 
18
 
19
+ # Model card -- AdvancedManufacturing
20
 
21
+ **Model Details**: *AdvancedManufacturing* is a sequence-based molecular generator tuned to generate catalysts. The model relies on a recurrent Variational Autoencoder with a binding-energy predictor trained on the latent code. The framework uses Gaussian Processes for generating targeted molecules.
22
 
23
+ **Developers**: Oliver Schilter and colleagues from IBM Research.
24
 
25
+ **Distributors**: Original authors' code integrated into GT4SD.
26
 
27
+ **Model date**: Not yet published.
28
 
29
+ **Model version**: Different types of models trained on NCCR data using SMILES or SELFIES, potentially also with augmentation.
30
 
31
+ **Model type**: A sequence-based molecular generator tuned to generate catalysts. The model relies on a recurrent Variational Autoencoder with a binding-energy predictor trained on the latent code. The framework uses Gaussian Processes for generating targeted molecules.
32
 
33
  **Information about training algorithms, parameters, fairness constraints or other applied approaches, and features**:
34
+ N.A.
35
 
36
  **Paper or other resource for more information**:
37
+ TBD
38
 
39
  **License**: MIT
40
 
 
46
 
47
  **Out-of-scope use cases**: Production-level inference, producing molecules with harmful properties.
48
 
49
+ **Metrics**: N.A.
 
 
50
 
51
+ **Datasets**: Data provided through NCCR.
52
 
53
  **Ethical Considerations**: Unclear, please consult with original authors in case of questions.
54
 
 
57
  Model card prototype inspired by [Mitchell et al. (2019)](https://dl.acm.org/doi/abs/10.1145/3287560.3287596?casa_token=XD4eHiE2cRUAAAAA:NL11gMa1hGPOUKTAbtXnbVQBDBbjxwcjGECF_i-WC_3g1aBgU1Hbz_f2b4kI_m1in-w__1ztGeHnwHs)
58
 
59
  ## Citation
60
+ TBD. In the meantime, please cite:
61
  ```bib
62
+ @article{manica2022gt4sd,
63
+ title={GT4SD: Generative Toolkit for Scientific Discovery},
64
+ author={Manica, Matteo and Cadow, Joris and Christofidellis, Dimitrios and Dave, Ashish and Born, Jannis and Clarke, Dean and Teukam, Yves Gaetan Nana and Hoffman, Samuel C and Buchan, Matthew and Chenthamarakshan, Vijil and others},
65
+ journal={arXiv preprint arXiv:2207.03928},
66
+ year={2022}
 
 
 
 
 
 
67
  }
68
  ```
model_cards/description.md CHANGED
@@ -1,6 +1,6 @@
1
  <img align="right" src="https://raw.githubusercontent.com/GT4SD/gt4sd-core/main/docs/_static/gt4sd_logo.png" alt="logo" width="120" >
2
 
3
- [PaccMann<sup>GP</sup>](https://github.com/PaccMann/paccmann_gp) is a language-based Variational Autoencoder that is coupled with a GaussianProcess for controlled sampling. For details of the methodology, please see [Born et al., (2022), *Journal of Chemical Information & Modeling*](https://pubs.acs.org/doi/10.1021/acs.jcim.1c00889).
4
 
5
  For **examples** and **documentation** of the model parameters, please see below.
6
  Moreover, we provide a **model card** ([Mitchell et al. (2019)](https://dl.acm.org/doi/abs/10.1145/3287560.3287596?casa_token=XD4eHiE2cRUAAAAA:NL11gMa1hGPOUKTAbtXnbVQBDBbjxwcjGECF_i-WC_3g1aBgU1Hbz_f2b4kI_m1in-w__1ztGeHnwHs)) at the bottom of this page.
 
1
  <img align="right" src="https://raw.githubusercontent.com/GT4SD/gt4sd-core/main/docs/_static/gt4sd_logo.png" alt="logo" width="120" >
2
 
3
+ *AdvancedManufacturing* is a sequence-based molecular generator tuned to generate catalysts. The model relies on a Variational Autoencoder with a binding-energy predictor trained on the latent code. The framework uses Gaussian Processes for generating targeted molecules.
4
 
5
  For **examples** and **documentation** of the model parameters, please see below.
6
  Moreover, we provide a **model card** ([Mitchell et al. (2019)](https://dl.acm.org/doi/abs/10.1145/3287560.3287596?casa_token=XD4eHiE2cRUAAAAA:NL11gMa1hGPOUKTAbtXnbVQBDBbjxwcjGECF_i-WC_3g1aBgU1Hbz_f2b4kI_m1in-w__1ztGeHnwHs)) at the bottom of this page.
model_cards/examples.csv CHANGED
@@ -1,3 +1,2 @@
1
- v0|["qed"]||1.2|100|10|4|8|4|1|0.1|3|4|42
2
- v0|["qed","sa"]||1.2|100|10|4|8|4|1|0.1|3|4|42
3
- v0|["affinity"]|MVLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTT|1.2|100|10|4|8|4|1|0.1|3|4|42
 
1
+ NCCR_rnn_suzuki_aug16_smiles,10,,100,32,50,10
2
+ NCCR_rnn_suzuki_aug16_selfies,5,FP(F)F.CCOP(OCC)c1ccccc1.[Cu],200,32,50,5
 
utils.py CHANGED
@@ -1,9 +1,6 @@
1
  import logging
2
  from collections import defaultdict
3
- from typing import List, Callable
4
- from gt4sd.properties import PropertyPredictorRegistry
5
- from gt4sd.algorithms.prediction.paccmann.core import PaccMann, AffinityPredictor
6
- import torch
7
 
8
  import mols2grid
9
  import pandas as pd
@@ -12,26 +9,9 @@ logger = logging.getLogger(__name__)
12
  logger.addHandler(logging.NullHandler())
13
 
14
 
15
- def get_affinity_function(target: str) -> Callable:
16
- return lambda mols: torch.stack(
17
- list(
18
- PaccMann(
19
- AffinityPredictor(protein_targets=[target] * len(mols), ligands=mols)
20
- ).sample(len(mols))
21
- )
22
- ).tolist()
23
-
24
-
25
- EVAL_DICT = {
26
- "qed": PropertyPredictorRegistry.get_property_predictor("qed"),
27
- "sa": PropertyPredictorRegistry.get_property_predictor("sas"),
28
- }
29
-
30
-
31
  def draw_grid_generate(
32
  samples: List[str],
33
- properties: List[str],
34
- protein_target: str,
35
  n_cols: int = 3,
36
  size=(140, 200),
37
  ) -> str:
@@ -47,22 +27,14 @@ def draw_grid_generate(
47
  HTML to display
48
  """
49
 
50
- if protein_target != "":
51
- EVAL_DICT.update({"affinity": get_affinity_function(protein_target)})
52
-
53
  result = defaultdict(list)
54
  result.update(
55
- {"SMILES": samples, "Name": [f"Generated_{i}" for i in range(len(samples))]},
 
 
 
 
56
  )
57
- if "affinity" in properties:
58
- properties.remove("affinity")
59
- vals = EVAL_DICT["affinity"](samples)
60
- result["affinity"] = vals
61
- # Fill properties
62
- for sample in samples:
63
- for prop in properties:
64
- value = EVAL_DICT[prop](sample)
65
- result[prop].append(f"{prop} = {value}")
66
 
67
  result_df = pd.DataFrame(result)
68
  obj = mols2grid.display(
 
1
  import logging
2
  from collections import defaultdict
3
+ from typing import List
 
 
 
4
 
5
  import mols2grid
6
  import pandas as pd
 
9
  logger.addHandler(logging.NullHandler())
10
 
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  def draw_grid_generate(
13
  samples: List[str],
14
+ seeds: List[str] = [],
 
15
  n_cols: int = 3,
16
  size=(140, 200),
17
  ) -> str:
 
27
  HTML to display
28
  """
29
 
 
 
 
30
  result = defaultdict(list)
31
  result.update(
32
+ {
33
+ "SMILES": seeds + samples,
34
+ "Name": [f"Seed_{i}" for i in range(len(seeds))]
35
+ + [f"Generated_{i}" for i in range(len(samples))],
36
+ },
37
  )
 
 
 
 
 
 
 
 
 
38
 
39
  result_df = pd.DataFrame(result)
40
  obj = mols2grid.display(