jannisborn commited on
Commit
09c907a
0 Parent(s):

Duplicate from jannisborn/gt4sd-paccmann-gp

Browse files
Files changed (10) hide show
  1. .gitattributes +34 -0
  2. .gitignore +1 -0
  3. LICENSE +21 -0
  4. README.md +15 -0
  5. app.py +164 -0
  6. model_cards/article.md +89 -0
  7. model_cards/description.md +6 -0
  8. model_cards/examples.csv +3 -0
  9. requirements.txt +29 -0
  10. utils.py +76 -0
.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ __pycache__/
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2022 Generative Toolkit 4 Scientific Discovery
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: GT4SD - PaccMannGP
3
+ emoji: 💡
4
+ colorFrom: green
5
+ colorTo: blue
6
+ sdk: gradio
7
+ sdk_version: 3.9.1
8
+ app_file: app.py
9
+ pinned: false
10
+ python_version: 3.8.13
11
+ pypi_version: 20.2.4
12
+ duplicated_from: jannisborn/gt4sd-paccmann-gp
13
+ ---
14
+
15
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import pathlib
3
+ from typing import List
4
+
5
+ import gradio as gr
6
+ import pandas as pd
7
+ from gt4sd.algorithms.controlled_sampling.paccmann_gp import (
8
+ PaccMannGPGenerator,
9
+ PaccMannGP,
10
+ )
11
+ from gt4sd.algorithms.controlled_sampling.paccmann_gp.implementation import (
12
+ MINIMIZATION_FUNCTIONS,
13
+ )
14
+
15
+ from gt4sd.algorithms.registry import ApplicationsRegistry
16
+
17
+ from utils import draw_grid_generate
18
+
19
# Module-level logger with a no-op handler attached.
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())


# Remove the "callable" and "molwt" entries so that they are not offered as
# selectable property goals (the UI lists the remaining keys).
MINIMIZATION_FUNCTIONS.pop("callable", None)
MINIMIZATION_FUNCTIONS.pop("molwt", None)
25
+
26
+
27
def run_inference(
    algorithm_version: str,
    targets: List[str],
    protein_target: str,
    temperature: float,
    length: float,
    number_of_samples: int,
    limit: int,
    number_of_steps: int,
    number_of_initial_points: int,
    number_of_optimization_rounds: int,
    sampling_variance: float,
    samples_for_evaluation: int,
    maximum_number_of_sampling_steps: int,
    seed: int,
):
    """
    Sample molecules with PaccMannGP and render them as an HTML grid.

    Args:
        algorithm_version: Model version to use; only the part after the last
            underscore is passed on to the generator.
        targets: Names of the properties to optimize.
        protein_target: Protein sequence used for conditioning. Only used when
            ``"affinity"`` is among ``targets``; ignored otherwise.
        temperature: Decoding temperature.
        length: Maximal length of the generated sequence.
        number_of_samples: How many molecules to sample.
        limit: Hypercube limit in the latent space.
        number_of_steps: Number of steps of an optimization round.
        number_of_initial_points: Number of initial points evaluated.
        number_of_optimization_rounds: Maximum number of optimization rounds.
        sampling_variance: Variance of the noise applied during sampling.
        samples_for_evaluation: Number of samples averaged per evaluation.
        maximum_number_of_sampling_steps: Maximum number of sampling steps in
            an optimization round.
        seed: Random seed.

    Returns:
        The value returned by ``draw_grid_generate`` (HTML to display).

    Raises:
        ValueError: If ``"affinity"`` is requested without a usable protein
            target (missing, not a string, or only whitespace).
    """
    # Validate the request first so that bad input fails before any model
    # configuration is created.
    target = {name: {} for name in targets}
    if "affinity" in target:
        if not isinstance(protein_target, str) or protein_target.strip() == "":
            raise ValueError(
                "Protein target must be specified for affinity prediction, "
                f"got {protein_target!r}"
            )
        protein_target = protein_target.strip()
        target["affinity"]["protein"] = protein_target
    else:
        # Without an affinity goal the protein is irrelevant; blank it so the
        # result grid does not evaluate affinity either.
        protein_target = ""

    config = PaccMannGPGenerator(
        algorithm_version=algorithm_version.split("_")[-1],
        batch_size=32,
        temperature=temperature,
        generated_length=length,
        limit=limit,
        acquisition_function="EI",
        number_of_steps=number_of_steps,
        number_of_initial_points=number_of_initial_points,
        initial_point_generator="random",
        number_of_optimization_rounds=number_of_optimization_rounds,
        sampling_variance=sampling_variance,
        samples_for_evaluation=samples_for_evaluation,
        maximum_number_of_sampling_steps=maximum_number_of_sampling_steps,
        seed=seed,
    )

    model = PaccMannGP(config, target=target)
    samples = list(model.sample(number_of_samples))

    return draw_grid_generate(
        samples=samples,
        n_cols=5,
        properties=set(target.keys()),
        protein_target=protein_target,
    )
79
+
80
+
81
if __name__ == "__main__":
    # Only the launcher needs this, so it is imported locally.
    import ast

    # Preparation (retrieve all available versions of the algorithm).
    # The filter matches "PaccMannGP" because run_inference instantiates
    # PaccMannGPGenerator with the selected version.
    all_algos = ApplicationsRegistry.list_available()
    algos = [
        x["algorithm_version"]
        for x in all_algos
        if "PaccMannGP" in x["algorithm_name"]
    ]

    # Load metadata
    metadata_root = pathlib.Path(__file__).parent.joinpath("model_cards")

    examples = pd.read_csv(
        metadata_root.joinpath("examples.csv"), header=None, sep="|"
    ).fillna("")
    # Column 1 holds the property goals as a list literal (e.g. ["qed","sa"]).
    # literal_eval parses it without executing arbitrary code.
    examples[1] = examples[1].apply(ast.literal_eval)

    article = metadata_root.joinpath("article.md").read_text(encoding="utf-8")
    description = metadata_root.joinpath("description.md").read_text(
        encoding="utf-8"
    )

    # The order of the inputs must match the positional parameters of
    # run_inference and the columns of examples.csv.
    demo = gr.Interface(
        fn=run_inference,
        title="PaccMannGP",
        inputs=[
            gr.Dropdown(algos, label="Algorithm version", value="v0"),
            gr.CheckboxGroup(
                choices=list(MINIMIZATION_FUNCTIONS.keys()),
                value=["qed"],
                multiselect=True,
                label="Property goals",
            ),
            gr.Textbox(
                label="Protein target",
                placeholder="MVLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTT",
                lines=1,
            ),
            gr.Slider(minimum=0.5, maximum=2, value=1, label="Decoding temperature"),
            gr.Slider(
                minimum=5,
                maximum=400,
                value=100,
                label="Maximal sequence length",
                step=1,
            ),
            gr.Slider(
                minimum=1, maximum=50, value=10, label="Number of samples", step=1
            ),
            gr.Slider(minimum=1, maximum=8, value=4.0, label="Limit"),
            gr.Slider(minimum=1, maximum=32, value=8, label="Number of steps", step=1),
            gr.Slider(
                minimum=1, maximum=32, value=4, label="Number of initial points", step=1
            ),
            gr.Slider(
                minimum=1,
                maximum=4,
                value=1,
                label="Number of optimization rounds",
                step=1,
            ),
            gr.Slider(minimum=0.01, maximum=1, value=0.1, label="Sampling variance"),
            gr.Slider(
                minimum=1,
                maximum=10,
                value=1,
                label="Samples used for evaluation",
                step=1,
            ),
            gr.Slider(
                minimum=1,
                maximum=64,
                value=4,
                label="Maximum number of sampling steps",
                step=1,
            ),
            gr.Number(value=42, label="Seed", precision=0),
        ],
        outputs=gr.HTML(label="Output"),
        article=article,
        description=description,
        examples=examples.values.tolist(),
    )
    demo.launch(debug=True, show_error=True)
model_cards/article.md ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model documentation & parameters
2
+
3
+ **Algorithm Version**: Which model version to use.
4
+
5
+ **Property goals**: One or multiple properties that will be optimized.
6
+
7
+ **Protein target**: The amino acid sequence (AAS) of a protein target used for conditioning. Leave blank unless you use `affinity` as a `property goal`.
8
+
9
+ **Decoding temperature**: The temperature parameter in the SMILES/SELFIES decoder. Higher values lead to more explorative choices, smaller values culminate in mode collapse.
10
+
11
+ **Maximal sequence length**: The maximal number of SMILES tokens in the generated molecule.
12
+
13
+ **Number of samples**: How many samples should be generated (between 1 and 50).
14
+
15
+ **Limit**: Hypercube limits in the latent space.
16
+
17
+ **Number of steps**: Number of steps for a GP optimization round. The longer the slower. Has to be at least `Number of initial points`.
18
+
19
+ **Number of initial points**: Number of initial points evaluated. The longer the slower.
20
+
21
+ **Number of optimization rounds**: Maximum number of optimization rounds.
22
+
23
+ **Sampling variance**: Variance of the Gaussian noise applied during sampling from the optimal point.
24
+
25
+ **Samples for evaluation**: Number of samples averaged for each minimization function evaluation.
26
+
27
+ **Max. sampling steps**: Maximum number of sampling steps in an optimization round.
28
+
29
+ **Seed**: The random seed used for initialization.
30
+
31
+
32
+
33
+ # Model card -- PaccMannGP
34
+
35
+ **Model Details**: [PaccMann<sup>GP</sup>](https://github.com/PaccMann/paccmann_gp) is a language-based Variational Autoencoder that is coupled with a GaussianProcess for controlled sampling. This model systematically explores the latent space of a trained molecular VAE.
36
+
37
+ **Developers**: Jannis Born, Matteo Manica and colleagues from IBM Research.
38
+
39
+ **Distributors**: Original authors' code wrapped and distributed by GT4SD Team (2023) from IBM Research.
40
+
41
+ **Model date**: Published in 2022.
42
+
43
+ **Model version**: A molecular VAE trained on 1.5M molecules from ChEMBL.
44
+
45
+ **Model type**: A language-based molecular generative model that can be explored with Gaussian Processes to generate molecules with desired properties.
46
+
47
+ **Information about training algorithms, parameters, fairness constraints or other applied approaches, and features**:
48
+ Described in the [original paper](https://pubs.acs.org/doi/10.1021/acs.jcim.1c00889).
49
+
50
+ **Paper or other resource for more information**:
51
+ [Active Site Sequence Representations of Human Kinases Outperform Full Sequence Representations for Affinity Prediction and Inhibitor Generation: 3D Effects in a 1D Model (2022; *Journal of Chemical Information & Modeling*)](https://pubs.acs.org/doi/10.1021/acs.jcim.1c00889).
52
+
53
+ **License**: MIT
54
+
55
+ **Where to send questions or comments about the model**: Open an issue on [GT4SD repository](https://github.com/GT4SD/gt4sd-core).
56
+
57
+ **Intended Use. Use cases that were envisioned during development**: Chemical research, in particular drug discovery.
58
+
59
+ **Primary intended uses/users**: Researchers and computational chemists using the model for model comparison or research exploration purposes.
60
+
61
+ **Out-of-scope use cases**: Production-level inference, producing molecules with harmful properties.
62
+
63
+ **Factors**: Not applicable.
64
+
65
+ **Metrics**: High reward on generating molecules with desired properties.
66
+
67
+ **Datasets**: ChEMBL.
68
+
69
+ **Ethical Considerations**: Unclear, please consult with original authors in case of questions.
70
+
71
+ **Caveats and Recommendations**: Unclear, please consult with original authors in case of questions.
72
+
73
+ Model card prototype inspired by [Mitchell et al. (2019)](https://dl.acm.org/doi/abs/10.1145/3287560.3287596?casa_token=XD4eHiE2cRUAAAAA:NL11gMa1hGPOUKTAbtXnbVQBDBbjxwcjGECF_i-WC_3g1aBgU1Hbz_f2b4kI_m1in-w__1ztGeHnwHs)
74
+
75
+ ## Citation
76
+ ```bib
77
+ @article{born2022active,
78
+ author = {Born, Jannis and Huynh, Tien and Stroobants, Astrid and Cornell, Wendy D. and Manica, Matteo},
79
+ title = {Active Site Sequence Representations of Human Kinases Outperform Full Sequence Representations for Affinity Prediction and Inhibitor Generation: 3D Effects in a 1D Model},
80
+ journal = {Journal of Chemical Information and Modeling},
81
+ volume = {62},
82
+ number = {2},
83
+ pages = {240-257},
84
+ year = {2022},
85
+ doi = {10.1021/acs.jcim.1c00889},
86
+ note ={PMID: 34905358},
87
+ URL = {https://doi.org/10.1021/acs.jcim.1c00889}
88
+ }
89
+ ```
model_cards/description.md ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <img align="right" src="https://raw.githubusercontent.com/GT4SD/gt4sd-core/main/docs/_static/gt4sd_logo.png" alt="logo" width="120" >
2
+
3
+ [PaccMann<sup>GP</sup>](https://github.com/PaccMann/paccmann_gp) is a language-based Variational Autoencoder that is coupled with a GaussianProcess for controlled sampling. For details of the methodology, please see [Born et al., (2022), *Journal of Chemical Information & Modeling*](https://pubs.acs.org/doi/10.1021/acs.jcim.1c00889).
4
+
5
+ For **examples** and **documentation** of the model parameters, please see below.
6
+ Moreover, we provide a **model card** ([Mitchell et al. (2019)](https://dl.acm.org/doi/abs/10.1145/3287560.3287596?casa_token=XD4eHiE2cRUAAAAA:NL11gMa1hGPOUKTAbtXnbVQBDBbjxwcjGECF_i-WC_3g1aBgU1Hbz_f2b4kI_m1in-w__1ztGeHnwHs)) at the bottom of this page.
model_cards/examples.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ v0|["qed"]||1.2|100|10|4|8|4|1|0.1|3|4|42
2
+ v0|["qed","sa"]||1.2|100|10|4|8|4|1|0.1|3|4|42
3
+ v0|["affinity"]|MVLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTT|1.2|100|10|4|8|4|1|0.1|3|4|42
requirements.txt ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -f https://download.pytorch.org/whl/cpu/torch_stable.html
2
+ -f https://data.pyg.org/whl/torch-1.12.1+cpu.html
3
+ # pip==20.2.4
4
+ torch==1.12.1
5
+ torch-scatter
6
+ torch-spline-conv
7
+ torch-sparse
8
+ torch-geometric
9
+ torchvision==0.13.1
10
+ torchaudio==0.12.1
11
+ gt4sd>=1.0.5
12
+ molgx>=0.22.0a1
13
+ molecule_generation
14
+ nglview
15
+ PyTDC==0.3.7
16
+ gradio==3.12.0
17
+ markdown-it-py>=2.1.0
18
+ mols2grid>=0.2.0
19
+ numpy==1.23.5
20
+ pandas>=1.0.0
21
+ terminator @ git+https://github.com/IBM/regression-transformer@gt4sd
22
+ guacamol_baselines @ git+https://github.com/GT4SD/guacamol_baselines.git@v0.0.2
23
+ moses @ git+https://github.com/GT4SD/moses.git@v0.1.0
24
+ paccmann_chemistry @ git+https://github.com/PaccMann/paccmann_chemistry@0.0.4
25
+ paccmann_generator @ git+https://github.com/PaccMann/paccmann_generator@0.0.2
26
+ paccmann_gp @ git+https://github.com/PaccMann/paccmann_gp@0.1.1
27
+ paccmann_omics @ git+https://github.com/PaccMann/paccmann_omics@0.0.1.1
28
+ paccmann_predictor @ git+https://github.com/PaccMann/paccmann_predictor@sarscov2
29
+ reinvent_models @ git+https://github.com/GT4SD/reinvent_models@v0.0.1
utils.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from collections import defaultdict
3
+ from typing import List, Callable
4
+ from gt4sd.properties import PropertyPredictorRegistry
5
+ from gt4sd.algorithms.prediction.paccmann.core import PaccMann, AffinityPredictor
6
+ import torch
7
+
8
+ import mols2grid
9
+ import pandas as pd
10
+
11
+ logger = logging.getLogger(__name__)
12
+ logger.addHandler(logging.NullHandler())
13
+
14
+
15
def get_affinity_function(target: str) -> Callable:
    """
    Build a scoring function that runs the PaccMann affinity predictor.

    Args:
        target: Protein target; it is paired with every molecule that is scored.

    Returns:
        A function that takes a list of molecules and returns the stacked
        predictions as a (nested) Python list.
    """

    def predict_affinity(mols):
        # One protein entry per ligand, then draw one prediction per ligand.
        predictor = PaccMann(
            AffinityPredictor(protein_targets=[target] * len(mols), ligands=mols)
        )
        predictions = list(predictor.sample(len(mols)))
        return torch.stack(predictions).tolist()

    return predict_affinity
23
+
24
+
25
# Maps the property name used in this app to its predictor; the second entry
# of each pair is the name under which the predictor is registered.
EVAL_DICT = {
    app_name: PropertyPredictorRegistry.get_property_predictor(registry_name)
    for app_name, registry_name in (("qed", "qed"), ("sa", "sas"))
}
29
+
30
+
31
def draw_grid_generate(
    samples: List[str],
    properties: List[str],
    protein_target: str,
    n_cols: int = 3,
    size=(140, 200),
) -> str:
    """
    Uses mols2grid to draw a HTML grid for the generated molecules

    Args:
        samples: The generated samples.
        properties: Names of the properties to show for each sample. Each one
            must be ``"affinity"`` or a key of ``EVAL_DICT``. Any iterable of
            strings is accepted and it is left unmodified.
        protein_target: Protein used for the affinity prediction. Pass an empty
            string when affinity is not requested.
        n_cols: Number of columns in grid. Defaults to 3.
        size: Size of molecule in grid. Defaults to (140, 200).

    Returns:
        HTML to display

    Raises:
        ValueError: If ``"affinity"`` is requested without a protein target.
    """
    # Work on a private copy so the caller's collection is never modified.
    requested = list(properties)
    wants_affinity = "affinity" in requested
    if wants_affinity and protein_target == "":
        raise ValueError(
            "A protein target is required to evaluate the 'affinity' property."
        )
    per_sample_props = [prop for prop in requested if prop != "affinity"]

    result = defaultdict(list)
    result.update(
        {"SMILES": samples, "Name": [f"Generated_{i}" for i in range(len(samples))]},
    )
    if wants_affinity:
        # Build the scorer for this call only. Keeping it out of the shared
        # EVAL_DICT avoids reusing a scorer bound to an earlier protein.
        result["affinity"] = get_affinity_function(protein_target)(samples)

    # Fill properties
    for sample in samples:
        for prop in per_sample_props:
            value = EVAL_DICT[prop](sample)
            result[prop].append(f"{prop} = {value}")

    result_df = pd.DataFrame(result)
    obj = mols2grid.display(
        result_df,
        tooltip=list(result.keys()),
        height=1100,
        n_cols=n_cols,
        name="Results",
        size=size,
    )
    return obj.data