jannisborn committed on
Commit
7d76d6f
0 Parent(s):

Duplicate from jannisborn/gt4sd-moler

Browse files
.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ __pycache__/
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2022 Generative Toolkit 4 Scientific Discovery
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: MoLeR
3
+ emoji: 💡
4
+ colorFrom: green
5
+ colorTo: blue
6
+ sdk: gradio
7
+ sdk_version: 3.9.1
8
+ app_file: app.py
9
+ pinned: false
10
+ python_version: 3.8.13
11
+ pypi_version: 20.2.4
12
+ duplicated_from: jannisborn/gt4sd-moler
13
+ ---
14
+
15
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import pathlib
3
+
4
+ import gradio as gr
5
+ import pandas as pd
6
+ from gt4sd.algorithms.generation.moler import MoLeR, MoLeRDefaultGenerator
7
+
8
+ from gt4sd.algorithms.registry import ApplicationsRegistry
9
+ from utils import draw_grid_generate
10
+
11
+ logger = logging.getLogger(__name__)
12
+ logger.addHandler(logging.NullHandler())
13
+
14
+ TITLE = "MoLeR"
15
+
16
+
17
+ def run_inference(
18
+ algorithm_version: str,
19
+ scaffolds: str,
20
+ beam_size: int,
21
+ number_of_samples: int,
22
+ seed: int,
23
+ ):
24
+ config = MoLeRDefaultGenerator(
25
+ algorithm_version=algorithm_version,
26
+ scaffolds=scaffolds,
27
+ beam_size=beam_size,
28
+ num_samples=4,
29
+ seed=seed,
30
+ num_workers=1,
31
+ )
32
+ model = MoLeR(configuration=config)
33
+ samples = list(model.sample(number_of_samples))
34
+
35
+ seed_mols = [] if scaffolds == "" else scaffolds.split(".")
36
+ return draw_grid_generate(seed_mols, samples)
37
+
38
+
39
+ if __name__ == "__main__":
40
+
41
+ # Preparation (retrieve all available algorithms)
42
+ all_algos = ApplicationsRegistry.list_available()
43
+ algos = [
44
+ x["algorithm_version"]
45
+ for x in list(filter(lambda x: TITLE in x["algorithm_name"], all_algos))
46
+ ]
47
+
48
+ # Load metadata
49
+ metadata_root = pathlib.Path(__file__).parent.joinpath("model_cards")
50
+
51
+ examples = pd.read_csv(metadata_root.joinpath("examples.csv"), header=None).fillna(
52
+ ""
53
+ )
54
+
55
+ with open(metadata_root.joinpath("article.md"), "r") as f:
56
+ article = f.read()
57
+ with open(metadata_root.joinpath("description.md"), "r") as f:
58
+ description = f.read()
59
+
60
+ demo = gr.Interface(
61
+ fn=run_inference,
62
+ title="MoLeR (MOlecule-LEvel Representation)",
63
+ inputs=[
64
+ gr.Dropdown(algos, label="Algorithm version", value="v0"),
65
+ gr.Textbox(
66
+ label="Scaffolds",
67
+ placeholder="CC(C#C)N(C)C(=O)NC1=CC=C(Cl)C=C1",
68
+ lines=1,
69
+ ),
70
+ gr.Slider(minimum=1, maximum=5, value=1, step=1, label="Beam_size"),
71
+ gr.Slider(
72
+ minimum=1, maximum=50, value=10, label="Number of samples", step=1
73
+ ),
74
+ gr.Number(value=42, label="Seed", precision=0),
75
+ ],
76
+ outputs=gr.HTML(label="Output"),
77
+ article=article,
78
+ description=description,
79
+ examples=examples.values.tolist(),
80
+ )
81
+ demo.launch(debug=True, show_error=True)
model_cards/article.md ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model documentation & parameters
2
+
3
+ **Algorithm Version**: Which model checkpoint to use (trained on different datasets).
4
+
5
+ **Scaffolds**: One or multiple scaffolds (or seed molecules), provided as '.'-separated SMILES. If empty, no scaffolds are used.
6
+
7
+ **Number of samples**: How many samples should be generated (between 1 and 50).
8
+
9
+ **Beam size**: Beam size used in beam search decoding (higher values are slower but give better results).
10
+
11
+ **Seed**: The random seed used for initialization.
12
+
13
+
14
+ # Model card
15
+
16
+ **Model Details**: MoLeR is a graph-based molecular generative model that can be conditioned (primed) on scaffolds. The model decorates scaffolds with realistic structural motifs.
17
+
18
+ **Developers**: Krzysztof Maziarz and co-authors from Microsoft Research and Novartis (full reference at bottom).
19
+
20
+ **Distributors**: Developers' code wrapped and distributed by the GT4SD Team (2023) from IBM Research.
21
+
22
+ **Model date**: Released around March 2022.
23
+
24
+ **Model version**: Model provided by original authors, see [their GitHub repo](https://github.com/microsoft/molecule-generation).
25
+
26
+ **Model type**: An encoder-decoder-based GNN for molecular generation.
27
+
28
+ **Information about training algorithms, parameters, fairness constraints or other applied approaches, and features**: Trained by the original authors with the default parameters provided [on GitHub](https://github.com/microsoft/molecule-generation).
29
+
30
+ **Paper or other resource for more information**: Learning to Extend Molecular Scaffolds with Structural Motifs (ICLR 2022).
31
+
32
+ **License**: MIT
33
+
34
+ **Where to send questions or comments about the model**: Open an issue on the original authors' [GitHub repository](https://github.com/microsoft/molecule-generation).
35
+
36
+ **Intended Use. Use cases that were envisioned during development**: Chemical research, in particular drug discovery.
37
+
38
+ **Primary intended uses/users**: Researchers and computational chemists using the model for model comparison or research exploration purposes.
39
+
40
+ **Out-of-scope use cases**: Production-level inference, producing molecules with harmful properties.
41
+
42
+ **Factors**: Not applicable.
43
+
44
+ **Metrics**: Validation loss on decoding correct molecules. Evaluated on several downstream tasks.
45
+
46
+ **Datasets**: 1.5M drug-like molecules from GuacaMol benchmark. Finetuning on 20 molecular optimization tasks from GuacaMol.
47
+
48
+ **Ethical Considerations**: Unclear, please consult with original authors in case of questions.
49
+
50
+ **Caveats and Recommendations**: Unclear, please consult with original authors in case of questions.
51
+
52
+ Model card prototype inspired by [Mitchell et al. (2019)](https://dl.acm.org/doi/abs/10.1145/3287560.3287596?casa_token=XD4eHiE2cRUAAAAA:NL11gMa1hGPOUKTAbtXnbVQBDBbjxwcjGECF_i-WC_3g1aBgU1Hbz_f2b4kI_m1in-w__1ztGeHnwHs)
53
+
54
+ ## Citation
55
+
56
+ ```bib
57
+ @inproceedings{maziarz2021learning,
58
+ author={Krzysztof Maziarz and Henry Richard Jackson{-}Flux and Pashmina Cameron and
59
+ Finton Sirockin and Nadine Schneider and Nikolaus Stiefl and Marwin H. S. Segler and Marc Brockschmidt},
60
+ title = {Learning to Extend Molecular Scaffolds with Structural Motifs},
61
+ booktitle = {The Tenth International Conference on Learning Representations, {ICLR}},
62
+ year = {2022}
63
+ }
64
+ ```
65
+
model_cards/description.md ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <img align="right" src="https://raw.githubusercontent.com/GT4SD/gt4sd-core/main/docs/_static/gt4sd_logo.png" alt="logo" width="120" >
2
+
3
+ MoLeR (Maziarz et al. (2022), *ICLR*) is a graph-based molecular generative model that can be conditioned (primed) on scaffolds. This model is provided and distributed by **GT4SD** (Generative Toolkit for Scientific Discovery).
4
+
5
+ For **examples** and **documentation** of the model parameters, please see below.
6
+ Moreover, we provide a **model card** ([Mitchell et al. (2019)](https://dl.acm.org/doi/abs/10.1145/3287560.3287596?casa_token=XD4eHiE2cRUAAAAA:NL11gMa1hGPOUKTAbtXnbVQBDBbjxwcjGECF_i-WC_3g1aBgU1Hbz_f2b4kI_m1in-w__1ztGeHnwHs)) at the bottom of this page.
model_cards/examples.csv ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ v0,,1,4,0
2
+ v0,CC(=O)NC1=NC2=CC(OCC3=CC=CN(CC4=CC=C(Cl)C=C4)C3=O)=CC=C2N1,1,10,0
3
+ v0,C12C=CC=NN1C(C#CC1=C(C)C=CC3C(NC4=CC(C(F)(F)F)=CC=C4)=NOC1=3)=CN=2.CCO,3,5,5
4
+
5
+
requirements.txt ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -f https://download.pytorch.org/whl/cpu/torch_stable.html
2
+ -f https://data.pyg.org/whl/torch-1.12.1+cpu.html
3
+ # pip==20.2.4
4
+ torch==1.12.1
5
+ torch-scatter
6
+ torch-spline-conv
7
+ torch-sparse
8
+ torch-geometric
9
+ torchvision==0.13.1
10
+ torchaudio==0.12.1
11
+ gt4sd>=1.0.0
12
+ molgx>=0.22.0a1
13
+ molecule_generation
14
+ nglview
15
+ PyTDC==0.3.7
16
+ gradio==3.12.0
17
+ markdown-it-py>=2.1.0
18
+ mols2grid>=0.2.0
19
+ numpy==1.23.5
20
+ pandas>=1.0.0
21
+ terminator @ git+https://github.com/IBM/regression-transformer@gt4sd
22
+ guacamol_baselines @ git+https://github.com/GT4SD/guacamol_baselines.git@v0.0.2
23
+ moses @ git+https://github.com/GT4SD/moses.git@v0.1.0
24
+ paccmann_chemistry @ git+https://github.com/PaccMann/paccmann_chemistry@0.0.4
25
+ paccmann_generator @ git+https://github.com/PaccMann/paccmann_generator@0.0.2
26
+ paccmann_gp @ git+https://github.com/PaccMann/paccmann_gp@0.1.1
27
+ paccmann_omics @ git+https://github.com/PaccMann/paccmann_omics@0.0.1.1
28
+ paccmann_predictor @ git+https://github.com/PaccMann/paccmann_predictor@sarscov2
29
+ reinvent_models @ git+https://github.com/GT4SD/reinvent_models@v0.0.1
utils.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ import os
4
+ from collections import defaultdict
5
+ from typing import Dict, List, Tuple
6
+
7
+ import mols2grid
8
+ import pandas as pd
9
+ from rdkit import Chem
10
+ from terminator.selfies import decoder
11
+
12
+ logger = logging.getLogger(__name__)
13
+ logger.addHandler(logging.NullHandler())
14
+
15
+
16
+ def draw_grid_generate(
17
+ seeds: List[str],
18
+ samples: List[str],
19
+ n_cols: int = 3,
20
+ size=(140, 200),
21
+ ) -> str:
22
+ """
23
+ Uses mols2grid to draw an HTML grid for the generated molecules
24
+
25
+ Args:
26
+ samples: The generated samples.
27
+ n_cols: Number of columns in grid. Defaults to 3.
28
+ size: Size of molecule in grid. Defaults to (140, 200).
29
+
30
+ Returns:
31
+ HTML to display
32
+ """
33
+
34
+ result = defaultdict(list)
35
+ result.update(
36
+ {
37
+ "SMILES": seeds + samples,
38
+ "Name": [f"Seed_{i}" for i in range(len(seeds))]
39
+ + [f"Generated_{i}" for i in range(len(samples))],
40
+ },
41
+ )
42
+
43
+ result_df = pd.DataFrame(result)
44
+ obj = mols2grid.display(
45
+ result_df,
46
+ tooltip=list(result.keys()),
47
+ height=1100,
48
+ n_cols=n_cols,
49
+ name="Results",
50
+ size=size,
51
+ )
52
+ return obj.data