jannisborn commited on
Commit
e3475d1
0 Parent(s):

Duplicate from jannisborn/gt4sd-huggingface

Browse files
Files changed (10) hide show
  1. .gitattributes +34 -0
  2. .gitignore +1 -0
  3. LICENSE +21 -0
  4. README.md +15 -0
  5. app.py +114 -0
  6. model_cards/article.md +78 -0
  7. model_cards/description.md +6 -0
  8. model_cards/examples.csv +2 -0
  9. requirements.txt +29 -0
  10. utils.py +48 -0
.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ __pycache__/
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2022 Generative Toolkit 4 Scientific Discovery
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: GT4SD - HuggingFace transformers
3
+ emoji: 💡
4
+ colorFrom: green
5
+ colorTo: blue
6
+ sdk: gradio
7
+ sdk_version: 3.9.1
8
+ app_file: app.py
9
+ pinned: false
10
+ python_version: 3.8.13
11
+ pypi_version: 20.2.4
12
+ duplicated_from: jannisborn/gt4sd-huggingface
13
+ ---
14
+
15
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import pathlib
3
+ import gradio as gr
4
+ import pandas as pd
5
+ from gt4sd.algorithms.generation.hugging_face import (
6
+ HuggingFaceCTRLGenerator,
7
+ HuggingFaceGenerationAlgorithm,
8
+ HuggingFaceGPT2Generator,
9
+ HuggingFaceTransfoXLGenerator,
10
+ HuggingFaceOpenAIGPTGenerator,
11
+ HuggingFaceXLMGenerator,
12
+ HuggingFaceXLNetGenerator,
13
+ )
14
+ from gt4sd.algorithms.registry import ApplicationsRegistry
15
+
16
+
17
# Module-level logger. The NullHandler keeps this module silent unless the
# hosting application configures logging itself.
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())

# Lookup table from generator name (the part of the UI model identifier that
# precedes the "_" separator) to the GT4SD configuration class that
# `run_inference` instantiates for it.
MODEL_FN = {
    "HuggingFaceCTRLGenerator": HuggingFaceCTRLGenerator,
    "HuggingFaceGPT2Generator": HuggingFaceGPT2Generator,
    "HuggingFaceTransfoXLGenerator": HuggingFaceTransfoXLGenerator,
    "HuggingFaceOpenAIGPTGenerator": HuggingFaceOpenAIGPTGenerator,
    "HuggingFaceXLMGenerator": HuggingFaceXLMGenerator,
    "HuggingFaceXLNetGenerator": HuggingFaceXLNetGenerator,
}
28
+
29
+
30
def run_inference(
    model_type: str,
    prompt: str,
    length: float,
    temperature: float,
    prefix: str,
    k: float,
    p: float,
    repetition_penalty: float,
):
    """Generate one sample with the selected HuggingFace language model.

    Args:
        model_type: Identifier of the form ``"<generator>_<version>"``, e.g.
            ``"HuggingFaceGPT2Generator_gpt2"``. Only the first underscore
            separates the two parts, so the version may itself contain
            underscores.
        prompt: Text prompt forwarded to the generator configuration.
        length: Maximal length forwarded to the generator configuration.
        temperature: Decoding temperature.
        prefix: Text passed to the generator as prefix (before the prompt).
        k: Top-k value forwarded to the generator configuration.
        p: Top-p value forwarded to the generator configuration.
        repetition_penalty: Penalty for repeating tokens.

    Returns:
        The first sample produced by the generation algorithm.

    Raises:
        ValueError: If ``model_type`` lacks the ``_`` separator or names a
            generator that is not registered in ``MODEL_FN``.
    """
    # Split on the first underscore only: `split("_")[1]` would silently
    # truncate a version string that contains an underscore.
    model_name, separator, version = model_type.partition("_")
    if not separator:
        raise ValueError(
            f"Model type {model_type} must have the form <model>_<version>"
        )
    if model_name not in MODEL_FN:
        raise ValueError(f"Model type {model_name} not supported")

    config = MODEL_FN[model_name](
        algorithm_version=version,
        prompt=prompt,
        length=length,
        temperature=temperature,
        repetition_penalty=repetition_penalty,
        k=k,
        p=p,
        prefix=prefix,
    )

    algorithm = HuggingFaceGenerationAlgorithm(config)
    # A single sample is requested; return it directly.
    text = list(algorithm.sample(1))[0]

    return text
60
+
61
+
62
if __name__ == "__main__":

    # Preparation: collect every registered HuggingFace algorithm as a
    # "<application>_<version>" identifier, the format run_inference expects.
    all_algos = ApplicationsRegistry.list_available()
    algos = [
        x["algorithm_application"] + "_" + x["algorithm_version"]
        for x in all_algos
        if "HuggingFace" in x["algorithm_name"]
    ]

    # Load metadata shipped next to this script.
    metadata_root = pathlib.Path(__file__).parent.joinpath("model_cards")

    # Empty CSV cells (e.g. the unused prefix) become empty strings, not NaN.
    examples = pd.read_csv(metadata_root.joinpath("examples.csv"), header=None).fillna(
        ""
    )
    print("Examples: ", examples.values.tolist())

    # The markdown files contain non-ASCII characters, so read them as UTF-8
    # explicitly instead of relying on the platform default encoding.
    with open(metadata_root.joinpath("article.md"), "r", encoding="utf-8") as f:
        article = f.read()
    with open(metadata_root.joinpath("description.md"), "r", encoding="utf-8") as f:
        description = f.read()

    # The order of `inputs` must match the positional parameters of
    # run_inference.
    demo = gr.Interface(
        fn=run_inference,
        title="HuggingFace language models",
        inputs=[
            gr.Dropdown(
                algos,
                label="Language model",
                value="HuggingFaceGPT2Generator_gpt2",
            ),
            gr.Textbox(
                label="Text prompt",
                placeholder="I'm a stochastic parrot.",
                lines=1,
            ),
            gr.Slider(minimum=5, maximum=100, value=20, label="Maximal length", step=1),
            gr.Slider(
                minimum=0.6, maximum=1.5, value=1.1, label="Decoding temperature"
            ),
            gr.Textbox(
                label="Prefix", placeholder="Some prefix (before the prompt)", lines=1
            ),
            gr.Slider(minimum=2, maximum=500, value=50, label="Top-k", step=1),
            # A step of 1 on a 0.5-1 range would make intermediate values
            # unreachable; use a fractional step instead.
            gr.Slider(minimum=0.5, maximum=1, value=1.0, label="Decoding-p", step=0.01),
            gr.Slider(minimum=0.5, maximum=5, value=1.0, label="Repetition penalty"),
        ],
        outputs=gr.Textbox(label="Output"),
        article=article,
        description=description,
        examples=examples.values.tolist(),
    )
    demo.launch(debug=True, show_error=True)
model_cards/article.md ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model documentation & parameters
2
+
3
+ **Language model**: Type of language model to be used.
4
+
5
+ **Text prompt**: The text prompt to condition the model.
6
+
7
+ **Maximal length**: The maximal number of tokens in the generated text.
8
+
9
+ **Decoding temperature**: The temperature in the beam search decoding.
10
+
11
+ **Prefix**: A text prompt that will be passed to the model **before** the prompt.
12
+
13
+ **Top-k**: Number of top-k probability tokens to keep.
14
+
15
+ **Decoding-p**: Only tokens with cumulative probabilities summing up to this value are kept.
16
+
17
+ **Repetition penalty**: Penalty for repeating tokens. Leave unchanged for most models; for the CTRL model, use 1.2.
18
+
19
+
20
+
21
+ # Model card -- HuggingFace
22
+
23
+ **Model Details**: Various Transformer-based language models.
24
+
25
+ **Developers**: HuggingFace developers
26
+
27
+ **Distributors**: HuggingFace developers' code integrated into GT4SD.
28
+
29
+ **Model date**: Varies between models.
30
+
31
+ **Model type**: Different types of `transformers` language models:
32
+ - CTRL: `CTRLLMHeadModel`
33
+ - GPT2: `GPT2LMHeadModel`
34
+ - XLNet: `XLNetLMHeadModel`
35
+ - OpenAIGPT: `OpenAIGPTLMHeadModel`
36
+ - TransfoXL: `TransfoXLLMHeadModel`
37
+ - XLM: `XLMWithLMHeadModel`
38
+
39
+ **Information about training algorithms, parameters, fairness constraints or other applied approaches, and features**:
40
+ N.A.
41
+
42
+ **Paper or other resource for more information**:
43
+ All documentation available from [transformers documentation](https://huggingface.co/docs/transformers/)
44
+
45
+ **License**: MIT
46
+
47
+ **Where to send questions or comments about the model**: Open an issue on [GT4SD repository](https://github.com/GT4SD/gt4sd-core).
48
+
49
+ **Intended Use. Use cases that were envisioned during development**: N.A.
50
+
51
+ **Primary intended uses/users**: N.A.
52
+
53
+ **Out-of-scope use cases**: Production-level inference, producing molecules with harmful properties.
54
+
55
+ **Metrics**: N.A.
56
+
57
+ **Datasets**: N.A.
58
+
59
+ **Ethical Considerations**: Unclear, please consult with original authors in case of questions.
60
+
61
+ **Caveats and Recommendations**: Unclear, please consult with original authors in case of questions.
62
+
63
+ Model card prototype inspired by [Mitchell et al. (2019)](https://dl.acm.org/doi/abs/10.1145/3287560.3287596?casa_token=XD4eHiE2cRUAAAAA:NL11gMa1hGPOUKTAbtXnbVQBDBbjxwcjGECF_i-WC_3g1aBgU1Hbz_f2b4kI_m1in-w__1ztGeHnwHs)
64
+
65
+ ## Citation
66
+ ```bib
67
+ @inproceedings{wolf-etal-2020-transformers,
68
+ title = "Transformers: State-of-the-Art Natural Language Processing",
69
+ author = "Thomas Wolf and Lysandre Debut and Victor Sanh and Julien Chaumond and Clement Delangue and Anthony Moi and Pierric Cistac and Tim Rault and Rémi Louf and Morgan Funtowicz and Joe Davison and Sam Shleifer and Patrick von Platen and Clara Ma and Yacine Jernite and Julien Plu and Canwen Xu and Teven Le Scao and Sylvain Gugger and Mariama Drame and Quentin Lhoest and Alexander M. Rush",
70
+ booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
71
+ month = oct,
72
+ year = "2020",
73
+ address = "Online",
74
+ publisher = "Association for Computational Linguistics",
75
+ url = "https://www.aclweb.org/anthology/2020.emnlp-demos.6",
76
+ pages = "38--45"
77
+ }
78
+ ```
model_cards/description.md ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <img align="right" src="https://raw.githubusercontent.com/GT4SD/gt4sd-core/main/docs/_static/gt4sd_logo.png" alt="logo" width="120" >
2
+
3
+ This UI gives access to some pretrained language models from [*HuggingFace*](https://github.com/huggingface/) that are distributed via GT4SD.
4
+
5
+ For **examples** and **documentation** of the model parameters, please see below.
6
+ Moreover, we provide a **model card** ([Mitchell et al. (2019)](https://dl.acm.org/doi/abs/10.1145/3287560.3287596?casa_token=XD4eHiE2cRUAAAAA:NL11gMa1hGPOUKTAbtXnbVQBDBbjxwcjGECF_i-WC_3g1aBgU1Hbz_f2b4kI_m1in-w__1ztGeHnwHs)) at the bottom of this page.
model_cards/examples.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ HuggingFaceGPT2Generator_gpt2, The role of generative models is,20,1.1,,50,1,1
2
+ HuggingFaceOpenAIGPTGenerator_openai-gpt, The best country in the world is,10,0.9,,50,1,1
requirements.txt ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -f https://download.pytorch.org/whl/cpu/torch_stable.html
2
+ -f https://data.pyg.org/whl/torch-1.12.1+cpu.html
3
+ # pip==20.2.4
4
+ torch==1.12.1
5
+ torch-scatter
6
+ torch-spline-conv
7
+ torch-sparse
8
+ torch-geometric
9
+ torchvision==0.13.1
10
+ torchaudio==0.12.1
11
+ gt4sd>=1.0.5
12
+ molgx>=0.22.0a1
13
+ molecule_generation
14
+ nglview
15
+ PyTDC==0.3.7
16
+ gradio==3.12.0
17
+ markdown-it-py>=2.1.0
18
+ mols2grid>=0.2.0
19
+ numpy==1.23.5
20
+ pandas>=1.0.0
21
+ terminator @ git+https://github.com/IBM/regression-transformer@gt4sd
22
+ guacamol_baselines @ git+https://github.com/GT4SD/guacamol_baselines.git@v0.0.2
23
+ moses @ git+https://github.com/GT4SD/moses.git@v0.1.0
24
+ paccmann_chemistry @ git+https://github.com/PaccMann/paccmann_chemistry@0.0.4
25
+ paccmann_generator @ git+https://github.com/PaccMann/paccmann_generator@0.0.2
26
+ paccmann_gp @ git+https://github.com/PaccMann/paccmann_gp@0.1.1
27
+ paccmann_omics @ git+https://github.com/PaccMann/paccmann_omics@0.0.1.1
28
+ paccmann_predictor @ git+https://github.com/PaccMann/paccmann_predictor@sarscov2
29
+ reinvent_models @ git+https://github.com/GT4SD/reinvent_models@v0.0.1
utils.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from collections import defaultdict
3
+ from typing import List, Optional
4
+
5
+ import mols2grid
6
+ import pandas as pd
7
+
8
# Module-level logger. The NullHandler keeps this module silent unless the
# hosting application configures logging itself.
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
10
+
11
+
12
def draw_grid_generate(
    samples: List[str],
    seeds: Optional[List[str]] = None,
    n_cols: int = 3,
    size=(140, 200),
) -> str:
    """
    Uses mols2grid to draw a HTML grid for the generated molecules

    Args:
        samples: The generated samples.
        seeds: Optional seed molecules, listed before the generated samples
            and named ``Seed_<i>``. Defaults to None (no seeds).
        n_cols: Number of columns in grid. Defaults to 3.
        size: Size of molecule in grid. Defaults to (140, 200).

    Returns:
        HTML to display
    """
    # Use None instead of a mutable default list, which would be shared
    # between calls.
    seeds = [] if seeds is None else seeds

    # Seeds come first, so "SMILES" and "Name" stay aligned row by row.
    result = {
        "SMILES": seeds + samples,
        "Name": [f"Seed_{i}" for i in range(len(seeds))]
        + [f"Generated_{i}" for i in range(len(samples))],
    }

    result_df = pd.DataFrame(result)
    obj = mols2grid.display(
        result_df,
        tooltip=list(result.keys()),
        height=1100,
        n_cols=n_cols,
        name="Results",
        size=size,
    )
    return obj.data