Spaces:
Running
Running
jannisborn
commited on
update
Browse files- app.py +21 -4
- model_cards/article.md +27 -49
- model_cards/metal.csv +9 -12
app.py
CHANGED
@@ -4,6 +4,7 @@ import pathlib
|
|
4 |
import shutil
|
5 |
import tempfile
|
6 |
from pathlib import Path
|
|
|
7 |
|
8 |
import gradio as gr
|
9 |
import pandas as pd
|
@@ -12,7 +13,7 @@ from gt4sd.properties.crystals import CRYSTALS_PROPERTY_PREDICTOR_FACTORY
|
|
12 |
logger = logging.getLogger(__name__)
|
13 |
logger.addHandler(logging.NullHandler())
|
14 |
|
15 |
-
suffix_dict = {"metal_nonmetal_classifier": ".csv"}
|
16 |
|
17 |
|
18 |
def create_temp_file(path: str) -> str:
|
@@ -36,16 +37,33 @@ def main(property: str, data_file: str):
|
|
36 |
if data_file is None:
|
37 |
raise TypeError("You have to pass either an input file for the crystal model")
|
38 |
|
|
|
|
|
39 |
# Copy file into a UNIQUE temporary directory
|
40 |
-
|
|
|
|
|
|
|
41 |
folder = file_path.parent
|
42 |
print(file_path)
|
43 |
print(folder)
|
44 |
if file_path.suffix == ".cif":
|
|
|
|
|
|
|
|
|
45 |
input_path = folder
|
46 |
elif file_path.suffix == ".csv":
|
|
|
|
|
|
|
|
|
47 |
input_path = file_path
|
48 |
elif file_path.suffix == ".zip":
|
|
|
|
|
|
|
|
|
49 |
# Unzip zip
|
50 |
shutil.unpack_archive(file_path, file_path.parent)
|
51 |
if len(list(filter(lambda x: x.endswith(".cif"), os.listdir(folder)))) == 0:
|
@@ -58,7 +76,6 @@ def main(property: str, data_file: str):
|
|
58 |
f" `.cif` files. Not {type(data_file)}."
|
59 |
)
|
60 |
|
61 |
-
prop_name = property.replace(" ", "_").lower()
|
62 |
algo, config = CRYSTALS_PROPERTY_PREDICTOR_FACTORY[prop_name]
|
63 |
# Pass hyperparameters if applicable
|
64 |
kwargs = {"algorithm_version": "v0"}
|
@@ -80,7 +97,7 @@ if __name__ == "__main__":
|
|
80 |
examples = [
|
81 |
["Formation Energy", metadata_root.joinpath("7206075.cif")],
|
82 |
["Bulk moduli", metadata_root.joinpath("crystals.zip")],
|
83 |
-
|
84 |
["Bulk moduli", metadata_root.joinpath("9000046.cif")],
|
85 |
]
|
86 |
|
|
|
4 |
import shutil
|
5 |
import tempfile
|
6 |
from pathlib import Path
|
7 |
+
from collections import defaultdict
|
8 |
|
9 |
import gradio as gr
|
10 |
import pandas as pd
|
|
|
13 |
logger = logging.getLogger(__name__)
|
14 |
logger.addHandler(logging.NullHandler())
|
15 |
|
16 |
+
suffix_dict = {"metal_nonmetal_classifier": [".csv"]}
|
17 |
|
18 |
|
19 |
def create_temp_file(path: str) -> str:
|
|
|
37 |
if data_file is None:
|
38 |
raise TypeError("You have to pass either an input file for the crystal model")
|
39 |
|
40 |
+
prop_name = property.replace(" ", "_").lower()
|
41 |
+
|
42 |
# Copy file into a UNIQUE temporary directory
|
43 |
+
if data_file.name.endswith("cfsdfsdsv"):
|
44 |
+
file_path = Path(create_temp_file(data_file.orig_name))
|
45 |
+
else:
|
46 |
+
file_path = Path(create_temp_file(data_file.name))
|
47 |
folder = file_path.parent
|
48 |
print(file_path)
|
49 |
print(folder)
|
50 |
if file_path.suffix == ".cif":
|
51 |
+
if ".cif" not in suffix_dict.get(prop_name, [".cif", ".zip"]):
|
52 |
+
raise ValueError(
|
53 |
+
f"For this property, provide {suffix_dict[prop_name]}, not `.cif`."
|
54 |
+
)
|
55 |
input_path = folder
|
56 |
elif file_path.suffix == ".csv":
|
57 |
+
if ".csv" not in suffix_dict.get(prop_name, [".cif", ".zip"]):
|
58 |
+
raise ValueError(
|
59 |
+
f"For this property, provide {suffix_dict.get(prop_name, ['.cif', '.zip'])}, not `.csv`."
|
60 |
+
)
|
61 |
input_path = file_path
|
62 |
elif file_path.suffix == ".zip":
|
63 |
+
if ".zip" not in suffix_dict.get(prop_name, [".cif", ".zip"]):
|
64 |
+
raise ValueError(
|
65 |
+
f"For this property, provide {suffix_dict[prop_name]}, not `.zip`."
|
66 |
+
)
|
67 |
# Unzip zip
|
68 |
shutil.unpack_archive(file_path, file_path.parent)
|
69 |
if len(list(filter(lambda x: x.endswith(".cif"), os.listdir(folder)))) == 0:
|
|
|
76 |
f" `.cif` files. Not {type(data_file)}."
|
77 |
)
|
78 |
|
|
|
79 |
algo, config = CRYSTALS_PROPERTY_PREDICTOR_FACTORY[prop_name]
|
80 |
# Pass hyperparameters if applicable
|
81 |
kwargs = {"algorithm_version": "v0"}
|
|
|
97 |
examples = [
|
98 |
["Formation Energy", metadata_root.joinpath("7206075.cif")],
|
99 |
["Bulk moduli", metadata_root.joinpath("crystals.zip")],
|
100 |
+
["Metal Nonmetal Classifier", metadata_root.joinpath("metal.csv")],
|
101 |
["Bulk moduli", metadata_root.joinpath("9000046.cif")],
|
102 |
]
|
103 |
|
model_cards/article.md
CHANGED
@@ -2,52 +2,27 @@
|
|
2 |
|
3 |
## Parameters
|
4 |
|
5 |
-
###
|
6 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
|
8 |
-
### Task
|
9 |
-
Whether the multitask model should be used for property prediction or conditional generation (default).
|
10 |
|
11 |
-
### Input
|
12 |
-
The
|
13 |
-
|
14 |
-
|
15 |
|
16 |
-
### Number of samples
|
17 |
-
How many samples should be generated (between 1 and 50). If `Task` is *Predict*, this has to be set to 1.
|
18 |
|
19 |
-
### Search
|
20 |
-
Decoding search method. Use *Sample* if `Task` is *Generate*. If `Task` is *Predict*, use *Greedy*.
|
21 |
|
22 |
-
|
23 |
-
Precision tolerance; only used if `Task` is *Generate*. This is a single float between 0 and 100 for the tolerated deviation between desired/primed property and predicted property of the generated molecule. Given in percentage with respect to the property range encountered during training.
|
24 |
-
NOTE: The tolerance is *only* used for post-hoc filtering of the generated samples.
|
25 |
-
|
26 |
-
### Sampling Wrapper
|
27 |
-
Only used if `Task` is *Generate*. If set to *False*, the user has to provide a full RT-sequence as `Input` and has to **explicitly** decide which tokens are masked (see example below). This gives full control but is tedious. Instead, if `Sampling Wrapper` is set to *True*, the RT stochastically determines which parts of the sequence are masked.
|
28 |
-
**NOTE**: All below arguments only apply if `Sampling Wrapper` is *True*.
|
29 |
-
|
30 |
-
#### Fraction to mask
|
31 |
-
Specifies the ratio of tokens that can be changed by the model. Argument only applies if `Task` is *Generate* and `Sampling Wrapper` is *True*.
|
32 |
-
|
33 |
-
#### Property goal
|
34 |
-
Specifies the desired target properties for the generation. Need to be given in the format `<prop>:value`. If the model supports multiple properties, give them separated by a comma `,`. Argument only applies if `Task` is *Generate* and `Sampling Wrapper` is *True*.
|
35 |
-
|
36 |
-
#### Tokens to mask
|
37 |
-
Optionally specifies which tokens (atoms, bonds etc) can be masked. Please separate multiple tokens by comma (`,`). If not specified, all tokens can be masked. Argument only applies if `Task` is *Generate* and `Sampling Wrapper` is *True*.
|
38 |
-
|
39 |
-
#### Substructures to mask
|
40 |
-
Optionally specifies a list of substructures that should *definitely* be masked (excluded from stochastic masking). Given in SMILES format. If multiple are provided, separate by comma (`,`). Argument only applies if `Task` is *Generate* and `Sampling Wrapper` is *True*.
|
41 |
-
*NOTE*: Most models operate on SELFIES and the matching of the substructures occurs in SELFIES simply on a string level.
|
42 |
-
|
43 |
-
#### Substructures to keep
|
44 |
-
Optionally specifies a list of substructures that should definitely be present in the target sample (i.e., excluded from stochastic masking). Given in SMILES format. Argument only applies if `Task` is *Generate* and `Sampling Wrapper` is *True*.
|
45 |
-
*NOTE*: This keeps tokens even if they are included in `tokens_to_mask`.
|
46 |
-
*NOTE*: Most models operate on SELFIES and the matching of the substructures occurs in SELFIES simply on a string level.
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
# Model card -- Regression Transformer
|
51 |
|
52 |
**Model Details**: The [Regression Transformer](https://arxiv.org/abs/2202.01338) is a multitask Transformer that reformulates regression as a conditional sequence modeling task. This yields a dichotomous language model that seamlessly integrates property prediction with property-driven conditional generation.
|
53 |
|
@@ -99,15 +74,18 @@ The [Regression Transformer](https://arxiv.org/abs/2202.01338) paper. See the [s
|
|
99 |
Model card prototype inspired by [Mitchell et al. (2019)](https://dl.acm.org/doi/abs/10.1145/3287560.3287596?casa_token=XD4eHiE2cRUAAAAA:NL11gMa1hGPOUKTAbtXnbVQBDBbjxwcjGECF_i-WC_3g1aBgU1Hbz_f2b4kI_m1in-w__1ztGeHnwHs)
|
100 |
|
101 |
|
102 |
-
|
|
|
|
|
|
|
|
|
|
|
103 |
|
104 |
```bib
|
105 |
-
@article{
|
106 |
-
title={
|
107 |
-
author={Born, Jannis and
|
108 |
-
journal={arXiv preprint arXiv:
|
109 |
-
note={Spotlight talk at ICLR workshop on Machine Learning for Drug Discovery},
|
110 |
year={2022}
|
111 |
}
|
112 |
-
```
|
113 |
-
|
|
|
2 |
|
3 |
## Parameters
|
4 |
|
5 |
+
### Property
|
6 |
+
The supported properties are:
|
7 |
+
- `Metal NonMetal Classifier`: Predicted by an RF model (WHICH?)
|
8 |
+
- `Metal Semiconductor Classifier`: Classifying whether a metal could be a semiconductor. Predicted with CGCNN (ToDo: Add Ref!)
|
9 |
+
- `Poisson Ratio`: ToDo: Description + Reference
|
10 |
+
- `Shear Moduli` ...
|
11 |
+
- `Bulk Moduli`
|
12 |
+
- `Fermi Energy`
|
13 |
+
- `Band Gap`
|
14 |
+
- `Absolute Energy`
|
15 |
+
- `Formation Energy`
|
16 |
|
|
|
|
|
17 |
|
18 |
+
### Input file for crystal model
|
19 |
+
The file with information about the metal. Depending on the property you want to predict, the format of the file differs:
|
20 |
+
- `Metal NonMetal Classifier`: Requires a single `.csv` file with the metal (chemical formula) in the first column and the crystal system in the second.
|
21 |
+
- **All others**: Predicted with CGCNN. The input can be either a single `.cif` file (to predict a single metal) or a `.zip` archive containing multiple `.cif` files (for batch prediction).
|
22 |
|
|
|
|
|
23 |
|
|
|
|
|
24 |
|
25 |
+
# Model card - CGCNN
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
|
27 |
**Model Details**: The [Regression Transformer](https://arxiv.org/abs/2202.01338) is a multitask Transformer that reformulates regression as a conditional sequence modeling task. This yields a dichotomous language model that seamlessly integrates property prediction with property-driven conditional generation.
|
28 |
|
|
|
74 |
Model card prototype inspired by [Mitchell et al. (2019)](https://dl.acm.org/doi/abs/10.1145/3287560.3287596?casa_token=XD4eHiE2cRUAAAAA:NL11gMa1hGPOUKTAbtXnbVQBDBbjxwcjGECF_i-WC_3g1aBgU1Hbz_f2b4kI_m1in-w__1ztGeHnwHs)
|
75 |
|
76 |
|
77 |
+
# Model card - RandomForestMetalClassifier
|
78 |
+
|
79 |
+
ToDo...
|
80 |
+
|
81 |
+
|
82 |
+
# Citation
|
83 |
|
84 |
```bib
|
85 |
+
@article{manica2022gt4sd,
|
86 |
+
title={GT4SD: Generative Toolkit for Scientific Discovery},
|
87 |
+
author={Manica, Matteo and Cadow, Joris and Christofidellis, Dimitrios and Dave, Ashish and Born, Jannis and Clarke, Dean and Teukam, Yves Gaetan Nana and Hoffman, Samuel C and Buchan, Matthew and Chenthamarakshan, Vijil and others},
|
88 |
+
journal={arXiv preprint arXiv:2207.03928},
|
|
|
89 |
year={2022}
|
90 |
}
|
91 |
+
```
|
|
model_cards/metal.csv
CHANGED
@@ -1,4 +1,11 @@
|
|
1 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
Zr2Ga(PO4)3,trigonal
|
3 |
Te4Mo(WSe)2,trigonal
|
4 |
Mo3W(SeS3)2,trigonal
|
@@ -13,7 +20,6 @@ Te6Mo3WS2,trigonal
|
|
13 |
KMg6CO8,tetragonal
|
14 |
Mg14BiBO16,orthorhombic
|
15 |
KMg14WO16,tetragonal
|
16 |
-
Mg14AlCdO16,orthorhombic
|
17 |
Mg30VCrO32,tetragonal
|
18 |
Mg30CoSiO32,tetragonal
|
19 |
YMg30CO32,tetragonal
|
@@ -25,9 +31,7 @@ CaMg30NiO32,tetragonal
|
|
25 |
LiMg30AlO32,tetragonal
|
26 |
Mg30AlFeO32,tetragonal
|
27 |
RbMg30SbO32,tetragonal
|
28 |
-
KNaMg30O3orthorhombic
|
29 |
La7Sm(Fe2O5)4,triclinic
|
30 |
-
SrCa3Mn4O1triclinic
|
31 |
NbNi3(HC)2,tetragonal
|
32 |
La2P2AuO,monoclinic
|
33 |
Li9Mn2Co5O16,monoclinic
|
@@ -40,11 +44,4 @@ LiCr4P7O24,triclinic
|
|
40 |
ZnGe(OF)6,trigonal
|
41 |
Cs2Mo(SO)2,monoclinic
|
42 |
NaMgSO7,monoclinic
|
43 |
-
|
44 |
-
K2NaBiCl6,cubic
|
45 |
-
Na2EuCuCl6,cubic
|
46 |
-
NaLi2CoF6,cubic
|
47 |
-
K2NaTiF6,cubic
|
48 |
-
K2AgRhF6,cubic
|
49 |
-
K2CeAgCl6,cubic
|
50 |
-
K2ErCuCl6,cubic
|
|
|
1 |
+
K2NaNdCl6,cubic
|
2 |
+
K2NaBiCl6,cubic
|
3 |
+
Na2EuCuCl6,cubic
|
4 |
+
NaLi2CoF6,cubic
|
5 |
+
K2NaTiF6,cubic
|
6 |
+
K2AgRhF6,cubic
|
7 |
+
K2CeAgCl6,cubic
|
8 |
+
K2ErCuCl6,cubic
|
9 |
Zr2Ga(PO4)3,trigonal
|
10 |
Te4Mo(WSe)2,trigonal
|
11 |
Mo3W(SeS3)2,trigonal
|
|
|
20 |
KMg6CO8,tetragonal
|
21 |
Mg14BiBO16,orthorhombic
|
22 |
KMg14WO16,tetragonal
|
|
|
23 |
Mg30VCrO32,tetragonal
|
24 |
Mg30CoSiO32,tetragonal
|
25 |
YMg30CO32,tetragonal
|
|
|
31 |
LiMg30AlO32,tetragonal
|
32 |
Mg30AlFeO32,tetragonal
|
33 |
RbMg30SbO32,tetragonal
|
|
|
34 |
La7Sm(Fe2O5)4,triclinic
|
|
|
35 |
NbNi3(HC)2,tetragonal
|
36 |
La2P2AuO,monoclinic
|
37 |
Li9Mn2Co5O16,monoclinic
|
|
|
44 |
ZnGe(OF)6,trigonal
|
45 |
Cs2Mo(SO)2,monoclinic
|
46 |
NaMgSO7,monoclinic
|
47 |
+
Mg14AlCdO16,orthorhombic
|
|
|
|
|
|
|
|
|
|
|
|
|
|