jannisborn committed on
Commit
63d9b78
·
unverified ·
1 Parent(s): 1ff7fe3
Files changed (3) hide show
  1. app.py +21 -4
  2. model_cards/article.md +27 -49
  3. model_cards/metal.csv +9 -12
app.py CHANGED
@@ -4,6 +4,7 @@ import pathlib
4
  import shutil
5
  import tempfile
6
  from pathlib import Path
 
7
 
8
  import gradio as gr
9
  import pandas as pd
@@ -12,7 +13,7 @@ from gt4sd.properties.crystals import CRYSTALS_PROPERTY_PREDICTOR_FACTORY
12
  logger = logging.getLogger(__name__)
13
  logger.addHandler(logging.NullHandler())
14
 
15
- suffix_dict = {"metal_nonmetal_classifier": ".csv"}
16
 
17
 
18
  def create_temp_file(path: str) -> str:
@@ -36,16 +37,33 @@ def main(property: str, data_file: str):
36
  if data_file is None:
37
  raise TypeError("You have to pass either an input file for the crystal model")
38
 
 
 
39
  # Copy file into a UNIQUE temporary directory
40
- file_path = Path(create_temp_file(data_file.name))
 
 
 
41
  folder = file_path.parent
42
  print(file_path)
43
  print(folder)
44
  if file_path.suffix == ".cif":
 
 
 
 
45
  input_path = folder
46
  elif file_path.suffix == ".csv":
 
 
 
 
47
  input_path = file_path
48
  elif file_path.suffix == ".zip":
 
 
 
 
49
  # Unzip zip
50
  shutil.unpack_archive(file_path, file_path.parent)
51
  if len(list(filter(lambda x: x.endswith(".cif"), os.listdir(folder)))) == 0:
@@ -58,7 +76,6 @@ def main(property: str, data_file: str):
58
  f" `.cif` files. Not {type(data_file)}."
59
  )
60
 
61
- prop_name = property.replace(" ", "_").lower()
62
  algo, config = CRYSTALS_PROPERTY_PREDICTOR_FACTORY[prop_name]
63
  # Pass hyperparameters if applicable
64
  kwargs = {"algorithm_version": "v0"}
@@ -80,7 +97,7 @@ if __name__ == "__main__":
80
  examples = [
81
  ["Formation Energy", metadata_root.joinpath("7206075.cif")],
82
  ["Bulk moduli", metadata_root.joinpath("crystals.zip")],
83
- # ["Metal Nonmetal Classifier", metadata_root.joinpath("metal.csv")],
84
  ["Bulk moduli", metadata_root.joinpath("9000046.cif")],
85
  ]
86
 
 
4
  import shutil
5
  import tempfile
6
  from pathlib import Path
7
+ from collections import defaultdict
8
 
9
  import gradio as gr
10
  import pandas as pd
 
13
  logger = logging.getLogger(__name__)
14
  logger.addHandler(logging.NullHandler())
15
 
16
+ suffix_dict = {"metal_nonmetal_classifier": [".csv"]}
17
 
18
 
19
  def create_temp_file(path: str) -> str:
 
37
  if data_file is None:
38
  raise TypeError("You have to pass either an input file for the crystal model")
39
 
40
+ prop_name = property.replace(" ", "_").lower()
41
+
42
  # Copy file into a UNIQUE temporary directory
43
+ if data_file.name.endswith("cfsdfsdsv"):
44
+ file_path = Path(create_temp_file(data_file.orig_name))
45
+ else:
46
+ file_path = Path(create_temp_file(data_file.name))
47
  folder = file_path.parent
48
  print(file_path)
49
  print(folder)
50
  if file_path.suffix == ".cif":
51
+ if ".cif" not in suffix_dict.get(prop_name, [".cif", ".zip"]):
52
+ raise ValueError(
53
+ f"For this property, provide {suffix_dict[prop_name]}, not `.cif`."
54
+ )
55
  input_path = folder
56
  elif file_path.suffix == ".csv":
57
+ if ".csv" not in suffix_dict.get(prop_name, [".cif", ".zip"]):
58
+ raise ValueError(
59
+ f"For this property, provide {suffix_dict.get(prop_name, ['.cif', '.zip'])}, not `.csv`."
60
+ )
61
  input_path = file_path
62
  elif file_path.suffix == ".zip":
63
+ if ".zip" not in suffix_dict.get(prop_name, [".cif", ".zip"]):
64
+ raise ValueError(
65
+ f"For this property, provide {suffix_dict[prop_name]}, not `.zip`."
66
+ )
67
  # Unzip zip
68
  shutil.unpack_archive(file_path, file_path.parent)
69
  if len(list(filter(lambda x: x.endswith(".cif"), os.listdir(folder)))) == 0:
 
76
  f" `.cif` files. Not {type(data_file)}."
77
  )
78
 
 
79
  algo, config = CRYSTALS_PROPERTY_PREDICTOR_FACTORY[prop_name]
80
  # Pass hyperparameters if applicable
81
  kwargs = {"algorithm_version": "v0"}
 
97
  examples = [
98
  ["Formation Energy", metadata_root.joinpath("7206075.cif")],
99
  ["Bulk moduli", metadata_root.joinpath("crystals.zip")],
100
+ ["Metal Nonmetal Classifier", metadata_root.joinpath("metal.csv")],
101
  ["Bulk moduli", metadata_root.joinpath("9000046.cif")],
102
  ]
103
 
model_cards/article.md CHANGED
@@ -2,52 +2,27 @@
2
 
3
  ## Parameters
4
 
5
- ### Algorithm Version
6
- Which model checkpoint to use (trained on different datasets).
 
 
 
 
 
 
 
 
 
7
 
8
- ### Task
9
- Whether the multitask model should be used for property prediction or conditional generation (default).
10
 
11
- ### Input
12
- The input sequence. In the default setting (where `Task` is *Generate* and `Sampling Wrapper` is *True*) this can be a seed SMILES (for the molecule models) or amino-acid sequence (for the protein models). The model will locally adapt the seed sequence by masking `Fraction to mask` of the tokens.
13
- If the `Task` is *Predict*, the sequences are given as SELFIES for the molecule models. Moreover, the tokens that should be predicted (`[MASK]` in the input) have to be given explicitly. Populate the examples to understand better.
14
- NOTE: When setting `Task` to *Generate*, and `Sampling Wrapper` to *False*, the user has maximal control about the generative process and can explicitly decide which tokens should be masked.
15
 
16
- ### Number of samples
17
- How many samples should be generated (between 1 and 50). If `Task` is *Predict*, this has to be set to 1.
18
 
19
- ### Search
20
- Decoding search method. Use *Sample* if `Task` is *Generate*. If `Task` is *Predict*, use *Greedy*.
21
 
22
- ### Tolerance
23
- Precision tolerance; only used if `Task` is *Generate*. This is a single float between 0 and 100 for the tolerated deviation between desired/primed property and predicted property of the generated molecule. Given in percentage with respect to the property range encountered during training.
24
- NOTE: The tolerance is *only* used for post-hoc filtering of the generated samples.
25
-
26
- ### Sampling Wrapper
27
- Only used if `Task` is *Generate*. If set to *False*, the user has to provide a full RT-sequence as `Input` and has to **explicitly** decide which tokens are masked (see example below). This gives full control but is tedious. Instead, if `Sampling Wrapper` is set to *True*, the RT stochastically determines which parts of the sequence are masked.
28
- **NOTE**: All below arguments only apply if `Sampling Wrapper` is *True*.
29
-
30
- #### Fraction to mask
31
- Specifies the ratio of tokens that can be changed by the model. Argument only applies if `Task` is *Generate* and `Sampling Wrapper` is *True*.
32
-
33
- #### Property goal
34
- Specifies the desired target properties for the generation. Need to be given in the format `<prop>:value`. If the model supports multiple properties, give them separated by a comma `,`. Argument only applies if `Task` is *Generate* and `Sampling Wrapper` is *True*.
35
-
36
- #### Tokens to mask
37
- Optionally specifies which tokens (atoms, bonds etc) can be masked. Please separate multiple tokens by comma (`,`). If not specified, all tokens can be masked. Argument only applies if `Task` is *Generate* and `Sampling Wrapper` is *True*.
38
-
39
- #### Substructures to mask
40
- Optionally specifies a list of substructures that should *definitely* be masked (excluded from stochastic masking). Given in SMILES format. If multiple are provided, separate by comma (`,`). Argument only applies if `Task` is *Generate* and `Sampling Wrapper` is *True*.
41
- *NOTE*: Most models operate on SELFIES and the matching of the substructures occurs in SELFIES simply on a string level.
42
-
43
- #### Substructures to keep
44
- Optionally specifies a list of substructures that should definitely be present in the target sample (i.e., excluded from stochastic masking). Given in SMILES format. Argument only applies if `Task` is *Generate* and `Sampling Wrapper` is *True*.
45
- *NOTE*: This keeps tokens even if they are included in `tokens_to_mask`.
46
- *NOTE*: Most models operate on SELFIES and the matching of the substructures occurs in SELFIES simply on a string level.
47
-
48
-
49
-
50
- # Model card -- Regression Transformer
51
 
52
  **Model Details**: The [Regression Transformer](https://arxiv.org/abs/2202.01338) is a multitask Transformer that reformulates regression as a conditional sequence modeling task. This yields a dichotomous language model that seamlessly integrates property prediction with property-driven conditional generation.
53
 
@@ -99,15 +74,18 @@ The [Regression Transformer](https://arxiv.org/abs/2202.01338) paper. See the [s
99
  Model card prototype inspired by [Mitchell et al. (2019)](https://dl.acm.org/doi/abs/10.1145/3287560.3287596?casa_token=XD4eHiE2cRUAAAAA:NL11gMa1hGPOUKTAbtXnbVQBDBbjxwcjGECF_i-WC_3g1aBgU1Hbz_f2b4kI_m1in-w__1ztGeHnwHs)
100
 
101
 
102
- ## Citation
 
 
 
 
 
103
 
104
  ```bib
105
- @article{born2022regression,
106
- title={Regression Transformer: Concurrent Conditional Generation and Regression by Blending Numerical and Textual Tokens},
107
- author={Born, Jannis and Manica, Matteo},
108
- journal={arXiv preprint arXiv:2202.01338},
109
- note={Spotlight talk at ICLR workshop on Machine Learning for Drug Discovery},
110
  year={2022}
111
  }
112
- ```
113
-
 
2
 
3
  ## Parameters
4
 
5
+ ### Property
6
+ The supported properties are:
7
+ - `Metal NonMetal Classifier`: Predicted by an RF (random forest) model (ToDo: specify which one)
8
+ - `Metal Semiconductor Classifier`: Classifying whether a metal could be a semiconductor. Predicted with CGCNN (ToDo: Add Ref!)
9
+ - `Poisson Ratio`: ToDo: Description + Reference
10
+ - `Shear Moduli` ...
11
+ - `Bulk Moduli`
12
+ - `Fermi Energy`
13
+ - `Band Gap`
14
+ - `Absolute Energy`
15
+ - `Formation Energy`
16
 
 
 
17
 
18
+ ### Input file for crystal model
19
+ The file with information about the metal. Depending on the property you want to predict, the required file format differs:
20
+ - `Metal NonMetal Classifier`: Requires a single `.csv` file with the metal (chemical formula) in the first column and the crystal system in the second column.
21
+ - **All others**: Predicted with CGCNN. The input can be either a single `.cif` file (to predict a single metal) or a `.zip` archive that contains multiple `.cif` files (for batch prediction).
22
 
 
 
23
 
 
 
24
 
25
+ # Model card - CGCNN
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
  **Model Details**: The [Regression Transformer](https://arxiv.org/abs/2202.01338) is a multitask Transformer that reformulates regression as a conditional sequence modeling task. This yields a dichotomous language model that seamlessly integrates property prediction with property-driven conditional generation.
28
 
 
74
  Model card prototype inspired by [Mitchell et al. (2019)](https://dl.acm.org/doi/abs/10.1145/3287560.3287596?casa_token=XD4eHiE2cRUAAAAA:NL11gMa1hGPOUKTAbtXnbVQBDBbjxwcjGECF_i-WC_3g1aBgU1Hbz_f2b4kI_m1in-w__1ztGeHnwHs)
75
 
76
 
77
+ # Model card - RandomForestMetalClassifier
78
+
79
+ ToDo...
80
+
81
+
82
+ # Citation
83
 
84
  ```bib
85
+ @article{manica2022gt4sd,
86
+ title={GT4SD: Generative Toolkit for Scientific Discovery},
87
+ author={Manica, Matteo and Cadow, Joris and Christofidellis, Dimitrios and Dave, Ashish and Born, Jannis and Clarke, Dean and Teukam, Yves Gaetan Nana and Hoffman, Samuel C and Buchan, Matthew and Chenthamarakshan, Vijil and others},
88
+ journal={arXiv preprint arXiv:2207.03928},
 
89
  year={2022}
90
  }
91
+ ```
 
model_cards/metal.csv CHANGED
@@ -1,4 +1,11 @@
1
- KPSO2,orthorhombic
 
 
 
 
 
 
 
2
  Zr2Ga(PO4)3,trigonal
3
  Te4Mo(WSe)2,trigonal
4
  Mo3W(SeS3)2,trigonal
@@ -13,7 +20,6 @@ Te6Mo3WS2,trigonal
13
  KMg6CO8,tetragonal
14
  Mg14BiBO16,orthorhombic
15
  KMg14WO16,tetragonal
16
- Mg14AlCdO16,orthorhombic
17
  Mg30VCrO32,tetragonal
18
  Mg30CoSiO32,tetragonal
19
  YMg30CO32,tetragonal
@@ -25,9 +31,7 @@ CaMg30NiO32,tetragonal
25
  LiMg30AlO32,tetragonal
26
  Mg30AlFeO32,tetragonal
27
  RbMg30SbO32,tetragonal
28
- KNaMg30O3orthorhombic
29
  La7Sm(Fe2O5)4,triclinic
30
- SrCa3Mn4O1triclinic
31
  NbNi3(HC)2,tetragonal
32
  La2P2AuO,monoclinic
33
  Li9Mn2Co5O16,monoclinic
@@ -40,11 +44,4 @@ LiCr4P7O24,triclinic
40
  ZnGe(OF)6,trigonal
41
  Cs2Mo(SO)2,monoclinic
42
  NaMgSO7,monoclinic
43
- K2NaNdCl6,cubic
44
- K2NaBiCl6,cubic
45
- Na2EuCuCl6,cubic
46
- NaLi2CoF6,cubic
47
- K2NaTiF6,cubic
48
- K2AgRhF6,cubic
49
- K2CeAgCl6,cubic
50
- K2ErCuCl6,cubic
 
1
+ K2NaNdCl6,cubic
2
+ K2NaBiCl6,cubic
3
+ Na2EuCuCl6,cubic
4
+ NaLi2CoF6,cubic
5
+ K2NaTiF6,cubic
6
+ K2AgRhF6,cubic
7
+ K2CeAgCl6,cubic
8
+ K2ErCuCl6,cubic
9
  Zr2Ga(PO4)3,trigonal
10
  Te4Mo(WSe)2,trigonal
11
  Mo3W(SeS3)2,trigonal
 
20
  KMg6CO8,tetragonal
21
  Mg14BiBO16,orthorhombic
22
  KMg14WO16,tetragonal
 
23
  Mg30VCrO32,tetragonal
24
  Mg30CoSiO32,tetragonal
25
  YMg30CO32,tetragonal
 
31
  LiMg30AlO32,tetragonal
32
  Mg30AlFeO32,tetragonal
33
  RbMg30SbO32,tetragonal
 
34
  La7Sm(Fe2O5)4,triclinic
 
35
  NbNi3(HC)2,tetragonal
36
  La2P2AuO,monoclinic
37
  Li9Mn2Co5O16,monoclinic
 
44
  ZnGe(OF)6,trigonal
45
  Cs2Mo(SO)2,monoclinic
46
  NaMgSO7,monoclinic
47
+ Mg14AlCdO16,orthorhombic