Spaces:
Runtime error
Runtime error
Upload 8 files
Browse files- README.md +64 -12
- analysis_dGPredictor.ipynb +0 -0
- decompose_groups.py +251 -0
- main.py +478 -0
- mini_novoStoic.py +326 -0
- model_gen.py +55 -0
- reaction_rule_2_gen.py +30 -0
- retrieve_bulk.ipynb +660 -0
README.md
CHANGED
@@ -1,12 +1,64 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# dGPredictor
|
2 |
+
|
3 |
+
==================================
|
4 |
+
### Requirements:
|
5 |
+
|
6 |
+
1. Python 3.8.10
|
7 |
+
2. RDkit (http://www.rdkit.org/)
|
8 |
+
3. pandas (https://pandas.pydata.org/)
|
9 |
+
4. matplotlib (https://matplotlib.org/stable/users/installing.html)
|
10 |
+
5. Scikit-learn (https://scikit-learn.org/stable/)
|
11 |
+
6. Streamlit (https://streamlit.io/)
|
12 |
+
7. Openbabel (https://anaconda.org/openbabel/openbabel)
|
13 |
+
8. ChemAxon's Marvin >= 5.11
|
14 |
+
9. Pulp
|
15 |
+
|
16 |
+
Installation
|
17 |
+
1. Python 3.8.10 (https://www.python.org/downloads/windows/)
|
18 |
+
Recommended-
|
19 |
+
- Create anaconda environment using command "conda create -n dGPredictor python=3.8 ipython"
|
20 |
+
- activate the env using command "conda activate dGPredictor" or "source activate dGPredictor"
|
21 |
+
2. RDkit
|
22 |
+
- type command "conda install -c conda-forge rdkit" in your dGPredictor env to install rdkit
|
23 |
+
3. Pandas
|
24 |
+
- "conda install pandas"
|
25 |
+
4. matplotlib
|
26 |
+
- "conda install -c conda-forge matplotlib"
|
27 |
+
5. Scikit-learn
|
28 |
+
- use command "pip install -U scikit-learn"
|
29 |
+
6. Streamlit
|
30 |
+
- use command "pip install -U streamlit"
|
31 |
+
7. Openbabel
|
32 |
+
- run "conda install -c conda-forge openbabel"
|
33 |
+
8. ChemAxon's Marvin (pKa value estimation)
|
34 |
+
- Marvin is only required for adding structures of novel metabolites/compounds that are not in the KEGG database
|
35 |
+
- instructions (https://chemaxon.com/products/marvin/download)
|
36 |
+
- add "cxcalc.bat" (macOS) / "cxcalc.exe" (Windows) to your PATH and also set its location in the "./CC/chemaxon.py" file
|
37 |
+
- you will need to get a license to use ChemAxon (it is free for academic use)
|
38 |
+
9. Pulp
|
39 |
+
- use command "pip install -U pulp"
|
40 |
+
|
41 |
+
|
42 |
+
|
43 |
+
|
44 |
+
==================================
|
45 |
+
### Running web-interface locally using streamlit
|
46 |
+
|
47 |
+
- Model generation: run "python model_gen.py" once to create the dGPredictor model file (this may take some time)
|
48 |
+
- run "streamlit run ./streamlit/main.py" from dGPredictor folder
|
49 |
+
- running KEGG reaction (doesn't require ChemAxon's Marvin) : copy paste the reaction equation into reaction section and click search
|
50 |
+
|
51 |
+
### Gibbs free energy prediction using the automated group decomposition method
|
52 |
+
|
53 |
+
- Step 1: decompose the metabolites based on their SMILES strings (see the function decompse_ac in decompose_groups.py, or the notebook)
|
54 |
+
- Step 2: create group changes vectors (i.e. reaction rules) based on group changes in metabolites of reactions (see get_rxn_rule in decompose_groups.py)
|
55 |
+
- Step 3: linear regression, Ridge Regression and Bayesian Ridge Regression in "predict.py"
|
56 |
+
- Step 4: Multiple regression models in notebook "analysis_dGPredictor.ipynb"
|
57 |
+
|
58 |
+
### Pathway design using novoStoic
|
59 |
+
- Run "mini_novoStoic.py" to see an example of designing pathways for isobutanol synthesis
|
60 |
+
|
61 |
+
|
62 |
+
# demo
|
63 |
+
![dGPredictor Demo](figures/dg_demo_py3.gif)
|
64 |
+
|
analysis_dGPredictor.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
decompose_groups.py
ADDED
@@ -0,0 +1,251 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import pdb
|
3 |
+
import json
|
4 |
+
from rdkit import Chem
|
5 |
+
|
6 |
+
def count_substructures(radius,molecule):
    """Tally the atom-centred substructures (relaxed signature) of a molecule.

    For every atom, the bonds returned by
    ``Chem.FindAtomEnvironmentOfRadiusN`` for that atom are collected, the
    fragment they span is written as a canonical SMILES string, and the
    number of times each string occurs is counted.

    Parameters
    ----------
    radius : int
        bond distance around each atom that defines how many neighbor atoms
        belong to its environment.
    molecule : Molecule
        a molecule object created by RDKit (e.g.
        ``Chem.MolFromInchi(inchi_code)`` or
        ``Chem.MolFromSmiles(smiles_code)``).

    Returns
    -------
    dict
        ``{fragment_smiles: occurrence_count}`` for the molecule.
    """
    fragment_counts = {}
    atom_total = len(list(molecule.GetAtoms()))

    for center_idx in range(atom_total):
        bond_ids = Chem.FindAtomEnvironmentOfRadiusN(molecule,radius,center_idx)

        # atoms sitting at either end of every bond in the environment
        member_atoms = set()
        for bond_id in bond_ids:
            bond = molecule.GetBondWithIdx(bond_id)
            member_atoms.add(bond.GetBeginAtomIdx())
            member_atoms.add(bond.GetEndAtomIdx())

        # no bond in the environment: the fragment is the centre atom alone,
        # such as O in H2O
        if not member_atoms:
            member_atoms = {center_idx}

        fragment = Chem.MolFragmentToSmiles(molecule,
                                            atomsToUse=list(member_atoms),
                                            bondsToUse=bond_ids,
                                            canonical=True)
        fragment_counts[fragment] = fragment_counts.get(fragment, 0) + 1

    return fragment_counts
|
47 |
+
|
48 |
+
def decompse_ac(db_smiles,radius=1):
    """Decompose every compound into substructure counts and save them as JSON.

    Parameters
    ----------
    db_smiles : dict
        maps a compound id to its SMILES string.
    radius : int
        bond distance handed to ``count_substructures``.

    Returns
    -------
    list
        ids of the compounds that could not be decomposed. Earlier versions
        built this list and then discarded it (returning None), so callers
        had no way to see which compounds were dropped.

    The successful decompositions are written to
    ``./data/decompose_vector_ac.json`` as
    ``{compound_id: {fragment_smiles: count}}``.
    """
    non_decomposable = []
    decompose_vector = dict()

    for cid, smiles_pH7 in db_smiles.items():
        try:
            mol = Chem.MolFromSmiles(smiles_pH7)
            mol = Chem.RemoveHs(mol)
            decompose_vector[cid] = count_substructures(radius,mol)
        except Exception:
            # Deliberately best-effort: a compound whose SMILES cannot be
            # parsed or decomposed is recorded and skipped so the remaining
            # compounds are still processed.
            non_decomposable.append(cid)

    with open('./data/decompose_vector_ac.json','w') as fp:
        json.dump(decompose_vector,fp)

    return non_decomposable
|
67 |
+
|
68 |
+
def get_rxn_rule():
    """Calculate reaction rules based on the relaxed molecular signatures.

    Reads the stoichiometry of every reaction from
    ``./data/optstoic_v3_Sji_dict.json`` and the substructure counts of every
    metabolite from ``./data/decompose_vector_ac.json``. The rule of a
    reaction is the stoichiometry-weighted sum of the signatures of its
    metabolites. A reaction that involves a metabolite without a signature is
    skipped entirely. C00080 and C00282 (treated as hydrogen by the original
    comments) never need a signature and contribute nothing.

    Returns
    -------
    None
        The rules are saved to ``./data/reaction_rule.csv``, one column per
        reaction and one row per substructure.
    """
    # 'with' closes the files; json.load(open(...)) left the handles open.
    with open('./data/optstoic_v3_Sji_dict.json') as fp:
        reaction_dict = json.load(fp)
    with open('./data/decompose_vector_ac.json') as fp:
        molecular_signature = json.load(fp)

    molsigna_df = pd.DataFrame.from_dict(molecular_signature).fillna(0)

    ignored_mets = {"C00080", "C00282"}
    # a set keeps the per-metabolite membership test constant-time
    known_mets = set(molsigna_df.columns) | ignored_mets

    rule_df = pd.DataFrame(index=molsigna_df.index)
    for rid, value in reaction_dict.items():
        # skip the reactions with missing metabolites
        if any(met not in known_mets for met in value):
            continue

        rule_df[rid] = 0
        for met, stoic in value.items():
            if met in ignored_mets:
                continue  # hydrogen is zero
            rule_df[rid] += molsigna_df[met] * stoic

    rule_df.to_csv("./data/reaction_rule.csv", index=True)
|
108 |
+
|
109 |
+
def get_rxn_rule_no_stero():
    """Calculate reaction rules from the signatures stored without stereo.

    Same procedure as the stereo-aware variant, but the substructure counts
    are read from ``./data/decompose_vector_ac_nostereo.json``. Reaction
    stoichiometry comes from ``./data/optstoic_v3_Sji_dict.json``. A reaction
    that involves a metabolite without a signature is skipped entirely.
    C00080 and C00282 (treated as hydrogen by the original comments) never
    need a signature and contribute nothing.

    Returns
    -------
    None
        The rules are saved to ``./data/reaction_rule_no_stero.csv``, one
        column per reaction and one row per substructure.
    """
    # 'with' closes the files; json.load(open(...)) left the handles open.
    with open('./data/optstoic_v3_Sji_dict.json') as fp:
        reaction_dict = json.load(fp)
    with open('./data/decompose_vector_ac_nostereo.json') as fp:
        molecular_signature = json.load(fp)

    molsigna_df = pd.DataFrame.from_dict(molecular_signature).fillna(0)

    ignored_mets = {"C00080", "C00282"}
    # a set keeps the per-metabolite membership test constant-time
    known_mets = set(molsigna_df.columns) | ignored_mets

    rule_df = pd.DataFrame(index=molsigna_df.index)
    for rid, value in reaction_dict.items():
        # skip the reactions with missing metabolites
        if any(met not in known_mets for met in value):
            continue

        rule_df[rid] = 0
        for met, stoic in value.items():
            if met in ignored_mets:
                continue  # hydrogen is zero
            rule_df[rid] += molsigna_df[met] * stoic

    rule_df.to_csv("./data/reaction_rule_no_stero.csv", index=True)
|
149 |
+
|
150 |
+
def get_rxn_rule_remove_TECRDB_mets():
    """Calculate reaction rules while leaving out the listed TECRDB metabolites.

    Reaction stoichiometry is read from ``./data/optstoic_v3_Sji_dict.json``,
    substructure counts from ``./data/decompose_vector_ac.json`` and the list
    of metabolites to leave out from the first column of
    ``./data/TECRBD_mets.txt``. The rule of a reaction is the
    stoichiometry-weighted sum of the signatures of its metabolites, except
    that listed metabolites contribute nothing. A reaction is skipped when it
    involves a metabolite that is neither in the signature table, nor in the
    list, nor one of C00080 / C00282.

    C00080 and C00282 are accepted without a signature, so they are also left
    out of the sum when the signature table has no column for them;
    previously that case raised a KeyError unless they happened to be in the
    TECRDB list.

    Returns
    -------
    None
        The rules are saved to ``./data/reaction_rule_remove_TECRDB_mets.csv``.
    """
    # 'with' closes the files; json.load(open(...)) left the handles open.
    with open('./data/optstoic_v3_Sji_dict.json') as fp:
        reaction_dict = json.load(fp)
    with open('./data/decompose_vector_ac.json') as fp:
        molecular_signature = json.load(fp)

    molsigna_df = pd.DataFrame.from_dict(molecular_signature).fillna(0)
    signature_mets = set(molsigna_df.columns)

    mets_TECRDB_df = pd.read_csv('./data/TECRBD_mets.txt',header=None)
    mets_TECRDB = set(mets_TECRDB_df[0].tolist())

    known_mets = signature_mets | {"C00080", "C00282"} | mets_TECRDB

    rule_df = pd.DataFrame(index=molsigna_df.index)
    for rid, value in reaction_dict.items():
        # skip the reactions with missing metabolites
        if any(met not in known_mets for met in value):
            continue

        rule_df[rid] = 0
        for met, stoic in value.items():
            # listed metabolites are left out on purpose; anything else
            # without a signature column here can only be C00080 / C00282
            if met in mets_TECRDB or met not in signature_mets:
                continue
            rule_df[rid] += molsigna_df[met] * stoic

    rule_df.to_csv("./data/reaction_rule_remove_TECRDB_mets.csv", index=True)
|
195 |
+
|
196 |
+
def get_rxn_rule_no_stero_remove_TECRDB_mets():
    """Calculate no-stereo reaction rules without the listed TECRDB metabolites.

    Reaction stoichiometry is read from ``./data/optstoic_v3_Sji_dict.json``,
    substructure counts from ``./data/decompose_vector_ac_nostereo.json`` and
    the list of metabolites to leave out from the first column of
    ``./data/TECRBD_mets.txt``. The rule of a reaction is the
    stoichiometry-weighted sum of the signatures of its metabolites, except
    that listed metabolites contribute nothing. A reaction is skipped when it
    involves a metabolite that is neither in the signature table, nor in the
    list, nor one of C00080 / C00282.

    C00080 and C00282 are accepted without a signature, so they are also left
    out of the sum when the signature table has no column for them;
    previously that case raised a KeyError unless they happened to be in the
    TECRDB list.

    Returns
    -------
    None
        The rules are saved to
        ``./data/reaction_rule_nostereo_remove_TECRDB_mets.csv``.
    """
    # 'with' closes the files; json.load(open(...)) left the handles open.
    with open('./data/optstoic_v3_Sji_dict.json') as fp:
        reaction_dict = json.load(fp)
    with open('./data/decompose_vector_ac_nostereo.json') as fp:
        molecular_signature = json.load(fp)

    molsigna_df = pd.DataFrame.from_dict(molecular_signature).fillna(0)
    signature_mets = set(molsigna_df.columns)

    mets_TECRDB_df = pd.read_csv('./data/TECRBD_mets.txt',header=None)
    mets_TECRDB = set(mets_TECRDB_df[0].tolist())

    known_mets = signature_mets | {"C00080", "C00282"} | mets_TECRDB

    rule_df = pd.DataFrame(index=molsigna_df.index)
    for rid, value in reaction_dict.items():
        # skip the reactions with missing metabolites
        if any(met not in known_mets for met in value):
            continue

        rule_df[rid] = 0
        for met, stoic in value.items():
            # listed metabolites are left out on purpose; anything else
            # without a signature column here can only be C00080 / C00282
            if met in mets_TECRDB or met not in signature_mets:
                continue
            rule_df[rid] += molsigna_df[met] * stoic

    rule_df.to_csv("./data/reaction_rule_nostereo_remove_TECRDB_mets.csv", index=True)
|
241 |
+
|
242 |
+
|
243 |
+
|
244 |
+
if __name__ == '__main__':
    # Script entry point. Only the last step is active; the commented-out
    # lines are the earlier pipeline steps (decompose the compounds, then
    # build the other rule tables) kept for reference.
    # db = pd.read_csv('./data/cache_compounds_20160818.csv',index_col='compound_id')
    # db_smiles = db['smiles_pH7'].to_dict()
    # decompse_ac(db_smiles)
    # get_rxn_rule()

    # get_rxn_rule_remove_TECRDB_mets()
    get_rxn_rule_no_stero_remove_TECRDB_mets()
|
main.py
ADDED
@@ -0,0 +1,478 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pandas as pd
|
3 |
+
import numpy as np
|
4 |
+
import re
|
5 |
+
from PIL import Image
|
6 |
+
import webbrowser
|
7 |
+
import json
|
8 |
+
import pickle
|
9 |
+
import sys
|
10 |
+
import joblib
|
11 |
+
import sys
|
12 |
+
|
13 |
+
sys.path.append('./CC/')
|
14 |
+
|
15 |
+
import chemaxon
|
16 |
+
from chemaxon import *
|
17 |
+
from compound import Compound
|
18 |
+
from compound_cacher import CompoundCacher
|
19 |
+
from rdkit.Chem import rdChemReactions as Reactions
|
20 |
+
from rdkit.Chem import Draw
|
21 |
+
from rdkit import Chem
|
22 |
+
|
23 |
+
@st.cache(allow_output_mutation=True)
def load_smiles():
    """Load the compound table and return ``{compound_id: smiles_pH7}``.

    The table is read from ``./data/cache_compounds_20160818.csv``; the
    ``st.cache`` decorator lets Streamlit reuse the result between reruns.
    """
    compound_table = pd.read_csv('./data/cache_compounds_20160818.csv',
                                 index_col='compound_id')
    return compound_table['smiles_pH7'].to_dict()
|
29 |
+
|
30 |
+
|
31 |
+
@st.cache(allow_output_mutation=True)
def load_molsig_rad1():
    """Load the radius-1 molecular signatures.

    Returns the parsed content of ``./data/decompose_vector_ac.json``; the
    ``st.cache`` decorator lets Streamlit reuse the result between reruns.
    """
    # 'with' closes the file; json.load(open(...)) left the handle open.
    with open('./data/decompose_vector_ac.json') as fp:
        molecular_signature_r1 = json.load(fp)
    return molecular_signature_r1
|
35 |
+
|
36 |
+
|
37 |
+
@st.cache(allow_output_mutation=True)
def load_molsig_rad2():
    """Load the radius-2 molecular signatures.

    Returns the parsed content of
    ``./data/decompose_vector_ac_r2_py3_indent_modified_manual.json``; the
    ``st.cache`` decorator lets Streamlit reuse the result between reruns.
    """
    # 'with' closes the file; json.load(open(...)) left the handle open.
    with open('./data/decompose_vector_ac_r2_py3_indent_modified_manual.json') as fp:
        molecular_signature_r2 = json.load(fp)
    return molecular_signature_r2
|
42 |
+
|
43 |
+
|
44 |
+
@st.cache(allow_output_mutation=True)
def load_model():
    """Load the trained regression model from ``./model/M12_model_BR.pkl``.

    The ``st.cache`` decorator lets Streamlit reuse the loaded model between
    reruns.
    """
    filename = './model/M12_model_BR.pkl'
    # 'with' closes the file; joblib.load(open(...)) left the handle open.
    with open(filename, 'rb') as fp:
        loaded_model = joblib.load(fp)
    return loaded_model
|
49 |
+
|
50 |
+
|
51 |
+
@st.cache(allow_output_mutation=True)
def load_compound_cache():
    """Create the ``CompoundCacher`` once and let Streamlit cache it."""
    return CompoundCacher()
|
55 |
+
|
56 |
+
|
57 |
+
def count_substructures(radius, molecule):
    """Count how often each atom-centred fragment occurs in a molecule.

    Each atom is taken in turn as a centre. The bonds that
    ``Chem.FindAtomEnvironmentOfRadiusN`` reports for it, together with the
    atoms at their ends, form a fragment that is written as a canonical
    SMILES string. The relaxed signature is the count of every such string.

    Parameters
    ----------
    radius : int
        bond distance around each atom that defines how many neighbor atoms
        belong to its environment.
    molecule : Molecule
        a molecule object created by RDKit (e.g.
        ``Chem.MolFromInchi(inchi_code)`` or
        ``Chem.MolFromSmiles(smiles_code)``).

    Returns
    -------
    dict
        ``{fragment_smiles: occurrence_count}`` for the molecule.
    """
    signature = {}
    n_atoms = len(list(molecule.GetAtoms()))

    for atom_idx in range(n_atoms):
        env_bonds = Chem.FindAtomEnvironmentOfRadiusN(molecule, radius, atom_idx)

        env_atoms = set()
        for bond_idx in env_bonds:
            bond = molecule.GetBondWithIdx(bond_idx)
            env_atoms.add(bond.GetBeginAtomIdx())
            env_atoms.add(bond.GetEndAtomIdx())

        # only one atom is in this environment, such as O in H2O
        if not env_atoms:
            env_atoms = {atom_idx}

        fragment_smiles = Chem.MolFragmentToSmiles(
            molecule, atomsToUse=list(env_atoms),
            bondsToUse=env_bonds, canonical=True)

        signature[fragment_smiles] = signature.get(fragment_smiles, 0) + 1

    return signature
|
98 |
+
|
99 |
+
|
100 |
+
def decompse_novel_mets_rad1(novel_smiles, radius=1):
    """Decompose user-supplied metabolites into substructure counts.

    ``novel_smiles`` maps a compound id to a SMILES string. Each SMILES is
    parsed with RDKit, explicit hydrogens are removed, and the fragments are
    counted by ``count_substructures`` with the given ``radius`` (1 by
    default). Returns ``{compound_id: {fragment_smiles: count}}``.
    """
    return {
        cid: count_substructures(
            radius, Chem.RemoveHs(Chem.MolFromSmiles(smiles_pH7)))
        for cid, smiles_pH7 in novel_smiles.items()
    }
|
110 |
+
|
111 |
+
|
112 |
+
def decompse_novel_mets_rad2(novel_smiles, radius=2):
    """Decompose user-supplied metabolites into substructure counts.

    ``novel_smiles`` maps a compound id to a SMILES string. Each SMILES is
    parsed with RDKit, explicit hydrogens are removed, and the fragments are
    counted by ``count_substructures`` with the given ``radius`` (2 by
    default). Returns ``{compound_id: {fragment_smiles: count}}``.
    """
    return {
        cid: count_substructures(
            radius, Chem.RemoveHs(Chem.MolFromSmiles(smiles_pH7)))
        for cid, smiles_pH7 in novel_smiles.items()
    }
|
122 |
+
|
123 |
+
# def parse_rule(rxn,df_rule):
|
124 |
+
# df = df_rule
|
125 |
+
# rule_df = df[rxn].to_frame()
|
126 |
+
# # new_df = rule_df[(rule_df.T != 0).any()]
|
127 |
+
|
128 |
+
# return rule_df[(rule_df.T != 0).any()]
|
129 |
+
|
130 |
+
|
131 |
+
def parse_reaction_formula_side(s):
    """
    Parses one side of a reaction formula, e.g. '2 C00001 + C00002 + 3 C00003'

    Members are separated by '+' surrounded by whitespace. A member is either
    a bare compound id (coefficient 1) or '<coefficient> <compound id>'.
    Coefficients of a compound id that appears more than once are summed.

    Returns:
        A dict mapping each compound id to its coefficient on this side.
        The literal 'null' and the empty string give an empty dict.

    Raises:
        ValueError: if a member has two parts and the first is not a number.
    """
    if s.strip() == "null":
        return {}

    compound_bag = {}
    # raw string: '\s' and '\+' are regex escapes, not string escapes
    for member in re.split(r'\s+\+\s+', s):
        tokens = member.split(None, 1)
        if not tokens:
            continue
        if len(tokens) == 1:
            amount = 1
            # use the token, not the raw member, so padding around a lone
            # compound id does not end up in the key
            key = tokens[0]
        else:
            amount = float(tokens[0])
            # split(None, 1) keeps trailing whitespace on the remainder
            key = tokens[1].strip()

        compound_bag[key] = compound_bag.get(key, 0) + amount

    return compound_bag
|
157 |
+
|
158 |
+
|
159 |
+
def parse_formula(formula, arrow='<=>', rid=None):
    """
    Parses a two-sided formula such as: 2 C00001 <=> C00002 + C00003

    Args:
        formula: the reaction string.
        arrow: the token that separates the left side from the right side.
        rid: not used; kept so existing callers keep working.

    Return:
        A dict mapping each compound id to its net coefficient: amounts on
        the left side are subtracted, amounts on the right side are added.

    Raises:
        ValueError: if the formula does not contain the arrow. Earlier
        versions printed the same message and then failed with an
        IndexError.
    """
    tokens = formula.split(arrow)
    if len(tokens) < 2:
        raise ValueError('Reaction does not contain the arrow sign (%s): %s'
                         % (arrow, formula))
    if len(tokens) > 2:
        # best-effort, as before: report it and use only the first two sides
        print(('Reaction contains more than one arrow sign (%s): %s'
               % (arrow, formula)))

    left = tokens[0].strip()
    right = tokens[1].strip()

    sparse_reaction = {}
    for cid, count in parse_reaction_formula_side(left).items():
        sparse_reaction[cid] = sparse_reaction.get(cid, 0) - count

    for cid, count in parse_reaction_formula_side(right).items():
        sparse_reaction[cid] = sparse_reaction.get(cid, 0) + count

    return sparse_reaction
|
185 |
+
|
186 |
+
|
187 |
+
def draw_rxn_figure(rxn_dict, db_smiles, novel_smiles):
    """Draw the reaction as an RDKit image.

    Parameters
    ----------
    rxn_dict : dict
        compound id -> coefficient. Compounds with a positive coefficient are
        drawn on the right, all others on the left. C00080 and C00282 are not
        drawn.
    db_smiles : dict
        compound id -> SMILES for the known compounds; looked up first.
    novel_smiles : dict or None
        compound id -> SMILES for user-supplied compounds. None is accepted
        and treated as empty.

    Returns
    -------
    The image produced by ``Draw.ReactionToImage``.

    Raises
    ------
    KeyError
        if a compound has no SMILES in either table. Earlier versions failed
        with "'NoneType' object is not subscriptable" when ``novel_smiles``
        was None.
    """
    if novel_smiles is None:
        novel_smiles = {}

    left = []
    right = []
    for met, stoic in rxn_dict.items():
        if met == "C00080" or met == "C00282":
            continue  # hydrogen is not considered

        if met in db_smiles:
            smiles = db_smiles[met]
        elif met in novel_smiles:
            smiles = novel_smiles[met]
        else:
            raise KeyError('No SMILES available for metabolite %s' % met)

        if stoic > 0:
            right.append(smiles)
        else:
            left.append(smiles)

    # '.' separates the molecules on one side, '>>' separates the two sides
    smarts = '.'.join(left) + '>>' + '.'.join(right)
    rxn = Reactions.ReactionFromSmarts(smarts, useSmiles=True)
    return Draw.ReactionToImage(rxn)
|
211 |
+
|
212 |
+
# def draw_group_changes(rxn,df_rule):
|
213 |
+
# df = parse_rule(rxn,df_rule)
|
214 |
+
# group_dict = df.to_dict()[rxn]
|
215 |
+
|
216 |
+
# left = ''
|
217 |
+
# right = ''
|
218 |
+
|
219 |
+
# for smiles,stoic in group_dict.iteritems():
|
220 |
+
# if stoic > 0:
|
221 |
+
# right = right + smiles + '.'
|
222 |
+
# else:
|
223 |
+
# left = left + smiles + '.'
|
224 |
+
# smarts = left[:-1] + '>>' + right[:-1]
|
225 |
+
# rxn = Reactions.ReactionFromSmarts(smarts, useSmiles=True)
|
226 |
+
# return Draw.ReactionToImage(rxn)
|
227 |
+
|
228 |
+
# def get_rxn_rule(rid):
|
229 |
+
# reaction_dict = json.load(open('../data/optstoic_v3_Sji_dict.json'))
|
230 |
+
# molecular_signature = json.load(open('../data/decompose_vector_ac.json'))
|
231 |
+
# molsigna_df = pd.DataFrame.from_dict(molecular_signature).fillna(0)
|
232 |
+
# all_mets = molsigna_df.columns.tolist()
|
233 |
+
# all_mets.append("C00080")
|
234 |
+
# all_mets.append("C00282")
|
235 |
+
|
236 |
+
# rule_df = pd.DataFrame(index=molsigna_df.index)
|
237 |
+
|
238 |
+
# info = reaction_dict[rid]
|
239 |
+
|
240 |
+
# # skip the reactions with missing metabolites
|
241 |
+
# mets = info.keys()
|
242 |
+
# flag = False
|
243 |
+
# for met in mets:
|
244 |
+
# if met not in all_mets:
|
245 |
+
# flag = True
|
246 |
+
# break
|
247 |
+
# if flag:
|
248 |
+
# return None
|
249 |
+
|
250 |
+
# rule_df[rid] = 0
|
251 |
+
# for met, stoic in info.items():
|
252 |
+
# if met == "C00080" or met == "C00282":
|
253 |
+
# continue # hydogen is zero
|
254 |
+
# rule_df[rid] += molsigna_df[met] * stoic
|
255 |
+
# return rule_df
|
256 |
+
|
257 |
+
|
258 |
+
def _load_group_names(path):
    """Read the ordered group (moiety) names, one per line, from *path*."""
    with open(path) as handle:
        return handle.read().splitlines()


def _group_change_frame(rxn_dict, molsig, novel_decomposed, group_names_path):
    """Build the one-column ('change') group-change table for one radius."""
    # Work on a copy: the caller's table may be a shared, cached object and
    # must not accumulate the novel metabolites of earlier requests.
    signatures = dict(molsig)
    if novel_decomposed is not None:
        signatures.update(novel_decomposed)

    molsigna_df = pd.DataFrame.from_dict(signatures).fillna(0)
    # fix the row order (and row set) to the stored group-name list
    molsigna_df = molsigna_df.reindex(_load_group_names(group_names_path))

    rule_df = pd.DataFrame(index=molsigna_df.index)
    rule_df['change'] = 0
    for met, stoic in rxn_dict.items():
        if met == "C00080" or met == "C00282":
            continue  # hydrogen is zero
        rule_df['change'] += molsigna_df[met] * stoic
    return rule_df


def get_rule(rxn_dict, molsig1, molsig2, novel_decomposed1, novel_decomposed2):
    """Build the group-change feature vector of a reaction.

    Parameters
    ----------
    rxn_dict : dict
        compound id -> coefficient. C00080 and C00282 are ignored.
    molsig1, molsig2 : dict
        compound id -> {fragment_smiles: count} for the two signature
        tables. They are no longer modified by this function.
    novel_decomposed1, novel_decomposed2 : dict or None
        signatures of additional compounds that extend (or override) the
        corresponding table for this call only.

    Returns
    -------
    tuple
        ``(rule_comb, rule_df1, rule_df2)``. ``rule_df1`` / ``rule_df2`` hold
        the stoichiometry-weighted group changes in a 'change' column, with
        rows ordered by ``./data/group_names_r1.txt`` and
        ``./data/group_names_r2_py3_modified_manual.txt``. ``rule_comb`` is a
        single-row array: the first change vector, 44 zeros, the second
        change vector, 44 zeros.

    Raises
    ------
    KeyError
        if a compound of the reaction has no signature in a table.
    """
    rule_df1 = _group_change_frame(
        rxn_dict, molsig1, novel_decomposed1, './data/group_names_r1.txt')
    rule_df2 = _group_change_frame(
        rxn_dict, molsig2, novel_decomposed2,
        './data/group_names_r2_py3_modified_manual.txt')

    rule_vec1 = rule_df1.to_numpy().T
    rule_vec2 = rule_df2.to_numpy().T

    # NOTE(review): the 44 zero columns appended to each part presumably
    # stand for extra model features that are not set here - confirm against
    # the model training code.
    X1 = np.concatenate((rule_vec1, np.zeros((rule_vec1.shape[0], 44))), 1)
    X2 = np.concatenate((rule_vec2, np.zeros((rule_vec2.shape[0], 44))), 1)

    rule_comb = np.concatenate((X1, X2), 1)

    return rule_comb, rule_df1, rule_df2
|
325 |
+
|
326 |
+
|
327 |
+
def get_ddG0(rxn_dict, pH, I, novel_mets):
    """Sum the per-compound ``transform_pH7`` terms of a reaction.

    Parameters
    ----------
    rxn_dict : dict
        compound id -> coefficient.
    pH, I :
        passed unchanged to ``transform_pH7`` of every compound (the caller
        in this file supplies the pH and ionic-strength slider values).
    novel_mets : dict or None
        compound id -> compound object for compounds that are not fetched
        from the ``CompoundCacher``.

    Returns
    -------
    The sum over all compounds of ``coeff * comp.transform_pH7(pH, I, T)``
    with ``T = 298.15``.
    """
    ccache = CompoundCacher()
    T = 298.15
    ddG0_forward = 0
    for compound_id, coeff in rxn_dict.items():
        # identity test: None is a singleton (PEP 8)
        if novel_mets is not None and compound_id in novel_mets:
            comp = novel_mets[compound_id]
        else:
            comp = ccache.get_compound(compound_id)
        ddG0_forward += coeff * comp.transform_pH7(pH, I, T)

    return ddG0_forward
|
340 |
+
|
341 |
+
|
342 |
+
def get_dG0(rxn_dict, rid, pH, I, loaded_model, molsig_r1, molsig_r2, novel_decomposed_r1, novel_decomposed_r2, novel_mets):
    """Predict the reaction energy and return it with the group changes.

    The feature vector comes from ``get_rule``; ``loaded_model.predict`` is
    called on it with ``return_std=True``. The first predicted mean is
    corrected by ``get_ddG0(rxn_dict, pH, I, novel_mets)``. ``rid`` is not
    used.

    Returns
    -------
    tuple
        ``(corrected_mean, predicted_std, rule_df1, rule_df2)``.
    """
    features, rule_df1, rule_df2 = get_rule(
        rxn_dict, molsig_r1, molsig_r2,
        novel_decomposed_r1, novel_decomposed_r2)

    predicted_mean, predicted_std = loaded_model.predict(
        features, return_std=True)

    corrected_mean = predicted_mean[0] + get_ddG0(rxn_dict, pH, I, novel_mets)
    return corrected_mean, predicted_std[0], rule_df1, rule_df2
|
367 |
+
# return ymean[0],ystd[0]
|
368 |
+
|
369 |
+
|
370 |
+
def parse_novel_molecule(add_info):
    """Turn ``{compound_id: InChI}`` into ``{compound_id: Compound}``.

    Every compound is created with ``Compound.from_inchi('Test', cid, InChI)``.
    """
    return {
        cid: Compound.from_inchi('Test', cid, InChI)
        for cid, InChI in add_info.items()
    }
|
376 |
+
|
377 |
+
|
378 |
+
def parse_novel_smiles(result):
    """Map every compound id in ``result`` to its ``smiles_pH7`` attribute."""
    return {cid: compound.smiles_pH7 for cid, compound in result.items()}
|
384 |
+
|
385 |
+
|
386 |
+
def main():
    """Render the Streamlit page and run a prediction on "Search".

    The page shows the header image, a reaction text box, an optional text
    area for compounds that are not in KEGG (JSON ``{id: InChI}``) and the
    pH / ionic-strength sliders. After "Search" is pressed the reaction is
    drawn, the prediction from ``get_dG0`` is written out, and the non-zero
    rows of both group-change tables are displayed.
    """
    db_smiles = load_smiles()
    molsig_r1 = load_molsig_rad1()
    molsig_r2 = load_molsig_rad2()

    loaded_model = load_model()
    # the returned object is not used below; the call is kept as in the
    # original so the cached loader still runs
    load_compound_cache()

    st.image('./figures/header.png', use_column_width=True)

    st.subheader('Reaction (please use KEGG IDs)')

    rxn_str = st.text_input(
        '', value='C01745 + C00004 <=> N00001 + C00003 + C00001')

    has_novel_mets = st.checkbox('Reaction has metabolites not in KEGG')
    if not has_novel_mets:
        # placeholder entry; NOTE(review): parsing it presumably fails and
        # ends in the fallback branch below - confirm against Compound
        add_info = '{"None":"None"}'
    else:
        add_info = st.text_area('Additional information (id: InChI):',
                                '{"N00001":"InChI=1S/C14H12O/c15-14-8-4-7-13(11-14)10-9-12-5-2-1-3-6-12/h1-11,15H/b10-9+"}')

    pH = st.slider('pH', min_value=0.0, max_value=14.0, value=7.0, step=0.1)
    I = st.slider('Ionic strength [M]', min_value=0.0,
                  max_value=0.5, value=0.1, step=0.01)

    # nothing else to render until the user asks for a search
    if not st.button("Search"):
        return

    st.subheader('Reaction Equation')
    st.write(rxn_str)
    with st.spinner('Searching...'):
        try:
            novel_mets = parse_novel_molecule(json.loads(add_info))
            novel_smiles = parse_novel_smiles(novel_mets)
            novel_decomposed_r1 = decompse_novel_mets_rad1(novel_smiles)
            novel_decomposed_r2 = decompse_novel_mets_rad2(novel_smiles)
        except Exception:
            # any failure means: continue without additional compounds
            novel_mets = novel_smiles = None
            novel_decomposed_r1 = novel_decomposed_r2 = None
        print(novel_smiles)

        rxn_dict = parse_formula(rxn_str)
        reaction_image = draw_rxn_figure(rxn_dict, db_smiles, novel_smiles)
        st.image(reaction_image, use_column_width=True)

    st.subheader('Thermodynamics')
    with st.spinner('Calculating...'):
        mu, std, rule_df1, rule_df2 = get_dG0(
            rxn_dict, 'R00801', pH, I, loaded_model, molsig_r1, molsig_r2,
            novel_decomposed_r1, novel_decomposed_r2, novel_mets)
    st.write(r"$\Delta_r G'^{o} = %.2f \pm %.2f \ kJ/mol$" % (mu, std))

    st.text('Group changes:')
    # show only the groups whose count actually changes
    for rule_df in (rule_df1, rule_df2):
        st.write(rule_df[(rule_df.T != 0).any()])
|
475 |
+
|
476 |
+
|
477 |
+
# Build the page when this file is executed as a script.
if __name__ == '__main__':
    main()
|
mini_novoStoic.py
ADDED
@@ -0,0 +1,326 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import pulp
|
3 |
+
import pdb
|
4 |
+
import os
|
5 |
+
import json
|
6 |
+
from rdkit import Chem
|
7 |
+
|
8 |
+
# pulp_solver = pulp.solvers.CPLEX_CMD(path=None, keepFiles=0, mip=1, msg=1,
|
9 |
+
# options=['mip tolerances mipgap 0', 'mip tolerances absmipgap 0',
|
10 |
+
# 'mip tolerances integrality 0', 'simplex tolerances optimality 1E-9',
|
11 |
+
# 'simplex tolerances feasibility 1E-9',], timelimit=1200)
|
12 |
+
|
13 |
+
def count_substructures(radius, molecule):
    """Tally the atom-centred substructures of a molecule.

    Every atom is taken in turn as a centre. The bonds lying within
    ``radius`` bonds of that centre are collected, the fragment they span is
    written out as a canonical SMILES string, and the number of centres that
    produce each string is counted. The resulting counts are the (relaxed)
    molecular signature of the molecule.

    Parameters
    ----------
    radius : int
        bond distance that defines how many neighbouring atoms are included
        around each centre atom.
    molecule : Molecule
        a molecule object created by RDKit (e.g. Chem.MolFromInchi(inchi_code)
        or Chem.MolFromSmiles(smiles_code))

    Returns
    -------
    dict
        {fragment_smiles: number of atoms whose environment is that fragment}
    """
    counts = {}

    for center in range(molecule.GetNumAtoms()):
        bond_ids = Chem.FindAtomEnvironmentOfRadiusN(molecule, radius, center)

        members = set()
        for bond_id in bond_ids:
            bond = molecule.GetBondWithIdx(bond_id)
            members.update((bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()))

        # no bond in the environment: the fragment is the lone centre atom,
        # such as O in H2O
        if not members:
            members = {center}

        fragment = Chem.MolFragmentToSmiles(
            molecule, atomsToUse=list(members), bondsToUse=bond_ids,
            canonical=True)

        counts[fragment] = counts.get(fragment, 0) + 1

    return counts
|
54 |
+
|
55 |
+
def novoStoic_minFlux_relaxedRule(exchange_mets, novel_mets, project, iterations,
                                  pulp_solver, use_direction, data_dir='./data'):
    """Search for relaxed reaction rules that bridge source and sink metabolites.

    A MILP is assembled in which every reaction rule has an integer flux in
    [-M, M]. The summed moiety change of the selected rules must equal the
    moiety change implied by the fixed exchange stoichiometry, the total
    absolute rule flux is minimised (and pinned to 2), and the problem is then
    passed to ``integer_cuts`` to enumerate alternative solutions.

    - rePrime procedure is more similar to a morgan fingerprints
    - the relaxed rule is generated from substructures without considering the
      bond that connect the atoms at the edge of the substructure to the rest
      of the molecules

    Parameters
    ----------
    exchange_mets : dict
        overall stoichiometry of source and sink metabolites, {met: stoic,...}
        This is an important input for novoStoic to run correctly because the
        method requires that overall moieties are balanced.
    novel_mets : dict
        {metabolite id: SMILES} for metabolites that are not present in the
        molecular-signature file; each one is decomposed on the fly with
        ``count_substructures`` at radius 1.
    project : string
        directory (created when missing) in which the solution file
        ``solution_use_direction.txt`` / ``solution_no_direction.txt`` is
        written.
    iterations : int
        the number of iterations of searching for alternative solutions
    pulp_solver : object
        PuLP solver instance handed to ``LpProblem.solve``.
    use_direction : bool
        when true, rules marked backward (1) or forward (2) in direction.csv
        are restricted to non-positive or non-negative flux respectively;
        otherwise every rule may run both ways.
    data_dir : string, optional
        directory holding ``decompose_vector_ac.json``,
        ``relaxed_rule_noduplic.csv`` and ``direction.csv``
        (default ``'./data'``).

    Returns
    -------
    None
        all the outputs are saved in the project folder.

    Raises
    ------
    ValueError
        if the SMILES string of a novel metabolite cannot be parsed by RDKit.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs()
    os.makedirs(project, exist_ok=True)

    # the maximum flux of a reaction
    M = 2

    # read the molecular signatures and reaction rules
    with open(os.path.join(data_dir, 'decompose_vector_ac.json')) as handle:
        molecular_signature = json.load(handle)
    molsigs = pd.DataFrame.from_dict(molecular_signature).fillna(0)

    rules = pd.read_csv(
        os.path.join(data_dir, "relaxed_rule_noduplic.csv"), index_col=0
    )

    ###### sets ############
    moiety_index = rules.index.tolist()  # moiety sets
    rules_index = rules.columns.values.tolist()
    print("Number of rules used in this search:", len(rules_index))

    exchange_index = exchange_mets.keys()

    ###### parameters ######
    # T(m,r) contains atom stoichiometry for each rule
    T = rules.to_dict(orient="index")

    # C(m,i) contains moiety cardinality for each metabolite
    C = molsigs.to_dict(orient="index")
    # C00080 and C00282 are given a zero count for every moiety
    for m in moiety_index:
        C[m]["C00080"] = 0
        C[m]["C00282"] = 0

    # add metabolites that are not present in current database
    for met, smiles in novel_mets.items():
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # MolFromSmiles signals a parse failure by returning None
            raise ValueError(
                "Cannot parse SMILES %r of novel metabolite %r" % (smiles, met))
        mol = Chem.RemoveHs(mol)
        novel_signature = count_substructures(1, mol)

        for m in moiety_index:
            C[m][met] = novel_signature.get(m, 0)

    ###### variables ######
    v_rule = pulp.LpVariable.dicts(
        "v_rule", rules_index, lowBound=-M, upBound=M, cat="Integer"
    )
    # v_rule_obj[j] >= |v_rule[j]|, used to minimise absolute flux
    v_rule_obj = pulp.LpVariable.dicts(
        "v_rule_obj", rules_index, lowBound=0, upBound=M, cat="Continuous"
    )

    v_EX = pulp.LpVariable.dicts(
        "v_EX", exchange_index, lowBound=-M, upBound=M, cat="Continuous"
    )
    # y_rule[j] = 1 allows rule j to carry flux; also used by the integer cuts
    y_rule = pulp.LpVariable.dicts(
        "y", rules_index, lowBound=0, upBound=1, cat="Binary"
    )

    # create MILP problem
    lp_prob = pulp.LpProblem("novoStoic", pulp.LpMinimize)

    ####### objective function ####
    lp_prob += pulp.lpSum([v_rule_obj[j] for j in rules_index])

    ####### constraints ####
    # constraint 1: moiety change balance
    for position, m in enumerate(moiety_index):
        lp_prob += (
            pulp.lpSum([T[m][r] * v_rule[r] for r in rules_index if T[m][r] != 0])
            == pulp.lpSum([C[m][i] * v_EX[i] for i in exchange_index if C[m][i] != 0]),
            "moiety_balance_" + str(position),
        )

    # constraint 2: constraint for exchange reactions
    for i, stoic in exchange_mets.items():
        lp_prob += v_EX[i] == stoic, "exchange" + i

    # constraint 3: control the number of rules
    direction_df = pd.read_csv(
        os.path.join(data_dir, "direction.csv"), index_col=0
    )
    direction_df.index = direction_df['reaction']

    # direction: 0-reversible, 1-backward, 2-forward
    direction = direction_df['direction'].to_dict()

    if use_direction:
        soln_file = os.path.join(project, "solution_use_direction.txt")
        for j in rules_index:
            rule_direction = direction[j]
            if rule_direction == 0:
                lp_prob += v_rule[j] >= y_rule[j] * -M, "cons1_%s" % j
                lp_prob += v_rule[j] <= y_rule[j] * M, "cons2_%s" % j
            elif rule_direction == 1:
                lp_prob += v_rule[j] >= y_rule[j] * -M, "cons1_%s" % j
                lp_prob += v_rule[j] <= 0, "cons2_%s" % j
            elif rule_direction == 2:
                lp_prob += v_rule[j] >= 0, "cons1_%s" % j
                lp_prob += v_rule[j] <= y_rule[j] * M, "cons2_%s" % j
    else:
        soln_file = os.path.join(project, "solution_no_direction.txt")
        for j in rules_index:
            lp_prob += v_rule[j] >= y_rule[j] * -M, "cons1_%s" % j
            lp_prob += v_rule[j] <= y_rule[j] * M, "cons2_%s" % j

    # linearised absolute value: v_rule_obj[j] >= |v_rule[j]|
    for j in rules_index:
        lp_prob += v_rule_obj[j] >= v_rule[j]
        lp_prob += v_rule_obj[j] >= -v_rule[j]

    # constraint 5: customized constraints
    # the number of steps of the pathway
    lp_prob += pulp.lpSum([v_rule_obj[j] for j in rules_index]) == 2

    ### solve
    integer_cuts(lp_prob, pulp_solver, iterations, rules_index, y_rule, v_rule,
                 soln_file, direction)
|
212 |
+
|
213 |
+
def integer_cuts(lp_prob, pulp_solver, iterations, rules_index, y_rule, v_rule,
                 soln_file, direction):
    """Enumerate alternative MILP solutions by adding integer cut constraints.

    The problem is solved up to ``iterations`` times. After every optimal
    solve the rules carrying flux (|flux| >= 0.1) are printed and appended to
    ``soln_file``, and a cut is added that forbids that exact set of rules
    from all being active again, so the next solve must return a different
    solution. The loop stops early when the solver no longer reports an
    optimal status or when a solution uses no rule at all.

    Reference: Optimization Methods in Metabolic Networks By Costas D. Maranas,
    Ali R. Zomorrodi, Chapter 4.2.2 Finding alternative optimal integer
    solutions

    Parameters
    ----------
    lp_prob : pulp.LpProblem
        the problem to solve; it is modified in place (cuts are added).
    pulp_solver : object
        PuLP solver instance handed to ``lp_prob.solve``.
    iterations : int
        maximum number of solutions to look for.
    rules_index : list
        identifiers of the reaction rules (keys of ``y_rule`` and ``v_rule``).
    y_rule : dict
        binary activity variable per rule, used to build the cuts.
    v_rule : dict
        flux variable per rule.
    soln_file : string
        path of the text file the solutions are appended to.
    direction : dict
        {rule: 0 reversible, 1 backward, 2 forward}; only used to flag rules
        that run against their annotated direction. Rules missing from the
        mapping are simply not flagged.

    Returns
    -------
    None
    """
    for sol_num in range(1, iterations + 1):
        # optional output: lp file for debug
        lp_prob.writeLP('./test.lp')
        lp_prob.solve(pulp_solver)

        status = pulp.LpStatus[lp_prob.status]
        print("Status:", status)

        if status != 'Optimal':
            break

        print('-----------rules--------------')
        integer_cut_rules = []

        # one handle per iteration instead of reopening the file for each rule
        with open(soln_file, 'a') as f:
            f.write('iteration,' + str(sol_num) + '\n')

            for r in rules_index:
                flux = v_rule[r].varValue
                # varValue is None for a variable the solver did not report
                if flux is None or not abs(flux) >= 0.1:
                    continue

                dG_info = ''
                # .get(): a rule absent from the direction table is not an error
                rule_direction = direction.get(r)
                if (flux > 0 and rule_direction == 1) or (flux < 0 and rule_direction == 2):
                    dG_info = ' * Thermodynamically infeasible'
                    print("##### Found ####: " + str(r) + dG_info)

                integer_cut_rules.append(r)
                print(r, flux)
                f.write(str(r) + ',' + str(flux) + dG_info + '\n')

        if not integer_cut_rules:
            # nothing to exclude; a cut "0 <= -1" would only make the
            # problem infeasible, so stop here
            break

        # forbid this exact combination: at least one of its rules must be off
        lp_prob += (
            pulp.lpSum([y_rule[r] for r in integer_cut_rules])
            <= len(integer_cut_rules) - 1,
            "integer_cut_" + str(sol_num),
        )
|
271 |
+
|
272 |
+
|
273 |
+
def test_bdo():
    """Run the rule search for succinyl-CoA -> 1,4-butanediol.

    The search is executed twice with the same inputs: first with the
    direction constraints enabled, then with them disabled. Results are
    written under ``./novoStoic_result``.
    """
    # 1,4-butanediol is not in the database, so it is supplied as SMILES
    novel_mets = {'14bdo': 'OCCCCO'}

    exchange_mets = {
        'C00091': -1,  # Succinyl-CoA
        'C00004': -4,  # NADH
        'C00003': 4,   # NAD+
        'C00010': 1,   # coa
        'C00001': 1,   # h2O
        '14bdo': 1,
    }

    project = './novoStoic_result'
    iterations = 50

    # CPLEX is looked up on the PATH; pass path=<cplex binary> to
    # pulp.CPLEX_CMD, or swap in another PuLP solver, when it lives elsewhere
    pulp_solver = pulp.CPLEX_CMD(path=None, keepFiles=0, mip=1, msg=1)

    for use_direction in (True, False):
        novoStoic_minFlux_relaxedRule(
            exchange_mets, novel_mets, project, iterations, pulp_solver,
            use_direction)
|
299 |
+
|
300 |
+
|
301 |
+
def test_isovalarate():
    """Run the rule search for 2-keto isovalarate -> isobutanol.

    All metabolites are database compounds, so no novel metabolite is
    supplied. Only the search without direction constraints is executed;
    results are written under ``./novoStoic_isovalarate``.
    """
    novel_mets = {}

    exchange_mets = {
        'C00141': -1,  # 2-keto isovalarate
        'C00004': -1,  # NADH
        'C00003': 1,   # NAD+
        "C14710": 1,   # isobutanol C4H10O
        'C00011': 1,   # co2
    }

    project = './novoStoic_isovalarate'
    iterations = 50

    # CPLEX is looked up on the PATH; pass path=<cplex binary> to
    # pulp.CPLEX_CMD, or swap in another PuLP solver, when it lives elsewhere
    pulp_solver = pulp.CPLEX_CMD(path=None, keepFiles=0, mip=1, msg=1)

    # the direction-constrained search is intentionally not run here
    novoStoic_minFlux_relaxedRule(
        exchange_mets, novel_mets, project, iterations, pulp_solver,
        use_direction=False)
|
324 |
+
|
325 |
+
if __name__ == '__main__':
|
326 |
+
test_isovalarate()
|
model_gen.py
ADDED
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from scipy.io import savemat, loadmat
|
2 |
+
import pandas as pd
|
3 |
+
import pdb
|
4 |
+
import json
|
5 |
+
import numpy as np
|
6 |
+
from numpy import median, mean
|
7 |
+
from sklearn.linear_model import BayesianRidge, LinearRegression, RidgeCV, Ridge
|
8 |
+
from sklearn.neural_network import MLPRegressor
|
9 |
+
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
|
10 |
+
from sklearn.model_selection import cross_val_score, LeaveOneOut
|
11 |
+
import joblib
|
12 |
+
import pickle
|
13 |
+
import matplotlib.pyplot as plt
|
14 |
+
import sys
|
15 |
+
import os.path
|
16 |
+
import glob, os
|
17 |
+
import openbabel
|
18 |
+
from IPython.display import clear_output
|
19 |
+
import timeit
|
20 |
+
|
21 |
+
|
22 |
+
# Fit a Bayesian ridge regression on the combined (radius 1 + radius 2) group
# matrix, report its fit on the training data, then time how long the model
# takes to dump to and reload from disk.
ac = loadmat('./data/Test_KEGG_all_grp.mat')

y = ac['y'].flatten()

# not used below; kept in case other code imports it from this module
alphas = np.logspace(-6, 6, 200)

Xrc = ac['X_comb_all']
regr_rcombined = BayesianRidge(tol=1e-6, fit_intercept=False, compute_score=True).fit(Xrc, y)

# scores are computed on the data the model was fitted to
y_pred_rc = regr_rcombined.predict(Xrc)
mse_rc = mean_squared_error(y, y_pred_rc)
r2 = r2_score(y, y_pred_rc)

print('radius 1+2 linear model')
print('Mean squared error: %.2f'
      % mse_rc)
print('Coefficient of determination: %.4f'
      % r2)

filename = './model/M12_model_BR.pkl'

# time the dump
s0 = timeit.default_timer()
joblib.dump(regr_rcombined, filename, compress=3)
s1 = timeit.default_timer()
print(s1 - s0)

# time the reload; the with-block closes the handle that the bare
# joblib.load(open(...)) call used to leak
s0 = timeit.default_timer()
with open(filename, 'rb') as model_file:
    loaded_model = joblib.load(model_file)
s1 = timeit.default_timer()
print(s1 - s0)
print('==================================')
|
reaction_rule_2_gen.py
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import pdb
|
3 |
+
import json
|
4 |
+
from rdkit import Chem
|
5 |
+
|
6 |
+
# Build the radius-2 reaction-rule matrix: for every reaction, the
# stoichiometry-weighted sum of the molecular signatures of its metabolites.
# Rows are moieties, columns are reaction ids.
with open('./data/optstoic_v3_Sji_dict.json') as handle:
    reaction_dict = json.load(handle)
with open('./data/decompose_vector_ac_r2_py3_indent_modified_manual.json') as handle:
    molecular_signature = json.load(handle)

molsigna_df = pd.DataFrame.from_dict(molecular_signature).fillna(0)
all_mets = molsigna_df.columns.tolist()
all_mets.append("C00080")
all_mets.append("C00282")
# set gives O(1) membership tests for the per-reaction check below
known_mets = set(all_mets)

# columns are collected first and assembled once, which avoids inserting
# thousands of columns one by one into a DataFrame
rule_columns = {}
for rid, value in reaction_dict.items():
    # skip the reactions with missing metabolites
    if not known_mets.issuperset(value):
        continue

    change = pd.Series(0, index=molsigna_df.index)
    for met, stoic in value.items():
        if met == "C00080" or met == "C00282":
            continue  # hydrogen is zero
        change = change + molsigna_df[met] * stoic
    rule_columns[rid] = change

rule_df = pd.DataFrame(rule_columns, index=molsigna_df.index)
rule_df.to_csv("./data/reaction_rule_r2_py3_manual_modified.csv", index=True)
|
retrieve_bulk.ipynb
ADDED
@@ -0,0 +1,660 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 1,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [
|
8 |
+
{
|
9 |
+
"name": "stderr",
|
10 |
+
"output_type": "stream",
|
11 |
+
"text": [
|
12 |
+
"2021-08-13 17:29:46.477 INFO rdkit: Enabling RDKit 2021.03.4 jupyter extensions\n"
|
13 |
+
]
|
14 |
+
}
|
15 |
+
],
|
16 |
+
"source": [
|
17 |
+
"import streamlit as st\n",
|
18 |
+
"import pandas as pd\n",
|
19 |
+
"import numpy as np\n",
|
20 |
+
"import re\n",
|
21 |
+
"from PIL import Image\n",
|
22 |
+
"import webbrowser\n",
|
23 |
+
"import json\n",
|
24 |
+
"import pickle\n",
|
25 |
+
"import sys \n",
|
26 |
+
"import joblib\n",
|
27 |
+
"\n",
|
28 |
+
"sys.path.append('./CC/')\n",
|
29 |
+
"\n",
|
30 |
+
"import chemaxon\n",
|
31 |
+
"from chemaxon import *\n",
|
32 |
+
"from compound import Compound\n",
|
33 |
+
"from compound_cacher import CompoundCacher\n",
|
34 |
+
"from rdkit.Chem import rdChemReactions as Reactions\n",
|
35 |
+
"from rdkit.Chem import Draw\n",
|
36 |
+
"from rdkit import Chem"
|
37 |
+
]
|
38 |
+
},
|
39 |
+
{
|
40 |
+
"cell_type": "code",
|
41 |
+
"execution_count": 2,
|
42 |
+
"metadata": {},
|
43 |
+
"outputs": [],
|
44 |
+
"source": [
|
45 |
+
"def load_smiles():\n",
|
46 |
+
" db = pd.read_csv('./data/cache_compounds_20160818.csv',\n",
|
47 |
+
" index_col='compound_id')\n",
|
48 |
+
" db_smiles = db['smiles_pH7'].to_dict()\n",
|
49 |
+
" return db_smiles\n",
|
50 |
+
"\n",
|
51 |
+
"def load_molsig_rad1():\n",
|
52 |
+
" molecular_signature_r1 = json.load(open('./data/decompose_vector_ac.json'))\n",
|
53 |
+
" return molecular_signature_r1\n",
|
54 |
+
"\n",
|
55 |
+
"\n",
|
56 |
+
"def load_molsig_rad2():\n",
|
57 |
+
" molecular_signature_r2 = json.load(\n",
|
58 |
+
" open('./data/decompose_vector_ac_r2_py3_indent_modified_manual.json'))\n",
|
59 |
+
" return molecular_signature_r2\n",
|
60 |
+
"\n",
|
61 |
+
"\n",
|
62 |
+
"def load_model():\n",
|
63 |
+
" filename = './model/M12_model_BR.pkl'\n",
|
64 |
+
" loaded_model = joblib.load(open(filename, 'rb'))\n",
|
65 |
+
" return loaded_model\n",
|
66 |
+
"\n",
|
67 |
+
"\n",
|
68 |
+
"def load_compound_cache():\n",
|
69 |
+
" ccache = CompoundCacher()\n",
|
70 |
+
" return ccache\n"
|
71 |
+
]
|
72 |
+
},
|
73 |
+
{
|
74 |
+
"cell_type": "code",
|
75 |
+
"execution_count": 3,
|
76 |
+
"metadata": {},
|
77 |
+
"outputs": [],
|
78 |
+
"source": [
|
79 |
+
"def count_substructures(radius, molecule):\n",
|
80 |
+
" \"\"\"Helper function for get the information of molecular signature of a\n",
|
81 |
+
" metabolite. The relaxed signature requires the number of each substructure\n",
|
82 |
+
" to construct a matrix for each molecule.\n",
|
83 |
+
" Parameters\n",
|
84 |
+
" ----------\n",
|
85 |
+
" radius : int\n",
|
86 |
+
" the radius is bond-distance that defines how many neighbor atoms should\n",
|
87 |
+
" be considered in a reaction center.\n",
|
88 |
+
" molecule : Molecule\n",
|
89 |
+
" a molecule object create by RDkit (e.g. Chem.MolFromInchi(inchi_code)\n",
|
90 |
+
"        or Chem.MolFromSmiles(smiles_code))\n",
|
91 |
+
" Returns\n",
|
92 |
+
" -------\n",
|
93 |
+
" dict\n",
|
94 |
+
" dictionary of molecular signature for a molecule,\n",
|
95 |
+
" {smiles: molecular_signature}\n",
|
96 |
+
" \"\"\"\n",
|
97 |
+
" m = molecule\n",
|
98 |
+
" smi_count = dict()\n",
|
99 |
+
" atomList = [atom for atom in m.GetAtoms()]\n",
|
100 |
+
"\n",
|
101 |
+
" for i in range(len(atomList)):\n",
|
102 |
+
" env = Chem.FindAtomEnvironmentOfRadiusN(m, radius, i)\n",
|
103 |
+
" atoms = set()\n",
|
104 |
+
" for bidx in env:\n",
|
105 |
+
" atoms.add(m.GetBondWithIdx(bidx).GetBeginAtomIdx())\n",
|
106 |
+
" atoms.add(m.GetBondWithIdx(bidx).GetEndAtomIdx())\n",
|
107 |
+
"\n",
|
108 |
+
" # only one atom is in this environment, such as O in H2O\n",
|
109 |
+
" if len(atoms) == 0:\n",
|
110 |
+
" atoms = {i}\n",
|
111 |
+
"\n",
|
112 |
+
" smi = Chem.MolFragmentToSmiles(m, atomsToUse=list(atoms),\n",
|
113 |
+
" bondsToUse=env, canonical=True)\n",
|
114 |
+
"\n",
|
115 |
+
" if smi in smi_count:\n",
|
116 |
+
" smi_count[smi] = smi_count[smi] + 1\n",
|
117 |
+
" else:\n",
|
118 |
+
" smi_count[smi] = 1\n",
|
119 |
+
" return smi_count\n"
|
120 |
+
]
|
121 |
+
},
|
122 |
+
{
|
123 |
+
"cell_type": "code",
|
124 |
+
"execution_count": 4,
|
125 |
+
"metadata": {},
|
126 |
+
"outputs": [],
|
127 |
+
"source": [
|
128 |
+
"def decompse_novel_mets_rad1(novel_smiles, radius=1):\n",
|
129 |
+
" decompose_vector = dict()\n",
|
130 |
+
"\n",
|
131 |
+
" for cid, smiles_pH7 in novel_smiles.items():\n",
|
132 |
+
" mol = Chem.MolFromSmiles(smiles_pH7)\n",
|
133 |
+
" mol = Chem.RemoveHs(mol)\n",
|
134 |
+
" # Chem.RemoveStereochemistry(mol)\n",
|
135 |
+
" smi_count = count_substructures(radius, mol)\n",
|
136 |
+
" decompose_vector[cid] = smi_count\n",
|
137 |
+
" return decompose_vector\n",
|
138 |
+
"\n",
|
139 |
+
"\n",
|
140 |
+
"def decompse_novel_mets_rad2(novel_smiles, radius=2):\n",
|
141 |
+
" decompose_vector = dict()\n",
|
142 |
+
"\n",
|
143 |
+
" for cid, smiles_pH7 in novel_smiles.items():\n",
|
144 |
+
" mol = Chem.MolFromSmiles(smiles_pH7)\n",
|
145 |
+
" mol = Chem.RemoveHs(mol)\n",
|
146 |
+
" # Chem.RemoveStereochemistry(mol)\n",
|
147 |
+
" smi_count = count_substructures(radius, mol)\n",
|
148 |
+
" decompose_vector[cid] = smi_count\n",
|
149 |
+
" return decompose_vector\n"
|
150 |
+
]
|
151 |
+
},
|
152 |
+
{
|
153 |
+
"cell_type": "code",
|
154 |
+
"execution_count": 5,
|
155 |
+
"metadata": {},
|
156 |
+
"outputs": [],
|
157 |
+
"source": [
|
158 |
+
"def parse_reaction_formula_side(s):\n",
|
159 |
+
" \"\"\"\n",
|
160 |
+
" Parses the side formula, e.g. '2 C00001 + C00002 + 3 C00003'\n",
|
161 |
+
" Ignores stoichiometry.\n",
|
162 |
+
"\n",
|
163 |
+
" Returns:\n",
|
164 |
+
" The set of CIDs.\n",
|
165 |
+
" \"\"\"\n",
|
166 |
+
" if s.strip() == \"null\":\n",
|
167 |
+
" return {}\n",
|
168 |
+
"\n",
|
169 |
+
" compound_bag = {}\n",
|
170 |
+
" for member in re.split('\\s+\\+\\s+', s):\n",
|
171 |
+
" tokens = member.split(None, 1)\n",
|
172 |
+
" if len(tokens) == 0:\n",
|
173 |
+
" continue\n",
|
174 |
+
" if len(tokens) == 1:\n",
|
175 |
+
" amount = 1\n",
|
176 |
+
" key = member\n",
|
177 |
+
" else:\n",
|
178 |
+
" amount = float(tokens[0])\n",
|
179 |
+
" key = tokens[1]\n",
|
180 |
+
"\n",
|
181 |
+
" compound_bag[key] = compound_bag.get(key, 0) + amount\n",
|
182 |
+
"\n",
|
183 |
+
" return compound_bag\n",
|
184 |
+
"\n",
|
185 |
+
"\n",
|
186 |
+
"def parse_formula(formula, arrow='<=>', rid=None):\n",
|
187 |
+
" \"\"\"\n",
|
188 |
+
" Parses a two-sided formula such as: 2 C00001 => C00002 + C00003\n",
|
189 |
+
"\n",
|
190 |
+
" Return:\n",
|
191 |
+
" The set of substrates, products and the direction of the reaction\n",
|
192 |
+
" \"\"\"\n",
|
193 |
+
" tokens = formula.split(arrow)\n",
|
194 |
+
" if len(tokens) < 2:\n",
|
195 |
+
" print(('Reaction does not contain the arrow sign (%s): %s'\n",
|
196 |
+
" % (arrow, formula)))\n",
|
197 |
+
" if len(tokens) > 2:\n",
|
198 |
+
" print(('Reaction contains more than one arrow sign (%s): %s'\n",
|
199 |
+
" % (arrow, formula)))\n",
|
200 |
+
"\n",
|
201 |
+
" left = tokens[0].strip()\n",
|
202 |
+
" right = tokens[1].strip()\n",
|
203 |
+
"\n",
|
204 |
+
" sparse_reaction = {}\n",
|
205 |
+
" for cid, count in parse_reaction_formula_side(left).items():\n",
|
206 |
+
" sparse_reaction[cid] = sparse_reaction.get(cid, 0) - count\n",
|
207 |
+
"\n",
|
208 |
+
" for cid, count in parse_reaction_formula_side(right).items():\n",
|
209 |
+
" sparse_reaction[cid] = sparse_reaction.get(cid, 0) + count\n",
|
210 |
+
"\n",
|
211 |
+
" return sparse_reaction\n"
|
212 |
+
]
|
213 |
+
},
|
214 |
+
{
|
215 |
+
"cell_type": "code",
|
216 |
+
"execution_count": 6,
|
217 |
+
"metadata": {},
|
218 |
+
"outputs": [],
|
219 |
+
"source": [
|
220 |
+
"def draw_rxn_figure(rxn_dict, db_smiles, novel_smiles):\n",
|
221 |
+
" # db_smiles = load_smiles()\n",
|
222 |
+
"\n",
|
223 |
+
" left = ''\n",
|
224 |
+
" right = ''\n",
|
225 |
+
"\n",
|
226 |
+
" for met, stoic in rxn_dict.items():\n",
|
227 |
+
" if met == \"C00080\" or met == \"C00282\":\n",
|
228 |
+
"            continue  # hydrogen is not considered\n",
|
229 |
+
" if stoic > 0:\n",
|
230 |
+
" if met in db_smiles:\n",
|
231 |
+
" right = right + db_smiles[met] + '.'\n",
|
232 |
+
" else:\n",
|
233 |
+
" right = right + novel_smiles[met] + '.'\n",
|
234 |
+
" else:\n",
|
235 |
+
" if met in db_smiles:\n",
|
236 |
+
" left = left + db_smiles[met] + '.'\n",
|
237 |
+
" else:\n",
|
238 |
+
" left = left + novel_smiles[met] + '.'\n",
|
239 |
+
" smarts = left[:-1] + '>>' + right[:-1]\n",
|
240 |
+
" # print smarts\n",
|
241 |
+
" smarts = str(smarts)\n",
|
242 |
+
" rxn = Reactions.ReactionFromSmarts(smarts, useSmiles=True)\n",
|
243 |
+
" return Draw.ReactionToImage(rxn) # , subImgSize=(400, 400))"
|
244 |
+
]
|
245 |
+
},
|
246 |
+
{
|
247 |
+
"cell_type": "code",
|
248 |
+
"execution_count": 7,
|
249 |
+
"metadata": {},
|
250 |
+
"outputs": [],
|
251 |
+
"source": [
|
252 |
+
"def get_rule(rxn_dict, molsig1, molsig2, novel_decomposed1, novel_decomposed2):\n",
|
253 |
+
" if novel_decomposed1 != None:\n",
|
254 |
+
" for cid in novel_decomposed1:\n",
|
255 |
+
" molsig1[cid] = novel_decomposed1[cid]\n",
|
256 |
+
" if novel_decomposed2 != None:\n",
|
257 |
+
" for cid in novel_decomposed2:\n",
|
258 |
+
" molsig2[cid] = novel_decomposed2[cid]\n",
|
259 |
+
"\n",
|
260 |
+
" molsigna_df1 = pd.DataFrame.from_dict(molsig1).fillna(0)\n",
|
261 |
+
" all_mets1 = molsigna_df1.columns.tolist()\n",
|
262 |
+
" all_mets1.append(\"C00080\")\n",
|
263 |
+
" all_mets1.append(\"C00282\")\n",
|
264 |
+
"\n",
|
265 |
+
" molsigna_df2 = pd.DataFrame.from_dict(molsig2).fillna(0)\n",
|
266 |
+
" all_mets2 = molsigna_df2.columns.tolist()\n",
|
267 |
+
" all_mets2.append(\"C00080\")\n",
|
268 |
+
" all_mets2.append(\"C00282\")\n",
|
269 |
+
"\n",
|
270 |
+
" moieties_r1 = open('./data/group_names_r1.txt')\n",
|
271 |
+
" moieties_r2 = open('./data/group_names_r2_py3_modified_manual.txt')\n",
|
272 |
+
" moie_r1 = moieties_r1.read().splitlines()\n",
|
273 |
+
" moie_r2 = moieties_r2.read().splitlines()\n",
|
274 |
+
"\n",
|
275 |
+
" molsigna_df1 = molsigna_df1.reindex(moie_r1)\n",
|
276 |
+
" molsigna_df2 = molsigna_df2.reindex(moie_r2)\n",
|
277 |
+
"\n",
|
278 |
+
" rule_df1 = pd.DataFrame(index=molsigna_df1.index)\n",
|
279 |
+
" rule_df2 = pd.DataFrame(index=molsigna_df2.index)\n",
|
280 |
+
" # for rid, value in reaction_dict.items():\n",
|
281 |
+
" # # skip the reactions with missing metabolites\n",
|
282 |
+
" # mets = value.keys()\n",
|
283 |
+
" # flag = False\n",
|
284 |
+
" # for met in mets:\n",
|
285 |
+
" # if met not in all_mets:\n",
|
286 |
+
" # flag = True\n",
|
287 |
+
" # break\n",
|
288 |
+
" # if flag: continue\n",
|
289 |
+
"\n",
|
290 |
+
" rule_df1['change'] = 0\n",
|
291 |
+
" for met, stoic in rxn_dict.items():\n",
|
292 |
+
" if met == \"C00080\" or met == \"C00282\":\n",
|
293 |
+
"            continue  # hydrogen is zero\n",
|
294 |
+
" rule_df1['change'] += molsigna_df1[met] * stoic\n",
|
295 |
+
"\n",
|
296 |
+
" rule_df2['change'] = 0\n",
|
297 |
+
" for met, stoic in rxn_dict.items():\n",
|
298 |
+
" if met == \"C00080\" or met == \"C00282\":\n",
|
299 |
+
"            continue  # hydrogen is zero\n",
|
300 |
+
" rule_df2['change'] += molsigna_df2[met] * stoic\n",
|
301 |
+
"\n",
|
302 |
+
" rule_vec1 = rule_df1.to_numpy().T\n",
|
303 |
+
" rule_vec2 = rule_df2.to_numpy().T\n",
|
304 |
+
"\n",
|
305 |
+
" m1, n1 = rule_vec1.shape\n",
|
306 |
+
" m2, n2 = rule_vec2.shape\n",
|
307 |
+
"\n",
|
308 |
+
" zeros1 = np.zeros((m1, 44))\n",
|
309 |
+
" zeros2 = np.zeros((m2, 44))\n",
|
310 |
+
" X1 = np.concatenate((rule_vec1, zeros1), 1)\n",
|
311 |
+
" X2 = np.concatenate((rule_vec2, zeros2), 1)\n",
|
312 |
+
"\n",
|
313 |
+
" rule_comb = np.concatenate((X1, X2), 1)\n",
|
314 |
+
"\n",
|
315 |
+
" # rule_df_final = {}\n",
|
316 |
+
" # rule_df_final['rad1'] = rule_df1\n",
|
317 |
+
" # rule_df_final['rad2'] = rule_df2\n",
|
318 |
+
" return rule_comb, rule_df1, rule_df2"
|
319 |
+
]
|
320 |
+
},
|
321 |
+
{
|
322 |
+
"cell_type": "code",
|
323 |
+
"execution_count": 8,
|
324 |
+
"metadata": {},
|
325 |
+
"outputs": [],
|
326 |
+
"source": [
|
327 |
+
"def get_ddG0(rxn_dict, pH, I, novel_mets):\n",
|
328 |
+
" ccache = CompoundCacher()\n",
|
329 |
+
" # ddG0 = get_transform_ddG0(rxn_dict, ccache, pH, I, T)\n",
|
330 |
+
" T = 298.15\n",
|
331 |
+
" ddG0_forward = 0\n",
|
332 |
+
" for compound_id, coeff in rxn_dict.items():\n",
|
333 |
+
" if novel_mets != None and compound_id in novel_mets:\n",
|
334 |
+
" comp = novel_mets[compound_id]\n",
|
335 |
+
" else:\n",
|
336 |
+
" comp = ccache.get_compound(compound_id)\n",
|
337 |
+
" ddG0_forward += coeff * comp.transform_pH7(pH, I, T)\n",
|
338 |
+
"\n",
|
339 |
+
" return ddG0_forward"
|
340 |
+
]
|
341 |
+
},
|
342 |
+
{
|
343 |
+
"cell_type": "code",
|
344 |
+
"execution_count": 9,
|
345 |
+
"metadata": {},
|
346 |
+
"outputs": [],
|
347 |
+
"source": [
|
348 |
+
"def get_dG0(rxn_dict, rid, pH, I, loaded_model, molsig_r1, molsig_r2, novel_decomposed_r1, novel_decomposed_r2, novel_mets):\n",
|
349 |
+
"\n",
|
350 |
+
" # rule_df = get_rxn_rule(rid)\n",
|
351 |
+
" rule_comb, rule_df1, rule_df2 = get_rule(\n",
|
352 |
+
" rxn_dict, molsig_r1, molsig_r2, novel_decomposed_r1, novel_decomposed_r2)\n",
|
353 |
+
"\n",
|
354 |
+
" X = rule_comb\n",
|
355 |
+
"\n",
|
356 |
+
" ymean, ystd = loaded_model.predict(X, return_std=True)\n",
|
357 |
+
"\n",
|
358 |
+
" result = {}\n",
|
359 |
+
" # result['dG0'] = ymean[0] + get_ddG0(rxn_dict, pH, I)\n",
|
360 |
+
" # result['standard deviation'] = ystd[0]\n",
|
361 |
+
"\n",
|
362 |
+
" # result_df = pd.DataFrame([result])\n",
|
363 |
+
" # result_df.style.hide_index()\n",
|
364 |
+
" # return result_df\n",
|
365 |
+
" return ymean[0] + get_ddG0(rxn_dict, pH, I, novel_mets), ystd[0], rule_df1, rule_df2\n",
|
366 |
+
" # return ymean[0],ystd[0]\n"
|
367 |
+
]
|
368 |
+
},
|
369 |
+
{
|
370 |
+
"cell_type": "code",
|
371 |
+
"execution_count": 10,
|
372 |
+
"metadata": {},
|
373 |
+
"outputs": [],
|
374 |
+
"source": [
|
375 |
+
"def parse_novel_molecule(add_info):\n",
|
376 |
+
" result = {}\n",
|
377 |
+
" for cid, InChI in add_info.items():\n",
|
378 |
+
" c = Compound.from_inchi('Test', cid, InChI)\n",
|
379 |
+
" result[cid] = c\n",
|
380 |
+
" return result\n",
|
381 |
+
"\n",
|
382 |
+
"\n",
|
383 |
+
"def parse_novel_smiles(result):\n",
|
384 |
+
" novel_smiles = {}\n",
|
385 |
+
" for cid, c in result.items():\n",
|
386 |
+
" smiles = c.smiles_pH7\n",
|
387 |
+
" novel_smiles[cid] = smiles\n",
|
388 |
+
" return novel_smiles\n"
|
389 |
+
]
|
390 |
+
},
|
391 |
+
{
|
392 |
+
"cell_type": "code",
|
393 |
+
"execution_count": 11,
|
394 |
+
"metadata": {},
|
395 |
+
"outputs": [],
|
396 |
+
"source": [
|
397 |
+
"db_smiles = load_smiles()\n",
|
398 |
+
"molsig_r1 = load_molsig_rad1()\n",
|
399 |
+
"molsig_r2 = load_molsig_rad2()\n",
|
400 |
+
"\n",
|
401 |
+
"loaded_model = load_model()\n",
|
402 |
+
"ccache = load_compound_cache()"
|
403 |
+
]
|
404 |
+
},
|
405 |
+
{
|
406 |
+
"cell_type": "markdown",
|
407 |
+
"metadata": {},
|
408 |
+
"source": [
|
409 |
+
"## Estimating dG for a reaction with a novel metabolite"
|
410 |
+
]
|
411 |
+
},
|
412 |
+
{
|
413 |
+
"cell_type": "code",
|
414 |
+
"execution_count": 12,
|
415 |
+
"metadata": {},
|
416 |
+
"outputs": [],
|
417 |
+
"source": [
|
418 |
+
"rxn_str = 'C01745 + C00004 <=> N00001 + C00003 + C00001'"
|
419 |
+
]
|
420 |
+
},
|
421 |
+
{
|
422 |
+
"cell_type": "code",
|
423 |
+
"execution_count": 13,
|
424 |
+
"metadata": {},
|
425 |
+
"outputs": [
|
426 |
+
{
|
427 |
+
"data": {
|
428 |
+
"text/plain": [
|
429 |
+
"'C01745 + C00004 <=> N00001 + C00003 + C00001'"
|
430 |
+
]
|
431 |
+
},
|
432 |
+
"execution_count": 13,
|
433 |
+
"metadata": {},
|
434 |
+
"output_type": "execute_result"
|
435 |
+
}
|
436 |
+
],
|
437 |
+
"source": [
|
438 |
+
"rxn_str"
|
439 |
+
]
|
440 |
+
},
|
441 |
+
{
|
442 |
+
"cell_type": "code",
|
443 |
+
"execution_count": 14,
|
444 |
+
"metadata": {},
|
445 |
+
"outputs": [],
|
446 |
+
"source": [
|
447 |
+
"add_info = {\"N00001\":\"InChI=1S/C14H12O/c15-14-8-4-7-13(11-14)10-9-12-5-2-1-3-6-12/h1-11,15H/b10-9+\"}"
|
448 |
+
]
|
449 |
+
},
|
450 |
+
{
|
451 |
+
"cell_type": "code",
|
452 |
+
"execution_count": 15,
|
453 |
+
"metadata": {},
|
454 |
+
"outputs": [
|
455 |
+
{
|
456 |
+
"data": {
|
457 |
+
"text/plain": [
|
458 |
+
"'InChI=1S/C14H12O/c15-14-8-4-7-13(11-14)10-9-12-5-2-1-3-6-12/h1-11,15H/b10-9+'"
|
459 |
+
]
|
460 |
+
},
|
461 |
+
"execution_count": 15,
|
462 |
+
"metadata": {},
|
463 |
+
"output_type": "execute_result"
|
464 |
+
}
|
465 |
+
],
|
466 |
+
"source": [
|
467 |
+
"add_info['N00001']"
|
468 |
+
]
|
469 |
+
},
|
470 |
+
{
|
471 |
+
"cell_type": "code",
|
472 |
+
"execution_count": 16,
|
473 |
+
"metadata": {},
|
474 |
+
"outputs": [],
|
475 |
+
"source": [
|
476 |
+
"pH = 7 # any number between 0-14 \n",
|
477 |
+
"I = 0.1  # any number between 0.0-0.5"
|
478 |
+
]
|
479 |
+
},
|
480 |
+
{
|
481 |
+
"cell_type": "code",
|
482 |
+
"execution_count": 17,
|
483 |
+
"metadata": {},
|
484 |
+
"outputs": [
|
485 |
+
{
|
486 |
+
"name": "stdout",
|
487 |
+
"output_type": "stream",
|
488 |
+
"text": [
|
489 |
+
"{'N00001': 'Oc1cccc(/C=C/c2ccccc2)c1'}\n"
|
490 |
+
]
|
491 |
+
}
|
492 |
+
],
|
493 |
+
"source": [
|
494 |
+
"try:\n",
|
495 |
+
" novel_mets = parse_novel_molecule(add_info)\n",
|
496 |
+
" novel_smiles = parse_novel_smiles(novel_mets)\n",
|
497 |
+
" novel_decomposed_r1 = decompse_novel_mets_rad1(novel_smiles)\n",
|
498 |
+
" novel_decomposed_r2 = decompse_novel_mets_rad2(novel_smiles)\n",
|
499 |
+
"\n",
|
500 |
+
"except Exception as e:\n",
|
501 |
+
" novel_mets = None\n",
|
502 |
+
" novel_smiles = None\n",
|
503 |
+
" novel_decomposed_r1 = None\n",
|
504 |
+
" novel_decomposed_r2 = None\n",
|
505 |
+
"\n",
|
506 |
+
"print(novel_smiles)\n"
|
507 |
+
]
|
508 |
+
},
|
509 |
+
{
|
510 |
+
"cell_type": "code",
|
511 |
+
"execution_count": 18,
|
512 |
+
"metadata": {},
|
513 |
+
"outputs": [],
|
514 |
+
"source": [
|
515 |
+
"rxn_dict = parse_formula(rxn_str)"
|
516 |
+
]
|
517 |
+
},
|
518 |
+
{
|
519 |
+
"cell_type": "code",
|
520 |
+
"execution_count": 19,
|
521 |
+
"metadata": {},
|
522 |
+
"outputs": [
|
523 |
+
{
|
524 |
+
"data": {
|
525 |
+
"image/png": "\n",
|
526 |
+
"text/plain": [
|
527 |
+
"<PIL.PngImagePlugin.PngImageFile image mode=RGB size=1200x200 at 0x1711E6902E0>"
|
528 |
+
]
|
529 |
+
},
|
530 |
+
"execution_count": 19,
|
531 |
+
"metadata": {},
|
532 |
+
"output_type": "execute_result"
|
533 |
+
}
|
534 |
+
],
|
535 |
+
"source": [
|
536 |
+
"draw_rxn_figure(rxn_dict, db_smiles,novel_smiles)"
|
537 |
+
]
|
538 |
+
},
|
539 |
+
{
|
540 |
+
"cell_type": "code",
|
541 |
+
"execution_count": 20,
|
542 |
+
"metadata": {},
|
543 |
+
"outputs": [
|
544 |
+
{
|
545 |
+
"name": "stdout",
|
546 |
+
"output_type": "stream",
|
547 |
+
"text": [
|
548 |
+
"dG = -121.79 ± 100.57 kJ/mol\n"
|
549 |
+
]
|
550 |
+
}
|
551 |
+
],
|
552 |
+
"source": [
|
553 |
+
"mu, std, rule_df1, rule_df2 = get_dG0(rxn_dict, 'R00801', pH, I, loaded_model, molsig_r1, molsig_r2, novel_decomposed_r1, novel_decomposed_r2, novel_mets)\n",
|
554 |
+
"\n",
|
555 |
+
"print(\"dG = %.2f ± %.2f kJ/mol\" % (mu, std))\n",
|
556 |
+
"\n"
|
557 |
+
]
|
558 |
+
},
|
559 |
+
{
|
560 |
+
"cell_type": "markdown",
|
561 |
+
"metadata": {},
|
562 |
+
"source": [
|
563 |
+
"## Bulk estimation of dG for a list of KEGG reactions"
|
564 |
+
]
|
565 |
+
},
|
566 |
+
{
|
567 |
+
"cell_type": "code",
|
568 |
+
"execution_count": 12,
|
569 |
+
"metadata": {},
|
570 |
+
"outputs": [],
|
571 |
+
"source": [
|
572 |
+
"KEGG_rxn_list = {\"R00010\" : \"C01083 + C00001 <=> 2 C00031\",\n",
|
573 |
+
" \"R00303\" : \"C00092 + C00001 <=> C00031 + C00009\",\n",
|
574 |
+
" \"R00304\" : \"C00103 + C00001 <=> C00031 + C00009\",\n",
|
575 |
+
" \"R07294\" : \"C15524 + C00001 <=> C02137 + C00010\",\n",
|
576 |
+
" \"R01252\" : \"C00148 + C00026 + C00007 <=> C01157 + C00042 + C00011\",\n",
|
577 |
+
" \"R00406\" : \"C00091 + C00149 <=> C00042 + C04348\"\n",
|
578 |
+
" }"
|
579 |
+
]
|
580 |
+
},
|
581 |
+
{
|
582 |
+
"cell_type": "code",
|
583 |
+
"execution_count": 14,
|
584 |
+
"metadata": {},
|
585 |
+
"outputs": [
|
586 |
+
{
|
587 |
+
"name": "stdout",
|
588 |
+
"output_type": "stream",
|
589 |
+
"text": [
|
590 |
+
"R00010\n",
|
591 |
+
"C01083 + C00001 <=> 2 C00031\n",
|
592 |
+
"dG = -12.45 ± 3.49 kJ/mol\n",
|
593 |
+
"R00303\n",
|
594 |
+
"C00092 + C00001 <=> C00031 + C00009\n",
|
595 |
+
"dG = -12.40 ± 3.30 kJ/mol\n",
|
596 |
+
"R00304\n",
|
597 |
+
"C00103 + C00001 <=> C00031 + C00009\n",
|
598 |
+
"dG = -18.78 ± 3.37 kJ/mol\n",
|
599 |
+
"R07294\n",
|
600 |
+
"C15524 + C00001 <=> C02137 + C00010\n",
|
601 |
+
"dG = -14.46 ± 31.43 kJ/mol\n",
|
602 |
+
"R01252\n",
|
603 |
+
"C00148 + C00026 + C00007 <=> C01157 + C00042 + C00011\n",
|
604 |
+
"dG = -427.04 ± 41.12 kJ/mol\n",
|
605 |
+
"R00406\n",
|
606 |
+
"C00091 + C00149 <=> C00042 + C04348\n",
|
607 |
+
"dG = -3.27 ± 4.37 kJ/mol\n"
|
608 |
+
]
|
609 |
+
}
|
610 |
+
],
|
611 |
+
"source": [
|
612 |
+
"pH = 7 # any number between 0-14 \n",
|
613 |
+
"I = 0.1  # any number between 0.0-0.5\n",
|
614 |
+
"\n",
|
615 |
+
"for keys in KEGG_rxn_list:\n",
|
616 |
+
" kegg_rxn_string = KEGG_rxn_list[keys]\n",
|
617 |
+
" kegg_rxn_dict = parse_formula(kegg_rxn_string)\n",
|
618 |
+
" mu, std, rule_df1, rule_df2 = get_dG0(kegg_rxn_dict, keys, pH, I, loaded_model, molsig_r1, molsig_r2, [], [], [])\n",
|
619 |
+
" print(keys)\n",
|
620 |
+
" print(kegg_rxn_string)\n",
|
621 |
+
" print(\"dG = %.2f ± %.2f kJ/mol\" % (mu, std))"
|
622 |
+
]
|
623 |
+
},
|
624 |
+
{
|
625 |
+
"cell_type": "code",
|
626 |
+
"execution_count": null,
|
627 |
+
"metadata": {},
|
628 |
+
"outputs": [],
|
629 |
+
"source": []
|
630 |
+
},
|
631 |
+
{
|
632 |
+
"cell_type": "code",
|
633 |
+
"execution_count": null,
|
634 |
+
"metadata": {},
|
635 |
+
"outputs": [],
|
636 |
+
"source": []
|
637 |
+
}
|
638 |
+
],
|
639 |
+
"metadata": {
|
640 |
+
"kernelspec": {
|
641 |
+
"display_name": "Python 3",
|
642 |
+
"language": "python",
|
643 |
+
"name": "python3"
|
644 |
+
},
|
645 |
+
"language_info": {
|
646 |
+
"codemirror_mode": {
|
647 |
+
"name": "ipython",
|
648 |
+
"version": 3
|
649 |
+
},
|
650 |
+
"file_extension": ".py",
|
651 |
+
"mimetype": "text/x-python",
|
652 |
+
"name": "python",
|
653 |
+
"nbconvert_exporter": "python",
|
654 |
+
"pygments_lexer": "ipython3",
|
655 |
+
"version": "3.8.10"
|
656 |
+
}
|
657 |
+
},
|
658 |
+
"nbformat": 4,
|
659 |
+
"nbformat_minor": 4
|
660 |
+
}
|