vuu10 commited on
Commit
66061aa
1 Parent(s): 2061732

Upload 8 files

Browse files
README.md CHANGED
@@ -1,12 +1,64 @@
1
- ---
2
- title: DGPredictor
3
- emoji: 🔥
4
- colorFrom: pink
5
- colorTo: indigo
6
- sdk: streamlit
7
- sdk_version: 1.19.0
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # dGPredictor
2
+
3
+ ==================================
4
+ ### Requirements:
5
+
6
+ 1. Python 3.8.10
7
+ 2. RDkit (http://www.rdkit.org/)
8
+ 3. pandas (https://pandas.pydata.org/)
9
+ 4. matplotlib (https://matplotlib.org/stable/users/installing.html)
10
+ 5. Scikit-learn (https://scikit-learn.org/stable/)
11
+ 6. Streamlit (https://streamlit.io/)
12
+ 7. Openbabel (https://anaconda.org/openbabel/openbabel)
13
+ 8. ChemAxon's Marvin >= 5.11
14
+ 9. Pulp
15
+
16
+ Installation
17
+ 1. Python 3.8.10 (https://www.python.org/downloads/windows/)
18
+ Recommended-
19
+ - Create anaconda environment using command "conda create -n dGPredictor python=3.8 ipython"
20
+ - activate the env using command "conda activate dGPredictor" or "source activate dGPredictor"
21
+ 2. RDkit
22
+ - type command "conda install -c conda-forge rdkit" in your dGPredictor env to install rdkit
23
+ 3. Pandas
24
+ - "conda install pandas"
25
+ 4. matplotlib
26
+ - "conda install -c conda-forge matplotlib"
27
+ 5. Scikit-learn
28
+ - use command "pip install -U scikit-learn"
29
+ 6. Streamlit
30
+ - use command "pip install -U streamlit"
31
+ 7. Openbabel
32
+ - run "conda install -c conda-forge openbabel"
33
+ 8. ChemAxon's Marvin (pKa value estimation)
34
+ - Marvin is only required for adding structures of novel metabolites/compounds that are not in the KEGG database
35
+ - instructions (https://chemaxon.com/products/marvin/download)
36
+ - add the ChemAxon calculator executable, "cxcalc.bat (macOS) /cxcalc.exe (Windows)", to PATH and also set its location in the "./CC/chemaxon.py" file
37
+ - you will need to get a license to use ChemAxon (it is free for academic use)
38
+ 9. Pulp
39
+ - use command "pip install -U pulp"
40
+
41
+
42
+
43
+
44
+ ==================================
45
+ ### Running web-interface locally using streamlit
46
+
47
+ - Model generation: run "python model_gen.py" once to create the dGPredictor model file (this may take some time)
48
+ - run "streamlit run ./streamlit/main.py" from dGPredictor folder
49
+ - running KEGG reaction (doesn't require ChemAxon's Marvin) : copy paste the reaction equation into reaction section and click search
50
+
51
+ ### Gibbs free energy prediction using the automated group decomposition method
52
+
53
+ - Step 1: decompose the metabolites based on their SMILES strings (see function decompse_ac in decompose_groups.py or the notebook)
54
+ - Step 2: create group changes vectors (i.e. reaction rules) based on group changes in metabolites of reactions (see get_rxn_rule in decompose_groups.py)
55
+ - Step 3: linear regression, Ridge Regression and Bayesian Ridge Regression in "predict.py"
56
+ - Step 4: Multiple regression models in notebook "analysis_dGPredictor.ipynb"
57
+
58
+ ### Pathway design using novoStoic
59
+ - Run "mini_novoStoic.py" to see an example of pathway design for isobutanol synthesis
60
+
61
+
62
+ # demo
63
+ ![dGPredictor Demo](figures/dg_demo_py3.gif)
64
+
analysis_dGPredictor.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
decompose_groups.py ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import pdb
3
+ import json
4
+ from rdkit import Chem
5
+
6
def count_substructures(radius, molecule):
    """Count the atom-centred substructures of a molecule.

    Each atom is taken in turn as a centre.  The bonds returned by
    ``Chem.FindAtomEnvironmentOfRadiusN`` for that atom are collected, the
    fragment they span is written as a canonical SMILES string, and the
    number of times each string occurs is counted.

    Parameters
    ----------
    radius : int
        Bond distance handed to ``Chem.FindAtomEnvironmentOfRadiusN``; it
        defines how many neighbouring atoms belong to each substructure.
    molecule : rdkit.Chem.Mol
        A molecule object created by RDKit (e.g. ``Chem.MolFromSmiles(...)``
        or ``Chem.MolFromInchi(...)``).

    Returns
    -------
    dict
        Molecular signature of the molecule, ``{fragment_smiles: count}``.
    """
    smi_count = {}

    for atom_idx in range(molecule.GetNumAtoms()):
        env = Chem.FindAtomEnvironmentOfRadiusN(molecule, radius, atom_idx)

        # Atoms touched by the bonds of this environment.
        atoms = set()
        for bond_idx in env:
            bond = molecule.GetBondWithIdx(bond_idx)
            atoms.add(bond.GetBeginAtomIdx())
            atoms.add(bond.GetEndAtomIdx())

        # No bond in the environment (e.g. the O in H2O once hydrogens are
        # removed): the fragment is the centre atom on its own.
        if not atoms:
            atoms = {atom_idx}

        smi = Chem.MolFragmentToSmiles(molecule, atomsToUse=list(atoms),
                                       bondsToUse=env, canonical=True)
        smi_count[smi] = smi_count.get(smi, 0) + 1

    return smi_count
47
+
48
def decompse_ac(db_smiles, radius=1, out_path='./data/decompose_vector_ac.json'):
    """Decompose every compound into substructure counts and save them as JSON.

    Parameters
    ----------
    db_smiles : dict
        ``{compound_id: smiles}``.
    radius : int
        Bond distance used by ``count_substructures``.
    out_path : str
        Destination of the JSON file holding ``{compound_id: {smiles: count}}``.

    Returns
    -------
    list
        Ids of the compounds that could not be decomposed.  The original
        implementation collected this list but discarded it.
    """
    non_decomposable = []
    decompose_vector = {}

    for cid, smiles_pH7 in db_smiles.items():
        try:
            mol = Chem.MolFromSmiles(smiles_pH7)
            if mol is None:
                # RDKit signals an unparsable SMILES by returning None.
                non_decomposable.append(cid)
                continue
            mol = Chem.RemoveHs(mol)
            # Chem.RemoveStereochemistry(mol)
            decompose_vector[cid] = count_substructures(radius, mol)
        except Exception:
            # Deliberately best-effort: one bad entry (e.g. a non-string
            # SMILES value) must not abort the whole database run.
            non_decomposable.append(cid)

    with open(out_path, 'w') as fp:
        json.dump(decompose_vector, fp)

    return non_decomposable
67
+
68
def get_rxn_rule(reaction_path='./data/optstoic_v3_Sji_dict.json',
                 signature_path='./data/decompose_vector_ac.json',
                 output_path='./data/reaction_rule.csv'):
    """Calculate reaction rules from the relaxed molecular signatures.

    A reaction rule is the stoichiometry-weighted sum of the substructure
    count vectors of the metabolites taking part in the reaction.

    Parameters
    ----------
    reaction_path : str
        JSON file ``{reaction_id: {metabolite_id: stoichiometry}}``.
    signature_path : str
        JSON file ``{metabolite_id: {substructure_smiles: count}}``.
    output_path : str
        CSV file to write; rows are substructures, columns are reactions.

    Returns
    -------
    None
        All of the reaction rules are saved to ``output_path``.
    """
    with open(reaction_path) as fp:
        reaction_dict = json.load(fp)
    with open(signature_path) as fp:
        molecular_signature = json.load(fp)
    molsigna_df = pd.DataFrame.from_dict(molecular_signature).fillna(0)

    # These two ids are accepted in a reaction but never contribute groups
    # (the original comment reads "hydrogen is zero").
    hydrogen_ids = {"C00080", "C00282"}
    known_mets = set(molsigna_df.columns) | hydrogen_ids

    rule_df = pd.DataFrame(index=molsigna_df.index)
    for rid, stoichiometry in reaction_dict.items():
        # skip the reactions with missing metabolites
        if any(met not in known_mets for met in stoichiometry):
            continue

        change = 0
        for met, stoic in stoichiometry.items():
            if met in hydrogen_ids:
                continue
            change = change + molsigna_df[met] * stoic
        rule_df[rid] = change

    rule_df.to_csv(output_path, index=True)
108
+
109
def get_rxn_rule_no_stero(reaction_path='./data/optstoic_v3_Sji_dict.json',
                          signature_path='./data/decompose_vector_ac_nostereo.json',
                          output_path='./data/reaction_rule_no_stero.csv'):
    """Calculate reaction rules from the stereo-free molecular signatures.

    Same computation as the stereo-aware variant, but it reads the
    ``*_nostereo`` signature file by default and writes to a separate CSV.

    Parameters
    ----------
    reaction_path : str
        JSON file ``{reaction_id: {metabolite_id: stoichiometry}}``.
    signature_path : str
        JSON file ``{metabolite_id: {substructure_smiles: count}}``.
    output_path : str
        CSV file to write; rows are substructures, columns are reactions.

    Returns
    -------
    None
        All of the reaction rules are saved to ``output_path``.
    """
    with open(reaction_path) as fp:
        reaction_dict = json.load(fp)
    with open(signature_path) as fp:
        molecular_signature = json.load(fp)
    molsigna_df = pd.DataFrame.from_dict(molecular_signature).fillna(0)

    # These two ids are accepted in a reaction but never contribute groups
    # (the original comment reads "hydrogen is zero").
    hydrogen_ids = {"C00080", "C00282"}
    known_mets = set(molsigna_df.columns) | hydrogen_ids

    rule_df = pd.DataFrame(index=molsigna_df.index)
    for rid, stoichiometry in reaction_dict.items():
        # skip the reactions with missing metabolites
        if any(met not in known_mets for met in stoichiometry):
            continue

        change = 0
        for met, stoic in stoichiometry.items():
            if met in hydrogen_ids:
                continue
            change = change + molsigna_df[met] * stoic
        rule_df[rid] = change

    rule_df.to_csv(output_path, index=True)
149
+
150
def get_rxn_rule_remove_TECRDB_mets(reaction_path='./data/optstoic_v3_Sji_dict.json',
                                    signature_path='./data/decompose_vector_ac.json',
                                    tecrdb_path='./data/TECRBD_mets.txt',
                                    output_path='./data/reaction_rule_remove_TECRDB_mets.csv'):
    """Calculate reaction rules while leaving out the listed TECRDB metabolites.

    A reaction rule is the stoichiometry-weighted sum of the substructure
    count vectors of its metabolites.  Metabolites listed in ``tecrdb_path``
    are allowed to appear in a reaction but contribute nothing to its rule.

    Parameters
    ----------
    reaction_path : str
        JSON file ``{reaction_id: {metabolite_id: stoichiometry}}``.
    signature_path : str
        JSON file ``{metabolite_id: {substructure_smiles: count}}``.
    tecrdb_path : str
        Header-less text file with one metabolite id per line.
    output_path : str
        CSV file to write; rows are substructures, columns are reactions.

    Returns
    -------
    None
        All of the reaction rules are saved to ``output_path``.
    """
    with open(reaction_path) as fp:
        reaction_dict = json.load(fp)
    with open(signature_path) as fp:
        molecular_signature = json.load(fp)
    molsigna_df = pd.DataFrame.from_dict(molecular_signature).fillna(0)

    signature_mets = set(molsigna_df.columns)
    mets_TECRDB = set(pd.read_csv(tecrdb_path, header=None)[0].tolist())
    known_mets = signature_mets | {"C00080", "C00282"} | mets_TECRDB

    rule_df = pd.DataFrame(index=molsigna_df.index)
    for rid, stoichiometry in reaction_dict.items():
        # skip the reactions with missing metabolites
        if any(met not in known_mets for met in stoichiometry):
            continue

        change = 0
        for met, stoic in stoichiometry.items():
            # Excluded metabolites contribute nothing.  The second test
            # covers the two hydrogen ids: they pass the filter above but
            # may have no signature column, which used to raise KeyError.
            if met in mets_TECRDB or met not in signature_mets:
                continue
            change = change + molsigna_df[met] * stoic
        rule_df[rid] = change

    rule_df.to_csv(output_path, index=True)
195
+
196
def get_rxn_rule_no_stero_remove_TECRDB_mets(
        reaction_path='./data/optstoic_v3_Sji_dict.json',
        signature_path='./data/decompose_vector_ac_nostereo.json',
        tecrdb_path='./data/TECRBD_mets.txt',
        output_path='./data/reaction_rule_nostereo_remove_TECRDB_mets.csv'):
    """Calculate stereo-free reaction rules, leaving out the TECRDB metabolites.

    A reaction rule is the stoichiometry-weighted sum of the substructure
    count vectors of its metabolites.  Metabolites listed in ``tecrdb_path``
    are allowed to appear in a reaction but contribute nothing to its rule.

    Parameters
    ----------
    reaction_path : str
        JSON file ``{reaction_id: {metabolite_id: stoichiometry}}``.
    signature_path : str
        JSON file ``{metabolite_id: {substructure_smiles: count}}``.
    tecrdb_path : str
        Header-less text file with one metabolite id per line.
    output_path : str
        CSV file to write; rows are substructures, columns are reactions.

    Returns
    -------
    None
        All of the reaction rules are saved to ``output_path``.
    """
    with open(reaction_path) as fp:
        reaction_dict = json.load(fp)
    with open(signature_path) as fp:
        molecular_signature = json.load(fp)
    molsigna_df = pd.DataFrame.from_dict(molecular_signature).fillna(0)

    signature_mets = set(molsigna_df.columns)
    mets_TECRDB = set(pd.read_csv(tecrdb_path, header=None)[0].tolist())
    known_mets = signature_mets | {"C00080", "C00282"} | mets_TECRDB

    rule_df = pd.DataFrame(index=molsigna_df.index)
    for rid, stoichiometry in reaction_dict.items():
        # skip the reactions with missing metabolites
        if any(met not in known_mets for met in stoichiometry):
            continue

        change = 0
        for met, stoic in stoichiometry.items():
            # Excluded metabolites contribute nothing.  The second test
            # covers the two hydrogen ids: they pass the filter above but
            # may have no signature column, which used to raise KeyError.
            if met in mets_TECRDB or met not in signature_mets:
                continue
            change = change + molsigna_df[met] * stoic
        rule_df[rid] = change

    rule_df.to_csv(output_path, index=True)
241
+
242
+
243
+
244
if __name__ == "__main__":
    # Earlier pipeline stages, left disabled:
    #   db = pd.read_csv('./data/cache_compounds_20160818.csv',index_col='compound_id')
    #   db_smiles = db['smiles_pH7'].to_dict()
    #   decompse_ac(db_smiles)
    #   get_rxn_rule()
    #   get_rxn_rule_remove_TECRDB_mets()
    get_rxn_rule_no_stero_remove_TECRDB_mets()
main.py ADDED
@@ -0,0 +1,478 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import re
5
+ from PIL import Image
6
+ import webbrowser
7
+ import json
8
+ import pickle
9
+ import sys
10
+ import joblib
11
+ import sys
12
+
13
+ sys.path.append('./CC/')
14
+
15
+ import chemaxon
16
+ from chemaxon import *
17
+ from compound import Compound
18
+ from compound_cacher import CompoundCacher
19
+ from rdkit.Chem import rdChemReactions as Reactions
20
+ from rdkit.Chem import Draw
21
+ from rdkit import Chem
22
+
23
@st.cache(allow_output_mutation=True)
def load_smiles():
    """Return ``{compound_id: smiles_pH7}`` read from the compound cache CSV."""
    compounds = pd.read_csv('./data/cache_compounds_20160818.csv',
                            index_col='compound_id')
    return compounds['smiles_pH7'].to_dict()
29
+
30
+
31
@st.cache(allow_output_mutation=True)
def load_molsig_rad1():
    """Load the radius-1 molecular signatures from their JSON file.

    Returns
    -------
    dict
        The parsed content of ``./data/decompose_vector_ac.json``.
    """
    # The context manager closes the file handle; the original left it open.
    with open('./data/decompose_vector_ac.json') as fp:
        return json.load(fp)
35
+
36
+
37
@st.cache(allow_output_mutation=True)
def load_molsig_rad2():
    """Load the radius-2 molecular signatures from their JSON file.

    Returns
    -------
    dict
        The parsed content of
        ``./data/decompose_vector_ac_r2_py3_indent_modified_manual.json``.
    """
    # The context manager closes the file handle; the original left it open.
    with open('./data/decompose_vector_ac_r2_py3_indent_modified_manual.json') as fp:
        return json.load(fp)
42
+
43
+
44
@st.cache(allow_output_mutation=True)
def load_model():
    """Load the trained regression model from ``./model/M12_model_BR.pkl``.

    Returns
    -------
    object
        Whatever ``joblib.load`` deserialises from the model file.
    """
    filename = './model/M12_model_BR.pkl'
    # NOTE: joblib files are pickle-based, so only load model files from a
    # trusted source.
    # The context manager closes the file handle; the original left it open.
    with open(filename, 'rb') as fp:
        return joblib.load(fp)
49
+
50
+
51
@st.cache(allow_output_mutation=True)
def load_compound_cache():
    """Build a ``CompoundCacher`` and return it."""
    return CompoundCacher()
55
+
56
+
57
def count_substructures(radius, molecule):
    """Count the atom-centred substructures of a molecule.

    Each atom is taken in turn as a centre.  The bonds returned by
    ``Chem.FindAtomEnvironmentOfRadiusN`` for that atom are collected, the
    fragment they span is written as a canonical SMILES string, and the
    number of times each string occurs is counted.

    Parameters
    ----------
    radius : int
        Bond distance handed to ``Chem.FindAtomEnvironmentOfRadiusN``; it
        defines how many neighbouring atoms belong to each substructure.
    molecule : rdkit.Chem.Mol
        A molecule object created by RDKit (e.g. ``Chem.MolFromSmiles(...)``
        or ``Chem.MolFromInchi(...)``).

    Returns
    -------
    dict
        Molecular signature of the molecule, ``{fragment_smiles: count}``.
    """
    smi_count = {}

    for atom_idx in range(molecule.GetNumAtoms()):
        env = Chem.FindAtomEnvironmentOfRadiusN(molecule, radius, atom_idx)

        # Atoms touched by the bonds of this environment.
        atoms = set()
        for bond_idx in env:
            bond = molecule.GetBondWithIdx(bond_idx)
            atoms.add(bond.GetBeginAtomIdx())
            atoms.add(bond.GetEndAtomIdx())

        # No bond in the environment (e.g. the O in H2O once hydrogens are
        # removed): the fragment is the centre atom on its own.
        if not atoms:
            atoms = {atom_idx}

        smi = Chem.MolFragmentToSmiles(molecule, atomsToUse=list(atoms),
                                       bondsToUse=env, canonical=True)
        smi_count[smi] = smi_count.get(smi, 0) + 1

    return smi_count
98
+
99
+
100
def decompse_novel_mets_rad1(novel_smiles, radius=1):
    """Decompose user-supplied metabolites into radius-1 substructure counts.

    Parameters
    ----------
    novel_smiles : dict
        ``{compound_id: smiles}``.
    radius : int
        Bond distance handed to ``count_substructures``.

    Returns
    -------
    dict
        ``{compound_id: {fragment_smiles: count}}``.

    Raises
    ------
    ValueError
        If RDKit cannot parse one of the SMILES strings.
    """
    decompose_vector = {}

    for cid, smiles_pH7 in novel_smiles.items():
        mol = Chem.MolFromSmiles(smiles_pH7)
        if mol is None:
            # RDKit returns None for an unparsable SMILES; fail with a
            # readable message instead of an opaque argument error.
            raise ValueError('Could not parse SMILES for %s: %r'
                             % (cid, smiles_pH7))
        mol = Chem.RemoveHs(mol)
        # Chem.RemoveStereochemistry(mol)
        decompose_vector[cid] = count_substructures(radius, mol)
    return decompose_vector
110
+
111
+
112
def decompse_novel_mets_rad2(novel_smiles, radius=2):
    """Decompose user-supplied metabolites into radius-2 substructure counts.

    Parameters
    ----------
    novel_smiles : dict
        ``{compound_id: smiles}``.
    radius : int
        Bond distance handed to ``count_substructures``.

    Returns
    -------
    dict
        ``{compound_id: {fragment_smiles: count}}``.

    Raises
    ------
    ValueError
        If RDKit cannot parse one of the SMILES strings.
    """
    decompose_vector = {}

    for cid, smiles_pH7 in novel_smiles.items():
        mol = Chem.MolFromSmiles(smiles_pH7)
        if mol is None:
            # RDKit returns None for an unparsable SMILES; fail with a
            # readable message instead of an opaque argument error.
            raise ValueError('Could not parse SMILES for %s: %r'
                             % (cid, smiles_pH7))
        mol = Chem.RemoveHs(mol)
        # Chem.RemoveStereochemistry(mol)
        decompose_vector[cid] = count_substructures(radius, mol)
    return decompose_vector
122
+
123
+ # def parse_rule(rxn,df_rule):
124
+ # df = df_rule
125
+ # rule_df = df[rxn].to_frame()
126
+ # # new_df = rule_df[(rule_df.T != 0).any()]
127
+
128
+ # return rule_df[(rule_df.T != 0).any()]
129
+
130
+
131
def parse_reaction_formula_side(s):
    """Parse one side of a reaction formula.

    Example: ``'2 C00001 + C00002 + 3 C00003'``.  A member without a leading
    number has coefficient 1; repeated compounds are summed.

    Parameters
    ----------
    s : str
        One side of the formula, or the literal ``"null"`` for an empty side.

    Returns
    -------
    dict
        ``{compound_id: coefficient}``.

    Raises
    ------
    ValueError
        If a member has two parts and the first is not a number.
    """
    if s.strip() == "null":
        return {}

    compound_bag = {}
    # Raw string: '\s' and '\+' are invalid escapes in a normal literal.
    for member in re.split(r'\s+\+\s+', s):
        tokens = member.split(None, 1)
        if len(tokens) == 0:
            continue
        if len(tokens) == 1:
            amount = 1
            key = tokens[0]
        else:
            amount = float(tokens[0])
            # split(None, 1) keeps trailing whitespace on the remainder.
            key = tokens[1].strip()

        compound_bag[key] = compound_bag.get(key, 0) + amount

    return compound_bag
157
+
158
+
159
def parse_formula(formula, arrow='<=>', rid=None):
    """Parse a two-sided formula such as ``2 C00001 <=> C00002 + C00003``.

    Parameters
    ----------
    formula : str
        The reaction equation.
    arrow : str
        Separator between the substrate and the product side.
    rid : object, optional
        Not used by this function; kept so existing callers keep working.

    Returns
    -------
    dict
        ``{compound_id: net coefficient}``; substrates are subtracted and
        products added.

    Raises
    ------
    ValueError
        If the formula does not contain ``arrow``.  The original code
        printed a message and then failed with an ``IndexError``.
    """
    tokens = formula.split(arrow)
    if len(tokens) < 2:
        raise ValueError('Reaction does not contain the arrow sign (%s): %s'
                         % (arrow, formula))
    if len(tokens) > 2:
        # Original behaviour kept: report it and use the first two sides.
        print(('Reaction contains more than one arrow sign (%s): %s'
               % (arrow, formula)))

    left = tokens[0].strip()
    right = tokens[1].strip()

    sparse_reaction = {}
    for cid, count in parse_reaction_formula_side(left).items():
        sparse_reaction[cid] = sparse_reaction.get(cid, 0) - count

    for cid, count in parse_reaction_formula_side(right).items():
        sparse_reaction[cid] = sparse_reaction.get(cid, 0) + count

    return sparse_reaction
185
+
186
+
187
def draw_rxn_figure(rxn_dict, db_smiles, novel_smiles):
    """Draw the reaction as an image.

    Parameters
    ----------
    rxn_dict : dict
        ``{compound_id: coefficient}``; positive coefficients are drawn on
        the product side, all others on the substrate side.
    db_smiles : dict
        ``{compound_id: smiles}`` for the known compounds.
    novel_smiles : dict or None
        ``{compound_id: smiles}`` for user-supplied compounds; ``None`` is
        treated as "no extra compounds".

    Returns
    -------
    The image returned by ``Draw.ReactionToImage``.

    Raises
    ------
    KeyError
        If a compound is found in neither ``db_smiles`` nor ``novel_smiles``.
    """
    if novel_smiles is None:
        novel_smiles = {}

    left = []
    right = []
    for met, stoic in rxn_dict.items():
        if met == "C00080" or met == "C00282":
            continue  # hydrogen is not considered
        if met in db_smiles:
            smiles = db_smiles[met]
        elif met in novel_smiles:
            smiles = novel_smiles[met]
        else:
            raise KeyError('No structure available for compound %s' % met)
        (right if stoic > 0 else left).append(smiles)

    smarts = '.'.join(left) + '>>' + '.'.join(right)
    rxn = Reactions.ReactionFromSmarts(smarts, useSmiles=True)
    return Draw.ReactionToImage(rxn)  # , subImgSize=(400, 400))
211
+
212
+ # def draw_group_changes(rxn,df_rule):
213
+ # df = parse_rule(rxn,df_rule)
214
+ # group_dict = df.to_dict()[rxn]
215
+
216
+ # left = ''
217
+ # right = ''
218
+
219
+ # for smiles,stoic in group_dict.iteritems():
220
+ # if stoic > 0:
221
+ # right = right + smiles + '.'
222
+ # else:
223
+ # left = left + smiles + '.'
224
+ # smarts = left[:-1] + '>>' + right[:-1]
225
+ # rxn = Reactions.ReactionFromSmarts(smarts, useSmiles=True)
226
+ # return Draw.ReactionToImage(rxn)
227
+
228
+ # def get_rxn_rule(rid):
229
+ # reaction_dict = json.load(open('../data/optstoic_v3_Sji_dict.json'))
230
+ # molecular_signature = json.load(open('../data/decompose_vector_ac.json'))
231
+ # molsigna_df = pd.DataFrame.from_dict(molecular_signature).fillna(0)
232
+ # all_mets = molsigna_df.columns.tolist()
233
+ # all_mets.append("C00080")
234
+ # all_mets.append("C00282")
235
+
236
+ # rule_df = pd.DataFrame(index=molsigna_df.index)
237
+
238
+ # info = reaction_dict[rid]
239
+
240
+ # # skip the reactions with missing metabolites
241
+ # mets = info.keys()
242
+ # flag = False
243
+ # for met in mets:
244
+ # if met not in all_mets:
245
+ # flag = True
246
+ # break
247
+ # if flag:
248
+ # return None
249
+
250
+ # rule_df[rid] = 0
251
+ # for met, stoic in info.items():
252
+ # if met == "C00080" or met == "C00282":
253
+ # continue # hydogen is zero
254
+ # rule_df[rid] += molsigna_df[met] * stoic
255
+ # return rule_df
256
+
257
+
258
def _group_change_frame(rxn_dict, molsig, novel_decomposed, group_names_path):
    """Return the one-column ('change') group-change frame for one radius."""
    if novel_decomposed is not None:
        # Merge into a new dict so the caller's (cached) dict is not mutated.
        molsig = {**molsig, **novel_decomposed}

    molsigna_df = pd.DataFrame.from_dict(molsig).fillna(0)

    with open(group_names_path) as fp:
        group_names = fp.read().splitlines()
    # A listed group that no metabolite contains counts as zero, not NaN.
    molsigna_df = molsigna_df.reindex(group_names, fill_value=0)

    rule_df = pd.DataFrame(index=molsigna_df.index)
    rule_df['change'] = 0
    for met, stoic in rxn_dict.items():
        if met == "C00080" or met == "C00282":
            continue  # hydrogen is zero
        rule_df['change'] += molsigna_df[met] * stoic
    return rule_df


def get_rule(rxn_dict, molsig1, molsig2, novel_decomposed1, novel_decomposed2,
             group_names_r1_path='./data/group_names_r1.txt',
             group_names_r2_path='./data/group_names_r2_py3_modified_manual.txt',
             n_padding=44):
    """Build the group-change feature vector of a reaction.

    Parameters
    ----------
    rxn_dict : dict
        ``{compound_id: coefficient}``.
    molsig1, molsig2 : dict
        ``{compound_id: {group_smiles: count}}`` for radius 1 and radius 2.
        They are no longer modified in place.
    novel_decomposed1, novel_decomposed2 : dict or None
        Extra signatures for compounds missing from ``molsig1``/``molsig2``.
    group_names_r1_path, group_names_r2_path : str
        Text files listing, one per line, the groups in feature order.
    n_padding : int
        Number of zero columns appended after each radius block
        (presumably required by the trained model's feature layout;
        TODO confirm).

    Returns
    -------
    tuple
        ``(rule_comb, rule_df1, rule_df2)``: the combined ``(1, n)`` feature
        array and the per-radius one-column ``'change'`` data frames.

    Raises
    ------
    KeyError
        If a non-hydrogen compound of the reaction has no signature.
    """
    rule_df1 = _group_change_frame(rxn_dict, molsig1, novel_decomposed1,
                                   group_names_r1_path)
    rule_df2 = _group_change_frame(rxn_dict, molsig2, novel_decomposed2,
                                   group_names_r2_path)

    rule_vec1 = rule_df1.to_numpy().T
    rule_vec2 = rule_df2.to_numpy().T

    X1 = np.concatenate((rule_vec1, np.zeros((rule_vec1.shape[0], n_padding))), 1)
    X2 = np.concatenate((rule_vec2, np.zeros((rule_vec2.shape[0], n_padding))), 1)

    rule_comb = np.concatenate((X1, X2), 1)
    return rule_comb, rule_df1, rule_df2
325
+
326
+
327
def get_ddG0(rxn_dict, pH, I, novel_mets, ccache=None, T=298.15):
    """Sum the coefficient-weighted ``transform_pH7`` values of a reaction.

    Parameters
    ----------
    rxn_dict : dict
        ``{compound_id: coefficient}``.
    pH : float
        Passed to each compound's ``transform_pH7``.
    I : float
        Passed to each compound's ``transform_pH7``.
    novel_mets : dict or None
        ``{compound_id: compound}`` for compounds that should not be looked
        up in the compound cache.
    ccache : object, optional
        Object with a ``get_compound`` method.  When omitted, a
        ``CompoundCacher`` is created, but only if a lookup is needed.
    T : float
        Passed to each compound's ``transform_pH7``; defaults to 298.15.

    Returns
    -------
    The accumulated sum (``0`` for an empty reaction).
    """
    ddG0_forward = 0
    for compound_id, coeff in rxn_dict.items():
        if novel_mets is not None and compound_id in novel_mets:
            comp = novel_mets[compound_id]
        else:
            if ccache is None:
                ccache = CompoundCacher()
            comp = ccache.get_compound(compound_id)
        ddG0_forward += coeff * comp.transform_pH7(pH, I, T)

    return ddG0_forward
340
+
341
+
342
def get_dG0(rxn_dict, rid, pH, I, loaded_model, molsig_r1, molsig_r2, novel_decomposed_r1, novel_decomposed_r2, novel_mets):
    """Predict the reaction energy and its standard deviation.

    The group-change features from ``get_rule`` are fed to
    ``loaded_model.predict(..., return_std=True)``; the value returned by
    ``get_ddG0`` is then added to the predicted mean.

    Parameters
    ----------
    rxn_dict : dict
        ``{compound_id: coefficient}``.
    rid : object
        Not used by this function; kept so existing callers keep working.
    pH, I : float
        Forwarded to ``get_ddG0``.
    loaded_model : object
        Model whose ``predict`` accepts ``return_std=True``.
    molsig_r1, molsig_r2, novel_decomposed_r1, novel_decomposed_r2
        Forwarded to ``get_rule``.
    novel_mets : dict or None
        Forwarded to ``get_ddG0``.

    Returns
    -------
    tuple
        ``(mean, std, rule_df1, rule_df2)``.
    """
    rule_comb, rule_df1, rule_df2 = get_rule(
        rxn_dict, molsig_r1, molsig_r2, novel_decomposed_r1, novel_decomposed_r2)

    ymean, ystd = loaded_model.predict(rule_comb, return_std=True)

    dG0 = ymean[0] + get_ddG0(rxn_dict, pH, I, novel_mets)
    return dG0, ystd[0], rule_df1, rule_df2
367
+ # return ymean[0],ystd[0]
368
+
369
+
370
def parse_novel_molecule(add_info):
    """Map each ``{id: InChI}`` entry to ``Compound.from_inchi('Test', id, InChI)``."""
    return {cid: Compound.from_inchi('Test', cid, InChI)
            for cid, InChI in add_info.items()}
376
+
377
+
378
def parse_novel_smiles(result):
    """Return ``{id: compound.smiles_pH7}`` for every compound in ``result``."""
    return {cid: compound.smiles_pH7 for cid, compound in result.items()}
384
+
385
+
386
def main():
    """Streamlit page: read a reaction, draw it and show the prediction.

    The user enters a reaction with KEGG ids, optionally supplies InChI
    strings for compounds that are not in KEGG, and picks pH and ionic
    strength.  Pressing "Search" draws the reaction and displays the value
    returned by ``get_dG0`` together with the group changes.
    """
    db_smiles = load_smiles()
    molsig_r1 = load_molsig_rad1()
    molsig_r2 = load_molsig_rad2()

    loaded_model = load_model()
    # The result is not used here; the call is kept so the cached loader
    # still runs as it did before.
    load_compound_cache()

    st.image('./figures/header.png', use_column_width=True)

    st.subheader('Reaction (please use KEGG IDs)')

    rxn_str = st.text_input(
        '', value='C01745 + C00004 <=> N00001 + C00003 + C00001')

    has_novel = st.checkbox('Reaction has metabolites not in KEGG')
    add_info = None
    if has_novel:
        add_info = st.text_area('Additional information (id: InChI):',
                                '{"N00001":"InChI=1S/C14H12O/c15-14-8-4-7-13(11-14)10-9-12-5-2-1-3-6-12/h1-11,15H/b10-9+"}')

    pH = st.slider('pH', min_value=0.0, max_value=14.0, value=7.0, step=0.1)
    I = st.slider('Ionic strength [M]', min_value=0.0,
                  max_value=0.5, value=0.1, step=0.01)

    if not st.button("Search"):
        return

    st.subheader('Reaction Equation')
    st.write(rxn_str)
    with st.spinner('Searching...'):
        novel_mets = None
        novel_smiles = None
        novel_decomposed_r1 = None
        novel_decomposed_r2 = None

        # Only parse the extra information when the user supplied it; the
        # original fed a dummy '{"None":"None"}' string through the parser
        # and relied on the resulting exception.
        if has_novel:
            try:
                parsed_mets = parse_novel_molecule(json.loads(add_info))
                parsed_smiles = parse_novel_smiles(parsed_mets)
                parsed_r1 = decompse_novel_mets_rad1(parsed_smiles)
                parsed_r2 = decompse_novel_mets_rad2(parsed_smiles)
            except Exception as e:
                # Still best-effort (all-or-nothing, as before), but the
                # user is now told why the extra compounds were ignored.
                st.warning('Could not use the additional metabolite '
                           'information (%s); continuing without it.' % e)
            else:
                novel_mets = parsed_mets
                novel_smiles = parsed_smiles
                novel_decomposed_r1 = parsed_r1
                novel_decomposed_r2 = parsed_r2

        rxn_dict = parse_formula(rxn_str)
        st.image(draw_rxn_figure(rxn_dict, db_smiles,
                                 novel_smiles), use_column_width=True)

    st.subheader('Thermodynamics')
    with st.spinner('Calculating...'):
        mu, std, rule_df1, rule_df2 = get_dG0(
            rxn_dict, 'R00801', pH, I, loaded_model, molsig_r1, molsig_r2, novel_decomposed_r1, novel_decomposed_r2, novel_mets)
    st.write(r"$\Delta_r G'^{o} = %.2f \pm %.2f \ kJ/mol$" % (mu, std))
    st.text('Group changes:')
    # Show only the groups whose count actually changes.
    st.write(rule_df1[(rule_df1.T != 0).any()])
    st.write(rule_df2[(rule_df2.T != 0).any()])
475
+
476
+
477
# Script entry point.
if __name__ == "__main__":
    main()
mini_novoStoic.py ADDED
@@ -0,0 +1,326 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import pulp
3
+ import pdb
4
+ import os
5
+ import json
6
+ from rdkit import Chem
7
+
8
+ # pulp_solver = pulp.solvers.CPLEX_CMD(path=None, keepFiles=0, mip=1, msg=1,
9
+ # options=['mip tolerances mipgap 0', 'mip tolerances absmipgap 0',
10
+ # 'mip tolerances integrality 0', 'simplex tolerances optimality 1E-9',
11
+ # 'simplex tolerances feasibility 1E-9',], timelimit=1200)
12
+
13
def count_substructures(radius, molecule):
    """Count the atom-centred substructures (moieties) of a molecule.

    For every atom, the bond environment of the given radius is extracted
    with RDKit, converted to a canonical fragment SMILES and tallied. The
    relaxed signature requires the number of each substructure to construct
    a matrix for each molecule.

    Parameters
    ----------
    radius : int
        the radius is bond-distance that defines how many neighbor atoms
        should be considered in a reaction center.
    molecule : Molecule
        a molecule object created by RDKit (e.g.
        Chem.MolFromInchi(inchi_code) or Chem.MolFromSmiles(smiles_code))

    Returns
    -------
    dict
        dictionary of molecular signature for a molecule,
        {fragment_smiles: number_of_occurrences}
    """
    smi_count = {}

    for atom_idx, _atom in enumerate(molecule.GetAtoms()):
        env = Chem.FindAtomEnvironmentOfRadiusN(molecule, radius, atom_idx)

        # collect the atoms at both ends of every bond in the environment
        atoms = set()
        for bidx in env:
            bond = molecule.GetBondWithIdx(bidx)
            atoms.add(bond.GetBeginAtomIdx())
            atoms.add(bond.GetEndAtomIdx())

        # only one atom is in this environment, such as O in H2O
        if not atoms:
            atoms = {atom_idx}

        smi = Chem.MolFragmentToSmiles(
            molecule, atomsToUse=list(atoms), bondsToUse=env, canonical=True
        )
        smi_count[smi] = smi_count.get(smi, 0) + 1

    return smi_count
54
+
55
def novoStoic_minFlux_relaxedRule(exchange_mets, novel_mets, project, iterations, pulp_solver, use_direction):
    """Apply reaction rules generated in a more relaxed manner to search for
    reaction rules that are able to fill the gap between the source and sink
    metabolites.

    - rePrime procedure is more similar to a morgan fingerprints
    - the relaxed rule is generated from substructures without considering
      the bond that connect the atoms at the edge of the substructure to the
      rest of the molecules

    The moiety signatures, the relaxed rules and the rule directions are read
    from ``./data`` (``decompose_vector_ac.json``,
    ``relaxed_rule_noduplic.csv`` and ``direction.csv``).

    Parameters
    ----------
    exchange_mets : dict
        overall stoichiometry of source and sink metabolites,
        {met: stoic,...}. This is an important input for novoStoic to run
        correctly because the method requires that overall moieties are
        balanced.
    novel_mets : dict
        metabolites that are not in the database, {met: smiles,...}; their
        radius-1 moiety counts are computed with ``count_substructures``.
    project : string
        directory in which the solution file is written; it is created when
        it does not exist.
    iterations : int
        the number of iterations of searching for alternative solutions
    pulp_solver : object
        PuLP solver instance handed to ``LpProblem.solve``.
    use_direction : bool
        when true, the flux of each rule is restricted according to its
        entry in ``direction.csv`` and results go to
        ``solution_use_direction.txt``; otherwise every rule may run in both
        directions and results go to ``solution_no_direction.txt``.

    Returns
    -------
    None
        all the outputs are saved in the project folder.

    Raises
    ------
    ValueError
        if the SMILES of a novel metabolite cannot be parsed by RDKit.
    """
    os.makedirs(project, exist_ok=True)

    # the maximum flux of a reaction
    M = 2

    data_dir = './data'

    # read files with molecular signatures and reaction rules
    with open(os.path.join(data_dir, 'decompose_vector_ac.json')) as sig_file:
        molecular_signature = json.load(sig_file)
    molsigs = pd.DataFrame.from_dict(molecular_signature).fillna(0)

    rules = pd.read_csv(
        os.path.join(data_dir, "relaxed_rule_noduplic.csv"), index_col=0
    )

    ###### sets ############
    moiety_index = rules.index.tolist()  # moiety sets
    rules_index = rules.columns.values.tolist()
    print("Number of rules used in this search:", len(rules_index))

    exchange_index = exchange_mets.keys()

    ###### parameters ######
    # T(m,r) contains atom stoichiometry for each rule
    T = rules.to_dict(orient="index")

    # C(m,i) contains moiety cardinality for each metabolite
    C = molsigs.to_dict(orient="index")
    for m in moiety_index:
        # C00080 and C00282 are given an all-zero signature
        C[m]["C00080"] = 0
        C[m]["C00282"] = 0

    # add metabolites that are not present in current database
    for met in novel_mets:
        smiles = novel_mets[met]
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # MolFromSmiles signals a parse failure by returning None
            raise ValueError(
                "Cannot parse SMILES %r of novel metabolite %r" % (smiles, met)
            )
        mol = Chem.RemoveHs(mol)
        molsigs_product_dict = count_substructures(1, mol)

        for m in moiety_index:
            C[m][met] = molsigs_product_dict.get(m, 0)

    ###### variables ######
    v_rule = pulp.LpVariable.dicts(
        "v_rule", rules_index, lowBound=-M, upBound=M, cat="Integer"
    )
    v_rule_obj = pulp.LpVariable.dicts(
        "v_rule_obj", rules_index, lowBound=0, upBound=M, cat="Continuous"
    )

    v_EX = pulp.LpVariable.dicts(
        "v_EX", exchange_index, lowBound=-M, upBound=M, cat="Continuous"
    )
    y_rule = pulp.LpVariable.dicts(
        "y", rules_index, lowBound=0, upBound=1, cat="Binary"
    )

    # create MILP problem
    lp_prob = pulp.LpProblem("novoStoic", pulp.LpMinimize)

    ####### objective function ####
    lp_prob += pulp.lpSum([v_rule_obj[j] for j in rules_index])

    ####### constraints ####
    # constraint 1: moiety change balance
    for moiety_pos, m in enumerate(moiety_index):
        lp_prob += (
            pulp.lpSum([T[m][r] * v_rule[r] for r in rules_index if T[m][r] != 0])
            == pulp.lpSum([C[m][i] * v_EX[i] for i in exchange_index if C[m][i] != 0]),
            "moiety_balance_" + str(moiety_pos),
        )

    # constraint 2: constraint for exchange reactions
    for i, stoic in exchange_mets.items():
        lp_prob += v_EX[i] == stoic, "exchange" + i

    # constraint 3: control the number of rules
    direction_df = pd.read_csv(
        os.path.join(data_dir, "direction.csv"), index_col=0
    )
    direction_df.index = direction_df['reaction']

    # direction: 0-reversible, 1-backward, 2-forward
    direction = direction_df['direction'].to_dict()

    if use_direction:
        soln_file = os.path.join(project, "solution_use_direction.txt")
        for j in rules_index:
            rule_direction = direction[j]
            if rule_direction == 0:
                lp_prob += v_rule[j] >= y_rule[j] * -M, "cons1_%s" % j
                lp_prob += v_rule[j] <= y_rule[j] * M, "cons2_%s" % j
            elif rule_direction == 1:
                lp_prob += v_rule[j] >= y_rule[j] * -M, "cons1_%s" % j
                lp_prob += v_rule[j] <= 0, "cons2_%s" % j
            elif rule_direction == 2:
                lp_prob += v_rule[j] >= 0, "cons1_%s" % j
                lp_prob += v_rule[j] <= y_rule[j] * M, "cons2_%s" % j
    else:
        soln_file = os.path.join(project, "solution_no_direction.txt")
        for j in rules_index:
            lp_prob += v_rule[j] >= y_rule[j] * -M, "cons1_%s" % j
            lp_prob += v_rule[j] <= y_rule[j] * M, "cons2_%s" % j

    # constraint 4: v_rule_obj bounds the absolute value of v_rule
    for j in rules_index:
        lp_prob += v_rule_obj[j] >= v_rule[j]
        lp_prob += v_rule_obj[j] >= -v_rule[j]

    # constraint 5: customized constraints
    # the number of steps of the pathway
    lp_prob += pulp.lpSum([v_rule_obj[j] for j in rules_index]) == 2

    ### solve
    integer_cuts(lp_prob, pulp_solver, iterations, rules_index, y_rule, v_rule, soln_file, direction)
212
+
213
def integer_cuts(lp_prob, pulp_solver, iterations, rules_index, y_rule, v_rule, soln_file, direction):
    """Add integer cut constraints to a mixed-integer linear programming
    problem (MILP). The aim of such constraints is to find alternative
    solutions by adding constraints to exclude the already explored
    solutions.

    Reference: Optimization Methods in Metabolic Networks By Costas D.
    Maranas, Ali R. Zomorrodi, Chapter 4.2.2 Finding alternative optimal
    integer solutions

    Parameters
    ----------
    lp_prob : pulp.LpProblem
        the problem to solve; one integer-cut constraint is appended to it
        after every optimal solution.
    pulp_solver : object
        PuLP solver instance handed to ``lp_prob.solve``.
    iterations : int
        maximum number of solutions to enumerate; the loop stops earlier as
        soon as the solver status is not 'Optimal'.
    rules_index : list
        identifiers of the reaction rules.
    y_rule : dict
        binary variable of each rule, keyed by rule identifier.
    v_rule : dict
        flux variable of each rule, keyed by rule identifier.
    soln_file : string
        path of the text file the solutions are appended to.
    direction : dict
        direction code of each rule (1-backward, 2-forward); a rule used
        against its direction is flagged in the output.

    Returns
    -------
    None
        solutions are printed and appended to ``soln_file``.
    """
    for sol_num in range(1, iterations + 1):
        integer_cut_rules = []

        # optional output: lp file for debug
        lp_prob.writeLP('./test.lp')
        lp_prob.solve(pulp_solver)

        print("Status:", pulp.LpStatus[lp_prob.status])

        if pulp.LpStatus[lp_prob.status] != 'Optimal':
            break

        print('-----------rules--------------')
        # open the solution file once per iteration instead of once per rule
        with open(soln_file, 'a') as f:
            f.write('iteration,' + str(sol_num))
            f.write('\n')

            for r in rules_index:
                flux = v_rule[r].varValue
                # only rules carrying flux are part of the solution
                if flux >= 0.1 or flux <= -0.1:
                    dG_info = ''
                    if (flux > 0 and direction[r] == 1) or (flux < 0 and direction[r] == 2):
                        dG_info = ' * Thermodynamically infeasible'
                        print("##### Found ####: " + str(r) + dG_info)
                    integer_cut_rules.append(r)
                    print(r, flux)

                    f.write(r + ',' + str(flux) + dG_info)
                    f.write('\n')

        # exclude this exact combination of rules from later iterations
        length = len(integer_cut_rules) - 1
        lp_prob += (
            pulp.lpSum([y_rule[r] for r in integer_cut_rules]) <= length,
            "integer_cut_" + str(sol_num),
        )
271
+
272
+
273
def test_bdo():
    """Run the novoStoic search for the '14bdo' example, first with and then
    without the rule-direction constraints."""
    # '14bdo' is not in the database, so its SMILES is supplied here
    novel_mets = {'14bdo': 'OCCCCO'}

    # overall stoichiometry: negative = consumed, positive = produced
    exchange_mets = {
        'C00091': -1,  # Succinyl-CoA
        'C00004': -4,  # NADH
        'C00003': 4,   # NAD+
        'C00010': 1,   # coa
        'C00001': 1,   # h2O
        '14bdo': 1,
    }

    project = './novoStoic_result'
    iterations = 50

    # CPLEX is looked up on the PATH (path=None)
    pulp_solver = pulp.CPLEX_CMD(path=None, keepFiles=0, mip=1, msg=1)

    for use_direction in (True, False):
        novoStoic_minFlux_relaxedRule(
            exchange_mets, novel_mets, project, iterations, pulp_solver, use_direction
        )
299
+
300
+
301
def test_isovalarate():
    """Run the novoStoic search for the 2-keto isovalarate to isobutanol
    example without the rule-direction constraints."""
    # every metabolite of this example is already in the database
    novel_mets = {}

    # overall stoichiometry: negative = consumed, positive = produced
    exchange_mets = {
        'C00141': -1,  # 2-keto isovalarate
        'C00004': -1,  # NADH
        'C00003': 1,   # NAD+
        "C14710": 1,   # isobutanol C4H10O
        'C00011': 1,   # co2
    }

    project = './novoStoic_isovalarate'
    iterations = 50

    # CPLEX is looked up on the PATH (path=None)
    pulp_solver = pulp.CPLEX_CMD(path=None, keepFiles=0, mip=1, msg=1)

    novoStoic_minFlux_relaxedRule(
        exchange_mets, novel_mets, project, iterations, pulp_solver, False
    )
324
+
325
# Run the isovalarate example when this module is executed as a script.
if __name__ == "__main__":
    test_isovalarate()
model_gen.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from scipy.io import savemat, loadmat
2
+ import pandas as pd
3
+ import pdb
4
+ import json
5
+ import numpy as np
6
+ from numpy import median, mean
7
+ from sklearn.linear_model import BayesianRidge, LinearRegression, RidgeCV, Ridge
8
+ from sklearn.neural_network import MLPRegressor
9
+ from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
10
+ from sklearn.model_selection import cross_val_score, LeaveOneOut
11
+ import joblib
12
+ import pickle
13
+ import matplotlib.pyplot as plt
14
+ import sys
15
+ import os.path
16
+ import glob, os
17
+ import openbabel
18
+ from IPython.display import clear_output
19
+ import timeit
20
+
21
+
22
# Fit the combined (radius 1 + radius 2) Bayesian ridge model on the training
# matrix and store it on disk, timing both the dump and the reload.
ac = loadmat('./data/Test_KEGG_all_grp.mat')

# loadmat returns 2-D arrays; the regression target must be 1-D
y = ac['y']
y = y.flatten()

# NOTE(review): not used by the BayesianRidge fit below; kept so the
# module-level name stays available.
alphas = np.logspace(-6, 6, 200)

Xrc = ac['X_comb_all']
regr_rcombined = BayesianRidge(tol=1e-6, fit_intercept=False, compute_score=True).fit(Xrc, y)

# in-sample fit quality (evaluated on the training data itself)
y_pred_rc = regr_rcombined.predict(Xrc)
mse_rc = mean_squared_error(y, y_pred_rc)
r2 = r2_score(y, y_pred_rc)


print('radius 1+2 linear model')
print('Mean squared error: %.2f'
      % mse_rc)
print('Coefficient of determination: %.4f'
      % r2)


# single definition of the model path, shared by dump and load
filename = './model/M12_model_BR.pkl'

s0 = timeit.default_timer()
joblib.dump(regr_rcombined, filename, compress=3)
s1 = timeit.default_timer()
print(s1 - s0)

s0 = timeit.default_timer()
# pass the path so joblib opens and closes the file itself (no leaked handle)
loaded_model = joblib.load(filename)
s1 = timeit.default_timer()
print(s1 - s0)
print('==================================')
reaction_rule_2_gen.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import pdb
3
+ import json
4
+ from rdkit import Chem
5
+
6
# Build the radius-2 reaction-rule matrix: for every reaction, the
# stoichiometry-weighted sum of the moiety vectors of its metabolites.
with open('./data/optstoic_v3_Sji_dict.json') as reaction_file:
    reaction_dict = json.load(reaction_file)
with open('./data/decompose_vector_ac_r2_py3_indent_modified_manual.json') as signature_file:
    molecular_signature = json.load(signature_file)
molsigna_df = pd.DataFrame.from_dict(molecular_signature).fillna(0)
all_mets = molsigna_df.columns.tolist()
all_mets.append("C00080")
all_mets.append("C00282")
# set copy for constant-time membership tests in the loop below
known_mets = set(all_mets)


rule_df = pd.DataFrame(index=molsigna_df.index)
for rid, value in reaction_dict.items():
    # skip the reactions with missing metabolites
    if any(met not in known_mets for met in value):
        continue

    rule_df[rid] = 0
    for met, stoic in value.items():
        if met == "C00080" or met == "C00282":
            continue  # hydrogen is zero
        rule_df[rid] += molsigna_df[met] * stoic
rule_df.to_csv("./data/reaction_rule_r2_py3_manual_modified.csv", index=True)
retrieve_bulk.ipynb ADDED
@@ -0,0 +1,660 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stderr",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "2021-08-13 17:29:46.477 INFO rdkit: Enabling RDKit 2021.03.4 jupyter extensions\n"
13
+ ]
14
+ }
15
+ ],
16
+ "source": [
17
+ "import streamlit as st\n",
18
+ "import pandas as pd\n",
19
+ "import numpy as np\n",
20
+ "import re\n",
21
+ "from PIL import Image\n",
22
+ "import webbrowser\n",
23
+ "import json\n",
24
+ "import pickle\n",
25
+ "import sys \n",
26
+ "import joblib\n",
27
+ "\n",
28
+ "sys.path.append('./CC/')\n",
29
+ "\n",
30
+ "import chemaxon\n",
31
+ "from chemaxon import *\n",
32
+ "from compound import Compound\n",
33
+ "from compound_cacher import CompoundCacher\n",
34
+ "from rdkit.Chem import rdChemReactions as Reactions\n",
35
+ "from rdkit.Chem import Draw\n",
36
+ "from rdkit import Chem"
37
+ ]
38
+ },
39
+ {
40
+ "cell_type": "code",
41
+ "execution_count": 2,
42
+ "metadata": {},
43
+ "outputs": [],
44
+ "source": [
45
+ "def load_smiles():\n",
46
+ " db = pd.read_csv('./data/cache_compounds_20160818.csv',\n",
47
+ " index_col='compound_id')\n",
48
+ " db_smiles = db['smiles_pH7'].to_dict()\n",
49
+ " return db_smiles\n",
50
+ "\n",
51
+ "def load_molsig_rad1():\n",
52
+ " molecular_signature_r1 = json.load(open('./data/decompose_vector_ac.json'))\n",
53
+ " return molecular_signature_r1\n",
54
+ "\n",
55
+ "\n",
56
+ "def load_molsig_rad2():\n",
57
+ " molecular_signature_r2 = json.load(\n",
58
+ " open('./data/decompose_vector_ac_r2_py3_indent_modified_manual.json'))\n",
59
+ " return molecular_signature_r2\n",
60
+ "\n",
61
+ "\n",
62
+ "def load_model():\n",
63
+ " filename = './model/M12_model_BR.pkl'\n",
64
+ " loaded_model = joblib.load(open(filename, 'rb'))\n",
65
+ " return loaded_model\n",
66
+ "\n",
67
+ "\n",
68
+ "def load_compound_cache():\n",
69
+ " ccache = CompoundCacher()\n",
70
+ " return ccache\n"
71
+ ]
72
+ },
73
+ {
74
+ "cell_type": "code",
75
+ "execution_count": 3,
76
+ "metadata": {},
77
+ "outputs": [],
78
+ "source": [
79
+ "def count_substructures(radius, molecule):\n",
80
+ " \"\"\"Helper function for get the information of molecular signature of a\n",
81
+ " metabolite. The relaxed signature requires the number of each substructure\n",
82
+ " to construct a matrix for each molecule.\n",
83
+ " Parameters\n",
84
+ " ----------\n",
85
+ " radius : int\n",
86
+ " the radius is bond-distance that defines how many neighbor atoms should\n",
87
+ " be considered in a reaction center.\n",
88
+ " molecule : Molecule\n",
89
+ " a molecule object create by RDkit (e.g. Chem.MolFromInchi(inchi_code)\n",
90
+ " or Chem.MolToSmiles(smiles_code))\n",
91
+ " Returns\n",
92
+ " -------\n",
93
+ " dict\n",
94
+ " dictionary of molecular signature for a molecule,\n",
95
+ " {smiles: molecular_signature}\n",
96
+ " \"\"\"\n",
97
+ " m = molecule\n",
98
+ " smi_count = dict()\n",
99
+ " atomList = [atom for atom in m.GetAtoms()]\n",
100
+ "\n",
101
+ " for i in range(len(atomList)):\n",
102
+ " env = Chem.FindAtomEnvironmentOfRadiusN(m, radius, i)\n",
103
+ " atoms = set()\n",
104
+ " for bidx in env:\n",
105
+ " atoms.add(m.GetBondWithIdx(bidx).GetBeginAtomIdx())\n",
106
+ " atoms.add(m.GetBondWithIdx(bidx).GetEndAtomIdx())\n",
107
+ "\n",
108
+ " # only one atom is in this environment, such as O in H2O\n",
109
+ " if len(atoms) == 0:\n",
110
+ " atoms = {i}\n",
111
+ "\n",
112
+ " smi = Chem.MolFragmentToSmiles(m, atomsToUse=list(atoms),\n",
113
+ " bondsToUse=env, canonical=True)\n",
114
+ "\n",
115
+ " if smi in smi_count:\n",
116
+ " smi_count[smi] = smi_count[smi] + 1\n",
117
+ " else:\n",
118
+ " smi_count[smi] = 1\n",
119
+ " return smi_count\n"
120
+ ]
121
+ },
122
+ {
123
+ "cell_type": "code",
124
+ "execution_count": 4,
125
+ "metadata": {},
126
+ "outputs": [],
127
+ "source": [
128
+ "def decompse_novel_mets_rad1(novel_smiles, radius=1):\n",
129
+ " decompose_vector = dict()\n",
130
+ "\n",
131
+ " for cid, smiles_pH7 in novel_smiles.items():\n",
132
+ " mol = Chem.MolFromSmiles(smiles_pH7)\n",
133
+ " mol = Chem.RemoveHs(mol)\n",
134
+ " # Chem.RemoveStereochemistry(mol)\n",
135
+ " smi_count = count_substructures(radius, mol)\n",
136
+ " decompose_vector[cid] = smi_count\n",
137
+ " return decompose_vector\n",
138
+ "\n",
139
+ "\n",
140
+ "def decompse_novel_mets_rad2(novel_smiles, radius=2):\n",
141
+ " decompose_vector = dict()\n",
142
+ "\n",
143
+ " for cid, smiles_pH7 in novel_smiles.items():\n",
144
+ " mol = Chem.MolFromSmiles(smiles_pH7)\n",
145
+ " mol = Chem.RemoveHs(mol)\n",
146
+ " # Chem.RemoveStereochemistry(mol)\n",
147
+ " smi_count = count_substructures(radius, mol)\n",
148
+ " decompose_vector[cid] = smi_count\n",
149
+ " return decompose_vector\n"
150
+ ]
151
+ },
152
+ {
153
+ "cell_type": "code",
154
+ "execution_count": 5,
155
+ "metadata": {},
156
+ "outputs": [],
157
+ "source": [
158
+ "def parse_reaction_formula_side(s):\n",
159
+ " \"\"\"\n",
160
+ " Parses the side formula, e.g. '2 C00001 + C00002 + 3 C00003'\n",
161
+ " Ignores stoichiometry.\n",
162
+ "\n",
163
+ " Returns:\n",
164
+ " The set of CIDs.\n",
165
+ " \"\"\"\n",
166
+ " if s.strip() == \"null\":\n",
167
+ " return {}\n",
168
+ "\n",
169
+ " compound_bag = {}\n",
170
+ " for member in re.split('\\s+\\+\\s+', s):\n",
171
+ " tokens = member.split(None, 1)\n",
172
+ " if len(tokens) == 0:\n",
173
+ " continue\n",
174
+ " if len(tokens) == 1:\n",
175
+ " amount = 1\n",
176
+ " key = member\n",
177
+ " else:\n",
178
+ " amount = float(tokens[0])\n",
179
+ " key = tokens[1]\n",
180
+ "\n",
181
+ " compound_bag[key] = compound_bag.get(key, 0) + amount\n",
182
+ "\n",
183
+ " return compound_bag\n",
184
+ "\n",
185
+ "\n",
186
+ "def parse_formula(formula, arrow='<=>', rid=None):\n",
187
+ " \"\"\"\n",
188
+ " Parses a two-sided formula such as: 2 C00001 => C00002 + C00003\n",
189
+ "\n",
190
+ " Return:\n",
191
+ " The set of substrates, products and the direction of the reaction\n",
192
+ " \"\"\"\n",
193
+ " tokens = formula.split(arrow)\n",
194
+ " if len(tokens) < 2:\n",
195
+ " print(('Reaction does not contain the arrow sign (%s): %s'\n",
196
+ " % (arrow, formula)))\n",
197
+ " if len(tokens) > 2:\n",
198
+ " print(('Reaction contains more than one arrow sign (%s): %s'\n",
199
+ " % (arrow, formula)))\n",
200
+ "\n",
201
+ " left = tokens[0].strip()\n",
202
+ " right = tokens[1].strip()\n",
203
+ "\n",
204
+ " sparse_reaction = {}\n",
205
+ " for cid, count in parse_reaction_formula_side(left).items():\n",
206
+ " sparse_reaction[cid] = sparse_reaction.get(cid, 0) - count\n",
207
+ "\n",
208
+ " for cid, count in parse_reaction_formula_side(right).items():\n",
209
+ " sparse_reaction[cid] = sparse_reaction.get(cid, 0) + count\n",
210
+ "\n",
211
+ " return sparse_reaction\n"
212
+ ]
213
+ },
214
+ {
215
+ "cell_type": "code",
216
+ "execution_count": 6,
217
+ "metadata": {},
218
+ "outputs": [],
219
+ "source": [
220
+ "def draw_rxn_figure(rxn_dict, db_smiles, novel_smiles):\n",
221
+ " # db_smiles = load_smiles()\n",
222
+ "\n",
223
+ " left = ''\n",
224
+ " right = ''\n",
225
+ "\n",
226
+ " for met, stoic in rxn_dict.items():\n",
227
+ " if met == \"C00080\" or met == \"C00282\":\n",
228
+ " continue # hydrogen is not considered\n",
229
+ " if stoic > 0:\n",
230
+ " if met in db_smiles:\n",
231
+ " right = right + db_smiles[met] + '.'\n",
232
+ " else:\n",
233
+ " right = right + novel_smiles[met] + '.'\n",
234
+ " else:\n",
235
+ " if met in db_smiles:\n",
236
+ " left = left + db_smiles[met] + '.'\n",
237
+ " else:\n",
238
+ " left = left + novel_smiles[met] + '.'\n",
239
+ " smarts = left[:-1] + '>>' + right[:-1]\n",
240
+ " # print smarts\n",
241
+ " smarts = str(smarts)\n",
242
+ " rxn = Reactions.ReactionFromSmarts(smarts, useSmiles=True)\n",
243
+ " return Draw.ReactionToImage(rxn) # , subImgSize=(400, 400))"
244
+ ]
245
+ },
246
+ {
247
+ "cell_type": "code",
248
+ "execution_count": 7,
249
+ "metadata": {},
250
+ "outputs": [],
251
+ "source": [
252
+ "def get_rule(rxn_dict, molsig1, molsig2, novel_decomposed1, novel_decomposed2):\n",
253
+ " if novel_decomposed1 != None:\n",
254
+ " for cid in novel_decomposed1:\n",
255
+ " molsig1[cid] = novel_decomposed1[cid]\n",
256
+ " if novel_decomposed2 != None:\n",
257
+ " for cid in novel_decomposed2:\n",
258
+ " molsig2[cid] = novel_decomposed2[cid]\n",
259
+ "\n",
260
+ " molsigna_df1 = pd.DataFrame.from_dict(molsig1).fillna(0)\n",
261
+ " all_mets1 = molsigna_df1.columns.tolist()\n",
262
+ " all_mets1.append(\"C00080\")\n",
263
+ " all_mets1.append(\"C00282\")\n",
264
+ "\n",
265
+ " molsigna_df2 = pd.DataFrame.from_dict(molsig2).fillna(0)\n",
266
+ " all_mets2 = molsigna_df2.columns.tolist()\n",
267
+ " all_mets2.append(\"C00080\")\n",
268
+ " all_mets2.append(\"C00282\")\n",
269
+ "\n",
270
+ " moieties_r1 = open('./data/group_names_r1.txt')\n",
271
+ " moieties_r2 = open('./data/group_names_r2_py3_modified_manual.txt')\n",
272
+ " moie_r1 = moieties_r1.read().splitlines()\n",
273
+ " moie_r2 = moieties_r2.read().splitlines()\n",
274
+ "\n",
275
+ " molsigna_df1 = molsigna_df1.reindex(moie_r1)\n",
276
+ " molsigna_df2 = molsigna_df2.reindex(moie_r2)\n",
277
+ "\n",
278
+ " rule_df1 = pd.DataFrame(index=molsigna_df1.index)\n",
279
+ " rule_df2 = pd.DataFrame(index=molsigna_df2.index)\n",
280
+ " # for rid, value in reaction_dict.items():\n",
281
+ " # # skip the reactions with missing metabolites\n",
282
+ " # mets = value.keys()\n",
283
+ " # flag = False\n",
284
+ " # for met in mets:\n",
285
+ " # if met not in all_mets:\n",
286
+ " # flag = True\n",
287
+ " # break\n",
288
+ " # if flag: continue\n",
289
+ "\n",
290
+ " rule_df1['change'] = 0\n",
291
+ " for met, stoic in rxn_dict.items():\n",
292
+ " if met == \"C00080\" or met == \"C00282\":\n",
293
+ " continue # hydrogen is zero\n",
294
+ " rule_df1['change'] += molsigna_df1[met] * stoic\n",
295
+ "\n",
296
+ " rule_df2['change'] = 0\n",
297
+ " for met, stoic in rxn_dict.items():\n",
298
+ " if met == \"C00080\" or met == \"C00282\":\n",
299
+ " continue # hydrogen is zero\n",
300
+ " rule_df2['change'] += molsigna_df2[met] * stoic\n",
301
+ "\n",
302
+ " rule_vec1 = rule_df1.to_numpy().T\n",
303
+ " rule_vec2 = rule_df2.to_numpy().T\n",
304
+ "\n",
305
+ " m1, n1 = rule_vec1.shape\n",
306
+ " m2, n2 = rule_vec2.shape\n",
307
+ "\n",
308
+ " zeros1 = np.zeros((m1, 44))\n",
309
+ " zeros2 = np.zeros((m2, 44))\n",
310
+ " X1 = np.concatenate((rule_vec1, zeros1), 1)\n",
311
+ " X2 = np.concatenate((rule_vec2, zeros2), 1)\n",
312
+ "\n",
313
+ " rule_comb = np.concatenate((X1, X2), 1)\n",
314
+ "\n",
315
+ " # rule_df_final = {}\n",
316
+ " # rule_df_final['rad1'] = rule_df1\n",
317
+ " # rule_df_final['rad2'] = rule_df2\n",
318
+ " return rule_comb, rule_df1, rule_df2"
319
+ ]
320
+ },
321
+ {
322
+ "cell_type": "code",
323
+ "execution_count": 8,
324
+ "metadata": {},
325
+ "outputs": [],
326
+ "source": [
327
+ "def get_ddG0(rxn_dict, pH, I, novel_mets):\n",
328
+ " ccache = CompoundCacher()\n",
329
+ " # ddG0 = get_transform_ddG0(rxn_dict, ccache, pH, I, T)\n",
330
+ " T = 298.15\n",
331
+ " ddG0_forward = 0\n",
332
+ " for compound_id, coeff in rxn_dict.items():\n",
333
+ " if novel_mets != None and compound_id in novel_mets:\n",
334
+ " comp = novel_mets[compound_id]\n",
335
+ " else:\n",
336
+ " comp = ccache.get_compound(compound_id)\n",
337
+ " ddG0_forward += coeff * comp.transform_pH7(pH, I, T)\n",
338
+ "\n",
339
+ " return ddG0_forward"
340
+ ]
341
+ },
342
+ {
343
+ "cell_type": "code",
344
+ "execution_count": 9,
345
+ "metadata": {},
346
+ "outputs": [],
347
+ "source": [
348
+ "def get_dG0(rxn_dict, rid, pH, I, loaded_model, molsig_r1, molsig_r2, novel_decomposed_r1, novel_decomposed_r2, novel_mets):\n",
349
+ "\n",
350
+ " # rule_df = get_rxn_rule(rid)\n",
351
+ " rule_comb, rule_df1, rule_df2 = get_rule(\n",
352
+ " rxn_dict, molsig_r1, molsig_r2, novel_decomposed_r1, novel_decomposed_r2)\n",
353
+ "\n",
354
+ " X = rule_comb\n",
355
+ "\n",
356
+ " ymean, ystd = loaded_model.predict(X, return_std=True)\n",
357
+ "\n",
358
+ " result = {}\n",
359
+ " # result['dG0'] = ymean[0] + get_ddG0(rxn_dict, pH, I)\n",
360
+ " # result['standard deviation'] = ystd[0]\n",
361
+ "\n",
362
+ " # result_df = pd.DataFrame([result])\n",
363
+ " # result_df.style.hide_index()\n",
364
+ " # return result_df\n",
365
+ " return ymean[0] + get_ddG0(rxn_dict, pH, I, novel_mets), ystd[0], rule_df1, rule_df2\n",
366
+ " # return ymean[0],ystd[0]\n"
367
+ ]
368
+ },
369
+ {
370
+ "cell_type": "code",
371
+ "execution_count": 10,
372
+ "metadata": {},
373
+ "outputs": [],
374
+ "source": [
375
+ "def parse_novel_molecule(add_info):\n",
376
+ " result = {}\n",
377
+ " for cid, InChI in add_info.items():\n",
378
+ " c = Compound.from_inchi('Test', cid, InChI)\n",
379
+ " result[cid] = c\n",
380
+ " return result\n",
381
+ "\n",
382
+ "\n",
383
+ "def parse_novel_smiles(result):\n",
384
+ " novel_smiles = {}\n",
385
+ " for cid, c in result.items():\n",
386
+ " smiles = c.smiles_pH7\n",
387
+ " novel_smiles[cid] = smiles\n",
388
+ " return novel_smiles\n"
389
+ ]
390
+ },
391
+ {
392
+ "cell_type": "code",
393
+ "execution_count": 11,
394
+ "metadata": {},
395
+ "outputs": [],
396
+ "source": [
397
+ "db_smiles = load_smiles()\n",
398
+ "molsig_r1 = load_molsig_rad1()\n",
399
+ "molsig_r2 = load_molsig_rad2()\n",
400
+ "\n",
401
+ "loaded_model = load_model()\n",
402
+ "ccache = load_compound_cache()"
403
+ ]
404
+ },
405
+ {
406
+ "cell_type": "markdown",
407
+ "metadata": {},
408
+ "source": [
409
+ "## Estimating dG for reaction with novel metabolite"
410
+ ]
411
+ },
412
+ {
413
+ "cell_type": "code",
414
+ "execution_count": 12,
415
+ "metadata": {},
416
+ "outputs": [],
417
+ "source": [
418
+ "rxn_str = 'C01745 + C00004 <=> N00001 + C00003 + C00001'"
419
+ ]
420
+ },
421
+ {
422
+ "cell_type": "code",
423
+ "execution_count": 13,
424
+ "metadata": {},
425
+ "outputs": [
426
+ {
427
+ "data": {
428
+ "text/plain": [
429
+ "'C01745 + C00004 <=> N00001 + C00003 + C00001'"
430
+ ]
431
+ },
432
+ "execution_count": 13,
433
+ "metadata": {},
434
+ "output_type": "execute_result"
435
+ }
436
+ ],
437
+ "source": [
438
+ "rxn_str"
439
+ ]
440
+ },
441
+ {
442
+ "cell_type": "code",
443
+ "execution_count": 14,
444
+ "metadata": {},
445
+ "outputs": [],
446
+ "source": [
447
+ "add_info = {\"N00001\":\"InChI=1S/C14H12O/c15-14-8-4-7-13(11-14)10-9-12-5-2-1-3-6-12/h1-11,15H/b10-9+\"}"
448
+ ]
449
+ },
450
+ {
451
+ "cell_type": "code",
452
+ "execution_count": 15,
453
+ "metadata": {},
454
+ "outputs": [
455
+ {
456
+ "data": {
457
+ "text/plain": [
458
+ "'InChI=1S/C14H12O/c15-14-8-4-7-13(11-14)10-9-12-5-2-1-3-6-12/h1-11,15H/b10-9+'"
459
+ ]
460
+ },
461
+ "execution_count": 15,
462
+ "metadata": {},
463
+ "output_type": "execute_result"
464
+ }
465
+ ],
466
+ "source": [
467
+ "add_info['N00001']"
468
+ ]
469
+ },
470
+ {
471
+ "cell_type": "code",
472
+ "execution_count": 16,
473
+ "metadata": {},
474
+ "outputs": [],
475
+ "source": [
476
+ "pH = 7 # any number between 0-14 \n",
477
+ "I = 0.1 #min_value=0.0, max_value=0.5)"
478
+ ]
479
+ },
480
+ {
481
+ "cell_type": "code",
482
+ "execution_count": 17,
483
+ "metadata": {},
484
+ "outputs": [
485
+ {
486
+ "name": "stdout",
487
+ "output_type": "stream",
488
+ "text": [
489
+ "{'N00001': 'Oc1cccc(/C=C/c2ccccc2)c1'}\n"
490
+ ]
491
+ }
492
+ ],
493
+ "source": [
494
+ "try:\n",
495
+ " novel_mets = parse_novel_molecule(add_info)\n",
496
+ " novel_smiles = parse_novel_smiles(novel_mets)\n",
497
+ " novel_decomposed_r1 = decompse_novel_mets_rad1(novel_smiles)\n",
498
+ " novel_decomposed_r2 = decompse_novel_mets_rad2(novel_smiles)\n",
499
+ "\n",
500
+ "except Exception as e:\n",
501
+ " novel_mets = None\n",
502
+ " novel_smiles = None\n",
503
+ " novel_decomposed_r1 = None\n",
504
+ " novel_decomposed_r2 = None\n",
505
+ "\n",
506
+ "print(novel_smiles)\n"
507
+ ]
508
+ },
509
+ {
510
+ "cell_type": "code",
511
+ "execution_count": 18,
512
+ "metadata": {},
513
+ "outputs": [],
514
+ "source": [
515
+ "rxn_dict = parse_formula(rxn_str)"
516
+ ]
517
+ },
518
+ {
519
+ "cell_type": "code",
520
+ "execution_count": 19,
521
+ "metadata": {},
522
+ "outputs": [
523
+ {
524
+ "data": {
525
+ "image/png": "\n",
526
+ "text/plain": [
527
+ "<PIL.PngImagePlugin.PngImageFile image mode=RGB size=1200x200 at 0x1711E6902E0>"
528
+ ]
529
+ },
530
+ "execution_count": 19,
531
+ "metadata": {},
532
+ "output_type": "execute_result"
533
+ }
534
+ ],
535
+ "source": [
536
+ "draw_rxn_figure(rxn_dict, db_smiles,novel_smiles)"
537
+ ]
538
+ },
539
+ {
540
+ "cell_type": "code",
541
+ "execution_count": 20,
542
+ "metadata": {},
543
+ "outputs": [
544
+ {
545
+ "name": "stdout",
546
+ "output_type": "stream",
547
+ "text": [
548
+ "dG = -121.79 ± 100.57 kJ/mol\n"
549
+ ]
550
+ }
551
+ ],
552
+ "source": [
553
+ "mu, std, rule_df1, rule_df2 = get_dG0(rxn_dict, 'R00801', pH, I, loaded_model, molsig_r1, molsig_r2, novel_decomposed_r1, novel_decomposed_r2, novel_mets)\n",
554
+ "\n",
555
+ "print(\"dG = %.2f ± %.2f kJ/mol\" % (mu, std))\n",
556
+ "\n"
557
+ ]
558
+ },
559
+ {
560
+ "cell_type": "markdown",
561
+ "metadata": {},
562
+ "source": [
563
+ "## Bulk estimation of dG for a list of KEGG reactions"
564
+ ]
565
+ },
566
+ {
567
+ "cell_type": "code",
568
+ "execution_count": 12,
569
+ "metadata": {},
570
+ "outputs": [],
571
+ "source": [
572
+ "KEGG_rxn_list = {\"R00010\" : \"C01083 + C00001 <=> 2 C00031\",\n",
573
+ " \"R00303\" : \"C00092 + C00001 <=> C00031 + C00009\",\n",
574
+ " \"R00304\" : \"C00103 + C00001 <=> C00031 + C00009\",\n",
575
+ " \"R07294\" : \"C15524 + C00001 <=> C02137 + C00010\",\n",
576
+ " \"R01252\" : \"C00148 + C00026 + C00007 <=> C01157 + C00042 + C00011\",\n",
577
+ " \"R00406\" : \"C00091 + C00149 <=> C00042 + C04348\"\n",
578
+ " }"
579
+ ]
580
+ },
581
+ {
582
+ "cell_type": "code",
583
+ "execution_count": 14,
584
+ "metadata": {},
585
+ "outputs": [
586
+ {
587
+ "name": "stdout",
588
+ "output_type": "stream",
589
+ "text": [
590
+ "R00010\n",
591
+ "C01083 + C00001 <=> 2 C00031\n",
592
+ "dG = -12.45 ± 3.49 kJ/mol\n",
593
+ "R00303\n",
594
+ "C00092 + C00001 <=> C00031 + C00009\n",
595
+ "dG = -12.40 ± 3.30 kJ/mol\n",
596
+ "R00304\n",
597
+ "C00103 + C00001 <=> C00031 + C00009\n",
598
+ "dG = -18.78 ± 3.37 kJ/mol\n",
599
+ "R07294\n",
600
+ "C15524 + C00001 <=> C02137 + C00010\n",
601
+ "dG = -14.46 ± 31.43 kJ/mol\n",
602
+ "R01252\n",
603
+ "C00148 + C00026 + C00007 <=> C01157 + C00042 + C00011\n",
604
+ "dG = -427.04 ± 41.12 kJ/mol\n",
605
+ "R00406\n",
606
+ "C00091 + C00149 <=> C00042 + C04348\n",
607
+ "dG = -3.27 ± 4.37 kJ/mol\n"
608
+ ]
609
+ }
610
+ ],
611
+ "source": [
612
+ "pH = 7 # any number between 0-14 \n",
613
+ "I = 0.1 # any number between 0.0-0.5\n",
614
+ "\n",
615
+ "for keys in KEGG_rxn_list:\n",
616
+ " kegg_rxn_string = KEGG_rxn_list[keys]\n",
617
+ " kegg_rxn_dict = parse_formula(kegg_rxn_string)\n",
618
+ " mu, std, rule_df1, rule_df2 = get_dG0(kegg_rxn_dict, keys, pH, I, loaded_model, molsig_r1, molsig_r2, [], [], [])\n",
619
+ " print(keys)\n",
620
+ " print(kegg_rxn_string)\n",
621
+ " print(\"dG = %.2f ± %.2f kJ/mol\" % (mu, std))"
622
+ ]
623
+ },
624
+ {
625
+ "cell_type": "code",
626
+ "execution_count": null,
627
+ "metadata": {},
628
+ "outputs": [],
629
+ "source": []
630
+ },
631
+ {
632
+ "cell_type": "code",
633
+ "execution_count": null,
634
+ "metadata": {},
635
+ "outputs": [],
636
+ "source": []
637
+ }
638
+ ],
639
+ "metadata": {
640
+ "kernelspec": {
641
+ "display_name": "Python 3",
642
+ "language": "python",
643
+ "name": "python3"
644
+ },
645
+ "language_info": {
646
+ "codemirror_mode": {
647
+ "name": "ipython",
648
+ "version": 3
649
+ },
650
+ "file_extension": ".py",
651
+ "mimetype": "text/x-python",
652
+ "name": "python",
653
+ "nbconvert_exporter": "python",
654
+ "pygments_lexer": "ipython3",
655
+ "version": "3.8.10"
656
+ }
657
+ },
658
+ "nbformat": 4,
659
+ "nbformat_minor": 4
660
+ }