Spaces:
Runtime error
Runtime error
Upload 6 files
Browse files- CC/Untitled.ipynb +1038 -0
- CC/chemaxon.py +204 -0
- CC/compound.py +337 -0
- CC/compound_cacher.py +202 -0
- CC/molecule.py +292 -0
- CC/thermodynamic_constants.py +36 -0
CC/Untitled.ipynb
ADDED
@@ -0,0 +1,1038 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 1,
|
6 |
+
"id": "ed0cdaf6-71e1-4ef0-894f-0beabdc392cf",
|
7 |
+
"metadata": {},
|
8 |
+
"outputs": [],
|
9 |
+
"source": [
|
10 |
+
"import pandas as pd\n",
|
11 |
+
"import numpy as np\n",
|
12 |
+
"import re\n",
|
13 |
+
"from PIL import Image\n",
|
14 |
+
"import webbrowser\n",
|
15 |
+
"import json\n",
|
16 |
+
"import pickle\n",
|
17 |
+
"import sys \n",
|
18 |
+
"import joblib\n",
|
19 |
+
"import sys\n",
|
20 |
+
"\n",
|
21 |
+
"from rdkit import Chem\n",
|
22 |
+
"from rdkit.Chem import Draw\n",
|
23 |
+
"from rdkit.Chem import rdChemReactions as Reactions\n",
|
24 |
+
"\n",
|
25 |
+
"from compound_cacher import CompoundCacher\n",
|
26 |
+
"from compound import Compound\n",
|
27 |
+
"from chemaxon import *\n",
|
28 |
+
"import chemaxon"
|
29 |
+
]
|
30 |
+
},
|
31 |
+
{
|
32 |
+
"cell_type": "code",
|
33 |
+
"execution_count": 2,
|
34 |
+
"id": "e64deced-2a44-4d8e-ba8f-d9843f11724a",
|
35 |
+
"metadata": {},
|
36 |
+
"outputs": [],
|
37 |
+
"source": [
|
38 |
+
"def load_smiles():\n",
|
39 |
+
" db = pd.read_csv('./../data/cache_compounds_20160818.csv',index_col='compound_id')\n",
|
40 |
+
" db_smiles = db['smiles_pH7'].to_dict()\n",
|
41 |
+
" return db_smiles\n",
|
42 |
+
"\n",
|
43 |
+
"def load_molsig_rad1():\n",
|
44 |
+
" molecular_signature_r1 = json.load(open('./../data/decompose_vector_ac.json'))\n",
|
45 |
+
" return molecular_signature_r1\n",
|
46 |
+
"\n",
|
47 |
+
"def load_molsig_rad2():\n",
|
48 |
+
" molecular_signature_r2 = json.load(open('./../data/decompose_vector_ac_r2_py3_indent_modified_manual.json'))\n",
|
49 |
+
" return molecular_signature_r2\n",
|
50 |
+
"\n",
|
51 |
+
"def load_model():\n",
|
52 |
+
" filename = './../model/M12_model_BR.pkl'\n",
|
53 |
+
" loaded_model = joblib.load(open(filename, 'rb'))\n",
|
54 |
+
" return loaded_model"
|
55 |
+
]
|
56 |
+
},
|
57 |
+
{
|
58 |
+
"cell_type": "code",
|
59 |
+
"execution_count": 3,
|
60 |
+
"id": "71615c14-49c3-45e7-9495-194ef22fb1ee",
|
61 |
+
"metadata": {},
|
62 |
+
"outputs": [],
|
63 |
+
"source": [
|
64 |
+
"db_smiles = load_smiles()\n",
|
65 |
+
"molsig_r1 = load_molsig_rad1()\n",
|
66 |
+
"molsig_r2 = load_molsig_rad2()\n",
|
67 |
+
"loaded_model = load_model()"
|
68 |
+
]
|
69 |
+
},
|
70 |
+
{
|
71 |
+
"cell_type": "code",
|
72 |
+
"execution_count": 4,
|
73 |
+
"id": "b86b8049-cbf2-473f-8715-5e5f908193a2",
|
74 |
+
"metadata": {},
|
75 |
+
"outputs": [],
|
76 |
+
"source": [
|
77 |
+
"def parse_reaction_formula_side(s):\n",
|
78 |
+
" \"\"\"\n",
|
79 |
+
" Parses the side formula, e.g. '2 C00001 + C00002 + 3 C00003'\n",
|
80 |
+
" Ignores stoichiometry.\n",
|
81 |
+
"\n",
|
82 |
+
" Returns:\n",
|
83 |
+
" The set of CIDs.\n",
|
84 |
+
" \"\"\"\n",
|
85 |
+
" if s.strip() == \"null\":\n",
|
86 |
+
" return {}\n",
|
87 |
+
"\n",
|
88 |
+
" compound_bag = {}\n",
|
89 |
+
" for member in re.split('\\s+\\+\\s+', s):\n",
|
90 |
+
" tokens = member.split(None, 1)\n",
|
91 |
+
" if len(tokens) == 0:\n",
|
92 |
+
" continue\n",
|
93 |
+
" if len(tokens) == 1:\n",
|
94 |
+
" amount = 1\n",
|
95 |
+
" key = member\n",
|
96 |
+
" else:\n",
|
97 |
+
" amount = float(tokens[0])\n",
|
98 |
+
" key = tokens[1]\n",
|
99 |
+
"\n",
|
100 |
+
" compound_bag[key] = compound_bag.get(key, 0) + amount\n",
|
101 |
+
"\n",
|
102 |
+
" return compound_bag\n",
|
103 |
+
"\n",
|
104 |
+
"def parse_formula(formula, arrow='<=>', rid=None):\n",
|
105 |
+
" \"\"\"\n",
|
106 |
+
" Parses a two-sided formula such as: 2 C00001 => C00002 + C00003\n",
|
107 |
+
"\n",
|
108 |
+
" Return:\n",
|
109 |
+
" The set of substrates, products and the direction of the reaction\n",
|
110 |
+
" \"\"\"\n",
|
111 |
+
" tokens = formula.split(arrow)\n",
|
112 |
+
" if len(tokens) < 2:\n",
|
113 |
+
" print(('Reaction does not contain the arrow sign (%s): %s'\n",
|
114 |
+
" % (arrow, formula)))\n",
|
115 |
+
" if len(tokens) > 2:\n",
|
116 |
+
" print(('Reaction contains more than one arrow sign (%s): %s'\n",
|
117 |
+
" % (arrow, formula)))\n",
|
118 |
+
"\n",
|
119 |
+
" left = tokens[0].strip()\n",
|
120 |
+
" right = tokens[1].strip()\n",
|
121 |
+
"\n",
|
122 |
+
" sparse_reaction = {}\n",
|
123 |
+
" for cid, count in parse_reaction_formula_side(left).items():\n",
|
124 |
+
" sparse_reaction[cid] = sparse_reaction.get(cid, 0) - count\n",
|
125 |
+
"\n",
|
126 |
+
" for cid, count in parse_reaction_formula_side(right).items():\n",
|
127 |
+
" sparse_reaction[cid] = sparse_reaction.get(cid, 0) + count \n",
|
128 |
+
" \n",
|
129 |
+
" return sparse_reaction"
|
130 |
+
]
|
131 |
+
},
|
132 |
+
{
|
133 |
+
"cell_type": "code",
|
134 |
+
"execution_count": 5,
|
135 |
+
"id": "7342b178-3472-4734-83e3-3de431abe15e",
|
136 |
+
"metadata": {},
|
137 |
+
"outputs": [],
|
138 |
+
"source": [
|
139 |
+
"rxn_string = \"C00222 + C00010 + C00006 <=> C00024 + C00011 + C00005\""
|
140 |
+
]
|
141 |
+
},
|
142 |
+
{
|
143 |
+
"cell_type": "code",
|
144 |
+
"execution_count": 6,
|
145 |
+
"id": "7b4dfe4f-48a8-4011-b201-7fb3a3268cef",
|
146 |
+
"metadata": {},
|
147 |
+
"outputs": [],
|
148 |
+
"source": [
|
149 |
+
"rxn_dic = parse_formula(rxn_string)"
|
150 |
+
]
|
151 |
+
},
|
152 |
+
{
|
153 |
+
"cell_type": "code",
|
154 |
+
"execution_count": 7,
|
155 |
+
"id": "1f523aa2-b9dc-4153-8c1c-dec58e1ab987",
|
156 |
+
"metadata": {},
|
157 |
+
"outputs": [],
|
158 |
+
"source": [
|
159 |
+
"def get_ddG0(rxn_dict,pH,I,novel_mets):\n",
|
160 |
+
" ccache = CompoundCacher()\n",
|
161 |
+
" # ddG0 = get_transform_ddG0(rxn_dict, ccache, pH, I, T)\n",
|
162 |
+
" T = 298.15\n",
|
163 |
+
" ddG0_forward = 0\n",
|
164 |
+
" for compound_id, coeff in rxn_dict.items():\n",
|
165 |
+
" if novel_mets != None and compound_id in novel_mets:\n",
|
166 |
+
" comp = novel_mets[compound_id]\n",
|
167 |
+
" else:\n",
|
168 |
+
" comp = ccache.get_compound(compound_id)\n",
|
169 |
+
" ddG0_forward += coeff * comp.transform_pH7(pH, I, T)\n",
|
170 |
+
"\n",
|
171 |
+
" return ddG0_forward"
|
172 |
+
]
|
173 |
+
},
|
174 |
+
{
|
175 |
+
"cell_type": "code",
|
176 |
+
"execution_count": 8,
|
177 |
+
"id": "33cf30ff-8b2c-4da9-9134-75a60a5c5d66",
|
178 |
+
"metadata": {},
|
179 |
+
"outputs": [
|
180 |
+
{
|
181 |
+
"data": {
|
182 |
+
"text/plain": [
|
183 |
+
"-3.6254822995515497"
|
184 |
+
]
|
185 |
+
},
|
186 |
+
"execution_count": 8,
|
187 |
+
"metadata": {},
|
188 |
+
"output_type": "execute_result"
|
189 |
+
}
|
190 |
+
],
|
191 |
+
"source": [
|
192 |
+
"get_ddG0(rxn_dic, 7.0, 0.1, {})"
|
193 |
+
]
|
194 |
+
},
|
195 |
+
{
|
196 |
+
"cell_type": "code",
|
197 |
+
"execution_count": 9,
|
198 |
+
"id": "9e39855d-eb9e-4ea9-aeb9-8b770cc24c8e",
|
199 |
+
"metadata": {},
|
200 |
+
"outputs": [],
|
201 |
+
"source": [
|
202 |
+
"def get_rule(rxn_dict, molsig1, molsig2, novel_decomposed1, novel_decomposed2):\n",
|
203 |
+
" if novel_decomposed1 != None:\n",
|
204 |
+
" for cid in novel_decomposed1:\n",
|
205 |
+
" molsig1[cid] = novel_decomposed1[cid]\n",
|
206 |
+
" if novel_decomposed2 != None:\n",
|
207 |
+
" for cid in novel_decomposed2:\n",
|
208 |
+
" molsig2[cid] = novel_decomposed2[cid]\n",
|
209 |
+
"\n",
|
210 |
+
" molsigna_df1 = pd.DataFrame.from_dict(molsig1).fillna(0)\n",
|
211 |
+
" all_mets1 = molsigna_df1.columns.tolist()\n",
|
212 |
+
" all_mets1.append(\"C00080\")\n",
|
213 |
+
" all_mets1.append(\"C00282\")\n",
|
214 |
+
"\n",
|
215 |
+
" molsigna_df2 = pd.DataFrame.from_dict(molsig2).fillna(0)\n",
|
216 |
+
" all_mets2 = molsigna_df2.columns.tolist()\n",
|
217 |
+
" all_mets2.append(\"C00080\")\n",
|
218 |
+
" all_mets2.append(\"C00282\")\n",
|
219 |
+
"\n",
|
220 |
+
" moieties_r1 = open('./data/group_names_r1.txt')\n",
|
221 |
+
" moieties_r2 = open('./data/group_names_r2_py3_modified_manual.txt')\n",
|
222 |
+
" moie_r1 = moieties_r1.read().splitlines()\n",
|
223 |
+
" moie_r2 = moieties_r2.read().splitlines()\n",
|
224 |
+
"\n",
|
225 |
+
" molsigna_df1 = molsigna_df1.reindex(moie_r1)\n",
|
226 |
+
" molsigna_df2 = molsigna_df2.reindex(moie_r2)\n",
|
227 |
+
"\n",
|
228 |
+
" rule_df1 = pd.DataFrame(index=molsigna_df1.index)\n",
|
229 |
+
" rule_df2 = pd.DataFrame(index=molsigna_df2.index)\n",
|
230 |
+
" # for rid, value in reaction_dict.items():\n",
|
231 |
+
" # # skip the reactions with missing metabolites\n",
|
232 |
+
" # mets = value.keys()\n",
|
233 |
+
" # flag = False\n",
|
234 |
+
" # for met in mets:\n",
|
235 |
+
" # if met not in all_mets:\n",
|
236 |
+
" # flag = True\n",
|
237 |
+
" # break\n",
|
238 |
+
" # if flag: continue\n",
|
239 |
+
"\n",
|
240 |
+
" rule_df1['change'] = 0\n",
|
241 |
+
" for met, stoic in rxn_dict.items():\n",
|
242 |
+
" if met == \"C00080\" or met == \"C00282\":\n",
|
243 |
+
" continue # hydogen is zero\n",
|
244 |
+
" rule_df1['change'] += molsigna_df1[met] * stoic\n",
|
245 |
+
"\n",
|
246 |
+
" rule_df2['change'] = 0\n",
|
247 |
+
" for met, stoic in rxn_dict.items():\n",
|
248 |
+
" if met == \"C00080\" or met == \"C00282\":\n",
|
249 |
+
" continue # hydogen is zero\n",
|
250 |
+
" rule_df2['change'] += molsigna_df2[met] * stoic\n",
|
251 |
+
"\n",
|
252 |
+
" rule_vec1 = rule_df1.to_numpy().T\n",
|
253 |
+
" rule_vec2 = rule_df2.to_numpy().T\n",
|
254 |
+
"\n",
|
255 |
+
" m1, n1 = rule_vec1.shape\n",
|
256 |
+
" m2, n2 = rule_vec2.shape\n",
|
257 |
+
"\n",
|
258 |
+
" zeros1 = np.zeros((m1, 44))\n",
|
259 |
+
" zeros2 = np.zeros((m2, 44))\n",
|
260 |
+
" X1 = np.concatenate((rule_vec1, zeros1), 1)\n",
|
261 |
+
" X2 = np.concatenate((rule_vec2, zeros2), 1)\n",
|
262 |
+
"\n",
|
263 |
+
" rule_comb = np.concatenate((X1, X2), 1)\n",
|
264 |
+
"\n",
|
265 |
+
" # rule_df_final = {}\n",
|
266 |
+
" # rule_df_final['rad1'] = rule_df1\n",
|
267 |
+
" # rule_df_final['rad2'] = rule_df2\n",
|
268 |
+
" return rule_comb, rule_df1, rule_df2\n"
|
269 |
+
]
|
270 |
+
},
|
271 |
+
{
|
272 |
+
"cell_type": "code",
|
273 |
+
"execution_count": 14,
|
274 |
+
"id": "a93ea75e-9851-45fd-aa58-d7f325b4b5a6",
|
275 |
+
"metadata": {},
|
276 |
+
"outputs": [
|
277 |
+
{
|
278 |
+
"data": {
|
279 |
+
"text/plain": [
|
280 |
+
"{'C00222': -1,\n",
|
281 |
+
" 'C00010': -1,\n",
|
282 |
+
" 'C00006': -1,\n",
|
283 |
+
" 'C00024': 1,\n",
|
284 |
+
" 'C00011': 1,\n",
|
285 |
+
" 'C00005': 1}"
|
286 |
+
]
|
287 |
+
},
|
288 |
+
"execution_count": 14,
|
289 |
+
"metadata": {},
|
290 |
+
"output_type": "execute_result"
|
291 |
+
}
|
292 |
+
],
|
293 |
+
"source": [
|
294 |
+
"rxn_dic"
|
295 |
+
]
|
296 |
+
},
|
297 |
+
{
|
298 |
+
"cell_type": "code",
|
299 |
+
"execution_count": null,
|
300 |
+
"id": "981948dd-db2c-4463-b983-1220353d963e",
|
301 |
+
"metadata": {},
|
302 |
+
"outputs": [],
|
303 |
+
"source": []
|
304 |
+
},
|
305 |
+
{
|
306 |
+
"cell_type": "code",
|
307 |
+
"execution_count": 23,
|
308 |
+
"id": "96eb1c38-2ca7-4e38-bcc4-ade1cef73852",
|
309 |
+
"metadata": {},
|
310 |
+
"outputs": [
|
311 |
+
{
|
312 |
+
"data": {
|
313 |
+
"text/plain": [
|
314 |
+
"(array([-19.96775194]), array([6.66052556]))"
|
315 |
+
]
|
316 |
+
},
|
317 |
+
"execution_count": 23,
|
318 |
+
"metadata": {},
|
319 |
+
"output_type": "execute_result"
|
320 |
+
}
|
321 |
+
],
|
322 |
+
"source": [
|
323 |
+
"loaded_model.predict(X, return_std= True)"
|
324 |
+
]
|
325 |
+
},
|
326 |
+
{
|
327 |
+
"cell_type": "code",
|
328 |
+
"execution_count": null,
|
329 |
+
"id": "81128dd3-5005-40a6-b5fe-8ecacef824bc",
|
330 |
+
"metadata": {},
|
331 |
+
"outputs": [],
|
332 |
+
"source": [
|
333 |
+
"def get_ddG0(rxn_dict,pH,I,novel_mets):\n",
|
334 |
+
" ccache = CompoundCacher()\n",
|
335 |
+
" # ddG0 = get_transform_ddG0(rxn_dict, ccache, pH, I, T)\n",
|
336 |
+
" T = 298.15\n",
|
337 |
+
" ddG0_forward = 0\n",
|
338 |
+
" for compound_id, coeff in rxn_dict.items():\n",
|
339 |
+
" if novel_mets != None and compound_id in novel_mets:\n",
|
340 |
+
" comp = novel_mets[compound_id]\n",
|
341 |
+
" else:\n",
|
342 |
+
" comp = ccache.get_compound(compound_id)\n",
|
343 |
+
" ddG0_forward += coeff * comp.transform_pH7(pH, I, T)\n",
|
344 |
+
"\n",
|
345 |
+
" return ddG0_forward\n",
|
346 |
+
"\n",
|
347 |
+
"\n",
|
348 |
+
"def get_dG0(rxn_dict,rid,pH,I,loaded_model,molsig_r1, molsig_r2, novel_decomposed_r1, novel_decomposed_r2,novel_mets):\n",
|
349 |
+
" rule_comb, rule_df1, rule_df2 = get_rule(rxn_dict,molsig_r1,molsig_r2, novel_decomposed_r1, novel_decomposed_r2)\n",
|
350 |
+
" X = rule_comb\n",
|
351 |
+
" ymean, ystd = loaded_model.predict(X, return_std=True)\n",
|
352 |
+
" result = {}\n",
|
353 |
+
" return ymean[0] + get_ddG0(rxn_dict, pH, I, novel_mets),ystd[0], rule_df1, rule_df2"
|
354 |
+
]
|
355 |
+
},
|
356 |
+
{
|
357 |
+
"cell_type": "code",
|
358 |
+
"execution_count": null,
|
359 |
+
"id": "751ec201-f062-4ac0-8d24-fe959636cbdc",
|
360 |
+
"metadata": {},
|
361 |
+
"outputs": [],
|
362 |
+
"source": []
|
363 |
+
},
|
364 |
+
{
|
365 |
+
"cell_type": "code",
|
366 |
+
"execution_count": null,
|
367 |
+
"id": "c6cb1e4d-24be-42a1-b88b-793a62597c92",
|
368 |
+
"metadata": {},
|
369 |
+
"outputs": [],
|
370 |
+
"source": []
|
371 |
+
},
|
372 |
+
{
|
373 |
+
"cell_type": "code",
|
374 |
+
"execution_count": null,
|
375 |
+
"id": "7abe24be-1653-455b-9931-9446480d39bb",
|
376 |
+
"metadata": {},
|
377 |
+
"outputs": [],
|
378 |
+
"source": []
|
379 |
+
},
|
380 |
+
{
|
381 |
+
"cell_type": "code",
|
382 |
+
"execution_count": null,
|
383 |
+
"id": "f13433dc-51a3-41e5-8a0b-b0f21724ef98",
|
384 |
+
"metadata": {},
|
385 |
+
"outputs": [],
|
386 |
+
"source": []
|
387 |
+
},
|
388 |
+
{
|
389 |
+
"cell_type": "code",
|
390 |
+
"execution_count": 2,
|
391 |
+
"id": "db7c764f-d216-44a9-8f88-0e3a7c51377a",
|
392 |
+
"metadata": {},
|
393 |
+
"outputs": [],
|
394 |
+
"source": [
|
395 |
+
"ccc= CompoundCacher()"
|
396 |
+
]
|
397 |
+
},
|
398 |
+
{
|
399 |
+
"cell_type": "code",
|
400 |
+
"execution_count": 3,
|
401 |
+
"id": "09e6f7f2-5be7-4db3-b55d-756ecb711095",
|
402 |
+
"metadata": {},
|
403 |
+
"outputs": [],
|
404 |
+
"source": [
|
405 |
+
"a = ccc.get_compound('C00001')"
|
406 |
+
]
|
407 |
+
},
|
408 |
+
{
|
409 |
+
"cell_type": "code",
|
410 |
+
"execution_count": 4,
|
411 |
+
"id": "d28e44b7-d942-4739-9d7d-2f4e082ac1b9",
|
412 |
+
"metadata": {},
|
413 |
+
"outputs": [
|
414 |
+
{
|
415 |
+
"data": {
|
416 |
+
"text/plain": [
|
417 |
+
"81.4472134155519"
|
418 |
+
]
|
419 |
+
},
|
420 |
+
"execution_count": 4,
|
421 |
+
"metadata": {},
|
422 |
+
"output_type": "execute_result"
|
423 |
+
}
|
424 |
+
],
|
425 |
+
"source": [
|
426 |
+
"a.transform_pH7(7, 0.25 , 298)"
|
427 |
+
]
|
428 |
+
},
|
429 |
+
{
|
430 |
+
"cell_type": "code",
|
431 |
+
"execution_count": 5,
|
432 |
+
"id": "1ef3fc0d-7d63-42ea-8743-522fe010a95d",
|
433 |
+
"metadata": {},
|
434 |
+
"outputs": [],
|
435 |
+
"source": [
|
436 |
+
"inchi_k = \"InChI=1S/C14H14O/c15-14-8-4-7-13(11-14)10-9-12-5-2-1-3-6-12/h1-8,11,15H,9-10H2\" ;"
|
437 |
+
]
|
438 |
+
},
|
439 |
+
{
|
440 |
+
"cell_type": "code",
|
441 |
+
"execution_count": 6,
|
442 |
+
"id": "4e651d1c-2c96-42d1-adab-466dc7518146",
|
443 |
+
"metadata": {},
|
444 |
+
"outputs": [
|
445 |
+
{
|
446 |
+
"name": "stderr",
|
447 |
+
"output_type": "stream",
|
448 |
+
"text": [
|
449 |
+
"C:\\Users\\vuu10\\AppData\\Local\\Continuum\\anaconda3\\envs\\dGPredictor_py3\\lib\\openbabel\\__init__.py:14: UserWarning: \"import openbabel\" is deprecated, instead use \"from openbabel import openbabel\"\n",
|
450 |
+
" warnings.warn('\"import openbabel\" is deprecated, instead use \"from openbabel import openbabel\"')\n"
|
451 |
+
]
|
452 |
+
}
|
453 |
+
],
|
454 |
+
"source": [
|
455 |
+
"c = Compound.from_inchi('Test', 'sajdf', inchi_k )"
|
456 |
+
]
|
457 |
+
},
|
458 |
+
{
|
459 |
+
"cell_type": "code",
|
460 |
+
"execution_count": 18,
|
461 |
+
"id": "6eb5c2dc-f14c-46de-889b-0e9b7faa9f79",
|
462 |
+
"metadata": {},
|
463 |
+
"outputs": [
|
464 |
+
{
|
465 |
+
"ename": "AttributeError",
|
466 |
+
"evalue": "'Compound' object has no attribute 'smiles_ph7'",
|
467 |
+
"output_type": "error",
|
468 |
+
"traceback": [
|
469 |
+
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
470 |
+
"\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)",
|
471 |
+
"\u001b[1;32m<ipython-input-18-7a0d06664090>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mc\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msmiles_ph7\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
|
472 |
+
"\u001b[1;31mAttributeError\u001b[0m: 'Compound' object has no attribute 'smiles_ph7'"
|
473 |
+
]
|
474 |
+
}
|
475 |
+
],
|
476 |
+
"source": [
|
477 |
+
"c.smiles_ph7()"
|
478 |
+
]
|
479 |
+
},
|
480 |
+
{
|
481 |
+
"cell_type": "code",
|
482 |
+
"execution_count": 7,
|
483 |
+
"id": "edd156dc-4355-4c2c-ba4e-6d98e776a96a",
|
484 |
+
"metadata": {},
|
485 |
+
"outputs": [],
|
486 |
+
"source": [
|
487 |
+
"from chemaxon import *\n",
|
488 |
+
"import chemaxon"
|
489 |
+
]
|
490 |
+
},
|
491 |
+
{
|
492 |
+
"cell_type": "code",
|
493 |
+
"execution_count": 8,
|
494 |
+
"id": "880d2ef6-6b03-49d3-8f60-66769c22a84d",
|
495 |
+
"metadata": {},
|
496 |
+
"outputs": [],
|
497 |
+
"source": [
|
498 |
+
"pKas, major_ms_smiles = chemaxon.GetDissociationConstants(inchi_k)"
|
499 |
+
]
|
500 |
+
},
|
501 |
+
{
|
502 |
+
"cell_type": "code",
|
503 |
+
"execution_count": 9,
|
504 |
+
"id": "7a2391dc-313c-47f2-9f54-823bfdb95fcd",
|
505 |
+
"metadata": {},
|
506 |
+
"outputs": [
|
507 |
+
{
|
508 |
+
"data": {
|
509 |
+
"text/plain": [
|
510 |
+
"'OC1=CC=CC(CCC2=CC=CC=C2)=C1\\r'"
|
511 |
+
]
|
512 |
+
},
|
513 |
+
"execution_count": 9,
|
514 |
+
"metadata": {},
|
515 |
+
"output_type": "execute_result"
|
516 |
+
}
|
517 |
+
],
|
518 |
+
"source": [
|
519 |
+
"major_ms_smiles"
|
520 |
+
]
|
521 |
+
},
|
522 |
+
{
|
523 |
+
"cell_type": "code",
|
524 |
+
"execution_count": 10,
|
525 |
+
"id": "96d90c4a-14a2-45fb-8573-97db84de2dff",
|
526 |
+
"metadata": {},
|
527 |
+
"outputs": [],
|
528 |
+
"source": [
|
529 |
+
"major_ms_smiles = Compound.smiles2smiles(major_ms_smiles)"
|
530 |
+
]
|
531 |
+
},
|
532 |
+
{
|
533 |
+
"cell_type": "code",
|
534 |
+
"execution_count": 11,
|
535 |
+
"id": "36d46620-b895-4ec8-85d0-7499759812c6",
|
536 |
+
"metadata": {},
|
537 |
+
"outputs": [],
|
538 |
+
"source": [
|
539 |
+
"MIN_PH = 0.0\n",
|
540 |
+
"MAX_PH = 14.0\n",
|
541 |
+
"pKas = sorted([pka for pka in pKas if pka > MIN_PH and pka < MAX_PH], reverse=True)"
|
542 |
+
]
|
543 |
+
},
|
544 |
+
{
|
545 |
+
"cell_type": "code",
|
546 |
+
"execution_count": 12,
|
547 |
+
"id": "ffccf9d9-5a52-4be6-af4c-f39b3db2a27c",
|
548 |
+
"metadata": {},
|
549 |
+
"outputs": [
|
550 |
+
{
|
551 |
+
"data": {
|
552 |
+
"text/plain": [
|
553 |
+
"[10.1]"
|
554 |
+
]
|
555 |
+
},
|
556 |
+
"execution_count": 12,
|
557 |
+
"metadata": {},
|
558 |
+
"output_type": "execute_result"
|
559 |
+
}
|
560 |
+
],
|
561 |
+
"source": [
|
562 |
+
"pKas"
|
563 |
+
]
|
564 |
+
},
|
565 |
+
{
|
566 |
+
"cell_type": "code",
|
567 |
+
"execution_count": 13,
|
568 |
+
"id": "e83721fa-9a42-42ef-9a03-59fc2689c73b",
|
569 |
+
"metadata": {},
|
570 |
+
"outputs": [],
|
571 |
+
"source": [
|
572 |
+
"atom_bag, major_ms_charge = chemaxon.GetAtomBagAndCharge(major_ms_smiles)"
|
573 |
+
]
|
574 |
+
},
|
575 |
+
{
|
576 |
+
"cell_type": "code",
|
577 |
+
"execution_count": null,
|
578 |
+
"id": "47a87ed7-968d-44b6-a237-a8469ba3fe3b",
|
579 |
+
"metadata": {},
|
580 |
+
"outputs": [],
|
581 |
+
"source": []
|
582 |
+
},
|
583 |
+
{
|
584 |
+
"cell_type": "code",
|
585 |
+
"execution_count": null,
|
586 |
+
"id": "49cfefde-ee96-4ca8-89af-c50f2f2ca70b",
|
587 |
+
"metadata": {},
|
588 |
+
"outputs": [],
|
589 |
+
"source": []
|
590 |
+
},
|
591 |
+
{
|
592 |
+
"cell_type": "code",
|
593 |
+
"execution_count": null,
|
594 |
+
"id": "9b881c7b-a14a-4561-9c3c-157116efdfd0",
|
595 |
+
"metadata": {},
|
596 |
+
"outputs": [],
|
597 |
+
"source": []
|
598 |
+
},
|
599 |
+
{
|
600 |
+
"cell_type": "code",
|
601 |
+
"execution_count": null,
|
602 |
+
"id": "10c8f915-e61a-4560-b546-fe6ea8bfdde3",
|
603 |
+
"metadata": {},
|
604 |
+
"outputs": [],
|
605 |
+
"source": []
|
606 |
+
},
|
607 |
+
{
|
608 |
+
"cell_type": "code",
|
609 |
+
"execution_count": null,
|
610 |
+
"id": "936fafa5-1bf6-495c-be79-d4cc620f4861",
|
611 |
+
"metadata": {},
|
612 |
+
"outputs": [],
|
613 |
+
"source": []
|
614 |
+
},
|
615 |
+
{
|
616 |
+
"cell_type": "code",
|
617 |
+
"execution_count": null,
|
618 |
+
"id": "285f9370-2fba-44c4-a36b-66c95f9f2eed",
|
619 |
+
"metadata": {},
|
620 |
+
"outputs": [],
|
621 |
+
"source": []
|
622 |
+
},
|
623 |
+
{
|
624 |
+
"cell_type": "code",
|
625 |
+
"execution_count": null,
|
626 |
+
"id": "adbcd78f-869a-4cc9-b727-03c80df31edd",
|
627 |
+
"metadata": {},
|
628 |
+
"outputs": [],
|
629 |
+
"source": []
|
630 |
+
},
|
631 |
+
{
|
632 |
+
"cell_type": "code",
|
633 |
+
"execution_count": null,
|
634 |
+
"id": "17fbfee9-c8b7-4644-814f-0e8aa0ad5ee9",
|
635 |
+
"metadata": {},
|
636 |
+
"outputs": [],
|
637 |
+
"source": []
|
638 |
+
},
|
639 |
+
{
|
640 |
+
"cell_type": "code",
|
641 |
+
"execution_count": 21,
|
642 |
+
"id": "70f90669-ff90-4bc4-955c-63672e42bb3c",
|
643 |
+
"metadata": {},
|
644 |
+
"outputs": [],
|
645 |
+
"source": [
|
646 |
+
"formula, formal_charge = GetFormulaAndCharge(molstring)\n",
|
647 |
+
"\n",
|
648 |
+
"atom_bag = {}"
|
649 |
+
]
|
650 |
+
},
|
651 |
+
{
|
652 |
+
"cell_type": "code",
|
653 |
+
"execution_count": 25,
|
654 |
+
"id": "e40e4088-c246-4afb-98ae-f92cb738e988",
|
655 |
+
"metadata": {},
|
656 |
+
"outputs": [],
|
657 |
+
"source": [
|
658 |
+
"for mol_formula_times in formula.split('.'):\n",
|
659 |
+
" for times, mol_formula in re.findall('^(\\d+)?(\\w+)', mol_formula_times):\n",
|
660 |
+
" if not times:\n",
|
661 |
+
" times = 1\n",
|
662 |
+
" else:\n",
|
663 |
+
" times = int(times)\n",
|
664 |
+
" for atom, count in re.findall(\"([A-Z][a-z]*)([0-9]*)\", mol_formula):\n",
|
665 |
+
" if count == '':\n",
|
666 |
+
" count = 1\n",
|
667 |
+
" else:\n",
|
668 |
+
" count = int(count)\n",
|
669 |
+
" atom_bag[atom] = atom_bag.get(atom, 0) + count * times"
|
670 |
+
]
|
671 |
+
},
|
672 |
+
{
|
673 |
+
"cell_type": "code",
|
674 |
+
"execution_count": 26,
|
675 |
+
"id": "391cfbba-2da5-4b60-ba32-217754913b35",
|
676 |
+
"metadata": {},
|
677 |
+
"outputs": [
|
678 |
+
{
|
679 |
+
"data": {
|
680 |
+
"text/plain": [
|
681 |
+
"{'C': 14, 'H': 14, 'O': 1}"
|
682 |
+
]
|
683 |
+
},
|
684 |
+
"execution_count": 26,
|
685 |
+
"metadata": {},
|
686 |
+
"output_type": "execute_result"
|
687 |
+
}
|
688 |
+
],
|
689 |
+
"source": [
|
690 |
+
"atom_bag"
|
691 |
+
]
|
692 |
+
},
|
693 |
+
{
|
694 |
+
"cell_type": "code",
|
695 |
+
"execution_count": 52,
|
696 |
+
"id": "812f8297-a5cc-4d63-b132-243c278c6b76",
|
697 |
+
"metadata": {},
|
698 |
+
"outputs": [
|
699 |
+
{
|
700 |
+
"name": "stdout",
|
701 |
+
"output_type": "stream",
|
702 |
+
"text": [
|
703 |
+
"6\n",
|
704 |
+
"1\n",
|
705 |
+
"8\n"
|
706 |
+
]
|
707 |
+
}
|
708 |
+
],
|
709 |
+
"source": [
|
710 |
+
"from rdkit.Chem import rdchem\n",
|
711 |
+
"for (elem, c) in atom_bag.items():\n",
|
712 |
+
" ll = rdchem.GetPeriodicTable()\n",
|
713 |
+
" atomic_num = ll.GetAtomicNumber(elem)\n",
|
714 |
+
" print(atomic_num)"
|
715 |
+
]
|
716 |
+
},
|
717 |
+
{
|
718 |
+
"cell_type": "code",
|
719 |
+
"execution_count": 55,
|
720 |
+
"id": "463fcb01-2cd0-4aee-990c-946c534dc766",
|
721 |
+
"metadata": {},
|
722 |
+
"outputs": [],
|
723 |
+
"source": [
|
724 |
+
"\n",
|
725 |
+
"n_protons = sum([c * ll.GetAtomicNumber(str(elem))\n",
|
726 |
+
" for (elem, c) in atom_bag.items()])"
|
727 |
+
]
|
728 |
+
},
|
729 |
+
{
|
730 |
+
"cell_type": "code",
|
731 |
+
"execution_count": 57,
|
732 |
+
"id": "ac1c69f6-54db-41ba-9fdf-e7ab6a2dfcbc",
|
733 |
+
"metadata": {},
|
734 |
+
"outputs": [],
|
735 |
+
"source": [
|
736 |
+
"atom_bag['e-'] = n_protons - formal_charge"
|
737 |
+
]
|
738 |
+
},
|
739 |
+
{
|
740 |
+
"cell_type": "code",
|
741 |
+
"execution_count": 58,
|
742 |
+
"id": "61b1931e-dbaf-4e0f-afb2-6595f64d70d6",
|
743 |
+
"metadata": {},
|
744 |
+
"outputs": [
|
745 |
+
{
|
746 |
+
"data": {
|
747 |
+
"text/plain": [
|
748 |
+
"{'C': 14, 'H': 14, 'O': 1, 'e-': 106}"
|
749 |
+
]
|
750 |
+
},
|
751 |
+
"execution_count": 58,
|
752 |
+
"metadata": {},
|
753 |
+
"output_type": "execute_result"
|
754 |
+
}
|
755 |
+
],
|
756 |
+
"source": [
|
757 |
+
"atom_bag"
|
758 |
+
]
|
759 |
+
},
|
760 |
+
{
|
761 |
+
"cell_type": "code",
|
762 |
+
"execution_count": 60,
|
763 |
+
"id": "12bdbf80-7dc5-4d47-a479-703ad5a6aa06",
|
764 |
+
"metadata": {},
|
765 |
+
"outputs": [
|
766 |
+
{
|
767 |
+
"data": {
|
768 |
+
"text/plain": [
|
769 |
+
"0"
|
770 |
+
]
|
771 |
+
},
|
772 |
+
"execution_count": 60,
|
773 |
+
"metadata": {},
|
774 |
+
"output_type": "execute_result"
|
775 |
+
}
|
776 |
+
],
|
777 |
+
"source": [
|
778 |
+
"\n",
|
779 |
+
"formal_charge\n",
|
780 |
+
"\n"
|
781 |
+
]
|
782 |
+
},
|
783 |
+
{
|
784 |
+
"cell_type": "code",
|
785 |
+
"execution_count": null,
|
786 |
+
"id": "b51f36c0-707a-4856-8c23-9081e2ea2cf7",
|
787 |
+
"metadata": {},
|
788 |
+
"outputs": [],
|
789 |
+
"source": [
|
790 |
+
"all_pKas, smiles_list = GetDissociationConstants_val(inchi_k)"
|
791 |
+
]
|
792 |
+
},
|
793 |
+
{
|
794 |
+
"cell_type": "code",
|
795 |
+
"execution_count": 13,
|
796 |
+
"id": "6dd79761-760d-4233-b113-a34e6322a0e5",
|
797 |
+
"metadata": {},
|
798 |
+
"outputs": [],
|
799 |
+
"source": [
|
800 |
+
"MID_PH = 7.0\n",
|
801 |
+
"N_PKAS = 20\n",
|
802 |
+
"\n",
|
803 |
+
"n_acidic = N_PKAS\n",
|
804 |
+
"n_basic = N_PKAS\n",
|
805 |
+
"pH = MID_PH"
|
806 |
+
]
|
807 |
+
},
|
808 |
+
{
|
809 |
+
"cell_type": "code",
|
810 |
+
"execution_count": 14,
|
811 |
+
"id": "6167191a-b361-4ae0-a78a-927490c72f87",
|
812 |
+
"metadata": {},
|
813 |
+
"outputs": [],
|
814 |
+
"source": [
|
815 |
+
"args = []\n",
|
816 |
+
"if n_acidic + n_basic > 0:\n",
|
817 |
+
" args += ['pka', '-a', str(n_acidic), '-b', str(n_basic),\n",
|
818 |
+
" 'majorms', '-M', 'true', '--pH', str(pH)]\n"
|
819 |
+
]
|
820 |
+
},
|
821 |
+
{
|
822 |
+
"cell_type": "code",
|
823 |
+
"execution_count": 15,
|
824 |
+
"id": "dd4275ec-c71e-4b5b-bb35-de8b3c7c4883",
|
825 |
+
"metadata": {},
|
826 |
+
"outputs": [
|
827 |
+
{
|
828 |
+
"data": {
|
829 |
+
"text/plain": [
|
830 |
+
"['pka', '-a', '20', '-b', '20', 'majorms', '-M', 'true', '--pH', '7.0']"
|
831 |
+
]
|
832 |
+
},
|
833 |
+
"execution_count": 15,
|
834 |
+
"metadata": {},
|
835 |
+
"output_type": "execute_result"
|
836 |
+
}
|
837 |
+
],
|
838 |
+
"source": [
|
839 |
+
"args"
|
840 |
+
]
|
841 |
+
},
|
842 |
+
{
|
843 |
+
"cell_type": "code",
|
844 |
+
"execution_count": null,
|
845 |
+
"id": "79d07dc5-963a-4373-9d72-1eb6de48ede9",
|
846 |
+
"metadata": {},
|
847 |
+
"outputs": [],
|
848 |
+
"source": []
|
849 |
+
},
|
850 |
+
{
|
851 |
+
"cell_type": "code",
|
852 |
+
"execution_count": 16,
|
853 |
+
"id": "712a71fb-e3e3-4b01-828d-5a3862aa1b30",
|
854 |
+
"metadata": {},
|
855 |
+
"outputs": [],
|
856 |
+
"source": [
|
857 |
+
"logging.debug(\"INPUT: echo %s | %s\" % (inchi_k, ' '.join([CXCALC_BIN] + args)))"
|
858 |
+
]
|
859 |
+
},
|
860 |
+
{
|
861 |
+
"cell_type": "code",
|
862 |
+
"execution_count": 17,
|
863 |
+
"id": "287bf822-23b8-42de-85ca-e52678875cfa",
|
864 |
+
"metadata": {},
|
865 |
+
"outputs": [],
|
866 |
+
"source": [
|
867 |
+
"molstring= inchi_k"
|
868 |
+
]
|
869 |
+
},
|
870 |
+
{
|
871 |
+
"cell_type": "code",
|
872 |
+
"execution_count": 18,
|
873 |
+
"id": "4d2ff427-237c-4d63-a718-f29f12884d96",
|
874 |
+
"metadata": {},
|
875 |
+
"outputs": [],
|
876 |
+
"source": [
|
877 |
+
"p1 = Popen([\"echo\", molstring], stdout=PIPE, shell=use_shell_for_echo)"
|
878 |
+
]
|
879 |
+
},
|
880 |
+
{
|
881 |
+
"cell_type": "code",
|
882 |
+
"execution_count": 19,
|
883 |
+
"id": "923a09f2-b959-4837-ab1a-a858d91de0b4",
|
884 |
+
"metadata": {},
|
885 |
+
"outputs": [],
|
886 |
+
"source": [
|
887 |
+
"p2 = Popen([CXCALC_BIN] + args, stdin=p1.stdout,\n",
|
888 |
+
" executable=CXCALC_BIN, stdout=PIPE, shell=False)"
|
889 |
+
]
|
890 |
+
},
|
891 |
+
{
|
892 |
+
"cell_type": "code",
|
893 |
+
"execution_count": 20,
|
894 |
+
"id": "a6b30545-c65a-4c56-9985-71a103b9da00",
|
895 |
+
"metadata": {},
|
896 |
+
"outputs": [],
|
897 |
+
"source": [
|
898 |
+
"res = p2.communicate()[0]"
|
899 |
+
]
|
900 |
+
},
|
901 |
+
{
|
902 |
+
"cell_type": "code",
|
903 |
+
"execution_count": 21,
|
904 |
+
"id": "ac059602-027f-4a1a-932f-c1339c38c7d7",
|
905 |
+
"metadata": {},
|
906 |
+
"outputs": [],
|
907 |
+
"source": [
|
908 |
+
"if p2.returncode != 0:\n",
|
909 |
+
" raise ChemAxonError(str(args))\n",
|
910 |
+
"logging.debug(\"OUTPUT: %s\" % res)"
|
911 |
+
]
|
912 |
+
},
|
913 |
+
{
|
914 |
+
"cell_type": "code",
|
915 |
+
"execution_count": 22,
|
916 |
+
"id": "671642a5-3877-44e3-b935-f987fd601444",
|
917 |
+
"metadata": {},
|
918 |
+
"outputs": [],
|
919 |
+
"source": [
|
920 |
+
"output = res"
|
921 |
+
]
|
922 |
+
},
|
923 |
+
{
|
924 |
+
"cell_type": "code",
|
925 |
+
"execution_count": 23,
|
926 |
+
"id": "a9f4bb4a-af86-4e97-bf1d-40c58013f90e",
|
927 |
+
"metadata": {},
|
928 |
+
"outputs": [
|
929 |
+
{
|
930 |
+
"data": {
|
931 |
+
"text/plain": [
|
932 |
+
"b'id\\tapKa1\\tapKa2\\tapKa3\\tapKa4\\tapKa5\\tapKa6\\tapKa7\\tapKa8\\tapKa9\\tapKa10\\tapKa11\\tapKa12\\tapKa13\\tapKa14\\tapKa15\\tapKa16\\tapKa17\\tapKa18\\tapKa19\\tapKa20\\tbpKa1\\tbpKa2\\tbpKa3\\tbpKa4\\tbpKa5\\tbpKa6\\tbpKa7\\tbpKa8\\tbpKa9\\tbpKa10\\tbpKa11\\tbpKa12\\tbpKa13\\tbpKa14\\tbpKa15\\tbpKa16\\tbpKa17\\tbpKa18\\tbpKa19\\tbpKa20\\tatoms\\tmajor-ms\\r\\n1\\t10.10\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t-5.48\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t15,15\\tOC1=CC=CC(CCC2=CC=CC=C2)=C1\\r\\n'"
|
933 |
+
]
|
934 |
+
},
|
935 |
+
"execution_count": 23,
|
936 |
+
"metadata": {},
|
937 |
+
"output_type": "execute_result"
|
938 |
+
}
|
939 |
+
],
|
940 |
+
"source": [
|
941 |
+
"output"
|
942 |
+
]
|
943 |
+
},
|
944 |
+
{
|
945 |
+
"cell_type": "code",
|
946 |
+
"execution_count": 24,
|
947 |
+
"id": "215ffc9b-35a8-4f45-8f39-9c99deae6335",
|
948 |
+
"metadata": {},
|
949 |
+
"outputs": [],
|
950 |
+
"source": [
|
951 |
+
"atom2pKa, smiles_list = ParsePkaOutput(output, n_acidic, n_basic)"
|
952 |
+
]
|
953 |
+
},
|
954 |
+
{
|
955 |
+
"cell_type": "code",
|
956 |
+
"execution_count": 26,
|
957 |
+
"id": "21c380d3-5410-4c55-b6d7-cb0588f373ca",
|
958 |
+
"metadata": {},
|
959 |
+
"outputs": [
|
960 |
+
{
|
961 |
+
"data": {
|
962 |
+
"text/plain": [
|
963 |
+
"['OC1=CC=CC(CCC2=CC=CC=C2)=C1\\r']"
|
964 |
+
]
|
965 |
+
},
|
966 |
+
"execution_count": 26,
|
967 |
+
"metadata": {},
|
968 |
+
"output_type": "execute_result"
|
969 |
+
}
|
970 |
+
],
|
971 |
+
"source": [
|
972 |
+
"smiles_list"
|
973 |
+
]
|
974 |
+
},
|
975 |
+
{
|
976 |
+
"cell_type": "code",
|
977 |
+
"execution_count": 27,
|
978 |
+
"id": "1437693a-0923-4df1-837d-acb2b524fcae",
|
979 |
+
"metadata": {},
|
980 |
+
"outputs": [],
|
981 |
+
"source": [
|
982 |
+
"all_pKas = []\n",
|
983 |
+
"for pKa_list in list(atom2pKa.values()):\n",
|
984 |
+
" all_pKas += [pKa for pKa, _ in pKa_list]"
|
985 |
+
]
|
986 |
+
},
|
987 |
+
{
|
988 |
+
"cell_type": "code",
|
989 |
+
"execution_count": 28,
|
990 |
+
"id": "8e77324c-ed61-4615-a7c7-4f5ca781dc90",
|
991 |
+
"metadata": {},
|
992 |
+
"outputs": [
|
993 |
+
{
|
994 |
+
"data": {
|
995 |
+
"text/plain": [
|
996 |
+
"[10.1, -5.48]"
|
997 |
+
]
|
998 |
+
},
|
999 |
+
"execution_count": 28,
|
1000 |
+
"metadata": {},
|
1001 |
+
"output_type": "execute_result"
|
1002 |
+
}
|
1003 |
+
],
|
1004 |
+
"source": [
|
1005 |
+
"all_pKas"
|
1006 |
+
]
|
1007 |
+
},
|
1008 |
+
{
|
1009 |
+
"cell_type": "code",
|
1010 |
+
"execution_count": null,
|
1011 |
+
"id": "8616be46-1814-4755-b919-4b7790569890",
|
1012 |
+
"metadata": {},
|
1013 |
+
"outputs": [],
|
1014 |
+
"source": []
|
1015 |
+
}
|
1016 |
+
],
|
1017 |
+
"metadata": {
|
1018 |
+
"kernelspec": {
|
1019 |
+
"display_name": "Python 3",
|
1020 |
+
"language": "python",
|
1021 |
+
"name": "python3"
|
1022 |
+
},
|
1023 |
+
"language_info": {
|
1024 |
+
"codemirror_mode": {
|
1025 |
+
"name": "ipython",
|
1026 |
+
"version": 3
|
1027 |
+
},
|
1028 |
+
"file_extension": ".py",
|
1029 |
+
"mimetype": "text/x-python",
|
1030 |
+
"name": "python",
|
1031 |
+
"nbconvert_exporter": "python",
|
1032 |
+
"pygments_lexer": "ipython3",
|
1033 |
+
"version": "3.8.10"
|
1034 |
+
}
|
1035 |
+
},
|
1036 |
+
"nbformat": 4,
|
1037 |
+
"nbformat_minor": 5
|
1038 |
+
}
|
CC/chemaxon.py
ADDED
@@ -0,0 +1,204 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
import csv
|
3 |
+
import re
|
4 |
+
import platform
|
5 |
+
import io
|
6 |
+
from subprocess import Popen, PIPE
|
7 |
+
from openbabel import openbabel
|
8 |
+
import pdb
|
9 |
+
from rdkit.Chem import rdchem
|
10 |
+
|
11 |
+
if platform.system() == 'Windows':
|
12 |
+
CXCALC_BIN = 'C:\\Users\\vuu10\\AppData\\Local\\Programs\\ChemAxon\\MarvinSuite\\bin\\cxcalc.exe'
|
13 |
+
#CXCALC_BIN = 'C:\\Program Files (x86)\\ChemAxon\\MarvinBeans\\bin\\cxcalc.bat'
|
14 |
+
use_shell_for_echo = True
|
15 |
+
else:
|
16 |
+
CXCALC_BIN = 'cxcalc'
|
17 |
+
use_shell_for_echo = False
|
18 |
+
|
19 |
+
MID_PH = 7.0
|
20 |
+
N_PKAS = 20
|
21 |
+
|
22 |
+
|
23 |
+
class ChemAxonError(Exception):
    """Raised when a cxcalc call fails or its output cannot be interpreted."""
|
25 |
+
|
26 |
+
|
27 |
+
def RunCxcalc(molstring, args):
    """
    Run ChemAxon's cxcalc on a single molecule and return its text output.

    Arguments:
        molstring - a text description of the molecule (SMILES or InChI)
        args      - a list of command-line arguments for cxcalc

    Returns:
        The standard output of cxcalc, decoded as UTF-8.

    Raises:
        ChemAxonError - if cxcalc exits with a non-zero return code
        Exception     - if the cxcalc executable cannot be started
    """
    command = [CXCALC_BIN] + args
    logging.debug("INPUT: echo %s | %s" % (molstring, ' '.join(command)))
    try:
        # The molecule is written straight to cxcalc's stdin instead of being
        # piped through a separate `echo` process. That avoids running the
        # molecule string through a shell (characters such as '%', '&' or '|'
        # would be interpreted by cmd.exe on Windows) and leaves no helper
        # process or pipe behind that is never waited on / closed.
        process = Popen(command, executable=CXCALC_BIN,
                        stdin=PIPE, stdout=PIPE, shell=False)
        # `echo` terminated the molecule with a newline, so we do the same.
        res = process.communicate((molstring + '\n').encode('utf-8'))[0]
    except OSError as e:
        raise Exception(
            "Marvin (by ChemAxon) must be installed to calculate pKa data.") from e

    if process.returncode != 0:
        raise ChemAxonError(str(args))
    logging.debug("OUTPUT: %s" % res)
    return res.decode('utf-8')
|
50 |
+
|
51 |
+
|
52 |
+
def ParsePkaOutput(s, n_acidic, n_basic):
    """
    Parse the tab-separated table printed by cxcalc for a pKa / major
    microspecies calculation.

    Arguments:
        s        - the cxcalc output: a header line followed by one data line
        n_acidic - the number of acidic pKa columns that were requested
        n_basic  - the number of basic pKa columns that were requested

    Returns:
        A pair (atom2pKa, smiles_list):

        - atom2pKa is a dictionary that maps the (0-based) atom index to a
          list of (pKa, 'acid' or 'base') tuples assigned to that atom
        - smiles_list is the list of fields that follow the atom column
          (the SMILES of the major microspecies)

    Raises:
        ChemAxonError - if there is no data line, or if the data line does
                        not have the expected number of columns
    """
    atom2pKa = {}

    lines = s.split('\n')
    if len(lines) < 2:
        # previously an IndexError; callers already handle ChemAxonError
        raise ChemAxonError('ChemAxon failed to find any pKas')

    # cxcalc may terminate lines with '\r\n' (e.g. on Windows); without
    # stripping the '\r' it ends up glued to the last field (the SMILES).
    fields = lines[1].rstrip('\r').split('\t')
    fields.pop(0)  # drop the 'id' column

    n_pkas = n_acidic + n_basic
    if n_pkas > 0 and len(fields) != n_pkas + 2:
        raise ChemAxonError('ChemAxon failed to find any pKas')

    # Empty columns are unused pKa slots. The first n_acidic columns are the
    # acidic pKas, the remaining ones are basic.
    pKa_list = []
    acid_or_base_list = []
    for i, x in enumerate(fields[:n_pkas]):
        if x == '':
            continue
        pKa_list.append(float(x))
        acid_or_base_list.append('acid' if i < n_acidic else 'base')

    remaining = fields[n_pkas:]
    atom_list = remaining.pop(0)

    if atom_list:  # a comma separated list of the deprotonated atoms
        atom_numbers = [int(y) - 1 for y in atom_list.split(',')]
        for i, j in enumerate(atom_numbers):
            atom2pKa.setdefault(j, []).append(
                (pKa_list[i], acid_or_base_list[i]))

    smiles_list = remaining
    return atom2pKa, smiles_list
|
92 |
+
|
93 |
+
|
94 |
+
def GetDissociationConstants_val(molstring, n_acidic=N_PKAS, n_basic=N_PKAS,
                                 pH=MID_PH):
    """
    Run cxcalc's pKa / major microspecies calculation for a molecule.

    Arguments:
        molstring - a text description of the molecule (SMILES or InChI)
        n_acidic  - the max no. of acidic pKas to calculate
        n_basic   - the max no. of basic pKas to calculate
        pH        - the pH for which the major microspecies is calculated

    Returns:
        A pair of (pKa list, SMILES list)

        - the pKa list is of the pKa values in ascending order.
        - the SMILES list is what ParsePkaOutput returns as its second
          value (the major species at the given pH).
    """
    cxcalc_args = []
    if n_acidic + n_basic > 0:
        cxcalc_args.extend(['pka', '-a', str(n_acidic), '-b', str(n_basic),
                            'majorms', '-M', 'true', '--pH', str(pH)])

    raw_table = RunCxcalc(molstring, cxcalc_args)
    atom2pKa, smiles_list = ParsePkaOutput(raw_table, n_acidic, n_basic)

    # flatten the per-atom (pKa, acid/base) tuples into one sorted pKa list
    pKa_values = sorted(pKa
                        for assigned in atom2pKa.values()
                        for pKa, _ in assigned)
    return pKa_values, smiles_list
|
117 |
+
|
118 |
+
|
119 |
+
def GetDissociationConstants(molstring, n_acidic=N_PKAS, n_basic=N_PKAS,
                             pH=MID_PH):
    """
    Calculate the pKas of a molecule and its major microspecies.

    Arguments:
        molstring - a text description of the molecule (SMILES or InChI)
        n_acidic  - the max no. of acidic pKas to calculate
        n_basic   - the max no. of basic pKas to calculate
        pH        - the pH for which the major pseudoisomer is calculated

    Returns a pair:
        (all_pKas, major_ms)

    - all_pKas is a list of floats (pKa values)
    - major_ms is a SMILES string of the major pseudoisomer at the given pH
      (the first entry of the SMILES list of GetDissociationConstants_val)
    """
    pKa_values, species = GetDissociationConstants_val(molstring, n_acidic,
                                                       n_basic, pH)
    return pKa_values, species[0]
|
138 |
+
|
139 |
+
|
140 |
+
def GetFormulaAndCharge(molstring):
    """
    Ask cxcalc for the chemical formula and the formal charge of a molecule.

    Arguments:
        molstring - a text description of the molecule (SMILES or InChI)

    Returns:
        A pair (formula, formal_charge); the charge is an int and falls
        back to 0 when cxcalc's value cannot be parsed as an integer.

    Raises:
        ChemAxonError - if the header line of the cxcalc output is not the
                        expected 'id', 'Formula', 'Formal charge'
    """
    output = RunCxcalc(molstring, ['formula', 'formalcharge'])

    # the output is a tab separated table whose columns are:
    # id, Formula, Formal charge
    rows = csv.reader(io.StringIO(output), delimiter='\t')
    if next(rows) != ['id', 'Formula', 'Formal charge']:
        raise ChemAxonError(
            'cannot get the formula and charge for: ' + molstring)

    _, formula, charge_text = next(rows)
    try:
        return formula, int(charge_text)
    except ValueError:
        return formula, 0
|
166 |
+
|
167 |
+
|
168 |
+
def GetAtomBagAndCharge(molstring):
    """
    Count the atoms (and electrons) of a molecule.

    Arguments:
        molstring - a text description of the molecule (SMILES or InChI)

    Returns:
        A pair (atom_bag, formal_charge):

        - atom_bag maps each element symbol to its number of atoms, plus
          the key 'e-' holding the number of electrons (total atomic
          number of all atoms minus the formal charge)
        - formal_charge is the formal charge reported by cxcalc
    """
    formula, formal_charge = GetFormulaAndCharge(molstring)
    periodic_table = rdchem.GetPeriodicTable()

    atom_bag = {}
    # The formula may consist of several '.'-separated parts, each with an
    # optional leading multiplier (the first regex group).
    # NOTE: the patterns are raw strings; the previous plain literals relied
    # on the invalid escape sequences '\d' and '\w'.
    for mol_formula_times in formula.split('.'):
        for times, mol_formula in re.findall(r'^(\d+)?(\w+)',
                                             mol_formula_times):
            times = int(times) if times else 1
            for atom, count in re.findall(r'([A-Z][a-z]*)([0-9]*)',
                                          mol_formula):
                count = int(count) if count else 1
                atom_bag[atom] = atom_bag.get(atom, 0) + count * times

    n_protons = sum(c * periodic_table.GetAtomicNumber(str(elem))
                    for elem, c in atom_bag.items())
    atom_bag['e-'] = n_protons - formal_charge

    return atom_bag, formal_charge
|
191 |
+
|
192 |
+
|
193 |
+
if __name__ == "__main__":
    # Manual smoke test: print the formula, charge and pKas of a few
    # example compounds (requires cxcalc and the local `molecule` module).
    logging.getLogger().setLevel(logging.WARNING)
    from molecule import Molecule

    compound_list = [
        ('D-Erythrulose',
         'InChI=1S/C4H8O4/c5-1-3(7)4(8)2-6/h3,5-7H,1-2H2/t3-/m1/s1'),
    ]

    for name, inchi in compound_list:
        formula_and_charge = GetFormulaAndCharge(inchi)
        print("Formula: %s\nCharge: %d" % formula_and_charge)

        diss_table, major_ms = GetDissociationConstants(inchi)
        major_ms_inchi = Molecule.FromSmiles(major_ms).ToInChI()
        print("Name: %s\nInChI: %s\npKas: %s" %
              (name, major_ms_inchi, str(diss_table)))
|
CC/compound.py
ADDED
@@ -0,0 +1,337 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import urllib.request, urllib.parse, urllib.error, logging
|
2 |
+
from openbabel import openbabel
|
3 |
+
import chemaxon
|
4 |
+
import numpy as np
|
5 |
+
from thermodynamic_constants import R, debye_huckel
|
6 |
+
from scipy.special import logsumexp
|
7 |
+
|
8 |
+
MIN_PH = 0.0
|
9 |
+
MAX_PH = 14.0
|
10 |
+
|
11 |
+
class Compound(object):
    """
    A compound together with the protonation data (pKas and microspecies)
    needed to transform its formation energy between pH / ionic strength
    conditions.
    """

    def __init__(self, database, compound_id, inchi,
                 atom_bag, pKas, smiles_pH7, majorMSpH7, nHs, zs):
        self.database = database        # name of the source database
        self.compound_id = compound_id  # ID of the compound in that database
        self.inchi = inchi              # InChI string, or None if unknown
        self.atom_bag = atom_bag        # element symbol (or 'e-') -> count
        self.pKas = pKas                # list of pKa values
        self.smiles_pH7 = smiles_pH7    # SMILES of the major MS at pH 7
        self.majorMSpH7 = majorMSpH7    # index (in nHs/zs) of the major MS
        self.nHs = nHs                  # number of hydrogens of each MS
        self.zs = zs                    # charge of each MS

    @staticmethod
    def from_kegg(compound_id):
        """Create a Compound by downloading its structure from KEGG."""
        return Compound.from_inchi('KEGG', compound_id,
                                   Compound.get_inchi(compound_id))

    @staticmethod
    def _hardcoded_parameters(compound_id):
        """
        Return the manually curated parameters
        (atom_bag, pKas, smiles_pH7, majorMSpH7, nHs, zs) of compounds that
        are not handled through ChemAxon, or None if the compound has no
        such override. The table is rebuilt on every call so that the
        returned lists/dicts are never shared between Compound objects.
        """
        overrides = {
            # We add an exception for H+ (and put nH = 0) in order to
            # eliminate its effect on the Legendre transform
            'C00080': ({'H': 1}, [], None, 0, [0], [0]),
            # ChemAxon gets confused with the structure of sulfur
            # (returns a protonated form, [SH-], at pH 7).
            'C00087': ({'S': 1, 'e-': 16}, [], 'S', 0, [0], [0]),
            # ChemAxon gets confused with the structure of carbon monoxide
            # (returns a protonated form, [CH]#[O+], at pH 7).
            'C00237': ({'C': 1, 'O': 1, 'e-': 14}, [], '[C-]#[O+]',
                       0, [0], [0]),
            # ChemAxon gets confused with the structure of hydrogen
            'C00282': ({'H': 2, 'e-': 2}, [], None, 0, [2], [0]),
            # When given the structure of carbonic acid, ChemAxon returns the
            # pKas for CO2(tot), i.e. it assumes the non-hydrated CO2 species
            # is one of the pseudoisomers, and the lower pKa value is 6.05
            # instead of 3.78. Here, we introduce a new "KEGG" compound that
            # will represent pure bicarbonate (without CO2(sp)) and therefore
            # plug in the pKa values from Alberty's book.
            'C01353': ({'C': 1, 'H': 1, 'O': 3, 'e-': 32}, [10.33, 3.43],
                       'OC(=O)[O-]', 1, [0, 1, 2], [-2, -1, 0]),
            # Metal Cations get multiple pKa values from ChemAxon, which is
            # obviously a bug. We override the important ones here:
            'C00076': ({'Ca': 1, 'e-': 18}, [], '[Ca++]', 0, [0], [2]),  # Ca2+
            'C00238': ({'K': 1, 'e-': 18}, [], '[K+]', 0, [0], [1]),     # K+
            'C00305': ({'Mg': 1, 'e-': 10}, [], '[Mg++]', 0, [0], [2]),  # Mg2+
            'C14818': ({'Fe': 1, 'e-': 24}, [], '[Fe++]', 0, [0], [2]),  # Fe2+
            'C14819': ({'Fe': 1, 'e-': 23}, [], '[Fe+++]', 0, [0], [3]),  # Fe3+
            'C00138': ({'Fe': 1, 'e-': 26}, [], None, 0, [0], [0]),  # ferredoxin(red)
            'C00139': ({'Fe': 1, 'e-': 25}, [], None, 0, [0], [1]),  # ferredoxin(ox)
        }
        return overrides.get(compound_id)

    @staticmethod
    def from_inchi(database, compound_id, inchi):
        """
        Create a Compound from its InChI.

        Compounds with a hard-coded override are returned directly; a
        compound without a structure (inchi is None) gets a single
        uncharged microspecies. For everything else ChemAxon is used to
        get the pKas and the major microspecies at pH 7.
        """
        hardcoded = Compound._hardcoded_parameters(compound_id)
        if hardcoded is not None:
            return Compound(database, compound_id, inchi, *hardcoded)

        if inchi is None:
            # If the compound has no explicit structure, we assume that it
            # has no proton dissociations in the relevant pH range
            return Compound(database, compound_id, inchi,
                            {}, [], None, 0, [0], [0])

        # Otherwise, we use ChemAxon's software to get the pKas and the
        # properties of all microspecies
        try:
            pKas, major_ms_smiles = chemaxon.GetDissociationConstants(inchi)
            major_ms_smiles = Compound.smiles2smiles(major_ms_smiles)
            # keep only the pKas inside the relevant pH range, highest first
            pKas = sorted((pka for pka in pKas if MIN_PH < pka < MAX_PH),
                          reverse=True)
        except chemaxon.ChemAxonError:
            logging.warning('chemaxon failed to find pKas for this molecule: ' + inchi)
            # use the original InChI to get the parameters (i.e. assume it
            # represents the major microspecies at pH 7)
            major_ms_smiles = Compound.inchi2smiles(inchi)
            pKas = []

        if major_ms_smiles:
            atom_bag, major_ms_charge = chemaxon.GetAtomBagAndCharge(major_ms_smiles)
            major_ms_nH = atom_bag.get('H', 0)
        else:
            atom_bag = {}
            major_ms_charge = 0
            major_ms_nH = 0

        # The index of the major MS at pH 7 is the number of pKas above 7.
        # Every other MS differs from it by one proton (and one charge) per
        # step in the index.
        majorMSpH7 = sum(1 for pka in pKas if pka > 7)
        offsets = [i - majorMSpH7 for i in range(len(pKas) + 1)]
        zs = [offset + major_ms_charge for offset in offsets]
        nHs = [offset + major_ms_nH for offset in offsets]

        return Compound(database, compound_id, inchi,
                        atom_bag, pKas, major_ms_smiles, majorMSpH7, nHs, zs)

    def to_json_dict(self):
        """Return the attributes of this Compound as a plain dictionary."""
        return {'database': self.database,
                'compound_id': self.compound_id,
                'inchi': self.inchi,
                'atom_bag': self.atom_bag,
                'pKas': self.pKas,
                'smiles_pH7': self.smiles_pH7,
                'majorMSpH7': self.majorMSpH7,
                'nHs': self.nHs,
                'zs': self.zs}

    @staticmethod
    def from_json_dict(d):
        """Create a Compound from a dictionary made by to_json_dict()."""
        return Compound(d['database'], d['compound_id'], d['inchi'], d['atom_bag'],
                        d['pKas'], d['smiles_pH7'], d['majorMSpH7'],
                        d['nHs'], d['zs'])

    @staticmethod
    def get_inchi(compound_id):
        """Download the MOL file of a KEGG compound and convert it to InChI."""
        url = 'http://rest.kegg.jp/get/cpd:%s/mol' % compound_id
        # read() returns bytes: decode them (str() on bytes would produce
        # the "b'...'" repr) and make sure the connection gets closed.
        with urllib.request.urlopen(url) as response:
            s_mol = response.read().decode('utf-8')
        return Compound.mol2inchi(s_mol)

    @staticmethod
    def _convert(text, in_format, out_format, inchi_options=False,
                 fail_on_read_error=False):
        """
        Convert a molecule description between two OpenBabel formats.

        Returns the converted string, or None if the result is empty (or,
        when fail_on_read_error is set, if the input could not be read).
        """
        openbabel.obErrorLog.SetOutputLevel(-1)

        conv = openbabel.OBConversion()
        conv.SetInAndOutFormats(in_format, out_format)
        if inchi_options:
            # the same writer options are used for every InChI output
            conv.AddOption("F", conv.OUTOPTIONS)
            conv.AddOption("T", conv.OUTOPTIONS)
            conv.AddOption("x", conv.OUTOPTIONS, "noiso")
            conv.AddOption("w", conv.OUTOPTIONS)
        obmol = openbabel.OBMol()
        read_ok = conv.ReadString(obmol, str(text))
        if fail_on_read_error and not read_ok:
            return None
        converted = conv.WriteString(obmol, True)  # second argument is trimWhitespace
        if converted == '':
            return None
        return converted

    @staticmethod
    def mol2inchi(s):
        """Convert a MOL file (str or bytes) to InChI; None on failure."""
        if isinstance(s, bytes):
            s = s.decode('utf-8')
        return Compound._convert(s, 'mol', 'inchi', inchi_options=True,
                                 fail_on_read_error=True)

    @staticmethod
    def inchi2smiles(inchi):
        """Convert an InChI to SMILES; None if the result is empty."""
        return Compound._convert(inchi, 'inchi', 'smiles')

    @staticmethod
    def smiles2smiles(smiles_in):
        """Rewrite a SMILES through OpenBabel; None if the result is empty."""
        return Compound._convert(smiles_in, 'smiles', 'smiles')

    @staticmethod
    def smiles2inchi(smiles):
        """Convert a SMILES to InChI; None if the result is empty."""
        return Compound._convert(smiles, 'smiles', 'inchi', inchi_options=True)

    def __str__(self):
        return "%s\nInChI: %s\npKas: %s\nmajor MS: nH = %d, charge = %d" % \
            (self.compound_id, self.inchi, ', '.join(['%.2f' % p for p in self.pKas]),
             self.nHs[self.majorMSpH7], self.zs[self.majorMSpH7])

    def _dG0_prime_vector(self, pH, I, T):
        """
        Calculates the difference in kJ/mol between dG'0 and
        the dG0 of the MS with the least hydrogens (dG0[0])

        Returns:
            dG'0 - dG0[0], one entry per microspecies
            (or the scalar 0 if the compound has no InChI)
        """
        if self.inchi is None:
            return 0
        elif self.pKas == []:
            dG0s = np.zeros((1, 1))
        else:
            dG0s = -np.cumsum([0] + self.pKas) * R * T * np.log(10)
        DH = debye_huckel((I, T))

        # dG0' = dG0 + nH * (R T ln(10) pH + DH) - charge^2 * DH
        # columns of `pseudoisomers`: dG0, nH, charge
        pseudoisomers = np.vstack([dG0s, np.array(self.nHs), np.array(self.zs)]).T
        dG0_prime_vector = pseudoisomers[:, 0] + \
            pseudoisomers[:, 1] * (R * T * np.log(10) * pH + DH) - \
            pseudoisomers[:, 2]**2 * DH
        return dG0_prime_vector

    def _transform(self, pH, I, T):
        """Combine the dG'0 of all microspecies into a single value."""
        return -R * T * logsumexp(self._dG0_prime_vector(pH, I, T) / (-R * T))

    def _ddG(self, i_from, i_to, T):
        """
        Calculates the difference in kJ/mol between two MSs.

        Returns:
            dG0[i_to] - dG0[i_from]

        Raises:
            ValueError - if one of the indices is not in [0, len(pKas)]
        """
        if not (0 <= i_from <= len(self.pKas)):
            raise ValueError('MS index is out of bounds: 0 <= %d <= %d' % (i_from, len(self.pKas)))

        if not (0 <= i_to <= len(self.pKas)):
            raise ValueError('MS index is out of bounds: 0 <= %d <= %d' % (i_to, len(self.pKas)))

        if i_from == i_to:
            return 0
        elif i_from < i_to:
            return sum(self.pKas[i_from:i_to]) * R * T * np.log(10)
        else:
            return -sum(self.pKas[i_to:i_from]) * R * T * np.log(10)

    def transform(self, i, pH, I, T):
        """
        Returns the difference in kJ/mol between dG'0 and the dG0 of the
        MS with index 'i'.

        Returns:
            (dG'0 - dG0[0]) + (dG0[0] - dG0[i]) = dG'0 - dG0[i]
        """
        return self._transform(pH, I, T) + self._ddG(0, i, T)

    def transform_pH7(self, pH, I, T):
        """
        Returns the transform for the major MS in pH 7
        """
        return self.transform(self.majorMSpH7, pH, I, T)

    def transform_neutral(self, pH, I, T):
        """
        Returns the transform for the MS with no charge

        Raises:
            ValueError - if none of the microspecies has a charge of 0
        """
        # Only the index lookup is guarded, so that a ValueError raised
        # while computing the transform is not misreported as a missing
        # neutral microspecies.
        try:
            neutral_index = self.zs.index(0)
        except ValueError:
            raise ValueError("The compound (%s) does not have a microspecies with 0 charge"
                             % self.compound_id)
        # transform() takes the MS index as its first argument
        return self.transform(neutral_index, pH, I, T)

    def get_species(self, major_ms_dG0_f, T):
        """
        Given the chemical formation energy of the major microspecies,
        uses the pKa values to calculate the chemical formation energies
        of all other species, and yields one dictionary per species with
        all the relevant data: dG0_f, nH, nMg, z (charge)
        """
        for i, (nH, z) in enumerate(zip(self.nHs, self.zs)):
            dG0_f = major_ms_dG0_f + self._ddG(i, self.majorMSpH7, T)
            d = {'phase': 'aqueous', 'dG0_f': np.round(dG0_f, 2),
                 'nH': nH, 'z': z, 'nMg': 0}
            yield d
|
316 |
+
|
317 |
+
if __name__ == '__main__':
    # Manual smoke test: build a few KEGG compounds, print their data to
    # stderr and store them in the compound cache.
    import sys
    import json
    logger = logging.getLogger('')
    logger.setLevel(logging.DEBUG)
    from compound_cacher import CompoundCacher, CompoundEncoder
    from molecule import Molecule, OpenBabelError
    ccache = CompoundCacher(cache_fname=None)

    for compound_id in ['C00087', 'C00282', 'C00237']:
        comp = Compound.from_kegg(compound_id)
        try:
            mol = Molecule.FromInChI(str(comp.inchi))
            structure_info = (str(comp.inchi), mol.GetFormula(),
                              mol.GetNumElectrons())
            sys.stderr.write('%s : formula = %s, nE = %s' % structure_info)
        except OpenBabelError:
            # the structure line is optional; the summary below is still written
            pass
        ccache.add(comp)
        summary = (compound_id, str(comp.nHs), str(comp.zs),
                   str(comp.pKas), str(comp.atom_bag))
        sys.stderr.write('\ncompound id = %s, nH = %s, z = %s, pKa = %s, bag = %s\n\n\n' %
                         summary)

    ccache.dump()
|
CC/compound_cacher.py
ADDED
@@ -0,0 +1,202 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json, os, logging, csv, gzip, numpy, pdb
|
2 |
+
from compound import Compound
|
3 |
+
base_path = os.path.split(os.path.realpath(__file__))[0]
|
4 |
+
|
5 |
+
### Input Files:
|
6 |
+
# original version of the KEGG compound file
|
7 |
+
OLD_COMPOUND_JSON_FNAME = os.path.join(base_path, './data_cc/equilibrator_compounds.json.gz')
|
8 |
+
|
9 |
+
# a CSV file with additional names and InChIs (mostly compounds missing from KEGG
|
10 |
+
# and added manually)
|
11 |
+
KEGG_ADDITIONS_TSV_FNAME = os.path.join(base_path, './data_cc/kegg_additions.tsv')
|
12 |
+
|
13 |
+
### Files created by this module:
|
14 |
+
# names and InChIs only
|
15 |
+
KEGG_COMPOUND_JSON_FNAME = os.path.join(base_path, './data_cc/kegg_compounds.json.gz')
|
16 |
+
|
17 |
+
# names, InChIs and pKa data
|
18 |
+
DEFAULT_CACHE_FNAME = os.path.join(base_path, './data_cc/compounds.json.gz')
|
19 |
+
|
20 |
+
|
21 |
+
class CompoundEncoder(json.JSONEncoder):
    """JSON encoder that serializes Compound objects via to_json_dict()."""

    def default(self, obj):
        # anything that is not a Compound is left to the base encoder
        if not isinstance(obj, Compound):
            return super().default(obj)
        return obj.to_json_dict()
|
26 |
+
|
27 |
+
class Singleton(type):
    """
    Metaclass that makes a class hand out one shared instance: the first
    call constructs it (using that call's arguments) and every later call
    returns the same object.
    """

    def __init__(cls, name, bases, dic):
        super().__init__(name, bases, dic)
        cls.instance = None

    def __call__(cls, *args, **kw):
        if cls.instance is not None:
            return cls.instance
        cls.instance = super().__call__(*args, **kw)
        return cls.instance
|
35 |
+
|
36 |
+
class CompoundCacher(object, metaclass=Singleton):
    """
    CompoundCacher is a singleton that handles caching of Compound objects
    for the component-contribution package. The Compounds are retrieved by
    their ID (which is the KEGG ID in most cases).
    The first time a Compound is requested, it is obtained from the relevant
    database and a Compound object is created (this takes a while because
    it usually involves internet communication and then invoking the ChemAxon
    plugin for calculating the pKa values for that structure).
    Any further request for the same Compound ID will draw the object from
    the cache. When the method dump() is called, all cached data is written
    to a file that will be loaded in future python sessions.
    """

    def __init__(self, cache_fname=None):
        """Load the compound-ID -> InChI table and the existing cache file.

        Args:
            cache_fname: path of the gzipped JSON cache file; when None,
                DEFAULT_CACHE_FNAME is used.
        """
        self.cache_fname = cache_fname
        if self.cache_fname is None:
            self.cache_fname = DEFAULT_CACHE_FNAME

        compounds = self._read_gzipped_json(KEGG_COMPOUND_JSON_FNAME)
        self.compound_id2inchi = {d['compound_id']: d['inchi']
                                  for d in compounds}
        self.need_to_update_cache_file = False
        self.load()

    @staticmethod
    def _read_gzipped_json(fname):
        """Parse a gzipped JSON file, closing the file handle afterwards."""
        with gzip.open(fname, 'rt', encoding='utf-8') as fp:
            return json.load(fp)

    @staticmethod
    def _write_gzipped_json(data, fname, **json_kwargs):
        """Write ``data`` as gzipped JSON.

        The file is opened in text mode ('wt'): json.dump() emits str, which
        a gzip file opened in binary mode ('w') rejects with a TypeError.
        """
        with gzip.open(fname, 'wt', encoding='utf-8') as fp:
            json.dump(data, fp, **json_kwargs)

    def get_all_compound_ids(self):
        """Return the sorted list of all compound IDs that have an InChI entry."""
        return sorted(self.compound_id2inchi.keys())

    def load(self):
        """Reset the in-memory cache and refill it from the cache file, if any."""
        # parse the JSON cache file and store in a dictionary 'compound_dict'
        self.compound_dict = {}
        self.compound_ids = []
        if not os.path.exists(self.cache_fname):
            return
        for d in self._read_gzipped_json(self.cache_fname):
            self.compound_ids.append(d['compound_id'])
            self.compound_dict[d['compound_id']] = Compound.from_json_dict(d)

    def dump(self):
        """Write the cache to ``cache_fname`` if it changed since the last dump."""
        if not self.need_to_update_cache_file:
            return
        data = sorted(self.compound_dict.values(),
                      key=lambda d: d.compound_id)
        dict_data = [x.to_json_dict() for x in data]
        self._write_gzipped_json(dict_data, self.cache_fname,
                                 cls=CompoundEncoder, sort_keys=True,
                                 indent=4, separators=(',', ': '))
        self.need_to_update_cache_file = False

    def _create_and_add(self, compound_id):
        """Build a Compound from its stored InChI and put it in the cache.

        Raises:
            KeyError: if ``compound_id`` has no entry in compound_id2inchi.
        """
        inchi = self.compound_id2inchi[compound_id]
        comp = Compound.from_inchi('KEGG', compound_id, inchi)
        self.add(comp)

    def get_compound(self, compound_id, kegg_additions_cids=None):
        """Return the Compound for ``compound_id``, creating it on a cache miss.

        Args:
            compound_id: the ID to look up.
            kegg_additions_cids: optional container of IDs that come from
                kegg_additions.tsv; a cached compound whose ID is in it is
                discarded and rebuilt from the current InChI.

        Returns:
            The cached (or freshly created) Compound object.
        """
        if compound_id not in self.compound_dict:
            logging.debug('Cache miss: %s', compound_id)
            self._create_and_add(compound_id)
        elif (kegg_additions_cids is not None
              and compound_id in kegg_additions_cids):
            # if a compound id is in the kegg_additions.tsv
            # remove the one in cache, and replace it with new one
            self.remove(compound_id)
            logging.debug('Cache update: %s', compound_id)
            self._create_and_add(compound_id)
        else:
            # Only report a hit when the cached object is really reused.
            logging.debug('Cache hit: %s', compound_id)
        return self.compound_dict[compound_id]

    def remove(self, compound_id):
        """Drop ``compound_id`` from the in-memory cache (no-op if absent)."""
        if compound_id in self.compound_dict:
            del self.compound_dict[compound_id]
        else:
            logging.debug('%s is not cached, cannot remove it', compound_id)

    def add(self, comp):
        """Store ``comp`` under its compound_id and mark the cache as dirty."""
        self.compound_dict[comp.compound_id] = comp
        self.need_to_update_cache_file = True

    def get_element_matrix(self, compound_ids):
        """Build the element-count matrix of the given compounds.

        Args:
            compound_ids: a single compound ID (str) or an iterable of IDs.

        Returns:
            A tuple (elements, Ematrix): ``elements`` is the sorted list of
            atom-bag keys found in the compounds (without 'H'), and
            ``Ematrix`` is a numpy.matrix with one row per compound and one
            column per element. Rows of compounds whose atom_bag is None
            are filled with NaN.
        """
        if isinstance(compound_ids, str):
            compound_ids = [compound_ids]
        # gather the "atom bags" of all compounds in a list 'atom_bag_list'
        elements = set()
        atom_bag_list = []
        for compound_id in compound_ids:
            atom_bag = self.get_compound(compound_id).atom_bag
            if atom_bag is not None:
                elements.update(atom_bag.keys())
            # Appended even when None so that rows stay aligned with the
            # requested compound IDs.
            atom_bag_list.append(atom_bag)
        elements.discard('H')  # don't balance H (it's enough to balance e-)
        elements = sorted(elements)

        # create the elemental matrix, where each row is a compound and each
        # column is an element (or e-)
        Ematrix = numpy.matrix(numpy.zeros((len(atom_bag_list), len(elements))))
        for i, atom_bag in enumerate(atom_bag_list):
            if atom_bag is None:
                Ematrix[i, :] = numpy.nan
            else:
                for j, elem in enumerate(elements):
                    Ematrix[i, j] = atom_bag.get(elem, 0)
        return elements, Ematrix

    ###############################################################################

    @staticmethod
    def RebuildCompoundJSON():
        """Regenerate KEGG_COMPOUND_JSON_FNAME from the old JSON and the TSV.

        Entries read from KEGG_ADDITIONS_TSV_FNAME override entries with the
        same ID that came from OLD_COMPOUND_JSON_FNAME.

        Returns:
            The list of compound IDs that were read from the additions TSV.
        """
        kegg_dict = {}
        for d in CompoundCacher._read_gzipped_json(OLD_COMPOUND_JSON_FNAME):
            cid = d['CID']
            kegg_dict[cid] = {'compound_id': cid,
                              'name': d['name'],
                              'names': d['names'],
                              'inchi': d['InChI']}

        # override some of the compounds or add new ones with 'fake' IDs,
        # i.e. C80000 or higher.
        kegg_additions_cids = []
        with open(KEGG_ADDITIONS_TSV_FNAME, 'r') as tsv_fp:
            for d in csv.DictReader(tsv_fp, delimiter='\t'):
                cid = 'C%05d' % int(d['cid'])
                kegg_additions_cids.append(cid)
                kegg_dict[cid] = {'compound_id': cid,
                                  'name': d['name'],
                                  'names': [d['name']],
                                  'inchi': d['inchi']}

        compound_json = [kegg_dict[compound_id]
                         for compound_id in sorted(kegg_dict.keys())]

        CompoundCacher._write_gzipped_json(compound_json,
                                           KEGG_COMPOUND_JSON_FNAME,
                                           sort_keys=True, indent=4)
        return kegg_additions_cids

    ###############################################################################

    @staticmethod
    def BuildCache(start_from_scratch=False, kegg_additions_cids=None):
        """Populate the default cache file with every known compound.

        Args:
            start_from_scratch: when True, delete the existing default cache
                file before building.
            kegg_additions_cids: optional iterable of compound IDs whose
                cached entries must be rebuilt (see get_compound).
        """
        if start_from_scratch and os.path.exists(DEFAULT_CACHE_FNAME):
            os.remove(DEFAULT_CACHE_FNAME)

        # NOTE(review): CompoundCacher is a singleton, so if an instance
        # already exists its in-memory cache is reused even after the file
        # was deleted above - confirm this is intended.
        ccache = CompoundCacher(cache_fname=DEFAULT_CACHE_FNAME)

        # One set gives O(1) membership tests in the loop below.
        if kegg_additions_cids is not None:
            kegg_additions_cids = frozenset(kegg_additions_cids)

        for i, compound_id in enumerate(ccache.get_all_compound_ids(), start=1):
            logging.debug('Caching %s', compound_id)
            comp = ccache.get_compound(compound_id,
                                       kegg_additions_cids=kegg_additions_cids)
            logging.debug('%s', comp)
            # Save progress periodically; building a compound is slow.
            if i % 100 == 0:
                logging.debug('Dumping Cache ...')
                ccache.dump()

        ccache.dump()
|
193 |
+
|
194 |
+
###############################################################################
|
195 |
+
|
196 |
+
if __name__ == '__main__':
    # Show every debug message from the root logger while rebuilding.
    logger = logging.getLogger('')
    logger.setLevel(logging.DEBUG)

    # Regenerate the compound JSON first, then refresh the cache so that the
    # compounds listed in the additions file are recomputed.
    kegg_additions_cids = CompoundCacher.RebuildCompoundJSON()
    CompoundCacher.BuildCache(
        start_from_scratch=False,
        kegg_additions_cids=kegg_additions_cids,
    )
|
CC/molecule.py
ADDED
@@ -0,0 +1,292 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from openbabel import openbabel
|
2 |
+
import types
|
3 |
+
import re
|
4 |
+
import chemaxon
|
5 |
+
from thermodynamic_constants import default_T, default_pH
|
6 |
+
import pdb
|
7 |
+
|
8 |
+
class OpenBabelError(Exception):
    """Raised when Open Babel fails to read or convert a molecule."""
|
10 |
+
|
11 |
+
class Molecule(object):
    """Wrapper around an Open Babel ``OBMol`` that caches its SMILES/InChI.

    Atom indices accepted or returned by the methods of this class are
    0-based; they are shifted by one when talking to Open Babel, whose atom
    indices start at 1.
    """

    # NOTE(review): the `from openbabel import openbabel` import style targets
    # Open Babel 3.x, where OBElementTable appears to have been removed in
    # favour of module-level functions - confirm this line works with the
    # installed version.
    _obElements = openbabel.OBElementTable()
    # Shared pattern object, re-initialized by VerifySmarts() and FindSmarts().
    _obSmarts = openbabel.OBSmartsPattern()

    @staticmethod
    def _NewConversion():
        """Return an OBConversion with the 'w' output option set."""
        obConversion = openbabel.OBConversion()
        # presumably 'w' silences minor writer warnings - TODO confirm
        obConversion.AddOption("w", obConversion.OUTOPTIONS)
        return obConversion

    @staticmethod
    def GetNumberOfElements():
        """Return the number of elements in the Open Babel element table."""
        return Molecule._obElements.GetNumberOfElements()

    @staticmethod
    def GetAllElements():
        """Return the symbols of all entries of the element table, in order."""
        return [Molecule._obElements.GetSymbol(i) for i in
                range(Molecule.GetNumberOfElements())]

    @staticmethod
    def GetSymbol(atomic_num):
        """Return the element symbol for ``atomic_num``."""
        return Molecule._obElements.GetSymbol(atomic_num)

    @staticmethod
    def GetAtomicNum(elem):
        """Return the atomic number for the element symbol ``elem``."""
        return Molecule._obElements.GetAtomicNum(elem)

    @staticmethod
    def VerifySmarts(smarts):
        """Return the result of initializing the shared pattern with ``smarts``."""
        return Molecule._obSmarts.Init(smarts)

    def __init__(self):
        self.title = None
        self.obmol = openbabel.OBMol()
        # Cached string representations; None means "not computed yet".
        self.smiles = None
        self.inchi = None

    def __str__(self):
        return self.title or self.smiles or self.inchi or ""

    def __len__(self):
        return self.GetNumAtoms()

    def Clone(self):
        """Return a new Molecule holding a copy of the underlying OBMol."""
        tmp = Molecule()
        tmp.title = self.title
        tmp.obmol = openbabel.OBMol(self.obmol)
        tmp.smiles = self.smiles
        tmp.inchi = self.inchi
        return tmp

    def SetTitle(self, title):
        self.title = title

    @staticmethod
    def FromSmiles(smiles):
        """Create a Molecule from a SMILES string.

        Raises:
            OpenBabelError: if the string cannot be read or converted back.
        """
        m = Molecule()
        m.smiles = smiles
        obConversion = Molecule._NewConversion()
        obConversion.SetInFormat("smiles")
        if not obConversion.ReadString(m.obmol, m.smiles):
            raise OpenBabelError("Cannot read the SMILES string: " + smiles)
        try:
            m.UpdateSmiles()
        except OpenBabelError:
            raise OpenBabelError("Failed to create Molecule from SMILES: " + smiles)
        m.SetTitle(smiles)
        return m

    @staticmethod
    def FromInChI(inchi):
        """Create a Molecule from an InChI string.

        Raises:
            OpenBabelError: if the string cannot be read or converted back.
        """
        m = Molecule()
        m.inchi = inchi
        obConversion = Molecule._NewConversion()
        obConversion.SetInFormat("inchi")
        # Check the read result, as FromSmiles() and InChI2Smiles() do.
        if not obConversion.ReadString(m.obmol, m.inchi):
            raise OpenBabelError("Cannot read the InChI string: " + inchi)
        try:
            m.UpdateInChI()
        except OpenBabelError:
            raise OpenBabelError("Failed to create Molecule from InChI: " + inchi)
        m.SetTitle(inchi)
        return m

    @staticmethod
    def FromMol(mol):
        """Create a Molecule from the text of a MOL file.

        Raises:
            OpenBabelError: if the text cannot be read or converted.
        """
        m = Molecule()
        obConversion = Molecule._NewConversion()
        obConversion.SetInFormat("mol")
        if not obConversion.ReadString(m.obmol, mol):
            raise OpenBabelError("Cannot read the MOL file:\n" + mol)
        try:
            m.UpdateInChI()
            m.UpdateSmiles()
        except OpenBabelError:
            raise OpenBabelError("Failed to create Molecule from MOL file:\n" + mol)
        m.SetTitle("")
        return m

    @staticmethod
    def FromOBMol(obmol):
        """Wrap an existing OBMol (not copied) in a Molecule.

        Raises:
            OpenBabelError: if the InChI or SMILES cannot be generated.
        """
        m = Molecule()
        m.obmol = obmol
        try:
            m.UpdateInChI()
            m.UpdateSmiles()
        except OpenBabelError:
            raise OpenBabelError("Failed to create Molecule from OBMol")
        m.SetTitle("")
        return m

    @staticmethod
    def _FromFormat(s, fmt='inchi'):
        """Dispatch to the From* constructor that matches ``fmt``.

        Returns None when ``fmt`` is not one of 'smiles', 'smi', 'inchi',
        'mol' or 'obmol'.
        """
        if fmt == 'smiles' or fmt == 'smi':
            return Molecule.FromSmiles(s)
        if fmt == 'inchi':
            return Molecule.FromInChI(s)
        if fmt == 'mol':
            return Molecule.FromMol(s)
        if fmt == 'obmol':
            return Molecule.FromOBMol(s)
        return None

    @staticmethod
    def _ToFormat(obmol, fmt='inchi'):
        """Convert ``obmol`` to a string in the Open Babel format ``fmt``.

        For SMILES only the first whitespace-separated token of the output
        is returned; InChI output is stripped; other formats are returned
        unchanged.

        Raises:
            OpenBabelError: if the writer produces no output.
        """
        obConversion = Molecule._NewConversion()
        obConversion.SetOutFormat(fmt)
        res = obConversion.WriteString(obmol)
        if not res:
            raise OpenBabelError("Cannot convert OBMol to %s" % fmt)
        if fmt == 'smiles' or fmt == 'smi':
            tokens = res.split()
            if not tokens:
                raise OpenBabelError("Cannot convert OBMol to %s" % fmt)
            return tokens[0]
        if fmt == 'inchi':
            return res.strip()
        return res

    @staticmethod
    def Smiles2InChI(smiles):
        """Convert a SMILES string to InChI.

        Raises:
            OpenBabelError: if the SMILES string cannot be read.
        """
        obConversion = Molecule._NewConversion()
        obConversion.SetInAndOutFormats("smiles", "inchi")
        obmol = openbabel.OBMol()
        if not obConversion.ReadString(obmol, smiles):
            raise OpenBabelError("Cannot read the SMILES string: " + smiles)
        return obConversion.WriteString(obmol).strip()

    @staticmethod
    def InChI2Smiles(inchi):
        """Convert an InChI string to SMILES.

        Raises:
            OpenBabelError: if the InChI string cannot be read.
        """
        obConversion = Molecule._NewConversion()
        obConversion.SetInAndOutFormats("inchi", "smiles")
        obmol = openbabel.OBMol()
        if not obConversion.ReadString(obmol, inchi):
            raise OpenBabelError("Cannot read the InChI string: " + inchi)
        return obConversion.WriteString(obmol).split()[0]

    def RemoveHydrogens(self):
        self.obmol.DeleteHydrogens()

    def RemoveAtoms(self, indices):
        """Delete the atoms at the given 0-based ``indices``."""
        self.obmol.BeginModify()
        # Highest index first, so earlier deletions don't shift later ones.
        for i in sorted(indices, reverse=True):
            self.obmol.DeleteAtom(self.obmol.GetAtom(i+1))
        self.obmol.EndModify()
        # The cached strings no longer describe the molecule.
        self.smiles = None
        self.inchi = None

    def SetAtomicNum(self, index, new_atomic_num):
        """Change the element of the atom at 0-based ``index``."""
        self.obmol.GetAtom(index+1).SetAtomicNum(new_atomic_num)
        self.smiles = None
        self.inchi = None

    def ToOBMol(self):
        return self.obmol

    def ToFormat(self, fmt='inchi'):
        return Molecule._ToFormat(self.obmol, fmt=fmt)

    def ToMolfile(self):
        return self.ToFormat('mol')

    def UpdateInChI(self):
        self.inchi = Molecule._ToFormat(self.obmol, 'inchi')

    def ToInChI(self):
        """
        Lazy storage of the InChI identifier (calculate once only when
        asked for and store for later use).
        """
        if not self.inchi:
            self.UpdateInChI()
        return self.inchi

    def UpdateSmiles(self):
        self.smiles = Molecule._ToFormat(self.obmol, 'smiles')

    def ToSmiles(self):
        """
        Lazy storage of the SMILES identifier (calculate once only when
        asked for and store for later use).
        """
        if not self.smiles:
            self.UpdateSmiles()
        return self.smiles

    def GetFormula(self):
        """Return the formula layer of the InChI, or '' if none is found.

        Raises:
            ValueError: if the InChI matches the formula pattern more than once.
        """
        tokens = re.findall(r'InChI=1S?/([0-9A-Za-z\.]+)', self.ToInChI())
        if len(tokens) == 1:
            return tokens[0]
        elif len(tokens) > 1:
            raise ValueError('Bad InChI: ' + self.ToInChI())
        else:
            return ''

    def GetExactMass(self):
        return self.obmol.GetExactMass()

    def GetAtomBagAndCharge(self):
        """Return (atom_bag, charge) computed by chemaxon from the InChI."""
        inchi = self.ToInChI()
        atom_bag, major_ms_charge = chemaxon.GetAtomBagAndCharge(inchi)
        return atom_bag, major_ms_charge

    def GetHydrogensAndCharge(self):
        """Return (the 'H' entry of the atom bag, or 0, and the charge)."""
        atom_bag, charge = self.GetAtomBagAndCharge()
        return atom_bag.get('H', 0), charge

    def GetNumElectrons(self):
        """Calculates the number of electrons in a given molecule."""
        atom_bag, _ = self.GetAtomBagAndCharge()
        return atom_bag.get('e-', 0)

    def GetNumAtoms(self):
        return self.obmol.NumAtoms()

    def GetAtoms(self):
        """Return the list of all atoms of the underlying OBMol."""
        return [self.obmol.GetAtom(i) for i in
                range(1, self.obmol.NumAtoms() + 1)]

    def FindSmarts(self, smarts):
        """
        Find all matches of a SMARTS pattern in this molecule, converting
        the 1-based atom indices reported by Open Babel to 0-based ones.

        Args:
            smarts: the SMARTS query to search for.

        Returns:
            A list of matches, each one a list of 0-based atom indices
            (an empty list when there is no match).
        """
        Molecule._obSmarts.Init(smarts)
        if not Molecule._obSmarts.Match(self.obmol):
            return []
        return [[n - 1 for n in match]
                for match in Molecule._obSmarts.GetMapList()]

    def GetAtomCharges(self):
        """
        Returns:
            A list of charges, according to the number of atoms
            in the molecule
        """
        return [atom.GetFormalCharge() for atom in self.GetAtoms()]
|
287 |
+
|
288 |
+
if __name__ == '__main__':
    # Smoke test: parse an InChI and print the exact mass of the molecule.
    example = Molecule.FromInChI('InChI=1/C5H10O2/c1-3-5(6)7-4-2/h3-4H2,1-2H3')
    print(example.GetExactMass())
|
CC/thermodynamic_constants.py
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Physical constants and default conditions.
# NOTE(review): R is rounded to three significant figures (the gas constant
# is 8.314... J/(K*mol)); kept as is because results depend on it.
R = 8.31e-3 # kJ/(K*mol)
F = 96.485 # kC/mol
J_per_cal = 4.184
default_T = 298.15 # K
default_I = 0.25 # M
default_pH = 7.0
default_c0 = 1 # M
default_pMg = 10
default_RT = R * default_T
default_c_mid = 1e-3 # M
default_c_range = (1e-6, 1e-2) # M
dG0_f_Mg = -455.3 # kJ/mol, formation energy of Mg2+

# Display symbols (plain text).
symbol_d_G = "ΔG"
symbol_d_G0 = "ΔG°"
symbol_d_G_prime = "ΔG'"
symbol_d_G0_prime = "ΔG'°"

# Display symbols for reaction energies (with HTML sub/superscript markup).
symbol_dr_G = "Δ<sub>r</sub>G"
symbol_dr_G0 = "Δ<sub>r</sub>G°"
symbol_dr_G_prime = "Δ<sub>r</sub>G'"
symbol_dr_G0_prime = "Δ<sub>r</sub>G'°"
symbol_dr_Gc_prime = "Δ<sub>r</sub>G'<sup>c</sup>"

# Display symbols for formation energies (with HTML subscript markup).
symbol_df_G = "Δ<sub>f</sub>G"
symbol_df_G0 = "Δ<sub>f</sub>G°"
symbol_df_G_prime = "Δ<sub>f</sub>G'"
symbol_df_G0_prime = "Δ<sub>f</sub>G'°"


# Approximation of the temperature dependency of ionic strength effects
def DH_alpha(T):
    """Return the cubic-polynomial approximation of alpha at temperature T.

    Args:
        T: the temperature (presumably in K, like default_T - TODO confirm).
    """
    return 1e-3*(9.20483*T) - 1e-5*(1.284668 * T**2) + 1e-8*(4.95199 * T**3)


DH_beta = 1.6


# Debye-Huckel
def debye_huckel(I_T):
    """Return alpha(T) * sqrt(I) / (1 + beta * sqrt(I)).

    Args:
        I_T: a sequence whose first item is the ionic strength I and whose
            second item is the temperature T (a single argument, not two).
    """
    sqrt_I = I_T[0]**(0.5)
    return DH_alpha(I_T[1]) * sqrt_I / (1.0 + DH_beta * sqrt_I)
|
36 |
+
|