vuu10 committed on
Commit
6d990bb
·
1 Parent(s): a5f27f8

Upload 6 files

Browse files
CC/Untitled.ipynb ADDED
@@ -0,0 +1,1038 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "ed0cdaf6-71e1-4ef0-894f-0beabdc392cf",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import pandas as pd\n",
11
+ "import numpy as np\n",
12
+ "import re\n",
13
+ "from PIL import Image\n",
14
+ "import webbrowser\n",
15
+ "import json\n",
16
+ "import pickle\n",
17
+ "import sys \n",
18
+ "import joblib\n",
19
+ "import sys\n",
20
+ "\n",
21
+ "from rdkit import Chem\n",
22
+ "from rdkit.Chem import Draw\n",
23
+ "from rdkit.Chem import rdChemReactions as Reactions\n",
24
+ "\n",
25
+ "from compound_cacher import CompoundCacher\n",
26
+ "from compound import Compound\n",
27
+ "from chemaxon import *\n",
28
+ "import chemaxon"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "execution_count": 2,
34
+ "id": "e64deced-2a44-4d8e-ba8f-d9843f11724a",
35
+ "metadata": {},
36
+ "outputs": [],
37
+ "source": [
38
+ "def load_smiles():\n",
39
+ " db = pd.read_csv('./../data/cache_compounds_20160818.csv',index_col='compound_id')\n",
40
+ " db_smiles = db['smiles_pH7'].to_dict()\n",
41
+ " return db_smiles\n",
42
+ "\n",
43
+ "def load_molsig_rad1():\n",
44
+ " molecular_signature_r1 = json.load(open('./../data/decompose_vector_ac.json'))\n",
45
+ " return molecular_signature_r1\n",
46
+ "\n",
47
+ "def load_molsig_rad2():\n",
48
+ " molecular_signature_r2 = json.load(open('./../data/decompose_vector_ac_r2_py3_indent_modified_manual.json'))\n",
49
+ " return molecular_signature_r2\n",
50
+ "\n",
51
+ "def load_model():\n",
52
+ " filename = './../model/M12_model_BR.pkl'\n",
53
+ " loaded_model = joblib.load(open(filename, 'rb'))\n",
54
+ " return loaded_model"
55
+ ]
56
+ },
57
+ {
58
+ "cell_type": "code",
59
+ "execution_count": 3,
60
+ "id": "71615c14-49c3-45e7-9495-194ef22fb1ee",
61
+ "metadata": {},
62
+ "outputs": [],
63
+ "source": [
64
+ "db_smiles = load_smiles()\n",
65
+ "molsig_r1 = load_molsig_rad1()\n",
66
+ "molsig_r2 = load_molsig_rad2()\n",
67
+ "loaded_model = load_model()"
68
+ ]
69
+ },
70
+ {
71
+ "cell_type": "code",
72
+ "execution_count": 4,
73
+ "id": "b86b8049-cbf2-473f-8715-5e5f908193a2",
74
+ "metadata": {},
75
+ "outputs": [],
76
+ "source": [
77
+ "def parse_reaction_formula_side(s):\n",
78
+ " \"\"\"\n",
79
+ " Parses the side formula, e.g. '2 C00001 + C00002 + 3 C00003'\n",
80
+ " Ignores stoichiometry.\n",
81
+ "\n",
82
+ " Returns:\n",
83
+ " The set of CIDs.\n",
84
+ " \"\"\"\n",
85
+ " if s.strip() == \"null\":\n",
86
+ " return {}\n",
87
+ "\n",
88
+ " compound_bag = {}\n",
89
+ " for member in re.split('\\s+\\+\\s+', s):\n",
90
+ " tokens = member.split(None, 1)\n",
91
+ " if len(tokens) == 0:\n",
92
+ " continue\n",
93
+ " if len(tokens) == 1:\n",
94
+ " amount = 1\n",
95
+ " key = member\n",
96
+ " else:\n",
97
+ " amount = float(tokens[0])\n",
98
+ " key = tokens[1]\n",
99
+ "\n",
100
+ " compound_bag[key] = compound_bag.get(key, 0) + amount\n",
101
+ "\n",
102
+ " return compound_bag\n",
103
+ "\n",
104
+ "def parse_formula(formula, arrow='<=>', rid=None):\n",
105
+ " \"\"\"\n",
106
+ " Parses a two-sided formula such as: 2 C00001 => C00002 + C00003\n",
107
+ "\n",
108
+ " Return:\n",
109
+ " The set of substrates, products and the direction of the reaction\n",
110
+ " \"\"\"\n",
111
+ " tokens = formula.split(arrow)\n",
112
+ " if len(tokens) < 2:\n",
113
+ " print(('Reaction does not contain the arrow sign (%s): %s'\n",
114
+ " % (arrow, formula)))\n",
115
+ " if len(tokens) > 2:\n",
116
+ " print(('Reaction contains more than one arrow sign (%s): %s'\n",
117
+ " % (arrow, formula)))\n",
118
+ "\n",
119
+ " left = tokens[0].strip()\n",
120
+ " right = tokens[1].strip()\n",
121
+ "\n",
122
+ " sparse_reaction = {}\n",
123
+ " for cid, count in parse_reaction_formula_side(left).items():\n",
124
+ " sparse_reaction[cid] = sparse_reaction.get(cid, 0) - count\n",
125
+ "\n",
126
+ " for cid, count in parse_reaction_formula_side(right).items():\n",
127
+ " sparse_reaction[cid] = sparse_reaction.get(cid, 0) + count \n",
128
+ " \n",
129
+ " return sparse_reaction"
130
+ ]
131
+ },
132
+ {
133
+ "cell_type": "code",
134
+ "execution_count": 5,
135
+ "id": "7342b178-3472-4734-83e3-3de431abe15e",
136
+ "metadata": {},
137
+ "outputs": [],
138
+ "source": [
139
+ "rxn_string = \"C00222 + C00010 + C00006 <=> C00024 + C00011 + C00005\""
140
+ ]
141
+ },
142
+ {
143
+ "cell_type": "code",
144
+ "execution_count": 6,
145
+ "id": "7b4dfe4f-48a8-4011-b201-7fb3a3268cef",
146
+ "metadata": {},
147
+ "outputs": [],
148
+ "source": [
149
+ "rxn_dic = parse_formula(rxn_string)"
150
+ ]
151
+ },
152
+ {
153
+ "cell_type": "code",
154
+ "execution_count": 7,
155
+ "id": "1f523aa2-b9dc-4153-8c1c-dec58e1ab987",
156
+ "metadata": {},
157
+ "outputs": [],
158
+ "source": [
159
+ "def get_ddG0(rxn_dict,pH,I,novel_mets):\n",
160
+ " ccache = CompoundCacher()\n",
161
+ " # ddG0 = get_transform_ddG0(rxn_dict, ccache, pH, I, T)\n",
162
+ " T = 298.15\n",
163
+ " ddG0_forward = 0\n",
164
+ " for compound_id, coeff in rxn_dict.items():\n",
165
+ " if novel_mets != None and compound_id in novel_mets:\n",
166
+ " comp = novel_mets[compound_id]\n",
167
+ " else:\n",
168
+ " comp = ccache.get_compound(compound_id)\n",
169
+ " ddG0_forward += coeff * comp.transform_pH7(pH, I, T)\n",
170
+ "\n",
171
+ " return ddG0_forward"
172
+ ]
173
+ },
174
+ {
175
+ "cell_type": "code",
176
+ "execution_count": 8,
177
+ "id": "33cf30ff-8b2c-4da9-9134-75a60a5c5d66",
178
+ "metadata": {},
179
+ "outputs": [
180
+ {
181
+ "data": {
182
+ "text/plain": [
183
+ "-3.6254822995515497"
184
+ ]
185
+ },
186
+ "execution_count": 8,
187
+ "metadata": {},
188
+ "output_type": "execute_result"
189
+ }
190
+ ],
191
+ "source": [
192
+ "get_ddG0(rxn_dic, 7.0, 0.1, {})"
193
+ ]
194
+ },
195
+ {
196
+ "cell_type": "code",
197
+ "execution_count": 9,
198
+ "id": "9e39855d-eb9e-4ea9-aeb9-8b770cc24c8e",
199
+ "metadata": {},
200
+ "outputs": [],
201
+ "source": [
202
+ "def get_rule(rxn_dict, molsig1, molsig2, novel_decomposed1, novel_decomposed2):\n",
203
+ " if novel_decomposed1 != None:\n",
204
+ " for cid in novel_decomposed1:\n",
205
+ " molsig1[cid] = novel_decomposed1[cid]\n",
206
+ " if novel_decomposed2 != None:\n",
207
+ " for cid in novel_decomposed2:\n",
208
+ " molsig2[cid] = novel_decomposed2[cid]\n",
209
+ "\n",
210
+ " molsigna_df1 = pd.DataFrame.from_dict(molsig1).fillna(0)\n",
211
+ " all_mets1 = molsigna_df1.columns.tolist()\n",
212
+ " all_mets1.append(\"C00080\")\n",
213
+ " all_mets1.append(\"C00282\")\n",
214
+ "\n",
215
+ " molsigna_df2 = pd.DataFrame.from_dict(molsig2).fillna(0)\n",
216
+ " all_mets2 = molsigna_df2.columns.tolist()\n",
217
+ " all_mets2.append(\"C00080\")\n",
218
+ " all_mets2.append(\"C00282\")\n",
219
+ "\n",
220
+ " moieties_r1 = open('./data/group_names_r1.txt')\n",
221
+ " moieties_r2 = open('./data/group_names_r2_py3_modified_manual.txt')\n",
222
+ " moie_r1 = moieties_r1.read().splitlines()\n",
223
+ " moie_r2 = moieties_r2.read().splitlines()\n",
224
+ "\n",
225
+ " molsigna_df1 = molsigna_df1.reindex(moie_r1)\n",
226
+ " molsigna_df2 = molsigna_df2.reindex(moie_r2)\n",
227
+ "\n",
228
+ " rule_df1 = pd.DataFrame(index=molsigna_df1.index)\n",
229
+ " rule_df2 = pd.DataFrame(index=molsigna_df2.index)\n",
230
+ " # for rid, value in reaction_dict.items():\n",
231
+ " # # skip the reactions with missing metabolites\n",
232
+ " # mets = value.keys()\n",
233
+ " # flag = False\n",
234
+ " # for met in mets:\n",
235
+ " # if met not in all_mets:\n",
236
+ " # flag = True\n",
237
+ " # break\n",
238
+ " # if flag: continue\n",
239
+ "\n",
240
+ " rule_df1['change'] = 0\n",
241
+ " for met, stoic in rxn_dict.items():\n",
242
+ " if met == \"C00080\" or met == \"C00282\":\n",
243
+ " continue # hydrogen is zero\n",
244
+ " rule_df1['change'] += molsigna_df1[met] * stoic\n",
245
+ "\n",
246
+ " rule_df2['change'] = 0\n",
247
+ " for met, stoic in rxn_dict.items():\n",
248
+ " if met == \"C00080\" or met == \"C00282\":\n",
249
+ " continue # hydrogen is zero\n",
250
+ " rule_df2['change'] += molsigna_df2[met] * stoic\n",
251
+ "\n",
252
+ " rule_vec1 = rule_df1.to_numpy().T\n",
253
+ " rule_vec2 = rule_df2.to_numpy().T\n",
254
+ "\n",
255
+ " m1, n1 = rule_vec1.shape\n",
256
+ " m2, n2 = rule_vec2.shape\n",
257
+ "\n",
258
+ " zeros1 = np.zeros((m1, 44))\n",
259
+ " zeros2 = np.zeros((m2, 44))\n",
260
+ " X1 = np.concatenate((rule_vec1, zeros1), 1)\n",
261
+ " X2 = np.concatenate((rule_vec2, zeros2), 1)\n",
262
+ "\n",
263
+ " rule_comb = np.concatenate((X1, X2), 1)\n",
264
+ "\n",
265
+ " # rule_df_final = {}\n",
266
+ " # rule_df_final['rad1'] = rule_df1\n",
267
+ " # rule_df_final['rad2'] = rule_df2\n",
268
+ " return rule_comb, rule_df1, rule_df2\n"
269
+ ]
270
+ },
271
+ {
272
+ "cell_type": "code",
273
+ "execution_count": 14,
274
+ "id": "a93ea75e-9851-45fd-aa58-d7f325b4b5a6",
275
+ "metadata": {},
276
+ "outputs": [
277
+ {
278
+ "data": {
279
+ "text/plain": [
280
+ "{'C00222': -1,\n",
281
+ " 'C00010': -1,\n",
282
+ " 'C00006': -1,\n",
283
+ " 'C00024': 1,\n",
284
+ " 'C00011': 1,\n",
285
+ " 'C00005': 1}"
286
+ ]
287
+ },
288
+ "execution_count": 14,
289
+ "metadata": {},
290
+ "output_type": "execute_result"
291
+ }
292
+ ],
293
+ "source": [
294
+ "rxn_dic"
295
+ ]
296
+ },
297
+ {
298
+ "cell_type": "code",
299
+ "execution_count": null,
300
+ "id": "981948dd-db2c-4463-b983-1220353d963e",
301
+ "metadata": {},
302
+ "outputs": [],
303
+ "source": []
304
+ },
305
+ {
306
+ "cell_type": "code",
307
+ "execution_count": 23,
308
+ "id": "96eb1c38-2ca7-4e38-bcc4-ade1cef73852",
309
+ "metadata": {},
310
+ "outputs": [
311
+ {
312
+ "data": {
313
+ "text/plain": [
314
+ "(array([-19.96775194]), array([6.66052556]))"
315
+ ]
316
+ },
317
+ "execution_count": 23,
318
+ "metadata": {},
319
+ "output_type": "execute_result"
320
+ }
321
+ ],
322
+ "source": [
323
+ "loaded_model.predict(X, return_std= True)"
324
+ ]
325
+ },
326
+ {
327
+ "cell_type": "code",
328
+ "execution_count": null,
329
+ "id": "81128dd3-5005-40a6-b5fe-8ecacef824bc",
330
+ "metadata": {},
331
+ "outputs": [],
332
+ "source": [
333
+ "def get_ddG0(rxn_dict,pH,I,novel_mets):\n",
334
+ " ccache = CompoundCacher()\n",
335
+ " # ddG0 = get_transform_ddG0(rxn_dict, ccache, pH, I, T)\n",
336
+ " T = 298.15\n",
337
+ " ddG0_forward = 0\n",
338
+ " for compound_id, coeff in rxn_dict.items():\n",
339
+ " if novel_mets != None and compound_id in novel_mets:\n",
340
+ " comp = novel_mets[compound_id]\n",
341
+ " else:\n",
342
+ " comp = ccache.get_compound(compound_id)\n",
343
+ " ddG0_forward += coeff * comp.transform_pH7(pH, I, T)\n",
344
+ "\n",
345
+ " return ddG0_forward\n",
346
+ "\n",
347
+ "\n",
348
+ "def get_dG0(rxn_dict,rid,pH,I,loaded_model,molsig_r1, molsig_r2, novel_decomposed_r1, novel_decomposed_r2,novel_mets):\n",
349
+ " rule_comb, rule_df1, rule_df2 = get_rule(rxn_dict,molsig_r1,molsig_r2, novel_decomposed_r1, novel_decomposed_r2)\n",
350
+ " X = rule_comb\n",
351
+ " ymean, ystd = loaded_model.predict(X, return_std=True)\n",
352
+ " result = {}\n",
353
+ " return ymean[0] + get_ddG0(rxn_dict, pH, I, novel_mets),ystd[0], rule_df1, rule_df2"
354
+ ]
355
+ },
356
+ {
357
+ "cell_type": "code",
358
+ "execution_count": null,
359
+ "id": "751ec201-f062-4ac0-8d24-fe959636cbdc",
360
+ "metadata": {},
361
+ "outputs": [],
362
+ "source": []
363
+ },
364
+ {
365
+ "cell_type": "code",
366
+ "execution_count": null,
367
+ "id": "c6cb1e4d-24be-42a1-b88b-793a62597c92",
368
+ "metadata": {},
369
+ "outputs": [],
370
+ "source": []
371
+ },
372
+ {
373
+ "cell_type": "code",
374
+ "execution_count": null,
375
+ "id": "7abe24be-1653-455b-9931-9446480d39bb",
376
+ "metadata": {},
377
+ "outputs": [],
378
+ "source": []
379
+ },
380
+ {
381
+ "cell_type": "code",
382
+ "execution_count": null,
383
+ "id": "f13433dc-51a3-41e5-8a0b-b0f21724ef98",
384
+ "metadata": {},
385
+ "outputs": [],
386
+ "source": []
387
+ },
388
+ {
389
+ "cell_type": "code",
390
+ "execution_count": 2,
391
+ "id": "db7c764f-d216-44a9-8f88-0e3a7c51377a",
392
+ "metadata": {},
393
+ "outputs": [],
394
+ "source": [
395
+ "ccc= CompoundCacher()"
396
+ ]
397
+ },
398
+ {
399
+ "cell_type": "code",
400
+ "execution_count": 3,
401
+ "id": "09e6f7f2-5be7-4db3-b55d-756ecb711095",
402
+ "metadata": {},
403
+ "outputs": [],
404
+ "source": [
405
+ "a = ccc.get_compound('C00001')"
406
+ ]
407
+ },
408
+ {
409
+ "cell_type": "code",
410
+ "execution_count": 4,
411
+ "id": "d28e44b7-d942-4739-9d7d-2f4e082ac1b9",
412
+ "metadata": {},
413
+ "outputs": [
414
+ {
415
+ "data": {
416
+ "text/plain": [
417
+ "81.4472134155519"
418
+ ]
419
+ },
420
+ "execution_count": 4,
421
+ "metadata": {},
422
+ "output_type": "execute_result"
423
+ }
424
+ ],
425
+ "source": [
426
+ "a.transform_pH7(7, 0.25 , 298)"
427
+ ]
428
+ },
429
+ {
430
+ "cell_type": "code",
431
+ "execution_count": 5,
432
+ "id": "1ef3fc0d-7d63-42ea-8743-522fe010a95d",
433
+ "metadata": {},
434
+ "outputs": [],
435
+ "source": [
436
+ "inchi_k = \"InChI=1S/C14H14O/c15-14-8-4-7-13(11-14)10-9-12-5-2-1-3-6-12/h1-8,11,15H,9-10H2\" ;"
437
+ ]
438
+ },
439
+ {
440
+ "cell_type": "code",
441
+ "execution_count": 6,
442
+ "id": "4e651d1c-2c96-42d1-adab-466dc7518146",
443
+ "metadata": {},
444
+ "outputs": [
445
+ {
446
+ "name": "stderr",
447
+ "output_type": "stream",
448
+ "text": [
449
+ "C:\\Users\\vuu10\\AppData\\Local\\Continuum\\anaconda3\\envs\\dGPredictor_py3\\lib\\openbabel\\__init__.py:14: UserWarning: \"import openbabel\" is deprecated, instead use \"from openbabel import openbabel\"\n",
450
+ " warnings.warn('\"import openbabel\" is deprecated, instead use \"from openbabel import openbabel\"')\n"
451
+ ]
452
+ }
453
+ ],
454
+ "source": [
455
+ "c = Compound.from_inchi('Test', 'sajdf', inchi_k )"
456
+ ]
457
+ },
458
+ {
459
+ "cell_type": "code",
460
+ "execution_count": 18,
461
+ "id": "6eb5c2dc-f14c-46de-889b-0e9b7faa9f79",
462
+ "metadata": {},
463
+ "outputs": [
464
+ {
465
+ "ename": "AttributeError",
466
+ "evalue": "'Compound' object has no attribute 'smiles_ph7'",
467
+ "output_type": "error",
468
+ "traceback": [
469
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
470
+ "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)",
471
+ "\u001b[1;32m<ipython-input-18-7a0d06664090>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mc\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msmiles_ph7\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
472
+ "\u001b[1;31mAttributeError\u001b[0m: 'Compound' object has no attribute 'smiles_ph7'"
473
+ ]
474
+ }
475
+ ],
476
+ "source": [
477
+ "c.smiles_ph7()"
478
+ ]
479
+ },
480
+ {
481
+ "cell_type": "code",
482
+ "execution_count": 7,
483
+ "id": "edd156dc-4355-4c2c-ba4e-6d98e776a96a",
484
+ "metadata": {},
485
+ "outputs": [],
486
+ "source": [
487
+ "from chemaxon import *\n",
488
+ "import chemaxon"
489
+ ]
490
+ },
491
+ {
492
+ "cell_type": "code",
493
+ "execution_count": 8,
494
+ "id": "880d2ef6-6b03-49d3-8f60-66769c22a84d",
495
+ "metadata": {},
496
+ "outputs": [],
497
+ "source": [
498
+ "pKas, major_ms_smiles = chemaxon.GetDissociationConstants(inchi_k)"
499
+ ]
500
+ },
501
+ {
502
+ "cell_type": "code",
503
+ "execution_count": 9,
504
+ "id": "7a2391dc-313c-47f2-9f54-823bfdb95fcd",
505
+ "metadata": {},
506
+ "outputs": [
507
+ {
508
+ "data": {
509
+ "text/plain": [
510
+ "'OC1=CC=CC(CCC2=CC=CC=C2)=C1\\r'"
511
+ ]
512
+ },
513
+ "execution_count": 9,
514
+ "metadata": {},
515
+ "output_type": "execute_result"
516
+ }
517
+ ],
518
+ "source": [
519
+ "major_ms_smiles"
520
+ ]
521
+ },
522
+ {
523
+ "cell_type": "code",
524
+ "execution_count": 10,
525
+ "id": "96d90c4a-14a2-45fb-8573-97db84de2dff",
526
+ "metadata": {},
527
+ "outputs": [],
528
+ "source": [
529
+ "major_ms_smiles = Compound.smiles2smiles(major_ms_smiles)"
530
+ ]
531
+ },
532
+ {
533
+ "cell_type": "code",
534
+ "execution_count": 11,
535
+ "id": "36d46620-b895-4ec8-85d0-7499759812c6",
536
+ "metadata": {},
537
+ "outputs": [],
538
+ "source": [
539
+ "MIN_PH = 0.0\n",
540
+ "MAX_PH = 14.0\n",
541
+ "pKas = sorted([pka for pka in pKas if pka > MIN_PH and pka < MAX_PH], reverse=True)"
542
+ ]
543
+ },
544
+ {
545
+ "cell_type": "code",
546
+ "execution_count": 12,
547
+ "id": "ffccf9d9-5a52-4be6-af4c-f39b3db2a27c",
548
+ "metadata": {},
549
+ "outputs": [
550
+ {
551
+ "data": {
552
+ "text/plain": [
553
+ "[10.1]"
554
+ ]
555
+ },
556
+ "execution_count": 12,
557
+ "metadata": {},
558
+ "output_type": "execute_result"
559
+ }
560
+ ],
561
+ "source": [
562
+ "pKas"
563
+ ]
564
+ },
565
+ {
566
+ "cell_type": "code",
567
+ "execution_count": 13,
568
+ "id": "e83721fa-9a42-42ef-9a03-59fc2689c73b",
569
+ "metadata": {},
570
+ "outputs": [],
571
+ "source": [
572
+ "atom_bag, major_ms_charge = chemaxon.GetAtomBagAndCharge(major_ms_smiles)"
573
+ ]
574
+ },
575
+ {
576
+ "cell_type": "code",
577
+ "execution_count": null,
578
+ "id": "47a87ed7-968d-44b6-a237-a8469ba3fe3b",
579
+ "metadata": {},
580
+ "outputs": [],
581
+ "source": []
582
+ },
583
+ {
584
+ "cell_type": "code",
585
+ "execution_count": null,
586
+ "id": "49cfefde-ee96-4ca8-89af-c50f2f2ca70b",
587
+ "metadata": {},
588
+ "outputs": [],
589
+ "source": []
590
+ },
591
+ {
592
+ "cell_type": "code",
593
+ "execution_count": null,
594
+ "id": "9b881c7b-a14a-4561-9c3c-157116efdfd0",
595
+ "metadata": {},
596
+ "outputs": [],
597
+ "source": []
598
+ },
599
+ {
600
+ "cell_type": "code",
601
+ "execution_count": null,
602
+ "id": "10c8f915-e61a-4560-b546-fe6ea8bfdde3",
603
+ "metadata": {},
604
+ "outputs": [],
605
+ "source": []
606
+ },
607
+ {
608
+ "cell_type": "code",
609
+ "execution_count": null,
610
+ "id": "936fafa5-1bf6-495c-be79-d4cc620f4861",
611
+ "metadata": {},
612
+ "outputs": [],
613
+ "source": []
614
+ },
615
+ {
616
+ "cell_type": "code",
617
+ "execution_count": null,
618
+ "id": "285f9370-2fba-44c4-a36b-66c95f9f2eed",
619
+ "metadata": {},
620
+ "outputs": [],
621
+ "source": []
622
+ },
623
+ {
624
+ "cell_type": "code",
625
+ "execution_count": null,
626
+ "id": "adbcd78f-869a-4cc9-b727-03c80df31edd",
627
+ "metadata": {},
628
+ "outputs": [],
629
+ "source": []
630
+ },
631
+ {
632
+ "cell_type": "code",
633
+ "execution_count": null,
634
+ "id": "17fbfee9-c8b7-4644-814f-0e8aa0ad5ee9",
635
+ "metadata": {},
636
+ "outputs": [],
637
+ "source": []
638
+ },
639
+ {
640
+ "cell_type": "code",
641
+ "execution_count": 21,
642
+ "id": "70f90669-ff90-4bc4-955c-63672e42bb3c",
643
+ "metadata": {},
644
+ "outputs": [],
645
+ "source": [
646
+ "formula, formal_charge = GetFormulaAndCharge(molstring)\n",
647
+ "\n",
648
+ "atom_bag = {}"
649
+ ]
650
+ },
651
+ {
652
+ "cell_type": "code",
653
+ "execution_count": 25,
654
+ "id": "e40e4088-c246-4afb-98ae-f92cb738e988",
655
+ "metadata": {},
656
+ "outputs": [],
657
+ "source": [
658
+ "for mol_formula_times in formula.split('.'):\n",
659
+ " for times, mol_formula in re.findall('^(\\d+)?(\\w+)', mol_formula_times):\n",
660
+ " if not times:\n",
661
+ " times = 1\n",
662
+ " else:\n",
663
+ " times = int(times)\n",
664
+ " for atom, count in re.findall(\"([A-Z][a-z]*)([0-9]*)\", mol_formula):\n",
665
+ " if count == '':\n",
666
+ " count = 1\n",
667
+ " else:\n",
668
+ " count = int(count)\n",
669
+ " atom_bag[atom] = atom_bag.get(atom, 0) + count * times"
670
+ ]
671
+ },
672
+ {
673
+ "cell_type": "code",
674
+ "execution_count": 26,
675
+ "id": "391cfbba-2da5-4b60-ba32-217754913b35",
676
+ "metadata": {},
677
+ "outputs": [
678
+ {
679
+ "data": {
680
+ "text/plain": [
681
+ "{'C': 14, 'H': 14, 'O': 1}"
682
+ ]
683
+ },
684
+ "execution_count": 26,
685
+ "metadata": {},
686
+ "output_type": "execute_result"
687
+ }
688
+ ],
689
+ "source": [
690
+ "atom_bag"
691
+ ]
692
+ },
693
+ {
694
+ "cell_type": "code",
695
+ "execution_count": 52,
696
+ "id": "812f8297-a5cc-4d63-b132-243c278c6b76",
697
+ "metadata": {},
698
+ "outputs": [
699
+ {
700
+ "name": "stdout",
701
+ "output_type": "stream",
702
+ "text": [
703
+ "6\n",
704
+ "1\n",
705
+ "8\n"
706
+ ]
707
+ }
708
+ ],
709
+ "source": [
710
+ "from rdkit.Chem import rdchem\n",
711
+ "for (elem, c) in atom_bag.items():\n",
712
+ " ll = rdchem.GetPeriodicTable()\n",
713
+ " atomic_num = ll.GetAtomicNumber(elem)\n",
714
+ " print(atomic_num)"
715
+ ]
716
+ },
717
+ {
718
+ "cell_type": "code",
719
+ "execution_count": 55,
720
+ "id": "463fcb01-2cd0-4aee-990c-946c534dc766",
721
+ "metadata": {},
722
+ "outputs": [],
723
+ "source": [
724
+ "\n",
725
+ "n_protons = sum([c * ll.GetAtomicNumber(str(elem))\n",
726
+ " for (elem, c) in atom_bag.items()])"
727
+ ]
728
+ },
729
+ {
730
+ "cell_type": "code",
731
+ "execution_count": 57,
732
+ "id": "ac1c69f6-54db-41ba-9fdf-e7ab6a2dfcbc",
733
+ "metadata": {},
734
+ "outputs": [],
735
+ "source": [
736
+ "atom_bag['e-'] = n_protons - formal_charge"
737
+ ]
738
+ },
739
+ {
740
+ "cell_type": "code",
741
+ "execution_count": 58,
742
+ "id": "61b1931e-dbaf-4e0f-afb2-6595f64d70d6",
743
+ "metadata": {},
744
+ "outputs": [
745
+ {
746
+ "data": {
747
+ "text/plain": [
748
+ "{'C': 14, 'H': 14, 'O': 1, 'e-': 106}"
749
+ ]
750
+ },
751
+ "execution_count": 58,
752
+ "metadata": {},
753
+ "output_type": "execute_result"
754
+ }
755
+ ],
756
+ "source": [
757
+ "atom_bag"
758
+ ]
759
+ },
760
+ {
761
+ "cell_type": "code",
762
+ "execution_count": 60,
763
+ "id": "12bdbf80-7dc5-4d47-a479-703ad5a6aa06",
764
+ "metadata": {},
765
+ "outputs": [
766
+ {
767
+ "data": {
768
+ "text/plain": [
769
+ "0"
770
+ ]
771
+ },
772
+ "execution_count": 60,
773
+ "metadata": {},
774
+ "output_type": "execute_result"
775
+ }
776
+ ],
777
+ "source": [
778
+ "\n",
779
+ "formal_charge\n",
780
+ "\n"
781
+ ]
782
+ },
783
+ {
784
+ "cell_type": "code",
785
+ "execution_count": null,
786
+ "id": "b51f36c0-707a-4856-8c23-9081e2ea2cf7",
787
+ "metadata": {},
788
+ "outputs": [],
789
+ "source": [
790
+ "all_pKas, smiles_list = GetDissociationConstants_val(inchi_k)"
791
+ ]
792
+ },
793
+ {
794
+ "cell_type": "code",
795
+ "execution_count": 13,
796
+ "id": "6dd79761-760d-4233-b113-a34e6322a0e5",
797
+ "metadata": {},
798
+ "outputs": [],
799
+ "source": [
800
+ "MID_PH = 7.0\n",
801
+ "N_PKAS = 20\n",
802
+ "\n",
803
+ "n_acidic = N_PKAS\n",
804
+ "n_basic = N_PKAS\n",
805
+ "pH = MID_PH"
806
+ ]
807
+ },
808
+ {
809
+ "cell_type": "code",
810
+ "execution_count": 14,
811
+ "id": "6167191a-b361-4ae0-a78a-927490c72f87",
812
+ "metadata": {},
813
+ "outputs": [],
814
+ "source": [
815
+ "args = []\n",
816
+ "if n_acidic + n_basic > 0:\n",
817
+ " args += ['pka', '-a', str(n_acidic), '-b', str(n_basic),\n",
818
+ " 'majorms', '-M', 'true', '--pH', str(pH)]\n"
819
+ ]
820
+ },
821
+ {
822
+ "cell_type": "code",
823
+ "execution_count": 15,
824
+ "id": "dd4275ec-c71e-4b5b-bb35-de8b3c7c4883",
825
+ "metadata": {},
826
+ "outputs": [
827
+ {
828
+ "data": {
829
+ "text/plain": [
830
+ "['pka', '-a', '20', '-b', '20', 'majorms', '-M', 'true', '--pH', '7.0']"
831
+ ]
832
+ },
833
+ "execution_count": 15,
834
+ "metadata": {},
835
+ "output_type": "execute_result"
836
+ }
837
+ ],
838
+ "source": [
839
+ "args"
840
+ ]
841
+ },
842
+ {
843
+ "cell_type": "code",
844
+ "execution_count": null,
845
+ "id": "79d07dc5-963a-4373-9d72-1eb6de48ede9",
846
+ "metadata": {},
847
+ "outputs": [],
848
+ "source": []
849
+ },
850
+ {
851
+ "cell_type": "code",
852
+ "execution_count": 16,
853
+ "id": "712a71fb-e3e3-4b01-828d-5a3862aa1b30",
854
+ "metadata": {},
855
+ "outputs": [],
856
+ "source": [
857
+ "logging.debug(\"INPUT: echo %s | %s\" % (inchi_k, ' '.join([CXCALC_BIN] + args)))"
858
+ ]
859
+ },
860
+ {
861
+ "cell_type": "code",
862
+ "execution_count": 17,
863
+ "id": "287bf822-23b8-42de-85ca-e52678875cfa",
864
+ "metadata": {},
865
+ "outputs": [],
866
+ "source": [
867
+ "molstring= inchi_k"
868
+ ]
869
+ },
870
+ {
871
+ "cell_type": "code",
872
+ "execution_count": 18,
873
+ "id": "4d2ff427-237c-4d63-a718-f29f12884d96",
874
+ "metadata": {},
875
+ "outputs": [],
876
+ "source": [
877
+ "p1 = Popen([\"echo\", molstring], stdout=PIPE, shell=use_shell_for_echo)"
878
+ ]
879
+ },
880
+ {
881
+ "cell_type": "code",
882
+ "execution_count": 19,
883
+ "id": "923a09f2-b959-4837-ab1a-a858d91de0b4",
884
+ "metadata": {},
885
+ "outputs": [],
886
+ "source": [
887
+ "p2 = Popen([CXCALC_BIN] + args, stdin=p1.stdout,\n",
888
+ " executable=CXCALC_BIN, stdout=PIPE, shell=False)"
889
+ ]
890
+ },
891
+ {
892
+ "cell_type": "code",
893
+ "execution_count": 20,
894
+ "id": "a6b30545-c65a-4c56-9985-71a103b9da00",
895
+ "metadata": {},
896
+ "outputs": [],
897
+ "source": [
898
+ "res = p2.communicate()[0]"
899
+ ]
900
+ },
901
+ {
902
+ "cell_type": "code",
903
+ "execution_count": 21,
904
+ "id": "ac059602-027f-4a1a-932f-c1339c38c7d7",
905
+ "metadata": {},
906
+ "outputs": [],
907
+ "source": [
908
+ "if p2.returncode != 0:\n",
909
+ " raise ChemAxonError(str(args))\n",
910
+ "logging.debug(\"OUTPUT: %s\" % res)"
911
+ ]
912
+ },
913
+ {
914
+ "cell_type": "code",
915
+ "execution_count": 22,
916
+ "id": "671642a5-3877-44e3-b935-f987fd601444",
917
+ "metadata": {},
918
+ "outputs": [],
919
+ "source": [
920
+ "output = res"
921
+ ]
922
+ },
923
+ {
924
+ "cell_type": "code",
925
+ "execution_count": 23,
926
+ "id": "a9f4bb4a-af86-4e97-bf1d-40c58013f90e",
927
+ "metadata": {},
928
+ "outputs": [
929
+ {
930
+ "data": {
931
+ "text/plain": [
932
+ "b'id\\tapKa1\\tapKa2\\tapKa3\\tapKa4\\tapKa5\\tapKa6\\tapKa7\\tapKa8\\tapKa9\\tapKa10\\tapKa11\\tapKa12\\tapKa13\\tapKa14\\tapKa15\\tapKa16\\tapKa17\\tapKa18\\tapKa19\\tapKa20\\tbpKa1\\tbpKa2\\tbpKa3\\tbpKa4\\tbpKa5\\tbpKa6\\tbpKa7\\tbpKa8\\tbpKa9\\tbpKa10\\tbpKa11\\tbpKa12\\tbpKa13\\tbpKa14\\tbpKa15\\tbpKa16\\tbpKa17\\tbpKa18\\tbpKa19\\tbpKa20\\tatoms\\tmajor-ms\\r\\n1\\t10.10\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t-5.48\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t15,15\\tOC1=CC=CC(CCC2=CC=CC=C2)=C1\\r\\n'"
933
+ ]
934
+ },
935
+ "execution_count": 23,
936
+ "metadata": {},
937
+ "output_type": "execute_result"
938
+ }
939
+ ],
940
+ "source": [
941
+ "output"
942
+ ]
943
+ },
944
+ {
945
+ "cell_type": "code",
946
+ "execution_count": 24,
947
+ "id": "215ffc9b-35a8-4f45-8f39-9c99deae6335",
948
+ "metadata": {},
949
+ "outputs": [],
950
+ "source": [
951
+ "atom2pKa, smiles_list = ParsePkaOutput(output, n_acidic, n_basic)"
952
+ ]
953
+ },
954
+ {
955
+ "cell_type": "code",
956
+ "execution_count": 26,
957
+ "id": "21c380d3-5410-4c55-b6d7-cb0588f373ca",
958
+ "metadata": {},
959
+ "outputs": [
960
+ {
961
+ "data": {
962
+ "text/plain": [
963
+ "['OC1=CC=CC(CCC2=CC=CC=C2)=C1\\r']"
964
+ ]
965
+ },
966
+ "execution_count": 26,
967
+ "metadata": {},
968
+ "output_type": "execute_result"
969
+ }
970
+ ],
971
+ "source": [
972
+ "smiles_list"
973
+ ]
974
+ },
975
+ {
976
+ "cell_type": "code",
977
+ "execution_count": 27,
978
+ "id": "1437693a-0923-4df1-837d-acb2b524fcae",
979
+ "metadata": {},
980
+ "outputs": [],
981
+ "source": [
982
+ "all_pKas = []\n",
983
+ "for pKa_list in list(atom2pKa.values()):\n",
984
+ " all_pKas += [pKa for pKa, _ in pKa_list]"
985
+ ]
986
+ },
987
+ {
988
+ "cell_type": "code",
989
+ "execution_count": 28,
990
+ "id": "8e77324c-ed61-4615-a7c7-4f5ca781dc90",
991
+ "metadata": {},
992
+ "outputs": [
993
+ {
994
+ "data": {
995
+ "text/plain": [
996
+ "[10.1, -5.48]"
997
+ ]
998
+ },
999
+ "execution_count": 28,
1000
+ "metadata": {},
1001
+ "output_type": "execute_result"
1002
+ }
1003
+ ],
1004
+ "source": [
1005
+ "all_pKas"
1006
+ ]
1007
+ },
1008
+ {
1009
+ "cell_type": "code",
1010
+ "execution_count": null,
1011
+ "id": "8616be46-1814-4755-b919-4b7790569890",
1012
+ "metadata": {},
1013
+ "outputs": [],
1014
+ "source": []
1015
+ }
1016
+ ],
1017
+ "metadata": {
1018
+ "kernelspec": {
1019
+ "display_name": "Python 3",
1020
+ "language": "python",
1021
+ "name": "python3"
1022
+ },
1023
+ "language_info": {
1024
+ "codemirror_mode": {
1025
+ "name": "ipython",
1026
+ "version": 3
1027
+ },
1028
+ "file_extension": ".py",
1029
+ "mimetype": "text/x-python",
1030
+ "name": "python",
1031
+ "nbconvert_exporter": "python",
1032
+ "pygments_lexer": "ipython3",
1033
+ "version": "3.8.10"
1034
+ }
1035
+ },
1036
+ "nbformat": 4,
1037
+ "nbformat_minor": 5
1038
+ }
CC/chemaxon.py ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import csv
3
+ import re
4
+ import platform
5
+ import io
6
+ from subprocess import Popen, PIPE
7
+ from openbabel import openbabel
8
+ import pdb
9
+ from rdkit.Chem import rdchem
10
+
11
import os

# Location of ChemAxon's cxcalc executable. The per-platform default can be
# overridden by setting the CXCALC_BIN environment variable, so the module
# does not depend on one particular installation directory.
# (An older Marvin layout used
#  'C:\\Program Files (x86)\\ChemAxon\\MarvinBeans\\bin\\cxcalc.bat'.)
if platform.system() == 'Windows':
    _DEFAULT_CXCALC_BIN = 'C:\\Users\\vuu10\\AppData\\Local\\Programs\\ChemAxon\\MarvinSuite\\bin\\cxcalc.exe'
    use_shell_for_echo = True
else:
    _DEFAULT_CXCALC_BIN = 'cxcalc'
    use_shell_for_echo = False
CXCALC_BIN = os.environ.get('CXCALC_BIN', _DEFAULT_CXCALC_BIN)

# default pH at which the major microspecies is requested
MID_PH = 7.0
# default number of acidic and of basic pKa values requested from cxcalc
N_PKAS = 20
21
+
22
+
23
class ChemAxonError(Exception):
    """Raised when cxcalc fails or its output cannot be parsed."""
25
+
26
+
27
def RunCxcalc(molstring, args):
    """
        Runs ChemAxon's cxcalc on a single molecule.

        The molecule description is written directly to the standard input of
        cxcalc (followed by a newline), instead of going through a helper
        'echo' process that was never waited on and needed a shell on Windows.

        Arguments:
            molstring - a text description of the molecule (SMILES or InChI)
            args      - a list of command line arguments for cxcalc

        Returns:
            the standard output of cxcalc, decoded as UTF-8

        Raises:
            ChemAxonError - if cxcalc exits with a non-zero return code
            Exception     - if the cxcalc executable cannot be started
    """
    command = [CXCALC_BIN] + args
    try:
        logging.debug("INPUT: echo %s | %s" % (molstring, ' '.join(command)))
        proc = Popen(command, executable=CXCALC_BIN,
                     stdin=PIPE, stdout=PIPE, shell=False)
        # communicate() sends the input, closes stdin and waits for the exit
        res = proc.communicate(input=(molstring + '\n').encode('utf-8'))[0]
        if proc.returncode != 0:
            raise ChemAxonError(str(args))
        res = res.decode('utf-8')
        logging.debug("OUTPUT: %s" % res)
        return res
    except OSError:
        raise Exception(
            "Marvin (by ChemAxon) must be installed to calculate pKa data.")
50
+
51
+
52
def ParsePkaOutput(s, n_acidic, n_basic):
    """
        Parses the tab-separated table that cxcalc prints for the
        'pka' + 'majorms' calculation (one header line, one data line).

        Arguments:
            s        - the text printed by cxcalc
            n_acidic - the number of acidic pKa columns that were requested
            n_basic  - the number of basic pKa columns that were requested

        Returns:
            A pair (atom2pKa, smiles_list):
            - atom2pKa is a dictionary that maps the (0-based) atom index to
              a list of (pKa, 'acid' or 'base') tuples assigned to that atom
            - smiles_list is the list of columns that follow the atom list

        Raises:
            ChemAxonError - if there is no data line, or if the number of
                            columns does not match n_acidic + n_basic
    """
    atom2pKa = {}

    lines = s.split('\n')
    if len(lines) < 2:
        raise ChemAxonError('ChemAxon failed to find any pKas')

    # on Windows the lines end with '\r\n'; without stripping, the '\r'
    # would stay glued to the last column (the SMILES string)
    splitline = lines[1].rstrip('\r').split('\t')
    splitline.pop(0)  # the first column is the molecule id

    # defined up-front so that they exist even when no pKas were requested
    pKa_list = []
    acid_or_base_list = []

    if n_acidic + n_basic > 0:
        if len(splitline) != (n_acidic + n_basic + 2):
            raise ChemAxonError('ChemAxon failed to find any pKas')

        for i in range(n_acidic + n_basic):
            x = splitline.pop(0)
            if x == '':
                continue  # unused pKa slots are left empty by cxcalc

            pKa_list.append(float(x))
            # the acidic columns come first, followed by the basic ones
            acid_or_base_list.append('acid' if i < n_acidic else 'base')

    atom_list = splitline.pop(0)

    if atom_list:  # a comma separated list of the deprotonated atoms
        # cxcalc counts atoms from 1, we use 0-based indices
        atom_numbers = [int(y) - 1 for y in atom_list.split(',')]
        for i, j in enumerate(atom_numbers):
            atom2pKa.setdefault(j, []).append(
                (pKa_list[i], acid_or_base_list[i]))

    smiles_list = splitline
    return atom2pKa, smiles_list
92
+
93
+
94
def GetDissociationConstants_val(molstring, n_acidic=N_PKAS, n_basic=N_PKAS,
                                 pH=MID_PH):
    """
        Runs the cxcalc pKa / major-microspecies calculation for a molecule.

        Returns:
            A pair of (pKa list, column list)

            - the pKa list holds all pKa values in ascending order.
            - the column list is whatever ParsePkaOutput returns after the
              atom list (the major microspecies at the given pH).
    """
    cxcalc_args = []
    if n_acidic + n_basic > 0:
        cxcalc_args.extend(['pka', '-a', str(n_acidic), '-b', str(n_basic),
                            'majorms', '-M', 'true', '--pH', str(pH)])

    raw_output = RunCxcalc(molstring, cxcalc_args)
    atom2pKa, smiles_list = ParsePkaOutput(raw_output, n_acidic, n_basic)

    # flatten the per-atom (pKa, acid/base) pairs into one sorted list
    pKa_values = sorted(value
                        for entries in atom2pKa.values()
                        for value, _ in entries)
    return pKa_values, smiles_list
117
+
118
+
119
def GetDissociationConstants(molstring, n_acidic=N_PKAS, n_basic=N_PKAS,
                             pH=MID_PH):
    """
        Arguments:
            molstring - a text description of the molecule (SMILES or InChI)
            n_acidic  - the max no. of acidic pKas to calculate
            n_basic   - the max no. of basic pKas to calculate
            pH        - the pH for which the major pseudoisomer is calculated

        Returns a pair:
            (all_pKas, major_ms)

        - all_pKas is a list of floats (pKa values)
        - major_ms is the first entry of the column list returned by
          GetDissociationConstants_val (a SMILES string of the major
          pseudoisomer at the given pH)
    """
    pKa_values, species = GetDissociationConstants_val(molstring, n_acidic,
                                                       n_basic, pH)
    return pKa_values, species[0]
138
+
139
+
140
def GetFormulaAndCharge(molstring):
    """
        Arguments:
            molstring - a text description of the molecule (SMILES or InChI)

        Returns:
            a pair (formula, formal_charge); the charge is an int, and falls
            back to 0 when cxcalc prints something that is not an integer

        Raises:
            ChemAxonError - if the header line of the cxcalc output is not
                            the expected one
    """
    output = RunCxcalc(molstring, ['formula', 'formalcharge'])

    # the output is a tab separated table whose columns are:
    # id, Formula, Formal charge
    rows = csv.reader(io.StringIO(output), delimiter='\t')
    if next(rows) != ['id', 'Formula', 'Formal charge']:
        raise ChemAxonError(
            'cannot get the formula and charge for: ' + molstring)

    _, formula, charge_text = next(rows)
    try:
        charge = int(charge_text)
    except ValueError:
        charge = 0
    return formula, charge
166
+
167
+
168
def GetAtomBagAndCharge(molstring):
    """
        Arguments:
            molstring - a text description of the molecule (SMILES or InChI)

        Returns:
            a pair (atom_bag, formal_charge)

            - atom_bag maps each element symbol to the number of its atoms,
              plus the key 'e-' holding the number of electrons (sum of the
              atomic numbers minus the formal charge)
            - formal_charge is the value returned by GetFormulaAndCharge
    """
    formula, formal_charge = GetFormulaAndCharge(molstring)
    periodic_table = rdchem.GetPeriodicTable()

    atom_bag = {}
    # the formula can contain several '.'-separated parts, each optionally
    # prefixed by a multiplier (e.g. '2H2O')
    for mol_formula_times in formula.split('.'):
        # raw strings: '\d' and '\w' are invalid escapes in normal literals
        for times, mol_formula in re.findall(r'^(\d+)?(\w+)', mol_formula_times):
            multiplier = int(times) if times else 1
            for atom, count in re.findall(r'([A-Z][a-z]*)([0-9]*)', mol_formula):
                n_atoms = int(count) if count != '' else 1
                atom_bag[atom] = atom_bag.get(atom, 0) + n_atoms * multiplier

    # must be computed before 'e-' is added to the bag
    n_protons = sum(c * periodic_table.GetAtomicNumber(str(elem))
                    for (elem, c) in atom_bag.items())
    atom_bag['e-'] = n_protons - formal_charge

    return atom_bag, formal_charge
191
+
192
+
193
+ if __name__ == "__main__":
194
+ logging.getLogger().setLevel(logging.WARNING)
195
+ from molecule import Molecule
196
+ compound_list = [
197
+ ('D-Erythrulose', 'InChI=1S/C4H8O4/c5-1-3(7)4(8)2-6/h3,5-7H,1-2H2/t3-/m1/s1')]
198
+
199
+ for name, inchi in compound_list:
200
+ print("Formula: %s\nCharge: %d" % GetFormulaAndCharge(inchi))
201
+ diss_table, major_ms = GetDissociationConstants(inchi)
202
+ m = Molecule.FromSmiles(major_ms)
203
+ print("Name: %s\nInChI: %s\npKas: %s" %
204
+ (name, m.ToInChI(), str(diss_table)))
CC/compound.py ADDED
@@ -0,0 +1,337 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import urllib.request, urllib.parse, urllib.error, logging
2
+ from openbabel import openbabel
3
+ import chemaxon
4
+ import numpy as np
5
+ from thermodynamic_constants import R, debye_huckel
6
+ from scipy.special import logsumexp
7
+
8
+ MIN_PH = 0.0
9
+ MAX_PH = 14.0
10
+
11
+ class Compound(object):
12
+
13
def __init__(self, database, compound_id, inchi,
             atom_bag, pKas, smiles_pH7, majorMSpH7, nHs, zs):
    """
        Stores the given values as attributes of the same names.

        majorMSpH7 is used elsewhere in this class as an index into
        nHs and zs (which are parallel lists, one entry per microspecies).
    """
    self.database, self.compound_id, self.inchi = database, compound_id, inchi
    self.atom_bag, self.pKas = atom_bag, pKas
    self.smiles_pH7, self.majorMSpH7 = smiles_pH7, majorMSpH7
    self.nHs, self.zs = nHs, zs
24
+
25
@staticmethod
def from_kegg(compound_id):
    """
        Creates a Compound for a KEGG compound ID; the InChI is obtained
        through Compound.get_inchi and the database is set to 'KEGG'.
    """
    inchi = Compound.get_inchi(compound_id)
    return Compound.from_inchi('KEGG', compound_id, inchi)
29
+
30
+ @staticmethod
31
+ def from_inchi(database, compound_id, inchi):
32
+ if compound_id == 'C00080':
33
+ # We add an exception for H+ (and put nH = 0) in order to eliminate
34
+ # its effect of the Legendre transform
35
+ return Compound(database, compound_id, inchi,
36
+ {'H' : 1}, [], None, 0, [0], [0])
37
+ elif compound_id == 'C00087':
38
+ # ChemAxon gets confused with the structure of sulfur
39
+ # (returns a protonated form, [SH-], at pH 7).
40
+ # So we implement it manually here.
41
+ return Compound(database, compound_id, inchi,
42
+ {'S' : 1, 'e-': 16}, [], 'S', 0, [0], [0])
43
+ elif compound_id == 'C00237':
44
+ # ChemAxon gets confused with the structure of carbon monoxide
45
+ # (returns a protonated form, [CH]#[O+], at pH 7).
46
+ # So we implement it manually here.
47
+ return Compound(database, compound_id, inchi,
48
+ {'C' : 1, 'O': 1, 'e-': 14}, [], '[C-]#[O+]', 0, [0], [0])
49
+ elif compound_id == 'C00282':
50
+ # ChemAxon gets confused with the structure of hydrogen
51
+ # So we implement it manually here.
52
+ return Compound(database, compound_id, inchi,
53
+ {'H' : 2, 'e-': 2}, [], None, 0, [2], [0])
54
+ elif compound_id == 'C01353':
55
+ # When given the structure of carbonic acid, ChemAxon returns the
56
+ # pKas for CO2(tot), i.e. it assumes the non-hydrated CO2 species is
57
+ # one of the pseudoisomers, and the lower pKa value is 6.05 instead of
58
+ # 3.78. Here, we introduce a new "KEGG" compound that will represent
59
+ # pure bicarbonate (without CO2(sp)) and therefore plug in the pKa
60
+ # values from Alberty's book.
61
+ return Compound(database, compound_id, inchi,
62
+ {'C': 1, 'H': 1, 'O': 3, 'e-': 32}, [10.33, 3.43],
63
+ 'OC(=O)[O-]', 1, [0, 1, 2], [-2, -1, 0])
64
+ # Metal Cations get multiple pKa values from ChemAxon, which is
65
+ # obviously a bug. We override the important ones here:
66
+ elif compound_id == 'C00076': # Ca2+
67
+ return Compound(database, compound_id, inchi,
68
+ {'Ca' : 1, 'e-': 18}, [], '[Ca++]', 0, [0], [2])
69
+ elif compound_id == 'C00238': # K+
70
+ return Compound(database, compound_id, inchi,
71
+ {'K' : 1, 'e-': 18}, [], '[K+]', 0, [0], [1])
72
+ elif compound_id == 'C00305': # Mg2+
73
+ return Compound(database, compound_id, inchi,
74
+ {'Mg' : 1, 'e-': 10}, [], '[Mg++]', 0, [0], [2])
75
+ elif compound_id == 'C14818': # Fe2+
76
+ return Compound(database, compound_id, inchi,
77
+ {'Fe' : 1, 'e-': 24}, [], '[Fe++]', 0, [0], [2])
78
+ elif compound_id == 'C14819': # Fe3+
79
+ return Compound(database, compound_id, inchi,
80
+ {'Fe' : 1, 'e-': 23}, [], '[Fe+++]', 0, [0], [3])
81
+ elif compound_id == 'C00138': # ferredoxin(red)
82
+ return Compound(database, compound_id, inchi,
83
+ {'Fe' : 1, 'e-': 26}, [], None, 0, [0], [0])
84
+ elif compound_id == 'C00139': # ferredoxin(ox)
85
+ return Compound(database, compound_id, inchi,
86
+ {'Fe' : 1, 'e-': 25}, [], None, 0, [0], [1])
87
+ elif inchi is None:
88
+ # If the compound has no explicit structure, we assume that it has
89
+ # no proton dissociations in the relevant pH range
90
+ return Compound(database, compound_id, inchi,
91
+ {}, [], None, 0, [0], [0])
92
+
93
+ # Otherwise, we use ChemAxon's software to get the pKas and the
94
+ # properties of all microspecies
95
+
96
+ try:
97
+ pKas, major_ms_smiles = chemaxon.GetDissociationConstants(inchi)
98
+ major_ms_smiles = Compound.smiles2smiles(major_ms_smiles)
99
+ pKas = sorted([pka for pka in pKas if pka > MIN_PH and pka < MAX_PH], reverse=True)
100
+ except chemaxon.ChemAxonError:
101
+ logging.warning('chemaxon failed to find pKas for this molecule: ' + inchi)
102
+ # use the original InChI to get the parameters (i.e. assume it
103
+ # represents the major microspecies at pH 7)
104
+ major_ms_smiles = Compound.inchi2smiles(inchi)
105
+ pKas = []
106
+
107
+ if major_ms_smiles:
108
+ atom_bag, major_ms_charge = chemaxon.GetAtomBagAndCharge(major_ms_smiles)
109
+ major_ms_nH = atom_bag.get('H', 0)
110
+ else:
111
+ atom_bag = {}
112
+ major_ms_charge = 0
113
+ major_ms_nH = 0
114
+
115
+ n_species = len(pKas) + 1
116
+ if pKas == []:
117
+ majorMSpH7 = 0
118
+ else:
119
+ majorMSpH7 = len([1 for pka in pKas if pka > 7])
120
+
121
+ nHs = []
122
+ zs = []
123
+
124
+ for i in range(n_species):
125
+ zs.append((i - majorMSpH7) + major_ms_charge)
126
+ nHs.append((i - majorMSpH7) + major_ms_nH)
127
+
128
+ return Compound(database, compound_id, inchi,
129
+ atom_bag, pKas, major_ms_smiles, majorMSpH7, nHs, zs)
130
+
131
def to_json_dict(self):
    """
        Returns a dictionary with all the attributes of this Compound,
        keyed by attribute name (the counterpart of from_json_dict).
    """
    field_names = ('database', 'compound_id', 'inchi', 'atom_bag', 'pKas',
                   'smiles_pH7', 'majorMSpH7', 'nHs', 'zs')
    return {name: getattr(self, name) for name in field_names}
141
+
142
+ @staticmethod
143
+ def from_json_dict(d):
144
+ return Compound(d['database'], d['compound_id'], d['inchi'], d['atom_bag'],
145
+ d['pKas'], d['smiles_pH7'], d['majorMSpH7'],
146
+ d['nHs'], d['zs'])
147
+
148
@staticmethod
def get_inchi(compound_id):
    """
        Downloads the MOL file of a compound from the KEGG REST service
        and converts it to InChI using Compound.mol2inchi.

        Arguments:
            compound_id - the KEGG compound ID (inserted into the URL)

        Returns:
            whatever Compound.mol2inchi returns for the downloaded text
    """
    url = 'http://rest.kegg.jp/get/cpd:%s/mol' % compound_id
    # the response is closed as soon as the body has been read
    with urllib.request.urlopen(url) as response:
        # read() returns bytes; decode them, since passing bytes through
        # str() would produce the "b'...'" representation instead of the text
        s_mol = response.read().decode('utf-8')
    return Compound.mol2inchi(s_mol)
152
+
153
@staticmethod
def mol2inchi(s):
    """
        Converts a MOL file to InChI using OpenBabel.

        Arguments:
            s - the contents of the MOL file (str, or UTF-8 encoded bytes)

        Returns:
            the InChI string, or None if the input cannot be read or the
            conversion produces an empty string
    """
    if isinstance(s, bytes):
        # str(bytes) would give "b'...'" rather than the file contents
        s = s.decode('utf-8')

    openbabel.obErrorLog.SetOutputLevel(-1)

    conv = openbabel.OBConversion()
    conv.SetInAndOutFormats('mol', 'inchi')
    conv.AddOption("F", conv.OUTOPTIONS)
    conv.AddOption("T", conv.OUTOPTIONS)
    conv.AddOption("x", conv.OUTOPTIONS, "noiso")
    conv.AddOption("w", conv.OUTOPTIONS)
    obmol = openbabel.OBMol()
    if not conv.ReadString(obmol, str(s)):
        return None
    inchi = conv.WriteString(obmol, True) # second argument is trimWhitespace
    return inchi if inchi != '' else None
171
+
172
@staticmethod
def inchi2smiles(inchi):
    """
        Converts an InChI to SMILES using OpenBabel (no output options set).

        Returns:
            the SMILES string, or None if the conversion result is empty
    """
    openbabel.obErrorLog.SetOutputLevel(-1)

    converter = openbabel.OBConversion()
    converter.SetInAndOutFormats('inchi', 'smiles')
    molecule = openbabel.OBMol()
    converter.ReadString(molecule, str(inchi))
    result = converter.WriteString(molecule, True) # second argument is trimWhitespace
    return result if result != '' else None
189
+
190
@staticmethod
def smiles2smiles(smiles_in):
    """
        Passes a SMILES string through OpenBabel (read as SMILES, written
        back as SMILES, no output options set).

        Returns:
            the SMILES written by OpenBabel, or None if it is empty
    """
    openbabel.obErrorLog.SetOutputLevel(-1)

    converter = openbabel.OBConversion()
    converter.SetInAndOutFormats('smiles', 'smiles')
    molecule = openbabel.OBMol()
    converter.ReadString(molecule, str(smiles_in))
    result = converter.WriteString(molecule, True) # second argument is trimWhitespace
    return result if result != '' else None
207
@staticmethod
def smiles2inchi(smiles):
    """
        Converts a SMILES string to InChI using OpenBabel, with the same
        output options as mol2inchi.

        Returns:
            the InChI string, or None if the conversion result is empty
    """
    openbabel.obErrorLog.SetOutputLevel(-1)

    converter = openbabel.OBConversion()
    converter.SetInAndOutFormats('smiles', 'inchi')
    for option in ("F", "T"):
        converter.AddOption(option, converter.OUTOPTIONS)
    converter.AddOption("x", converter.OUTOPTIONS, "noiso")
    converter.AddOption("w", converter.OUTOPTIONS)
    molecule = openbabel.OBMol()
    converter.ReadString(molecule, str(smiles))
    result = converter.WriteString(molecule, True) # second argument is trimWhitespace
    return result if result != '' else None
224
+
225
def __str__(self):
    """
        Multi-line summary: compound ID, InChI, the pKas (two decimals)
        and the nH / charge of the major microspecies at pH 7.
    """
    major = self.majorMSpH7
    pKa_text = ', '.join('%.2f' % p for p in self.pKas)
    return "%s\nInChI: %s\npKas: %s\nmajor MS: nH = %d, charge = %d" % \
        (self.compound_id, self.inchi, pKa_text,
         self.nHs[major], self.zs[major])
229
+
230
def _dG0_prime_vector(self, pH, I, T):
    """
        Calculates, for every microspecies, the difference in kJ/mol
        between its transformed energy and the dG0 of the MS with the
        least hydrogens (dG0[0]).

        Returns:
            0 if the compound has no InChI, otherwise an array with one
            value per microspecies
    """
    if self.inchi is None:
        return 0

    if self.pKas == []:
        dG0s = np.zeros((1, 1))
    else:
        dG0s = -np.cumsum([0] + self.pKas) * R * T * np.log(10)
    DH = debye_huckel((I, T))

    # one row per microspecies, columns: dG0, nH, charge
    table = np.vstack([dG0s, np.array(self.nHs), np.array(self.zs)]).T
    dG0_column, nH_column, z_column = table[:, 0], table[:, 1], table[:, 2]

    # dG0' = dG0 + nH * (R T ln(10) pH + DH) - charge^2 * DH
    return dG0_column + \
        nH_column * (R * T * np.log(10) * pH + DH) - \
        z_column**2 * DH
253
+
254
def _transform(self, pH, I, T):
    """
        Combines the values of _dG0_prime_vector into a single number:
        -R T * logsumexp(vector / (-R T)).
    """
    scaled = self._dG0_prime_vector(pH, I, T) / (-R * T)
    return -R * T * logsumexp(scaled)
257
+
258
def _ddG(self, i_from, i_to, T):
    """
        Calculates the difference in kJ/mol between two MSs.

        Returns:
            dG0[i_to] - dG0[i_from]

        Raises:
            ValueError - if one of the indices is not in 0..len(pKas)
    """
    n_pKas = len(self.pKas)
    for index in (i_from, i_to):
        if not (0 <= index <= n_pKas):
            raise ValueError('MS index is out of bounds: 0 <= %d <= %d' % (index, n_pKas))

    if i_from == i_to:
        return 0
    if i_from < i_to:
        return sum(self.pKas[i_from:i_to]) * R * T * np.log(10)
    # going in the opposite direction only flips the sign
    return -sum(self.pKas[i_to:i_from]) * R * T * np.log(10)
277
+
278
def transform(self, i, pH, I, T):
    """
        Returns the difference in kJ/mol between dG'0 and the dG0 of the
        MS with index 'i'.

        Returns:
            (dG'0 - dG0[0]) + (dG0[0] - dG0[i])  =  dG'0 - dG0[i]
    """
    relative_to_first = self._transform(pH, I, T)
    first_to_i = self._ddG(0, i, T)
    return relative_to_first + first_to_i
287
+
288
def transform_pH7(self, pH, I, T):
    """
        Returns the transform for the major MS in pH 7
        (i.e. transform() with the index stored in majorMSpH7)
    """
    major_index = self.majorMSpH7
    return self.transform(major_index, pH, I, T)
293
+
294
def transform_neutral(self, pH, I, T):
    """
        Returns the transform for the MS with no charge

        Raises:
            ValueError - if none of the microspecies has a charge of 0
    """
    # only the lookup is guarded, so that errors raised inside
    # transform() are not mistaken for a missing neutral microspecies
    try:
        neutral_index = self.zs.index(0)
    except ValueError:
        raise ValueError("The compound (%s) does not have a microspecies with 0 charge"
                         % self.compound_id)
    # transform() expects the microspecies index as its FIRST argument
    return self.transform(neutral_index, pH, I, T)
303
+
304
def get_species(self, major_ms_dG0_f, T):
    """
        Given the chemical formation energy of the major microspecies,
        uses the pKa values to calculate the chemical formation energies
        of all other species, and yields one dictionary per species with
        all the relevant data: phase, dG0_f (rounded to 2 decimals), nH,
        z (charge) and nMg (always 0).
    """
    for index, (n_hydrogens, charge) in enumerate(zip(self.nHs, self.zs)):
        energy = major_ms_dG0_f + self._ddG(index, self.majorMSpH7, T)
        yield dict(phase='aqueous', dG0_f=np.round(energy, 2),
                   nH=n_hydrogens, z=charge, nMg=0)
316
+
317
+ if __name__ == '__main__':
318
+ import sys, json
319
+ logger = logging.getLogger('')
320
+ logger.setLevel(logging.DEBUG)
321
+ from compound_cacher import CompoundCacher, CompoundEncoder
322
+ from molecule import Molecule, OpenBabelError
323
+ ccache = CompoundCacher(cache_fname=None)
324
+
325
+ for compound_id in ['C00087', 'C00282', 'C00237']:
326
+ comp = Compound.from_kegg(compound_id)
327
+ try:
328
+ mol = Molecule.FromInChI(str(comp.inchi))
329
+ sys.stderr.write('%s : formula = %s, nE = %s' %
330
+ (str(comp.inchi), mol.GetFormula(), mol.GetNumElectrons()))
331
+ except OpenBabelError:
332
+ pass
333
+ ccache.add(comp)
334
+ sys.stderr.write('\ncompound id = %s, nH = %s, z = %s, pKa = %s, bag = %s\n\n\n' %
335
+ (compound_id, str(comp.nHs), str(comp.zs), str(comp.pKas), str(comp.atom_bag)))
336
+
337
+ ccache.dump()
CC/compound_cacher.py ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json, os, logging, csv, gzip, numpy, pdb
2
+ from compound import Compound
3
+ base_path = os.path.split(os.path.realpath(__file__))[0]
4
+
5
+ ### Input Files:
6
+ # original version of the KEGG compound file
7
+ OLD_COMPOUND_JSON_FNAME = os.path.join(base_path, './data_cc/equilibrator_compounds.json.gz')
8
+
9
+ # a CSV file with additional names and InChIs (mostly compounds missing from KEGG
10
+ # and added manually)
11
+ KEGG_ADDITIONS_TSV_FNAME = os.path.join(base_path, './data_cc/kegg_additions.tsv')
12
+
13
+ ### Files created by this module:
14
+ # names and InChIs only
15
+ KEGG_COMPOUND_JSON_FNAME = os.path.join(base_path, './data_cc/kegg_compounds.json.gz')
16
+
17
+ # names, InChIs and pKa data
18
+ DEFAULT_CACHE_FNAME = os.path.join(base_path, './data_cc/compounds.json.gz')
19
+
20
+
21
class CompoundEncoder(json.JSONEncoder):
    """JSON encoder that serialises Compound objects via to_json_dict()."""

    def default(self, obj):
        # everything that is not a Compound is left to the base class
        if not isinstance(obj, Compound):
            return json.JSONEncoder.default(self, obj)
        return obj.to_json_dict()
26
+
27
class Singleton(type):
    """
    Metaclass that creates one instance per class: the first call
    constructs the instance (stored in the 'instance' class attribute),
    every later call returns that same object and ignores its arguments.
    """

    def __init__(cls, name, bases, dic):
        super(Singleton, cls).__init__(name, bases, dic)
        cls.instance = None

    def __call__(cls, *args, **kw):
        if cls.instance is not None:
            return cls.instance
        cls.instance = super(Singleton, cls).__call__(*args, **kw)
        return cls.instance
35
+
36
+ class CompoundCacher(object, metaclass=Singleton):
37
+ """
38
+ CompoundCacher is a singleton that handles caching of Compound objects
39
+ for the component-contribution package. The Compounds are retrieved by
40
+ their ID (which is the KEGG ID in most cases).
41
+ The first time a Compound is requested, it is obtained from the relevant
42
+ database and a Compound object is created (this takes a while because
43
+ it usually involves internet communication and then invoking the ChemAxon
44
+ plugin for calculating the pKa values for that structure).
45
+ Any further request for the same Compound ID will draw the object from
46
+ the cache. When the method dump() is called, all cached data is written
47
+ to a file that will be loaded in future python sessions.
48
+ """
49
+
50
def __init__(self, cache_fname=None):
    """
    Reads the compound-ID -> InChI table from KEGG_COMPOUND_JSON_FNAME
    and then loads the cache file.

    Arguments:
        cache_fname - path of the gzipped JSON cache file; when None,
                      DEFAULT_CACHE_FNAME is used
    """
    self.cache_fname = cache_fname
    if self.cache_fname is None:
        self.cache_fname = DEFAULT_CACHE_FNAME

    # the 'with' block makes sure the gzip file is closed after reading
    with gzip.open(KEGG_COMPOUND_JSON_FNAME, 'r') as fp:
        compounds = json.load(fp)
    self.compound_id2inchi = { d['compound_id']: d['inchi']
                               for d in compounds }
    self.need_to_update_cache_file = False
    self.load()
60
+
61
def get_all_compound_ids(self):
    """Returns a sorted list of all compound IDs that have an InChI entry."""
    return sorted(self.compound_id2inchi)
63
+
64
def load(self):
    """
    Resets 'compound_dict' and 'compound_ids' and, if the cache file
    exists, fills them with the Compounds stored in it.
    """
    # parse the JSON cache file and store in a dictionary 'compound_dict'
    self.compound_dict = {}
    self.compound_ids = []
    if not os.path.exists(self.cache_fname):
        return

    # the 'with' block makes sure the gzip file is closed after reading
    with gzip.open(self.cache_fname, 'r') as fp:
        cached = json.load(fp)
    for d in cached:
        self.compound_ids.append(d['compound_id'])
        self.compound_dict[d['compound_id']] = Compound.from_json_dict(d)
72
+
73
def dump(self):
    """
    Writes all cached Compounds (sorted by compound ID) to the gzipped
    JSON cache file. Does nothing unless the cache has been changed since
    it was last written.
    """
    if not self.need_to_update_cache_file:
        return

    # serialise first, so that a failure here does not truncate the file
    data = sorted(self.compound_dict.values(),
                  key=lambda d: d.compound_id)
    dict_data = [x.to_json_dict() for x in data]

    # json.dump writes text, so the gzip file must be opened in text mode
    # ('w' alone is binary and makes json.dump fail on Python 3)
    with gzip.open(self.cache_fname, 'wt') as fp:
        json.dump(dict_data, fp, cls=CompoundEncoder,
                  sort_keys=True, indent=4, separators=(',', ': '))
    self.need_to_update_cache_file = False
83
+
84
def get_compound(self, compound_id, kegg_additions_cids=None):
    """
    Returns the Compound with the given ID, creating (and caching) it
    from its InChI when necessary.

    Arguments:
        compound_id         - the ID of the requested compound
        kegg_additions_cids - optional collection of compound IDs whose
                              cached entry must be rebuilt

    Raises:
        KeyError - if a Compound has to be created and the ID has no
                   entry in the InChI table
    """
    cached = compound_id in self.compound_dict
    # if a compound id is in the kegg_additions.tsv
    # remove the one in cache, and replace it with new one
    stale = (cached and kegg_additions_cids is not None
             and compound_id in kegg_additions_cids)

    # exactly one of the three messages is logged per call
    if not cached:
        logging.debug('Cache miss: %s' % str(compound_id))
    elif stale:
        self.remove(compound_id)
        logging.debug('Cache update: %s' % str(compound_id))
    else:
        logging.debug('Cache hit: %s' % str(compound_id))

    if not cached or stale:
        inchi = self.compound_id2inchi[compound_id]
        comp = Compound.from_inchi('KEGG', compound_id, inchi)
        self.add(comp)

    return self.compound_dict[compound_id]
104
+
105
def remove(self, compound_id):
    """
    Deletes a Compound from the in-memory cache; if the ID is not cached
    only a debug message is logged.
    """
    if compound_id not in self.compound_dict:
        logging.debug('%s is not cached, cannot remove it' % str(compound_id))
        return
    del self.compound_dict[compound_id]
110
+
111
def add(self, comp):
    """
    Stores a Compound in the in-memory cache under its compound_id and
    marks the cache file as out of date.
    """
    self.compound_dict.update({comp.compound_id: comp})
    self.need_to_update_cache_file = True
114
+
115
def get_element_matrix(self, compound_ids):
    """
    Builds the elemental matrix of a list of compounds.

    Arguments:
        compound_ids - a list of compound IDs, or a single ID given as a
                       string (including str subclasses)

    Returns:
        a pair (elements, Ematrix)

        - elements is the sorted list of all keys found in the atom bags,
          except for 'H'
        - Ematrix is a numpy matrix where each row is a compound and each
          column is an element (or e-); the row of a compound whose
          atom bag is None is filled with NaN
    """
    # isinstance() also accepts subclasses of str, which would otherwise
    # be iterated character by character
    if isinstance(compound_ids, str):
        compound_ids = [compound_ids]

    # gather the "atom bags" of all compounds in a list 'atom_bag_list'
    atom_bag_list = [self.get_compound(compound_id).atom_bag
                     for compound_id in compound_ids]
    elements = set()
    for atom_bag in atom_bag_list:
        if atom_bag is not None:
            elements.update(atom_bag.keys())
    elements.discard('H') # don't balance H (it's enough to balance e-)
    elements = sorted(elements)

    # create the elemental matrix, where each row is a compound and each
    # column is an element (or e-)
    Ematrix = numpy.matrix(numpy.zeros((len(atom_bag_list), len(elements))))
    for i, atom_bag in enumerate(atom_bag_list):
        if atom_bag is None:
            Ematrix[i, :] = numpy.nan
        else:
            for j, elem in enumerate(elements):
                Ematrix[i, j] = atom_bag.get(elem, 0)
    return elements, Ematrix
140
+
141
+ ###############################################################################
142
+
143
@staticmethod
def RebuildCompoundJSON():
    """
    Rebuilds KEGG_COMPOUND_JSON_FNAME (names and InChIs only) from the
    original compound file and the additions TSV file.

    Returns:
        the list of compound IDs that were read from the additions file
    """
    kegg_dict = {}
    with gzip.open(OLD_COMPOUND_JSON_FNAME, 'r') as old_json:
        old_compounds = json.load(old_json)
    for d in old_compounds:
        cid = d['CID']
        kegg_dict[cid] = {'compound_id': cid,
                          'name': d['name'],
                          'names': d['names'],
                          'inchi': d['InChI']}

    # override some of the compounds or add new ones with 'fake' IDs,
    # i.e. C80000 or higher.
    kegg_additions_cids = []
    with open(KEGG_ADDITIONS_TSV_FNAME, 'r') as additions_tsv:
        for d in csv.DictReader(additions_tsv, delimiter='\t'):
            cid = 'C%05d' % int(d['cid'])
            kegg_additions_cids.append(cid)
            kegg_dict[cid] = {'compound_id': cid,
                              'name': d['name'],
                              'names': [d['name']],
                              'inchi': d['inchi']}

    compound_json = [kegg_dict[compound_id] for compound_id in sorted(kegg_dict.keys())]

    # json.dump writes text, so the gzip file must be opened in text mode
    # ('w' alone is binary and makes json.dump fail on Python 3)
    with gzip.open(KEGG_COMPOUND_JSON_FNAME, 'wt') as new_json:
        json.dump(compound_json, new_json, sort_keys=True, indent=4)
    return kegg_additions_cids
172
+
173
+ ###############################################################################
174
+
175
@staticmethod
def BuildCache(start_from_scratch=False, kegg_additions_cids=None):
    """
    Requests every known compound from the cache (which creates the
    missing ones) and writes the cache file, with an intermediate dump
    after every 100 compounds.

    Arguments:
        start_from_scratch  - if True, an existing default cache file is
                              deleted first
        kegg_additions_cids - passed on to get_compound for every compound
    """
    if start_from_scratch and os.path.exists(DEFAULT_CACHE_FNAME):
        os.remove(DEFAULT_CACHE_FNAME)

    ccache = CompoundCacher(cache_fname=DEFAULT_CACHE_FNAME)

    for n_done, compound_id in enumerate(ccache.get_all_compound_ids(), start=1):
        logging.debug('Caching %s' % compound_id)
        comp = ccache.get_compound(compound_id, kegg_additions_cids=kegg_additions_cids)
        logging.debug(str(comp))
        if n_done % 100 == 0:
            logging.debug('Dumping Cache ...')
            ccache.dump()

    ccache.dump()
193
+
194
+ ###############################################################################
195
+
196
+ if __name__ == '__main__':
197
+ logger = logging.getLogger('')
198
+ #logger.setLevel(logging.WARNING)
199
+ logger.setLevel(logging.DEBUG)
200
+
201
+ kegg_additions_cids = CompoundCacher.RebuildCompoundJSON()
202
+ CompoundCacher.BuildCache(start_from_scratch=False, kegg_additions_cids=kegg_additions_cids)
CC/molecule.py ADDED
@@ -0,0 +1,292 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from openbabel import openbabel
2
+ import types
3
+ import re
4
+ import chemaxon
5
+ from thermodynamic_constants import default_T, default_pH
6
+ import pdb
7
+
8
class OpenBabelError(Exception):
    """Raised when OpenBabel cannot read or convert a molecule."""
10
+
11
+ class Molecule(object):
12
+
13
+ # for more rendering options visit:
14
+ # http://www.ggasoftware.com/opensource/indigo/api/options#rendering
15
+ _obElements = openbabel.OBElementTable()
16
+ _obSmarts = openbabel.OBSmartsPattern()
17
+
18
+ @staticmethod
19
+ def GetNumberOfElements():
20
+ return Molecule._obElements.GetNumberOfElements()
21
+
22
+ @staticmethod
23
+ def GetAllElements():
24
+ return [Molecule._obElements.GetSymbol(i) for i in
25
+ range(Molecule.GetNumberOfElements())]
26
+
27
+ @staticmethod
28
+ def GetSymbol(atomic_num):
29
+ return Molecule._obElements.GetSymbol(atomic_num)
30
+
31
+ @staticmethod
32
+ def GetAtomicNum(elem):
33
+ if type(elem) == str:
34
+ elem = str(elem)
35
+ return Molecule._obElements.GetAtomicNum(elem)
36
+
37
+ @staticmethod
38
+ def VerifySmarts(smarts):
39
+ return Molecule._obSmarts.Init(smarts)
40
+
41
+ def __init__(self):
42
+ self.title = None
43
+ self.obmol = openbabel.OBMol()
44
+ self.smiles = None
45
+ self.inchi = None
46
+
47
+ def __str__(self):
48
+ return self.title or self.smiles or self.inchi or ""
49
+
50
+ def __len__(self):
51
+ return self.GetNumAtoms()
52
+
53
+ def Clone(self):
54
+ tmp = Molecule()
55
+ tmp.title = self.title
56
+ tmp.obmol = openbabel.OBMol(self.obmol)
57
+ tmp.smiles = self.smiles
58
+ tmp.inchi = self.inchi
59
+ return tmp
60
+
61
+ def SetTitle(self, title):
62
+ self.title = title
63
+
64
+ @staticmethod
65
+ def FromSmiles(smiles):
66
+ m = Molecule()
67
+ m.smiles = smiles
68
+ obConversion = openbabel.OBConversion()
69
+ obConversion.AddOption("w", obConversion.OUTOPTIONS)
70
+ obConversion.SetInFormat("smiles")
71
+ if not obConversion.ReadString(m.obmol, m.smiles):
72
+ raise OpenBabelError("Cannot read the SMILES string: " + smiles)
73
+ try:
74
+ m.UpdateSmiles()
75
+ #m.UpdateInChI()
76
+ except OpenBabelError:
77
+ raise OpenBabelError("Failed to create Molecule from SMILES: " + smiles)
78
+ m.SetTitle(smiles)
79
+ return m
80
+
81
+ @staticmethod
82
+ def FromInChI(inchi):
83
+ m = Molecule()
84
+ m.inchi = inchi
85
+ obConversion = openbabel.OBConversion()
86
+ obConversion.AddOption("w", obConversion.OUTOPTIONS)
87
+ obConversion.SetInFormat("inchi")
88
+ obConversion.ReadString(m.obmol, m.inchi)
89
+ try:
90
+ m.UpdateInChI()
91
+ #m.UpdateSmiles()
92
+ except OpenBabelError:
93
+ raise OpenBabelError("Failed to create Molecule from InChI: " + inchi)
94
+ m.SetTitle(inchi)
95
+ return m
96
+
97
@staticmethod
def FromMol(mol):
    """
    Build a Molecule by parsing the text of a MOL file with Open Babel.

    Args:
        mol: the MOL file contents, as a string.

    Returns:
        A Molecule with an empty title and with both InChI and SMILES
        computed from the parsed OBMol.

    Raises:
        OpenBabelError: if the text cannot be read, or if the InChI or
            SMILES cannot be generated from the parsed molecule.
    """
    m = Molecule()
    obConversion = openbabel.OBConversion()
    obConversion.AddOption("w", obConversion.OUTOPTIONS)
    obConversion.SetInFormat("mol")
    # Fail fast on unparsable input (as FromSmiles does) instead of
    # carrying on with whatever ended up in the OBMol.
    if not obConversion.ReadString(m.obmol, mol):
        raise OpenBabelError("Cannot read the MOL file:\n" + mol)
    try:
        m.UpdateInChI()
        m.UpdateSmiles()
    except OpenBabelError:
        raise OpenBabelError("Failed to create Molecule from MOL file:\n" + mol)
    m.SetTitle("")
    return m
111
+
112
@staticmethod
def FromOBMol(obmol):
    """
    Wrap an existing OBMol object in a Molecule.

    The OBMol is stored as given (it is not copied); InChI and SMILES
    are computed from it and the title is set to the empty string.

    Args:
        obmol: the Open Babel molecule object to wrap.

    Returns:
        The new Molecule.

    Raises:
        OpenBabelError: if the InChI or SMILES cannot be generated.
    """
    wrapped = Molecule()
    wrapped.obmol = obmol
    try:
        wrapped.UpdateInChI()
        wrapped.UpdateSmiles()
    except OpenBabelError:
        raise OpenBabelError("Failed to create Molecule from OBMol")
    wrapped.SetTitle("")
    return wrapped
123
+
124
@staticmethod
def _FromFormat(s, fmt='inchi'):
    """
    Dispatch to the Molecule constructor matching a format name.

    Args:
        s: the molecule description (a string, or an OBMol for 'obmol').
        fmt: one of 'smiles', 'smi', 'inchi', 'mol' or 'obmol'.

    Returns:
        The constructed Molecule, or None when fmt is not one of the
        recognized names.
    """
    if fmt in ('smiles', 'smi'):
        builder = Molecule.FromSmiles
    elif fmt == 'inchi':
        builder = Molecule.FromInChI
    elif fmt == 'mol':
        builder = Molecule.FromMol
    elif fmt == 'obmol':
        builder = Molecule.FromOBMol
    else:
        # Unknown format names fall through without raising.
        return None
    return builder(s)
134
+
135
@staticmethod
def _ToFormat(obmol, fmt='inchi'):
    """
    Serialize an OBMol to text in the requested Open Babel format.

    Args:
        obmol: the Open Babel molecule object to write.
        fmt: the output format name passed to SetOutFormat().

    Returns:
        For 'smiles'/'smi', the first whitespace-separated token of the
        output; for 'inchi', the output stripped of surrounding
        whitespace; for any other format, the raw output string.

    Raises:
        OpenBabelError: if the conversion yields an empty result.
    """
    writer = openbabel.OBConversion()
    writer.AddOption("w", writer.OUTOPTIONS)
    writer.SetOutFormat(fmt)
    output = writer.WriteString(obmol)
    if not output:
        raise OpenBabelError("Cannot convert OBMol to %s" % fmt)

    if fmt in ('smiles', 'smi'):
        # Keep only the first token of the written line.
        fields = output.split()
        if not fields:
            raise OpenBabelError("Cannot convert OBMol to %s" % fmt)
        return fields[0]
    if fmt == 'inchi':
        return output.strip()
    return output
158
+
159
@staticmethod
def Smiles2InChI(smiles):
    """
    Convert a SMILES string directly to an InChI string.

    Args:
        smiles: the SMILES string to convert.

    Returns:
        The InChI written by Open Babel, stripped of surrounding
        whitespace.

    Raises:
        OpenBabelError: if the SMILES string cannot be read.
    """
    converter = openbabel.OBConversion()
    converter.AddOption("w", converter.OUTOPTIONS)
    converter.SetInAndOutFormats("smiles", "inchi")
    parsed = openbabel.OBMol()
    read_ok = converter.ReadString(parsed, smiles)
    if not read_ok:
        raise OpenBabelError("Cannot read the SMILES string: " + smiles)
    inchi_text = converter.WriteString(parsed)
    return inchi_text.strip()
168
+
169
@staticmethod
def InChI2Smiles(inchi):
    """
    Convert an InChI string directly to a SMILES string.

    Args:
        inchi: the InChI string to convert.

    Returns:
        The first whitespace-separated token of the SMILES output
        written by Open Babel.

    Raises:
        OpenBabelError: if the InChI string cannot be read.
    """
    converter = openbabel.OBConversion()
    converter.AddOption("w", converter.OUTOPTIONS)
    converter.SetInAndOutFormats("inchi", "smiles")
    parsed = openbabel.OBMol()
    read_ok = converter.ReadString(parsed, inchi)
    if not read_ok:
        raise OpenBabelError("Cannot read the InChI string: " + inchi)
    smiles_fields = converter.WriteString(parsed).split()
    return smiles_fields[0]
178
+
179
def RemoveHydrogens(self):
    """Delete the hydrogens of the underlying OBMol in place (DeleteHydrogens)."""
    # NOTE(review): unlike RemoveAtoms and SetAtomicNum, the cached SMILES
    # and InChI are left untouched here -- confirm that this is intended.
    molecule = self.obmol
    molecule.DeleteHydrogens()
181
+
182
def RemoveAtoms(self, indices):
    """
    Delete the atoms at the given 0-based indices from the OBMol.

    The cached SMILES and InChI are reset so they are recomputed on the
    next ToSmiles()/ToInChI() call.

    Args:
        indices: an iterable of 0-based atom indices; repeated values
            are treated as a single request to delete that atom.
    """
    self.obmol.BeginModify()
    # Delete from the highest index down so that pending indices are not
    # shifted by earlier deletions. De-duplicate first: a repeated index
    # would otherwise hit whichever atom moved into the freed slot.
    for i in sorted(set(indices), reverse=True):
        # GetAtom() is addressed 1-based here, hence the +1.
        self.obmol.DeleteAtom(self.obmol.GetAtom(i+1))
    self.obmol.EndModify()
    self.smiles = None
    self.inchi = None
189
+
190
def SetAtomicNum(self, index, new_atomic_num):
    """
    Change the atomic number of one atom and reset the cached identifiers.

    Args:
        index: 0-based index of the atom (GetAtom() is called with index+1).
        new_atomic_num: the atomic number to assign.
    """
    target_atom = self.obmol.GetAtom(index + 1)
    target_atom.SetAtomicNum(new_atomic_num)
    # The cached SMILES/InChI are cleared so they get recomputed on demand.
    self.smiles = self.inchi = None
194
+
195
def ToOBMol(self):
    """Return the underlying OBMol object itself (not a copy)."""
    underlying = self.obmol
    return underlying
197
+
198
def ToFormat(self, fmt='inchi'):
    """
    Serialize this molecule's OBMol via Molecule._ToFormat.

    Args:
        fmt: the output format name (defaults to 'inchi').

    Returns:
        The text produced by Molecule._ToFormat for this molecule.
    """
    serialized = Molecule._ToFormat(self.obmol, fmt=fmt)
    return serialized
200
+
201
def ToMolfile(self):
    """Return this molecule serialized in the 'mol' format (via ToFormat)."""
    molfile_text = self.ToFormat('mol')
    return molfile_text
203
+
204
def UpdateInChI(self):
    """Recompute the InChI from the current OBMol and cache it on self.inchi."""
    fresh_inchi = Molecule._ToFormat(self.obmol, 'inchi')
    self.inchi = fresh_inchi
206
+
207
def ToInChI(self):
    """
    Return the InChI identifier, computing it only when it is not cached.

    The value is generated by UpdateInChI() the first time it is needed
    and then reused on later calls.
    """
    if self.inchi:
        return self.inchi
    self.UpdateInChI()
    return self.inchi
215
+
216
def UpdateSmiles(self):
    """Recompute the SMILES from the current OBMol and cache it on self.smiles."""
    fresh_smiles = Molecule._ToFormat(self.obmol, 'smiles')
    self.smiles = fresh_smiles
218
+
219
def ToSmiles(self):
    """
    Return the SMILES identifier, computing it only when it is not cached.

    The value is generated by UpdateSmiles() the first time it is needed
    and then reused on later calls.
    """
    if self.smiles:
        return self.smiles
    self.UpdateSmiles()
    return self.smiles
227
+
228
def GetFormula(self):
    """
    Extract the chemical formula layer from this molecule's InChI.

    Returns:
        The text between the "InChI=1/" (or "InChI=1S/") prefix and the
        next character that is not a letter, digit or dot; '' when the
        InChI does not contain such a prefix.

    Raises:
        ValueError: if the prefix occurs more than once in the InChI.
    """
    inchi = self.ToInChI()
    # Raw string: in a plain literal '\.' is an invalid escape sequence
    # (a SyntaxWarning on recent Python versions).
    tokens = re.findall(r'InChI=1S?/([0-9A-Za-z\.]+)', inchi)
    if len(tokens) > 1:
        raise ValueError('Bad InChI: ' + inchi)
    return tokens[0] if tokens else ''
236
+
237
def GetExactMass(self):
    """Return the exact mass reported by the underlying OBMol."""
    exact_mass = self.obmol.GetExactMass()
    return exact_mass
239
+
240
def GetAtomBagAndCharge(self):
    """
    Look up the atom bag and charge for this molecule's InChI.

    Returns:
        The (atom_bag, major_ms_charge) pair produced by
        chemaxon.GetAtomBagAndCharge() for the InChI from ToInChI().
    """
    bag, charge = chemaxon.GetAtomBagAndCharge(self.ToInChI())
    return bag, charge
244
+
245
def GetHydrogensAndCharge(self):
    """
    Return the hydrogen count and the charge of the molecule.

    Returns:
        A tuple (n_hydrogens, charge): the 'H' entry of the atom bag
        (0 when absent) and the charge, both from GetAtomBagAndCharge().
    """
    bag, net_charge = self.GetAtomBagAndCharge()
    hydrogen_count = bag.get('H', 0)
    return hydrogen_count, net_charge
248
+
249
def GetNumElectrons(self):
    """Return the 'e-' entry of the molecule's atom bag (0 when absent)."""
    # Only the atom bag is needed; the accompanying charge is ignored.
    bag, _ = self.GetAtomBagAndCharge()
    electron_count = bag.get('e-', 0)
    return electron_count
253
+
254
def GetNumAtoms(self):
    """Return the atom count reported by the underlying OBMol."""
    atom_count = self.obmol.NumAtoms()
    return atom_count
256
+
257
def GetAtoms(self):
    """Return the OBMol's atoms as a list, in index order."""
    last_index = self.obmol.NumAtoms()
    # GetAtom() is addressed from 1 up to and including NumAtoms().
    return [self.obmol.GetAtom(position) for position in range(1, last_index + 1)]
259
+
260
def FindSmarts(self, smarts):
    """
    Find the matches of a SMARTS pattern in this molecule.

    The indices in Open Babel's match list are shifted down by one so
    that they line up with the 0-based indices used by this class.

    Args:
        smarts: the SMARTS query to search for.

    Returns:
        A list of matches, each a list of 0-based atom indices; an empty
        list when the pattern does not match.
    """
    # Note: the class-level _obSmarts object is re-initialized per call.
    pattern = Molecule._obSmarts
    pattern.Init(smarts)
    if not pattern.Match(self.obmol):
        return []
    return [[atom_index - 1 for atom_index in match]
            for match in pattern.GetMapList()]
279
+
280
def GetAtomCharges(self):
    """
    Collect the formal charge of every atom.

    Returns:
        A list with one formal charge per atom, in the order returned
        by GetAtoms().
    """
    charges = []
    for atom in self.GetAtoms():
        charges.append(atom.GetFormalCharge())
    return charges
287
+
288
if __name__ == '__main__':
    # Smoke test: parse a sample InChI and print the exact mass of the
    # resulting molecule. ('InChI=1S/H2/h1H' is an alternative input.)
    mol = Molecule.FromInChI('InChI=1/C5H10O2/c1-3-5(6)7-4-2/h3-4H2,1-2H3')
    exact_mass = mol.GetExactMass()
    print(exact_mass)
CC/thermodynamic_constants.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Physical constants and default conditions.
R = 8.31e-3  # kJ/(K*mol)
F = 96.485  # kC/mol
J_per_cal = 4.184
default_T = 298.15  # K
default_I = 0.25  # M
default_pH = 7.0
default_c0 = 1  # M
default_pMg = 10
default_RT = R * default_T
default_c_mid = 1e-3  # M
default_c_range = (1e-6, 1e-2)  # M
dG0_f_Mg = -455.3  # kJ/mol, formation energy of Mg2+

# HTML snippets for Gibbs energy symbols.
symbol_d_G = "&Delta;G"
symbol_d_G0 = "&Delta;G&deg;"
symbol_d_G_prime = "&Delta;G'"
symbol_d_G0_prime = "&Delta;G'&deg;"

# Reaction energies (subscript r).
symbol_dr_G = "&Delta;<sub>r</sub>G"
symbol_dr_G0 = "&Delta;<sub>r</sub>G&deg;"
symbol_dr_G_prime = "&Delta;<sub>r</sub>G'"
symbol_dr_G0_prime = "&Delta;<sub>r</sub>G'&deg;"
symbol_dr_Gc_prime = "&Delta;<sub>r</sub>G'<sup>c</sup>"

# Formation energies (subscript f).
symbol_df_G = "&Delta;<sub>f</sub>G"
symbol_df_G0 = "&Delta;<sub>f</sub>G&deg;"
symbol_df_G_prime = "&Delta;<sub>f</sub>G'"
symbol_df_G0_prime = "&Delta;<sub>f</sub>G'&deg;"


def DH_alpha(T):
    """
    Approximate the temperature dependency of ionic strength effects.

    Args:
        T: the temperature (presumably in K, like default_T).

    Returns:
        The value of a cubic polynomial in T with fixed coefficients.
    """
    return 1e-3*(9.20483*T) - 1e-5*(1.284668 * T**2) + 1e-8*(4.95199 * T**3)


DH_beta = 1.6


def debye_huckel(I_T):
    """
    Evaluate the Debye-Huckel term for an (ionic strength, temperature) pair.

    Args:
        I_T: a sequence whose item 0 is the ionic strength and whose
            item 1 is the temperature passed to DH_alpha().

    Returns:
        DH_alpha(T) * sqrt(I) / (1 + DH_beta * sqrt(I)).
    """
    temperature = I_T[1]
    sqrt_ionic = I_T[0]**(0.5)
    return DH_alpha(temperature) * sqrt_ionic / (1.0 + DH_beta * sqrt_ionic)
36
+