Spaces:

vuu10
/

dGPredictor

Runtime error

App Files Files Community

vuu10 committed on Apr 28, 2023

Commit

6d990bb

1 Parent(s): a5f27f8

Upload 6 files

Browse files

Files changed (6) hide show

CC/Untitled.ipynb +1038 -0
CC/chemaxon.py +204 -0
CC/compound.py +337 -0
CC/compound_cacher.py +202 -0
CC/molecule.py +292 -0
CC/thermodynamic_constants.py +36 -0

CC/Untitled.ipynb ADDED Viewed

	@@ -0,0 +1,1038 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "ed0cdaf6-71e1-4ef0-894f-0beabdc392cf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import re\n",
+    "from PIL import Image\n",
+    "import webbrowser\n",
+    "import json\n",
+    "import pickle\n",
+    "import sys \n",
+    "import joblib\n",
+    "import sys\n",
+    "\n",
+    "from rdkit import Chem\n",
+    "from rdkit.Chem import Draw\n",
+    "from rdkit.Chem import rdChemReactions as Reactions\n",
+    "\n",
+    "from compound_cacher import CompoundCacher\n",
+    "from compound import Compound\n",
+    "from chemaxon import *\n",
+    "import chemaxon"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "e64deced-2a44-4d8e-ba8f-d9843f11724a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def load_smiles():\n",
+    "    db = pd.read_csv('./../data/cache_compounds_20160818.csv',index_col='compound_id')\n",
+    "    db_smiles = db['smiles_pH7'].to_dict()\n",
+    "    return db_smiles\n",
+    "\n",
+    "def load_molsig_rad1():\n",
+    "    molecular_signature_r1 = json.load(open('./../data/decompose_vector_ac.json'))\n",
+    "    return molecular_signature_r1\n",
+    "\n",
+    "def load_molsig_rad2():\n",
+    "    molecular_signature_r2 = json.load(open('./../data/decompose_vector_ac_r2_py3_indent_modified_manual.json'))\n",
+    "    return molecular_signature_r2\n",
+    "\n",
+    "def load_model():\n",
+    "    filename = './../model/M12_model_BR.pkl'\n",
+    "    loaded_model = joblib.load(open(filename, 'rb'))\n",
+    "    return loaded_model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "71615c14-49c3-45e7-9495-194ef22fb1ee",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "db_smiles = load_smiles()\n",
+    "molsig_r1 = load_molsig_rad1()\n",
+    "molsig_r2 = load_molsig_rad2()\n",
+    "loaded_model = load_model()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "b86b8049-cbf2-473f-8715-5e5f908193a2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def parse_reaction_formula_side(s):\n",
+    "    \"\"\"\n",
+    "        Parses the side formula, e.g. '2 C00001 + C00002 + 3 C00003'\n",
+    "        Repeated compounds have their stoichiometric amounts summed.\n",
+    "\n",
+    "        Returns:\n",
+    "            A dict mapping each compound key to its total amount.\n",
+    "    \"\"\"\n",
+    "    if s.strip() == \"null\":\n",
+    "        return {}\n",
+    "\n",
+    "    compound_bag = {}\n",
+    "    for member in re.split('\\s+\\+\\s+', s):\n",
+    "        tokens = member.split(None, 1)\n",
+    "        if len(tokens) == 0:\n",
+    "            continue\n",
+    "        if len(tokens) == 1:\n",
+    "            amount = 1\n",
+    "            key = member\n",
+    "        else:\n",
+    "            amount = float(tokens[0])\n",
+    "            key = tokens[1]\n",
+    "\n",
+    "        compound_bag[key] = compound_bag.get(key, 0) + amount\n",
+    "\n",
+    "    return compound_bag\n",
+    "\n",
+    "def parse_formula(formula, arrow='<=>', rid=None):\n",
+    "    \"\"\"\n",
+    "        Parses a two-sided formula such as: 2 C00001 => C00002 + C00003\n",
+    "\n",
+    "        Return:\n",
+    "            A dict mapping each CID to its net coefficient (negative for\n",
+    "            compounds on the left side, positive for the right side)\n",
+    "    \"\"\"\n",
+    "    tokens = formula.split(arrow)\n",
+    "    if len(tokens) < 2:\n",
+    "        print(('Reaction does not contain the arrow sign (%s): %s'\n",
+    "                                 % (arrow, formula)))\n",
+    "    if len(tokens) > 2:\n",
+    "        print(('Reaction contains more than one arrow sign (%s): %s'\n",
+    "                                 % (arrow, formula)))\n",
+    "\n",
+    "    left = tokens[0].strip()\n",
+    "    right = tokens[1].strip()\n",
+    "\n",
+    "    sparse_reaction = {}\n",
+    "    for cid, count in parse_reaction_formula_side(left).items():\n",
+    "        sparse_reaction[cid] = sparse_reaction.get(cid, 0) - count\n",
+    "\n",
+    "    for cid, count in parse_reaction_formula_side(right).items():\n",
+    "        sparse_reaction[cid] = sparse_reaction.get(cid, 0) + count  \n",
+    "    \n",
+    "    return sparse_reaction"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "7342b178-3472-4734-83e3-3de431abe15e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "rxn_string = \"C00222 + C00010 + C00006 <=> C00024 + C00011 + C00005\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "7b4dfe4f-48a8-4011-b201-7fb3a3268cef",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "rxn_dic = parse_formula(rxn_string)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "1f523aa2-b9dc-4153-8c1c-dec58e1ab987",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_ddG0(rxn_dict,pH,I,novel_mets):\n",
+    "    ccache = CompoundCacher()\n",
+    "    # ddG0 = get_transform_ddG0(rxn_dict, ccache, pH, I, T)\n",
+    "    T = 298.15\n",
+    "    ddG0_forward = 0\n",
+    "    for compound_id, coeff in rxn_dict.items():\n",
+    "        if novel_mets != None and compound_id in novel_mets:\n",
+    "            comp = novel_mets[compound_id]\n",
+    "        else:\n",
+    "            comp = ccache.get_compound(compound_id)\n",
+    "        ddG0_forward += coeff * comp.transform_pH7(pH, I, T)\n",
+    "\n",
+    "    return ddG0_forward"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "33cf30ff-8b2c-4da9-9134-75a60a5c5d66",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "-3.6254822995515497"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "get_ddG0(rxn_dic, 7.0,  0.1, {})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "9e39855d-eb9e-4ea9-aeb9-8b770cc24c8e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_rule(rxn_dict, molsig1, molsig2, novel_decomposed1, novel_decomposed2):\n",
+    "    if novel_decomposed1 != None:\n",
+    "        for cid in novel_decomposed1:\n",
+    "            molsig1[cid] = novel_decomposed1[cid]\n",
+    "    if novel_decomposed2 != None:\n",
+    "        for cid in novel_decomposed2:\n",
+    "            molsig2[cid] = novel_decomposed2[cid]\n",
+    "\n",
+    "    molsigna_df1 = pd.DataFrame.from_dict(molsig1).fillna(0)\n",
+    "    all_mets1 = molsigna_df1.columns.tolist()\n",
+    "    all_mets1.append(\"C00080\")\n",
+    "    all_mets1.append(\"C00282\")\n",
+    "\n",
+    "    molsigna_df2 = pd.DataFrame.from_dict(molsig2).fillna(0)\n",
+    "    all_mets2 = molsigna_df2.columns.tolist()\n",
+    "    all_mets2.append(\"C00080\")\n",
+    "    all_mets2.append(\"C00282\")\n",
+    "\n",
+    "    moieties_r1 = open('./data/group_names_r1.txt')\n",
+    "    moieties_r2 = open('./data/group_names_r2_py3_modified_manual.txt')\n",
+    "    moie_r1 = moieties_r1.read().splitlines()\n",
+    "    moie_r2 = moieties_r2.read().splitlines()\n",
+    "\n",
+    "    molsigna_df1 = molsigna_df1.reindex(moie_r1)\n",
+    "    molsigna_df2 = molsigna_df2.reindex(moie_r2)\n",
+    "\n",
+    "    rule_df1 = pd.DataFrame(index=molsigna_df1.index)\n",
+    "    rule_df2 = pd.DataFrame(index=molsigna_df2.index)\n",
+    "    # for rid, value in reaction_dict.items():\n",
+    "    #     # skip the reactions with missing metabolites\n",
+    "    #     mets = value.keys()\n",
+    "    #     flag = False\n",
+    "    #     for met in mets:\n",
+    "    #         if met not in all_mets:\n",
+    "    #             flag = True\n",
+    "    #             break\n",
+    "    #     if flag: continue\n",
+    "\n",
+    "    rule_df1['change'] = 0\n",
+    "    for met, stoic in rxn_dict.items():\n",
+    "        if met == \"C00080\" or met == \"C00282\":\n",
+    "            continue  # hydrogen is zero\n",
+    "        rule_df1['change'] += molsigna_df1[met] * stoic\n",
+    "\n",
+    "    rule_df2['change'] = 0\n",
+    "    for met, stoic in rxn_dict.items():\n",
+    "        if met == \"C00080\" or met == \"C00282\":\n",
+    "            continue  # hydrogen is zero\n",
+    "        rule_df2['change'] += molsigna_df2[met] * stoic\n",
+    "\n",
+    "    rule_vec1 = rule_df1.to_numpy().T\n",
+    "    rule_vec2 = rule_df2.to_numpy().T\n",
+    "\n",
+    "    m1, n1 = rule_vec1.shape\n",
+    "    m2, n2 = rule_vec2.shape\n",
+    "\n",
+    "    zeros1 = np.zeros((m1, 44))\n",
+    "    zeros2 = np.zeros((m2, 44))\n",
+    "    X1 = np.concatenate((rule_vec1, zeros1), 1)\n",
+    "    X2 = np.concatenate((rule_vec2, zeros2), 1)\n",
+    "\n",
+    "    rule_comb = np.concatenate((X1, X2), 1)\n",
+    "\n",
+    "    # rule_df_final = {}\n",
+    "    # rule_df_final['rad1'] = rule_df1\n",
+    "    # rule_df_final['rad2'] = rule_df2\n",
+    "    return rule_comb, rule_df1, rule_df2\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "a93ea75e-9851-45fd-aa58-d7f325b4b5a6",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'C00222': -1,\n",
+       " 'C00010': -1,\n",
+       " 'C00006': -1,\n",
+       " 'C00024': 1,\n",
+       " 'C00011': 1,\n",
+       " 'C00005': 1}"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "rxn_dic"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "981948dd-db2c-4463-b983-1220353d963e",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "96eb1c38-2ca7-4e38-bcc4-ade1cef73852",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(array([-19.96775194]), array([6.66052556]))"
+      ]
+     },
+     "execution_count": 23,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "loaded_model.predict(X, return_std= True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "81128dd3-5005-40a6-b5fe-8ecacef824bc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_ddG0(rxn_dict,pH,I,novel_mets):\n",
+    "    ccache = CompoundCacher()\n",
+    "    # ddG0 = get_transform_ddG0(rxn_dict, ccache, pH, I, T)\n",
+    "    T = 298.15\n",
+    "    ddG0_forward = 0\n",
+    "    for compound_id, coeff in rxn_dict.items():\n",
+    "        if novel_mets != None and compound_id in novel_mets:\n",
+    "            comp = novel_mets[compound_id]\n",
+    "        else:\n",
+    "            comp = ccache.get_compound(compound_id)\n",
+    "        ddG0_forward += coeff * comp.transform_pH7(pH, I, T)\n",
+    "\n",
+    "    return ddG0_forward\n",
+    "\n",
+    "\n",
+    "def get_dG0(rxn_dict,rid,pH,I,loaded_model,molsig_r1, molsig_r2, novel_decomposed_r1, novel_decomposed_r2,novel_mets):\n",
+    "    rule_comb, rule_df1, rule_df2 = get_rule(rxn_dict,molsig_r1,molsig_r2, novel_decomposed_r1, novel_decomposed_r2)\n",
+    "    X  = rule_comb\n",
+    "    ymean, ystd = loaded_model.predict(X, return_std=True)\n",
+    "    result = {}\n",
+    "    return ymean[0] + get_ddG0(rxn_dict, pH, I, novel_mets),ystd[0], rule_df1, rule_df2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "751ec201-f062-4ac0-8d24-fe959636cbdc",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c6cb1e4d-24be-42a1-b88b-793a62597c92",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7abe24be-1653-455b-9931-9446480d39bb",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f13433dc-51a3-41e5-8a0b-b0f21724ef98",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "db7c764f-d216-44a9-8f88-0e3a7c51377a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ccc=  CompoundCacher()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "09e6f7f2-5be7-4db3-b55d-756ecb711095",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "a = ccc.get_compound('C00001')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "d28e44b7-d942-4739-9d7d-2f4e082ac1b9",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "81.4472134155519"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "a.transform_pH7(7, 0.25 , 298)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "1ef3fc0d-7d63-42ea-8743-522fe010a95d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "inchi_k = \"InChI=1S/C14H14O/c15-14-8-4-7-13(11-14)10-9-12-5-2-1-3-6-12/h1-8,11,15H,9-10H2\" ;"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "4e651d1c-2c96-42d1-adab-466dc7518146",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\vuu10\\AppData\\Local\\Continuum\\anaconda3\\envs\\dGPredictor_py3\\lib\\openbabel\\__init__.py:14: UserWarning: \"import openbabel\" is deprecated, instead use \"from openbabel import openbabel\"\n",
+      "  warnings.warn('\"import openbabel\" is deprecated, instead use \"from openbabel import openbabel\"')\n"
+     ]
+    }
+   ],
+   "source": [
+    "c = Compound.from_inchi('Test', 'sajdf', inchi_k )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "6eb5c2dc-f14c-46de-889b-0e9b7faa9f79",
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "AttributeError",
+     "evalue": "'Compound' object has no attribute 'smiles_ph7'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
+      "\u001b[1;32m<ipython-input-18-7a0d06664090>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mc\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msmiles_ph7\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
+      "\u001b[1;31mAttributeError\u001b[0m: 'Compound' object has no attribute 'smiles_ph7'"
+     ]
+    }
+   ],
+   "source": [
+    "c.smiles_ph7()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "edd156dc-4355-4c2c-ba4e-6d98e776a96a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from chemaxon import *\n",
+    "import chemaxon"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "880d2ef6-6b03-49d3-8f60-66769c22a84d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pKas, major_ms_smiles = chemaxon.GetDissociationConstants(inchi_k)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "7a2391dc-313c-47f2-9f54-823bfdb95fcd",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'OC1=CC=CC(CCC2=CC=CC=C2)=C1\\r'"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "major_ms_smiles"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "96d90c4a-14a2-45fb-8573-97db84de2dff",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "major_ms_smiles = Compound.smiles2smiles(major_ms_smiles)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "36d46620-b895-4ec8-85d0-7499759812c6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "MIN_PH = 0.0\n",
+    "MAX_PH = 14.0\n",
+    "pKas = sorted([pka for pka in pKas if pka > MIN_PH and pka < MAX_PH], reverse=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "ffccf9d9-5a52-4be6-af4c-f39b3db2a27c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[10.1]"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pKas"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "e83721fa-9a42-42ef-9a03-59fc2689c73b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "atom_bag, major_ms_charge = chemaxon.GetAtomBagAndCharge(major_ms_smiles)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "47a87ed7-968d-44b6-a237-a8469ba3fe3b",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "49cfefde-ee96-4ca8-89af-c50f2f2ca70b",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9b881c7b-a14a-4561-9c3c-157116efdfd0",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "10c8f915-e61a-4560-b546-fe6ea8bfdde3",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "936fafa5-1bf6-495c-be79-d4cc620f4861",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "285f9370-2fba-44c4-a36b-66c95f9f2eed",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "adbcd78f-869a-4cc9-b727-03c80df31edd",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "17fbfee9-c8b7-4644-814f-0e8aa0ad5ee9",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "70f90669-ff90-4bc4-955c-63672e42bb3c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "formula, formal_charge = GetFormulaAndCharge(molstring)\n",
+    "\n",
+    "atom_bag = {}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "id": "e40e4088-c246-4afb-98ae-f92cb738e988",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for mol_formula_times in formula.split('.'):\n",
+    "    for times, mol_formula in re.findall('^(\\d+)?(\\w+)', mol_formula_times):\n",
+    "        if not times:\n",
+    "            times = 1\n",
+    "        else:\n",
+    "            times = int(times)\n",
+    "        for atom, count in re.findall(\"([A-Z][a-z]*)([0-9]*)\", mol_formula):\n",
+    "            if count == '':\n",
+    "                count = 1\n",
+    "            else:\n",
+    "                count = int(count)\n",
+    "            atom_bag[atom] = atom_bag.get(atom, 0) + count * times"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "id": "391cfbba-2da5-4b60-ba32-217754913b35",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'C': 14, 'H': 14, 'O': 1}"
+      ]
+     },
+     "execution_count": 26,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "atom_bag"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 52,
+   "id": "812f8297-a5cc-4d63-b132-243c278c6b76",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "6\n",
+      "1\n",
+      "8\n"
+     ]
+    }
+   ],
+   "source": [
+    "from rdkit.Chem import rdchem\n",
+    "for (elem, c) in atom_bag.items():\n",
+    "    ll = rdchem.GetPeriodicTable()\n",
+    "    atomic_num = ll.GetAtomicNumber(elem)\n",
+    "    print(atomic_num)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 55,
+   "id": "463fcb01-2cd0-4aee-990c-946c534dc766",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "n_protons = sum([c * ll.GetAtomicNumber(str(elem))\n",
+    "                 for (elem, c) in atom_bag.items()])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 57,
+   "id": "ac1c69f6-54db-41ba-9fdf-e7ab6a2dfcbc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "atom_bag['e-'] = n_protons - formal_charge"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 58,
+   "id": "61b1931e-dbaf-4e0f-afb2-6595f64d70d6",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'C': 14, 'H': 14, 'O': 1, 'e-': 106}"
+      ]
+     },
+     "execution_count": 58,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "atom_bag"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 60,
+   "id": "12bdbf80-7dc5-4d47-a479-703ad5a6aa06",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0"
+      ]
+     },
+     "execution_count": 60,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "\n",
+    "formal_charge\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b51f36c0-707a-4856-8c23-9081e2ea2cf7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "all_pKas, smiles_list = GetDissociationConstants_val(inchi_k)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "6dd79761-760d-4233-b113-a34e6322a0e5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "MID_PH = 7.0\n",
+    "N_PKAS = 20\n",
+    "\n",
+    "n_acidic = N_PKAS\n",
+    "n_basic = N_PKAS\n",
+    "pH = MID_PH"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "6167191a-b361-4ae0-a78a-927490c72f87",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "args = []\n",
+    "if n_acidic + n_basic > 0:\n",
+    "    args += ['pka', '-a', str(n_acidic), '-b', str(n_basic),\n",
+    "             'majorms', '-M', 'true', '--pH', str(pH)]\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "dd4275ec-c71e-4b5b-bb35-de8b3c7c4883",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['pka', '-a', '20', '-b', '20', 'majorms', '-M', 'true', '--pH', '7.0']"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "args"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "79d07dc5-963a-4373-9d72-1eb6de48ede9",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "712a71fb-e3e3-4b01-828d-5a3862aa1b30",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "logging.debug(\"INPUT: echo %s | %s\" % (inchi_k, ' '.join([CXCALC_BIN] + args)))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "287bf822-23b8-42de-85ca-e52678875cfa",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "molstring= inchi_k"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "4d2ff427-237c-4d63-a718-f29f12884d96",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "p1 = Popen([\"echo\", molstring], stdout=PIPE, shell=use_shell_for_echo)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "923a09f2-b959-4837-ab1a-a858d91de0b4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "p2 = Popen([CXCALC_BIN] + args, stdin=p1.stdout,\n",
+    "                   executable=CXCALC_BIN, stdout=PIPE, shell=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "a6b30545-c65a-4c56-9985-71a103b9da00",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "res = p2.communicate()[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "ac059602-027f-4a1a-932f-c1339c38c7d7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if p2.returncode != 0:\n",
+    "    raise ChemAxonError(str(args))\n",
+    "logging.debug(\"OUTPUT: %s\" % res)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "671642a5-3877-44e3-b935-f987fd601444",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "output = res"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "a9f4bb4a-af86-4e97-bf1d-40c58013f90e",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "b'id\\tapKa1\\tapKa2\\tapKa3\\tapKa4\\tapKa5\\tapKa6\\tapKa7\\tapKa8\\tapKa9\\tapKa10\\tapKa11\\tapKa12\\tapKa13\\tapKa14\\tapKa15\\tapKa16\\tapKa17\\tapKa18\\tapKa19\\tapKa20\\tbpKa1\\tbpKa2\\tbpKa3\\tbpKa4\\tbpKa5\\tbpKa6\\tbpKa7\\tbpKa8\\tbpKa9\\tbpKa10\\tbpKa11\\tbpKa12\\tbpKa13\\tbpKa14\\tbpKa15\\tbpKa16\\tbpKa17\\tbpKa18\\tbpKa19\\tbpKa20\\tatoms\\tmajor-ms\\r\\n1\\t10.10\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t-5.48\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t15,15\\tOC1=CC=CC(CCC2=CC=CC=C2)=C1\\r\\n'"
+      ]
+     },
+     "execution_count": 23,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "output"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "215ffc9b-35a8-4f45-8f39-9c99deae6335",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "atom2pKa, smiles_list = ParsePkaOutput(output, n_acidic, n_basic)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "id": "21c380d3-5410-4c55-b6d7-cb0588f373ca",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['OC1=CC=CC(CCC2=CC=CC=C2)=C1\\r']"
+      ]
+     },
+     "execution_count": 26,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "smiles_list"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "id": "1437693a-0923-4df1-837d-acb2b524fcae",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "all_pKas = []\n",
+    "for pKa_list in list(atom2pKa.values()):\n",
+    "    all_pKas += [pKa for pKa, _ in pKa_list]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "8e77324c-ed61-4615-a7c7-4f5ca781dc90",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[10.1, -5.48]"
+      ]
+     },
+     "execution_count": 28,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "all_pKas"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8616be46-1814-4755-b919-4b7790569890",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

CC/chemaxon.py ADDED Viewed

	@@ -0,0 +1,204 @@

+import logging
+import csv
+import re
+import platform
+import io
+from subprocess import Popen, PIPE
+from openbabel import openbabel
+import pdb
+from rdkit.Chem import rdchem
+# Select the cxcalc executable and the `shell=` flag used when RunCxcalc
+# spawns its `echo` subprocess.
+if platform.system() == 'Windows':
+    # NOTE(review): absolute path into one user's profile directory; this
+    # will not exist on other machines -- confirm before deploying.
+    CXCALC_BIN = 'C:\\Users\\vuu10\\AppData\\Local\\Programs\\ChemAxon\\MarvinSuite\\bin\\cxcalc.exe'
+    #CXCALC_BIN = 'C:\\Program Files (x86)\\ChemAxon\\MarvinBeans\\bin\\cxcalc.bat'
+    # presumably needed because `echo` is a shell built-in on Windows -- verify
+    use_shell_for_echo = True
+else:
+    # Resolved through PATH by the subprocess call.
+    CXCALC_BIN = 'cxcalc'
+    use_shell_for_echo = False
+# Default pH passed to cxcalc's 'majorms' calculation.
+MID_PH = 7.0
+# Default number of acidic and of basic pKa columns requested from cxcalc.
+N_PKAS = 20
+class ChemAxonError(Exception):
+    """Raised when cxcalc fails or its output cannot be interpreted."""
+    pass
+def RunCxcalc(molstring, args):
+    # pdb.set_trace()
+    #     with open(platform.DEV_NULL, 'w') as dev_null:
+    try:
+        logging.debug("INPUT: echo %s | %s" %
+                      (molstring, ' '.join([CXCALC_BIN] + args)))
+        p1 = Popen(["echo", molstring], stdout=PIPE,
+                   shell=use_shell_for_echo)
+# 		p2 = Popen([CXCALC_BIN] + args, stdin=p1.stdout,
+# 				   executable=CXCALC_BIN, stdout=PIPE, stderr=dev_null, shell=False)
+        p2 = Popen([CXCALC_BIN] + args, stdin=p1.stdout,
+                   executable=CXCALC_BIN, stdout=PIPE, shell=False)
+        # p.wait()
+        # os.remove(temp_fname)
+        res = p2.communicate()[0]
+        if p2.returncode != 0:
+            raise ChemAxonError(str(args))
+        logging.debug("OUTPUT: %s" % res)
+        res = res.decode('utf-8')
+        return res
+    except OSError:
+        raise Exception(
+            "Marvin (by ChemAxon) must be installed to calculate pKa data.")
+def ParsePkaOutput(s, n_acidic, n_basic):
+    """
+        Returns:
+            A dictionary that maps the atom index to a list of pKas
+            that are assigned to that atom.
+    """
+#     s = s.decode('utf-8')
+    atom2pKa = {}
+    pkaline = s.split('\n')[1]
+    splitline = pkaline.split('\t')
+    splitline.pop(0)
+    if n_acidic + n_basic > 0:
+        if len(splitline) != (n_acidic + n_basic + 2):
+            raise ChemAxonError('ChemAxon failed to find any pKas')
+        pKa_list = []
+        acid_or_base_list = []
+        for i in range(n_acidic + n_basic):
+            x = splitline.pop(0)
+            if x == '':
+                continue
+            pKa_list.append(float(x))
+            if i < n_acidic:
+                acid_or_base_list.append('acid')
+            else:
+                acid_or_base_list.append('base')
+        atom_list = splitline.pop(0)
+        if atom_list:  # a comma separated list of the deprotonated atoms
+            atom_numbers = [int(y)-1 for y in atom_list.split(',')]
+            for i, j in enumerate(atom_numbers):
+                atom2pKa.setdefault(j, [])
+                atom2pKa[j].append((pKa_list[i], acid_or_base_list[i]))
+    smiles_list = splitline
+    return atom2pKa, smiles_list
+def GetDissociationConstants_val(molstring, n_acidic=N_PKAS, n_basic=N_PKAS,
+                                 pH=MID_PH):
+    """
+        Returns:
+            A pair of (pKa list, major pseudoisomer)
+            - the pKa list is of the pKa values in ascending order.
+            - the major pseudoisomer is a SMILES string of the major species
+              at the given pH.
+    """
+    args = []
+    if n_acidic + n_basic > 0:
+        args += ['pka', '-a', str(n_acidic), '-b', str(n_basic),
+                 'majorms', '-M', 'true', '--pH', str(pH)]
+    output = RunCxcalc(molstring, args)
+    atom2pKa, smiles_list = ParsePkaOutput(output, n_acidic, n_basic)
+    all_pKas = []
+    for pKa_list in list(atom2pKa.values()):
+        all_pKas += [pKa for pKa, _ in pKa_list]
+    return sorted(all_pKas), smiles_list
+def GetDissociationConstants(molstring, n_acidic=N_PKAS, n_basic=N_PKAS,
+                             pH=MID_PH):
+    """
+        Arguments:
+            molstring - a text description of the molecule (SMILES or InChI)
+            n_acidic  - the max no. of acidic pKas to calculate
+            n_basic   - the max no. of basic pKas to calculate
+            pH        - the pH for which the major pseudoisomer is calculated
+        Returns a pair:
+            (all_pKas, major_ms)
+        - all_pKas is a list of floats (pKa values)
+        - major_ms is a SMILES string of the major pseudoisomer at pH_mid
+    """
+    all_pKas, smiles_list = GetDissociationConstants_val(molstring, n_acidic,
+                                                         n_basic, pH)
+    major_ms = smiles_list[0]
+    return all_pKas, major_ms
+def GetFormulaAndCharge(molstring):
+    """
+        Arguments:
+            molstring - a text description of the molecule (SMILES or InChI)
+        Returns:
+            chemical formula of the molecule
+    """
+    args = ['formula', 'formalcharge']
+    output = RunCxcalc(molstring, args)
+    # the output is a tab separated table whose columns are:
+    # id, Formula, Formal charge
+    f = io.StringIO(output)
+    tsv_output = csv.reader(f, delimiter='\t')
+    headers = next(tsv_output)
+    if headers != ['id', 'Formula', 'Formal charge']:
+        raise ChemAxonError(
+            'cannot get the formula and charge for: ' + molstring)
+    _, formula, formal_charge = next(tsv_output)
+    try:
+        formal_charge = int(formal_charge)
+    except ValueError:
+        formal_charge = 0
+    return formula, formal_charge
+def GetAtomBagAndCharge(molstring):
+    formula, formal_charge = GetFormulaAndCharge(molstring)
+    periodic_table = rdchem.GetPeriodicTable()
+    atom_bag = {}
+    for mol_formula_times in formula.split('.'):
+        for times, mol_formula in re.findall('^(\d+)?(\w+)', mol_formula_times):
+            if not times:
+                times = 1
+            else:
+                times = int(times)
+            for atom, count in re.findall("([A-Z][a-z]*)([0-9]*)", mol_formula):
+                if count == '':
+                    count = 1
+                else:
+                    count = int(count)
+                atom_bag[atom] = atom_bag.get(atom, 0) + count * times
+    n_protons = sum([c * periodic_table.GetAtomicNumber(str(elem))
+                     for (elem, c) in atom_bag.items()])
+    atom_bag['e-'] = n_protons - formal_charge
+    return atom_bag, formal_charge
+if __name__ == "__main__":
+    logging.getLogger().setLevel(logging.WARNING)
+    from molecule import Molecule
+    compound_list = [
+        ('D-Erythrulose', 'InChI=1S/C4H8O4/c5-1-3(7)4(8)2-6/h3,5-7H,1-2H2/t3-/m1/s1')]
+    for name, inchi in compound_list:
+        print("Formula: %s\nCharge: %d" % GetFormulaAndCharge(inchi))
+        diss_table, major_ms = GetDissociationConstants(inchi)
+        m = Molecule.FromSmiles(major_ms)
+        print("Name: %s\nInChI: %s\npKas: %s" %
+              (name, m.ToInChI(), str(diss_table)))

CC/compound.py ADDED Viewed

	@@ -0,0 +1,337 @@

+import urllib.request, urllib.parse, urllib.error, logging
+from openbabel import openbabel
+import chemaxon
+import numpy as np
+from thermodynamic_constants import R, debye_huckel
+from scipy.special import logsumexp
+MIN_PH = 0.0
+MAX_PH = 14.0
+class Compound(object):
+    def __init__(self, database, compound_id, inchi,
+                 atom_bag, pKas, smiles_pH7, majorMSpH7, nHs, zs):
+        self.database = database
+        self.compound_id = compound_id
+        self.inchi = inchi
+        self.atom_bag = atom_bag
+        self.pKas = pKas
+        self.smiles_pH7 = smiles_pH7
+        self.majorMSpH7 = majorMSpH7
+        self.nHs = nHs
+        self.zs = zs
+    @staticmethod
+    def from_kegg(compound_id):
+        return Compound.from_inchi('KEGG', compound_id,
+                                   Compound.get_inchi(compound_id))
+    @staticmethod
+    def from_inchi(database, compound_id, inchi):
+        if compound_id == 'C00080':
+            # We add an exception for H+ (and put nH = 0) in order to eliminate
+            # its effect of the Legendre transform
+            return Compound(database, compound_id, inchi,
+                            {'H' : 1}, [], None, 0, [0], [0])
+        elif compound_id == 'C00087':
+            # ChemAxon gets confused with the structure of sulfur
+            # (returns a protonated form, [SH-], at pH 7).
+            # So we implement it manually here.
+            return Compound(database, compound_id, inchi,
+                            {'S' : 1, 'e-': 16}, [], 'S', 0, [0], [0])
+        elif compound_id == 'C00237':
+            # ChemAxon gets confused with the structure of carbon monoxide
+            # (returns a protonated form, [CH]#[O+], at pH 7).
+            # So we implement it manually here.
+            return Compound(database, compound_id, inchi,
+                            {'C' : 1, 'O': 1, 'e-': 14}, [], '[C-]#[O+]', 0, [0], [0])
+        elif compound_id == 'C00282':
+            # ChemAxon gets confused with the structure of hydrogen
+            # So we implement it manually here.
+            return Compound(database, compound_id, inchi,
+                            {'H' : 2, 'e-': 2}, [], None, 0, [2], [0])
+        elif compound_id == 'C01353':
+            # When given the structure of carbonic acid, ChemAxon returns the
+            # pKas for CO2(tot), i.e. it assumes the non-hydrated CO2 species is
+            # one of the pseudoisomers, and the lower pKa value is 6.05 instead of
+            # 3.78. Here, we introduce a new "KEGG" compound that will represent
+            # pure bicarbonate (without CO2(sp)) and therefore plug in the pKa
+            # values from Alberty's book.
+            return Compound(database, compound_id, inchi,
+                            {'C': 1, 'H': 1, 'O': 3, 'e-': 32}, [10.33, 3.43],
+                            'OC(=O)[O-]', 1, [0, 1, 2], [-2, -1, 0])
+        # Metal Cations get multiple pKa values from ChemAxon, which is
+        # obviously a bug. We override the important ones here:
+        elif compound_id == 'C00076': # Ca2+
+            return Compound(database, compound_id, inchi,
+                            {'Ca' : 1, 'e-': 18}, [], '[Ca++]', 0, [0], [2])
+        elif compound_id == 'C00238': # K+
+            return Compound(database, compound_id, inchi,
+                            {'K' : 1, 'e-': 18}, [], '[K+]', 0, [0], [1])
+        elif compound_id == 'C00305': # Mg2+
+            return Compound(database, compound_id, inchi,
+                            {'Mg' : 1, 'e-': 10}, [], '[Mg++]', 0, [0], [2])
+        elif compound_id == 'C14818': # Fe2+
+            return Compound(database, compound_id, inchi,
+                            {'Fe' : 1, 'e-': 24}, [], '[Fe++]', 0, [0], [2])
+        elif compound_id == 'C14819': # Fe3+
+            return Compound(database, compound_id, inchi,
+                            {'Fe' : 1, 'e-': 23}, [], '[Fe+++]', 0, [0], [3])
+        elif compound_id == 'C00138': # ferredoxin(red)
+            return Compound(database, compound_id, inchi,
+                            {'Fe' : 1, 'e-': 26}, [], None, 0, [0], [0])
+        elif compound_id == 'C00139': # ferredoxin(ox)
+            return Compound(database, compound_id, inchi,
+                            {'Fe' : 1, 'e-': 25}, [], None, 0, [0], [1])
+        elif inchi is None:
+            # If the compound has no explicit structure, we assume that it has
+            # no proton dissociations in the relevant pH range
+            return Compound(database, compound_id, inchi,
+                            {}, [], None, 0, [0], [0])
+        # Otherwise, we use ChemAxon's software to get the pKas and the
+        # properties of all microspecies
+        try:
+            pKas, major_ms_smiles = chemaxon.GetDissociationConstants(inchi)
+            major_ms_smiles = Compound.smiles2smiles(major_ms_smiles)
+            pKas = sorted([pka for pka in pKas if pka > MIN_PH and pka < MAX_PH], reverse=True)
+        except chemaxon.ChemAxonError:
+            logging.warning('chemaxon failed to find pKas for this molecule: ' + inchi)
+            # use the original InChI to get the parameters (i.e. assume it
+            # represents the major microspecies at pH 7)
+            major_ms_smiles = Compound.inchi2smiles(inchi)
+            pKas = []
+        if major_ms_smiles:
+            atom_bag, major_ms_charge = chemaxon.GetAtomBagAndCharge(major_ms_smiles)
+            major_ms_nH = atom_bag.get('H', 0)
+        else:
+            atom_bag = {}
+            major_ms_charge = 0
+            major_ms_nH = 0
+        n_species = len(pKas) + 1
+        if pKas == []:
+            majorMSpH7 = 0
+        else:
+            majorMSpH7 = len([1 for pka in pKas if pka > 7])
+        nHs = []
+        zs = []
+        for i in range(n_species):
+            zs.append((i - majorMSpH7) + major_ms_charge)
+            nHs.append((i - majorMSpH7) + major_ms_nH)
+        return Compound(database, compound_id, inchi,
+                        atom_bag, pKas, major_ms_smiles, majorMSpH7, nHs, zs)
+    def to_json_dict(self):
+        return {'database' : self.database,
+                'compound_id' : self.compound_id,
+                'inchi' : self.inchi,
+                'atom_bag' : self.atom_bag,
+                'pKas' : self.pKas,
+                'smiles_pH7' : self.smiles_pH7,
+                'majorMSpH7' : self.majorMSpH7,
+                'nHs' : self.nHs,
+                'zs' : self.zs}
+    @staticmethod
+    def from_json_dict(d):
+        return Compound(d['database'], d['compound_id'], d['inchi'], d['atom_bag'],
+                        d['pKas'], d['smiles_pH7'], d['majorMSpH7'],
+                        d['nHs'], d['zs'])
+    @staticmethod
+    def get_inchi(compound_id):
+        s_mol = urllib.request.urlopen('http://rest.kegg.jp/get/cpd:%s/mol' % compound_id).read()
+        return Compound.mol2inchi(s_mol)
+    @staticmethod
+    def mol2inchi(s):
+        openbabel.obErrorLog.SetOutputLevel(-1)
+        conv = openbabel.OBConversion()
+        conv.SetInAndOutFormats('mol', 'inchi')
+        conv.AddOption("F", conv.OUTOPTIONS)
+        conv.AddOption("T", conv.OUTOPTIONS)
+        conv.AddOption("x", conv.OUTOPTIONS, "noiso")
+        conv.AddOption("w", conv.OUTOPTIONS)
+        obmol = openbabel.OBMol()
+        if not conv.ReadString(obmol, str(s)):
+            return None
+        inchi = conv.WriteString(obmol, True) # second argument is trimWhitespace
+        if inchi == '':
+            return None
+        else:
+            return inchi
+    @staticmethod
+    def inchi2smiles(inchi):
+        openbabel.obErrorLog.SetOutputLevel(-1)
+        conv = openbabel.OBConversion()
+        conv.SetInAndOutFormats('inchi', 'smiles')
+        #conv.AddOption("F", conv.OUTOPTIONS)
+        #conv.AddOption("T", conv.OUTOPTIONS)
+        #conv.AddOption("x", conv.OUTOPTIONS, "noiso")
+        #conv.AddOption("w", conv.OUTOPTIONS)
+        obmol = openbabel.OBMol()
+        conv.ReadString(obmol, str(inchi))
+        smiles = conv.WriteString(obmol, True) # second argument is trimWhitespace
+        if smiles == '':
+            return None
+        else:
+            return smiles
+    @staticmethod
+    def smiles2smiles(smiles_in):
+        openbabel.obErrorLog.SetOutputLevel(-1)
+        conv = openbabel.OBConversion()
+        conv.SetInAndOutFormats('smiles', 'smiles')
+        #conv.AddOption("F", conv.OUTOPTIONS)
+        #conv.AddOption("T", conv.OUTOPTIONS)
+        #conv.AddOption("x", conv.OUTOPTIONS, "noiso")
+        #conv.AddOption("w", conv.OUTOPTIONS)
+        obmol = openbabel.OBMol()
+        conv.ReadString(obmol, str(smiles_in))
+        smiles_out = conv.WriteString(obmol, True) # second argument is trimWhitespace
+        if smiles_out == '':
+            return None
+        else:
+            return smiles_out
+    @staticmethod
+    def smiles2inchi(smiles):
+        openbabel.obErrorLog.SetOutputLevel(-1)
+        conv = openbabel.OBConversion()
+        conv.SetInAndOutFormats('smiles', 'inchi')
+        conv.AddOption("F", conv.OUTOPTIONS)
+        conv.AddOption("T", conv.OUTOPTIONS)
+        conv.AddOption("x", conv.OUTOPTIONS, "noiso")
+        conv.AddOption("w", conv.OUTOPTIONS)
+        obmol = openbabel.OBMol()
+        conv.ReadString(obmol, str(smiles))
+        inchi = conv.WriteString(obmol, True) # second argument is trimWhitespace
+        if inchi == '':
+            return None
+        else:
+            return inchi
+    def __str__(self):
+        return "%s\nInChI: %s\npKas: %s\nmajor MS: nH = %d, charge = %d" % \
+            (self.compound_id, self.inchi, ', '.join(['%.2f' % p for p in self.pKas]),
+             self.nHs[self.majorMSpH7], self.zs[self.majorMSpH7])
+    def _dG0_prime_vector(self, pH, I, T):
+        """
+            Calculates the difference in kJ/mol between dG'0 and
+            the dG0 of the MS with the least hydrogens (dG0[0])
+            Returns:
+                dG'0 - dG0[0]
+        """
+        if self.inchi is None:
+            return 0
+        elif self.pKas == []:
+            dG0s = np.zeros((1, 1))
+        else:
+            dG0s = -np.cumsum([0] + self.pKas) * R * T * np.log(10)
+            dG0s = dG0s
+        DH = debye_huckel((I, T))
+        # dG0' = dG0 + nH * (R T ln(10) pH + DH) - charge^2 * DH
+        pseudoisomers = np.vstack([dG0s, np.array(self.nHs), np.array(self.zs)]).T
+        dG0_prime_vector = pseudoisomers[:, 0] + \
+                           pseudoisomers[:, 1] * (R * T * np.log(10) * pH + DH) - \
+                           pseudoisomers[:, 2]**2 * DH
+        return dG0_prime_vector
+    def _transform(self, pH, I, T):
+        return -R * T * logsumexp(self._dG0_prime_vector(pH, I, T) / (-R * T))
+    def _ddG(self, i_from, i_to, T):
+        """
+            Calculates the difference in kJ/mol between two MSs.
+            Returns:
+                dG0[i_to] - dG0[i_from]
+        """
+        if not (0 <= i_from <= len(self.pKas)):
+            raise ValueError('MS index is out of bounds: 0 <= %d <= %d' % (i_from, len(self.pKas)))
+        if not (0 <= i_to <= len(self.pKas)):
+            raise ValueError('MS index is out of bounds: 0 <= %d <= %d' % (i_to, len(self.pKas)))
+        if i_from == i_to:
+            return 0
+        elif i_from < i_to:
+            return sum(self.pKas[i_from:i_to]) * R * T * np.log(10)
+        else:
+            return -sum(self.pKas[i_to:i_from]) * R * T * np.log(10)
+    def transform(self, i, pH, I, T):
+        """
+            Returns the difference in kJ/mol between dG'0 and the dG0 of the
+            MS with index 'i'.
+            Returns:
+                (dG'0 - dG0[0]) + (dG0[0] - dG0[i])  = dG'0 - dG0[i]
+        """
+        return self._transform(pH, I, T) + self._ddG(0, i, T)
+    def transform_pH7(self, pH, I, T):
+        """
+            Returns the transform for the major MS in pH 7
+        """
+        return self.transform(self.majorMSpH7, pH, I, T)
+    def transform_neutral(self, pH, I, T):
+        """
+            Returns the transform for the MS with no charge
+        """
+        try:
+            return self.transform(pH, I, T, self.zs.index(0))
+        except ValueError:
+            raise ValueError("The compound (%s) does not have a microspecies with 0 charge"
+                             % self.compound_id)
+    def get_species(self, major_ms_dG0_f, T):
+        """
+            Given the chemical formation energy of the major microspecies,
+            uses the pKa values to calculate the chemical formation energies
+            of all other species, and returns a list of dictionaries with
+            all the relevant data: dG0_f, nH, nMg, z (charge)
+        """
+        for i, (nH, z) in enumerate(zip(self.nHs, self.zs)):
+            dG0_f = major_ms_dG0_f + self._ddG(i, self.majorMSpH7, T)
+            d = {'phase': 'aqueous', 'dG0_f': np.round(dG0_f, 2),
+                 'nH': nH, 'z': z, 'nMg': 0}
+            yield d
+if __name__ == '__main__':
+    import sys, json
+    logger = logging.getLogger('')
+    logger.setLevel(logging.DEBUG)
+    from compound_cacher import CompoundCacher, CompoundEncoder
+    from molecule import Molecule, OpenBabelError
+    ccache = CompoundCacher(cache_fname=None)
+    for compound_id in ['C00087', 'C00282', 'C00237']:
+        comp = Compound.from_kegg(compound_id)
+        try:
+            mol = Molecule.FromInChI(str(comp.inchi))
+            sys.stderr.write('%s : formula = %s, nE = %s' %
+                             (str(comp.inchi), mol.GetFormula(), mol.GetNumElectrons()))
+        except OpenBabelError:
+            pass
+        ccache.add(comp)
+        sys.stderr.write('\ncompound id = %s, nH = %s, z = %s, pKa = %s, bag = %s\n\n\n' %
+                         (compound_id, str(comp.nHs), str(comp.zs), str(comp.pKas), str(comp.atom_bag)))
+    ccache.dump()

CC/compound_cacher.py ADDED Viewed

	@@ -0,0 +1,202 @@

+import json, os, logging, csv, gzip, numpy, pdb
+from compound import Compound
+base_path = os.path.split(os.path.realpath(__file__))[0]
+### Input Files:
+# original version of the KEGG compound file
+OLD_COMPOUND_JSON_FNAME = os.path.join(base_path, './data_cc/equilibrator_compounds.json.gz')
+# a CSV file with additional names and InChIs (mostly compounds missing from KEGG
+# and added manually)
+KEGG_ADDITIONS_TSV_FNAME = os.path.join(base_path, './data_cc/kegg_additions.tsv')
+### Files created by this module:
+# names and InChIs only
+KEGG_COMPOUND_JSON_FNAME = os.path.join(base_path, './data_cc/kegg_compounds.json.gz')
+# names, InChIs and pKa data
+DEFAULT_CACHE_FNAME = os.path.join(base_path, './data_cc/compounds.json.gz')
+class CompoundEncoder(json.JSONEncoder):
+    def default(self, obj):
+        if (isinstance(obj, Compound)):
+            return obj.to_json_dict()
+        return json.JSONEncoder.default(self, obj)
+class Singleton(type):
+    def __init__(cls,name,bases,dic):
+        super(Singleton,cls).__init__(name,bases,dic)
+        cls.instance=None
+    def __call__(cls,*args,**kw):
+        if cls.instance is None:
+            cls.instance=super(Singleton,cls).__call__(*args,**kw)
+        return cls.instance
+class CompoundCacher(object, metaclass=Singleton):
+    """
+        CompoundCacher is a singleton that handles caching of Compound objects
+        for the component-contribution package. The Compounds are retrieved by
+        their ID (which is the KEGG ID in most cases).
+        The first time a Compound is requested, it is obtained from the relevant
+        database and a Compound object is created (this takes a while because
+        it usually involves internet communication and then invoking the ChemAxon
+        plugin for calculating the pKa values for that structure).
+        Any further request for the same Compound ID will draw the object from
+        the cache. When the method dump() is called, all cached data is written
+        to a file that will be loaded in future python sessions.
+    """
+    def __init__(self, cache_fname=None):
+        self.cache_fname = cache_fname
+        if self.cache_fname is None:
+            self.cache_fname = DEFAULT_CACHE_FNAME
+        compounds = json.load(gzip.open(KEGG_COMPOUND_JSON_FNAME, 'r'))
+        self.compound_id2inchi = { d['compound_id']: d['inchi']
+                                   for d in compounds }
+        self.need_to_update_cache_file = False
+        self.load()
+    def get_all_compound_ids(self):
+        return sorted(self.compound_id2inchi.keys())
+    def load(self):
+        # parse the JSON cache file and store in a dictionary 'compound_dict'
+        self.compound_dict = {}
+        self.compound_ids = []
+        if os.path.exists(self.cache_fname):
+            for d in json.load(gzip.open(self.cache_fname, 'r')):
+                self.compound_ids.append(d['compound_id'])
+                self.compound_dict[d['compound_id']] = Compound.from_json_dict(d)
+    def dump(self):
+        if self.need_to_update_cache_file:
+            fp = gzip.open(self.cache_fname, 'w')
+            data = sorted(list(self.compound_dict.values()),
+                          key=lambda d:d.compound_id)
+            dict_data = [x.to_json_dict() for x in data]
+            json.dump(dict_data, fp, cls=CompoundEncoder,
+                      sort_keys=True, indent=4,  separators=(',', ': '))
+            fp.close()
+            self.need_to_update_cache_file = False
+    def get_compound(self, compound_id, kegg_additions_cids=None):
+        if compound_id not in self.compound_dict:
+            logging.debug('Cache miss: %s' % str(compound_id))
+            inchi = self.compound_id2inchi[compound_id]
+            comp = Compound.from_inchi('KEGG', compound_id, inchi)
+            self.add(comp)
+        #if a compound id is in the kegg_additions.tsv
+        #remove the one in cache, and replace it with new one
+        else:
+            if kegg_additions_cids is not None:
+                if compound_id in kegg_additions_cids:
+                    self.remove(compound_id)
+                    logging.debug('Cache update: %s' % str(compound_id))
+                    inchi = self.compound_id2inchi[compound_id]
+                    comp = Compound.from_inchi('KEGG', compound_id, inchi)
+                    self.add(comp)
+        logging.debug('Cache hit: %s' % str(compound_id))
+        return self.compound_dict[compound_id]
+    def remove(self, compound_id):
+        if compound_id in self.compound_dict:
+            del self.compound_dict[compound_id]
+        else:
+            logging.debug('%s is not cached, cannot remove it' % str(compound_id))
+    def add(self, comp):
+        self.compound_dict[comp.compound_id] = comp
+        self.need_to_update_cache_file = True
+    def get_element_matrix(self, compound_ids):
+        if type(compound_ids) == str:
+            compound_ids = [compound_ids]
+        # gather the "atom bags" of all compounds in a list 'atom_bag_list'
+        elements = set()
+        atom_bag_list = []
+        for compound_id in compound_ids:
+            comp = self.get_compound(compound_id)
+            atom_bag = comp.atom_bag
+            if atom_bag is not None:
+                elements = elements.union(list(atom_bag.keys()))
+            atom_bag_list.append(atom_bag)
+        elements.discard('H') # don't balance H (it's enough to balance e-)
+        elements = sorted(elements)
+        # create the elemental matrix, where each row is a compound and each
+        # column is an element (or e-)
+        Ematrix = numpy.matrix(numpy.zeros((len(atom_bag_list), len(elements))))
+        for i, atom_bag in enumerate(atom_bag_list):
+            if atom_bag is None:
+                Ematrix[i, :] = numpy.nan
+            else:
+                for j, elem in enumerate(elements):
+                    Ematrix[i, j] = atom_bag.get(elem, 0)
+        return elements, Ematrix
+###############################################################################
+    @staticmethod
+    def RebuildCompoundJSON():
+        kegg_dict = {}
+        for d in json.load(gzip.open(OLD_COMPOUND_JSON_FNAME, 'r')):
+            cid = d['CID']
+            kegg_dict[cid] = {'compound_id': cid,
+                              'name': d['name'],
+                              'names': d['names'],
+                              'inchi': d['InChI']}
+        # override some of the compounds or add new ones with 'fake' IDs,
+        # i.e. C80000 or higher.
+        kegg_additions_cids = []
+        for d in csv.DictReader(open(KEGG_ADDITIONS_TSV_FNAME, 'r'),
+                                delimiter='\t'):
+            cid = 'C%05d' % int(d['cid'])
+            kegg_additions_cids.append(cid)
+            kegg_dict[cid] = {'compound_id': cid,
+                              'name': d['name'],
+                              'names': [d['name']],
+                              'inchi': d['inchi']}
+        compound_json = [kegg_dict[compound_id] for compound_id in sorted(kegg_dict.keys())]
+        new_json = gzip.open(KEGG_COMPOUND_JSON_FNAME, 'w')
+        json.dump(compound_json, new_json, sort_keys=True, indent=4)
+        new_json.close()
+        return kegg_additions_cids
+###############################################################################
+    @staticmethod
+    def BuildCache(start_from_scratch=False, kegg_additions_cids=None):
+        if start_from_scratch and os.path.exists(DEFAULT_CACHE_FNAME):
+            os.remove(DEFAULT_CACHE_FNAME)
+        ccache = CompoundCacher(cache_fname=DEFAULT_CACHE_FNAME)
+        i = 0
+        for compound_id in ccache.get_all_compound_ids():
+            logging.debug('Caching %s' % compound_id)
+            comp = ccache.get_compound(compound_id, kegg_additions_cids=kegg_additions_cids)
+            logging.debug(str(comp))
+            i += 1
+            if i % 100 == 0:
+                logging.debug('Dumping Cache ...')
+                ccache.dump()
+        ccache.dump()
+###############################################################################
+if __name__ == '__main__':
+    logger = logging.getLogger('')
+    #logger.setLevel(logging.WARNING)
+    logger.setLevel(logging.DEBUG)
+    kegg_additions_cids = CompoundCacher.RebuildCompoundJSON()
+    CompoundCacher.BuildCache(start_from_scratch=False, kegg_additions_cids=kegg_additions_cids)

CC/molecule.py ADDED Viewed

	@@ -0,0 +1,292 @@

+from openbabel import openbabel
+import types
+import re
+import chemaxon
+from thermodynamic_constants import default_T, default_pH
+import pdb
+class OpenBabelError(Exception):
+    pass
+class Molecule(object):
+    # for more rendering options visit:
+    # http://www.ggasoftware.com/opensource/indigo/api/options#rendering
+    _obElements = openbabel.OBElementTable()
+    _obSmarts = openbabel.OBSmartsPattern()
+    @staticmethod
+    def GetNumberOfElements():
+        return Molecule._obElements.GetNumberOfElements()
+    @staticmethod
+    def GetAllElements():
+        return [Molecule._obElements.GetSymbol(i) for i in
+                range(Molecule.GetNumberOfElements())]
+    @staticmethod
+    def GetSymbol(atomic_num):
+        return Molecule._obElements.GetSymbol(atomic_num)
+    @staticmethod
+    def GetAtomicNum(elem):
+        if type(elem) == str:
+            elem = str(elem)
+        return Molecule._obElements.GetAtomicNum(elem)
+    @staticmethod
+    def VerifySmarts(smarts):
+        return Molecule._obSmarts.Init(smarts)
+    def __init__(self):
+        self.title = None
+        self.obmol = openbabel.OBMol()
+        self.smiles = None
+        self.inchi = None
+    def __str__(self):
+        return self.title or self.smiles or self.inchi or ""
+    def __len__(self):
+        return self.GetNumAtoms()
+    def Clone(self):
+        tmp = Molecule()
+        tmp.title = self.title
+        tmp.obmol = openbabel.OBMol(self.obmol)
+        tmp.smiles = self.smiles
+        tmp.inchi = self.inchi
+        return tmp
+    def SetTitle(self, title):
+        self.title = title
+    @staticmethod
+    def FromSmiles(smiles):
+        m = Molecule()
+        m.smiles = smiles
+        obConversion = openbabel.OBConversion()
+        obConversion.AddOption("w", obConversion.OUTOPTIONS)
+        obConversion.SetInFormat("smiles")
+        if not obConversion.ReadString(m.obmol, m.smiles):
+            raise OpenBabelError("Cannot read the SMILES string: " + smiles)
+        try:
+            m.UpdateSmiles()
+            #m.UpdateInChI()
+        except OpenBabelError:
+            raise OpenBabelError("Failed to create Molecule from SMILES: " + smiles)
+        m.SetTitle(smiles)
+        return m
+    @staticmethod
+    def FromInChI(inchi):
+        m = Molecule()
+        m.inchi = inchi
+        obConversion = openbabel.OBConversion()
+        obConversion.AddOption("w", obConversion.OUTOPTIONS)
+        obConversion.SetInFormat("inchi")
+        obConversion.ReadString(m.obmol, m.inchi)
+        try:
+            m.UpdateInChI()
+            #m.UpdateSmiles()
+        except OpenBabelError:
+            raise OpenBabelError("Failed to create Molecule from InChI: " + inchi)
+        m.SetTitle(inchi)
+        return m
+    @staticmethod
+    def FromMol(mol):
+        m = Molecule()
+        obConversion = openbabel.OBConversion()
+        obConversion.AddOption("w", obConversion.OUTOPTIONS)
+        obConversion.SetInFormat("mol")
+        obConversion.ReadString(m.obmol, mol)
+        try:
+            m.UpdateInChI()
+            m.UpdateSmiles()
+        except OpenBabelError:
+            raise OpenBabelError("Failed to create Molecule from MOL file:\n" + mol)
+        m.SetTitle("")
+        return m
+    @staticmethod
+    def FromOBMol(obmol):
+        m = Molecule()
+        m.obmol = obmol
+        try:
+            m.UpdateInChI()
+            m.UpdateSmiles()
+        except OpenBabelError:
+            raise OpenBabelError("Failed to create Molecule from OBMol")
+        m.SetTitle("")
+        return m
+    @staticmethod
+    def _FromFormat(s, fmt='inchi'):
+        if fmt == 'smiles' or fmt == 'smi':
+            return Molecule.FromSmiles(s)
+        if fmt == 'inchi':
+            return Molecule.FromInChI(s)
+        if fmt == 'mol':
+            return Molecule.FromMol(s)
+        if fmt == 'obmol':
+            return Molecule.FromOBMol(s)
+    @staticmethod
+    def _ToFormat(obmol, fmt='inchi'):
+        #print('formatting started...')
+        #pdb.set_trace()
+        obConversion = openbabel.OBConversion()
+        obConversion.AddOption("w", obConversion.OUTOPTIONS)
+        obConversion.SetOutFormat(fmt)
+        res = obConversion.WriteString(obmol)
+        #print('res :::: ')
+        #print(res)
+        if not res:
+            raise OpenBabelError("Cannot convert OBMol to %s" % fmt)
+        if fmt == 'smiles' or fmt == 'smi':
+            #print('I am in')
+            res = res.split()
+            if res == []:
+                raise OpenBabelError("Cannot convert OBMol to %s" % fmt)
+            else:
+                return res[0]
+        elif fmt == 'inchi':
+            return res.strip()
+        else:
+            return res
+    @staticmethod
+    def Smiles2InChI(smiles):
+        obConversion = openbabel.OBConversion()
+        obConversion.AddOption("w", obConversion.OUTOPTIONS)
+        obConversion.SetInAndOutFormats("smiles", "inchi")
+        obmol = openbabel.OBMol()
+        if not obConversion.ReadString(obmol, smiles):
+            raise OpenBabelError("Cannot read the SMILES string: " + smiles)
+        return obConversion.WriteString(obmol).strip()
+    @staticmethod
+    def InChI2Smiles(inchi):
+        obConversion = openbabel.OBConversion()
+        obConversion.AddOption("w", obConversion.OUTOPTIONS)
+        obConversion.SetInAndOutFormats("inchi", "smiles")
+        obmol = openbabel.OBMol()
+        if not obConversion.ReadString(obmol, inchi):
+            raise OpenBabelError("Cannot read the InChI string: " + inchi)
+        return obConversion.WriteString(obmol).split()[0]
+    def RemoveHydrogens(self):
+        self.obmol.DeleteHydrogens()
+    def RemoveAtoms(self, indices):
+        self.obmol.BeginModify()
+        for i in sorted(indices, reverse=True):
+            self.obmol.DeleteAtom(self.obmol.GetAtom(i+1))
+        self.obmol.EndModify()
+        self.smiles = None
+        self.inchi = None
+    def SetAtomicNum(self, index, new_atomic_num):
+        self.obmol.GetAtom(index+1).SetAtomicNum(new_atomic_num)
+        self.smiles = None
+        self.inchi = None
+    def ToOBMol(self):
+        return self.obmol
+    def ToFormat(self, fmt='inchi'):
+        return Molecule._ToFormat(self.obmol, fmt=fmt)
+    def ToMolfile(self):
+        return self.ToFormat('mol')
+    def UpdateInChI(self):
+        self.inchi = Molecule._ToFormat(self.obmol, 'inchi')
+    def ToInChI(self):
+        """
+            Lazy storage of the InChI identifier (calculate once only when
+            asked for and store for later use).
+        """
+        if not self.inchi:
+            self.UpdateInChI()
+        return self.inchi
+    def UpdateSmiles(self):
+        self.smiles = Molecule._ToFormat(self.obmol, 'smiles')
+    def ToSmiles(self):
+        """
+            Lazy storage of the SMILES identifier (calculate once only when
+            asked for and store for later use).
+        """
+        if not self.smiles:
+            self.UpdateSmiles()
+        return self.smiles
+    def GetFormula(self):
+        tokens = re.findall('InChI=1S?/([0-9A-Za-z\.]+)', self.ToInChI())
+        if len(tokens) == 1:
+            return tokens[0]
+        elif len(tokens) > 1:
+            raise ValueError('Bad InChI: ' + self.ToInChI())
+        else:
+            return ''
+    def GetExactMass(self):
+        return self.obmol.GetExactMass()
+    def GetAtomBagAndCharge(self):
+        inchi = self.ToInChI()
+        atom_bag, major_ms_charge = chemaxon.GetAtomBagAndCharge(inchi)
+        return atom_bag, major_ms_charge
+    def GetHydrogensAndCharge(self):
+        atom_bag, charge = self.GetAtomBagAndCharge()
+        return atom_bag.get('H', 0), charge
+    def GetNumElectrons(self):
+        """Calculates the number of electrons in a given molecule."""
+        atom_bag, fixed_charge = self.GetAtomBagAndCharge()
+        return atom_bag.get('e-', 0)
+    def GetNumAtoms(self):
+        return self.obmol.NumAtoms()
+    def GetAtoms(self):
+        return [self.obmol.GetAtom(i+1) for i in range(self.obmol.NumAtoms())]
+    def FindSmarts(self, smarts):
+        """
+        Corrects the pyBel version of Smarts.findall() which returns results as tuples,
+        with 1-based indices even though Molecule.atoms is 0-based.
+        Args:
+            mol: the molecule to search in.
+            smarts_str: the SMARTS query to search for.
+        Returns:
+            The re-mapped list of SMARTS matches.
+        """
+        Molecule._obSmarts.Init(smarts)
+        if Molecule._obSmarts.Match(self.obmol):
+            match_list = Molecule._obSmarts.GetMapList()
+            shift_left = lambda m: [(n - 1) for n in m]
+            return list(map(shift_left, match_list))
+        else:
+            return []
+    def GetAtomCharges(self):
+        """
+            Returns:
+                A list of charges, according to the number of atoms
+                in the molecule
+        """
+        return [atom.GetFormalCharge() for atom in self.GetAtoms()]
+if __name__ == '__main__':
+    mol = Molecule.FromInChI('InChI=1/C5H10O2/c1-3-5(6)7-4-2/h3-4H2,1-2H3')
+    #mol = Molecule.FromInChI('InChI=1S/H2/h1H')
+    print(mol.GetExactMass())

CC/thermodynamic_constants.py ADDED Viewed

	@@ -0,0 +1,36 @@

+R = 8.31e-3 # kJ/(K*mol)
+F = 96.485 # kC/mol
+J_per_cal = 4.184
+default_T = 298.15 # K
+default_I = 0.25 # M
+default_pH = 7.0
+default_c0 = 1 # M
+default_pMg = 10
+default_RT = R * default_T
+default_c_mid = 1e-3 # M
+default_c_range = (1e-6, 1e-2) # M
+dG0_f_Mg = -455.3 # kJ/mol, formation energy of Mg2+
+symbol_d_G = "&Delta;G"
+symbol_d_G0 = "&Delta;G&deg;"
+symbol_d_G_prime = "&Delta;G'"
+symbol_d_G0_prime = "&Delta;G'&deg;"
+symbol_dr_G = "&Delta;<sub>r</sub>G"
+symbol_dr_G0 = "&Delta;<sub>r</sub>G&deg;"
+symbol_dr_G_prime = "&Delta;<sub>r</sub>G'"
+symbol_dr_G0_prime = "&Delta;<sub>r</sub>G'&deg;"
+symbol_dr_Gc_prime = "&Delta;<sub>r</sub>G'<sup>c</sup>"
+symbol_df_G = "&Delta;<sub>f</sub>G"
+symbol_df_G0 = "&Delta;<sub>f</sub>G&deg;"
+symbol_df_G_prime = "&Delta;<sub>f</sub>G'"
+symbol_df_G0_prime = "&Delta;<sub>f</sub>G'&deg;"
+# Approximation of the temperature dependency of ionic strength effects
+DH_alpha = lambda T : 1e-3*(9.20483*T) - 1e-5*(1.284668 * T**2) + 1e-8*(4.95199 * T**3)
+DH_beta = 1.6
+# Debye-Huckel
+debye_huckel = lambda I_T : DH_alpha(I_T[1]) * I_T[0]**(0.5) / (1.0 + DH_beta * I_T[0]**(0.5))