import json
import pdb

import pandas as pd
from rdkit import Chem

# Reaction stoichiometry file shared by every rule-building entry point:
# {reaction_id: {metabolite_id: stoichiometric coefficient}}.
_REACTION_DICT_PATH = './data/optstoic_v3_Sji_dict.json'

# Metabolite ids that are always accepted in a reaction even when they have
# no molecular signature (the original code notes "hydrogen is zero").
_PROTON_IDS = ("C00080", "C00282")

# List of metabolite ids (one per row, no header) that the *_remove_TECRDB_mets
# variants accept in a reaction but exclude from the rule vector.
_TECRDB_METS_PATH = './data/TECRBD_mets.txt'


def count_substructures(radius, molecule):
    """Count the atom-centred substructures (relaxed signature) of a molecule.

    For every atom, the bond environment of the given radius is extracted
    and written as a canonical fragment SMILES; the function counts how many
    atoms produce each fragment.

    Parameters
    ----------
    radius : int
        Bond distance that defines how many neighbouring atoms are included
        in the environment around each atom.
    molecule : rdkit.Chem.Mol
        A molecule object created by RDKit (e.g. ``Chem.MolFromInchi(inchi)``
        or ``Chem.MolFromSmiles(smiles)``).

    Returns
    -------
    dict
        ``{fragment_smiles: count}`` for the molecule.
    """
    smi_count = {}
    for atom_idx in range(molecule.GetNumAtoms()):
        env = Chem.FindAtomEnvironmentOfRadiusN(molecule, radius, atom_idx)

        # Collect both end atoms of every bond in the environment.
        atoms = set()
        for bond_idx in env:
            bond = molecule.GetBondWithIdx(bond_idx)
            atoms.add(bond.GetBeginAtomIdx())
            atoms.add(bond.GetEndAtomIdx())

        # No bond was returned: only the centre atom is in this environment,
        # such as O in H2O.
        if not atoms:
            atoms = {atom_idx}

        smi = Chem.MolFragmentToSmiles(
            molecule,
            atomsToUse=sorted(atoms),
            bondsToUse=env,
            canonical=True,
        )
        smi_count[smi] = smi_count.get(smi, 0) + 1
    return smi_count


def decompse_ac(db_smiles, radius=1,
                output_path='./data/decompose_vector_ac.json'):
    """Decompose every compound into substructure counts and save them as JSON.

    Parameters
    ----------
    db_smiles : dict
        ``{compound_id: SMILES string}``.
    radius : int, optional
        Environment radius forwarded to :func:`count_substructures`.
    output_path : str, optional
        Destination of the JSON file ``{compound_id: {fragment_smiles: count}}``.
        Defaults to the path read by the rule-building functions.

    Returns
    -------
    list
        Ids of the compounds that could not be decomposed (unparsable SMILES
        or any RDKit failure). These compounds are absent from the JSON file.
    """
    non_decomposable = []
    decompose_vector = {}
    for cid, smiles_pH7 in db_smiles.items():
        try:
            mol = Chem.MolFromSmiles(smiles_pH7)
            if mol is None:
                # RDKit signals an unparsable SMILES by returning None.
                non_decomposable.append(cid)
                continue
            mol = Chem.RemoveHs(mol)
            # Chem.RemoveStereochemistry(mol)
            decompose_vector[cid] = count_substructures(radius, mol)
        except Exception:
            # Deliberately best-effort: one bad compound (e.g. a non-string
            # SMILES value or an RDKit error) must not abort the whole run.
            non_decomposable.append(cid)

    with open(output_path, 'w') as fp:
        json.dump(decompose_vector, fp)
    return non_decomposable


def _load_json(path):
    """Read a JSON file and close the handle afterwards."""
    with open(path) as fp:
        return json.load(fp)


def _build_rule_table(reaction_dict, molsigna_df, skipped_mets):
    """Return the reaction-rule table: one row per substructure, one column per reaction.

    A reaction is kept only when all of its metabolites have a signature
    column, are in ``skipped_mets``, or are one of ``_PROTON_IDS``. Each kept
    reaction's column is the stoichiometry-weighted sum of the signature
    columns of its non-skipped metabolites.
    """
    with_signature = set(molsigna_df.columns)
    skipped = set(skipped_mets)
    allowed = with_signature | skipped | set(_PROTON_IDS)

    # Integer zeros, so that a reaction whose metabolites are all skipped
    # still yields an all-zero integer column.
    zero = pd.Series(0, index=molsigna_df.index)

    rule_columns = {}
    for rid, stoichiometry in reaction_dict.items():
        # Skip the reactions with missing metabolites.
        if any(met not in allowed for met in stoichiometry):
            continue

        rule = zero
        for met, stoic in stoichiometry.items():
            # Skipped metabolites contribute nothing. A proton id that was
            # admitted without a signature column contributes nothing either
            # (indexing it would raise KeyError).
            if met in skipped or met not in with_signature:
                continue
            rule = rule + molsigna_df[met] * stoic
        rule_columns[rid] = rule

    # Assemble once instead of inserting one column at a time, which
    # fragments the DataFrame when there are many reactions.
    return pd.DataFrame(rule_columns, index=molsigna_df.index)


def _write_reaction_rules(signature_path, output_path, tecrdb_path=None):
    """Build the reaction rules from ``signature_path`` and save them as CSV.

    When ``tecrdb_path`` is None, the metabolites in ``_PROTON_IDS`` are
    excluded from the rule vectors; otherwise the metabolites listed in that
    file are excluded instead.
    """
    reaction_dict = _load_json(_REACTION_DICT_PATH)
    molecular_signature = _load_json(signature_path)
    molsigna_df = pd.DataFrame.from_dict(molecular_signature).fillna(0)

    if tecrdb_path is None:
        skipped_mets = _PROTON_IDS
    else:
        mets_TECRDB_df = pd.read_csv(tecrdb_path, header=None)
        skipped_mets = mets_TECRDB_df[0].tolist()

    rule_df = _build_rule_table(reaction_dict, molsigna_df, skipped_mets)
    rule_df.to_csv(output_path, index=True)


def get_rxn_rule():
    """Calculate reaction rules based on the relaxed molecular signatures.

    Reads ``./data/optstoic_v3_Sji_dict.json`` and
    ``./data/decompose_vector_ac.json``; C00080 and C00282 are excluded from
    the rule vectors.

    Returns
    -------
    None
        The reaction rules are saved to ``./data/reaction_rule.csv``.
    """
    _write_reaction_rules(
        './data/decompose_vector_ac.json',
        "./data/reaction_rule.csv",
    )


def get_rxn_rule_no_stero():
    """Calculate reaction rules from the signatures stored in the "nostereo" file.

    Same as :func:`get_rxn_rule` but reads
    ``./data/decompose_vector_ac_nostereo.json``.

    Returns
    -------
    None
        The reaction rules are saved to ``./data/reaction_rule_no_stero.csv``.
    """
    _write_reaction_rules(
        './data/decompose_vector_ac_nostereo.json',
        "./data/reaction_rule_no_stero.csv",
    )


def get_rxn_rule_remove_TECRDB_mets():
    """Calculate reaction rules, leaving out the metabolites listed for TECRDB.

    Reads ``./data/optstoic_v3_Sji_dict.json``,
    ``./data/decompose_vector_ac.json`` and ``./data/TECRBD_mets.txt``.
    Metabolites listed in the latter are accepted in a reaction but do not
    contribute to its rule vector.

    Returns
    -------
    None
        The reaction rules are saved to
        ``./data/reaction_rule_remove_TECRDB_mets.csv``.
    """
    _write_reaction_rules(
        './data/decompose_vector_ac.json',
        "./data/reaction_rule_remove_TECRDB_mets.csv",
        tecrdb_path=_TECRDB_METS_PATH,
    )


def get_rxn_rule_no_stero_remove_TECRDB_mets():
    """Calculate "nostereo" reaction rules, leaving out the TECRDB metabolites.

    Same as :func:`get_rxn_rule_remove_TECRDB_mets` but reads
    ``./data/decompose_vector_ac_nostereo.json``.

    Returns
    -------
    None
        The reaction rules are saved to
        ``./data/reaction_rule_nostereo_remove_TECRDB_mets.csv``.
    """
    _write_reaction_rules(
        './data/decompose_vector_ac_nostereo.json',
        "./data/reaction_rule_nostereo_remove_TECRDB_mets.csv",
        tecrdb_path=_TECRDB_METS_PATH,
    )


if __name__ == '__main__':
    # Earlier pipeline steps, kept for reference:
    # db = pd.read_csv('./data/cache_compounds_20160818.csv',index_col='compound_id')
    # db_smiles = db['smiles_pH7'].to_dict()
    # decompse_ac(db_smiles)
    # get_rxn_rule()
    # get_rxn_rule_remove_TECRDB_mets()
    get_rxn_rule_no_stero_remove_TECRDB_mets()