Spaces:

vuu10
/

dGPredictor

Runtime error

App Files Files Community

dGPredictor / decompose_groups.py

vuu10

Upload 8 files

66061aa over 1 year ago

raw

history blame

No virus

8.21 kB

	import pandas as pd
	import pdb
	import json
	from rdkit import Chem

	def count_substructures(radius,molecule):
	    """Count the atom-centred substructures (relaxed signature) of a molecule.

	    Every atom of the molecule is taken in turn as a centre. The bonds
	    reported by ``Chem.FindAtomEnvironmentOfRadiusN`` for that centre, plus
	    the atoms at the ends of those bonds, are written out as a canonical
	    fragment SMILES, and the occurrences of each SMILES are tallied.

	    Parameters
	    ----------
	    radius : int
	        bond-distance that defines how many neighbor atoms are considered
	        around each centre atom.
	    molecule : Molecule
	        a molecule object created by RDKit (e.g.
	        Chem.MolFromInchi(inchi_code) or Chem.MolFromSmiles(smiles_code)).

	    Returns
	    -------
	    dict
	        {fragment_smiles: number of atoms whose environment is that fragment}
	    """
	    signature = {}

	    for center in range(molecule.GetNumAtoms()):
	        bond_ids = Chem.FindAtomEnvironmentOfRadiusN(molecule, radius, center)

	        # Collect both end atoms of every bond in the environment.
	        members = set()
	        for bond_id in bond_ids:
	            bond = molecule.GetBondWithIdx(bond_id)
	            members.update((bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()))

	        # No bond in the environment (such as O in H2O): the fragment is
	        # the centre atom on its own.
	        if not members:
	            members = {center}

	        fragment = Chem.MolFragmentToSmiles(
	            molecule,
	            atomsToUse=list(members),
	            bondsToUse=bond_ids,
	            canonical=True,
	        )
	        signature[fragment] = signature.get(fragment, 0) + 1

	    return signature

	def decompse_ac(db_smiles,radius=1):
	    """Decompose every compound into substructure counts and save them as JSON.

	    Each SMILES string is parsed with RDKit, stripped of explicit hydrogens
	    and passed to ``count_substructures``. The resulting
	    ``{compound_id: {fragment_smiles: count}}`` mapping is written to
	    ``./data/decompose_vector_ac.json``.

	    Parameters
	    ----------
	    db_smiles : dict
	        {compound_id: smiles_string}
	    radius : int
	        bond-distance forwarded to ``count_substructures``.

	    Returns
	    -------
	    list
	        the compound ids that could not be decomposed (unparsable SMILES
	        or any error raised while processing them). These ids are absent
	        from the JSON file.
	    """
	    non_decomposable = []
	    decompose_vector = dict()

	    for cid in db_smiles:
	        smiles_pH7 = db_smiles[cid]
	        try:
	            mol = Chem.MolFromSmiles(smiles_pH7)
	            if mol is None:
	                # RDKit signals an unparsable SMILES by returning None.
	                non_decomposable.append(cid)
	                continue
	            mol = Chem.RemoveHs(mol)
	            decompose_vector[cid] = count_substructures(radius, mol)
	        except Exception:
	            # Best effort by design: one bad compound must not abort the
	            # whole run, but it is recorded and reported to the caller.
	            non_decomposable.append(cid)

	    with open('./data/decompose_vector_ac.json','w') as fp:
	        json.dump(decompose_vector,fp)

	    return non_decomposable

	def _load_json(path):
	    """Parse the JSON file at ``path`` and close the file afterwards."""
	    with open(path) as fp:
	        return json.load(fp)


	def _write_rxn_rules(signature_path, output_path, tecrdb_mets_path=None):
	    """Build the reaction-rule table shared by all ``get_rxn_rule*`` variants.

	    A reaction rule is the stoichiometry-weighted sum of the molecular
	    signatures (substructure counts) of the metabolites in a reaction.

	    Parameters
	    ----------
	    signature_path : str
	        JSON file holding {metabolite_id: {fragment_smiles: count}}.
	    output_path : str
	        CSV file to write; rows are fragments, columns are reaction ids.
	    tecrdb_mets_path : str or None
	        when None, the hydrogen ids C00080 and C00282 are left out of the
	        sums. Otherwise a header-less text file whose first column lists
	        metabolite ids; those ids are accepted as known metabolites and
	        left out of the sums instead.

	    Returns
	    -------
	    None
	        the table is saved to ``output_path``.
	    """
	    hydrogen_ids = ("C00080", "C00282")

	    reaction_dict = _load_json('./data/optstoic_v3_Sji_dict.json')
	    molsigna_df = pd.DataFrame.from_dict(_load_json(signature_path)).fillna(0)

	    # Sets give O(1) membership tests inside the reaction loop.
	    signature_mets = set(molsigna_df.columns)
	    known_mets = signature_mets.union(hydrogen_ids)
	    if tecrdb_mets_path is None:
	        skipped_mets = set(hydrogen_ids)
	    else:
	        mets_TECRDB_df = pd.read_csv(tecrdb_mets_path, header=None)
	        skipped_mets = set(mets_TECRDB_df[0].tolist())
	        known_mets |= skipped_mets

	    rule_columns = {}
	    for rid, stoichiometry in reaction_dict.items():
	        # skip the reactions with missing metabolites
	        if any(met not in known_mets for met in stoichiometry):
	            continue

	        rule = 0
	        for met, stoic in stoichiometry.items():
	            # A metabolite without a signature column can only be one of
	            # the whitelisted hydrogen ids; it contributes nothing rather
	            # than raising KeyError on the column lookup.
	            if met in skipped_mets or met not in signature_mets:
	                continue
	            rule = rule + molsigna_df[met] * stoic
	        rule_columns[rid] = rule

	    # Build the frame once instead of inserting one column per reaction,
	    # which fragments the DataFrame. A reaction with no contributing
	    # metabolite keeps the scalar 0, broadcast to an all-zero column.
	    rule_df = pd.DataFrame(rule_columns, index=molsigna_df.index)
	    rule_df.to_csv(output_path, index=True)


	def get_rxn_rule():
	    """calculate reaction rules based on the relaxed molecular signatures.

	    Reads ./data/decompose_vector_ac.json; hydrogen (C00080, C00282) is
	    counted as zero.

	    Returns
	    -------
	    None
	        All of the reaction rules are saved in ./data/reaction_rule.csv
	    """
	    _write_rxn_rules('./data/decompose_vector_ac.json',
	                     "./data/reaction_rule.csv")


	def get_rxn_rule_no_stero():
	    """calculate reaction rules based on the no-stereo molecular signatures.

	    Reads ./data/decompose_vector_ac_nostereo.json; hydrogen (C00080,
	    C00282) is counted as zero.

	    Returns
	    -------
	    None
	        All of the reaction rules are saved in
	        ./data/reaction_rule_no_stero.csv
	    """
	    _write_rxn_rules('./data/decompose_vector_ac_nostereo.json',
	                     "./data/reaction_rule_no_stero.csv")


	def get_rxn_rule_remove_TECRDB_mets():
	    """calculate reaction rules, leaving out the metabolites listed in
	    ./data/TECRBD_mets.txt.

	    Reads ./data/decompose_vector_ac.json. Listed metabolites are accepted
	    as known but contribute nothing to the rules.

	    Returns
	    -------
	    None
	        All of the reaction rules are saved in
	        ./data/reaction_rule_remove_TECRDB_mets.csv
	    """
	    _write_rxn_rules('./data/decompose_vector_ac.json',
	                     "./data/reaction_rule_remove_TECRDB_mets.csv",
	                     tecrdb_mets_path='./data/TECRBD_mets.txt')


	def get_rxn_rule_no_stero_remove_TECRDB_mets():
	    """calculate reaction rules from the no-stereo signatures, leaving out
	    the metabolites listed in ./data/TECRBD_mets.txt.

	    Reads ./data/decompose_vector_ac_nostereo.json. Listed metabolites are
	    accepted as known but contribute nothing to the rules.

	    Returns
	    -------
	    None
	        All of the reaction rules are saved in
	        ./data/reaction_rule_nostereo_remove_TECRDB_mets.csv
	    """
	    _write_rxn_rules('./data/decompose_vector_ac_nostereo.json',
	                     "./data/reaction_rule_nostereo_remove_TECRDB_mets.csv",
	                     tecrdb_mets_path='./data/TECRBD_mets.txt')



	if __name__ == "__main__":
	    # Script entry point. Only the last stage is currently enabled; the
	    # other stages are kept below, commented out, for reference.
	    #
	    # db = pd.read_csv('./data/cache_compounds_20160818.csv',index_col='compound_id')
	    # db_smiles = db['smiles_pH7'].to_dict()
	    # decompse_ac(db_smiles)
	    # get_rxn_rule()
	    # get_rxn_rule_remove_TECRDB_mets()
	    get_rxn_rule_no_stero_remove_TECRDB_mets()