fatmacankara committed on
Commit
ed603e1
·
1 Parent(s): 082f385

Update code/pdb_featureVector.py

Browse files
Files changed (1) hide show
  1. code/pdb_featureVector.py +41 -31
code/pdb_featureVector.py CHANGED
@@ -230,35 +230,33 @@ def pdb(input_set, mode, impute):
230
  modbase = no_swiss_models.copy()
231
  print('Proceeding to Modbase search...')
232
  print('------------------------------------\n')
233
- if len(modbase) > 0:
234
- modbase = modbase[SIMPLE_COLS]
235
- modbase.replace({'[]': np.NaN, 'nan-nan': np.NaN, '': np.NaN}, inplace=True)
236
- modbase = modbase.fillna(np.NaN)
237
- print('\n>> Adding Modbase residue positions...\n')
238
- modbase_simple = modbase[['uniprotID', 'wt', 'pos', 'mut','datapoint']]
239
- modbase_simple = modbase_simple.drop_duplicates(['uniprotID', 'wt', 'pos' ,'mut','datapoint'])
240
- modbaseOut, no_modbase_models_updated = addModbaseModels(modbase_simple, path_to_input_files, path_to_output_files)
241
 
242
- if len(modbaseOut) > 0:
243
- modbase = modbase.merge(modbaseOut, on = ['uniprotID', 'wt', 'pos', 'mut','datapoint'], how = 'left')
244
- no_modbase_models_updated['sasa'] = np.NaN
245
- modbase.reset_index(inplace=True, drop=True)
246
- no_modbase_add = modbase[pd.isna(modbase.coordinates)]
247
- modbase = modbase[~pd.isna(modbase.coordinates)]
248
- no_modbase_models_updated = pd.concat([no_modbase_models_updated, no_modbase_add])
249
- print('\n>> Mapping to Modbase models...\n')
250
- modbase = changeUPtoModels(modbase)
251
- print('\n>> Calculating 3D distances for Modbase models...\n')
252
- modbase = isZeroDistance(modbase)
253
- modbase = match3DModels(modbase)
254
- modbase = selectMaxAnnot(modbase)
255
- modbase = modbase.sort_values(by=['datapoint', 'quality_score', 'distance','hitTotal', 'annotTotal'], ascending=[True, True, True, True, True])
256
- modbase = modbase.drop_duplicates(['datapoint'])
257
- modbase.replace({'[]': np.NaN, 'hit': 0.0}, inplace=True)
258
- else:
259
- modbase = pd.DataFrame(columns = SIMPLE_COLS)
260
  else:
261
- modbase = modbase[SIMPLE_COLS]
262
 
263
  else:
264
  no_modbase_models_updated = pd.DataFrame()
@@ -272,7 +270,15 @@ def pdb(input_set, mode, impute):
272
  'lipidationBinary', 'siteBinary', 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary', 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
273
  'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary', 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary', 'modifiedResidueBinary', 'zincFingerBinary',
274
  'motifBinary', 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary', 'glycosylationBinary', 'propeptideBinary', 'sasa']
 
 
 
 
 
 
275
  no_modbase_models_updated = no_modbase_models_updated.drop_duplicates()
 
 
276
  if len(pdb)>0:
277
  pdb = pdb[COLS]
278
  pdb['Source'] = 'PDB'
@@ -288,8 +294,7 @@ def pdb(input_set, mode, impute):
288
  modbase['Source'] = 'Modbase'
289
  else:
290
  modbase = pd.DataFrame()
291
- if len(no_modbase_models_updated) == 0:
292
- no_modbase_models_updated = pd.DataFrame()
293
 
294
  # st.write('======PDB==========')
295
  # st.write(pdb.to_string())
@@ -462,10 +467,15 @@ def pdb(input_set, mode, impute):
462
  hours, rem = divmod(end - start, 3600)
463
  minutes, seconds = divmod(rem, 60)
464
  print("Time passed: {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
 
 
 
 
 
465
 
466
  return final_data
467
  elif len(no_modbase_models_updated) >0 and (len(no_modbase_models_updated) !=org_len):
468
  st.write(f'{len(no_modbase_models_updated)} of {org_len} datapoins could not be mapped to any structures.')
469
- st.write(f'{org_len-len(no_modbase_models_updated)} of {org_len} datapoins could not be mapped to any structures.')
470
  elif len(no_modbase_models_updated) == org_len:
471
- st.write(f'0 of {org_len} datapoins could not be mapped to any structures. Feature vector could not be created.')
 
230
  modbase = no_swiss_models.copy()
231
  print('Proceeding to Modbase search...')
232
  print('------------------------------------\n')
233
+
234
+ modbase = modbase[SIMPLE_COLS]
235
+ modbase.replace({'[]': np.NaN, 'nan-nan': np.NaN, '': np.NaN}, inplace=True)
236
+ modbase = modbase.fillna(np.NaN)
237
+ print('\n>> Adding Modbase residue positions...\n')
238
+ modbase_simple = modbase[['uniprotID', 'wt', 'pos', 'mut','datapoint']]
239
+ modbase_simple = modbase_simple.drop_duplicates(['uniprotID', 'wt', 'pos' ,'mut','datapoint'])
240
+ modbaseOut, no_modbase_models_updated = addModbaseModels(modbase_simple, path_to_input_files, path_to_output_files)
241
 
242
+ if len(modbaseOut) > 0:
243
+ modbase = modbase.merge(modbaseOut, on = ['uniprotID', 'wt', 'pos', 'mut','datapoint'], how = 'left')
244
+ no_modbase_models_updated['sasa'] = np.NaN
245
+ modbase.reset_index(inplace=True, drop=True)
246
+ no_modbase_add = modbase[pd.isna(modbase.coordinates)]
247
+ modbase = modbase[~pd.isna(modbase.coordinates)]
248
+ no_modbase_models_updated = pd.concat([no_modbase_models_updated, no_modbase_add])
249
+ print('\n>> Mapping to Modbase models...\n')
250
+ modbase = changeUPtoModels(modbase)
251
+ print('\n>> Calculating 3D distances for Modbase models...\n')
252
+ modbase = isZeroDistance(modbase)
253
+ modbase = match3DModels(modbase)
254
+ modbase = selectMaxAnnot(modbase)
255
+ modbase = modbase.sort_values(by=['datapoint', 'quality_score', 'distance','hitTotal', 'annotTotal'], ascending=[True, True, True, True, True])
256
+ modbase = modbase.drop_duplicates(['datapoint'])
257
+ modbase.replace({'[]': np.NaN, 'hit': 0.0}, inplace=True)
 
 
258
  else:
259
+ modbase = pd.DataFrame(columns = SIMPLE_COLS)
260
 
261
  else:
262
  no_modbase_models_updated = pd.DataFrame()
 
270
  'lipidationBinary', 'siteBinary', 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary', 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
271
  'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary', 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary', 'modifiedResidueBinary', 'zincFingerBinary',
272
  'motifBinary', 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary', 'glycosylationBinary', 'propeptideBinary', 'sasa']
273
+
274
+ if len(no_modbase_models_updated) == 0:
275
+ no_modbase_models_updated = pd.DataFrame()
276
+ no_modbase_models_updated = no_modbase_models_updated[~no_modbase_models_updated.datapoint.isin(modbase.datapoint.to_list())]
277
+ no_modbase_models_updated = no_modbase_models_updated[['uniprotID', 'wt', 'pos', 'mut', 'datapoint']]
278
+ no_modbase_models_updated.pos = no_modbase_models_updated.pos.astype(int)
279
  no_modbase_models_updated = no_modbase_models_updated.drop_duplicates()
280
+
281
+
282
  if len(pdb)>0:
283
  pdb = pdb[COLS]
284
  pdb['Source'] = 'PDB'
 
294
  modbase['Source'] = 'Modbase'
295
  else:
296
  modbase = pd.DataFrame()
297
+
 
298
 
299
  # st.write('======PDB==========')
300
  # st.write(pdb.to_string())
 
467
  hours, rem = divmod(end - start, 3600)
468
  minutes, seconds = divmod(rem, 60)
469
  print("Time passed: {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
470
+ if len(no_modbase_models_updated) >0 and (len(no_modbase_models_updated) !=org_len):
471
+ st.write(f'{len(no_modbase_models_updated)} of {org_len} datapoins could not be mapped to any structures.')
472
+ st.write(f'{org_len-len(no_modbase_models_updated)} of {org_len} datapoins were mapped to a structure.')
473
+ elif len(no_modbase_models_updated) == org_len:
474
+ st.write(f'0 of {org_len} datapoins could not be mapped to any structures. Feature vector could not be created.')
475
 
476
  return final_data
477
  elif len(no_modbase_models_updated) >0 and (len(no_modbase_models_updated) !=org_len):
478
  st.write(f'{len(no_modbase_models_updated)} of {org_len} datapoins could not be mapped to any structures.')
479
+ st.write(f'{org_len-len(no_modbase_models_updated)} of {org_len} datapoins were mapped to a structure.')
480
  elif len(no_modbase_models_updated) == org_len:
481
+ st.write(f'0 of {org_len} datapoins could not be mapped to any structures. Feature vector could not be created.')