Spaces:
Running
Running
Commit
·
ed603e1
1
Parent(s):
082f385
Update code/pdb_featureVector.py
Browse files- code/pdb_featureVector.py +41 -31
code/pdb_featureVector.py
CHANGED
@@ -230,35 +230,33 @@ def pdb(input_set, mode, impute):
|
|
230 |
modbase = no_swiss_models.copy()
|
231 |
print('Proceeding to Modbase search...')
|
232 |
print('------------------------------------\n')
|
233 |
-
|
234 |
-
|
235 |
-
|
236 |
-
|
237 |
-
|
238 |
-
|
239 |
-
|
240 |
-
|
241 |
|
242 |
-
|
243 |
-
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
-
|
250 |
-
|
251 |
-
|
252 |
-
|
253 |
-
|
254 |
-
|
255 |
-
|
256 |
-
|
257 |
-
|
258 |
-
else:
|
259 |
-
modbase = pd.DataFrame(columns = SIMPLE_COLS)
|
260 |
else:
|
261 |
-
modbase =
|
262 |
|
263 |
else:
|
264 |
no_modbase_models_updated = pd.DataFrame()
|
@@ -272,7 +270,15 @@ def pdb(input_set, mode, impute):
|
|
272 |
'lipidationBinary', 'siteBinary', 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary', 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
|
273 |
'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary', 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary', 'modifiedResidueBinary', 'zincFingerBinary',
|
274 |
'motifBinary', 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary', 'glycosylationBinary', 'propeptideBinary', 'sasa']
|
|
|
|
|
|
|
|
|
|
|
|
|
275 |
no_modbase_models_updated = no_modbase_models_updated.drop_duplicates()
|
|
|
|
|
276 |
if len(pdb)>0:
|
277 |
pdb = pdb[COLS]
|
278 |
pdb['Source'] = 'PDB'
|
@@ -288,8 +294,7 @@ def pdb(input_set, mode, impute):
|
|
288 |
modbase['Source'] = 'Modbase'
|
289 |
else:
|
290 |
modbase = pd.DataFrame()
|
291 |
-
|
292 |
-
no_modbase_models_updated = pd.DataFrame()
|
293 |
|
294 |
# st.write('======PDB==========')
|
295 |
# st.write(pdb.to_string())
|
@@ -462,10 +467,15 @@ def pdb(input_set, mode, impute):
|
|
462 |
hours, rem = divmod(end - start, 3600)
|
463 |
minutes, seconds = divmod(rem, 60)
|
464 |
print("Time passed: {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
|
|
|
|
|
|
|
|
|
|
|
465 |
|
466 |
return final_data
|
467 |
elif len(no_modbase_models_updated) >0 and (len(no_modbase_models_updated) !=org_len):
|
468 |
st.write(f'{len(no_modbase_models_updated)} of {org_len} datapoins could not be mapped to any structures.')
|
469 |
-
st.write(f'{org_len-len(no_modbase_models_updated)} of {org_len} datapoins
|
470 |
elif len(no_modbase_models_updated) == org_len:
|
471 |
-
st.write(f'0 of {org_len} datapoins could not be mapped to any structures. Feature vector could not be created.')
|
|
|
230 |
modbase = no_swiss_models.copy()
|
231 |
print('Proceeding to Modbase search...')
|
232 |
print('------------------------------------\n')
|
233 |
+
|
234 |
+
modbase = modbase[SIMPLE_COLS]
|
235 |
+
modbase.replace({'[]': np.NaN, 'nan-nan': np.NaN, '': np.NaN}, inplace=True)
|
236 |
+
modbase = modbase.fillna(np.NaN)
|
237 |
+
print('\n>> Adding Modbase residue positions...\n')
|
238 |
+
modbase_simple = modbase[['uniprotID', 'wt', 'pos', 'mut','datapoint']]
|
239 |
+
modbase_simple = modbase_simple.drop_duplicates(['uniprotID', 'wt', 'pos' ,'mut','datapoint'])
|
240 |
+
modbaseOut, no_modbase_models_updated = addModbaseModels(modbase_simple, path_to_input_files, path_to_output_files)
|
241 |
|
242 |
+
if len(modbaseOut) > 0:
|
243 |
+
modbase = modbase.merge(modbaseOut, on = ['uniprotID', 'wt', 'pos', 'mut','datapoint'], how = 'left')
|
244 |
+
no_modbase_models_updated['sasa'] = np.NaN
|
245 |
+
modbase.reset_index(inplace=True, drop=True)
|
246 |
+
no_modbase_add = modbase[pd.isna(modbase.coordinates)]
|
247 |
+
modbase = modbase[~pd.isna(modbase.coordinates)]
|
248 |
+
no_modbase_models_updated = pd.concat([no_modbase_models_updated, no_modbase_add])
|
249 |
+
print('\n>> Mapping to Modbase models...\n')
|
250 |
+
modbase = changeUPtoModels(modbase)
|
251 |
+
print('\n>> Calculating 3D distances for Modbase models...\n')
|
252 |
+
modbase = isZeroDistance(modbase)
|
253 |
+
modbase = match3DModels(modbase)
|
254 |
+
modbase = selectMaxAnnot(modbase)
|
255 |
+
modbase = modbase.sort_values(by=['datapoint', 'quality_score', 'distance','hitTotal', 'annotTotal'], ascending=[True, True, True, True, True])
|
256 |
+
modbase = modbase.drop_duplicates(['datapoint'])
|
257 |
+
modbase.replace({'[]': np.NaN, 'hit': 0.0}, inplace=True)
|
|
|
|
|
258 |
else:
|
259 |
+
modbase = pd.DataFrame(columns = SIMPLE_COLS)
|
260 |
|
261 |
else:
|
262 |
no_modbase_models_updated = pd.DataFrame()
|
|
|
270 |
'lipidationBinary', 'siteBinary', 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary', 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
|
271 |
'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary', 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary', 'modifiedResidueBinary', 'zincFingerBinary',
|
272 |
'motifBinary', 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary', 'glycosylationBinary', 'propeptideBinary', 'sasa']
|
273 |
+
|
274 |
+
if len(no_modbase_models_updated) == 0:
|
275 |
+
no_modbase_models_updated = pd.DataFrame()
|
276 |
+
no_modbase_models_updated = no_modbase_models_updated[~no_modbase_models_updated.datapoint.isin(modbase.datapoint.to_list())]
|
277 |
+
no_modbase_models_updated = no_modbase_models_updated[['uniprotID', 'wt', 'pos', 'mut', 'datapoint']]
|
278 |
+
no_modbase_models_updated.pos = no_modbase_models_updated.pos.astype(int)
|
279 |
no_modbase_models_updated = no_modbase_models_updated.drop_duplicates()
|
280 |
+
|
281 |
+
|
282 |
if len(pdb)>0:
|
283 |
pdb = pdb[COLS]
|
284 |
pdb['Source'] = 'PDB'
|
|
|
294 |
modbase['Source'] = 'Modbase'
|
295 |
else:
|
296 |
modbase = pd.DataFrame()
|
297 |
+
|
|
|
298 |
|
299 |
# st.write('======PDB==========')
|
300 |
# st.write(pdb.to_string())
|
|
|
467 |
hours, rem = divmod(end - start, 3600)
|
468 |
minutes, seconds = divmod(rem, 60)
|
469 |
print("Time passed: {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
|
470 |
+
if len(no_modbase_models_updated) >0 and (len(no_modbase_models_updated) !=org_len):
|
471 |
+
st.write(f'{len(no_modbase_models_updated)} of {org_len} datapoins could not be mapped to any structures.')
|
472 |
+
st.write(f'{org_len-len(no_modbase_models_updated)} of {org_len} datapoins were mapped to a structure.')
|
473 |
+
elif len(no_modbase_models_updated) == org_len:
|
474 |
+
st.write(f'0 of {org_len} datapoins could not be mapped to any structures. Feature vector could not be created.')
|
475 |
|
476 |
return final_data
|
477 |
elif len(no_modbase_models_updated) >0 and (len(no_modbase_models_updated) !=org_len):
|
478 |
st.write(f'{len(no_modbase_models_updated)} of {org_len} datapoins could not be mapped to any structures.')
|
479 |
+
st.write(f'{org_len-len(no_modbase_models_updated)} of {org_len} datapoins were mapped to a structure.')
|
480 |
elif len(no_modbase_models_updated) == org_len:
|
481 |
+
st.write(f'0 of {org_len} datapoins could not be mapped to any structures. Feature vector could not be created.')
|