Spaces:
Sleeping
Sleeping
Commit
·
36da03c
1
Parent(s):
8cc5118
Update code/modbaseModelAdd.py
Browse files- code/modbaseModelAdd.py +157 -6
code/modbaseModelAdd.py
CHANGED
@@ -1,3 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import requests
|
2 |
import numpy as np
|
3 |
import pandas as pd
|
@@ -73,7 +222,9 @@ def addModbaseModels(dataframe, path_to_input_files, path_to_output_files):
|
|
73 |
st.write('wt', wt)
|
74 |
st.write('protein', protein)
|
75 |
st.write('path_to_output_files', path_to_output_files)
|
76 |
-
|
|
|
|
|
77 |
with open(path_to_output_files / 'modbase_structures_individual'/ f'{model_id}.txt', encoding="utf8") as m:
|
78 |
|
79 |
lines = m.readlines()
|
@@ -106,13 +257,13 @@ def addModbaseModels(dataframe, path_to_input_files, path_to_output_files):
|
|
106 |
break
|
107 |
try:
|
108 |
k = pd.Series(
|
109 |
-
[uniprot_id, target_begin, target_end,quality_score, model_id, coordDict, AAonPDB, coordVAR])
|
110 |
new_row = {'uniprotID': uniprot_id, 'target_begin': target_begin,
|
111 |
'target_end': target_end, 'quality_score': quality_score,
|
112 |
'model_id': model_id, 'coordinates': coordDict,
|
113 |
-
'AAonPDB': AAonPDB, 'coordVAR': coordVAR}
|
114 |
modbase_reduced = modbase_reduced.append(new_row, ignore_index=True)
|
115 |
-
modbase_reduced = modbase_reduced[['uniprotID', 'quality_score', 'model_id', 'coordinates', 'AAonPDB', 'coordVAR']]
|
116 |
modbase = dataframe.merge(modbase_reduced, on='uniprotID', how='left')
|
117 |
modbase.quality_score = modbase.quality_score.astype(float)
|
118 |
modbase = modbase.sort_values(by=['datapoint', 'quality_score'], ascending=False)
|
@@ -142,6 +293,6 @@ def addModbaseModels(dataframe, path_to_input_files, path_to_output_files):
|
|
142 |
no_modbase_no_Coord = modbase[pd.isna(modbase['coordVAR'])]
|
143 |
no_modbase = pd.concat([no_modbase, no_modbase_no_Coord])
|
144 |
modbase = modbase[~pd.isna(modbase['coordVAR'])]
|
|
|
145 |
no_modbase = no_modbase[keep_cols]
|
146 |
-
return modbase, no_modbase
|
147 |
-
|
|
|
1 |
+
# import requests
|
2 |
+
# import numpy as np
|
3 |
+
# import pandas as pd
|
4 |
+
# from utils import *
|
5 |
+
# from pathlib import Path
|
6 |
+
# from bs4 import BeautifulSoup
|
7 |
+
# from add_sasa import *
|
8 |
+
# def addModbaseModels(dataframe, path_to_input_files, path_to_output_files):
|
9 |
+
# if len(dataframe) != 0:
|
10 |
+
# # GET MODBASE MODELS
|
11 |
+
# # Get IDs from data to retrieve only their models from MODBASE
|
12 |
+
# dataframe.reset_index(inplace=True, drop=True)
|
13 |
+
# existing_modbase_models = list(Path(path_to_output_files / 'modbase_structures').glob("*"))
|
14 |
+
# existing_modbase_models = [str(i) for i in existing_modbase_models]
|
15 |
+
# existing_modbase_models = [i.split('/')[-1].split('.')[0] for i in existing_modbase_models]
|
16 |
+
|
17 |
+
# existing_modbase_models_ind = list(Path(path_to_output_files / 'modbase_structures_individual').glob("*"))
|
18 |
+
# existing_modbase_models_ind = [str(i) for i in existing_modbase_models_ind]
|
19 |
+
# existing_modbase_models_ind = [i.split('/')[-1].split('.')[0] for i in existing_modbase_models_ind]
|
20 |
+
|
21 |
+
# modbase_reduced = pd.DataFrame(columns = ['uniprotID', 'target_begin', 'target_end', 'quality_score',
|
22 |
+
# 'model_id', 'coordinates','AAonPDB', 'coordVAR'])
|
23 |
+
# print('Retrieving ModBase models...\n')
|
24 |
+
# modbase = pd.DataFrame(
|
25 |
+
# columns=['uniprotID', 'target_begin', 'target_end', 'quality_score', 'model_id',
|
26 |
+
# 'coordinates', 'AAonPDB', 'coordVAR'])
|
27 |
+
# no_modbase = pd.DataFrame(
|
28 |
+
# columns=['uniprotID', 'target_begin', 'target_end', 'quality_score', 'model_id',
|
29 |
+
# 'coordinates', 'AAonPDB', 'coordVAR'])
|
30 |
+
# # Get model files associated with each UniProtID
|
31 |
+
# existing_free_sasa = list(Path(path_to_output_files / 'freesasa_files').glob("*"))
|
32 |
+
# existing_free_sasa = [str(i) for i in existing_free_sasa]
|
33 |
+
# existing_free_sasa = [i.split('/')[-1].split('.')[0] for i in existing_free_sasa]
|
34 |
+
# keep_cols = dataframe.columns
|
35 |
+
# for i in dataframe.index:
|
36 |
+
# coordDict = {}
|
37 |
+
# protein = dataframe.at[i, 'uniprotID']
|
38 |
+
# varPos = int(dataframe.at[i, 'pos'])
|
39 |
+
# wt = dataframe.at[i, 'wt']
|
40 |
+
# mut = dataframe.at[i, 'mut']
|
41 |
+
# datapoint = dataframe.at[i, 'datapoint']
|
42 |
+
|
43 |
+
# if protein not in existing_modbase_models:
|
44 |
+
# print('Downloading Modbase models for ', protein)
|
45 |
+
# url = 'https://salilab.org/modbase/retrieve/modbase/?databaseID=' + protein
|
46 |
+
# req = requests.get(url)
|
47 |
+
# name = path_to_output_files / 'modbase_structures' / f'{protein}.txt'
|
48 |
+
# with open(name, 'wb') as f:
|
49 |
+
# f.write(req.content)
|
50 |
+
# else:
|
51 |
+
# print('Model exists for', protein)
|
52 |
+
# name = Path(path_to_output_files / 'modbase_structures' / f'{protein}.txt')
|
53 |
+
|
54 |
+
# with open(name, encoding="utf8") as f:
|
55 |
+
# a = open(name, 'r').read()
|
56 |
+
# soup = BeautifulSoup(a, 'lxml')
|
57 |
+
# if soup.findAll('pdbfile') != []:
|
58 |
+
# for pdb in soup.findAll('pdbfile'):
|
59 |
+
# model_id = str(pdb.contents[1])[10:-11]
|
60 |
+
# if model_id not in existing_modbase_models_ind:
|
61 |
+
# with open(path_to_output_files / 'modbase_structures_individual' / f'{model_id}.txt', 'w', encoding="utf8") as individual:
|
62 |
+
# individual.write(str('UniProt ID: ' + protein))
|
63 |
+
# individual.write('\n')
|
64 |
+
# individual.write(str(pdb.contents[3])[10:-11].strip())
|
65 |
+
# run_freesasa(
|
66 |
+
# Path(path_to_output_files / 'modbase_structures_individual' / f'{model_id.lower()}.txt'),
|
67 |
+
# Path(path_to_output_files / 'freesasa_files' / f'{model_id.lower()}.txt'),
|
68 |
+
# include_hetatms=True,
|
69 |
+
# outdir=None, force_rerun=False, file_type='pdb')
|
70 |
+
# filename = Path(path_to_output_files / 'freesasa_files' / f'{model_id.lower()}.txt')
|
71 |
+
# st.write('filename', filename)
|
72 |
+
# st.write('varPos', varPos)
|
73 |
+
# st.write('wt', wt)
|
74 |
+
# st.write('protein', protein)
|
75 |
+
# st.write('path_to_output_files', path_to_output_files)
|
76 |
+
# dataframe.loc[i, 'sasa'] = sasa(protein, varPos, wt, 1, filename, path_to_output_files, file_type='pdb')
|
77 |
+
# st.write('sasa', dataframe.loc[i, 'sasa'] )
|
78 |
+
# st.write('model_id', model_id)
|
79 |
+
# with open(path_to_output_files / 'modbase_structures_individual'/ f'{model_id}.txt', encoding="utf8") as m:
|
80 |
+
|
81 |
+
# lines = m.readlines()
|
82 |
+
# quality_score = -999
|
83 |
+
# for ind_line in lines:
|
84 |
+
# if ind_line[0:10] == 'UniProt ID':
|
85 |
+
# uniprot_id = ind_line.split(':')[1].strip()
|
86 |
+
# if ind_line[0:23] == 'REMARK 220 TARGET BEGIN':
|
87 |
+
# target_begin = ind_line[40:43].strip()
|
88 |
+
# if ind_line[0:21] == 'REMARK 220 TARGET END':
|
89 |
+
# target_end = ind_line[40:43].strip()
|
90 |
+
# coordDict, AAonPDB, coordVAR = {},np.NaN,np.NaN
|
91 |
+
# if (int(varPos) > int(target_begin)) & (int(varPos) < int(target_end)):
|
92 |
+
# coordDict = {}
|
93 |
+
# for ind_line in lines:
|
94 |
+
# if ind_line[0:27] == 'REMARK 220 MODPIPE MODEL ID':
|
95 |
+
# model_id = ind_line[40:].strip()
|
96 |
+
# if ind_line[0:15].strip() == 'REMARK 220 MPQS':
|
97 |
+
# quality_score = ind_line[40:].strip()
|
98 |
+
# if ind_line[0:4] == 'ATOM' and ind_line[13:15] == 'CA':
|
99 |
+
# position = int(ind_line[22:26].strip())
|
100 |
+
# chain = ind_line[20:22].strip()
|
101 |
+
# aminoacid = threeToOne(ind_line[17:20])
|
102 |
+
# coords = [ind_line[31:38].strip(), ind_line[39:46].strip(), ind_line[47:54].strip()]
|
103 |
+
# coordDict[position] = coords
|
104 |
+
# if position == int(varPos):
|
105 |
+
# AAonPDB = aminoacid
|
106 |
+
# coordVAR = str(coords)
|
107 |
+
# if ind_line[0:3] == 'TER':
|
108 |
+
# break
|
109 |
+
# try:
|
110 |
+
# k = pd.Series(
|
111 |
+
# [uniprot_id, target_begin, target_end,quality_score, model_id, coordDict, AAonPDB, coordVAR])
|
112 |
+
# new_row = {'uniprotID': uniprot_id, 'target_begin': target_begin,
|
113 |
+
# 'target_end': target_end, 'quality_score': quality_score,
|
114 |
+
# 'model_id': model_id, 'coordinates': coordDict,
|
115 |
+
# 'AAonPDB': AAonPDB, 'coordVAR': coordVAR}
|
116 |
+
# modbase_reduced = modbase_reduced.append(new_row, ignore_index=True)
|
117 |
+
# modbase_reduced = modbase_reduced[['uniprotID', 'quality_score', 'model_id', 'coordinates', 'AAonPDB', 'coordVAR']]
|
118 |
+
# modbase = dataframe.merge(modbase_reduced, on='uniprotID', how='left')
|
119 |
+
# modbase.quality_score = modbase.quality_score.astype(float)
|
120 |
+
# modbase = modbase.sort_values(by=['datapoint', 'quality_score'], ascending=False)
|
121 |
+
# modbase.reset_index(inplace=True, drop=True)
|
122 |
+
# modbase.fillna(np.NaN, inplace=True)
|
123 |
+
# modbase.replace({'\'?\', ': '',
|
124 |
+
# ', \'?\'': '',
|
125 |
+
# '(': '', ')': '',
|
126 |
+
# '[\'?\']': np.NaN,
|
127 |
+
# '[]': np.NaN,
|
128 |
+
# 'nan-nan': np.NaN,
|
129 |
+
# '': np.NaN}, inplace=True)
|
130 |
+
# except NameError:
|
131 |
+
# print('This file doesnt have Quality Score. Replacer: -999', model_id)
|
132 |
+
# else:
|
133 |
+
# new_row = {'uniprotID': uniprot_id, 'wt': wt,
|
134 |
+
# 'pos': varPos, 'mut': mut, 'datapoint': datapoint }
|
135 |
+
# no_modbase = no_modbase.append(new_row, ignore_index=True)
|
136 |
+
|
137 |
+
# else:
|
138 |
+
# new_row = {'uniprotID': uniprot_id, 'wt': wt,
|
139 |
+
# 'pos': varPos, 'mut': mut, 'datapoint': datapoint }
|
140 |
+
# no_modbase = no_modbase.append(new_row, ignore_index=True)
|
141 |
+
|
142 |
+
|
143 |
+
|
144 |
+
# no_modbase_no_Coord = modbase[pd.isna(modbase['coordVAR'])]
|
145 |
+
# no_modbase = pd.concat([no_modbase, no_modbase_no_Coord])
|
146 |
+
# modbase = modbase[~pd.isna(modbase['coordVAR'])]
|
147 |
+
# no_modbase = no_modbase[keep_cols]
|
148 |
+
# return modbase, no_modbase
|
149 |
+
|
150 |
import requests
|
151 |
import numpy as np
|
152 |
import pandas as pd
|
|
|
222 |
st.write('wt', wt)
|
223 |
st.write('protein', protein)
|
224 |
st.write('path_to_output_files', path_to_output_files)
|
225 |
+
sasa_val = sasa(protein, varPos, wt, 1, filename, path_to_output_files, file_type='pdb')
|
226 |
+
st.write('sasa', sasa_val)
|
227 |
+
st.write('model_id', model_id)
|
228 |
with open(path_to_output_files / 'modbase_structures_individual'/ f'{model_id}.txt', encoding="utf8") as m:
|
229 |
|
230 |
lines = m.readlines()
|
|
|
257 |
break
|
258 |
try:
|
259 |
k = pd.Series(
|
260 |
+
[uniprot_id, target_begin, target_end,quality_score, model_id, coordDict, AAonPDB, coordVAR, sasa_val])
|
261 |
new_row = {'uniprotID': uniprot_id, 'target_begin': target_begin,
|
262 |
'target_end': target_end, 'quality_score': quality_score,
|
263 |
'model_id': model_id, 'coordinates': coordDict,
|
264 |
+
'AAonPDB': AAonPDB, 'coordVAR': coordVAR, 'sasa':sasa_val}
|
265 |
# DataFrame.append was removed in pandas 2.0; concatenating a one-row frame
# adds the same row and works on both old and new pandas versions.
modbase_reduced = pd.concat([modbase_reduced, pd.DataFrame([new_row])], ignore_index=True)
|
266 |
+
modbase_reduced = modbase_reduced[['uniprotID', 'quality_score', 'model_id', 'coordinates', 'AAonPDB', 'coordVAR', 'sasa']]
|
267 |
modbase = dataframe.merge(modbase_reduced, on='uniprotID', how='left')
|
268 |
modbase.quality_score = modbase.quality_score.astype(float)
|
269 |
modbase = modbase.sort_values(by=['datapoint', 'quality_score'], ascending=False)
|
|
|
293 |
no_modbase_no_Coord = modbase[pd.isna(modbase['coordVAR'])]
|
294 |
no_modbase = pd.concat([no_modbase, no_modbase_no_Coord])
|
295 |
modbase = modbase[~pd.isna(modbase['coordVAR'])]
|
296 |
+
# Selecting several columns requires a list; a bare tuple key is looked up
# as one (non-existent) column label and raises KeyError.
st.write(modbase[['datapoint', 'model_id', 'sasa']])
|
297 |
no_modbase = no_modbase[keep_cols]
|
298 |
+
return modbase, no_modbase
|
|