fatmacankara committed on
Commit
36da03c
·
1 Parent(s): 8cc5118

Update code/modbaseModelAdd.py

Browse files
Files changed (1) hide show
  1. code/modbaseModelAdd.py +157 -6
code/modbaseModelAdd.py CHANGED
@@ -1,3 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import requests
2
  import numpy as np
3
  import pandas as pd
@@ -73,7 +222,9 @@ def addModbaseModels(dataframe, path_to_input_files, path_to_output_files):
73
  st.write('wt', wt)
74
  st.write('protein', protein)
75
  st.write('path_to_output_files', path_to_output_files)
76
- dataframe.loc[i, 'sasa'] = sasa(protein, varPos, wt, 1, filename, path_to_output_files, file_type='pdb')
 
 
77
  with open(path_to_output_files / 'modbase_structures_individual'/ f'{model_id}.txt', encoding="utf8") as m:
78
 
79
  lines = m.readlines()
@@ -106,13 +257,13 @@ def addModbaseModels(dataframe, path_to_input_files, path_to_output_files):
106
  break
107
  try:
108
  k = pd.Series(
109
- [uniprot_id, target_begin, target_end,quality_score, model_id, coordDict, AAonPDB, coordVAR])
110
  new_row = {'uniprotID': uniprot_id, 'target_begin': target_begin,
111
  'target_end': target_end, 'quality_score': quality_score,
112
  'model_id': model_id, 'coordinates': coordDict,
113
- 'AAonPDB': AAonPDB, 'coordVAR': coordVAR}
114
  modbase_reduced = modbase_reduced.append(new_row, ignore_index=True)
115
- modbase_reduced = modbase_reduced[['uniprotID', 'quality_score', 'model_id', 'coordinates', 'AAonPDB', 'coordVAR']]
116
  modbase = dataframe.merge(modbase_reduced, on='uniprotID', how='left')
117
  modbase.quality_score = modbase.quality_score.astype(float)
118
  modbase = modbase.sort_values(by=['datapoint', 'quality_score'], ascending=False)
@@ -142,6 +293,6 @@ def addModbaseModels(dataframe, path_to_input_files, path_to_output_files):
142
  no_modbase_no_Coord = modbase[pd.isna(modbase['coordVAR'])]
143
  no_modbase = pd.concat([no_modbase, no_modbase_no_Coord])
144
  modbase = modbase[~pd.isna(modbase['coordVAR'])]
 
145
  no_modbase = no_modbase[keep_cols]
146
- return modbase, no_modbase
147
-
 
1
+ # import requests
2
+ # import numpy as np
3
+ # import pandas as pd
4
+ # from utils import *
5
+ # from pathlib import Path
6
+ # from bs4 import BeautifulSoup
7
+ # from add_sasa import *
8
+ # def addModbaseModels(dataframe, path_to_input_files, path_to_output_files):
9
+ # if len(dataframe) != 0:
10
+ # # GET MODBASE MODELS
11
+ # # Get IDs from data to retrieve only their models from MODBASE
12
+ # dataframe.reset_index(inplace=True, drop=True)
13
+ # existing_modbase_models = list(Path(path_to_output_files / 'modbase_structures').glob("*"))
14
+ # existing_modbase_models = [str(i) for i in existing_modbase_models]
15
+ # existing_modbase_models = [i.split('/')[-1].split('.')[0] for i in existing_modbase_models]
16
+
17
+ # existing_modbase_models_ind = list(Path(path_to_output_files / 'modbase_structures_individual').glob("*"))
18
+ # existing_modbase_models_ind = [str(i) for i in existing_modbase_models_ind]
19
+ # existing_modbase_models_ind = [i.split('/')[-1].split('.')[0] for i in existing_modbase_models_ind]
20
+
21
+ # modbase_reduced = pd.DataFrame(columns = ['uniprotID', 'target_begin', 'target_end', 'quality_score',
22
+ # 'model_id', 'coordinates','AAonPDB', 'coordVAR'])
23
+ # print('Retrieving ModBase models...\n')
24
+ # modbase = pd.DataFrame(
25
+ # columns=['uniprotID', 'target_begin', 'target_end', 'quality_score', 'model_id',
26
+ # 'coordinates', 'AAonPDB', 'coordVAR'])
27
+ # no_modbase = pd.DataFrame(
28
+ # columns=['uniprotID', 'target_begin', 'target_end', 'quality_score', 'model_id',
29
+ # 'coordinates', 'AAonPDB', 'coordVAR'])
30
+ # # Get model files associated with each UniProtID
31
+ # existing_free_sasa = list(Path(path_to_output_files / 'freesasa_files').glob("*"))
32
+ # existing_free_sasa = [str(i) for i in existing_free_sasa]
33
+ # existing_free_sasa = [i.split('/')[-1].split('.')[0] for i in existing_free_sasa]
34
+ # keep_cols = dataframe.columns
35
+ # for i in dataframe.index:
36
+ # coordDict = {}
37
+ # protein = dataframe.at[i, 'uniprotID']
38
+ # varPos = int(dataframe.at[i, 'pos'])
39
+ # wt = dataframe.at[i, 'wt']
40
+ # mut = dataframe.at[i, 'mut']
41
+ # datapoint = dataframe.at[i, 'datapoint']
42
+
43
+ # if protein not in existing_modbase_models:
44
+ # print('Downloading Modbase models for ', protein)
45
+ # url = 'https://salilab.org/modbase/retrieve/modbase/?databaseID=' + protein
46
+ # req = requests.get(url)
47
+ # name = path_to_output_files / 'modbase_structures' / f'{protein}.txt'
48
+ # with open(name, 'wb') as f:
49
+ # f.write(req.content)
50
+ # else:
51
+ # print('Model exists for', protein)
52
+ # name = Path(path_to_output_files / 'modbase_structures' / f'{protein}.txt')
53
+
54
+ # with open(name, encoding="utf8") as f:
55
+ # a = open(name, 'r').read()
56
+ # soup = BeautifulSoup(a, 'lxml')
57
+ # if soup.findAll('pdbfile') != []:
58
+ # for pdb in soup.findAll('pdbfile'):
59
+ # model_id = str(pdb.contents[1])[10:-11]
60
+ # if model_id not in existing_modbase_models_ind:
61
+ # with open(path_to_output_files / 'modbase_structures_individual' / f'{model_id}.txt', 'w', encoding="utf8") as individual:
62
+ # individual.write(str('UniProt ID: ' + protein))
63
+ # individual.write('\n')
64
+ # individual.write(str(pdb.contents[3])[10:-11].strip())
65
+ # run_freesasa(
66
+ # Path(path_to_output_files / 'modbase_structures_individual' / f'{model_id.lower()}.txt'),
67
+ # Path(path_to_output_files / 'freesasa_files' / f'{model_id.lower()}.txt'),
68
+ # include_hetatms=True,
69
+ # outdir=None, force_rerun=False, file_type='pdb')
70
+ # filename = Path(path_to_output_files / 'freesasa_files' / f'{model_id.lower()}.txt')
71
+ # st.write('filename', filename)
72
+ # st.write('varPos', varPos)
73
+ # st.write('wt', wt)
74
+ # st.write('protein', protein)
75
+ # st.write('path_to_output_files', path_to_output_files)
76
+ # dataframe.loc[i, 'sasa'] = sasa(protein, varPos, wt, 1, filename, path_to_output_files, file_type='pdb')
77
+ # st.write('sasa', dataframe.loc[i, 'sasa'] )
78
+ # st.write('model_id', model_id)
79
+ # with open(path_to_output_files / 'modbase_structures_individual'/ f'{model_id}.txt', encoding="utf8") as m:
80
+
81
+ # lines = m.readlines()
82
+ # quality_score = -999
83
+ # for ind_line in lines:
84
+ # if ind_line[0:10] == 'UniProt ID':
85
+ # uniprot_id = ind_line.split(':')[1].strip()
86
+ # if ind_line[0:23] == 'REMARK 220 TARGET BEGIN':
87
+ # target_begin = ind_line[40:43].strip()
88
+ # if ind_line[0:21] == 'REMARK 220 TARGET END':
89
+ # target_end = ind_line[40:43].strip()
90
+ # coordDict, AAonPDB, coordVAR = {},np.NaN,np.NaN
91
+ # if (int(varPos) > int(target_begin)) & (int(varPos) < int(target_end)):
92
+ # coordDict = {}
93
+ # for ind_line in lines:
94
+ # if ind_line[0:27] == 'REMARK 220 MODPIPE MODEL ID':
95
+ # model_id = ind_line[40:].strip()
96
+ # if ind_line[0:15].strip() == 'REMARK 220 MPQS':
97
+ # quality_score = ind_line[40:].strip()
98
+ # if ind_line[0:4] == 'ATOM' and ind_line[13:15] == 'CA':
99
+ # position = int(ind_line[22:26].strip())
100
+ # chain = ind_line[20:22].strip()
101
+ # aminoacid = threeToOne(ind_line[17:20])
102
+ # coords = [ind_line[31:38].strip(), ind_line[39:46].strip(), ind_line[47:54].strip()]
103
+ # coordDict[position] = coords
104
+ # if position == int(varPos):
105
+ # AAonPDB = aminoacid
106
+ # coordVAR = str(coords)
107
+ # if ind_line[0:3] == 'TER':
108
+ # break
109
+ # try:
110
+ # k = pd.Series(
111
+ # [uniprot_id, target_begin, target_end,quality_score, model_id, coordDict, AAonPDB, coordVAR])
112
+ # new_row = {'uniprotID': uniprot_id, 'target_begin': target_begin,
113
+ # 'target_end': target_end, 'quality_score': quality_score,
114
+ # 'model_id': model_id, 'coordinates': coordDict,
115
+ # 'AAonPDB': AAonPDB, 'coordVAR': coordVAR}
116
+ # modbase_reduced = modbase_reduced.append(new_row, ignore_index=True)
117
+ # modbase_reduced = modbase_reduced[['uniprotID', 'quality_score', 'model_id', 'coordinates', 'AAonPDB', 'coordVAR']]
118
+ # modbase = dataframe.merge(modbase_reduced, on='uniprotID', how='left')
119
+ # modbase.quality_score = modbase.quality_score.astype(float)
120
+ # modbase = modbase.sort_values(by=['datapoint', 'quality_score'], ascending=False)
121
+ # modbase.reset_index(inplace=True, drop=True)
122
+ # modbase.fillna(np.NaN, inplace=True)
123
+ # modbase.replace({'\'?\', ': '',
124
+ # ', \'?\'': '',
125
+ # '(': '', ')': '',
126
+ # '[\'?\']': np.NaN,
127
+ # '[]': np.NaN,
128
+ # 'nan-nan': np.NaN,
129
+ # '': np.NaN}, inplace=True)
130
+ # except NameError:
131
+ # print('This file doesnt have Quality Score. Replacer: -999', model_id)
132
+ # else:
133
+ # new_row = {'uniprotID': uniprot_id, 'wt': wt,
134
+ # 'pos': varPos, 'mut': mut, 'datapoint': datapoint }
135
+ # no_modbase = no_modbase.append(new_row, ignore_index=True)
136
+
137
+ # else:
138
+ # new_row = {'uniprotID': uniprot_id, 'wt': wt,
139
+ # 'pos': varPos, 'mut': mut, 'datapoint': datapoint }
140
+ # no_modbase = no_modbase.append(new_row, ignore_index=True)
141
+
142
+
143
+
144
+ # no_modbase_no_Coord = modbase[pd.isna(modbase['coordVAR'])]
145
+ # no_modbase = pd.concat([no_modbase, no_modbase_no_Coord])
146
+ # modbase = modbase[~pd.isna(modbase['coordVAR'])]
147
+ # no_modbase = no_modbase[keep_cols]
148
+ # return modbase, no_modbase
149
+
150
  import requests
151
  import numpy as np
152
  import pandas as pd
 
222
  st.write('wt', wt)
223
  st.write('protein', protein)
224
  st.write('path_to_output_files', path_to_output_files)
225
+ sasa_val = sasa(protein, varPos, wt, 1, filename, path_to_output_files, file_type='pdb')
226
+ st.write('sasa', sasa_val)
227
+ st.write('model_id', model_id)
228
  with open(path_to_output_files / 'modbase_structures_individual'/ f'{model_id}.txt', encoding="utf8") as m:
229
 
230
  lines = m.readlines()
 
257
  break
258
  try:
259
  k = pd.Series(
260
+ [uniprot_id, target_begin, target_end,quality_score, model_id, coordDict, AAonPDB, coordVAR, sasa_val])
261
  new_row = {'uniprotID': uniprot_id, 'target_begin': target_begin,
262
  'target_end': target_end, 'quality_score': quality_score,
263
  'model_id': model_id, 'coordinates': coordDict,
264
+ 'AAonPDB': AAonPDB, 'coordVAR': coordVAR, 'sasa':sasa_val}
265
  modbase_reduced = modbase_reduced.append(new_row, ignore_index=True)
266
+ modbase_reduced = modbase_reduced[['uniprotID', 'quality_score', 'model_id', 'coordinates', 'AAonPDB', 'coordVAR', 'sasa']]
267
  modbase = dataframe.merge(modbase_reduced, on='uniprotID', how='left')
268
  modbase.quality_score = modbase.quality_score.astype(float)
269
  modbase = modbase.sort_values(by=['datapoint', 'quality_score'], ascending=False)
 
293
  no_modbase_no_Coord = modbase[pd.isna(modbase['coordVAR'])]
294
  no_modbase = pd.concat([no_modbase, no_modbase_no_Coord])
295
  modbase = modbase[~pd.isna(modbase['coordVAR'])]
296
+ st.write(modbase['datapoint', 'model_id', 'sasa'])
297
  no_modbase = no_modbase[keep_cols]
298
+ return modbase, no_modbase