fatmacankara committed on
Commit c2a02c6 (no parent commits)

Duplicate from fatmacankara/ASCARIS

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full change set.
Files changed (50)
  1. .gitattributes +38 -0
  2. README.md +14 -0
  3. app.py +129 -0
  4. code/__pycache__/add_3Dalignment.cpython-37.pyc +0 -0
  5. code/__pycache__/add_alignment.cpython-37.pyc +0 -0
  6. code/__pycache__/add_annotations.cpython-37.pyc +0 -0
  7. code/__pycache__/add_domains.cpython-37.pyc +0 -0
  8. code/__pycache__/add_interface_pos.cpython-37.pyc +0 -0
  9. code/__pycache__/add_sasa.cpython-37.pyc +0 -0
  10. code/__pycache__/add_sequence.cpython-37.pyc +0 -0
  11. code/__pycache__/add_structure.cpython-37.pyc +0 -0
  12. code/__pycache__/alphafold_featureVector.cpython-37.pyc +0 -0
  13. code/__pycache__/alphafold_model.cpython-37.pyc +0 -0
  14. code/__pycache__/calc_pc_property.cpython-37.pyc +0 -0
  15. code/__pycache__/manage_files.cpython-37.pyc +0 -0
  16. code/__pycache__/pdb_featureVector.cpython-37.pyc +0 -0
  17. code/__pycache__/process_input.cpython-37.pyc +0 -0
  18. code/__pycache__/standard.cpython-37.pyc +0 -0
  19. code/__pycache__/uniprotSequenceMatch.cpython-37.pyc +0 -0
  20. code/add_3Dalignment.py +261 -0
  21. code/add_alignment.py +423 -0
  22. code/add_annotations.py +95 -0
  23. code/add_domains.py +57 -0
  24. code/add_interface_pos.py +35 -0
  25. code/add_sasa.py +131 -0
  26. code/add_sequence.py +44 -0
  27. code/add_structure.py +168 -0
  28. code/alphafold_featureVector.py +579 -0
  29. code/alphafold_model.py +33 -0
  30. code/calc_pc_property.py +441 -0
  31. code/create_swissmodelSummary.py +1 -0
  32. code/get_alphafoldStructures.py +97 -0
  33. code/main.py +35 -0
  34. code/manage_files.py +42 -0
  35. code/pdb_featureVector.py +0 -0
  36. code/process_input.py +40 -0
  37. code/standard.py +13 -0
  38. code/uniprotSequenceMatch.py +40 -0
  39. input_files/H_sapiens_interfacesHQ.txt +3 -0
  40. input_files/alphafold_structures/AF-A0A0A0MRZ7-F1-model_v1.cif.gz +3 -0
  41. input_files/alphafold_structures/AF-A0A0A0MRZ7-F1-model_v1.pdb.gz +3 -0
  42. input_files/alphafold_structures/AF-A0A0A0MRZ8-F1-model_v1.cif.gz +3 -0
  43. input_files/alphafold_structures/AF-A0A0A0MRZ8-F1-model_v1.pdb.gz +3 -0
  44. input_files/alphafold_structures/AF-A0A0A0MRZ9-F1-model_v1.cif.gz +3 -0
  45. input_files/alphafold_structures/AF-A0A0A0MRZ9-F1-model_v1.pdb.gz +3 -0
  46. input_files/alphafold_structures/AF-A0A0A0MS00-F1-model_v1.cif.gz +3 -0
  47. input_files/alphafold_structures/AF-A0A0A0MS00-F1-model_v1.pdb.gz +3 -0
  48. input_files/alphafold_structures/AF-A0A0A0MS01-F1-model_v1.cif.gz +3 -0
  49. input_files/alphafold_structures/AF-A0A0A0MS01-F1-model_v1.pdb.gz +3 -0
  50. input_files/alphafold_structures/AF-A0A0A0MS02-F1-model_v1.cif.gz +3 -0
.gitattributes ADDED
@@ -0,0 +1,38 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ input_files/alphafold_summary.txt filter=lfs diff=lfs merge=lfs -text
+ input_files/H_sapiens_interfacesHQ.txt filter=lfs diff=lfs merge=lfs -text
+ input_files/swissmodel_structures.txt filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,14 @@
+ ---
+ title: ASCARIS
+ emoji: 🦀
+ colorFrom: indigo
+ colorTo: gray
+ sdk: streamlit
+ python_version: '3.7'
+ sdk_version: 1.21.0
+ app_file: app.py
+ pinned: false
+ duplicated_from: fatmacankara/ASCARIS
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,129 @@
+ import streamlit as st
+ import pandas as pd
+ from os import path
+ import sys
+ import json    # used by download_button below; missing from the original commit
+ import base64  # used by download_button below; missing from the original commit
+ import streamlit.components.v1 as components
+ sys.path.append('code/')
+ #sys.path.append('ASCARIS/code/')
+ import pdb_featureVector
+ import alphafold_featureVector
+ import argparse
+ from st_aggrid import AgGrid, GridOptionsBuilder, JsCode, GridUpdateMode
+
+ showWarningOnDirectExecution = False
+
+
+ def download_button(object_to_download, download_filename):
+     if isinstance(object_to_download, pd.DataFrame):
+         object_to_download = object_to_download.to_csv(index=False)
+     else:
+         # Try JSON encoding for everything else
+         object_to_download = json.dumps(object_to_download)
+     try:
+         # some strings <-> bytes conversions are necessary here
+         b64 = base64.b64encode(object_to_download.encode()).decode()
+     except AttributeError:
+         b64 = base64.b64encode(object_to_download).decode()
+
+     dl_link = f"""<html><head><title>Start Auto Download file</title><script src="http://code.jquery.com/jquery-3.2.1.min.js"></script><script>$('<a href="data:text/csv;base64,{b64}" download="{download_filename}">')[0].click()</script></head></html>"""
+     return dl_link
+
+
+ def download_df():
+     components.html(
+         download_button(selected_df, st.session_state.filename),
+         height=0,
+     )
+
+
+ original_title = '<p style="font-family:Trebuchet MS; color:#FD7456; font-size: 35px; font-weight:bold; text-align:center">Welcome to ASCARIS</p>'
+ st.markdown(original_title, unsafe_allow_html=True)
+ st.write('')
+ st.write('')
+ st.write('')
+ st.write('')
+
+ source = st.selectbox('Select Protein Structure Database (1: PDB, SwissModel, Modbase 2: AlphaFold)', [1, 2])
+ impute = st.selectbox('Select Imputation', [True, False])
+ input_data = st.text_input('Enter Input Variation')
+
+ #sys.path.append(path.abspath('../code/'))
+ parser = argparse.ArgumentParser(description='ASCARIS')
+
+ parser.add_argument('-s', '--source_option',
+                     help='Selection of input structure data.\n 1: PDB Structures (default), 2: AlphaFold Structures',
+                     default=1)
+ parser.add_argument('-i', '--input_datapoint',
+                     help='Input file or query datapoint\n Option 1: Comma-separated list of identifiers (UniProt ID-wt residue-position-mutated residue (e.g. Q9Y4W6-N-432-T or Q9Y4W6-N-432-T, Q9Y4W6-N-432-T)) \n Option 2: Enter a comma-separated file path')
+ parser.add_argument('-impute', '--imputation_state', default='True',
+                     help='Whether the resulting feature vector should be imputed or not. Default True.')
+
+ args = parser.parse_args()
+
+ input_set = input_data
+ mode = source
+ impute = impute
+
+ print('*****************************************')
+ print('Feature vector generation is in progress. \nPlease check the log file for updates..')
+ print('*****************************************')
+ mode = int(mode)
+
+ with st.spinner('In progress...This may take a while...'):
+     try:
+         if mode == 1:
+             selected_df = pdb_featureVector.pdb(input_set, mode, impute)
+             int_builder = GridOptionsBuilder.from_dataframe(selected_df)
+             int_builder.configure_default_column(editable=False, filterable=True, cellStyle={'text-align': 'center'})
+             int_builder.configure_pagination(enabled=True, paginationAutoPageSize=False, paginationPageSize=10)
+             int_builder.configure_selection(selection_mode='multiple', use_checkbox=True)
+             gridoptions = int_builder.build()
+             int_return = AgGrid(selected_df,
+                                 width='100%',
+                                 height=(len(selected_df) + 4) * 35.2 + 3,
+                                 theme='light',
+                                 enable_enterprise_modules=False,
+                                 gridOptions=gridoptions,
+                                 fit_columns_on_grid_load=False,
+                                 update_mode=GridUpdateMode.SELECTION_CHANGED,  # or MODEL_CHANGED
+                                 custom_css={".ag-header-cell-label": {"justify-content": "center"}})
+             st.success('Feature vector successfully created.')
+         elif mode == 2:
+             selected_df = alphafold_featureVector.alphafold(input_set, mode, impute)
+             int_builder = GridOptionsBuilder.from_dataframe(selected_df)
+             int_builder.configure_default_column(editable=False, filterable=True, cellStyle={'text-align': 'center'})
+             int_builder.configure_pagination(enabled=True, paginationAutoPageSize=False, paginationPageSize=10)
+             int_builder.configure_selection(selection_mode='multiple', use_checkbox=True)
+             gridoptions = int_builder.build()
+             int_return = AgGrid(selected_df,
+                                 width='100%',
+                                 height=(len(selected_df) + 4) * 35.2 + 3,
+                                 theme='light',
+                                 enable_enterprise_modules=False,
+                                 gridOptions=gridoptions,
+                                 fit_columns_on_grid_load=False,
+                                 update_mode=GridUpdateMode.SELECTION_CHANGED,  # or MODEL_CHANGED
+                                 custom_css={".ag-header-cell-label": {"justify-content": "center"}})
+             st.success('Feature vector successfully created.')
+     except Exception:
+         # fall back to an empty frame; the original rebound the name download_df here,
+         # which shadowed the callback defined above and broke the form button
+         selected_df = pd.DataFrame()
+
+ with st.form("my_form", clear_on_submit=False):
+     st.text_input("Enter filename", key="filename")
+     submit = st.form_submit_button("Download feature vector", on_click=download_df)
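Note for anyone reusing the snippet above: download_button relies on json and base64 (now imported at the top) and injects jQuery to auto-click a data-URI link. On the Streamlit version this Space pins (1.21.0), the built-in st.download_button achieves the same result without raw HTML; a minimal sketch, with a toy DataFrame standing in for the real feature vector:

import streamlit as st
import pandas as pd

selected_df = pd.DataFrame({'uniprotID': ['Q9Y4W6'], 'pos': [432]})  # toy stand-in for the generated feature vector
csv_bytes = selected_df.to_csv(index=False).encode()                 # same CSV payload the HTML link above encodes
st.download_button('Download feature vector', csv_bytes,
                   file_name='feature_vector.csv', mime='text/csv')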
code/__pycache__/add_3Dalignment.cpython-37.pyc ADDED
Binary file (5.67 kB).

code/__pycache__/add_alignment.cpython-37.pyc ADDED
Binary file (7.99 kB).

code/__pycache__/add_annotations.cpython-37.pyc ADDED
Binary file (3.78 kB).

code/__pycache__/add_domains.cpython-37.pyc ADDED
Binary file (1.44 kB).

code/__pycache__/add_interface_pos.cpython-37.pyc ADDED
Binary file (1.12 kB).

code/__pycache__/add_sasa.cpython-37.pyc ADDED
Binary file (3.17 kB).

code/__pycache__/add_sequence.cpython-37.pyc ADDED
Binary file (1.27 kB).

code/__pycache__/add_structure.cpython-37.pyc ADDED
Binary file (5.93 kB).

code/__pycache__/alphafold_featureVector.cpython-37.pyc ADDED
Binary file (15.4 kB).

code/__pycache__/alphafold_model.cpython-37.pyc ADDED
Binary file (1.35 kB).

code/__pycache__/calc_pc_property.cpython-37.pyc ADDED
Binary file (8.84 kB).

code/__pycache__/manage_files.cpython-37.pyc ADDED
Binary file (1.43 kB).

code/__pycache__/pdb_featureVector.cpython-37.pyc ADDED
Binary file (33.7 kB).

code/__pycache__/process_input.cpython-37.pyc ADDED
Binary file (1.69 kB).

code/__pycache__/standard.cpython-37.pyc ADDED
Binary file (749 Bytes).

code/__pycache__/uniprotSequenceMatch.cpython-37.pyc ADDED
Binary file (1.28 kB).
code/add_3Dalignment.py ADDED
@@ -0,0 +1,261 @@
+ """
+ This module produces alignments between the structure and the sequence for a given protein.
+ """
+
+ import math
+ import glob
+ import numpy as np
+ from Bio import Align
+ import gzip
+ from pathlib import Path
+ from Bio.Align import substitution_matrices
+
+ aligner = Align.PairwiseAligner()
+
+
+ def distance(x1, y1, z1, x2, y2, z2):
+     d = math.sqrt(math.pow(x2 - x1, 2) +
+                   math.pow(y2 - y1, 2) +
+                   math.pow(z2 - z1, 2) * 1.0)
+     return d
+
+
+ def find_distance(coordMut, coordAnnot):
+     if coordMut is not np.NaN:  # the original compared with !=, which is always True for NaN
+         try:
+             dist = distance(float(coordMut[0]), float(coordMut[1]), float(coordMut[2]),
+                             float(coordAnnot[0]), float(coordAnnot[1]), float(coordAnnot[2]))
+             return "%.2f" % dist
+         except ValueError:
+             dist = 'nan'
+             return dist
+     else:
+         return np.NaN
+
+
+ def threeToOne(variant):
+     if variant == "ALA":
+         variant = "A"
+     elif variant == "ARG":
+         variant = "R"
+     elif variant == "VAL":
+         variant = "V"
+     elif variant == "GLU":
+         variant = "E"
+     elif variant == "PRO":
+         variant = "P"
+     elif variant == "LEU":
+         variant = "L"
+     elif variant == "GLY":
+         variant = "G"
+     elif variant == "ASN":
+         variant = "N"
+     elif variant == "SER":
+         variant = "S"
+     elif variant == "GLN":
+         variant = "Q"
+     elif variant == "THR":
+         variant = "T"
+     elif variant == "MET":
+         variant = "M"
+     elif variant == "LYS":
+         variant = "K"
+     elif variant == "ASP":
+         variant = "D"
+     elif variant == "ILE":
+         variant = "I"
+     elif variant == "PHE":
+         variant = "F"
+     elif variant == "TRP":
+         variant = "W"
+     elif variant == "TYR":
+         variant = "Y"
+     elif variant == "HIS":
+         variant = "H"
+     elif variant == "CYS":
+         variant = "C"
+     elif variant == 'UNK':
+         variant = 'X'
+     elif variant == 'ASX':
+         variant = 'O'
+     return variant
+
+
+ def get_coords(annot, alignments, coords, resnums_for_sasa, mode):
+     if mode == 1:
+         for alignment in alignments[0]:
+             alignment = str(alignment).strip().split('\n')
+             startGap = 0
+             if alignment[0].startswith('.'):
+                 for k in alignment[0]:
+                     if k == '.' or k == '-':
+                         startGap += 1
+                     else:
+                         break
+             countGap = startGap
+             countResidue = 0
+             for j in alignment[0][startGap:]:
+                 if j == '.' or j == '-':
+                     countGap += 1
+                 else:
+                     countResidue += 1
+                 if countResidue == float(annot):
+                     break
+             countGap_pdb = 0
+             countResidue_pdb = 0
+             for m in alignment[2][0:countResidue + countGap - 1]:
+                 if m == '.' or m == '-':
+                     countGap_pdb += 1
+             posAtom = countResidue + countGap - countGap_pdb
+
+             realpdbStart = 0
+             for j in alignment[2]:
+                 if j == '.' or j == '-':
+                     realpdbStart += 1
+                 else:
+                     break
+
+             if (alignment[2][countResidue + countGap - 1] != '-') and (float(annot) >= float(realpdbStart) + 1):
+                 try:
+                     coordinates = alignments[1]
+                     residue_numbers = alignments[2]
+                     coordWeWant = coordinates[posAtom - 1]
+                     residue_number_we_want = residue_numbers[posAtom - 1]
+                 except IndexError:
+                     coordWeWant = 'nan'
+                     residue_number_we_want = 'nan'  # added; the original left this unbound on IndexError
+             else:
+                 coordWeWant = 'nan'
+                 residue_number_we_want = 'nan'  # added; same reason as above
+             return coordWeWant, posAtom, residue_number_we_want
+     if mode == 2:
+         if annot != 'nan':
+             if int(annot) <= 1400:
+                 alignment = str(alignments).strip().split('\n')
+                 startGap = 0
+                 if alignment[0].startswith('.'):
+                     for k in alignment[0]:
+                         if k == '.' or k == '-':
+                             startGap += 1
+                         else:
+                             break
+                 countGap = startGap
+                 countResidue = 0
+                 for j in alignment[0][startGap:]:
+                     if j == '.' or j == '-':
+                         countGap += 1
+                     else:
+                         countResidue += 1
+                     if countResidue == float(annot):
+                         break
+                 countGap_pdb = 0
+                 countResidue_pdb = 0
+                 for m in alignment[2][0:countResidue + countGap - 1]:
+                     if m == '.' or m == '-':
+                         countGap_pdb += 1
+                 posAtom = countResidue + countGap - countGap_pdb
+                 realpdbStart = 0
+                 for j in alignment[2]:
+                     if j == '.' or j == '-':
+                         realpdbStart += 1
+                     else:
+                         break
+                 if len(alignment[2]) > (countResidue + countGap - 1):
+                     if (alignment[2][countResidue + countGap - 1] != '-') and (float(annot) >= float(realpdbStart) + 1):
+                         try:
+                             coordinates = coords
+                             residue_numbers = resnums_for_sasa
+                             coordWeWant = coordinates[posAtom - 1]
+                             residue_number_we_want = residue_numbers[posAtom - 1]
+                         except IndexError:
+                             coordWeWant = 'nan'
+                             residue_number_we_want = 'nan'
+                     else:
+                         coordWeWant = 'nan'
+                         residue_number_we_want = 'nan'
+                     return coordWeWant, posAtom, residue_number_we_want
+                 else:
+                     coordWeWant = 'nan'
+                     residue_number_we_want = 'nan'
+                     return coordWeWant, posAtom, residue_number_we_want
+             else:
+                 return np.NaN, np.NaN, np.NaN
+         else:
+             return np.NaN, np.NaN, np.NaN
+
+
+ def get_alignments_3D(identifier, model_num, pdb_path, pdbSequence, source, chain, pdbID, mode, path_3D_alignment, file_format='gzip'):
+     if mode == 1:
+         atomSequence = ''
+         coords = []
+         resnums_for_sasa = []
+         with open(pdb_path, encoding="utf8") as f:
+             for line in f.readlines():
+                 if source != 'MODBASE':
+                     if line[0:4].strip() == 'ATOM' and line[13:15].strip() == 'CA' and line[21].upper() == chain.upper():
+                         atomSequence += threeToOne(line[17:20].strip())
+                         coords.append([line[31:38].strip(), line[39:46].strip(), line[47:54].strip()])
+                         resnums_for_sasa.append(line[22:26].strip())
+                     elif line[0:4].strip() == 'ATOM' and line[13:15].strip() == 'CA' and line[21] == ' ':
+                         atomSequence += threeToOne(line[17:20].strip())
+                         coords.append([line[31:38].strip(), line[39:46].strip(), line[47:54].strip()])
+                         resnums_for_sasa.append(line[22:26].strip())
+                 else:
+                     if line[0:7].strip() == 'ATOM' and line[13:15].strip() == 'CA':
+                         atomSequence += threeToOne(line[17:20].strip())
+                         coords.append([line[31:38].strip(), line[39:46].strip(), line[47:54].strip()])
+                         resnums_for_sasa.append(line[22:26].strip())
+
+         f = open(Path(path_3D_alignment / f'{identifier}_{pdbID}_{str(chain)}_alignment.txt'), "w")
+
+         aligner.mode = 'local'
+         aligner.substitution_matrix = substitution_matrices.load("BLOSUM62")
+         aligner.open_gap_score = -11
+         aligner.extend_gap_score = -1
+         alignments = aligner.align(pdbSequence, atomSequence)
+         alignments = list(alignments)
+         for alignment in alignments:
+             f.write(str(alignment))
+             f.write('\n')
+             f.write('\n')
+         return alignments, coords, resnums_for_sasa
+     elif mode == 2:
+         atomSequence = ''
+         coords = []
+         resnums_for_sasa = []
+         if file_format == 'txt':
+             with open(pdb_path, encoding="utf8") as f:  # the original opened an undefined name here
+                 for line in f.readlines():
+                     if line[0:4].strip() == 'ATOM' and line[13:15].strip() == 'CA':
+                         atomSequence += threeToOne(line[17:20].strip())
+                         coords.append([line[31:38].strip(), line[39:46].strip(), line[47:54].strip()])
+                         resnums_for_sasa.append(line[22:26].strip())
+                     elif line[0:4].strip() == 'ATOM' and line[13:15].strip() == 'CA' and line[21] == ' ':
+                         atomSequence += threeToOne(line[17:20].strip())
+                         coords.append([line[31:38].strip(), line[39:46].strip(), line[47:54].strip()])
+                         resnums_for_sasa.append(line[22:26].strip())
+         elif file_format == 'gzip':
+             with gzip.open(pdb_path, mode='rb') as f:
+                 for line in f:
+                     line = line.decode()
+                     if line[0:4].strip() == 'ATOM' and line[13:15].strip() == 'CA':
+                         atomSequence += threeToOne(line[17:20].strip())
+                         coords.append([line[31:38].strip(), line[39:46].strip(), line[47:54].strip()])
+                         resnums_for_sasa.append(line[22:26].strip())
+                     elif line[0:4].strip() == 'ATOM' and line[13:15].strip() == 'CA' and line[21] == ' ':
+                         atomSequence += threeToOne(line[17:20].strip())
+                         coords.append([line[31:38].strip(), line[39:46].strip(), line[47:54].strip()])
+                         resnums_for_sasa.append(line[22:26].strip())
+         f = open(Path(path_3D_alignment / f'{identifier}_{str(model_num)}_3Dalignment.txt'), "w")
+         aligner.mode = 'local'
+         aligner.substitution_matrix = substitution_matrices.load("BLOSUM62")
+         aligner.open_gap_score = -11
+         aligner.extend_gap_score = -1
+         alignments = aligner.align(pdbSequence, atomSequence)
+         alignments = list(alignments)
+         for alignment in alignments:
+             f.write(str(alignment))
+             f.write('\n')
+             f.write('\n')
+         return alignments, coords, resnums_for_sasa
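The aligner configuration used in both modes above (local alignment, BLOSUM62, gap open -11, gap extend -1) can be exercised on its own; a minimal sketch with toy sequences:

from Bio import Align
from Bio.Align import substitution_matrices

aligner = Align.PairwiseAligner()
aligner.mode = 'local'
aligner.substitution_matrix = substitution_matrices.load("BLOSUM62")
aligner.open_gap_score = -11
aligner.extend_gap_score = -1
alignments = aligner.align("MKTAYIAKQR", "KTAYIAKQ")  # toy target/query sequences
print(alignments[0].score)
print(alignments[0])  # the three-row text block that get_coords() later re-parses via str(alignment).split('\n')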
code/add_alignment.py ADDED
@@ -0,0 +1,423 @@
+ from Bio import Align
+ from Bio.Align import substitution_matrices
+ from pathlib import Path
+ import streamlit as st
+ from Bio.pairwise2 import format_alignment
+ from Bio import pairwise2
+ from Bio.SubsMat import MatrixInfo as matlist  # removed in recent Biopython releases; the Python 3.7 pin keeps an older Biopython
+
+
+ """
+ def do_alignment(identifier, uniprotSequence, pdbSequence, alignment_path):
+     aligner = Align.PairwiseAligner()
+     #print(f'Aligning Datapoint: {identifier}')
+     if len(pdbSequence) >= 1:
+         f = open(Path(alignment_path / f'{identifier}_alignment.txt'), "w")
+         aligner.mode = 'local'
+         aligner.substitution_matrix = substitution_matrices.load("BLOSUM62")
+         aligner.open_gap_score = -11
+         aligner.extend_gap_score = -1
+         alignments = aligner.align(uniprotSequence, pdbSequence)
+         alignments = list(alignments)
+
+         merge_in_threes = str(alignments[0]).split('\n')
+         K = 3
+         res = ["".join(str(alignments[0]).split('\n')[idx: idx + K]) for idx in range(len(str(alignments[0]).split('\n')) - K + 1)]
+         slice_val = slice(0, len(res), 4)
+         writtenlist = res[slice_val]
+
+         new_alignment = []
+         for i in writtenlist:
+             cont1 = list(filter(None, i.split('target')))
+             cont2 = cont1[0].split('query')
+             target_pos = (list(filter(None, cont2[0].split(' '))))[0]
+             target = (list(filter(None, cont2[0].split(' '))))[1]
+             alg_pos = (list(filter(None, cont2[0].split(' '))))[2]
+             alg = (list(filter(None, cont2[0].split(' '))))[3]
+             query_pos = (list(filter(None, cont2[1].split(' '))))[0]
+             query = (list(filter(None, cont2[1].split(' '))))[1]
+             if int(target_pos) > 0:
+                 new_target = int(target_pos) * 'X' + target
+             else:
+                 new_target = int(target_pos) * ' ' + target
+
+             if int(alg_pos) > 0:
+                 new_alg = int(target_pos) * 'X' + target
+             else:
+                 new_alg = int(target_pos) * ' ' + alg
+
+             if int(query_pos) > 0:
+                 new_query = int(target_pos) * 'X' + target
+             else:
+                 new_query = int(target_pos) * ' ' + target
+
+             new_alignment.append(new_target + '\n' + new_alg + '\n' + new_query)
+         alignment_list = []
+         k = 0
+         for alignment in new_alignment:
+             k += 1
+             st.write('COUNT', k)
+             st.write('alignment')
+             st.write(alignment)
+             f.write(str(alignment))
+             f.write('\n')
+             f.write('\n')
+             alignment = str(alignment).strip().split('\n')
+             alignment = [''.join(['.' if m == ' ' else m for m in x]) for x in alignment]
+             st.write('alignment_updated')
+             st.write(alignment)
+             alignment_list.append(alignment)
+         return alignment_list
+ """
+
+
+ def do_alignment(identifier, uniprotSequence, pdbSequence, alignment_path):
+     aligner = Align.PairwiseAligner()
+     #print(f'Aligning Datapoint: {identifier}')
+     if len(pdbSequence) >= 1:
+         f = open(Path(alignment_path / f'{identifier}_alignment.txt'), "w")
+         aligner.mode = 'local'
+         aligner.substitution_matrix = substitution_matrices.load("BLOSUM62")
+         aligner.open_gap_score = -11
+         aligner.extend_gap_score = -1
+         alignments = aligner.align(uniprotSequence, pdbSequence)
+
+         sub_matrix = matlist.blosum62
+         alignments2 = pairwise2.align.localds(uniprotSequence, pdbSequence, sub_matrix, -11, -1)  # computed but never used
+
+         alignment_list = []
+         k = 0
+         for alignment in alignments:
+             f.write(str(alignment))
+             f.write('\n')
+             f.write('\n')
+             alignment = str(alignment).strip().split('\n')
+             alignment = [''.join(['.' if m == ' ' else m for m in x]) for x in alignment]
+             alignment_list.append(alignment)
+         return alignment_list
+
+
+ def mutation_position_on_pdb(alignment_list, pos):
+     which_alignment_to_go = 0
+     pdb_alignStatus = 'nan'        # defaults added so the return below cannot hit unbound names
+     mutationPositionOnPDB = 'nan'
+     startGap = 0
+     for alignment in alignment_list:
+         #char_list = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
+         #for char in alignment[1]:
+         #    if char in char_list:
+         #        alignment[1] = alignment[1].replace(char, '.')
+
+         which_alignment_to_go += 1
+         alignment_uniprot = alignment[0]
+         alignment_pdb = alignment[2]
+         startGap = 0
+         if alignment_uniprot.startswith('.') or alignment_uniprot.startswith('-'):
+             for k in alignment_uniprot:
+                 if k == '.' or k == '-':
+                     startGap += 1
+                 else:
+                     break
+
+         countGap = startGap
+         countResidue = 0
+         canonicalRes = ' '
+         pdbRes = ' '
+         for j in alignment_uniprot[startGap:]:
+             if j == '.' or j == '-':
+                 countGap += 1
+             else:
+                 countResidue += 1
+
+             if int(countResidue) == int(pos):
+                 canonicalRes = alignment_uniprot[countResidue + countGap - 1]
+                 try:
+                     pdbRes = alignment_pdb[countResidue + countGap - 1]
+                 except IndexError:
+                     pdbRes = 'nan'
+                 break
+
+         if (alignment[1][countResidue + countGap - 1] == '|') or (alignment[1][countResidue + countGap - 1] == 'X'):
+             if canonicalRes == pdbRes:
+                 pdb_alignStatus = 'aligned'
+             elif canonicalRes != pdbRes:
+                 pdb_alignStatus = 'aligned*'
+             countGap_pdb = 0
+             countResidue_pdb = 0
+             pdbRes = ' '
+             for j in alignment_pdb[0:countResidue + countGap - 1]:
+                 if j == '.' or j == '-':
+                     countGap_pdb += 1
+             if alignment_pdb[countResidue + countGap - 1] == '.' or alignment_pdb[countResidue + countGap - 1] == '-':
+                 mutationPositionOnPDB = 'nan'
+                 posPDB = 'nan'
+             else:
+                 posPDB = countResidue + countGap - countGap_pdb
+                 mutationPositionOnPDB = str(posPDB)
+             break
+         elif (canonicalRes == pdbRes) and ((alignment[1][countResidue + countGap - 1] == '.') or
+                                            (alignment[1][countResidue + countGap - 1] == '-')):  # fixed the 'poscountResidue' typo here
+             pdb_alignStatus = 'not_aligned'
+             mutationPositionOnPDB = 'nan'
+         elif (canonicalRes != pdbRes) and ((alignment[1][countResidue + countGap - 1] == '.') or
+                                            (alignment[1][countResidue + countGap - 1] == '-')):
+             pdb_alignStatus = 'not_aligned'
+             mutationPositionOnPDB = 'nan'
+         elif alignment_pdb[countResidue + countGap - 1] == '.' or alignment_pdb[countResidue + countGap - 1] == '-':
+             mutationPositionOnPDB = 'nan'
+             posPDB = 'nan'
+
+     return (pdb_alignStatus, mutationPositionOnPDB, startGap, alignment_list[which_alignment_to_go - 1])
+
+
+ def find_position_on_pdb_for_range_annotations(posAnnotation, startGap, alignment_to_use):
+     annotation_on_pdb_start = 'nan'
+     annotation_on_pdb_end = 'nan'
+     pos1 = int(posAnnotation.split('-')[0])
+     count_gap = startGap
+     count_residue = 0
+     for j in alignment_to_use[0][startGap:]:
+         if j == '.' or j == '-':
+             count_gap += 1
+         else:
+             count_residue += 1
+         if int(count_residue) == int(pos1):  # count gaps until the first position
+             break
+     annotation_on_up_start = int(pos1) + int(count_gap)
+
+     pos2 = int(posAnnotation.split('-')[1])
+     count_gap = startGap
+     count_residue = 0
+     for j in alignment_to_use[0][startGap:]:
+         if j == '.' or j == '-':
+             count_gap += 1
+         else:
+             count_residue += 1
+         if int(count_residue) == int(pos2):  # count gaps until the second position
+             break
+
+     annotation_on_up_end = int(pos2) + int(count_gap)
+     try:
+         pdb_residue_start = alignment_to_use[2][annotation_on_up_start - 1].strip()
+         if (pdb_residue_start == '.') or (pdb_residue_start == '-'):
+             for ran in range(len(alignment_to_use[2][(annotation_on_up_start - 1):annotation_on_up_end])):
+                 if (alignment_to_use[2][(annotation_on_up_start - 1):annotation_on_up_end][ran] != '.') and \
+                         (alignment_to_use[2][(annotation_on_up_start - 1):annotation_on_up_end][ran] != '-') and \
+                         ((alignment_to_use[1][(annotation_on_up_start - 1):annotation_on_up_end][ran] == '|') or
+                          (alignment_to_use[1][(annotation_on_up_start - 1):annotation_on_up_end][ran] == 'X')):
+                     annotation_on_up_start += ran
+                     break
+         elif (pdb_residue_start != '.') and (pdb_residue_start != '-') and \
+                 ((alignment_to_use[1][annotation_on_up_start - 1] == '.') or (
+                         alignment_to_use[1][annotation_on_up_start - 1] == '-')):
+             for ran in range(len(alignment_to_use[2][(annotation_on_up_start - 1):annotation_on_up_end])):
+                 if ((alignment_to_use[1][(annotation_on_up_start - 1):annotation_on_up_end][ran] == '|') or
+                         (alignment_to_use[1][(annotation_on_up_start - 1):annotation_on_up_end][ran] == 'X')):
+                     annotation_on_up_start += ran
+                     break
+         count_gap_pdb = 0
+         if annotation_on_up_start != 'nan':
+             for q in alignment_to_use[2][0:annotation_on_up_start - 1]:
+                 if q == '.' or q == '-':
+                     count_gap_pdb += 1
+             if alignment_to_use[1][annotation_on_up_start] == '-' or alignment_to_use[1][annotation_on_up_start] == '.':
+                 annotation_on_pdb_start = 'nan'
+             else:
+                 annotation_on_pdb_start = int(annotation_on_up_start) - count_gap_pdb
+         else:
+             annotation_on_pdb_start = 'nan'
+     except IndexError:
+         pass
+     try:
+         pdb_residue_end = alignment_to_use[2][annotation_on_up_end - 1].strip()
+         if pdb_residue_end == '.' or pdb_residue_end == '-':
+             for ran in range(len(alignment_to_use[2][(annotation_on_up_start - 1):annotation_on_up_end])):
+                 if ((alignment_to_use[1][annotation_on_up_start - 1:annotation_on_up_end][ran] == '.') or
+                         (alignment_to_use[1][(annotation_on_up_start - 1):][ran] == '-')):
+                     annotation_on_up_start += (ran - 1)
+                     annotation_on_up_end = annotation_on_up_start
+                     break
+         elif (pdb_residue_end != '.') and (pdb_residue_end != '-') and \
+                 ((alignment_to_use[1][annotation_on_up_end - 1] == '.') or (
+                         alignment_to_use[1][annotation_on_up_end - 1] == '-')):
+             for ran in range(len(alignment_to_use[2][(annotation_on_up_start - 1):annotation_on_up_end])):
+                 if ((alignment_to_use[1][annotation_on_up_start - 1:annotation_on_up_end][ran] == '.') or
+                         (alignment_to_use[1][(annotation_on_up_start - 1):][ran] == '-')):
+                     annotation_on_up_start += (ran - 1)
+                     annotation_on_up_end = annotation_on_up_start
+                     break
+         count_gap_pdb = 0
+         if annotation_on_up_end != 'nan':
+             for q in alignment_to_use[2][0:annotation_on_up_end - 1]:
+                 if q == '.' or q == '-':
+                     count_gap_pdb += 1
+             if alignment_to_use[1][annotation_on_up_end - 1] == '-' or alignment_to_use[1][
+                     annotation_on_up_end - 1] == '.' and annotation_on_pdb_start == 'nan':
+                 annotation_on_pdb_end = 'nan'
+             elif alignment_to_use[1][annotation_on_up_end - 1] == '-' or alignment_to_use[1][
+                     annotation_on_up_end - 1] == '.' and annotation_on_pdb_start != 'nan':
+                 annotation_on_pdb_end = int(annotation_on_up_end) - count_gap_pdb
+             else:
+                 annotation_on_pdb_end = int(annotation_on_up_end) - count_gap_pdb
+         else:
+             annotation_on_pdb_end = 'nan'
+     except IndexError:
+         # If, say, isoform 2 matches over 100 residues while the canonical is 150 aa long,
+         # an annotation at position 105 raises an IndexError for the isoform.
+         pass
+
+     if annotation_on_pdb_start == 'nan' and annotation_on_pdb_end != 'nan':
+         annotation_on_pdb_start = annotation_on_up_start - count_gap_pdb
+         if annotation_on_pdb_start == annotation_on_pdb_end:
+             annotation_on_pdb_start = 'nan'
+             annotation_on_pdb_end = 'nan'
+     return annotation_on_up_start, annotation_on_up_end, annotation_on_pdb_start, annotation_on_pdb_end
+
+
+ def annotation_pos_on_pdb(annot_positions, startGap, alignment_to_use, identifier):
+     newpos = []
+     if annot_positions != 'nan':
+         annot_positions = str(annot_positions).replace("'", '')
+         annot_positions = str(annot_positions).replace('[', '')
+         annot_positions = str(annot_positions).replace(']', '')
+         positionList_perAnnotation = annot_positions.split(',')
+         positionList_perAnnotation = [h.strip() for h in positionList_perAnnotation]
+
+         position_start_on_pdb = 'nan'
+         position_end_on_pdb = 'nan'
+         try:
+             positionList_perAnnotation = [i for i in positionList_perAnnotation if i != 'nan']
+         except TypeError:
+             pass
+         for position in range(len(positionList_perAnnotation)):
+             if ('-' not in str(positionList_perAnnotation[position])) and (str(positionList_perAnnotation[position]) != '?') and (str(positionList_perAnnotation[position]) != '') and (len(str(positionList_perAnnotation[position])) != 0):
+                 count_gap = startGap
+                 count_residue = 0
+                 for j in alignment_to_use[0][startGap:]:
+                     if j == '.' or j == '-':
+                         count_gap += 1
+                     else:
+                         count_residue += 1
+                     try:
+                         if int(count_residue) == int(positionList_perAnnotation[position]):
+                             break
+                     except ValueError:
+                         pass
+
+                 annotation_on_up = int(positionList_perAnnotation[position]) + int(count_gap)
+                 try:
+                     pdb_residue_start = alignment_to_use[2][annotation_on_up - 1].strip()
+                 except IndexError:
+                     pdb_residue_start = 'nan'
+                 if pdb_residue_start != 'nan':
+                     try:
+                         if (pdb_residue_start == '.') or (pdb_residue_start == '-'):
+                             for ran in range(len(alignment_to_use[2][(annotation_on_up - 1):annotation_on_up])):
+                                 if (alignment_to_use[2][(annotation_on_up - 1):annotation_on_up][ran] != '.') and \
+                                         (alignment_to_use[2][(annotation_on_up - 1):annotation_on_up][ran] != '-') and \
+                                         ((alignment_to_use[1][(annotation_on_up - 1):annotation_on_up][ran] == '|') or
+                                          (alignment_to_use[1][(annotation_on_up - 1):annotation_on_up][ran] == 'X')):
+                                     annotation_on_up += ran
+                                     break
+                         elif (pdb_residue_start != '.') and (pdb_residue_start != '-') and \
+                                 ((alignment_to_use[1][annotation_on_up - 1] == '.') or (
+                                         alignment_to_use[1][annotation_on_up - 1] == '-')):
+                             for ran in range(len(alignment_to_use[2][(annotation_on_up - 1):annotation_on_up])):
+                                 if ((alignment_to_use[1][(annotation_on_up - 1):annotation_on_up][ran] == '|') or
+                                         (alignment_to_use[1][(annotation_on_up - 1):annotation_on_up][ran] == 'X')):
+                                     annotation_on_up += ran
+                                     break
+                         count_gap_pdb = 0
+                         for q in alignment_to_use[2][0:annotation_on_up - 1]:
+                             if q == '.' or q == '-':
+                                 count_gap_pdb += 1
+                         if alignment_to_use[1][annotation_on_up] == '-' or alignment_to_use[1][annotation_on_up] == '.':
+                             annotation_on_pdb = 'nan'
+                         else:
+                             annotation_on_pdb = int(annotation_on_up) - count_gap_pdb
+
+                         if count_gap_pdb == annotation_on_up:
+                             annotation_on_pdb = 'nan'
+                         try:
+                             if alignment_to_use[2][count_gap_pdb + annotation_on_pdb - 1] == '.' or alignment_to_use[2][
+                                     count_gap_pdb + annotation_on_pdb - 1] == '-':
+                                 annotation_on_pdb = 'nan'
+                         except (IndexError, TypeError):  # TypeError covers the 'nan' string case the bare except used to swallow
+                             annotation_on_pdb = 'nan'
+                     except IndexError:
+                         annotation_on_pdb = 'nan'
+
+                     newpos.append(annotation_on_pdb)
+
+             elif ('-' in str(positionList_perAnnotation[position])) and (
+                     str(positionList_perAnnotation[position]) != '?') and (
+                     str(positionList_perAnnotation[position]) != ' ') and (
+                     len(str(positionList_perAnnotation[position])) != 0):
+                 try:
+                     position_start_on_pdb = \
+                         find_position_on_pdb_for_range_annotations(positionList_perAnnotation[position],
+                                                                    startGap, alignment_to_use)[2]
+                     position_end_on_pdb = \
+                         find_position_on_pdb_for_range_annotations(positionList_perAnnotation[position],
+                                                                    startGap, alignment_to_use)[3]
+                 except ValueError:
+                     pass
+                 newpositions = str(position_start_on_pdb) + '-' + str(position_end_on_pdb)
+                 newpos.append(newpositions)
+             else:
+                 pass
+     try:
+         newpos = [i for i in newpos if i != 'nan']
+     except TypeError:
+         pass
+     return newpos
+
+
+ def final_stage(df, annotation_list, alignment_path):
+     for i in df.index:
+         identifier = df.at[i, 'uniprotID'] + '_' + df.at[i, 'pdbID'] + '_' + df.at[i, 'chain'] + '_'
+         alignment_list = do_alignment(identifier, df.at[i, 'uniprotSequence'], df.at[i, 'pdbSequence'], alignment_path)
+         df.at[i, 'pdb_alignStatus'] = mutation_position_on_pdb(alignment_list, df.at[i, 'pos'])[0]
+         df.at[i, 'mutationPositionOnPDB'] = mutation_position_on_pdb(alignment_list, df.at[i, 'pos'])[1]
+         startGap = mutation_position_on_pdb(alignment_list, df.at[i, 'pos'])[2]
+         alignment_to_use = mutation_position_on_pdb(alignment_list, df.at[i, 'pos'])[3]
+         for annot in annotation_list:
+             df.at[i, annot] = annotation_pos_on_pdb(df.at[i, annot], startGap, alignment_to_use, identifier)
+         if str(df.at[i, 'domStart']) != 'nan' and str(df.at[i, 'domEnd']) != 'nan' and \
+                 ((str(df.at[i, 'domStart']) != '-1' and str(df.at[i, 'domEnd']) != '-1' and
+                   str(df.at[i, 'domStart']) != '-1.0' and str(df.at[i, 'domEnd']) != '-1.0')):
+             domainLoc = str(df.at[i, 'domStart']).split('.')[0] + '-' + str(df.at[i, 'domEnd']).split('.')[0]
+             domain_pos = find_position_on_pdb_for_range_annotations(domainLoc, startGap, alignment_to_use)
+             df.at[i, 'domainStartonPDB'] = domain_pos[2]
+             df.at[i, 'domainEndonPDB'] = domain_pos[3]
+         elif str(df.at[i, 'domStart']) != '-1' or str(df.at[i, 'domEnd']) != '-1' or \
+                 str(df.at[i, 'domStart']) != '-1.0' or str(df.at[i, 'domEnd']) != '-1.0':
+             df.at[i, 'domainStartonPDB'] = 'nan'
+             df.at[i, 'domainEndonPDB'] = 'nan'
+
+     df = df.astype(str)
+     return df
+
+
+ def alignment(dataframe_to_align, annotation_list, alignment_path):
+     domainList = ['domStart', 'domEnd']
+     result = final_stage(dataframe_to_align, annotation_list, alignment_path)
+     return result
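mutation_position_on_pdb and the two annotation mappers above all rely on the same bookkeeping: walk the UniProt row of the alignment counting residues and gap characters ('.' or '-'), then subtract the gaps seen on the PDB row to convert a UniProt position into a PDB position. A toy illustration of that arithmetic, using the module's convention that row 0 is the UniProt sequence and row 2 the PDB sequence:

uniprot_row = "MKTAYIAKQR"   # UniProt side of a toy alignment (no gaps here)
pdb_row     = "--TAYIAKQR"   # PDB side with two leading gaps

pos = 5                                                           # 1-based UniProt position (the 'Y')
gaps_before = pdb_row[:pos].count('-') + pdb_row[:pos].count('.')
pdb_pos = pos - gaps_before                                       # 5 - 2 = 3: third residue present on the PDB row
print(pdb_pos)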
code/add_annotations.py ADDED
@@ -0,0 +1,95 @@
+ import ssl
+ import requests as r
+ from decimal import *
+ import numpy as np
+
+
+ def add_annotations(dataframe):
+     print('Downloading UniProt sequence annotations...\n')
+     ssl._create_default_https_context = ssl._create_unverified_context
+
+     original_annot_name = ['DISULFID', 'INIT_MET', 'INTRAMEM', 'VARIANT', 'DNA_BIND', 'ACT_SITE', 'NP_BIND', 'LIPID',
+                            'SITE',
+                            'TRANSMEM', 'CROSSLNK', 'MUTAGEN', 'STRAND', 'HELIX', 'TURN', 'METAL', 'REPEAT', 'TOPO_DOM',
+                            'CA_BIND', 'BINDING', 'REGION', 'SIGNAL', 'MOD_RES', 'ZN_FING', 'MOTIF', 'COILED', 'PEPTIDE',
+                            'TRANSIT', 'CARBOHYD', 'PROPEP']
+     annotation_list = ['disulfide', 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
+                        'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis', 'strand',
+                        'helix', 'turn', 'metalBinding', 'repeat', 'topologicalDomain', 'caBinding', 'bindingSite',
+                        'region',
+                        'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', 'peptide',
+                        'transitPeptide', 'glycosylation', 'propeptide']
+
+     dataframe = dataframe.reset_index().drop(['index'], axis=1)
+
+     for annot in original_annot_name:
+         dataframe[annot] = ''
+
+     for protein in list(set(dataframe.uniprotID.to_list())):
+         print('Downloading annotations for ' + protein)
+         uniprot_entry = r.get("http://www.uniprot.org/uniprot/" + protein + ".txt")
+         uniprot_entry = uniprot_entry.text.split('\n')
+
+         annot_for_protein = []
+         for annotation in original_annot_name:
+             for line in uniprot_entry:
+                 if annotation.strip() in line and line.startswith(
+                         'FT') and 'evidence' not in line and 'ECO' not in line and 'note' not in line:
+                     annot_for_protein.append(list(filter(None, line.split(' ')))[1:])
+         for select in annot_for_protein:
+             if select[0] not in dataframe.columns:
+                 dataframe.loc[dataframe.uniprotID == protein, select[0]] = str(select[1] + '; ')
+             else:
+                 dataframe.loc[dataframe.uniprotID == protein, select[0]] += str(select[1] + '; ')
+     for i in range(len(original_annot_name)):
+         dataframe = dataframe.rename(columns={original_annot_name[i]: annotation_list[i]})
+
+     # Fix annotation positions
+     print('Processing positions...\n')
+     for i in dataframe.index:
+         for annot in dataframe.columns[-30:]:
+             if annot != 'disulfide':
+                 if dataframe.at[i, annot] != 'nan':
+                     dataframe.at[i, annot] = [x for x in [k.strip() for k in dataframe.at[i, annot].split(';')] if x]
+                     if '..' not in str(dataframe.at[i, annot]):
+                         pass
+                     elif '..' in str(dataframe.at[i, annot]):
+                         dataframe.at[i, annot] = str(dataframe.at[i, annot]).replace('..', '-')
+             else:
+                 disulfide_annot = []
+                 if dataframe.at[i, annot] != 'nan':
+                     dataframe.at[i, annot] = dataframe.at[i, annot].split(';')
+                     dataframe.at[i, annot] = [i.split('..') for i in dataframe.at[i, annot]]
+                     dataframe.at[i, annot] = [e for v in dataframe.at[i, annot] for e in v]
+                     dataframe.at[i, annot] = [i for i in dataframe.at[i, annot] if i != ' ']
+
+     # Add binary annotations
+     print('Adding binary annotations...\n')
+     dataframe = dataframe.astype('str')
+     for i in dataframe.index:
+         for k in annotation_list:  # get the positions of each attribute as a list
+             txt = k + 'Binary'
+             dataframe.at[i, txt] = Decimal('nan')
+             try:
+                 for positions in dataframe.at[i, k].split(','):
+                     position = positions.strip('[').strip(']').replace("'", "")
+                     if position != 'nan' and position != '' and '-' not in position and int(
+                             dataframe.at[i, 'pos']) == int(position):
+                         dataframe.at[i, txt] = '1'
+                         break
+                     elif position != 'nan' and position != '' and '-' not in position and int(
+                             dataframe.at[i, 'pos']) != int(position):
+                         dataframe.at[i, txt] = '0'
+                     elif position != 'nan' and position != '' and '-' in position:
+                         if int(position.split('-')[0]) < int(dataframe.at[i, 'pos']) < int(position.split('-')[1]):
+                             dataframe.at[i, txt] = '1'
+                             break
+                         else:
+                             dataframe.at[i, txt] = '0'
+             except ValueError:
+                 pass
+
+     # Final corrections
+     dataframe = dataframe.replace({'[\'?\']': 'nan'})
+     dataframe = dataframe.replace({'[]': 'nan'})
+     return dataframe
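add_annotations scrapes FT (feature table) lines out of the UniProt flat-text entry. The request/filter step in isolation, for a single feature type (network access assumed; P04637 is only an example accession, and the legacy www.uniprot.org URL used above now redirects to rest.uniprot.org):

import requests

entry = requests.get("https://www.uniprot.org/uniprot/P04637.txt").text
ft_lines = [line for line in entry.split('\n')
            if line.startswith('FT') and 'DISULFID' in line
            and 'evidence' not in line and 'ECO' not in line and 'note' not in line]
print(ft_lines[:3])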
code/add_domains.py ADDED
@@ -0,0 +1,57 @@
+ from collections import Counter
+ import pandas as pd
+
+
+ def add_domains(data, path_to_domains):
+     domains = pd.read_csv(path_to_domains, delimiter=' ')
+     data = data.merge(domains, right_on='proteinID', left_on='uniprotID', how='left')
+     data = data.drop(['proteinID'], axis=1)
+     # Label each data point as range or notRange based on the relative distance of the mutation and the domain boundaries.
+     data = data.astype('str')
+     data.domStart = data.domStart.astype('float')
+     data.domEnd = data.domEnd.astype('float')
+
+     for i in data.index:
+         if data.at[i, 'domain'] != 'nan':
+             if int(data.at[i, 'domStart']) <= int(data.at[i, 'pos']) <= int(data.at[i, 'domEnd']):
+                 data.at[i, 'distance'] = 0
+             else:
+                 distance = min(abs(int(data.at[i, 'domStart']) - int(data.at[i, 'pos'])),
+                                abs(int(data.at[i, 'domEnd']) - int(data.at[i, 'pos'])))
+                 data.at[i, 'distance'] = int(distance)
+         else:
+             data.at[i, 'distance'] = 'nan'
+
+     data = data.sort_values(by=['datapoint', 'distance']).reset_index(drop=True)  # Distances will be sorted.
+
+     # Keep the one with the smallest distance. A datapoint may have more than one in-range domain (distance = 0),
+     # so first separate the in-range rows; taking the first occurrence for the out-of-range rows then
+     # cannot make other distance=0 rows disappear.
+
+     data_range = data[data.distance == 0]
+     data_out_range = data[data.distance != 0]
+
+     # For the in-range rows, find the most frequent domain.
+
+     dom = []
+     for i in data_range.index:
+         dom.append(data_range.at[i, 'domain'])
+
+     domainCount = Counter(dom)  # Occurrences of each domain.
+
+     # For out-of-range rows, take the closest distance.
+     data_out_range = data_out_range.drop_duplicates(['datapoint'], keep='first')  # Already sorted above.
+     domain_counts = pd.DataFrame(domainCount.items(), columns=['domain', 'count'])
+     data_range_counts = data_range.merge(domain_counts, on='domain')
+     data_range_counts = data_range_counts.sort_values(['datapoint', 'count'])
+     data_range_counts = data_range_counts.drop_duplicates(['datapoint'], keep='last')  # Keep the higher count.
+     data_range_counts = data_range_counts.drop(['count'], axis=1)
+
+     # Merge them back together.
+
+     frames = [data_range_counts, data_out_range]
+     data = pd.concat(frames, sort=False)  # Concatenating the two frames can leave both an in-range and an
+     # out-of-range row with minimum distance for the same datapoint; drop the out-of-range one.
+     data = data.sort_values(['datapoint', 'distance']).reset_index(drop=True)
+     data = data.drop_duplicates(['datapoint'], keep='first')
+     data = data.astype(str)
+     return data
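The selection above keeps, per datapoint, a domain that contains the mutation (distance 0, ties broken by how often the domain occurs) and otherwise the closest domain. The core sort-then-deduplicate pattern on a toy frame:

import pandas as pd

data = pd.DataFrame({'datapoint': ['d1', 'd1', 'd2'],
                     'domain':    ['PF00001', 'PF00002', 'PF00003'],   # toy domain IDs
                     'distance':  [12, 0, 4]})
data = data.sort_values(['datapoint', 'distance'])
closest = data.drop_duplicates(['datapoint'], keep='first')   # smallest distance wins per datapoint
print(closest)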
code/add_interface_pos.py ADDED
@@ -0,0 +1,35 @@
+ def get_interface_positions(dataframe, column1, column2):
+     interface_positions = {}
+     for i in dataframe.index:
+         if dataframe.at[i, column1] not in interface_positions and dataframe.at[i, column1 + '_IRES'] != '[]':
+             interface_positions[dataframe.at[i, column1]] = dataframe.at[i, str(column1 + '_IRES')]
+         elif dataframe.at[i, column1] in interface_positions and dataframe.at[i, column1 + '_IRES'] != '[]':
+             interface_positions[dataframe.at[i, column1]] = interface_positions[dataframe.at[i, column1]].strip(
+                 ']') + ',' + (dataframe.at[i, str(column1 + '_IRES')]).strip('[')
+         if dataframe.at[i, column2] not in interface_positions and dataframe.at[i, column2 + '_IRES'] != '[]':
+             interface_positions[dataframe.at[i, column2]] = dataframe.at[i, str(column2 + '_IRES')]
+         elif dataframe.at[i, column2] in interface_positions and dataframe.at[i, column2 + '_IRES'] != '[]':
+             interface_positions[dataframe.at[i, column2]] = interface_positions[dataframe.at[i, column2]].strip(
+                 ']') + ',' + (dataframe.at[i, str(column2 + '_IRES')]).strip('[')
+
+     try:
+         for key, value in interface_positions.items():
+             n = []
+             m = []
+             if value != '[]':
+                 valueList = value.split(',')
+                 valueList[0] = str(valueList[0]).strip('[')
+                 valueList[-1] = str(valueList[-1]).strip(']')
+                 for val in valueList:
+                     if '-' in val:
+                         for r in range(int(val.split('-')[0]), int(val.split('-')[1]) + 1):
+                             n.append(r)
+                     else:
+                         m.append(int(val))
+                 fin = m + n
+                 interface_positions[key] = fin
+     except ValueError:
+         pass
+
+     return interface_positions
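get_interface_positions ultimately expands interface residue strings such as '[3-5,9]' into explicit position lists. The expansion step in isolation, on a toy value with the brackets already stripped:

value = '3-5,9'
positions = []
for val in value.split(','):
    if '-' in val:
        start, end = val.split('-')
        positions.extend(range(int(start), int(end) + 1))
    else:
        positions.append(int(val))
print(positions)   # [3, 4, 5, 9]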
code/add_sasa.py ADDED
@@ -0,0 +1,131 @@
+ import glob
+ import ssbio.utils
+ import subprocess
+ import ssbio
+ import os.path as op
+ from add_3Dalignment import *
+ import os
+ from pathlib import Path
+ import gzip
+ import shutil
+ import streamlit as st
+
+
+ def run_freesasa(infile, outfile, include_hetatms=True, outdir=None, force_rerun=False, file_type='gzip'):
+     if not outdir:
+         outdir = ''
+     outfile = op.join(outdir, outfile)
+     if file_type == 'pdb':
+         if ssbio.utils.force_rerun(flag=force_rerun, outfile=outfile):
+             if include_hetatms:
+                 shell_command = 'freesasa --format=rsa --hetatm {} -o {}'.format(infile, outfile)
+             else:
+                 shell_command = 'freesasa --format=rsa {} -o {}'.format(infile, outfile)
+             command = subprocess.Popen(shell_command,
+                                        stdout=subprocess.PIPE,
+                                        stderr=subprocess.PIPE,
+                                        shell=True)
+             out, err = command.communicate()
+     elif file_type == 'gzip':
+         with gzip.open(infile, 'rb') as f_in:
+             with open('file_temp.pdb', 'wb') as f_out:
+                 shutil.copyfileobj(f_in, f_out)
+
+         infile = 'file_temp.pdb'
+
+         if ssbio.utils.force_rerun(flag=force_rerun, outfile=outfile):
+             if include_hetatms:
+                 shell_command = 'freesasa --format=rsa --hetatm {} -o {}'.format(infile, outfile)
+             else:
+                 shell_command = 'freesasa --format=rsa {} -o {}'.format(infile, outfile)
+             command = subprocess.Popen(shell_command,
+                                        stdout=subprocess.PIPE,
+                                        stderr=subprocess.PIPE,
+                                        shell=True)
+             out, err = command.communicate()
+     return outfile
+
+
+ def calculate_freesasa(ID, model_num, existing_free_sasa, path_to_input, path_to_output_files, file_type='gzip'):
+     print('Calculating surface area...\n')
+     file_base = str(Path(path_to_input / '*'))
+     file_str = glob.glob(file_base)[0].split('-')[-1].split('.')[0]
+     if file_type == 'gzip':
+         if ID not in existing_free_sasa:
+             fullID = f'AF-{ID}-F{model_num}-{file_str}.pdb.gz'
+             run_freesasa(Path(path_to_input / fullID),
+                          Path(path_to_output_files / f'freesasa_files/{fullID}.txt'), include_hetatms=True,
+                          outdir=None, force_rerun=False)
+     elif file_type == 'pdb':
+         if ID not in existing_free_sasa:
+             fullID = f'AF-{ID}-F{model_num}-model_v1.pdb'
+             run_freesasa(Path(path_to_input / fullID),
+                          Path(path_to_output_files / f'freesasa_files/{fullID}.txt'), include_hetatms=True,
+                          outdir=None, force_rerun=False)
+
+
+ def sasa(source, pdbID, uniprotID, sasa_pos, wt, mode, path_to_output_files, file_type='gzip'):
+     if mode == 1:
+         sasa = 'nan'
+         for filename in list(Path(path_to_output_files / 'freesasa_files').glob("*")):
+             if source == 'PDB':
+                 fname = str(filename).split('.')[0].split('/')[-1].upper()
+             elif source == 'MODBASE':
+                 fname = str(filename).split('.')[0].split('/')[-1]
+             elif source == 'SWISSSMODEL':
+                 fname = str(filename).split('_')[2]
+             if pdbID == fname:
+                 files = open(filename, 'r')
+                 file = files.readlines()
+                 for k in file:
+                     if k.strip()[10:13] == sasa_pos:
+                         residue = str(k[4:7].strip())
+                         if wt == threeToOne(residue):
+                             sasa = str(k[22:28]).strip('\n')
+                             return sasa
+                         elif wt != threeToOne(residue):
+                             sasa = str(k[22:28]).strip('\n') + '*'
+                             return sasa
+             else:
+                 return 'nan'
+
+     if mode == 2:
+         if sasa_pos is not np.NaN:  # the original used !=, which is always True for NaN
+             sasa = 'nan'
+             if file_type == 'pdb':
+                 for filename in list(Path(path_to_output_files / 'freesasa_files').glob("*")):
+                     fname = list(filter(None, str(filename).split('.')))[0].split('/')[-1].upper()  # the original called .split on a list here
+                     if uniprotID == fname:
+                         files = open(filename, 'r')
+                         file = files.readlines()
+                         for k in file:
+                             if k.strip()[10:13] == sasa_pos:
+                                 residue = str(k[4:7].strip())
+                                 if wt == threeToOne(residue):
+                                     sasa = str(k[22:28]).strip('\n')
+                                 elif wt != threeToOne(residue):
+                                     sasa = str(k[22:28]).strip('\n') + '*'
+                 return sasa
+             elif file_type == 'gzip':
+                 for filename in list(Path(path_to_output_files / 'freesasa_files').glob("*")):
+                     fname = list(filter(None, str(filename).split('.')))[0].split('/')[-1].split('-')[1].upper()
+                     if uniprotID == fname:
+                         files = open(filename, 'r')
+                         file = files.readlines()
+                         for k in file:
+                             if str(k.strip()[10:13]) == str(sasa_pos):
+                                 residue = str(k[4:7].strip())
+                                 if wt == threeToOne(residue):
+                                     sasa = str(k[22:28]).strip('\n')
+                                 elif wt != threeToOne(residue):
+                                     sasa = str(k[22:28]).strip('\n') + '*'
+                             else:
+                                 sasa = 'nan'
+                 return sasa
+         else:
+             sasa = 'nan'
+             return sasa
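run_freesasa gunzips the model to a temporary PDB and shells out to the freesasa CLI with the same flags every time. The same pipeline as a condensed sketch (assumes the freesasa binary is on PATH and that the example input file exists):

import gzip
import shutil
import subprocess

src = 'AF-A0A0A0MRZ7-F1-model_v1.pdb.gz'   # one of the bundled AlphaFold models, used here only as an example
with gzip.open(src, 'rb') as f_in, open('file_temp.pdb', 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)        # decompress to a temporary PDB, as run_freesasa does

proc = subprocess.run(['freesasa', '--format=rsa', '--hetatm', 'file_temp.pdb', '-o', 'sasa_out.txt'],
                      capture_output=True, text=True)
print(proc.returncode, proc.stderr[:200])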
code/add_sequence.py ADDED
@@ -0,0 +1,44 @@
+ import requests as r
+ from io import StringIO
+ from Bio import SeqIO
+ import xml.etree.ElementTree as ET
+
+
+ def get_uniprot_seq(protein_id):
+     print('Fetching UniProt Sequences for ID: ', protein_id)
+     baseUrl = "http://www.uniprot.org/uniprot/"
+     currentUrl = baseUrl + protein_id + ".fasta"
+     response = r.post(currentUrl)
+     cData = ''.join(response.text)
+     Seq = StringIO(cData)
+     pSeq = list(SeqIO.parse(Seq, 'fasta'))
+     try:
+         return str(pSeq[0].seq)
+     except IndexError:
+         return str('')
+
+
+ def get_isoforms(protein_id):
+     print('Fetching UniProt Isoforms for ID: ', protein_id)
+     try:
+         # a dictionary storing the sequences of the isoforms; key: accession number, value: sequence
+         isoforms = dict()
+         # make a call to the EBI API
+         req = r.get('https://www.ebi.ac.uk/proteins/api/proteins/{}/isoforms'.format(protein_id))
+         # parse the returned XML
+         uniprot = ET.fromstring(req.text)
+         for isoform in uniprot:
+             # get the sequence
+             seq = isoform.find('{http://uniprot.org/uniprot}sequence')
+             # get the accession number
+             iso_accession = isoform.find('{http://uniprot.org/uniprot}accession')
+             # add the values to the dictionary
+             if seq.text and iso_accession.text:
+                 isoforms[iso_accession.text] = seq.text
+         return isoforms
+     except AttributeError:
+         isoforms = {}
+         return isoforms
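Both helpers are plain functions over public UniProt/EBI endpoints, so a quick usage sketch is just (network access assumed; P04637 chosen only as an example accession):

seq = get_uniprot_seq('P04637')    # canonical sequence as a plain string, '' on failure
isoforms = get_isoforms('P04637')  # dict mapping isoform accession -> sequence
print(len(seq), list(isoforms)[:2])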
code/add_structure.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ import re
+ import time
+ import json
+ import zlib
+ from xml.etree import ElementTree
+ from urllib.parse import urlparse, parse_qs, urlencode
+ import requests
+ from requests.adapters import HTTPAdapter, Retry
+ from unipressed import IdMappingClient
+
+ ## Code adapted from UniProt documentation.
+ def get_pdb_ids_2(protein_id):
+     POLLING_INTERVAL = 5
+     API_URL = "https://rest.uniprot.org"
+
+     retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504])
+     session = requests.Session()
+     session.mount("https://", HTTPAdapter(max_retries=retries))
+
+     def check_response(response):
+         try:
+             response.raise_for_status()
+         except requests.HTTPError:
+             print(response.json())
+             raise
+
+     def submit_id_mapping(from_db, to_db, ids):
+         request = requests.post(
+             f"{API_URL}/idmapping/run",
+             data={"from": from_db, "to": to_db, "ids": ids},
+         )
+         check_response(request)
+         return request.json()["jobId"]
+
+     def get_next_link(headers):
+         re_next_link = re.compile(r'<(.+)>; rel="next"')
+         if "Link" in headers:
+             match = re_next_link.match(headers["Link"])
+             if match:
+                 return match.group(1)
+
+     def check_id_mapping_results_ready(job_id):
+         while True:
+             request = session.get(f"{API_URL}/idmapping/status/{job_id}")
+             check_response(request)
+             j = request.json()
+             if "jobStatus" in j:
+                 if j["jobStatus"] == "RUNNING":
+                     print(f"Retrying in {POLLING_INTERVAL}s")
+                     time.sleep(POLLING_INTERVAL)
+                 else:
+                     raise Exception(j["jobStatus"])
+             else:
+                 return bool(j["results"] or j["failedIds"])
+
+     def get_batch(batch_response, file_format, compressed):
+         batch_url = get_next_link(batch_response.headers)
+         while batch_url:
+             batch_response = session.get(batch_url)
+             batch_response.raise_for_status()
+             yield decode_results(batch_response, file_format, compressed)
+             batch_url = get_next_link(batch_response.headers)
+
+     def combine_batches(all_results, batch_results, file_format):
+         if file_format == "json":
+             for key in ("results", "failedIds"):
+                 if key in batch_results and batch_results[key]:
+                     all_results[key] += batch_results[key]
+         elif file_format == "tsv":
+             return all_results + batch_results[1:]
+         else:
+             return all_results + batch_results
+         return all_results
+
+     def get_id_mapping_results_link(job_id):
+         url = f"{API_URL}/idmapping/details/{job_id}"
+         request = session.get(url)
+         check_response(request)
+         return request.json()["redirectURL"]
+
+     def decode_results(response, file_format, compressed):
+         if compressed:
+             decompressed = zlib.decompress(response.content, 16 + zlib.MAX_WBITS)
+             if file_format == "json":
+                 j = json.loads(decompressed.decode("utf-8"))
+                 return j
+             elif file_format == "tsv":
+                 return [line for line in decompressed.decode("utf-8").split("\n") if line]
+             elif file_format == "xlsx":
+                 return [decompressed]
+             elif file_format == "xml":
+                 return [decompressed.decode("utf-8")]
+             else:
+                 return decompressed.decode("utf-8")
+         elif file_format == "json":
+             return response.json()
+         elif file_format == "tsv":
+             return [line for line in response.text.split("\n") if line]
+         elif file_format == "xlsx":
+             return [response.content]
+         elif file_format == "xml":
+             return [response.text]
+         return response.text
+
+     def get_xml_namespace(element):
+         m = re.match(r"\{(.*)\}", element.tag)
+         return m.groups()[0] if m else ""
+
+     def merge_xml_results(xml_results):
+         merged_root = ElementTree.fromstring(xml_results[0])
+         for result in xml_results[1:]:
+             root = ElementTree.fromstring(result)
+             for child in root.findall("{http://uniprot.org/uniprot}entry"):
+                 merged_root.insert(-1, child)
+         ElementTree.register_namespace("", get_xml_namespace(merged_root[0]))
+         return ElementTree.tostring(merged_root, encoding="utf-8", xml_declaration=True)
+
+     def get_id_mapping_results_search(url):
+         parsed = urlparse(url)
+         query = parse_qs(parsed.query)
+         file_format = query["format"][0] if "format" in query else "json"
+         if "size" in query:
+             size = int(query["size"][0])
+         else:
+             size = 500
+         query["size"] = size
+         compressed = (
+             query["compressed"][0].lower() == "true" if "compressed" in query else False
+         )
+         parsed = parsed._replace(query=urlencode(query, doseq=True))
+         url = parsed.geturl()
+         request = session.get(url)
+         check_response(request)
+         results = decode_results(request, file_format, compressed)
+         total = int(request.headers["x-total-results"])
+         for i, batch in enumerate(get_batch(request, file_format, compressed), 1):
+             results = combine_batches(results, batch, file_format)
+         if file_format == "xml":
+             return merge_xml_results(results)
+         return results
+
+     job_id = submit_id_mapping(
+         from_db="UniProtKB_AC-ID", to_db="PDB", ids=protein_id
+     )
+     if check_id_mapping_results_ready(job_id):
+         link = get_id_mapping_results_link(job_id)
+         results = get_id_mapping_results_search(link)
+         # Equivalently using the stream endpoint which is more demanding
+         # on the API and so is less stable:
+         # results = get_id_mapping_results_stream(link)
+         return [i['to'] for i in results['results']]
+
+
+ def get_pdb_ids(protein_id):
+     try:
+         request = IdMappingClient.submit(
+             source="UniProtKB_AC-ID", dest="PDB", ids={protein_id})
+         time.sleep(2.0)
+         pdb_list = list(request.each_result())
+         return [i['to'] for i in pdb_list]
+     except requests.exceptions.HTTPError:
+         # fall back to the REST ID-mapping job and return its result instead of discarding it
+         return get_pdb_ids_2(protein_id)
+     except KeyError:
+         return get_pdb_ids_2(protein_id)
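A usage sketch (illustrative accession; the returned list depends on current PDB holdings):

    pdb_ids = get_pdb_ids('P00439')
    print(pdb_ids)  # a list of PDB entry identifiers mapped to this accession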
code/alphafold_featureVector.py ADDED
@@ -0,0 +1,579 @@
+ # IMPORT NECESSARY MODULES AND LIBRARIES
+ from timeit import default_timer as timer
+ import xml.etree.ElementTree as ET
+ from collections import Counter
+ from bs4 import BeautifulSoup
+ from io import StringIO
+ from decimal import *
+ import pandas as pd
+ import requests as r
+ import os.path as op
+ from pathlib import Path
+ import subprocess
+ import argparse
+ import ssbio.utils
+ import warnings
+ import sys
+ import pathlib
+ import os, glob
+ import math
+ import ssbio
+ import ssl
+ import gzip
+ import ast
+ import itertools
+
+ from Bio.Align import substitution_matrices
+ from Bio.PDB.Polypeptide import *
+ from Bio.PDB import PDBList
+ from Bio import Align
+ from Bio import SeqIO
+ from Bio.PDB import *
+ import numpy as np
+
+
+ # FUNCTIONS
+ from calc_pc_property import *
+ from add_domains import *
+ from add_annotations import *
+ from add_structure import *
+ from add_alignment import *
+ from manage_files import *
+ from add_3Dalignment import *
+ from add_sasa import *
+ from standard import *
+ from add_interface_pos import *
+ from uniprotSequenceMatch import uniprotSequenceMatch
+ from process_input import clean_data
+ from alphafold_model import *
+
+
+ def alphafold(input_set, mode, impute):
+     start = timer()
+     # Necessary lists
+     annotation_list = ['disulfide', 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
+                        'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis',
+                        'strand', 'helix', 'turn', 'metalBinding', 'repeat', 'topologicalDomain', 'caBinding',
+                        'bindingSite', 'region', 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif',
+                        'coiledCoil', 'peptide', 'transitPeptide', 'glycosylation', 'propeptide']
+
+     change_names = {'Disulfide bond': 'disulfide', 'Initiator methionine': 'intMet',
+                     'Natural variant': 'naturalVariant', 'DNA binding': 'dnaBinding',
+                     'Active site': 'activeSite', 'Nucleotide binding': 'nucleotideBinding',
+                     'Lipidation': 'lipidation', 'Site': 'site', 'Transmembrane': 'transmembrane',
+                     'Cross-link': 'crosslink', 'Mutagenesis': 'mutagenesis', 'Beta strand': 'strand',
+                     'Helix': 'helix', 'Turn': 'turn', 'Metal binding': 'metalBinding', 'Repeat': 'repeat',
+                     'Topological domain': 'topologicalDomain', 'Calcium binding': 'caBinding',
+                     'Binding site': 'bindingSite', 'Region': 'region', 'Signal peptide': 'signalPeptide',
+                     'Modified residue': 'modifiedResidue', 'Zinc finger': 'zincFinger', 'Motif': 'motif',
+                     'Coiled coil': 'coiledCoil', 'Peptide': 'peptide', 'Transit peptide': 'transitPeptide',
+                     'Glycosylation': 'glycosylation', 'Propeptide': 'propeptide', 'Intramembrane': 'intramembrane'}
+
+     ## Standardizing input
+     data = clean_data(input_set)
+
+     path_to_input_files, path_to_output_files, path_to_domains, fisher_path, path_to_interfaces, alphafold_path, alphafold_summary = manage_files(mode)
+     out_path = path_to_output_files / 'log.txt'
+     sys.stdout = open(out_path, 'w')
+     print('Creating directories...')
+     file_base = str(Path(alphafold_path / '*'))
+     file_str = glob.glob(file_base)[0].split('-')[-1].split('.')[0]
+
+     ## Physicochemical properties
+     print('Adding physicochemical properties...\n')
+     data = add_physicochemical(data)
+
+     ## Domains
+     print('Adding domains\n')
+     data = add_domains(data, path_to_domains)
+
+     ## Processing data frame
+     data = data.astype(str)
+     data = data.replace({'NaN': np.NaN, 'nan': np.NaN})
+     data.domain = data.domain.replace({np.NaN: '-1'})  # Fill -1 if NaN - standardization.
+     data.domStart = data.domStart.replace({np.NaN: '-1'})
+     data.domEnd = data.domEnd.replace({np.NaN: '-1'})
+     data.distance = data.distance.replace({np.NaN: '-1'})
+     fisherResult = pd.read_csv(fisher_path, sep='\t')
+     significant_domains = fisherResult.domain.to_list()
+
+     data = data.reset_index()
+     data = data.drop(columns=['index'])
+
+     ## not_match_in_uniprot: Data points not matched to a UniProt sequence.
+     ## uniprot_matched: Data points matched to a UniProt sequence. Proceed with this data frame.
+     ## canonical_fasta: Dataframe holding the canonical sequence of the protein of interest. Obtained from UniProt.
+     ## isoform_fasta: Dataframe holding the isoform sequences of the protein of interest. Obtained from UniProt.
+     not_match_in_uniprot, uniprot_matched, canonical_fasta, isoform_fasta = uniprotSequenceMatch(data)
+
+     not_match_in_uniprot = not_match_in_uniprot.reset_index().drop(['index'], axis=1)
+
+     for key in change_names.keys():
+         not_match_in_uniprot[key] = ''
+     not_match_in_uniprot = not_match_in_uniprot.rename(columns=change_names)
+     uniprot_matched = add_annotations(uniprot_matched)
+
+     for w in uniprot_matched.index:
+         for q in annotation_list:
+             per_protein = []
+             if uniprot_matched.at[w, q] != 'nan':
+                 fix = ast.literal_eval(uniprot_matched.at[w, q])
+                 for z in fix:
+                     if '-' in z:
+                         per_protein += np.arange(int(z.split('-')[0]), int(z.split('-')[1]) + 1, 1).tolist()
+                     else:
+                         try:
+                             per_protein.append(int(z))
+                         except ValueError:
+                             pass
+                 uniprot_matched.at[w, q] = per_protein
+             else:
+                 uniprot_matched.at[w, q] = 'nan'
+     uniprot_matched = uniprot_matched.rename(columns=change_names)
+     uniprot_matched['wt_sequence_match'] = uniprot_matched['wt_sequence_match'].astype(str)
+
+     ## Avoid downloading files for the SASA calculation if they were already downloaded.
+     existing_free_sasa = list(Path(path_to_output_files / 'freesasa_files').glob("*"))
+     existing_free_sasa = [str(i) for i in existing_free_sasa]
+     existing_free_sasa = [i.split('/')[-1].split('.')[0] for i in existing_free_sasa]
+
+     ## Decide whether the wild-type amino acid is on the canonical or an isoform sequence. The selected
+     ## sequence will be used for the sequence alignment.
+     for i in uniprot_matched.index:
+         if len(uniprot_matched.at[i, 'uniprotSequence']) >= int(uniprot_matched.at[i, 'pos']):
+             wt = uniprot_matched.at[i, 'wt']
+             can = str(uniprot_matched.at[i, 'uniprotSequence'])[int(uniprot_matched.at[i, 'pos']) - 1]
+             if wt == can:
+                 uniprot_matched.at[i, 'wt_sequence_match'] = 'm'
+             elif wt != can:
+                 isoList = isoform_fasta[
+                     isoform_fasta['uniprotID'] == uniprot_matched.at[i, 'uniprotID']].isoformSequence.to_list()
+                 for k in isoList:
+                     if len(k) >= int(uniprot_matched.at[i, 'pos']):
+                         resInIso = k[int(int(uniprot_matched.at[i, 'pos']) - 1)]
+                         if wt == resInIso:
+                             whichIsoform = isoform_fasta[isoform_fasta.isoformSequence == k].whichIsoform.to_list()[0]
+                             uniprot_matched.at[i, 'wt_sequence_match'] = 'i'
+                             uniprot_matched.at[i, 'whichIsoform'] = whichIsoform
+                             break
+         elif len(uniprot_matched.at[i, 'uniprotSequence']) < int(uniprot_matched.at[i, 'pos']):
+             isoList = isoform_fasta[
+                 isoform_fasta['uniprotID'] == uniprot_matched.at[i, 'uniprotID']].isoformSequence.to_list()
+             for k in isoList:
+                 if len(k) >= int(uniprot_matched.at[i, 'pos']):
+                     resInIso = k[int(int(uniprot_matched.at[i, 'pos']) - 1)]
+                     wt = uniprot_matched.at[i, 'wt']
+                     if wt == resInIso:
+                         whichIsoform = isoform_fasta[isoform_fasta.isoformSequence == k].whichIsoform.to_list()[0]
+                         uniprot_matched.at[i, 'wt_sequence_match'] = 'i'
+                         uniprot_matched.at[i, 'whichIsoform'] = whichIsoform
+                         break
+
+     uniprot_matched = uniprot_matched.replace({'nan': np.NaN})
+     for annot in ['Domain', 'Alternative sequence', 'Chain', 'Sequence conflict', 'Compositional bias']:
+         try:
+             uniprot_matched = uniprot_matched.drop(columns=annot)
+         except KeyError:
+             pass
+
+     print('You have %d data points that failed to match a UniProt Sequence\nProceeding with %d remaining...\n'
+           % (len(not_match_in_uniprot.drop_duplicates(['datapoint'])),
+              len(uniprot_matched.drop_duplicates(['datapoint']))))
+
+     ## Adding interface residue information.
+     data_interface = pd.read_csv(path_to_interfaces, sep='\t')
+     interface_positions = get_interface_positions(data_interface, 'P1', 'P2')
+
+     interface_dataframe = pd.DataFrame()
+     for key, val in interface_positions.items():
+         k = pd.Series((key, str(list(set(val)))))
+         interface_dataframe = interface_dataframe.append(k, ignore_index=True)
+     interface_dataframe.columns = ['uniprotID', 'interface_positions']
+
+     uniprot_matched = uniprot_matched.merge(interface_dataframe, on='uniprotID', how='left')
+     uniprot_matched.interface_positions = uniprot_matched.interface_positions.astype('str')
+
+     ## The PDB info file is pre-generated for time concerns. It holds summary data of the AlphaFold
+     ## structures and can be updated separately when new versions are released.
+     pdb_info = pd.read_csv(alphafold_summary, sep='\t')
+
+     ## Keeping how many models each AlphaFold structure has.
+     model_count = modelCount(alphafold_path)
+     for k, v in model_count.items():
+         model_count[k] = int(v / 2)  # two file types per model
+     uniprot_matched = uniprot_matched.astype(str)
+     uniprot_matched.domStart = uniprot_matched.domStart.astype(float)
+     uniprot_matched.domEnd = uniprot_matched.domEnd.astype(float)
+     uniprot_matched.domStart = uniprot_matched.domStart.astype(int)
+     uniprot_matched.domEnd = uniprot_matched.domEnd.astype(int)
+
+     ## Main part: add annotation information, align sequences, find distances.
+     for i in uniprot_matched.index:
+         print('Processing', i, 'of', len(uniprot_matched))
+         if len(uniprot_matched.at[i, 'uniprotSequence']) >= int(uniprot_matched.at[i, 'pos']):
+             wt = uniprot_matched.at[i, 'wt']
+             can = str(uniprot_matched.at[i, 'uniprotSequence'])[int(uniprot_matched.at[i, 'pos']) - 1]
+             ## Information about whether the mutation is found on the canonical or an isoform sequence.
+             if wt == can:
+                 uniprot_matched.at[i, 'wt_sequence_match'] = 'm'
+             elif wt != can:
+                 isoList = isoform_fasta[
+                     isoform_fasta['uniprotID'] == uniprot_matched.at[i, 'uniprotID']].isoformSequence.to_list()
+                 for k in isoList:
+                     if len(k) >= int(uniprot_matched.at[i, 'pos']):
+                         resInIso = k[int(int(uniprot_matched.at[i, 'pos']) - 1)]
+                         if wt == resInIso:
+                             whichIsoform = isoform_fasta[isoform_fasta.isoformSequence == k].whichIsoform.to_list()[0]
+                             uniprot_matched.at[i, 'wt_sequence_match'] = 'i'
+                             uniprot_matched.at[i, 'whichIsoform'] = whichIsoform
+                             break
+         elif len(uniprot_matched.at[i, 'uniprotSequence']) < int(uniprot_matched.at[i, 'pos']):
+             isoList = isoform_fasta[
+                 isoform_fasta['uniprotID'] == uniprot_matched.at[i, 'uniprotID']].isoformSequence.to_list()
+             for k in isoList:
+                 if len(k) >= int(uniprot_matched.at[i, 'pos']):
+                     resInIso = k[int(int(uniprot_matched.at[i, 'pos']) - 1)]
+                     wt = uniprot_matched.at[i, 'wt']
+                     if wt == resInIso:
+                         whichIsoform = isoform_fasta[isoform_fasta.isoformSequence == k].whichIsoform.to_list()[0]
+                         uniprot_matched.at[i, 'wt_sequence_match'] = 'i'
+                         uniprot_matched.at[i, 'whichIsoform'] = whichIsoform
+                         break
+         uniprotID = uniprot_matched.at[i, 'uniprotID']
+         datapoint = uniprot_matched.at[i, 'datapoint']
+
+         for k in annotation_list:
+             txt = k + 'Binary'
+             if (str(uniprot_matched.at[i, txt]) == '0') or (str(uniprot_matched.at[i, txt]) == '0.0'):
+                 uniprot_matched.at[i, txt] = '1'
+             elif (str(uniprot_matched.at[i, txt]).lower() == 'nan') | (str(uniprot_matched.at[i, txt]) == np.NaN):
+                 uniprot_matched.at[i, txt] = '0'
+             elif (str(uniprot_matched.at[i, txt]) == '1') or (str(uniprot_matched.at[i, txt]) == '1.0'):
+                 uniprot_matched.at[i, txt] = '2'
+         ## Search in all models.
+         models_for_protein = [val for key, val in model_count.items() if
+                               uniprotID in key.split(';')]  # We have this many models for the protein.
+         which_model_mutation = which_model(
+             int(uniprot_matched.at[i, 'pos']))  # List of models in which the mutation can be found.
+         models_for_all_annotations = {}
+         for annot in annotation_list:
+             if len(uniprot_matched.at[i, annot]) != 0 and type(uniprot_matched.at[i, annot]) != list:
+                 uniprot_matched.at[i, annot] = list(
+                     map(str.strip, uniprot_matched.at[i, annot].strip('][').replace('"', '').split(',')))
+             models_for_annotations = {}  # Recording which position is found in which model file.
+             for annot_position in uniprot_matched.at[i, annot]:
+                 if annot_position != 'nan' and annot_position != '':
+                     models_for_that_position = which_model(int(annot_position))
+                 else:
+                     models_for_that_position = {}
+                 for key, val in models_for_that_position.items():
+                     if key not in models_for_annotations.keys():
+                         models_for_annotations[key] = [val]
+                     else:
+                         models_for_annotations[key] += [val]
+             models_for_all_annotations[annot] = models_for_annotations
+         new_dict = {}
+         for key, val in models_for_all_annotations.items():
+             subdict = {k: v for k, v in val.items() if k in which_model_mutation}
+             subdict = dict(sorted(subdict.items()))
+             new_dict[key] = subdict
+         new_dict = reduce_model_dict(new_dict)
+         models_we_need = list(set(itertools.chain.from_iterable(
+             [list(ov.keys()) for ok, ov in new_dict.items()])))  # Read models with these numbers.
+         info_per_model = {}  # rewritten from scratch for each data point
+         dist_of_annots = {}
+         all_domain_distances = []
+
+         for mod in models_we_need:
+             print('---------PRINTING FOR MODEL--------', mod)
+             dist_of_annots[str(mod)] = {}
+             info_per_model[mod] = {}
+             info_per_model[mod]['datapoint'] = datapoint
+             identifier = uniprot_matched.at[i, 'uniprotSequence']
+             try:
+                 pdbSequence = pdb_info.loc[(pdb_info.uniprotID == uniprotID) &
+                                            (pdb_info.model_num == mod)].sequence.item()
+             except ValueError:
+                 pdbSequence = 'nan'
+             if pdbSequence != 'nan':  # The model number might not be present for that protein. Preventing errors.
+                 pdbSequence = pdb_info.loc[(pdb_info.uniprotID == uniprotID) & (pdb_info.model_num == mod)].sequence.item()
+                 alignment_list = do_alignment(uniprot_matched.at[i, 'datapoint'], uniprot_matched.at[i, 'uniprotSequence'],
+                                               pdbSequence, Path(path_to_output_files / 'alignment_files'))
+                 pdb_alignStatus = mutation_position_on_pdb(alignment_list, uniprot_matched.at[i, 'pos'])[0]
+                 info_per_model[mod]['pdb_alignStatus'] = pdb_alignStatus
+                 mutationPositionOnPDB = mutation_position_on_pdb(alignment_list, uniprot_matched.at[i, 'pos'])[1]
+                 info_per_model[mod]['mutationPositionOnPDB'] = mutationPositionOnPDB
+                 startGap = mutation_position_on_pdb(alignment_list, uniprot_matched.at[i, 'pos'])[2]
+                 info_per_model[mod]['startGap'] = startGap
+                 alignment_to_use = mutation_position_on_pdb(alignment_list, uniprot_matched.at[i, 'pos'])[3]
+                 for annot in annotation_list:
+                     if new_dict[annot] == {}:
+                         annotation_pos_on_pdb_ = []
+                     else:
+                         try:
+                             annotation_pos_on_pdb_ = annotation_pos_on_pdb(new_dict[annot][mod], startGap,
+                                                                            alignment_to_use, identifier)
+                         except KeyError:  # this annotation has no positions in this model
+                             annotation_pos_on_pdb_ = []
+                     info_per_model[mod][annot] = annotation_pos_on_pdb_
+
+                 pdb_path = Path(f'{alphafold_path}/AF-{uniprotID}-F{mod}-{file_str}.pdb.gz')
+
+                 if get_alignments_3D(uniprotID, mod, pdb_path, pdbSequence, 'nan', 'nan', 'nan', mode,
+                                      Path(path_to_output_files / '3D_alignment'), 'gzip') != None:
+                     alignments, coords, resnums_for_sasa = get_alignments_3D(uniprotID, mod, pdb_path, pdbSequence,
+                                                                              'nan', 'nan', 'nan', mode,
+                                                                              Path(path_to_output_files / '3D_alignment'),
+                                                                              'gzip')
+                     alignments = alignments[0]
+
+                     calculate_freesasa(uniprotID, mod, existing_free_sasa, alphafold_path, path_to_output_files)
+                     if (mutationPositionOnPDB != 'nan'):
+                         if (int(mutationPositionOnPDB) <= 1400):
+                             try:
+                                 coordMut = get_coords(mutationPositionOnPDB, alignments, coords, resnums_for_sasa, mode)[0]
+                             except ValueError:
+                                 coordMut = 'nan'
+                         else:
+                             coordMut = np.NaN
+
+                         sasa_pos = get_coords(mutationPositionOnPDB, alignments, coords, resnums_for_sasa, mode)[2]
+                         sasa_val = sasa('alphafold', 'nan', uniprotID, sasa_pos, uniprot_matched.at[i, 'wt'], mode,
+                                         path_to_output_files, file_type='gzip')
+
+                         if sasa_val != None:
+                             uniprot_matched.at[i, 'sasa'] = sasa_val
+                     else:
+                         coordMut = 'nan'
+                         sasa_val = 'nan'
+                         uniprot_matched.at[i, 'sasa'] = sasa_val
+
+                     domainPositionOnPDB_list = list(
+                         range(int(uniprot_matched.at[i, 'domStart']), int(uniprot_matched.at[i, 'domEnd'])))
+                     domain_distances = []
+                     if len(domainPositionOnPDB_list) != 0:
+                         for domain_ in domainPositionOnPDB_list:
+                             coordDomain = get_coords(domain_, alignments, coords, resnums_for_sasa, mode)[0]
+                             distance_dom = float(find_distance(coordMut, coordDomain))  # one domain position in one model
+                             domain_distances.append(distance_dom)
+                         minimum_domain = min(domain_distances)  # minimum for one model
+                     else:
+                         minimum_domain = np.NaN
+                     all_domain_distances.append(minimum_domain)
+                     list_dist_of_annots = []
+                     for key, val in info_per_model.items():
+                         modNum = key
+                         min_annots = {}  # Written from scratch for each annotation.
+                         if modNum == mod:
+                             for label, annotPos in val.items():  # For each annotation type, calculate all distances of the annotated positions.
+                                 if label in annotation_list:
+                                     all_annot_distance_per_model = []  # All distances of an annotation in that model.
+                                     for annot_position in annotPos:
+                                         if (annot_position != 'nan'):
+                                             if (int(annot_position) <= 1400):
+                                                 coordAnnot = get_coords(annot_position, alignments, coords,
+                                                                         resnums_for_sasa, mode)[0]
+                                                 distance = float(find_distance(coordMut, coordAnnot))  # one annotated position in one model
+                                                 all_annot_distance_per_model.append(distance)
+                                     if all_annot_distance_per_model != []:
+                                         all_annot_distance_per_model = [float(i) for i in all_annot_distance_per_model]
+                                         try:
+                                             minimum_position = float(min(all_annot_distance_per_model))
+                                         except ValueError:
+                                             minimum_position = 'nan'
+                                         min_annots[label] = float(minimum_position)  # Minimum of the annotation in this model.
+                             if min_annots != {}:
+                                 list_dist_of_annots.append(min_annots)
+                     dist_of_annots[str(mod)] = list_dist_of_annots  # Getting the minimum over all possible models.
+                     # uniprot_matched.at[i, annotation_type] = minimum_position
+             else:
+                 print('Model File Not Found')
+                 uniprot_matched.at[i, 'sasa'] = np.NaN
+
+         if len(all_domain_distances) != 0:
+             uniprot_matched.at[i, 'domaindistance3D'] = min(all_domain_distances)
+         else:
+             uniprot_matched.at[i, 'domaindistance3D'] = np.NaN
+         dist_of_annots_min_of_all = {}
+         flat = [item for sublist in list(dist_of_annots.values()) for item in sublist]
+         for f in flat:
+             for key, val in f.items():
+                 if key not in dist_of_annots_min_of_all.keys():
+                     dist_of_annots_min_of_all[key] = val
+                 elif (key in dist_of_annots_min_of_all.keys()) & (float(dist_of_annots_min_of_all[key]) > float(val)):
+                     dist_of_annots_min_of_all[key] = val
+         key_list = []
+         for key, val in dist_of_annots_min_of_all.items():
+             uniprot_matched.at[i, key] = val
+             key_list.append(key)
+         remaining = list(set(annotation_list) - set(key_list))
+
+         for rem in remaining:
+             uniprot_matched.at[i, rem] = ''
+         uniprot_matched.at[i, 'distances'] = [dist_of_annots]
+
+         if (uniprot_matched.at[i, 'sasa'] != None) & (uniprot_matched.at[i, 'sasa'] != np.NaN) & (
+                 str(uniprot_matched.at[i, 'sasa']) != 'nan'):
+             if '*' in uniprot_matched.at[i, 'sasa']:
+                 uniprot_matched.at[i, 'sasa'] = uniprot_matched.at[i, 'sasa'].split('*')[0]
+             try:
+                 uniprot_matched.at[i, 'sasa'] = float(uniprot_matched.at[i, 'sasa'].strip())
+             except TypeError:
+                 pass
+
+         if float(uniprot_matched.at[i, 'sasa']) < 5:
+             uniprot_matched.at[i, 'trsh4'] = 'core'
+         elif float(uniprot_matched.at[i, 'sasa']) >= 5:
+             uniprot_matched.at[i, 'trsh4'] = 'surface'
+         elif str(uniprot_matched.at[i, 'sasa']) == 'nan':
+             uniprot_matched.at[i, 'trsh4'] = 'nan'
+         else:
+             uniprot_matched.at[i, 'trsh4'] = 'nan'
+         if (str(uniprot_matched.at[i, 'pos']) in uniprot_matched.at[i, 'interface_positions']) and \
+                 uniprot_matched.at[i, 'trsh4'] == 'surface':
+             uniprot_matched.at[i, 'threeState_trsh4_HQ'] = 'interface'
+         elif (str(uniprot_matched.at[i, 'pos']) not in uniprot_matched.at[i, 'interface_positions']) and \
+                 uniprot_matched.at[i, 'trsh4'] == 'surface':
+             uniprot_matched.at[i, 'threeState_trsh4_HQ'] = 'surface'
+         elif (str(uniprot_matched.at[i, 'pos']) not in uniprot_matched.at[i, 'interface_positions']) and \
+                 uniprot_matched.at[i, 'trsh4'] == 'core':
+             uniprot_matched.at[i, 'threeState_trsh4_HQ'] = 'core'
+         elif (str(uniprot_matched.at[i, 'pos']) in uniprot_matched.at[i, 'interface_positions']) and \
+                 uniprot_matched.at[i, 'trsh4'] == 'core':
+             uniprot_matched.at[i, 'threeState_trsh4_HQ'] = 'conflict'
+         elif uniprot_matched.at[i, 'trsh4'] == 'nan':
+             uniprot_matched.at[i, 'threeState_trsh4_HQ'] = 'nan'
+         if uniprot_matched.at[i, 'domain'] in significant_domains:
+             uniprot_matched.at[i, 'domain_fisher'] = uniprot_matched.at[i, 'domain']
+         else:
+             uniprot_matched.at[i, 'domain_fisher'] = 'NULL'
+
+     uniprot_matched = uniprot_matched.round(2)
+     uniprot_matched = uniprot_matched.astype(str)
+
+     uniprot_matched['domain'] = uniprot_matched['domain'].replace({'-1': 'NULL'})
+     uniprot_matched = uniprot_matched.drop_duplicates()
+     uniprot_matched.rename(
+         columns={'uniprotID': 'prot_uniprotAcc', 'wt': 'wt_residue', 'pos': 'position', 'mut': 'mut_residue',
+                  'datapoint': 'meta_merged', 'datapoint_disease': 'meta-lab_merged', 'label': 'source_db',
+                  'family': 'prot_family', 'domain': 'domains_all', 'domain_fisher': 'domains_sig',
+                  'domaindistance3D': 'domains_3Ddist', 'threeState_trsh4_HQ': 'location_3state',
+                  'disulfideBinary': 'disulfide_bin', 'intMetBinary': 'intMet_bin',
+                  'intramembraneBinary': 'intramembrane_bin',
+                  'naturalVariantBinary': 'naturalVariant_bin', 'dnaBindingBinary': 'dnaBinding_bin',
+                  'activeSiteBinary': 'activeSite_bin',
+                  'nucleotideBindingBinary': 'nucleotideBinding_bin', 'lipidationBinary': 'lipidation_bin',
+                  'siteBinary': 'site_bin',
+                  'transmembraneBinary': 'transmembrane_bin', 'crosslinkBinary': 'crosslink_bin',
+                  'mutagenesisBinary': 'mutagenesis_bin',
+                  'strandBinary': 'strand_bin', 'helixBinary': 'helix_bin', 'turnBinary': 'turn_bin',
+                  'metalBindingBinary': 'metalBinding_bin',
+                  'repeatBinary': 'repeat_bin', 'topologicalDomainBinary': 'topologicalDomain_bin',
+                  'caBindingBinary': 'caBinding_bin',
+                  'bindingSiteBinary': 'bindingSite_bin', 'regionBinary': 'region_bin',
+                  'signalPeptideBinary': 'signalPeptide_bin',
+                  'modifiedResidueBinary': 'modifiedResidue_bin', 'zincFingerBinary': 'zincFinger_bin',
+                  'motifBinary': 'motif_bin',
+                  'coiledCoilBinary': 'coiledCoil_bin', 'peptideBinary': 'peptide_bin',
+                  'transitPeptideBinary': 'transitPeptide_bin',
+                  'glycosylationBinary': 'glycosylation_bin', 'propeptideBinary': 'propeptide_bin',
+                  'disulfide': 'disulfide_dist', 'intMet': 'intMet_dist',
+                  'intramembrane': 'intramembrane_dist', 'naturalVariant': 'naturalVariant_dist',
+                  'dnaBinding': 'dnaBinding_dist', 'activeSite': 'activeSite_dist',
+                  'nucleotideBinding': 'nucleotideBinding_dist', 'lipidation': 'lipidation_dist', 'site': 'site_dist',
+                  'transmembrane': 'transmembrane_dist', 'crosslink': 'crosslink_dist',
+                  'mutagenesis': 'mutagenesis_dist', 'strand': 'strand_dist', 'helix': 'helix_dist', 'turn': 'turn_dist',
+                  'metalBinding': 'metalBinding_dist', 'repeat': 'repeat_dist',
+                  'topologicalDomain': 'topologicalDomain_dist', 'caBinding': 'caBinding_dist',
+                  'bindingSite': 'bindingSite_dist', 'region': 'region_dist',
+                  'signalPeptide': 'signalPeptide_dist', 'modifiedResidue': 'modifiedResidue_dist',
+                  'zincFinger': 'zincFinger_dist', 'motif': 'motif_dist', 'coiledCoil': 'coiledCoil_dist',
+                  'peptide': 'peptide_dist', 'transitPeptide': 'transitPeptide_dist',
+                  'glycosylation': 'glycosylation_dist', 'propeptide': 'propeptide_dist'}, inplace=True)
+
+     uniprot_matched = uniprot_matched[
+         ['prot_uniprotAcc', 'wt_residue', 'mut_residue', 'position', 'meta_merged', 'composition', 'polarity', 'volume',
+          'granthamScore', 'domains_all',
+          'domains_sig', 'domains_3Ddist', 'sasa', 'location_3state', 'disulfide_bin', 'intMet_bin',
+          'intramembrane_bin', 'naturalVariant_bin', 'dnaBinding_bin',
+          'activeSite_bin', 'nucleotideBinding_bin', 'lipidation_bin', 'site_bin',
+          'transmembrane_bin', 'crosslink_bin', 'mutagenesis_bin', 'strand_bin',
+          'helix_bin', 'turn_bin', 'metalBinding_bin', 'repeat_bin',
+          'caBinding_bin', 'topologicalDomain_bin', 'bindingSite_bin',
+          'region_bin', 'signalPeptide_bin', 'modifiedResidue_bin',
+          'zincFinger_bin', 'motif_bin', 'coiledCoil_bin', 'peptide_bin',
+          'transitPeptide_bin', 'glycosylation_bin', 'propeptide_bin', 'disulfide_dist', 'intMet_dist',
+          'intramembrane_dist',
+          'naturalVariant_dist', 'dnaBinding_dist', 'activeSite_dist',
+          'nucleotideBinding_dist', 'lipidation_dist', 'site_dist',
+          'transmembrane_dist', 'crosslink_dist', 'mutagenesis_dist',
+          'strand_dist', 'helix_dist', 'turn_dist', 'metalBinding_dist',
+          'repeat_dist', 'caBinding_dist', 'topologicalDomain_dist',
+          'bindingSite_dist', 'region_dist', 'signalPeptide_dist',
+          'modifiedResidue_dist', 'zincFinger_dist', 'motif_dist',
+          'coiledCoil_dist', 'peptide_dist', 'transitPeptide_dist',
+          'glycosylation_dist', 'propeptide_dist']]
+     uniprot_matched = uniprot_matched.reset_index()
+     uniprot_matched = uniprot_matched.drop(columns=['index'])
+     # Imputation
+     if (impute == 'True') or (impute == 'true'):
+         filler = [20.71, 46.67, 28.13, 15.5, 35.94, 21.84, 25.15, 45.15, 29.81, 29.91, 34.67, 24.72, 10.66, 11.55,
+                   13.02, 21.54, 27.42, 38.39, 30.44, 20.9, 25.82, 46.12, 32.1, 35.96, 35.86, 37.88, 19.09, 35.2,
+                   26.95, 37.48]
+         col_index = 0
+         for col_ in uniprot_matched.columns[-30:]:
+             uniprot_matched[col_] = uniprot_matched[col_].fillna(filler[col_index])
+             uniprot_matched[col_] = uniprot_matched[col_].replace({'nan': filler[col_index]})
+             uniprot_matched[col_] = uniprot_matched[col_].replace({'': filler[col_index]})
+             """
+             if uniprot_matched[col_].values == '':
+                 uniprot_matched[col_] = filler[col_index]
+             """
+             col_index += 1
+
+         uniprot_matched['domains_3Ddist'] = uniprot_matched['domains_3Ddist'].fillna(29.78)
+         uniprot_matched['sasa'] = uniprot_matched['sasa'].fillna(35.6)
+         uniprot_matched['location_3state'] = uniprot_matched['location_3state'].fillna('unknown')
+     elif (impute == 'False') or (impute == 'false'):
+         pass
+     uniprot_matched = uniprot_matched.replace({'nan': np.NaN})
+     uniprot_matched = uniprot_matched.replace({"['']": np.NaN})
+     uniprot_matched.to_csv(path_to_output_files / 'featurevector_alphafold.txt', index=False, sep='\t')
+     if len(uniprot_matched) == 0:
+         print('No feature vector could be produced for the input data. Please check the presence of a structure for the input proteins.')
+
+     print('Feature vector successfully created...')
+     end = timer()
+     hours, rem = divmod(end - start, 3600)
+     minutes, seconds = divmod(rem, 60)
+     print("Time passed: {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
+     sys.stdout.close()
+     return uniprot_matched
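As wired up in main.py below, this pipeline can also be driven directly, e.g. with the identifier used in the CLI help text (imputation flag passed as a string, matching the argparse default):

    fv = alphafold('Q9Y4W6-N-432-T', 2, 'True')   # returns the feature-vector DataFrame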
code/alphafold_model.py ADDED
@@ -0,0 +1,33 @@
+ from collections import Counter
+ import glob
+
+ def reduce_model_dict(model_dict):
+     # Drop annotation positions already assigned to an earlier model, so each position is processed once.
+     for key, val in model_dict.items():
+         used = []
+         for key2, val2 in val.items():
+             new = []
+             for i in val2:
+                 if i not in used:
+                     new.append(i)
+                     used.append(i)
+             val[key2] = new
+     return model_dict
+
+
+ def which_model(position):
+     # AlphaFold splits long proteins into overlapping 1400-residue fragments offset by 200 residues:
+     # model x covers positions [1 + 200*(x-1), 1400 + 200*(x-1)].
+     models_dict = {}
+     x = 1
+     for i, j in zip(range(1400, 27000, 200), range(1, 27000, 200)):
+         if position <= i and position >= j:
+             models_dict[x] = position
+         x += 1
+     return models_dict
+
+
+ def modelCount(path_to_models):
+     count_list = []
+     for file in list(path_to_models.glob("*")):
+         protein_id = str(file).split('-')[1]
+         count_list.append(protein_id)
+     count_dict = Counter(count_list)
+     count_dict = {';'.join(sorted(k for k in count_dict.keys() if count_dict[k] == v)): v for v in
+                   set(count_dict.values())}
+     return count_dict
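For example, residue 100 falls only in the first fragment window, while residue 1500 falls in windows 2 through 8:

    print(which_model(100))    # {1: 100}
    print(which_model(1500))   # {2: 1500, 3: 1500, 4: 1500, 5: 1500, 6: 1500, 7: 1500, 8: 1500}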
code/calc_pc_property.py ADDED
@@ -0,0 +1,441 @@
+ def compositionValues(aa1, aa2):
+     compositionValues = {'S': 1.42, 'R': 0.65, 'L': 0, 'P': 0.39, 'T': 0.71, 'A': 0, 'V': 0, 'G': 0.74,
+                          'I': 0, 'F': 0, 'Y': 0.20, 'C': 2.75, 'H': 0.58, 'Q': 0.89, 'N': 1.33, 'K': 0.33,
+                          'D': 1.38, 'E': 0.92, 'M': 0, 'W': 0.13}
+     dif = round((compositionValues[aa1] - compositionValues[aa2]), 2)
+     return dif
+
+
+ def polarityValues(aa1, aa2):
+     polarityValues = {'S': 9.2, 'R': 10.5, 'L': 4.9, 'P': 8.0, 'T': 8.6, 'A': 8.1, 'V': 5.9, 'G': 9.0,
+                       'I': 5.2, 'F': 5.2, 'Y': 6.2, 'C': 5.5, 'H': 10.4, 'Q': 10.5, 'N': 11.6, 'K': 11.3,
+                       'D': 13.0, 'E': 12.3, 'M': 5.7, 'W': 5.4}
+     dif = round((polarityValues[aa1] - polarityValues[aa2]), 2)
+     return dif
+
+
+ def volumeValues(aa1, aa2):
+     volumeValues = {'S': 32, 'R': 124, 'L': 111, 'P': 32.5, 'T': 61, 'A': 31, 'V': 84, 'G': 3,
+                     'I': 111, 'F': 132, 'Y': 136, 'C': 55, 'H': 96, 'Q': 85, 'N': 56, 'K': 119,
+                     'D': 54, 'E': 83, 'M': 105, 'W': 170}
+     dif = round((volumeValues[aa1] - volumeValues[aa2]), 2)
+     return dif
+
+
+ def add_physicochemical(df):
+     # Grantham distances between amino acid pairs (values kept as strings, as consumed downstream).
+     grantham_dict = {
+         ('A', 'A'): '0',   ('A', 'C'): '195', ('A', 'D'): '126', ('A', 'E'): '107', ('A', 'F'): '113',
+         ('A', 'G'): '60',  ('A', 'H'): '86',  ('A', 'I'): '94',  ('A', 'K'): '106', ('A', 'L'): '96',
+         ('A', 'M'): '84',  ('A', 'N'): '111', ('A', 'P'): '27',  ('A', 'Q'): '91',  ('A', 'R'): '112',
+         ('A', 'S'): '99',  ('A', 'T'): '58',  ('A', 'V'): '64',  ('A', 'W'): '148', ('A', 'Y'): '112',
+         ('C', 'A'): '195', ('C', 'C'): '0',   ('C', 'D'): '154', ('C', 'E'): '170', ('C', 'F'): '205',
+         ('C', 'G'): '159', ('C', 'H'): '174', ('C', 'I'): '198', ('C', 'K'): '202', ('C', 'L'): '198',
+         ('C', 'M'): '196', ('C', 'N'): '139', ('C', 'P'): '169', ('C', 'Q'): '154', ('C', 'R'): '180',
+         ('C', 'S'): '112', ('C', 'T'): '149', ('C', 'V'): '192', ('C', 'W'): '215', ('C', 'Y'): '194',
+         ('D', 'A'): '126', ('D', 'C'): '154', ('D', 'D'): '0',   ('D', 'E'): '45',  ('D', 'F'): '177',
+         ('D', 'G'): '94',  ('D', 'H'): '81',  ('D', 'I'): '168', ('D', 'K'): '101', ('D', 'L'): '172',
+         ('D', 'M'): '160', ('D', 'N'): '23',  ('D', 'P'): '108', ('D', 'Q'): '61',  ('D', 'R'): '96',
+         ('D', 'S'): '65',  ('D', 'T'): '85',  ('D', 'V'): '152', ('D', 'W'): '181', ('D', 'Y'): '160',
+         ('E', 'A'): '107', ('E', 'C'): '170', ('E', 'D'): '45',  ('E', 'E'): '0',   ('E', 'F'): '140',
+         ('E', 'G'): '98',  ('E', 'H'): '40',  ('E', 'I'): '134', ('E', 'K'): '56',  ('E', 'L'): '138',
+         ('E', 'M'): '126', ('E', 'N'): '42',  ('E', 'P'): '93',  ('E', 'Q'): '29',  ('E', 'R'): '54',
+         ('E', 'S'): '80',  ('E', 'T'): '65',  ('E', 'V'): '121', ('E', 'W'): '152', ('E', 'Y'): '122',
+         ('F', 'A'): '113', ('F', 'C'): '205', ('F', 'D'): '177', ('F', 'E'): '140', ('F', 'F'): '0',
+         ('F', 'G'): '153', ('F', 'H'): '100', ('F', 'I'): '21',  ('F', 'K'): '102', ('F', 'L'): '22',
+         ('F', 'M'): '28',  ('F', 'N'): '158', ('F', 'P'): '114', ('F', 'Q'): '116', ('F', 'R'): '97',
+         ('F', 'S'): '155', ('F', 'T'): '103', ('F', 'V'): '50',  ('F', 'W'): '40',  ('F', 'Y'): '22',
+         ('G', 'A'): '60',  ('G', 'C'): '159', ('G', 'D'): '94',  ('G', 'E'): '98',  ('G', 'F'): '153',
+         ('G', 'G'): '0',   ('G', 'H'): '98',  ('G', 'I'): '135', ('G', 'K'): '127', ('G', 'L'): '138',
+         ('G', 'M'): '127', ('G', 'N'): '80',  ('G', 'P'): '42',  ('G', 'Q'): '87',  ('G', 'R'): '125',
+         ('G', 'S'): '56',  ('G', 'T'): '59',  ('G', 'V'): '109', ('G', 'W'): '184', ('G', 'Y'): '147',
+         ('H', 'A'): '86',  ('H', 'C'): '174', ('H', 'D'): '81',  ('H', 'E'): '40',  ('H', 'F'): '100',
+         ('H', 'G'): '98',  ('H', 'H'): '0',   ('H', 'I'): '94',  ('H', 'K'): '32',  ('H', 'L'): '99',
+         ('H', 'M'): '87',  ('H', 'N'): '68',  ('H', 'P'): '77',  ('H', 'Q'): '24',  ('H', 'R'): '29',
+         ('H', 'S'): '89',  ('H', 'T'): '47',  ('H', 'V'): '84',  ('H', 'W'): '115', ('H', 'Y'): '83',
+         ('I', 'A'): '94',  ('I', 'C'): '198', ('I', 'D'): '168', ('I', 'E'): '134', ('I', 'F'): '21',
+         ('I', 'G'): '135', ('I', 'H'): '94',  ('I', 'I'): '0',   ('I', 'K'): '102', ('I', 'L'): '5',
+         ('I', 'M'): '10',  ('I', 'N'): '149', ('I', 'P'): '95',  ('I', 'Q'): '109', ('I', 'R'): '97',
+         ('I', 'S'): '142', ('I', 'T'): '89',  ('I', 'V'): '29',  ('I', 'W'): '61',  ('I', 'Y'): '33',
+         ('K', 'A'): '106', ('K', 'C'): '202', ('K', 'D'): '101', ('K', 'E'): '56',  ('K', 'F'): '102',
+         ('K', 'G'): '127', ('K', 'H'): '32',  ('K', 'I'): '102', ('K', 'K'): '0',   ('K', 'L'): '107',
+         ('K', 'M'): '95',  ('K', 'N'): '94',  ('K', 'P'): '103', ('K', 'Q'): '53',  ('K', 'R'): '26',
+         ('K', 'S'): '121', ('K', 'T'): '78',  ('K', 'V'): '97',  ('K', 'W'): '110', ('K', 'Y'): '85',
+         ('L', 'A'): '96',  ('L', 'C'): '198', ('L', 'D'): '172', ('L', 'E'): '138', ('L', 'F'): '22',
+         ('L', 'G'): '138', ('L', 'H'): '99',  ('L', 'I'): '5',   ('L', 'K'): '107', ('L', 'L'): '0',
+         ('L', 'M'): '15',  ('L', 'N'): '153', ('L', 'P'): '98',  ('L', 'Q'): '113', ('L', 'R'): '102',
+         ('L', 'S'): '145', ('L', 'T'): '92',  ('L', 'V'): '32',  ('L', 'W'): '61',  ('L', 'Y'): '36',
+         ('M', 'A'): '84',  ('M', 'C'): '196', ('M', 'D'): '160', ('M', 'E'): '126', ('M', 'F'): '28',
+         ('M', 'G'): '127', ('M', 'H'): '87',  ('M', 'I'): '10',  ('M', 'K'): '95',  ('M', 'L'): '15',
+         ('M', 'M'): '0',   ('M', 'N'): '142', ('M', 'P'): '87',  ('M', 'Q'): '101', ('M', 'R'): '91',
+         ('M', 'S'): '135', ('M', 'T'): '81',  ('M', 'V'): '21',  ('M', 'W'): '67',  ('M', 'Y'): '36',
+         ('N', 'A'): '111', ('N', 'C'): '139', ('N', 'D'): '23',  ('N', 'E'): '42',  ('N', 'F'): '158',
+         ('N', 'G'): '80',  ('N', 'H'): '68',  ('N', 'I'): '149', ('N', 'K'): '94',  ('N', 'L'): '153',
+         ('N', 'M'): '142', ('N', 'N'): '0',   ('N', 'P'): '91',  ('N', 'Q'): '46',  ('N', 'R'): '86',
+         ('N', 'S'): '46',  ('N', 'T'): '65',  ('N', 'V'): '133', ('N', 'W'): '174', ('N', 'Y'): '143',
+         ('P', 'A'): '27',  ('P', 'C'): '169', ('P', 'D'): '108', ('P', 'E'): '93',  ('P', 'F'): '114',
+         ('P', 'G'): '42',  ('P', 'H'): '77',  ('P', 'I'): '95',  ('P', 'K'): '103', ('P', 'L'): '98',
+         ('P', 'M'): '87',  ('P', 'N'): '91',  ('P', 'P'): '0',   ('P', 'Q'): '76',  ('P', 'R'): '103',
+         ('P', 'S'): '74',  ('P', 'T'): '38',  ('P', 'V'): '68',  ('P', 'W'): '147', ('P', 'Y'): '110',
+         ('Q', 'A'): '91',  ('Q', 'C'): '154', ('Q', 'D'): '61',  ('Q', 'E'): '29',  ('Q', 'F'): '116',
+         ('Q', 'G'): '87',  ('Q', 'H'): '24',  ('Q', 'I'): '109', ('Q', 'K'): '53',  ('Q', 'L'): '113',
+         ('Q', 'M'): '101', ('Q', 'N'): '46',  ('Q', 'P'): '76',  ('Q', 'Q'): '0',   ('Q', 'R'): '43',
+         ('Q', 'S'): '68',  ('Q', 'T'): '42',  ('Q', 'V'): '96',  ('Q', 'W'): '130', ('Q', 'Y'): '99',
+         ('R', 'A'): '112', ('R', 'C'): '180', ('R', 'D'): '96',  ('R', 'E'): '54',  ('R', 'F'): '97',
+         ('R', 'G'): '125', ('R', 'H'): '29',  ('R', 'I'): '97',  ('R', 'K'): '26',  ('R', 'L'): '102',
+         ('R', 'M'): '91',  ('R', 'N'): '86',  ('R', 'P'): '103', ('R', 'Q'): '43',  ('R', 'R'): '0',
+         ('R', 'S'): '110', ('R', 'T'): '71',  ('R', 'V'): '96',  ('R', 'W'): '101', ('R', 'Y'): '77',
+         ('S', 'A'): '99',  ('S', 'C'): '112', ('S', 'D'): '65',  ('S', 'E'): '80',  ('S', 'F'): '155',
+         ('S', 'G'): '56',  ('S', 'H'): '89',  ('S', 'I'): '142', ('S', 'K'): '121', ('S', 'L'): '145',
+         ('S', 'M'): '135', ('S', 'N'): '46',  ('S', 'P'): '74',  ('S', 'Q'): '68',  ('S', 'R'): '110',
+         ('S', 'S'): '0',   ('S', 'T'): '58',  ('S', 'V'): '124', ('S', 'W'): '177', ('S', 'Y'): '144',
+         ('T', 'A'): '58',  ('T', 'C'): '149', ('T', 'D'): '85',  ('T', 'E'): '65',  ('T', 'F'): '103',
+         ('T', 'G'): '59',  ('T', 'H'): '47',  ('T', 'I'): '89',  ('T', 'K'): '78',  ('T', 'L'): '92',
+         ('T', 'M'): '81',  ('T', 'N'): '65',  ('T', 'P'): '38',  ('T', 'Q'): '42',  ('T', 'R'): '71',
+         ('T', 'S'): '58',  ('T', 'T'): '0',   ('T', 'V'): '69',  ('T', 'W'): '128', ('T', 'Y'): '92',
+         ('V', 'A'): '64',  ('V', 'C'): '192', ('V', 'D'): '152', ('V', 'E'): '121', ('V', 'F'): '50',
+         ('V', 'G'): '109', ('V', 'H'): '84',  ('V', 'I'): '29',  ('V', 'K'): '97',  ('V', 'L'): '32',
+         ('V', 'M'): '21',  ('V', 'N'): '133', ('V', 'P'): '68',  ('V', 'Q'): '96',  ('V', 'R'): '96',
+         ('V', 'S'): '124', ('V', 'T'): '69',  ('V', 'V'): '0',   ('V', 'W'): '88',  ('V', 'Y'): '55',
+         ('W', 'A'): '148', ('W', 'C'): '215', ('W', 'D'): '181', ('W', 'E'): '152', ('W', 'F'): '40',
+         ('W', 'G'): '184', ('W', 'H'): '115', ('W', 'I'): '61',  ('W', 'K'): '110', ('W', 'L'): '61',
+         ('W', 'M'): '67',  ('W', 'N'): '174', ('W', 'P'): '147', ('W', 'Q'): '130', ('W', 'R'): '101',
+         ('W', 'S'): '177', ('W', 'T'): '128', ('W', 'V'): '88',  ('W', 'W'): '0',   ('W', 'Y'): '37',
+         ('Y', 'A'): '112', ('Y', 'C'): '194', ('Y', 'D'): '160', ('Y', 'E'): '122', ('Y', 'F'): '22',
+         ('Y', 'G'): '147', ('Y', 'H'): '83',  ('Y', 'I'): '33',  ('Y', 'K'): '85',  ('Y', 'L'): '36',
+         ('Y', 'M'): '36',  ('Y', 'N'): '143', ('Y', 'P'): '110', ('Y', 'Q'): '99',  ('Y', 'R'): '77',
+         ('Y', 'S'): '144', ('Y', 'T'): '92',  ('Y', 'V'): '55',  ('Y', 'W'): '37',  ('Y', 'Y'): '0'
+     }
+     for i in df.index:
+         try:
+             df.at[i, 'composition'] = compositionValues(df.at[i, 'wt'], df.at[i, 'mut'])
+             df.at[i, 'polarity'] = polarityValues(df.at[i, 'wt'], df.at[i, 'mut'])
+             df.at[i, 'volume'] = volumeValues(df.at[i, 'wt'], df.at[i, 'mut'])
+             df.at[i, 'granthamScore'] = grantham_dict[df.at[i, 'wt'], df.at[i, 'mut']]
+         except KeyError:  # non-standard residue: leave all four features as 'nan'
+             df.at[i, 'composition'] = 'nan'
+             df.at[i, 'polarity'] = 'nan'
+             df.at[i, 'volume'] = 'nan'
+             df.at[i, 'granthamScore'] = 'nan'
+     return df
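A quick check on a single substitution (values follow directly from the tables above):

    import pandas as pd
    df = pd.DataFrame([{'wt': 'R', 'mut': 'H'}])
    df = add_physicochemical(df)
    # composition 0.65-0.58 = 0.07, polarity 10.5-10.4 = 0.1, volume 124-96 = 28, granthamScore '29'
    print(df[['composition', 'polarity', 'volume', 'granthamScore']])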
code/create_swissmodelSummary.py ADDED
@@ -0,0 +1 @@
+ '''
code/get_alphafoldStructures.py ADDED
@@ -0,0 +1,97 @@
+ import tarfile, glob, os
+ from biopandas.pdb import PandasPdb
+ import argparse
+ import numpy as np
+
+ parser = argparse.ArgumentParser(description='ASCARIS')
+
+ parser.add_argument('-file_name', '--file_name',
+                     help='Enter the name of the tar file to untar',
+                     default=1)
+
+ args = parser.parse_args()
+
+ alphafold = args.file_name
+
+ def threeToOne(variant):
+     # Map three-letter residue codes to one-letter codes; unrecognized codes pass through unchanged.
+     mapping = {'ALA': 'A', 'ARG': 'R', 'VAL': 'V', 'GLU': 'E', 'PRO': 'P', 'LEU': 'L', 'GLY': 'G',
+                'ASN': 'N', 'SER': 'S', 'GLN': 'Q', 'THR': 'T', 'MET': 'M', 'LYS': 'K', 'ASP': 'D',
+                'ILE': 'I', 'PHE': 'F', 'TRP': 'W', 'TYR': 'Y', 'HIS': 'H', 'CYS': 'C',
+                'UNK': 'X', 'ASX': 'O'}
+     return mapping.get(variant, variant)
+
+ # Unzip AlphaFold structures
+ def create_file():
+     os.makedirs('input_files/alphafold_structures/', exist_ok=True)
+     for f in glob.glob(f'input_files/{alphafold}'):
+         with tarfile.open(f) as tar:
+             tar.extractall(f'input_files/alphafold_structures/')
+
+     # Create summary file: one row per model with its UniProt ID, chain, CA-derived sequence and model number
+     alphafold_summary_file = open('input_files/alphafold_summary.txt', 'w')
+     alphafold_summary_file.write('uniprotID\tchain\tsequence\tmodel_num')
+     alphafold_summary_file.write('\n')
+     for f in glob.glob('input_files/alphafold_structures/*pdb*'):
+         str1 = PandasPdb().read_pdb(f)
+         str1 = str1.df['ATOM']
+         str1 = str1[['alt_loc', 'residue_name', 'residue_number', 'atom_name', 'insertion', 'chain_id']]
+         str1 = str1[str1.atom_name == 'CA']
+         str1['residue_name'] = str1['residue_name'].apply(lambda x: threeToOne(x))
+         str1['alt_loc'] = str1['alt_loc'].replace({'': np.NaN})
+         str1 = str1.drop_duplicates(['residue_name', 'residue_number'])
+         structure_residues_pdb = ''.join(str1.residue_name.to_list())
+         model_no = f.split('-')[2].strip()[1:]
+         up_name = f.split('-')[1].strip()
+         chain_id = list(set(str1.chain_id.to_list()))[0]
+         alphafold_summary_file.write(up_name)
+         alphafold_summary_file.write('\t')
+         alphafold_summary_file.write(chain_id)
+         alphafold_summary_file.write('\t')
+         alphafold_summary_file.write(structure_residues_pdb)
+         alphafold_summary_file.write('\t')
+         alphafold_summary_file.write(model_no)
+         alphafold_summary_file.write('\n')
+     alphafold_summary_file.close()
+
+
+ if __name__ == '__main__':
+     create_file()
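An illustrative invocation (the archive name is hypothetical; any AlphaFold bulk-download tarball placed under input_files/ works the same way):

    python code/get_alphafoldStructures.py -file_name UP000005640_9606_HUMAN.tar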
code/main.py ADDED
@@ -0,0 +1,35 @@
+ import pdb_featureVector
+ import alphafold_featureVector
+ import argparse
+
+ parser = argparse.ArgumentParser(description='ASCARIS')
+
+ parser.add_argument('-s', '--source_option',
+                     help='Selection of input structure data.\n 1: PDB Structures (default), 2: AlphaFold Structures',
+                     default=1)
+ parser.add_argument('-i', '--input_datapoint',
+                     help='Input file or query datapoint\n Option 1: Comma-separated list of identifiers (UniProt ID-wt residue-position-mutated residue, e.g. Q9Y4W6-N-432-T) \n Option 2: Path to a tab-separated input file')
+
+ parser.add_argument('-impute', '--imputation_state', default='True',
+                     help='Whether the resulting feature vector should be imputed or not. Default True.')
+
+ args = parser.parse_args()
+
+ input_set = args.input_datapoint
+ mode = args.source_option
+ impute = args.imputation_state
+
+ def run_featureVector(input_set, mode, impute):
+     print('*****************************************')
+     print('Feature vector generation is in progress. \nPlease check the log file for updates..')
+     print('*****************************************')
+     mode = int(mode)
+     if mode == 1:
+         pdb_featureVector.pdb(input_set, mode, impute)
+     elif mode == 2:
+         alphafold_featureVector.alphafold(input_set, mode, impute)
+
+ if __name__ == '__main__':
+     run_featureVector(input_set, mode, impute)
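Example run against AlphaFold structures, using the identifier from the help text:

    python code/main.py -s 2 -i Q9Y4W6-N-432-T -impute True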
code/manage_files.py ADDED
@@ -0,0 +1,42 @@
1
+ import os
2
+ from pathlib import Path
3
+ def manage_files(mode):
4
+ if mode== 1:
5
+ path_to_input_files = Path('input_files')
6
+ path_to_domains = path_to_input_files / 'domains.txt'
7
+ swiss_model_path = path_to_input_files / 'INDEX.json'
8
+ fisher_path = path_to_input_files / 'significant_domains.txt'
9
+ path_to_interfaces = path_to_input_files / 'H_sapiens_interfacesHQ.txt'
10
+
11
+ path_to_output_files = Path('out_files/pdb')
12
+ os.makedirs(path_to_output_files / 'pdb_structures/', exist_ok=True)
13
+ os.makedirs(path_to_output_files / 'alignment_files/', exist_ok=True)
14
+ os.makedirs(path_to_output_files / 'swissmodel_structures/', exist_ok=True)
15
+ os.makedirs(path_to_output_files / 'modbase_structures/', exist_ok=True)
16
+ os.makedirs(path_to_output_files / 'modbase_structures_individual/', exist_ok=True)
17
+ os.makedirs(path_to_output_files / 'freesasa_files/', exist_ok=True)
18
+ os.makedirs(path_to_output_files / '3D_alignment/', exist_ok=True)
19
+ path_to_alignment_files = path_to_output_files / 'alignment_files'
20
+ path_3D_alignment = path_to_output_files / '3D_alignment'
21
+ path_to_freesasa = path_to_output_files / 'freesasa_files'
22
+ buffer = path_to_output_files / 'file_buffer.txt'
23
+ outpath = path_to_output_files / 'feature_vector.txt'
24
+
25
+ return path_to_input_files, path_to_output_files, path_to_domains,fisher_path, path_to_interfaces, buffer
26
+
27
+ elif mode == 2:
28
+ path_to_input_files = Path('input_files')
29
+ path_to_domains = path_to_input_files / 'domains.txt'
30
+ fisher_path = path_to_input_files / 'significant_domains.txt'
31
+ alphafold_summary = path_to_input_files / 'alphafold_summary.txt'
32
+ path_to_interfaces = path_to_input_files / 'H_sapiens_interfacesHQ.txt'
33
+ # Unzip before using
34
+ alphafold_path = Path(path_to_input_files/'alphafold_structures')
35
+
36
+ path_to_output_files = Path('out_files/alphafold')
37
+ os.makedirs(path_to_output_files, exist_ok=True)
38
+ os.makedirs(path_to_output_files / 'freesasa_files', exist_ok=True)
39
+ os.makedirs(path_to_output_files / 'alignment_files', exist_ok=True)
40
+ os.makedirs(path_to_output_files / '3D_alignment', exist_ok=True)
41
+
42
+ return path_to_input_files,path_to_output_files, path_to_domains, fisher_path, path_to_interfaces, alphafold_path, alphafold_summary
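The two modes return tuples of different lengths (six items for PDB, seven for AlphaFold), so callers must unpack accordingly. A minimal sketch:

    from manage_files import manage_files

    # Mode 1 (PDB structures): 6 return values.
    (path_to_input_files, path_to_output_files, path_to_domains,
     fisher_path, path_to_interfaces, buffer) = manage_files(mode=1)

    # Mode 2 (AlphaFold structures): 7 return values.
    (path_to_input_files, path_to_output_files, path_to_domains, fisher_path,
     path_to_interfaces, alphafold_path, alphafold_summary) = manage_files(mode=2)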
code/pdb_featureVector.py ADDED
The diff for this file is too large to render. See raw diff
 
code/process_input.py ADDED
@@ -0,0 +1,40 @@
+ import pandas as pd
+
+ def clean_data(input_set):
+     data = pd.DataFrame()
+     try:
+         if ',' in input_set:
+             input_set = [i.strip() for i in input_set.split(',')]
+             for i in input_set:
+                 data = pd.concat([data, pd.Series([j.strip() for j in i.split('-')]).to_frame().T], ignore_index=True)
+             data.columns = ['uniprotID', 'wt', 'pos', 'mut']
+         elif '\t' in input_set:
+             input_set = [i.strip() for i in input_set.split('\t')]
+             for i in input_set:
+                 data = pd.concat([data, pd.Series([j.strip() for j in i.split('-')]).to_frame().T], ignore_index=True)
+             data.columns = ['uniprotID', 'wt', 'pos', 'mut']
+
+         elif '-' in input_set:
+             data = pd.concat([data, pd.Series([j.strip() for j in input_set.split('-')]).to_frame().T], ignore_index=True)
+             data.columns = ['uniprotID', 'wt', 'pos', 'mut']
+
+         elif '.txt' in input_set:
+             data = pd.read_csv(input_set, sep='\t', names=['uniprotID', 'wt', 'pos', 'mut'])
+             data = data[['uniprotID', 'wt', 'pos', 'mut']]
+
+         # Exclude termination codons, synonymous mutations, and any non-standard residues such as Sec, 4 or 6.
+         aa_list = ['A', 'R', 'N', 'D', 'C', 'E', 'Q', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']
+         data.wt = data.wt.str.strip()
+         data.mut = data.mut.str.strip()
+         data = data[data.wt.isin(aa_list)]
+         data = data[data.mut.isin(aa_list)]
+
+         for i in data.index:
+             data.at[i, 'datapoint'] = data.at[i, 'uniprotID'] + data.at[i, 'wt'] + str(data.at[i, 'pos']) + data.at[i, 'mut']
+
+         data = data.astype(str)
+         return data
+     except Exception:
+         # Typically a ValueError from a malformed datapoint or input file.
+         print('Please check the input format.')
+
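clean_data() accepts three input shapes: a single datapoint, a comma-separated list of datapoints, or a tab-separated file of them. A minimal sketch (the second datapoint and the file path below are illustrative):

    from process_input import clean_data

    df = clean_data('Q9Y4W6-N-432-T')                  # single datapoint
    df = clean_data('Q9Y4W6-N-432-T, P04637-R-175-H')  # comma-separated list
    df = clean_data('variants.txt')                    # tab-separated file, columns uniprotID/wt/pos/mut
    # Each call returns a string-typed DataFrame with columns
    # uniprotID, wt, pos, mut, datapoint (e.g. 'Q9Y4W6N432T').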
code/standard.py ADDED
@@ -0,0 +1,13 @@
+ def standardize(df, get_columns):
+     cols_to_change = ['sasa', 'domaindistance3D', 'disulfide', 'intMet', 'intramembrane',
+                       'naturalVariant', 'dnaBinding', 'activeSite', 'nucleotideBinding',
+                       'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis',
+                       'strand', 'helix', 'turn', 'metalBinding', 'repeat', 'caBinding',
+                       'topologicalDomain', 'bindingSite', 'region', 'signalPeptide',
+                       'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', 'peptide',
+                       'transitPeptide', 'glycosylation', 'propeptide']
+     for col in cols_to_change:  # elsewhere these features hold 3D distances; no distance is calculated here, so mark them as missing
+         df[col] = 'nan'
+     df = df[get_columns.columns]
+
+     return df
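standardize() pads a feature table with 'nan' placeholders for the distance-based columns and reorders it to match a template. A minimal sketch; the template here is hypothetical, any DataFrame whose column order defines the final feature-vector layout works:

    import pandas as pd
    from standard import standardize

    df = pd.DataFrame({'datapoint': ['Q9Y4W6N432T']})                    # raw features (toy example)
    template = pd.DataFrame(columns=['datapoint', 'sasa', 'disulfide'])  # defines final column order
    df = standardize(df, get_columns=template)                           # -> columns: datapoint, sasa, disulfide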
code/uniprotSequenceMatch.py ADDED
@@ -0,0 +1,40 @@
+ from add_sequence import *
+ import pandas as pd
+ import numpy as np
+
+ def uniprotSequenceMatch(data):
+     print('Retrieving UniProt sequences...\n')
+
+     canonical_fasta = pd.DataFrame(columns=['uniprotID', 'uniprotSequence'])
+     up_list = list(set(data['uniprotID'].to_list()))
+     for i in range(len(up_list)):
+         canonical_fasta.at[i, 'uniprotSequence'] = get_uniprot_seq(up_list[i])
+         canonical_fasta.at[i, 'uniprotID'] = up_list[i]
+
+     canonical_fasta = canonical_fasta.drop_duplicates()
+     isoform_fasta = pd.DataFrame(columns=['uniprotID', 'isoformSequence'])
+     iso_dict = []
+     for i in range(len(up_list)):
+         iso_dict.append(get_isoforms(up_list[i]))
+
+     index = 0
+     for i in iso_dict:
+         for key, val in i.items():
+             isoform_fasta.at[index, 'uniprotID'] = key
+             isoform_fasta.at[index, 'isoformSequence'] = val
+             index += 1
+     isoform_fasta = isoform_fasta.drop_duplicates()
+
+     for i in isoform_fasta.index:
+         isoform_fasta.at[i, 'whichIsoform'] = isoform_fasta.at[i, 'uniprotID'][7:10].strip()
+         isoform_fasta.at[i, 'uniprotID'] = isoform_fasta.at[i, 'uniprotID'][0:6]
+     print('Sequence files created...\n')
+
+     data = data.merge(canonical_fasta, on='uniprotID', how='left')
+     data = data.replace({'': np.NaN, 'nan': np.NaN})
+     data['whichIsoform'] = np.NaN
+     data['wt_sequence_match'] = np.NaN
+     not_match_in_uniprot = data[data.uniprotSequence.isna()]
+     uniprot_matched = data[~data.uniprotSequence.isna()]
+
+     return not_match_in_uniprot, uniprot_matched, canonical_fasta, isoform_fasta
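A minimal sketch of the expected call pattern: clean_data comes from process_input.py above, and get_uniprot_seq/get_isoforms are pulled in from add_sequence. The function splits the input into matched and unmatched rows and also returns the sequence tables it built:

    from process_input import clean_data
    from uniprotSequenceMatch import uniprotSequenceMatch

    data = clean_data('Q9Y4W6-N-432-T')
    not_matched, matched, canonical_fasta, isoform_fasta = uniprotSequenceMatch(data)
    # 'matched' holds rows whose canonical UniProt sequence was retrieved;
    # 'not_matched' holds rows with no sequence found.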
input_files/H_sapiens_interfacesHQ.txt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:90fb5f5fe31e20921290e0da588d50d2939feedac80767cdd3b46225ce849b8d
+ size 19252152
input_files/alphafold_structures/AF-A0A0A0MRZ7-F1-model_v1.cif.gz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c5a22037a2ae883cc095f647170271d6a69f38de045206e99c4ac5586658ccb3
+ size 26598
input_files/alphafold_structures/AF-A0A0A0MRZ7-F1-model_v1.pdb.gz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:93e034885f400396df77e65944c65e8d22000f011343a98d8f7727b97b378860
+ size 18469
input_files/alphafold_structures/AF-A0A0A0MRZ8-F1-model_v1.cif.gz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:367a7e9d82ad6a452f643eed923237ed149cc3cf1dabef23304d4e4f5711a191
+ size 25647
input_files/alphafold_structures/AF-A0A0A0MRZ8-F1-model_v1.pdb.gz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:449fa624948266313cdf18a365e11036b6eaa5502395ed88b58f1841ebf70e60
+ size 17763
input_files/alphafold_structures/AF-A0A0A0MRZ9-F1-model_v1.cif.gz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:35de071f52a5644df10d8181b5c6034b04734895e155b68d3e3f5133e98f3ef6
+ size 27026
input_files/alphafold_structures/AF-A0A0A0MRZ9-F1-model_v1.pdb.gz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0a509714d54bdf9b9ad7a9bcdccc4122e256cec371fb04e251f68e2e67ade17a
+ size 18748
input_files/alphafold_structures/AF-A0A0A0MS00-F1-model_v1.cif.gz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d6b9e658af67a6b4ca14f5c960c4629140eb78588c46cfe1fab3bbe2c1c7d17e
+ size 25157
input_files/alphafold_structures/AF-A0A0A0MS00-F1-model_v1.pdb.gz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b840d9a1c9de25dd6484ad2675f26e578e883c277d4e332247cb1f45a7706ffb
+ size 17329
input_files/alphafold_structures/AF-A0A0A0MS01-F1-model_v1.cif.gz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e9077d070c0fea099e5afdc10d4c599367064518be2412088e8f7f2213156f91
+ size 26786
input_files/alphafold_structures/AF-A0A0A0MS01-F1-model_v1.pdb.gz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1abd18dc11f67b8b3a3dd8b30c4a74fec7fefec62c601153401ca5c550c96dbd
+ size 18678
input_files/alphafold_structures/AF-A0A0A0MS02-F1-model_v1.cif.gz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:db309cbaaf7d073230b4ab1a98ecc8213c6cfebfe87cc4f6f3990944feef7059
+ size 26727