Spaces:
Running
Running
fatmacankara
committed on
Commit
·
c2a02c6
0
Parent(s):
Duplicate from fatmacankara/ASCARIS
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +38 -0
- README.md +14 -0
- app.py +129 -0
- code/__pycache__/add_3Dalignment.cpython-37.pyc +0 -0
- code/__pycache__/add_alignment.cpython-37.pyc +0 -0
- code/__pycache__/add_annotations.cpython-37.pyc +0 -0
- code/__pycache__/add_domains.cpython-37.pyc +0 -0
- code/__pycache__/add_interface_pos.cpython-37.pyc +0 -0
- code/__pycache__/add_sasa.cpython-37.pyc +0 -0
- code/__pycache__/add_sequence.cpython-37.pyc +0 -0
- code/__pycache__/add_structure.cpython-37.pyc +0 -0
- code/__pycache__/alphafold_featureVector.cpython-37.pyc +0 -0
- code/__pycache__/alphafold_model.cpython-37.pyc +0 -0
- code/__pycache__/calc_pc_property.cpython-37.pyc +0 -0
- code/__pycache__/manage_files.cpython-37.pyc +0 -0
- code/__pycache__/pdb_featureVector.cpython-37.pyc +0 -0
- code/__pycache__/process_input.cpython-37.pyc +0 -0
- code/__pycache__/standard.cpython-37.pyc +0 -0
- code/__pycache__/uniprotSequenceMatch.cpython-37.pyc +0 -0
- code/add_3Dalignment.py +261 -0
- code/add_alignment.py +423 -0
- code/add_annotations.py +95 -0
- code/add_domains.py +57 -0
- code/add_interface_pos.py +35 -0
- code/add_sasa.py +131 -0
- code/add_sequence.py +44 -0
- code/add_structure.py +168 -0
- code/alphafold_featureVector.py +579 -0
- code/alphafold_model.py +33 -0
- code/calc_pc_property.py +441 -0
- code/create_swissmodelSummary.py +1 -0
- code/get_alphafoldStructures.py +97 -0
- code/main.py +35 -0
- code/manage_files.py +42 -0
- code/pdb_featureVector.py +0 -0
- code/process_input.py +40 -0
- code/standard.py +13 -0
- code/uniprotSequenceMatch.py +40 -0
- input_files/H_sapiens_interfacesHQ.txt +3 -0
- input_files/alphafold_structures/AF-A0A0A0MRZ7-F1-model_v1.cif.gz +3 -0
- input_files/alphafold_structures/AF-A0A0A0MRZ7-F1-model_v1.pdb.gz +3 -0
- input_files/alphafold_structures/AF-A0A0A0MRZ8-F1-model_v1.cif.gz +3 -0
- input_files/alphafold_structures/AF-A0A0A0MRZ8-F1-model_v1.pdb.gz +3 -0
- input_files/alphafold_structures/AF-A0A0A0MRZ9-F1-model_v1.cif.gz +3 -0
- input_files/alphafold_structures/AF-A0A0A0MRZ9-F1-model_v1.pdb.gz +3 -0
- input_files/alphafold_structures/AF-A0A0A0MS00-F1-model_v1.cif.gz +3 -0
- input_files/alphafold_structures/AF-A0A0A0MS00-F1-model_v1.pdb.gz +3 -0
- input_files/alphafold_structures/AF-A0A0A0MS01-F1-model_v1.cif.gz +3 -0
- input_files/alphafold_structures/AF-A0A0A0MS01-F1-model_v1.pdb.gz +3 -0
- input_files/alphafold_structures/AF-A0A0A0MS02-F1-model_v1.cif.gz +3 -0
.gitattributes
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
input_files/alphafold_summary.txt filter=lfs diff=lfs merge=lfs -text
|
37 |
+
input_files/H_sapiens_interfacesHQ.txt filter=lfs diff=lfs merge=lfs -text
|
38 |
+
input_files/swissmodel_structures.txt filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: ASCARIS
|
3 |
+
emoji: 🦀
|
4 |
+
colorFrom: indigo
|
5 |
+
colorTo: gray
|
6 |
+
sdk: streamlit
|
7 |
+
python_version: '3.7'
|
8 |
+
sdk_version: 1.21.0
|
9 |
+
app_file: app.py
|
10 |
+
pinned: false
|
11 |
+
duplicated_from: fatmacankara/ASCARIS
|
12 |
+
---
|
13 |
+
|
14 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pandas as pd
|
3 |
+
from os import path
|
4 |
+
import sys
|
5 |
+
import streamlit.components.v1 as components
|
6 |
+
sys.path.append('code/')
|
7 |
+
#sys.path.append('ASCARIS/code/')
|
8 |
+
import pdb_featureVector
|
9 |
+
import alphafold_featureVector
|
10 |
+
import argparse
|
11 |
+
from st_aggrid import AgGrid, GridOptionsBuilder, JsCode,GridUpdateMode
|
12 |
+
showWarningOnDirectExecution = False
|
13 |
+
def download_button(object_to_download, download_filename):
    """Build an HTML snippet that auto-triggers a browser download.

    Parameters
    ----------
    object_to_download : pd.DataFrame | object
        A DataFrame is serialized as CSV (no index); anything else is
        JSON-encoded.
    download_filename : str
        Filename offered to the browser.

    Returns
    -------
    str
        A self-contained HTML document that clicks a data-URI anchor on load.
    """
    # FIX: `base64` and `json` were used but never imported anywhere in this
    # file, so the original raised NameError at runtime. Import them locally.
    import base64
    import json

    if isinstance(object_to_download, pd.DataFrame):
        object_to_download = object_to_download.to_csv(index=False)
    else:
        # Try JSON encoding for everything else.
        object_to_download = json.dumps(object_to_download)

    try:
        # Some str <-> bytes conversions are necessary here.
        b64 = base64.b64encode(object_to_download.encode()).decode()
    except AttributeError:
        # Already bytes-like; encode directly.
        b64 = base64.b64encode(object_to_download).decode()

    # FIX: load jQuery over HTTPS — the original plain-HTTP URL is blocked as
    # mixed content when the app itself is served over HTTPS.
    dl_link = f"""<html><head><title>Start Auto Download file</title><script src="https://code.jquery.com/jquery-3.2.1.min.js"></script><script>$('<a href="data:text/csv;base64,{b64}" download="{download_filename}">')[0].click()</script></head></html>"""
    return dl_link
|
30 |
+
|
31 |
+
|
32 |
+
def download_df():
    """Render a zero-height HTML component that starts downloading the
    currently selected dataframe under the filename stored in session state."""
    payload = download_button(selected_df, st.session_state.filename)
    components.html(payload, height=0)
|
37 |
+
|
38 |
+
|
39 |
+
original_title = '<p style="font-family:Trebuchet MS; color:#FD7456; font-size: 35px; font-weight:bold; text-align:center">Welcome to ASCARIS</p>'
|
40 |
+
st.markdown(original_title, unsafe_allow_html=True)
|
41 |
+
st.write('')
|
42 |
+
st.write('')
|
43 |
+
st.write('')
|
44 |
+
st.write('')
|
45 |
+
|
46 |
+
|
47 |
+
|
48 |
+
source = st.selectbox('Select Protein Structure Database (1: PDB, SwissModel, Modbase 2: AlphaFold)',[1,2])
|
49 |
+
impute = st.selectbox('Select Imputation',[True, False])
|
50 |
+
input_data = st.text_input('Enter Input Variation')
|
51 |
+
|
52 |
+
|
53 |
+
|
54 |
+
|
55 |
+
#sys.path.append(path.abspath('../code/'))
|
56 |
+
parser = argparse.ArgumentParser(description='ASCARIS')
|
57 |
+
|
58 |
+
parser.add_argument('-s', '--source_option',
|
59 |
+
help='Selection of input structure data.\n 1: PDB Structures (default), 2: AlphaFold Structures',
|
60 |
+
default=1)
|
61 |
+
parser.add_argument('-i', '--input_datapoint',
|
62 |
+
help='Input file or query datapoint\n Option 1: Comma-separated list of idenfiers (UniProt ID-wt residue-position-mutated residue (e.g. Q9Y4W6-N-432-T or Q9Y4W6-N-432-T, Q9Y4W6-N-432-T)) \n Option 2: Enter comma-separated file path')
|
63 |
+
|
64 |
+
parser.add_argument('-impute', '--imputation_state', default='True',
|
65 |
+
help='Whether resulting feature vector should be imputed or not. Default True.')
|
66 |
+
|
67 |
+
args = parser.parse_args()
|
68 |
+
|
69 |
+
input_set = input_data
|
70 |
+
mode = source
|
71 |
+
impute = impute
|
72 |
+
|
73 |
+
print('*****************************************')
|
74 |
+
print('Feature vector generation is in progress. \nPlease check log file for updates..')
|
75 |
+
print('*****************************************')
|
76 |
+
mode = int(mode)
|
77 |
+
|
78 |
+
with st.spinner('In progress...This may take a while...'):
|
79 |
+
try:
|
80 |
+
if mode == 1:
|
81 |
+
selected_df = pdb_featureVector.pdb(input_set, mode, impute)
|
82 |
+
int_builder = GridOptionsBuilder.from_dataframe(selected_df)
|
83 |
+
int_builder.configure_default_column(editable=False, filterable=True, cellStyle={'text-align': 'center'})
|
84 |
+
int_builder.configure_pagination(enabled=True, paginationAutoPageSize=False, paginationPageSize=10)
|
85 |
+
int_builder.configure_selection(selection_mode='multiple', use_checkbox=True)
|
86 |
+
gridoptions = int_builder.build()
|
87 |
+
int_return = AgGrid(selected_df,
|
88 |
+
width='100%',
|
89 |
+
height=(len(selected_df) + 4) * 35.2 + 3,
|
90 |
+
theme='light',
|
91 |
+
enable_enterprise_modules=False,
|
92 |
+
gridOptions=gridoptions,
|
93 |
+
fit_columns_on_grid_load=False,
|
94 |
+
update_mode=GridUpdateMode.SELECTION_CHANGED, # or MODEL_CHANGED
|
95 |
+
custom_css={".ag-header-cell-label": {"justify-content": "center"}})
|
96 |
+
st.success('Feature vector successfully created.')
|
97 |
+
|
98 |
+
|
99 |
+
elif mode == 2:
|
100 |
+
selected_df = alphafold_featureVector.alphafold(input_set, mode, impute)
|
101 |
+
int_builder = GridOptionsBuilder.from_dataframe(selected_df)
|
102 |
+
int_builder.configure_default_column(editable=False, filterable=True, cellStyle={'text-align': 'center'})
|
103 |
+
int_builder.configure_pagination(enabled=True, paginationAutoPageSize=False, paginationPageSize=10)
|
104 |
+
int_builder.configure_selection(selection_mode='multiple', use_checkbox=True)
|
105 |
+
gridoptions = int_builder.build()
|
106 |
+
int_return = AgGrid(selected_df,
|
107 |
+
width='100%',
|
108 |
+
height=(len(selected_df) + 4) * 35.2 + 3,
|
109 |
+
theme='light',
|
110 |
+
enable_enterprise_modules=False,
|
111 |
+
gridOptions=gridoptions,
|
112 |
+
fit_columns_on_grid_load=False,
|
113 |
+
update_mode=GridUpdateMode.SELECTION_CHANGED, # or MODEL_CHANGED
|
114 |
+
custom_css={".ag-header-cell-label": {"justify-content": "center"}})
|
115 |
+
st.success('Feature vector successfully created.')
|
116 |
+
|
117 |
+
|
118 |
+
except:
|
119 |
+
pass
|
120 |
+
download_df = pd.DataFrame()
|
121 |
+
|
122 |
+
with st.form("my_form", clear_on_submit=False):
|
123 |
+
st.text_input("Enter filename", key="filename")
|
124 |
+
submit = st.form_submit_button("Download feature vector", on_click=download_df)
|
125 |
+
|
126 |
+
|
127 |
+
|
128 |
+
|
129 |
+
|
code/__pycache__/add_3Dalignment.cpython-37.pyc
ADDED
Binary file (5.67 kB). View file
|
|
code/__pycache__/add_alignment.cpython-37.pyc
ADDED
Binary file (7.99 kB). View file
|
|
code/__pycache__/add_annotations.cpython-37.pyc
ADDED
Binary file (3.78 kB). View file
|
|
code/__pycache__/add_domains.cpython-37.pyc
ADDED
Binary file (1.44 kB). View file
|
|
code/__pycache__/add_interface_pos.cpython-37.pyc
ADDED
Binary file (1.12 kB). View file
|
|
code/__pycache__/add_sasa.cpython-37.pyc
ADDED
Binary file (3.17 kB). View file
|
|
code/__pycache__/add_sequence.cpython-37.pyc
ADDED
Binary file (1.27 kB). View file
|
|
code/__pycache__/add_structure.cpython-37.pyc
ADDED
Binary file (5.93 kB). View file
|
|
code/__pycache__/alphafold_featureVector.cpython-37.pyc
ADDED
Binary file (15.4 kB). View file
|
|
code/__pycache__/alphafold_model.cpython-37.pyc
ADDED
Binary file (1.35 kB). View file
|
|
code/__pycache__/calc_pc_property.cpython-37.pyc
ADDED
Binary file (8.84 kB). View file
|
|
code/__pycache__/manage_files.cpython-37.pyc
ADDED
Binary file (1.43 kB). View file
|
|
code/__pycache__/pdb_featureVector.cpython-37.pyc
ADDED
Binary file (33.7 kB). View file
|
|
code/__pycache__/process_input.cpython-37.pyc
ADDED
Binary file (1.69 kB). View file
|
|
code/__pycache__/standard.cpython-37.pyc
ADDED
Binary file (749 Bytes). View file
|
|
code/__pycache__/uniprotSequenceMatch.cpython-37.pyc
ADDED
Binary file (1.28 kB). View file
|
|
code/add_3Dalignment.py
ADDED
@@ -0,0 +1,261 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
This code file produces alignments between the structure and the sequence for a given protein.
|
3 |
+
|
4 |
+
"""
|
5 |
+
|
6 |
+
import math
|
7 |
+
import glob
|
8 |
+
import numpy as np
|
9 |
+
from Bio import Align
|
10 |
+
import gzip
|
11 |
+
from pathlib import Path
|
12 |
+
from Bio.Align import substitution_matrices
|
13 |
+
aligner = Align.PairwiseAligner()
|
14 |
+
|
15 |
+
def distance(x1, y1, z1, x2, y2, z2):
    """Return the Euclidean distance between (x1, y1, z1) and (x2, y2, z2)."""
    dx = x2 - x1
    dy = y2 - y1
    dz = z2 - z1
    return math.sqrt(dx * dx + dy * dy + dz * dz)
|
20 |
+
|
21 |
+
|
22 |
+
def find_distance(coordMut, coordAnnot):
    """Return the distance between two 3D coordinates as a 2-decimal string.

    Parameters
    ----------
    coordMut, coordAnnot : sequence of 3 numeric-convertible values
        Coordinates of the mutated residue and the annotated residue.

    Returns
    -------
    str | float
        ``"%.2f"``-formatted distance, the string ``'nan'`` when either
        coordinate cannot be converted, or ``np.NaN`` when *coordMut* is
        missing.
    """
    # FIX: the original guard `coordMut != np.NaN` is always True (NaN
    # compares unequal to everything), so its else-branch was dead code.
    # Test for a genuinely missing coordinate instead.
    if coordMut is None or (isinstance(coordMut, float) and math.isnan(coordMut)):
        return np.NaN
    try:
        dist = distance(float(coordMut[0]), float(coordMut[1]), float(coordMut[2]),
                        float(coordAnnot[0]), float(coordAnnot[1]), float(coordAnnot[2]))
        return "%.2f" % dist
    # FIX: the original had `except:` followed by a bare `ValueError`
    # expression (a no-op) — catch the realistic failures explicitly.
    except (ValueError, TypeError, IndexError, KeyError):
        return 'nan'
|
34 |
+
|
35 |
+
|
36 |
+
def threeToOne(variant):
    """Map a three-letter amino-acid code to its one-letter code.

    Unknown codes are returned unchanged. 'UNK' maps to 'X', and 'ASX'
    maps to 'O' (kept as-is for behavioral compatibility with the
    original branch chain).
    """
    code_table = {
        "ALA": "A", "ARG": "R", "VAL": "V", "GLU": "E", "PRO": "P",
        "LEU": "L", "GLY": "G", "ASN": "N", "SER": "S", "GLN": "Q",
        "THR": "T", "MET": "M", "LYS": "K", "ASP": "D", "ILE": "I",
        "PHE": "F", "TRP": "W", "TYR": "Y", "HIS": "H", "CYS": "C",
        "UNK": "X", "ASX": "O",
    }
    return code_table.get(variant, variant)
|
82 |
+
|
83 |
+
|
84 |
+
def get_coords(annot, alignments, coords, resnums_for_sasa, mode):
|
85 |
+
if mode == 1:
|
86 |
+
for alignment in alignments[0]:
|
87 |
+
alignment = (str(alignment).strip().split('\n'))
|
88 |
+
startGap = 0
|
89 |
+
if alignment[0].startswith('.'):
|
90 |
+
for k in alignment[0]:
|
91 |
+
if k == '.' or k == '-':
|
92 |
+
startGap += 1
|
93 |
+
else:
|
94 |
+
break
|
95 |
+
countGap = startGap
|
96 |
+
countResidue = 0
|
97 |
+
for j in alignment[0][startGap:]:
|
98 |
+
if j == '.' or j == '-':
|
99 |
+
countGap += 1
|
100 |
+
else:
|
101 |
+
countResidue += 1
|
102 |
+
if countResidue == float(annot):
|
103 |
+
break
|
104 |
+
countGap_pdb = 0
|
105 |
+
countResidue_pdb = 0
|
106 |
+
for m in alignment[2][0:countResidue + countGap - 1]:
|
107 |
+
if m == '.' or m == '-':
|
108 |
+
countGap_pdb += 1
|
109 |
+
posAtom = countResidue + countGap - countGap_pdb
|
110 |
+
|
111 |
+
realpdbStart = 0
|
112 |
+
for j in alignment[2]:
|
113 |
+
if j == '.' or j == '-':
|
114 |
+
realpdbStart += 1
|
115 |
+
else:
|
116 |
+
break
|
117 |
+
|
118 |
+
if (alignment[2][countResidue + countGap - 1] != '-') and (float(annot) >= float(realpdbStart) + 1):
|
119 |
+
try:
|
120 |
+
coordinates = alignments[1]
|
121 |
+
residue_numbers = alignments[2]
|
122 |
+
coordWeWant = coordinates[posAtom - 1]
|
123 |
+
residue_number_we_want = residue_numbers[posAtom - 1]
|
124 |
+
|
125 |
+
except:
|
126 |
+
IndexError
|
127 |
+
coordWeWant = 'nan'
|
128 |
+
else:
|
129 |
+
coordWeWant = 'nan'
|
130 |
+
return coordWeWant, posAtom, residue_number_we_want
|
131 |
+
if mode == 2:
|
132 |
+
if annot != 'nan':
|
133 |
+
if int(annot) <= 1400:
|
134 |
+
alignment = (str(alignments).strip().split('\n'))
|
135 |
+
startGap = 0
|
136 |
+
if alignment[0].startswith('.'):
|
137 |
+
for k in alignment[0]:
|
138 |
+
if k == '.' or k == '-':
|
139 |
+
startGap += 1
|
140 |
+
else:
|
141 |
+
break
|
142 |
+
countGap = startGap
|
143 |
+
countResidue = 0
|
144 |
+
for j in alignment[0][startGap:]:
|
145 |
+
if j == '.' or j == '-':
|
146 |
+
countGap += 1
|
147 |
+
else:
|
148 |
+
countResidue += 1
|
149 |
+
if countResidue == float(annot):
|
150 |
+
break
|
151 |
+
countGap_pdb = 0
|
152 |
+
countResidue_pdb = 0
|
153 |
+
for m in alignment[2][0:countResidue + countGap - 1]:
|
154 |
+
if m == '.' or m == '-':
|
155 |
+
countGap_pdb += 1
|
156 |
+
posAtom = countResidue + countGap - countGap_pdb
|
157 |
+
realpdbStart = 0
|
158 |
+
for j in alignment[2]:
|
159 |
+
if j == '.' or j == '-':
|
160 |
+
realpdbStart += 1
|
161 |
+
else:
|
162 |
+
break
|
163 |
+
if len(alignment[2]) > (countResidue + countGap - 1):
|
164 |
+
if (alignment[2][countResidue + countGap - 1] != '-') and (float(annot) >= float(realpdbStart) + 1):
|
165 |
+
try:
|
166 |
+
coordinates = coords
|
167 |
+
residue_numbers = resnums_for_sasa
|
168 |
+
coordWeWant = coordinates[posAtom - 1]
|
169 |
+
residue_number_we_want = residue_numbers[posAtom - 1]
|
170 |
+
except:
|
171 |
+
IndexError
|
172 |
+
coordWeWant = 'nan'
|
173 |
+
residue_number_we_want = 'nan'
|
174 |
+
else:
|
175 |
+
coordWeWant = 'nan'
|
176 |
+
residue_number_we_want = 'nan'
|
177 |
+
return coordWeWant, posAtom, residue_number_we_want
|
178 |
+
else:
|
179 |
+
coordWeWant = 'nan'
|
180 |
+
residue_number_we_want = 'nan'
|
181 |
+
return coordWeWant, posAtom, residue_number_we_want
|
182 |
+
else:
|
183 |
+
return np.NaN, np.NaN, np.NaN
|
184 |
+
else:
|
185 |
+
return np.NaN, np.NaN, np.NaN
|
186 |
+
|
187 |
+
|
188 |
+
def get_alignments_3D(identifier, model_num, pdb_path, pdbSequence, source, chain, pdbID, mode, path_3D_alignment,file_format = 'gzip'):
|
189 |
+
if mode == 1:
|
190 |
+
atomSequence = ''
|
191 |
+
coords = []
|
192 |
+
resnums_for_sasa = []
|
193 |
+
with open(pdb_path, encoding="utf8") as f:
|
194 |
+
for line in f.readlines():
|
195 |
+
if source != 'MODBASE':
|
196 |
+
if line[0:4].strip() == 'ATOM' and line[13:15].strip() == 'CA' and line[21].upper() == chain.upper():
|
197 |
+
atomSequence += threeToOne(line[17:20].strip())
|
198 |
+
coords.append([line[31:38].strip(), line[39:46].strip(), line[47:54].strip()])
|
199 |
+
resnums_for_sasa.append(line[22:26].strip())
|
200 |
+
elif line[0:4].strip() == 'ATOM' and line[13:15].strip() == 'CA' and line[21] == ' ':
|
201 |
+
atomSequence += threeToOne(line[17:20].strip())
|
202 |
+
coords.append([line[31:38].strip(), line[39:46].strip(), line[47:54].strip()])
|
203 |
+
resnums_for_sasa.append(line[22:26].strip())
|
204 |
+
else:
|
205 |
+
if line[0:7].strip() == 'ATOM' and line[13:15].strip() == 'CA':
|
206 |
+
atomSequence += threeToOne(line[17:20].strip())
|
207 |
+
coords.append([line[31:38].strip(), line[39:46].strip(), line[47:54].strip()])
|
208 |
+
resnums_for_sasa.append(line[22:26].strip())
|
209 |
+
|
210 |
+
f = open(Path(path_3D_alignment / f'{identifier}_{pdbID}_{str(chain)}_alignment.txt'),"w")
|
211 |
+
|
212 |
+
aligner.mode = 'local'
|
213 |
+
aligner.substitution_matrix = substitution_matrices.load("BLOSUM62")
|
214 |
+
aligner.open_gap_score = -11
|
215 |
+
aligner.extend_gap_score = -1
|
216 |
+
alignments = aligner.align(pdbSequence, atomSequence)
|
217 |
+
alignments = (list(alignments))
|
218 |
+
for alignment in alignments:
|
219 |
+
f.write(str(alignment))
|
220 |
+
f.write('\n')
|
221 |
+
f.write('\n')
|
222 |
+
return alignments, coords, resnums_for_sasa
|
223 |
+
elif mode==2:
|
224 |
+
atomSequence = ''
|
225 |
+
coords = []
|
226 |
+
resnums_for_sasa = []
|
227 |
+
if file_format == 'txt':
|
228 |
+
with open(name, encoding="utf8") as f:
|
229 |
+
for line in f.readlines():
|
230 |
+
if line[0:4].strip() == 'ATOM' and line[13:15].strip() == 'CA':
|
231 |
+
atomSequence += threeToOne(line[17:20].strip())
|
232 |
+
coords.append([line[31:38].strip(), line[39:46].strip(), line[47:54].strip()])
|
233 |
+
resnums_for_sasa.append(line[22:26].strip())
|
234 |
+
elif line[0:4].strip() == 'ATOM' and line[13:15].strip() == 'CA' and line[21] == ' ':
|
235 |
+
atomSequence += threeToOne(line[17:20].strip())
|
236 |
+
coords.append([line[31:38].strip(), line[39:46].strip(), line[47:54].strip()])
|
237 |
+
resnums_for_sasa.append(line[22:26].strip())
|
238 |
+
elif file_format == 'gzip':
|
239 |
+
with gzip.open(pdb_path, mode='rb') as f:
|
240 |
+
for line in f:
|
241 |
+
line = line.decode()
|
242 |
+
if line[0:4].strip() == 'ATOM' and line[13:15].strip() == 'CA':
|
243 |
+
atomSequence += threeToOne(line[17:20].strip())
|
244 |
+
coords.append([line[31:38].strip(), line[39:46].strip(), line[47:54].strip()])
|
245 |
+
resnums_for_sasa.append(line[22:26].strip())
|
246 |
+
elif line[0:4].strip() == 'ATOM' and line[13:15].strip() == 'CA' and line[21] == ' ':
|
247 |
+
atomSequence += threeToOne(line[17:20].strip())
|
248 |
+
coords.append([line[31:38].strip(), line[39:46].strip(), line[47:54].strip()])
|
249 |
+
resnums_for_sasa.append(line[22:26].strip())
|
250 |
+
f = open(Path(path_3D_alignment / f'{identifier}_{str(model_num)}_3Dalignment.txt'),"w")
|
251 |
+
aligner.mode = 'local'
|
252 |
+
aligner.substitution_matrix = substitution_matrices.load("BLOSUM62")
|
253 |
+
aligner.open_gap_score = -11
|
254 |
+
aligner.extend_gap_score = -1
|
255 |
+
alignments = aligner.align(pdbSequence, atomSequence)
|
256 |
+
alignments = (list(alignments))
|
257 |
+
for alignment in alignments:
|
258 |
+
f.write(str(alignment))
|
259 |
+
f.write('\n')
|
260 |
+
f.write('\n')
|
261 |
+
return alignments, coords, resnums_for_sasa
|
code/add_alignment.py
ADDED
@@ -0,0 +1,423 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from Bio import Align
|
2 |
+
from Bio.Align import substitution_matrices
|
3 |
+
from pathlib import Path
|
4 |
+
import streamlit as st
|
5 |
+
from Bio.pairwise2 import format_alignment
|
6 |
+
from Bio import pairwise2
|
7 |
+
from Bio import pairwise2
|
8 |
+
from Bio.SubsMat import MatrixInfo as matlist
|
9 |
+
|
10 |
+
|
11 |
+
|
12 |
+
"""
|
13 |
+
def do_alignment(identifier, uniprotSequence, pdbSequence, alignment_path):
|
14 |
+
aligner = Align.PairwiseAligner()
|
15 |
+
#print(f'Aligning Datapoint: {identifier}')
|
16 |
+
if len(pdbSequence) >= 1:
|
17 |
+
f = open(Path(alignment_path / f'{identifier}_alignment.txt'), "w")
|
18 |
+
aligner.mode = 'local'
|
19 |
+
aligner.substitution_matrix = substitution_matrices.load("BLOSUM62")
|
20 |
+
aligner.open_gap_score = -11
|
21 |
+
aligner.extend_gap_score = -1
|
22 |
+
alignments = aligner.align(uniprotSequence, pdbSequence)
|
23 |
+
alignments = (list(alignments))
|
24 |
+
|
25 |
+
merge_in_threes = str(alignments[0]).split('\n')
|
26 |
+
K = 3
|
27 |
+
res = ["".join(str(alignments[0]).split('\n')[idx: idx + K]) for idx in range(len(str(alignments[0]).split('\n')) - K + 1)]
|
28 |
+
slice_val = slice(0,len(res),4)
|
29 |
+
writtenlist = res[slice_val]
|
30 |
+
|
31 |
+
new_alignment = []
|
32 |
+
for i in writtenlist:
|
33 |
+
cont1 = list(filter(None, i.split('target')))
|
34 |
+
cont2 = cont1[0].split('query')
|
35 |
+
target_pos = (list(filter(None,cont2[0].split(' '))))[0]
|
36 |
+
target = (list(filter(None,cont2[0].split(' '))))[1]
|
37 |
+
alg_pos = (list(filter(None,cont2[0].split(' '))))[2]
|
38 |
+
alg = (list(filter(None,cont2[0].split(' '))))[3]
|
39 |
+
query_pos = (list(filter(None,cont2[1].split(' '))))[0]
|
40 |
+
query = (list(filter(None,cont2[1].split(' '))))[1]
|
41 |
+
if int(target_pos)>0:
|
42 |
+
new_target = int(target_pos) * 'X' + target
|
43 |
+
else:
|
44 |
+
new_target = int(target_pos) * ' ' + target
|
45 |
+
|
46 |
+
if int(alg_pos)>0:
|
47 |
+
new_alg = int(target_pos) * 'X' + target
|
48 |
+
else:
|
49 |
+
new_alg = int(target_pos) * ' ' + alg
|
50 |
+
|
51 |
+
if int(query_pos)>0:
|
52 |
+
new_query = int(target_pos) * 'X' + target
|
53 |
+
else:
|
54 |
+
new_query = int(target_pos) * ' ' + target
|
55 |
+
|
56 |
+
new_alignment.append(new_target+'\n' +new_alg +'\n' +new_query)
|
57 |
+
alignment_list = []
|
58 |
+
k = 0
|
59 |
+
for alignment in new_alignment:
|
60 |
+
k += 1
|
61 |
+
st.write('COUNT', k)
|
62 |
+
st.write('alignment')
|
63 |
+
st.write(alignment)
|
64 |
+
f.write(str(alignment))
|
65 |
+
f.write('\n')
|
66 |
+
f.write('\n')
|
67 |
+
alignment = (str(alignment).strip().split('\n'))
|
68 |
+
alignment = [''.join(['.' if m == ' ' else m for m in x]) for x in alignment]
|
69 |
+
st.write('alignment_updated')
|
70 |
+
st.write(alignment)
|
71 |
+
alignment_list.append(alignment)
|
72 |
+
return alignment_list
|
73 |
+
|
74 |
+
"""
|
75 |
+
def do_alignment(identifier, uniprotSequence, pdbSequence, alignment_path):
    """Locally align a UniProt sequence against a structure-derived sequence.

    Uses BLOSUM62 with gap open -11 / extend -1, writes each alignment to
    ``<identifier>_alignment.txt`` under *alignment_path*, and returns a list
    of alignments, each as a list of lines with spaces replaced by '.'.
    Returns None when *pdbSequence* is empty (original behavior preserved).
    """
    aligner = Align.PairwiseAligner()
    if len(pdbSequence) >= 1:
        aligner.mode = 'local'
        aligner.substitution_matrix = substitution_matrices.load("BLOSUM62")
        aligner.open_gap_score = -11
        aligner.extend_gap_score = -1
        alignments = list(aligner.align(uniprotSequence, pdbSequence))

        # FIX: removed the unused `pairwise2.align.localds(...)` /
        # `matlist.blosum62` computation — its result was never read, and
        # Bio.SubsMat is deprecated. Also removed the unused `k` counter.
        alignment_list = []
        # FIX: the file handle was opened and never closed; use a context
        # manager so the alignment file is flushed reliably.
        with open(Path(alignment_path / f'{identifier}_alignment.txt'), "w") as f:
            for alignment in alignments:
                f.write(str(alignment))
                f.write('\n')
                f.write('\n')
                rows = str(alignment).strip().split('\n')
                rows = [''.join(['.' if m == ' ' else m for m in x]) for x in rows]
                alignment_list.append(rows)
        return alignment_list
|
101 |
+
|
102 |
+
def mutation_position_on_pdb(alignment_list, pos):
|
103 |
+
which_alignment_to_go = 0
|
104 |
+
for alignment in alignment_list:
|
105 |
+
|
106 |
+
#char_list = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
|
107 |
+
#for char in alignment[1]:
|
108 |
+
# if char in char_list:
|
109 |
+
# alignment[1] = alignment[1].replace(char, '.')
|
110 |
+
|
111 |
+
|
112 |
+
which_alignment_to_go += 1
|
113 |
+
alignment_uniprot = alignment[0]
|
114 |
+
alignment_pdb = alignment[2]
|
115 |
+
startGap = 0
|
116 |
+
if alignment_uniprot.startswith('.') or alignment_uniprot.startswith('-'):
|
117 |
+
for k in alignment_uniprot:
|
118 |
+
if k == '.' or k == '-':
|
119 |
+
startGap += 1
|
120 |
+
else:
|
121 |
+
break
|
122 |
+
|
123 |
+
countGap = startGap
|
124 |
+
countResidue = 0
|
125 |
+
canonicalRes = ' '
|
126 |
+
pdbRes = ' '
|
127 |
+
for j in alignment_uniprot[startGap:]:
|
128 |
+
if j == '.' or j == '-':
|
129 |
+
countGap += 1
|
130 |
+
else:
|
131 |
+
countResidue += 1
|
132 |
+
|
133 |
+
if int(countResidue) == int(pos):
|
134 |
+
canonicalRes = alignment_uniprot[countResidue + countGap - 1]
|
135 |
+
try:
|
136 |
+
pdbRes = alignment_pdb[countResidue + countGap - 1]
|
137 |
+
except:
|
138 |
+
IndexError
|
139 |
+
pdbRes = 'nan'
|
140 |
+
break
|
141 |
+
|
142 |
+
if (alignment[1][countResidue + countGap - 1] == '|') or (alignment[1][countResidue + countGap - 1] == 'X'):
|
143 |
+
if canonicalRes == pdbRes:
|
144 |
+
pdb_alignStatus = 'aligned'
|
145 |
+
elif canonicalRes != pdbRes:
|
146 |
+
pdb_alignStatus = 'aligned*'
|
147 |
+
countGap_pdb = 0
|
148 |
+
countResidue_pdb = 0
|
149 |
+
pdbRes = ' '
|
150 |
+
for j in alignment_pdb[0:countResidue + countGap - 1]:
|
151 |
+
if j == '.' or j == '-':
|
152 |
+
countGap_pdb += 1
|
153 |
+
if alignment_pdb[countResidue + countGap - 1] == '.' or alignment_pdb[
|
154 |
+
countResidue + countGap - 1] == '-':
|
155 |
+
mutationPositionOnPDB = 'nan'
|
156 |
+
posPDB = 'nan'
|
157 |
+
|
158 |
+
|
159 |
+
else:
|
160 |
+
posPDB = countResidue + countGap - countGap_pdb
|
161 |
+
|
162 |
+
mutationPositionOnPDB = str(posPDB)
|
163 |
+
|
164 |
+
break
|
165 |
+
elif (canonicalRes == pdbRes) and ((alignment[1][countResidue + countGap - 1] == '.') or (
|
166 |
+
alignment[1][poscountResidue+ countGap - 1] == '-')):
|
167 |
+
pdb_alignStatus = 'not_aligned'
|
168 |
+
mutationPositionOnPDB = 'nan'
|
169 |
+
elif (canonicalRes != pdbRes) and ((alignment[1][countResidue + countGap - 1] == '.') or (
|
170 |
+
alignment[1][countResidue + countGap - 1] == '-')):
|
171 |
+
pdb_alignStatus = 'not_aligned'
|
172 |
+
mutationPositionOnPDB = 'nan'
|
173 |
+
elif alignment_pdb[countResidue + countGap - 1] == '.' or alignment_pdb[
|
174 |
+
countResidue + countGap - 1] == '-':
|
175 |
+
mutationPositionOnPDB = 'nan'
|
176 |
+
posPDB = 'nan'
|
177 |
+
|
178 |
+
return (pdb_alignStatus, mutationPositionOnPDB, startGap, alignment_list[which_alignment_to_go - 1])
|
179 |
+
|
180 |
+
|
181 |
+
def find_position_on_pdb_for_range_annotations(posAnnotation, startGap, alignment_to_use):
    """Map a UniProt range annotation (e.g. '12-34') onto PDB residue numbering.

    Parameters
    ----------
    posAnnotation : str
        Range in the form 'start-end' (UniProt coordinates).
    startGap : int
        Number of leading gaps in the aligned UniProt sequence.
    alignment_to_use : sequence of 3 strings
        [0] = aligned UniProt sequence, [1] = match line ('|'/'X' = match,
        '.'/'-' = gap/mismatch), [2] = aligned PDB sequence.

    Returns
    -------
    tuple
        (annotation_on_up_start, annotation_on_up_end,
         annotation_on_pdb_start, annotation_on_pdb_end); PDB values are the
        string 'nan' when the position falls into a gap or out of range.
    """
    annotation_on_pdb_start = 'nan'
    annotation_on_pdb_end = 'nan'

    # Locate the start position on the aligned (gapped) UniProt sequence.
    pos1 = int(posAnnotation.split('-')[0])
    count_gap = startGap
    count_residue = 0
    for j in alignment_to_use[0][startGap:]:
        if j == '.' or j == '-':
            count_gap += 1
        else:
            count_residue += 1
        if int(count_residue) == int(pos1):  # count gaps until the first position
            break
    annotation_on_up_start = int(pos1) + int(count_gap)

    # Same walk for the end position.
    pos2 = int(posAnnotation.split('-')[1])
    count_gap = startGap
    count_residue = 0
    for j in alignment_to_use[0][startGap:]:
        if j == '.' or j == '-':
            count_gap += 1
        else:
            count_residue += 1
        if int(count_residue) == int(pos2):  # count gaps until the first position
            break
    annotation_on_up_end = int(pos2) + int(count_gap)

    try:
        pdb_residue_start = alignment_to_use[2][annotation_on_up_start - 1].strip()
        if (pdb_residue_start == '.') or (pdb_residue_start == '-'):
            # Start falls in a PDB gap: slide forward to the first aligned residue.
            for ran in range(len(alignment_to_use[2][(annotation_on_up_start - 1):annotation_on_up_end])):
                if (alignment_to_use[2][(annotation_on_up_start - 1):annotation_on_up_end][ran] != '.') and \
                        (alignment_to_use[2][(annotation_on_up_start - 1):annotation_on_up_end][ran] != '-') and \
                        ((alignment_to_use[1][(annotation_on_up_start - 1):annotation_on_up_end][ran] == '|') or
                         (alignment_to_use[1][(annotation_on_up_start - 1):annotation_on_up_end][ran] == 'X')):
                    annotation_on_up_start += ran
                    break
        elif (pdb_residue_start != '.') and (pdb_residue_start != '-') and \
                ((alignment_to_use[1][annotation_on_up_start - 1] == '.') or (
                        alignment_to_use[1][annotation_on_up_start - 1] == '-')):
            # Residue present but unmatched: slide forward to the first match column.
            for ran in range(len(alignment_to_use[2][(annotation_on_up_start - 1):annotation_on_up_end])):
                if ((alignment_to_use[1][(annotation_on_up_start - 1):annotation_on_up_end][ran] == '|') or
                        (alignment_to_use[1][(annotation_on_up_start - 1):annotation_on_up_end][ran] == 'X')):
                    annotation_on_up_start += ran
                    break
        count_gap_pdb = 0
        if annotation_on_up_start != 'nan':
            for q in alignment_to_use[2][0:annotation_on_up_start - 1]:
                if q == '.' or q == '-':
                    count_gap_pdb += 1
            if alignment_to_use[1][annotation_on_up_start] == '-' or alignment_to_use[1][annotation_on_up_start] == '.':
                annotation_on_pdb_start = 'nan'
            else:
                annotation_on_pdb_start = int(annotation_on_up_start) - count_gap_pdb
        else:
            annotation_on_pdb_start = 'nan'
    except IndexError:
        # Annotation beyond the aligned region (e.g. isoform shorter than canonical).
        # Narrowed from a bare `except:` whose body was the no-op expression `IndexError`.
        pass

    try:
        pdb_residue_end = alignment_to_use[2][annotation_on_up_end - 1].strip()
        if pdb_residue_end == '.' or pdb_residue_end == '-':
            for ran in range(len(alignment_to_use[2][(annotation_on_up_start - 1):annotation_on_up_end])):
                if ((alignment_to_use[1][annotation_on_up_start - 1:annotation_on_up_end][ran] == '.') or
                        (alignment_to_use[1][(annotation_on_up_start - 1):][ran] == '-')):
                    annotation_on_up_start += (ran - 1)
                    annotation_on_up_end = annotation_on_up_start
                    break
        elif (pdb_residue_end != '.') and (pdb_residue_end != '-') and \
                ((alignment_to_use[1][annotation_on_up_end - 1] == '.') or (
                        alignment_to_use[1][annotation_on_up_end - 1] == '-')):
            for ran in range(len(alignment_to_use[2][(annotation_on_up_start - 1):annotation_on_up_end])):
                if ((alignment_to_use[1][annotation_on_up_start - 1:annotation_on_up_end][ran] == '.') or
                        (alignment_to_use[1][(annotation_on_up_start - 1):][ran] == '-')):
                    annotation_on_up_start += (ran - 1)
                    annotation_on_up_end = annotation_on_up_start
                    break
        count_gap_pdb = 0
        if annotation_on_up_end != 'nan':
            for q in alignment_to_use[2][0:annotation_on_up_end - 1]:
                if q == '.' or q == '-':
                    count_gap_pdb += 1
            # NOTE(review): `or`/`and` precedence below matches the original source
            # exactly: A == '-' or (A == '.' and start-is-nan).
            if alignment_to_use[1][annotation_on_up_end - 1] == '-' or alignment_to_use[1][
                    annotation_on_up_end - 1] == '.' and annotation_on_pdb_start == 'nan':
                annotation_on_pdb_end = 'nan'
            elif alignment_to_use[1][annotation_on_up_end - 1] == '-' or alignment_to_use[1][
                    annotation_on_up_end - 1] == '.' and annotation_on_pdb_start != 'nan':
                annotation_on_pdb_end = int(annotation_on_up_end) - count_gap_pdb
            else:
                annotation_on_pdb_end = int(annotation_on_up_end) - count_gap_pdb
        else:
            annotation_on_pdb_end = 'nan'
    except IndexError:
        # Say isoform 2 is matched with length 100 but canonical is 150 aa long:
        # an annotation at position 105 throws an IndexError for the isoform.
        pass

    if annotation_on_pdb_start == 'nan' and annotation_on_pdb_end != 'nan':
        annotation_on_pdb_start = annotation_on_up_start - count_gap_pdb
        if annotation_on_pdb_start == annotation_on_pdb_end:
            annotation_on_pdb_start = 'nan'
            annotation_on_pdb_end = 'nan'
    return annotation_on_up_start, annotation_on_up_end, annotation_on_pdb_start, annotation_on_pdb_end
|
283 |
+
def annotation_pos_on_pdb(annot_positions, startGap, alignment_to_use, identifier):
    """Map a set of UniProt annotation positions onto PDB residue numbering.

    Parameters
    ----------
    annot_positions : str
        Stringified list of positions, e.g. "['12', '3-7']" or 'nan'.
    startGap : int
        Number of leading gaps in the aligned UniProt sequence.
    alignment_to_use : sequence of 3 strings
        [0] = aligned UniProt sequence, [1] = match line ('|'/'X' = match),
        [2] = aligned PDB sequence.
    identifier : str
        Unused here; kept so existing callers keep working.

    Returns
    -------
    list
        Mapped PDB positions; range annotations become 'start-end' strings,
        unmappable positions are dropped.
    """
    newpos = []
    if annot_positions != 'nan':
        # Strip list punctuation so we can split the raw position tokens.
        annot_positions = str(annot_positions).replace("'", '').replace('[', '').replace(']', '')
        positionList_perAnnotation = [h.strip() for h in annot_positions.split(',')]

        position_start_on_pdb = 'nan'
        position_end_on_pdb = 'nan'
        positionList_perAnnotation = [p for p in positionList_perAnnotation if p != 'nan']
        for position in range(len(positionList_perAnnotation)):
            token = str(positionList_perAnnotation[position])
            if ('-' not in token) and (token != '?') and (token != '') and (len(token) != 0):
                # Single-residue annotation: walk the gapped UniProt sequence.
                count_gap = startGap
                count_residue = 0
                for j in alignment_to_use[0][startGap:]:
                    if j == '.' or j == '-':
                        count_gap += 1
                    else:
                        count_residue += 1
                    try:
                        if int(count_residue) == int(token):
                            break
                    except ValueError:
                        pass  # non-numeric token; int(token) below raises as before

                annotation_on_up = int(token) + int(count_gap)
                try:
                    pdb_residue_start = alignment_to_use[2][annotation_on_up - 1].strip()
                except IndexError:
                    pdb_residue_start = 'nan'
                if pdb_residue_start != 'nan':
                    try:
                        if (pdb_residue_start == '.') or (pdb_residue_start == '-'):
                            # Position falls in a PDB gap: look for an aligned residue.
                            for ran in range(len(alignment_to_use[2][(annotation_on_up - 1):annotation_on_up])):
                                if (alignment_to_use[2][(annotation_on_up - 1):annotation_on_up][ran] != '.') and \
                                        (alignment_to_use[2][(annotation_on_up - 1):annotation_on_up][ran] != '-') and \
                                        ((alignment_to_use[1][(annotation_on_up - 1):annotation_on_up][ran] == '|') or
                                         (alignment_to_use[1][(annotation_on_up - 1):annotation_on_up][ran] == 'X')):
                                    annotation_on_up += ran
                                    break
                        elif (pdb_residue_start != '.') and (pdb_residue_start != '-') and \
                                ((alignment_to_use[1][annotation_on_up - 1] == '.') or (
                                        alignment_to_use[1][annotation_on_up - 1] == '-')):
                            for ran in range(len(alignment_to_use[2][(annotation_on_up - 1):annotation_on_up])):
                                if ((alignment_to_use[1][(annotation_on_up - 1):annotation_on_up][ran] == '|') or
                                        (alignment_to_use[1][(annotation_on_up - 1):annotation_on_up][ran] == 'X')):
                                    annotation_on_up += ran
                                    break
                        count_gap_pdb = 0
                        for q in alignment_to_use[2][0:annotation_on_up - 1]:
                            if q == '.' or q == '-':
                                count_gap_pdb += 1
                        if alignment_to_use[1][annotation_on_up] == '-' or alignment_to_use[1][annotation_on_up] == '.':
                            annotation_on_pdb = 'nan'
                        else:
                            annotation_on_pdb = int(annotation_on_up) - count_gap_pdb

                        if count_gap_pdb == annotation_on_up:
                            annotation_on_pdb = 'nan'
                        try:
                            # TypeError can occur here when annotation_on_pdb is the
                            # string 'nan' (the original bare except swallowed it too).
                            if alignment_to_use[2][count_gap_pdb + annotation_on_pdb - 1] == '.' or \
                                    alignment_to_use[2][count_gap_pdb + annotation_on_pdb - 1] == '-':
                                annotation_on_pdb = 'nan'
                        except (IndexError, TypeError):
                            annotation_on_pdb = 'nan'
                    except IndexError:
                        annotation_on_pdb = 'nan'

                    newpos.append(annotation_on_pdb)

            elif ('-' in token) and (token != '?') and (token != ' ') and (len(token) != 0):
                # Range annotation: delegate, calling the mapper once (the original
                # called it twice with identical arguments).
                try:
                    mapped = find_position_on_pdb_for_range_annotations(token, startGap, alignment_to_use)
                    position_start_on_pdb = mapped[2]
                    position_end_on_pdb = mapped[3]
                except ValueError:
                    pass  # malformed range such as '?-12'; keep previous values
                newpos.append(str(position_start_on_pdb) + '-' + str(position_end_on_pdb))
            else:
                pass
    newpos = [p for p in newpos if p != 'nan']
    return newpos
|
388 |
+
|
389 |
+
def final_stage(df, annotation_list, alignment_path):
    """Map mutation, annotation and domain positions onto PDB coordinates.

    For every row: align the UniProt and PDB sequences, record the alignment
    status and the mutation's PDB position, remap every annotation column in
    *annotation_list*, and remap the domain boundaries (domStart/domEnd).

    Returns the dataframe with all columns cast to str.
    """
    for i in df.index:
        identifier = df.at[i, 'uniprotID'] + '_' + df.at[i, 'pdbID'] + '_' + df.at[i, 'chain'] + '_'
        alignment_list = do_alignment(identifier, df.at[i, 'uniprotSequence'], df.at[i, 'pdbSequence'], alignment_path)

        # Run the mapping once and unpack, instead of calling
        # mutation_position_on_pdb four times per row as before.
        align_status, mut_pos_on_pdb, startGap, alignment_to_use = \
            mutation_position_on_pdb(alignment_list, df.at[i, 'pos'])
        df.at[i, 'pdb_alignStatus'] = align_status
        df.at[i, 'mutationPositionOnPDB'] = mut_pos_on_pdb

        for annot in annotation_list:
            df.at[i, annot] = annotation_pos_on_pdb(df.at[i, annot], startGap, alignment_to_use, identifier)

        domStart = str(df.at[i, 'domStart'])
        domEnd = str(df.at[i, 'domEnd'])
        if domStart != 'nan' and domEnd != 'nan' and \
                ((domStart != '-1' and domEnd != '-1' and
                  domStart != '-1.0' and domEnd != '-1.0')):
            # '-1' encodes "no domain"; everything else is a real boundary pair.
            domainLoc = domStart.split('.')[0] + '-' + domEnd.split('.')[0]
            domain_pos = find_position_on_pdb_for_range_annotations(domainLoc, startGap, alignment_to_use)
            df.at[i, 'domainStartonPDB'] = domain_pos[2]
            df.at[i, 'domainEndonPDB'] = domain_pos[3]
        elif domStart != '-1' or domEnd != '-1' or \
                domStart != '-1.0' or domEnd != '-1.0':
            df.at[i, 'domainStartonPDB'] = 'nan'
            df.at[i, 'domainEndonPDB'] = 'nan'

    df = df.astype(str)
    return df
+
|
419 |
+
def alignment(dataframe_to_align, annotation_list, alignment_path):
    """Public entry point: map all positions in *dataframe_to_align* to PDB coordinates.

    Thin wrapper around final_stage; the unused local `domainList` from the
    original has been removed.
    """
    return final_stage(dataframe_to_align, annotation_list, alignment_path)
|
423 |
+
#
|
code/add_annotations.py
ADDED
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import ssl
|
2 |
+
import requests as r
|
3 |
+
from decimal import *
|
4 |
+
import numpy as np
|
5 |
+
def add_annotations(dataframe):
    """Download UniProt feature annotations for every protein and attach them.

    For each unique ``uniprotID`` the UniProt flat-text entry is fetched over
    HTTP and its FT (feature table) lines are parsed into per-feature columns.
    Range positions are normalised from 'a..b' to 'a-b', and a '<name>Binary'
    column is added per feature marking whether the mutated position
    (``pos`` column) hits the annotation ('1'), misses it ('0'), or is
    undetermined (NaN).

    Returns the augmented dataframe (all values as str).
    """
    print('Downloading UniProt sequence annotations...\n')
    # Some deployment environments lack a usable certificate bundle; keep the
    # original pipeline's unverified-context fallback.
    ssl._create_default_https_context = ssl._create_unverified_context

    original_annot_name = ['DISULFID', 'INIT_MET', 'INTRAMEM', 'VARIANT', 'DNA_BIND', 'ACT_SITE', 'NP_BIND', 'LIPID',
                           'SITE',
                           'TRANSMEM', 'CROSSLNK', 'MUTAGEN', 'STRAND', 'HELIX', 'TURN', 'METAL', 'REPEAT', 'TOPO_DOM',
                           'CA_BIND', 'BINDING', 'REGION', 'SIGNAL', 'MOD_RES', 'ZN_FING', 'MOTIF', 'COILED', 'PEPTIDE',
                           'TRANSIT', 'CARBOHYD', 'PROPEP']
    annotation_list = ['disulfide', 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
                       'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis', 'strand',
                       'helix', 'turn', 'metalBinding', 'repeat', 'topologicalDomain', 'caBinding', 'bindingSite',
                       'region',
                       'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', 'peptide',
                       'transitPeptide', 'glycosylation', 'propeptide']

    dataframe = dataframe.reset_index().drop(['index'], axis=1)

    for annot in original_annot_name:
        dataframe[annot] = ''

    for protein in list(set(dataframe.uniprotID.to_list())):
        print('Downloading annotations for ' + protein)
        uniprot_entry = r.get("http://www.uniprot.org/uniprot/" + protein + ".txt")
        uniprot_entry = uniprot_entry.text.split('\n')

        annot_for_protein = []
        for annotation in original_annot_name:
            for line in uniprot_entry:
                # Keep only primary FT lines (skip evidence/note continuations).
                if annotation.strip() in line and line.startswith(
                        'FT') and 'evidence' not in line and 'ECO' not in line and 'note' not in line:
                    annot_for_protein.append(list(filter(None, line.split(' ')))[1:])
        for select in annot_for_protein:
            if select[0] not in dataframe.columns:
                dataframe.loc[dataframe.uniprotID == protein, select[0]] = str((select[1] + '; '))
            else:
                dataframe.loc[dataframe.uniprotID == protein, select[0]] += str((select[1] + '; '))
    for idx in range(len(original_annot_name)):
        dataframe = dataframe.rename(columns={original_annot_name[idx]: annotation_list[idx]})

    # Fix annotation positions
    print('Processing positions...\n')
    for i in dataframe.index:
        for annot in dataframe.columns[-30:]:
            if annot != 'disulfide':
                if dataframe.at[i, annot] != 'nan':
                    dataframe.at[i, annot] = ([x for x in [k.strip() for k in dataframe.at[i, annot].split(';')] if x])
                    if '..' in str(dataframe.at[i, annot]):
                        # UniProt ranges arrive as 'start..end'; normalise to 'start-end'.
                        dataframe.at[i, annot] = str(dataframe.at[i, annot]).replace('..', '-')
            else:
                # Disulfide bonds pair two positions; flatten 'a..b' entries into
                # individual positions. (Comprehension variables renamed so they
                # no longer shadow the row index `i`.)
                if dataframe.at[i, annot] != 'nan':
                    dataframe.at[i, annot] = dataframe.at[i, annot].split(';')
                    dataframe.at[i, annot] = [part.split('..') for part in dataframe.at[i, annot]]
                    dataframe.at[i, annot] = [e for v in dataframe.at[i, annot] for e in v]
                    dataframe.at[i, annot] = [p for p in dataframe.at[i, annot] if p != ' ']

    # Add binary annotations
    print('Adding binary annotations...\n')
    dataframe = dataframe.astype('str')
    for i in dataframe.index:
        for k in annotation_list:  # get the positions of each attribute as a list
            txt = k + 'Binary'
            dataframe.at[i, txt] = Decimal('nan')
            try:
                for positions in dataframe.at[i, k].split(','):
                    position = positions.strip('[').strip(']').replace("'", "")
                    if position != 'nan' and position != '' and '-' not in position and int(
                            dataframe.at[i, 'pos']) == int(position):
                        dataframe.at[i, txt] = '1'
                        break
                    elif position != 'nan' and position != '' and '-' not in position and int(
                            dataframe.at[i, 'pos']) != int(position):
                        dataframe.at[i, txt] = '0'
                    elif position != 'nan' and position != '' and '-' in position:
                        if int(position.split('-')[0]) < int(dataframe.at[i, 'pos']) < int(position.split('-')[1]):
                            dataframe.at[i, txt] = '1'
                            break
                        else:
                            dataframe.at[i, txt] = '0'
            except ValueError:
                # Non-numeric position token (e.g. '?'): leave the binary flag as-is.
                # Narrowed from a bare `except:` with the no-op expression `ValueError`.
                pass

    # Final corrections
    dataframe = dataframe.replace({'[\'?\']': 'nan'})
    dataframe = dataframe.replace({'[]': 'nan'})
    return dataframe
|
95 |
+
|
code/add_domains.py
ADDED
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from collections import Counter
|
2 |
+
import pandas as pd
|
3 |
+
|
4 |
+
def add_domains(data, path_to_domains):
|
5 |
+
domains = pd.read_csv(path_to_domains, delimiter=' ')
|
6 |
+
data = data.merge(domains, right_on='proteinID', left_on='uniprotID', how='left')
|
7 |
+
data = data.drop(['proteinID'], axis=1)
|
8 |
+
# Label each data point as range or notRange based on the relative distance of mutation and domain boundaries.
|
9 |
+
data = data.astype('str')
|
10 |
+
data.domStart = data.domStart.astype('float')
|
11 |
+
data.domEnd = data.domEnd.astype('float')
|
12 |
+
|
13 |
+
for i in data.index:
|
14 |
+
if data.at[i, 'domain'] != 'nan':
|
15 |
+
if int(data.at[i, 'domStart']) <= int(data.at[i, 'pos']) <= int(data.at[i, 'domEnd']):
|
16 |
+
data.at[i, 'distance'] = 0
|
17 |
+
else:
|
18 |
+
distance = min(abs(int(data.at[i, 'domStart']) - int(data.at[i, 'pos'])),
|
19 |
+
abs(int(data.at[i, 'domEnd']) - int(data.at[i, 'pos'])))
|
20 |
+
data.at[i, 'distance'] = int(distance)
|
21 |
+
else:
|
22 |
+
data.at[i, 'distance'] = 'nan'
|
23 |
+
|
24 |
+
data = data.sort_values(by=['datapoint', 'distance']).reset_index(drop=True) # Distances will be sorted.
|
25 |
+
|
26 |
+
# Keep the one with the least distance. But we may have more than one range domains for a datapoint if distance = 0.
|
27 |
+
# For this reason first we need to separate range ones so that when we take the first occurance to get the closest one
|
28 |
+
# for non range ones, other distance=0 ones wont disappear.
|
29 |
+
|
30 |
+
data_range = data[data.distance == 0]
|
31 |
+
data_out_range = data[data.distance != 0]
|
32 |
+
|
33 |
+
# For the range ones, find the most occurance
|
34 |
+
|
35 |
+
dom = []
|
36 |
+
for i in data_range.index:
|
37 |
+
dom.append(data_range.at[i, 'domain'])
|
38 |
+
|
39 |
+
domainCount = Counter(dom) # Occurance of domains.
|
40 |
+
|
41 |
+
# For out of range ones, take the closest distance.
|
42 |
+
data_out_range = data_out_range.drop_duplicates(['datapoint'], keep='first') # Already sorted above.
|
43 |
+
domain_counts = pd.DataFrame(domainCount.items(), columns=['domain', 'count'])
|
44 |
+
data_range_counts = data_range.merge(domain_counts, on='domain')
|
45 |
+
data_range_counts = data_range_counts.sort_values(['datapoint', 'count'])
|
46 |
+
data_range_counts = data_range_counts.drop_duplicates(['datapoint'], keep='last') # Take with the higher count.
|
47 |
+
data_range_counts = data_range_counts.drop(['count'], axis=1)
|
48 |
+
|
49 |
+
# Merge them back together
|
50 |
+
|
51 |
+
frames = [data_range_counts, data_out_range]
|
52 |
+
data = pd.concat(frames, sort=False) # Here when you concat two data frames, we might have range and not range with
|
53 |
+
# min distance for the same data point. Delete the one coming from notRange one.
|
54 |
+
data = data.sort_values(['datapoint', 'distance']).reset_index(drop=True)
|
55 |
+
data = data.drop_duplicates(['datapoint'], keep='first')
|
56 |
+
data = data.astype(str)
|
57 |
+
return data
|
code/add_interface_pos.py
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
def get_interface_positions(dataframe, column1, column2):
    """Collect interface residue positions per protein from paired IRES columns.

    For each row, positions listed in '<column1>_IRES' / '<column2>_IRES'
    (strings like '[1,2,4-6]') are accumulated per protein ID; afterwards the
    accumulated strings are expanded into lists of ints, with 'a-b' ranges
    expanded inclusively (single positions first, then range expansions,
    matching the original ordering).

    Returns a dict mapping protein ID -> list of interface positions.
    """
    interface_positions = {}
    for i in dataframe.index:
        # The original duplicated this logic verbatim for column1 and column2.
        for col in (column1, column2):
            ires = dataframe.at[i, col + '_IRES']
            if ires == '[]':
                continue  # no interface residues recorded for this row/column
            protein = dataframe.at[i, col]
            if protein not in interface_positions:
                interface_positions[protein] = ires
            else:
                interface_positions[protein] = interface_positions[protein].strip(']') + ',' + ires.strip('[')

    try:
        for key, value in interface_positions.items():
            singles = []
            expanded = []
            if value != '[]':
                valueList = value.split(',')
                valueList[0] = str(valueList[0]).strip('[')
                valueList[-1] = str(valueList[-1]).strip(']')
                for val in valueList:
                    if '-' in val:
                        for rpos in range(int(val.split('-')[0]), int(val.split('-')[1]) + 1):
                            expanded.append(rpos)
                    else:
                        singles.append(int(val))
                interface_positions[key] = singles + expanded
    except ValueError:
        # Malformed position token: abandon expansion mid-way, as the original
        # did — but narrowed from a bare `except:` that swallowed every error.
        pass

    return interface_positions
|
code/add_sasa.py
ADDED
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import glob
|
2 |
+
import ssbio.utils
|
3 |
+
import subprocess
|
4 |
+
import ssbio
|
5 |
+
import os.path as op
|
6 |
+
from add_3Dalignment import *
|
7 |
+
import os
|
8 |
+
from pathlib import Path
|
9 |
+
import gzip
|
10 |
+
import shutil
|
11 |
+
import streamlit as st
|
12 |
+
|
13 |
+
def _invoke_freesasa(infile, outfile, include_hetatms, force_rerun):
    """Run the freesasa CLI on *infile*, writing RSA-format output to *outfile*.

    Skips the run when *outfile* already exists and force_rerun is False
    (via ssbio.utils.force_rerun).
    """
    if ssbio.utils.force_rerun(flag=force_rerun, outfile=outfile):
        if include_hetatms:
            shell_command = 'freesasa --format=rsa --hetatm {} -o {}'.format(infile, outfile)
        else:
            shell_command = 'freesasa --format=rsa {} -o {}'.format(infile, outfile)
        command = subprocess.Popen(shell_command,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE,
                                   shell=True)
        out, err = command.communicate()


def run_freesasa(infile, outfile, include_hetatms=True, outdir=None, force_rerun=False, file_type='gzip'):
    """Compute solvent accessibility for a structure file with freesasa.

    Parameters
    ----------
    infile : path-like
        Input structure; a plain .pdb when file_type='pdb', a gzipped PDB
        when file_type='gzip' (decompressed to ./file_temp.pdb first).
    outfile : str
        Output file name; joined onto *outdir* (or CWD when outdir is falsy).
    include_hetatms : bool
        Pass --hetatm to freesasa.
    force_rerun : bool
        Re-run even if *outfile* exists.

    Returns
    -------
    str
        The (joined) output file path.

    The duplicated command-building/subprocess code of the original is now a
    single helper. NOTE(review): the gzip path leaves ./file_temp.pdb behind,
    as the original did — confirm nothing depends on it before cleaning up.
    """
    if not outdir:
        outdir = ''
    outfile = op.join(outdir, outfile)

    if file_type == 'pdb':
        _invoke_freesasa(infile, outfile, include_hetatms, force_rerun)
    elif file_type == 'gzip':
        # freesasa reads plain text, so decompress to a scratch file first.
        with gzip.open(infile, 'rb') as f_in:
            with open('file_temp.pdb', 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
        infile = 'file_temp.pdb'
        _invoke_freesasa(infile, outfile, include_hetatms, force_rerun)

    return outfile
|
49 |
+
|
50 |
+
def calculate_freesasa(ID, model_num, existing_free_sasa, path_to_input, path_to_output_files, file_type='gzip'):
    """Run freesasa for one AlphaFold model unless a result already exists.

    The AlphaFold file-name version tag (e.g. 'model_v1') is recovered from the
    first file found in *path_to_input*; output goes to
    <path_to_output_files>/freesasa_files/<model file>.txt.
    """
    print('Calculating surface area...\n')
    pattern = str(Path(path_to_input / '*'))
    # e.g. 'AF-P12345-F1-model_v1.pdb.gz' -> 'model_v1'
    version_tag = glob.glob(pattern)[0].split('-')[-1].split('.')[0]

    if ID in existing_free_sasa:
        return  # already computed on a previous run

    if file_type == 'gzip':
        model_file = f'AF-{ID}-F{model_num}-{version_tag}.pdb.gz'
    elif file_type == 'pdb':
        model_file = f'AF-{ID}-F{model_num}-model_v1.pdb'
    else:
        return  # unknown input type: nothing to do (matches original behaviour)

    run_freesasa(Path(path_to_input / model_file),
                 Path(path_to_output_files / f'freesasa_files/{model_file}.txt'),
                 include_hetatms=True, outdir=None, force_rerun=False)
+
|
67 |
+
def sasa(source, pdbID, uniprotID, sasa_pos, wt, mode, path_to_output_files, file_type='gzip'):
    """Look up the solvent accessibility of one residue in freesasa RSA output.

    mode 1 matches files by *pdbID* (naming depends on *source*: PDB /
    MODBASE / SWISSSMODEL); mode 2 matches by *uniprotID* for AlphaFold-style
    file names. The RSA line whose residue number (cols 10:13) equals
    *sasa_pos* supplies the value (cols 22:28); a trailing '*' flags that the
    residue letter did not match the expected wild type *wt*.

    Returns the SASA value as a string, or 'nan' when not found.
    Files are now opened with `with` so handles are closed (the original
    leaked them on every call).
    """
    if mode == 1:
        sasa = 'nan'
        for filename in list(Path(path_to_output_files / 'freesasa_files').glob("*")):
            if source == 'PDB':
                fname = str(filename).split('.')[0].split('/')[-1].upper()
            elif source == 'MODBASE':
                fname = str(filename).split('.')[0].split('/')[-1]
            elif source == 'SWISSSMODEL':
                fname = str(filename).split('_')[2]
            if pdbID == fname:
                with open(filename, 'r') as handle:
                    for k in handle.readlines():
                        if k.strip()[10:13] == sasa_pos:
                            residue = str(k[4:7].strip())
                            if wt == threeToOne(residue):
                                return str(k[22:28]).strip('\n')
                            # '*' marks a wild-type mismatch with the RSA file.
                            return str(k[22:28]).strip('\n') + '*'
            else:
                # NOTE(review): returns on the FIRST non-matching file, so later
                # files are never inspected; preserved because callers may rely
                # on it, but it looks like a bug worth confirming.
                return 'nan'  #######

    if mode == 2:
        # NOTE(review): `sasa_pos != np.nan` is always True (NaN never compares
        # equal), so the else branch below is dead code; preserved as-is.
        # (np.NaN was renamed np.nan — the old alias was removed in NumPy 2.)
        if sasa_pos != np.nan:
            sasa = 'nan'
            if file_type == 'pdb':
                for filename in list(Path(path_to_output_files / 'freesasa_files').glob("*")):
                    # BUGFIX: the original called .split('/') on a *list*
                    # (AttributeError on every iteration); parse the filename
                    # the same way the gzip branch does, minus the 'AF-' split.
                    fname = list(filter(None, str(filename).split('.')))[0].split('/')[-1].upper()
                    if uniprotID == fname:
                        with open(filename, 'r') as handle:
                            for k in handle.readlines():
                                if k.strip()[10:13] == sasa_pos:
                                    residue = str(k[4:7].strip())
                                    if wt == threeToOne(residue):
                                        sasa = str(k[22:28]).strip('\n')
                                    elif wt != threeToOne(residue):
                                        sasa = str(k[22:28]).strip('\n') + '*'
                return sasa
            elif file_type == 'gzip':
                for filename in list(Path(path_to_output_files / 'freesasa_files').glob("*")):
                    # 'AF-P12345-F1-...' -> 'P12345'
                    fname = list(filter(None, str(filename).split('.')))[0].split('/')[-1].split('-')[1].upper()
                    if uniprotID == fname:
                        with open(filename, 'r') as handle:
                            for k in handle.readlines():
                                if str(k.strip()[10:13]) == str(sasa_pos):
                                    residue = str(k[4:7].strip())
                                    if wt == threeToOne(residue):
                                        sasa = str(k[22:28]).strip('\n')
                                    elif wt != threeToOne(residue):
                                        sasa = str(k[22:28]).strip('\n') + '*'
                    else:
                        # NOTE(review): a later non-matching file resets a value
                        # found earlier; preserved from the original.
                        sasa = 'nan'
                return sasa
        else:
            sasa = 'nan'
            return sasa
|
code/add_sequence.py
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import requests as r
|
2 |
+
from io import StringIO
|
3 |
+
from Bio import SeqIO
|
4 |
+
import xml.etree.ElementTree as ET
|
5 |
+
|
6 |
+
def get_uniprot_seq(protein_id):
    """Download the canonical sequence for a UniProt accession.

    Fetches <uniprot.org>/<protein_id>.fasta and parses it with Biopython.
    Returns the sequence string, or '' when no FASTA record was parsed
    (e.g. unknown accession / empty response).
    """
    print('Fetching UniProt Sequences for ID: ', protein_id)
    baseUrl = "http://www.uniprot.org/uniprot/"
    currentUrl = baseUrl + protein_id + ".fasta"
    response = r.post(currentUrl)
    cData = ''.join(response.text)
    Seq = StringIO(cData)
    pSeq = list(SeqIO.parse(Seq, 'fasta'))
    try:
        return str(pSeq[0].seq)
    except IndexError:
        # No record parsed. (The original used a bare `except:` whose body was
        # the no-op expression `IndexError`.)
        return str('')
|
19 |
+
|
20 |
+
|
21 |
+
def get_isoforms(protein_id):
    """Fetch isoform sequences for a UniProt accession from the EBI Proteins API.

    Returns a dict mapping isoform accession -> sequence; an empty dict on any
    failure (network error, unparsable XML, missing elements), matching the
    original "best effort" contract.
    """
    print('Fetching UniProt Isoforms for ID: ', protein_id)
    try:
        # key: accession number, value: sequence
        isoforms = dict()
        req = r.get('https://www.ebi.ac.uk/proteins/api/proteins/{}/isoforms'.format(protein_id))
        # parse the returned XML
        uniprot = ET.fromstring(req.text)
        for isoform in uniprot:
            seq = isoform.find('{http://uniprot.org/uniprot}sequence')
            iso_accession = isoform.find('{http://uniprot.org/uniprot}accession')
            if seq.text and iso_accession.text:
                isoforms[iso_accession.text] = seq.text
        return isoforms
    except Exception:
        # The original used a bare `except:` with the no-op expression
        # `AttributeError`; made the broad catch explicit while keeping the
        # "return {} on any failure" behaviour.
        return {}
|
code/add_structure.py
ADDED
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import time
|
3 |
+
import json
|
4 |
+
import zlib
|
5 |
+
from xml.etree import ElementTree
|
6 |
+
from urllib.parse import urlparse, parse_qs, urlencode
|
7 |
+
import requests
|
8 |
+
from requests.adapters import HTTPAdapter, Retry
|
9 |
+
from unipressed import IdMappingClient
|
10 |
+
|
11 |
+
## Code adapted from UniProt documentation.
|
12 |
+
def get_pdb_ids_2(protein_id):
    """Map a UniProt accession to PDB IDs via the UniProt REST ID-mapping API.

    Submits an ID-mapping job (UniProtKB_AC-ID -> PDB), polls until the job
    has results, then downloads and merges all result pages.

    Code adapted from the UniProt ID-mapping documentation example.

    Parameters
    ----------
    protein_id : str
        UniProt accession to map.

    Returns
    -------
    list
        The mapped PDB identifiers (the ``'to'`` field of each result record).

    Notes
    -----
    NOTE(review): if ``check_id_mapping_results_ready`` returns False,
    ``results`` is never assigned and the final return raises
    ``NameError`` — confirm whether that path can occur in practice.
    """
    # Seconds to wait between job-status polls.
    POLLING_INTERVAL = 5
    API_URL = "https://rest.uniprot.org"

    # Session with automatic retries on transient 5xx server errors.
    retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504])
    session = requests.Session()
    session.mount("https://", HTTPAdapter(max_retries=retries))

    def check_response(response):
        # Raise on HTTP errors, printing the server's JSON error payload first.
        try:
            response.raise_for_status()
        except requests.HTTPError:
            print(response.json())
            raise

    def submit_id_mapping(from_db, to_db, ids):
        # Start an asynchronous ID-mapping job; returns its job identifier.
        request = requests.post(
            f"{API_URL}/idmapping/run",
            data={"from": from_db, "to": to_db, "ids": ids},
        )
        check_response(request)
        return request.json()["jobId"]

    def get_next_link(headers):
        # Extract the pagination URL from the HTTP "Link" header, if present.
        # Returns None (implicitly) when there is no next page.
        re_next_link = re.compile(r'<(.+)>; rel="next"')
        if "Link" in headers:
            match = re_next_link.match(headers["Link"])
            if match:
                return match.group(1)

    def check_id_mapping_results_ready(job_id):
        # Poll the job status until results are available.
        # Any jobStatus other than "RUNNING" is treated as an error and raised.
        while True:
            request = session.get(f"{API_URL}/idmapping/status/{job_id}")
            check_response(request)
            j = request.json()
            if "jobStatus" in j:
                if j["jobStatus"] == "RUNNING":
                    print(f"Retrying in {POLLING_INTERVAL}s")
                    time.sleep(POLLING_INTERVAL)
                else:
                    raise Exception(j["jobStatus"])
            else:
                # No jobStatus key: the job finished; report whether anything
                # (successes or failures) was produced.
                return bool(j["results"] or j["failedIds"])

    def get_batch(batch_response, file_format, compressed):
        # Generator yielding each subsequent result page, following
        # "Link: rel=next" headers until pagination ends.
        batch_url = get_next_link(batch_response.headers)
        while batch_url:
            batch_response = session.get(batch_url)
            batch_response.raise_for_status()
            yield decode_results(batch_response, file_format, compressed)
            batch_url = get_next_link(batch_response.headers)

    def combine_batches(all_results, batch_results, file_format):
        # Merge one page into the accumulated results, format-dependently.
        if file_format == "json":
            for key in ("results", "failedIds"):
                if key in batch_results and batch_results[key]:
                    all_results[key] += batch_results[key]
        elif file_format == "tsv":
            # Skip the header row repeated on every TSV page.
            return all_results + batch_results[1:]
        else:
            return all_results + batch_results
        return all_results

    def get_id_mapping_results_link(job_id):
        # Fetch the redirect URL from which the finished job's results
        # can be downloaded.
        url = f"{API_URL}/idmapping/details/{job_id}"
        request = session.get(url)
        check_response(request)
        return request.json()["redirectURL"]

    def decode_results(response, file_format, compressed):
        # Decode one HTTP response into Python data, handling optional
        # gzip compression and the json/tsv/xlsx/xml formats.
        if compressed:
            # 16 + MAX_WBITS tells zlib to expect a gzip header.
            decompressed = zlib.decompress(response.content, 16 + zlib.MAX_WBITS)
            if file_format == "json":
                j = json.loads(decompressed.decode("utf-8"))
                return j
            elif file_format == "tsv":
                return [line for line in decompressed.decode("utf-8").split("\n") if line]
            elif file_format == "xlsx":
                return [decompressed]
            elif file_format == "xml":
                return [decompressed.decode("utf-8")]
            else:
                return decompressed.decode("utf-8")
        elif file_format == "json":
            return response.json()
        elif file_format == "tsv":
            return [line for line in response.text.split("\n") if line]
        elif file_format == "xlsx":
            return [response.content]
        elif file_format == "xml":
            return [response.text]
        return response.text

    def get_xml_namespace(element):
        # Return the XML namespace URI embedded in an element tag
        # ("{ns}tag"), or "" when the tag carries no namespace.
        m = re.match(r"\{(.*)\}", element.tag)
        return m.groups()[0] if m else ""

    def merge_xml_results(xml_results):
        # Merge several XML result pages into a single document by copying
        # every <entry> element into the first page's root.
        merged_root = ElementTree.fromstring(xml_results[0])
        for result in xml_results[1:]:
            root = ElementTree.fromstring(result)
            for child in root.findall("{http://uniprot.org/uniprot}entry"):
                merged_root.insert(-1, child)
        ElementTree.register_namespace("", get_xml_namespace(merged_root[0]))
        return ElementTree.tostring(merged_root, encoding="utf-8", xml_declaration=True)

    def get_id_mapping_results_search(url):
        # Download the full result set for a finished job: parse the
        # redirect URL's query string to learn format/size/compression,
        # fetch the first page, then accumulate the remaining pages.
        parsed = urlparse(url)
        query = parse_qs(parsed.query)
        file_format = query["format"][0] if "format" in query else "json"
        if "size" in query:
            size = int(query["size"][0])
        else:
            size = 500  # default page size
        query["size"] = size
        compressed = (
            query["compressed"][0].lower() == "true" if "compressed" in query else False
        )
        parsed = parsed._replace(query=urlencode(query, doseq=True))
        url = parsed.geturl()
        request = session.get(url)
        check_response(request)
        results = decode_results(request, file_format, compressed)
        # NOTE(review): `total` is computed but never used below — kept as-is.
        total = int(request.headers["x-total-results"])
        for i, batch in enumerate(get_batch(request, file_format, compressed), 1):
            results = combine_batches(results, batch, file_format)
        if file_format == "xml":
            return merge_xml_results(results)
        return results

    # --- Driver: submit the job, wait for it, collect the mapping. ---
    job_id = submit_id_mapping(
        from_db="UniProtKB_AC-ID", to_db="PDB", ids=protein_id
    )
    if check_id_mapping_results_ready(job_id):
        link = get_id_mapping_results_link(job_id)
        results = get_id_mapping_results_search(link)
        # Equivalently using the stream endpoint which is more demanding
        # on the API and so is less stable:
        # results = get_id_mapping_results_stream(link)

    return [i['to'] for i in results['results']]
|
155 |
+
def get_pdb_ids(protein_id):
    """Map a UniProt accession to its associated PDB identifiers.

    Tries the ``unipressed`` ``IdMappingClient`` first; on failure
    (HTTP error or an unexpected response shape) falls back to the
    REST-based :func:`get_pdb_ids_2`.

    Parameters
    ----------
    protein_id : str
        UniProt accession to map.

    Returns
    -------
    list
        PDB identifiers (the ``'to'`` field of each mapping record).
    """
    try:
        request = IdMappingClient.submit(
            source="UniProtKB_AC-ID", dest="PDB", ids={protein_id})
        # Give the UniProt mapping job a moment to complete before
        # fetching its results.
        time.sleep(2.0)
        pdb_list = list(request.each_result())
        return [i['to'] for i in pdb_list]
    except (requests.exceptions.HTTPError, KeyError):
        # Bug fix: the original called the fallback but never returned its
        # value, so callers silently received None. Propagate the result.
        return get_pdb_ids_2(protein_id)
|
167 |
+
|
168 |
+
|
code/alphafold_featureVector.py
ADDED
@@ -0,0 +1,579 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# IMPORT NECESSARY MODULES AND LIBRARIES
|
2 |
+
from timeit import default_timer as timer
|
3 |
+
import xml.etree.ElementTree as ET
|
4 |
+
from collections import Counter
|
5 |
+
from bs4 import BeautifulSoup
|
6 |
+
from io import StringIO
|
7 |
+
from decimal import *
|
8 |
+
import pandas as pd
|
9 |
+
import requests as r
|
10 |
+
import os.path as op
|
11 |
+
from pathlib import Path
|
12 |
+
import subprocess
|
13 |
+
import argparse
|
14 |
+
import ssbio.utils
|
15 |
+
import warnings
|
16 |
+
import sys
|
17 |
+
import pathlib
|
18 |
+
import os, glob
|
19 |
+
import math
|
20 |
+
import ssbio
|
21 |
+
import ssl
|
22 |
+
import gzip
|
23 |
+
import ast
|
24 |
+
import itertools
|
25 |
+
|
26 |
+
from Bio.Align import substitution_matrices
|
27 |
+
from Bio.PDB.Polypeptide import *
|
28 |
+
from Bio.PDB import PDBList
|
29 |
+
from Bio import Align
|
30 |
+
from Bio import SeqIO
|
31 |
+
from Bio.PDB import *
|
32 |
+
import numpy as np
|
33 |
+
|
34 |
+
|
35 |
+
|
36 |
+
|
37 |
+
# FUNCTIONS
|
38 |
+
from calc_pc_property import *
|
39 |
+
from add_domains import *
|
40 |
+
from add_annotations import *
|
41 |
+
from add_structure import *
|
42 |
+
from add_alignment import *
|
43 |
+
from manage_files import *
|
44 |
+
from add_3Dalignment import *
|
45 |
+
from add_sasa import *
|
46 |
+
from standard import *
|
47 |
+
from add_interface_pos import *
|
48 |
+
from standard import *
|
49 |
+
from uniprotSequenceMatch import uniprotSequenceMatch
|
50 |
+
from process_input import clean_data
|
51 |
+
from alphafold_model import *
|
52 |
+
|
53 |
+
|
54 |
+
def alphafold(input_set, mode, impute):
|
55 |
+
start = timer()
|
56 |
+
# Necessary lists
|
57 |
+
annotation_list = ['disulfide', 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
|
58 |
+
'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis', 'strand',
|
59 |
+
'helix', 'turn', 'metalBinding', 'repeat', 'topologicalDomain', 'caBinding', 'bindingSite',
|
60 |
+
'region',
|
61 |
+
'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', 'peptide',
|
62 |
+
'transitPeptide', 'glycosylation', 'propeptide']
|
63 |
+
|
64 |
+
change_names = {'Disulfide bond': 'disulfide', 'Initiator methionine': 'intMet',
|
65 |
+
'Natural variant': 'naturalVariant',
|
66 |
+
'DNA binding': 'dnaBinding',
|
67 |
+
'Active site': 'activeSite', 'Nucleotide binding': 'nucleotideBinding', 'Lipidation': 'lipidation',
|
68 |
+
'Site': 'site', 'Transmembrane': 'transmembrane', 'Cross-link': 'crosslink',
|
69 |
+
'Mutagenesis': 'mutagenesis', 'Beta strand': 'strand', 'Helix': 'helix', 'Turn': 'turn',
|
70 |
+
'Metal binding': 'metalBinding', 'Repeat': 'repeat',
|
71 |
+
'Topological domain': 'topologicalDomain', 'Calcium binding': 'caBinding',
|
72 |
+
'Binding site': 'bindingSite',
|
73 |
+
'Region': 'region', 'Signal peptide': 'signalPeptide', 'Modified residue': 'modifiedResidue',
|
74 |
+
'Zinc finger': 'zincFinger', 'Motif': 'motif', 'Coiled coil': 'coiledCoil', 'Peptide': 'peptide',
|
75 |
+
'Transit peptide': 'transitPeptide', 'Glycosylation': 'glycosylation', 'Propeptide': 'propeptide',
|
76 |
+
'Intramembrane': 'intramembrane'}
|
77 |
+
|
78 |
+
|
79 |
+
## Standardizing input
|
80 |
+
data = clean_data(input_set)
|
81 |
+
|
82 |
+
path_to_input_files, path_to_output_files, path_to_domains, fisher_path, path_to_interfaces, alphafold_path, alphafold_summary= manage_files(mode)
|
83 |
+
out_path = path_to_output_files / 'log.txt'
|
84 |
+
sys.stdout = open(out_path, 'w')
|
85 |
+
print('Creating directories...')
|
86 |
+
file_base = str(Path(alphafold_path / '*'))
|
87 |
+
file_str = glob.glob(file_base)[0].split('-')[-1].split('.')[0]
|
88 |
+
## Physicochemical properties
|
89 |
+
print('Adding physicochemical properties...\n')
|
90 |
+
data = add_physicochemical(data)
|
91 |
+
|
92 |
+
## Domains
|
93 |
+
print('Adding domains\n')
|
94 |
+
data = add_domains(data, path_to_domains)
|
95 |
+
|
96 |
+
## Processing data frame
|
97 |
+
data = data.astype(str)
|
98 |
+
data = data.replace({'NaN': np.NaN, 'nan': np.NaN})
|
99 |
+
data.domain = data.domain.replace({np.NaN: '-1'}) # Fill -1 if NaN - standardization.
|
100 |
+
data.domStart = data.domStart.replace({np.NaN: '-1'})
|
101 |
+
data.domEnd = data.domEnd.replace({np.NaN: '-1'})
|
102 |
+
data.distance = data.distance.replace({np.NaN: '-1'})
|
103 |
+
fisherResult = pd.read_csv(fisher_path, sep='\t')
|
104 |
+
significant_domains = fisherResult.domain.to_list()
|
105 |
+
|
106 |
+
data = data.reset_index()
|
107 |
+
data = data.drop(columns=['index'])
|
108 |
+
|
109 |
+
## not_match_in_uniprot : Data points not matched to UniProt sequence
|
110 |
+
## uniprot_matched: Data points matched to UniProt sequence. Proceed with this data frame
|
111 |
+
## canonical_fasta : Dataframe including canonical sequence for the protein of interest. Obtained from UniProt.
|
112 |
+
## isoform_fasta: Dataframe including isoform sequences for the protein of interest. Obtained from UniProt.
|
113 |
+
not_match_in_uniprot, uniprot_matched, canonical_fasta, isoform_fasta = uniprotSequenceMatch(data)
|
114 |
+
|
115 |
+
not_match_in_uniprot = not_match_in_uniprot.reset_index().drop(['index'], axis=1)
|
116 |
+
|
117 |
+
for key in change_names.keys():
|
118 |
+
not_match_in_uniprot[key] = ''
|
119 |
+
not_match_in_uniprot = not_match_in_uniprot.rename(columns=change_names)
|
120 |
+
uniprot_matched = add_annotations(uniprot_matched)
|
121 |
+
|
122 |
+
for w in uniprot_matched.index:
|
123 |
+
for q in annotation_list:
|
124 |
+
per_protein = []
|
125 |
+
if uniprot_matched.at[w, q] != 'nan':
|
126 |
+
fix = ast.literal_eval(uniprot_matched.at[w, q])
|
127 |
+
for z in fix:
|
128 |
+
if '-' in z:
|
129 |
+
per_protein += np.arange(int(z.split('-')[0]), int(z.split('-')[1])+1,1).tolist()
|
130 |
+
else:
|
131 |
+
try:
|
132 |
+
per_protein.append(int(z))
|
133 |
+
except:
|
134 |
+
ValueError
|
135 |
+
uniprot_matched.at[w, q] = per_protein
|
136 |
+
else:
|
137 |
+
uniprot_matched.at[w, q] = 'nan'
|
138 |
+
uniprot_matched = uniprot_matched.rename(columns=change_names)
|
139 |
+
uniprot_matched['wt_sequence_match'] = uniprot_matched['wt_sequence_match'].astype(str)
|
140 |
+
|
141 |
+
|
142 |
+
## Avoiding downloading files for SASA calculation if already downloaded.
|
143 |
+
|
144 |
+
existing_free_sasa = list(Path(path_to_output_files / 'freesasa_files').glob("*"))
|
145 |
+
existing_free_sasa = [str(i) for i in existing_free_sasa]
|
146 |
+
existing_free_sasa = [i.split('/')[-1].split('.')[0] for i in existing_free_sasa]
|
147 |
+
## Decide if the wild type amino acid is on canonical or isoform sequence. Selected sequence will be used for the
|
148 |
+
## sequence alignment.
|
149 |
+
for i in uniprot_matched.index:
|
150 |
+
if len(uniprot_matched.at[i, 'uniprotSequence']) >= int(uniprot_matched.at[i, 'pos']):
|
151 |
+
wt = uniprot_matched.at[i, 'wt']
|
152 |
+
can = str(uniprot_matched.at[i, 'uniprotSequence'])[int(uniprot_matched.at[i, 'pos']) - 1]
|
153 |
+
if wt == can:
|
154 |
+
uniprot_matched.at[i, 'wt_sequence_match'] = 'm'
|
155 |
+
elif wt != can:
|
156 |
+
isoList = isoform_fasta[
|
157 |
+
isoform_fasta['uniprotID'] == uniprot_matched.at[i, 'uniprotID']].isoformSequence.to_list()
|
158 |
+
for k in isoList:
|
159 |
+
if len(k) >= int(uniprot_matched.at[i, 'pos']):
|
160 |
+
resInIso = k[int(int(uniprot_matched.at[i, 'pos']) - 1)]
|
161 |
+
if wt == resInIso:
|
162 |
+
whichIsoform = isoform_fasta[isoform_fasta.isoformSequence == k].whichIsoform.to_list()[0]
|
163 |
+
uniprot_matched.at[i, 'wt_sequence_match'] = 'i'
|
164 |
+
uniprot_matched.at[i, 'whichIsoform'] = whichIsoform
|
165 |
+
break
|
166 |
+
|
167 |
+
elif len(uniprot_matched.at[i, 'uniprotSequence']) < int(uniprot_matched.at[i, 'pos']):
|
168 |
+
isoList = isoform_fasta[
|
169 |
+
isoform_fasta['uniprotID'] == uniprot_matched.at[i, 'uniprotID']].isoformSequence.to_list()
|
170 |
+
for k in isoList:
|
171 |
+
if len(k) >= int(uniprot_matched.at[i, 'pos']):
|
172 |
+
resInIso = k[int(int(uniprot_matched.at[i, 'pos']) - 1)]
|
173 |
+
wt = uniprot_matched.at[i, 'wt']
|
174 |
+
if wt == resInIso:
|
175 |
+
whichIsoform = isoform_fasta[isoform_fasta.isoformSequence == k].whichIsoform.to_list()[0]
|
176 |
+
uniprot_matched.at[i, 'wt_sequence_match'] = 'i'
|
177 |
+
uniprot_matched.at[i, 'whichIsoform'] = whichIsoform
|
178 |
+
break
|
179 |
+
|
180 |
+
|
181 |
+
|
182 |
+
uniprot_matched = uniprot_matched.replace({'nan': np.NaN})
|
183 |
+
for annot in ['Domain', 'Alternative sequence', 'Chain', 'Sequence conflict', 'Compositional bias']:
|
184 |
+
try:
|
185 |
+
uniprot_matched = uniprot_matched.drop(columns=annot)
|
186 |
+
except:
|
187 |
+
KeyError
|
188 |
+
|
189 |
+
print('You have %d data points that failed to match a UniProt Sequence\nProceeding with %d remaining...\n'
|
190 |
+
% (len(not_match_in_uniprot.drop_duplicates(['datapoint'])),
|
191 |
+
len(uniprot_matched.drop_duplicates(['datapoint']))))
|
192 |
+
|
193 |
+
## Adding interface residue information.
|
194 |
+
|
195 |
+
data_interface = pd.read_csv(path_to_interfaces, sep='\t')
|
196 |
+
interface_positions = get_interface_positions(data_interface, 'P1', 'P2')
|
197 |
+
|
198 |
+
interface_dataframe = pd.DataFrame()
|
199 |
+
for key, val in interface_positions.items():
|
200 |
+
k = pd.Series((key, str(list(set(val)))))
|
201 |
+
interface_dataframe = interface_dataframe.append(k, ignore_index=True)
|
202 |
+
interface_dataframe.columns = ['uniprotID', 'interface_positions']
|
203 |
+
|
204 |
+
uniprot_matched = uniprot_matched.merge(interface_dataframe, on='uniprotID', how='left')
|
205 |
+
uniprot_matched.interface_positions = uniprot_matched.interface_positions.astype('str')
|
206 |
+
|
207 |
+
## PDB info file is pre-generated for time concerns. Includes summary data of AlphaFold structures.
|
208 |
+
## With new updates, can be updated separately.
|
209 |
+
|
210 |
+
pdb_info = pd.read_csv(alphafold_summary, sep='\t')
|
211 |
+
|
212 |
+
## Keeping how many models each AlphaFold structure has.
|
213 |
+
model_count = modelCount(alphafold_path)
|
214 |
+
for k, v in model_count.items():
|
215 |
+
model_count[k] = int(v / 2) # two types of files for each file.
|
216 |
+
uniprot_matched = uniprot_matched.astype(str)
|
217 |
+
uniprot_matched.domStart = uniprot_matched.domStart.astype(float)
|
218 |
+
uniprot_matched.domEnd = uniprot_matched.domEnd.astype(float)
|
219 |
+
uniprot_matched.domStart = uniprot_matched.domStart.astype(int)
|
220 |
+
uniprot_matched.domEnd = uniprot_matched.domEnd.astype(int)
|
221 |
+
|
222 |
+
|
223 |
+
|
224 |
+
## Main part to add annotation information, align sequences, finding distances
|
225 |
+
|
226 |
+
for i in uniprot_matched.index:
|
227 |
+
print('Processing', i, 'of', len(uniprot_matched))
|
228 |
+
if len(uniprot_matched.at[i, 'uniprotSequence']) >= int(uniprot_matched.at[i, 'pos']):
|
229 |
+
wt = uniprot_matched.at[i, 'wt']
|
230 |
+
can = str(uniprot_matched.at[i, 'uniprotSequence'])[int(uniprot_matched.at[i, 'pos']) - 1]
|
231 |
+
## Information about whether the mutation is found on the canonical or isoform sequence.
|
232 |
+
|
233 |
+
if wt == can:
|
234 |
+
uniprot_matched.at[i, 'wt_sequence_match'] = 'm'
|
235 |
+
elif wt != can:
|
236 |
+
isoList = isoform_fasta[
|
237 |
+
isoform_fasta['uniprotID'] == uniprot_matched.at[i, 'uniprotID']].isoformSequence.to_list()
|
238 |
+
for k in isoList:
|
239 |
+
if len(k) >= int(uniprot_matched.at[i, 'pos']):
|
240 |
+
resInIso = k[int(int(uniprot_matched.at[i, 'pos']) - 1)]
|
241 |
+
if wt == resInIso:
|
242 |
+
whichIsoform = isoform_fasta[isoform_fasta.isoformSequence == k].whichIsoform.to_list()[0]
|
243 |
+
uniprot_matched.at[i, 'wt_sequence_match'] = 'i'
|
244 |
+
uniprot_matched.at[i, 'whichIsoform'] = whichIsoform
|
245 |
+
break
|
246 |
+
elif len(uniprot_matched.at[i, 'uniprotSequence']) < int(uniprot_matched.at[i, 'pos']):
|
247 |
+
isoList = isoform_fasta[
|
248 |
+
isoform_fasta['uniprotID'] == uniprot_matched.at[i, 'uniprotID']].isoformSequence.to_list()
|
249 |
+
for k in isoList:
|
250 |
+
if len(k) >= int(uniprot_matched.at[i, 'pos']):
|
251 |
+
resInIso = k[int(int(uniprot_matched.at[i, 'pos']) - 1)]
|
252 |
+
wt = uniprot_matched.at[i, 'wt']
|
253 |
+
if wt == resInIso:
|
254 |
+
whichIsoform = isoform_fasta[isoform_fasta.isoformSequence == k].whichIsoform.to_list()[0]
|
255 |
+
uniprot_matched.at[i, 'wt_sequence_match'] = 'i'
|
256 |
+
uniprot_matched.at[i, 'whichIsoform'] = whichIsoform
|
257 |
+
break
|
258 |
+
uniprotID = uniprot_matched.at[i, 'uniprotID']
|
259 |
+
datapoint = uniprot_matched.at[i, 'datapoint']
|
260 |
+
|
261 |
+
for k in annotation_list:
|
262 |
+
txt = k + 'Binary'
|
263 |
+
|
264 |
+
if (str(uniprot_matched.at[i, txt]) == '0') or (str(uniprot_matched.at[i, txt]) == '0.0'):
|
265 |
+
uniprot_matched.at[i, txt] = '1'
|
266 |
+
elif (str(uniprot_matched.at[i, txt]).lower() == 'nan') | (str(uniprot_matched.at[i, txt]) == np.NaN) :
|
267 |
+
uniprot_matched.at[i, txt] = '0'
|
268 |
+
elif (str(uniprot_matched.at[i, txt]) == '1') or (str(uniprot_matched.at[i, txt]) == '1.0'):
|
269 |
+
uniprot_matched.at[i, txt] = '2'
|
270 |
+
## Search in all models.
|
271 |
+
models_for_protein = [val for key, val in model_count.items() if
|
272 |
+
uniprotID in key.split(';')] # We have this many models for the protein.
|
273 |
+
which_model_mutation = which_model(
|
274 |
+
int(uniprot_matched.at[i, 'pos'])) # List of models in which the mutation can be found.
|
275 |
+
models_for_all_annotations = {}
|
276 |
+
for annot in annotation_list:
|
277 |
+
if len(uniprot_matched.at[i, annot]) != 0 and type(uniprot_matched.at[i, annot]) != list:
|
278 |
+
uniprot_matched.at[i, annot] = list(
|
279 |
+
map(str.strip, uniprot_matched.at[i, annot].strip('][').replace('"', '').split(',')))
|
280 |
+
models_for_annotations = {} # Recording which position is found in which model file.
|
281 |
+
for annot_position in uniprot_matched.at[i, annot]:
|
282 |
+
if annot_position != 'nan' and annot_position != '':
|
283 |
+
models_for_that_position = which_model(int(annot_position))
|
284 |
+
else:
|
285 |
+
models_for_that_position = {}
|
286 |
+
for key, val in models_for_that_position.items():
|
287 |
+
if key not in models_for_annotations.keys():
|
288 |
+
models_for_annotations[key] = [val]
|
289 |
+
else:
|
290 |
+
models_for_annotations[key] += [val]
|
291 |
+
models_for_all_annotations[annot] = models_for_annotations
|
292 |
+
new_dict = {}
|
293 |
+
for key, val in models_for_all_annotations.items():
|
294 |
+
subdict = {k: v for k, v in val.items() if k in which_model_mutation}
|
295 |
+
subdict = dict(sorted(subdict.items()))
|
296 |
+
new_dict[key] = subdict
|
297 |
+
new_dict = reduce_model_dict(new_dict)
|
298 |
+
models_we_need = list(set(itertools.chain.from_iterable(
|
299 |
+
[list(ov.keys()) for ok, ov in new_dict.items()]))) # Read models with these numbers
|
300 |
+
info_per_model = {} # her bir datapoint için baştan yazılıyor.
|
301 |
+
dist_of_annots = {}
|
302 |
+
all_domain_distances = []
|
303 |
+
|
304 |
+
for mod in models_we_need:
|
305 |
+
print('---------PRINTING FOR MODEL--------', mod)
|
306 |
+
dist_of_annots[str(mod)] = {}
|
307 |
+
info_per_model[mod] = {}
|
308 |
+
info_per_model[mod]['datapoint'] = datapoint
|
309 |
+
identifier = uniprot_matched.at[i, 'uniprotSequence']
|
310 |
+
try:
|
311 |
+
pdbSequence = pdb_info.loc[(pdb_info.uniprotID == uniprotID) & (
|
312 |
+
pdb_info.model_num == mod)].sequence.item()
|
313 |
+
except:
|
314 |
+
ValueError
|
315 |
+
pdbSequence = 'nan'
|
316 |
+
if pdbSequence != 'nan': # The number in models we need might not be present for that protein. Preventng error.
|
317 |
+
pdbSequence = pdb_info.loc[(pdb_info.uniprotID == uniprotID) & (pdb_info.model_num == mod)].sequence.item()
|
318 |
+
alignment_list = do_alignment(uniprot_matched.at[i, 'datapoint'], uniprot_matched.at[i, 'uniprotSequence'],
|
319 |
+
pdbSequence, Path(path_to_output_files / 'alignment_files'))
|
320 |
+
pdb_alignStatus = mutation_position_on_pdb(alignment_list, uniprot_matched.at[i, 'pos'])[0]
|
321 |
+
info_per_model[mod]['pdb_alignStatus'] = pdb_alignStatus
|
322 |
+
mutationPositionOnPDB = mutation_position_on_pdb(alignment_list, uniprot_matched.at[i, 'pos'])[1]
|
323 |
+
info_per_model[mod]['mutationPositionOnPDB'] = mutationPositionOnPDB
|
324 |
+
startGap = mutation_position_on_pdb(alignment_list, uniprot_matched.at[i, 'pos'])[2]
|
325 |
+
info_per_model[mod]['startGap'] = startGap
|
326 |
+
alignment_to_use = mutation_position_on_pdb(alignment_list, uniprot_matched.at[i, 'pos'])[3]
|
327 |
+
for annot in annotation_list:
|
328 |
+
if new_dict[annot] == {}:
|
329 |
+
annotation_pos_on_pdb_ = []
|
330 |
+
else:
|
331 |
+
try:
|
332 |
+
annotation_pos_on_pdb_ = annotation_pos_on_pdb(new_dict[annot][mod], startGap, alignment_to_use,
|
333 |
+
identifier)
|
334 |
+
except:
|
335 |
+
KeyError
|
336 |
+
info_per_model[mod][annot] = annotation_pos_on_pdb_
|
337 |
+
|
338 |
+
pdb_path = Path(f'{alphafold_path}/AF-{uniprotID}-F{mod}-{file_str}.pdb.gz')
|
339 |
+
|
340 |
+
if get_alignments_3D(uniprotID, mod, pdb_path, pdbSequence, 'nan', 'nan', 'nan', mode, Path(path_to_output_files / '3D_alignment'),
|
341 |
+
'gzip') != None:
|
342 |
+
|
343 |
+
alignments, coords, resnums_for_sasa = get_alignments_3D(uniprotID, mod, pdb_path, pdbSequence, 'nan',
|
344 |
+
'nan', 'nan', mode, Path(path_to_output_files / '3D_alignment'),
|
345 |
+
'gzip')
|
346 |
+
alignments = alignments[0]
|
347 |
+
|
348 |
+
calculate_freesasa(uniprotID, mod, existing_free_sasa, alphafold_path, path_to_output_files)
|
349 |
+
if (mutationPositionOnPDB != 'nan'):
|
350 |
+
if (int(mutationPositionOnPDB) <= 1400):
|
351 |
+
try:
|
352 |
+
coordMut = get_coords(mutationPositionOnPDB, alignments, coords, resnums_for_sasa, mode)[0]
|
353 |
+
except:
|
354 |
+
ValueError
|
355 |
+
coordMut = 'nan'
|
356 |
+
else:
|
357 |
+
coordMut = np.NaN
|
358 |
+
|
359 |
+
sasa_pos = get_coords(mutationPositionOnPDB, alignments, coords, resnums_for_sasa, mode)[2]
|
360 |
+
sasa_val = sasa('alphafold', 'nan', uniprotID, sasa_pos, uniprot_matched.at[i, 'wt'], mode,
|
361 |
+
path_to_output_files, file_type='gzip')
|
362 |
+
|
363 |
+
if sasa_val != None:
|
364 |
+
uniprot_matched.at[i, 'sasa'] = sasa_val
|
365 |
+
else:
|
366 |
+
coordMut = 'nan'
|
367 |
+
sasa_val = 'nan'
|
368 |
+
uniprot_matched.at[i, 'sasa'] = sasa_val
|
369 |
+
|
370 |
+
domainPositionOnPDB_list = list(
|
371 |
+
range(int(uniprot_matched.at[i, 'domStart']), int(uniprot_matched.at[i, 'domEnd'])))
|
372 |
+
domain_distances = []
|
373 |
+
if len(domainPositionOnPDB_list) != 0:
|
374 |
+
for domain_ in domainPositionOnPDB_list:
|
375 |
+
coordDomain = get_coords(domain_, alignments, coords, resnums_for_sasa, mode)[0]
|
376 |
+
distance_dom = float(find_distance(coordMut,
|
377 |
+
coordDomain)) # bu bir anotasyonun bir modeldeki bir tane pozisyonu için.
|
378 |
+
domain_distances.append(distance_dom)
|
379 |
+
minimum_domain = min(domain_distances) # minimum for one model.
|
380 |
+
else:
|
381 |
+
minimum_domain = np.NaN
|
382 |
+
all_domain_distances.append(minimum_domain)
|
383 |
+
list_dist_of_annots = []
|
384 |
+
for key, val in info_per_model.items():
|
385 |
+
modNum = key
|
386 |
+
min_annots = {} # Write from scratch for each annotation.
|
387 |
+
|
388 |
+
if modNum == mod:
|
389 |
+
for label, annotPos in val.items(): # For each annotation type, calculate all distances of the annot positions.
|
390 |
+
if label in annotation_list:
|
391 |
+
all_annot_distance_per_model = [] # All distances of an annoation in hat model
|
392 |
+
for annot_position in annotPos:
|
393 |
+
if (annot_position != 'nan'):
|
394 |
+
if (int(annot_position) <= 1400):
|
395 |
+
coordAnnot = \
|
396 |
+
get_coords(annot_position, alignments, coords, resnums_for_sasa, mode)[
|
397 |
+
0]
|
398 |
+
distance = float(find_distance(coordMut,
|
399 |
+
coordAnnot)) # bu bir anotasyonun bir modeldeki bir tane pozisyonu için.
|
400 |
+
all_annot_distance_per_model.append(distance)
|
401 |
+
if all_annot_distance_per_model != []:
|
402 |
+
all_annot_distance_per_model = [float(i) for i in all_annot_distance_per_model]
|
403 |
+
try:
|
404 |
+
minimum_position = float(min(all_annot_distance_per_model))
|
405 |
+
except:
|
406 |
+
ValueError
|
407 |
+
minimum_position = 'nan'
|
408 |
+
min_annots[label] = float(
|
409 |
+
minimum_position) # Minimum of the annotation in this model.
|
410 |
+
if min_annots != {}:
|
411 |
+
list_dist_of_annots.append(min_annots)
|
412 |
+
dist_of_annots[str(
|
413 |
+
mod)] = list_dist_of_annots # Getting minimum of all possible models
|
414 |
+
# uniprot_matched.at[i, annotation_type] = minimum_position
|
415 |
+
else:
|
416 |
+
print('Model File Not Found')
|
417 |
+
|
418 |
+
uniprot_matched.at[i, 'sasa'] = np.NaN
|
419 |
+
|
420 |
+
|
421 |
+
if len(all_domain_distances) != 0:
|
422 |
+
uniprot_matched.at[i, 'domaindistance3D'] = min(all_domain_distances)
|
423 |
+
else:
|
424 |
+
uniprot_matched.at[i, 'domaindistance3D'] = np.NaN
|
425 |
+
dist_of_annots_min_of_all = {}
|
426 |
+
flat = [item for sublist in list(dist_of_annots.values()) for item in sublist]
|
427 |
+
for f in flat:
|
428 |
+
for key, val in f.items():
|
429 |
+
if key not in dist_of_annots_min_of_all.keys():
|
430 |
+
dist_of_annots_min_of_all[key] = val
|
431 |
+
elif (key in dist_of_annots_min_of_all.keys()) & (float(dist_of_annots_min_of_all[key]) > float(val)):
|
432 |
+
dist_of_annots_min_of_all[key] = val
|
433 |
+
key_list = []
|
434 |
+
for key, val in dist_of_annots_min_of_all.items():
|
435 |
+
uniprot_matched.at[i, key] = val
|
436 |
+
key_list.append(key)
|
437 |
+
remaining = list(set(annotation_list) - set(key_list))
|
438 |
+
|
439 |
+
for rem in remaining:
|
440 |
+
uniprot_matched.at[i, rem] = ''
|
441 |
+
uniprot_matched.at[i, 'distances'] = [dist_of_annots]
|
442 |
+
|
443 |
+
if (uniprot_matched.at[i, 'sasa'] != None) & (uniprot_matched.at[i, 'sasa'] != np.NaN) & (
|
444 |
+
str(uniprot_matched.at[i, 'sasa']) != 'nan'):
|
445 |
+
if '*' in uniprot_matched.at[i, 'sasa']:
|
446 |
+
uniprot_matched.at[i, 'sasa'] = uniprot_matched.at[i, 'sasa'].split('*')[0]
|
447 |
+
try:
|
448 |
+
uniprot_matched.at[i, 'sasa'] = float(uniprot_matched.at[i, 'sasa'].strip())
|
449 |
+
except:
|
450 |
+
TypeError
|
451 |
+
|
452 |
+
if float(uniprot_matched.at[i, 'sasa']) < 5:
|
453 |
+
uniprot_matched.at[i, 'trsh4'] = 'core'
|
454 |
+
elif float(uniprot_matched.at[i, 'sasa']) >= 5:
|
455 |
+
uniprot_matched.at[i, 'trsh4'] = 'surface'
|
456 |
+
elif str(uniprot_matched.at[i, 'sasa']) == 'nan':
|
457 |
+
uniprot_matched.at[i, 'trsh4'] = 'nan'
|
458 |
+
else:
|
459 |
+
uniprot_matched.at[i, 'trsh4'] = 'nan'
|
460 |
+
if (str(uniprot_matched.at[i, 'pos']) in uniprot_matched.at[i, 'interface_positions']) and uniprot_matched.at[
|
461 |
+
i, 'trsh4'] == 'surface':
|
462 |
+
uniprot_matched.at[i, 'threeState_trsh4_HQ'] = 'interface'
|
463 |
+
elif (str(uniprot_matched.at[i, 'pos']) not in uniprot_matched.at[i, 'interface_positions']) and uniprot_matched.at[
|
464 |
+
i, 'trsh4'] == 'surface':
|
465 |
+
uniprot_matched.at[i, 'threeState_trsh4_HQ'] = 'surface'
|
466 |
+
elif (str(uniprot_matched.at[i, 'pos']) not in uniprot_matched.at[i, 'interface_positions']) and uniprot_matched.at[
|
467 |
+
i, 'trsh4'] == 'core':
|
468 |
+
uniprot_matched.at[i, 'threeState_trsh4_HQ'] = 'core'
|
469 |
+
elif (str(uniprot_matched.at[i, 'pos']) in uniprot_matched.at[i, 'interface_positions']) and uniprot_matched.at[
|
470 |
+
i, 'trsh4'] == 'core':
|
471 |
+
uniprot_matched.at[i, 'threeState_trsh4_HQ'] = 'conflict'
|
472 |
+
elif uniprot_matched.at[i, 'trsh4'] == 'nan':
|
473 |
+
uniprot_matched.at[i, 'threeState_trsh4_HQ'] = 'nan'
|
474 |
+
if uniprot_matched.at[i, 'domain'] in significant_domains:
|
475 |
+
uniprot_matched.at[i, 'domain_fisher'] = uniprot_matched.at[i, 'domain']
|
476 |
+
else:
|
477 |
+
uniprot_matched.at[i, 'domain_fisher'] = 'NULL'
|
478 |
+
uniprot_matched = uniprot_matched.round(2)
|
479 |
+
uniprot_matched = uniprot_matched.astype(str)
|
480 |
+
|
481 |
+
uniprot_matched[ 'domain'] = uniprot_matched['domain'].replace({'-1': 'NULL'})
|
482 |
+
uniprot_matched = uniprot_matched.drop_duplicates()
|
483 |
+
uniprot_matched.rename(
|
484 |
+
columns={'uniprotID': 'prot_uniprotAcc', 'wt': 'wt_residue', 'pos': 'position', 'mut': 'mut_residue',
|
485 |
+
'datapoint': 'meta_merged', 'datapoint_disease': 'meta-lab_merged', 'label': 'source_db',
|
486 |
+
'family': 'prot_family', 'domain': 'domains_all', 'domain_fisher': 'domains_sig',
|
487 |
+
'domaindistance3D': 'domains_3Ddist', 'threeState_trsh4_HQ': 'location_3state',
|
488 |
+
'disulfideBinary': 'disulfide_bin', 'intMetBinary': 'intMet_bin',
|
489 |
+
'intramembraneBinary': 'intramembrane_bin',
|
490 |
+
'naturalVariantBinary': 'naturalVariant_bin', 'dnaBindingBinary': 'dnaBinding_bin',
|
491 |
+
'activeSiteBinary': 'activeSite_bin',
|
492 |
+
'nucleotideBindingBinary': 'nucleotideBinding_bin', 'lipidationBinary': 'lipidation_bin',
|
493 |
+
'siteBinary': 'site_bin',
|
494 |
+
'transmembraneBinary': 'transmembrane_bin', 'crosslinkBinary': 'crosslink_bin',
|
495 |
+
'mutagenesisBinary': 'mutagenesis_bin',
|
496 |
+
'strandBinary': 'strand_bin', 'helixBinary': 'helix_bin', 'turnBinary': 'turn_bin',
|
497 |
+
'metalBindingBinary': 'metalBinding_bin',
|
498 |
+
'repeatBinary': 'repeat_bin', 'topologicalDomainBinary': 'topologicalDomain_bin',
|
499 |
+
'caBindingBinary': 'caBinding_bin',
|
500 |
+
'bindingSiteBinary': 'bindingSite_bin', 'regionBinary': 'region_bin',
|
501 |
+
'signalPeptideBinary': 'signalPeptide_bin',
|
502 |
+
'modifiedResidueBinary': 'modifiedResidue_bin', 'zincFingerBinary': 'zincFinger_bin',
|
503 |
+
'motifBinary': 'motif_bin',
|
504 |
+
'coiledCoilBinary': 'coiledCoil_bin', 'peptideBinary': 'peptide_bin',
|
505 |
+
'transitPeptideBinary': 'transitPeptide_bin',
|
506 |
+
'glycosylationBinary': 'glycosylation_bin', 'propeptideBinary': 'propeptide_bin',
|
507 |
+
'disulfide': 'disulfide_dist', 'intMet': 'intMet_dist',
|
508 |
+
'intramembrane': 'intramembrane_dist', 'naturalVariant': 'naturalVariant_dist',
|
509 |
+
'dnaBinding': 'dnaBinding_dist', 'activeSite': 'activeSite_dist',
|
510 |
+
'nucleotideBinding': 'nucleotideBinding_dist', 'lipidation': 'lipidation_dist', 'site': 'site_dist',
|
511 |
+
'transmembrane': 'transmembrane_dist', 'crosslink': 'crosslink_dist',
|
512 |
+
'mutagenesis': 'mutagenesis_dist', 'strand': 'strand_dist', 'helix': 'helix_dist', 'turn': 'turn_dist',
|
513 |
+
'metalBinding': 'metalBinding_dist', 'repeat': 'repeat_dist',
|
514 |
+
'topologicalDomain': 'topologicalDomain_dist', 'caBinding': 'caBinding_dist',
|
515 |
+
'bindingSite': 'bindingSite_dist', 'region': 'region_dist',
|
516 |
+
'signalPeptide': 'signalPeptide_dist', 'modifiedResidue': 'modifiedResidue_dist',
|
517 |
+
'zincFinger': 'zincFinger_dist', 'motif': 'motif_dist', 'coiledCoil': 'coiledCoil_dist',
|
518 |
+
'peptide': 'peptide_dist', 'transitPeptide': 'transitPeptide_dist',
|
519 |
+
'glycosylation': 'glycosylation_dist', 'propeptide': 'propeptide_dist'}, inplace=True)
|
520 |
+
|
521 |
+
uniprot_matched = uniprot_matched[
|
522 |
+
['prot_uniprotAcc', 'wt_residue', 'mut_residue', 'position', 'meta_merged', 'composition', 'polarity', 'volume',
|
523 |
+
'granthamScore', 'domains_all',
|
524 |
+
'domains_sig', 'domains_3Ddist', 'sasa', 'location_3state', 'disulfide_bin', 'intMet_bin',
|
525 |
+
'intramembrane_bin', 'naturalVariant_bin', 'dnaBinding_bin',
|
526 |
+
'activeSite_bin', 'nucleotideBinding_bin', 'lipidation_bin', 'site_bin',
|
527 |
+
'transmembrane_bin', 'crosslink_bin', 'mutagenesis_bin', 'strand_bin',
|
528 |
+
'helix_bin', 'turn_bin', 'metalBinding_bin', 'repeat_bin',
|
529 |
+
'caBinding_bin', 'topologicalDomain_bin', 'bindingSite_bin',
|
530 |
+
'region_bin', 'signalPeptide_bin', 'modifiedResidue_bin',
|
531 |
+
'zincFinger_bin', 'motif_bin', 'coiledCoil_bin', 'peptide_bin',
|
532 |
+
'transitPeptide_bin', 'glycosylation_bin', 'propeptide_bin', 'disulfide_dist', 'intMet_dist',
|
533 |
+
'intramembrane_dist',
|
534 |
+
'naturalVariant_dist', 'dnaBinding_dist', 'activeSite_dist',
|
535 |
+
'nucleotideBinding_dist', 'lipidation_dist', 'site_dist',
|
536 |
+
'transmembrane_dist', 'crosslink_dist', 'mutagenesis_dist',
|
537 |
+
'strand_dist', 'helix_dist', 'turn_dist', 'metalBinding_dist',
|
538 |
+
'repeat_dist', 'caBinding_dist', 'topologicalDomain_dist',
|
539 |
+
'bindingSite_dist', 'region_dist', 'signalPeptide_dist',
|
540 |
+
'modifiedResidue_dist', 'zincFinger_dist', 'motif_dist',
|
541 |
+
'coiledCoil_dist', 'peptide_dist', 'transitPeptide_dist',
|
542 |
+
'glycosylation_dist', 'propeptide_dist']]
|
543 |
+
uniprot_matched = uniprot_matched.reset_index()
|
544 |
+
uniprot_matched = uniprot_matched.drop(columns = {'index'})
|
545 |
+
# Imputation
|
546 |
+
if (impute == 'True') or (impute == 'true'):
|
547 |
+
filler = [20.71, 46.67, 28.13,15.5, 35.94, 21.84, 25.15, 45.15, 29.81, 29.91, 34.67, 24.72, 10.66,11.55, 13.02,
|
548 |
+
21.54,27.42, 38.39, 30.44, 20.9, 25.82, 46.12, 32.1, 35.96, 35.86, 37.88, 19.09, 35.2, 26.95, 37.48]
|
549 |
+
col_index = 0
|
550 |
+
|
551 |
+
for col_ in uniprot_matched.columns[-30:]:
|
552 |
+
uniprot_matched[col_] = uniprot_matched[col_].fillna(filler[col_index])
|
553 |
+
uniprot_matched[col_] = uniprot_matched[col_].replace({'nan': filler[col_index]})
|
554 |
+
uniprot_matched[col_] = uniprot_matched[col_].replace({'': filler[col_index]})
|
555 |
+
"""
|
556 |
+
if uniprot_matched[col_].values == '':
|
557 |
+
uniprot_matched[col_] = filler[col_index]
|
558 |
+
"""
|
559 |
+
col_index += 1
|
560 |
+
|
561 |
+
uniprot_matched['domains_3Ddist'] = uniprot_matched['domains_3Ddist'].fillna(29.78)
|
562 |
+
uniprot_matched['sasa'] = uniprot_matched['sasa'].fillna(35.6)
|
563 |
+
uniprot_matched['location_3state'] = uniprot_matched['location_3state'].fillna('unknown')
|
564 |
+
elif (impute == 'False') or (impute == 'false'):
|
565 |
+
pass
|
566 |
+
uniprot_matched = uniprot_matched.replace({'nan': np.NaN})
|
567 |
+
uniprot_matched = uniprot_matched.replace({'['']': np.NaN})
|
568 |
+
uniprot_matched.to_csv(path_to_output_files / 'featurevector_alphafold.txt', index=False, sep='\t')
|
569 |
+
if len(uniprot_matched) == 0:
|
570 |
+
print(
|
571 |
+
'No feature vector could be produced for input data. Please check the presence of a structure for the input proteins.')
|
572 |
+
|
573 |
+
print('Feature vector successfully created...')
|
574 |
+
end = timer()
|
575 |
+
hours, rem = divmod(end - start, 3600)
|
576 |
+
minutes, seconds = divmod(rem, 60)
|
577 |
+
print("Time passed: {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
|
578 |
+
sys.stdout.close()
|
579 |
+
return uniprot_matched
|
code/alphafold_model.py
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from collections import Counter
|
2 |
+
import glob
|
3 |
+
def reduce_model_dict(dict):
    """Drop repeated elements across the inner lists of a two-level mapping.

    For each top-level key, walks its inner mapping in insertion order and
    keeps only the first occurrence of every element seen so far within that
    top-level entry (duplicates inside a single list and across sibling lists
    are both removed). Mutates the argument in place and returns it.

    NOTE(review): the parameter shadows the builtin ``dict``; renaming it is
    deferred because a caller might pass it by keyword. Membership tests use a
    list on purpose — elements may be unhashable.
    """
    for outer_key, inner in dict.items():
        already_seen = []
        for inner_key, entries in inner.items():
            kept = []
            for entry in entries:
                if entry in already_seen:
                    continue
                kept.append(entry)
                already_seen.append(entry)
            inner[inner_key] = kept
    return dict
|
14 |
+
|
15 |
+
|
16 |
+
def which_model(position):
    """Collect the overlapping AlphaFold fragment windows containing *position*.

    AlphaFold splits long proteins into 1400-residue windows that start every
    200 residues (1-1400, 201-1600, ..., 25401-26800). Returns a dict mapping
    a running counter (1..k) to *position*, one entry per window that covers
    the position; empty dict if no window does.

    NOTE(review): the keys are a match counter, not the actual fragment (F#)
    index — e.g. position 1500 yields keys 1..7 even though its first covering
    window is fragment 2. Confirm against the caller before relying on the
    keys as model numbers.
    """
    hits = {}
    label = 1
    # Window starts 1, 201, ..., 25401 (the original zip truncated at the
    # shorter end-range, giving 128 windows ending at residue 26800).
    for start in range(1, 25402, 200):
        if start <= position <= start + 1399:
            hits[label] = position
            label += 1
    return hits
|
24 |
+
|
25 |
+
def modelCount(path_to_models):
    """Count AlphaFold model files per protein accession.

    Extracts the accession (second '-'-separated token of the file path,
    matching names like AF-<accession>-F<n>-model_v1.pdb) for every file under
    *path_to_models*, counts files per accession, then groups accessions that
    share the same count into a single ';'-joined, sorted key mapped to that
    count.

    NOTE(review): token extraction splits the whole path on '-', so a '-' in
    any directory name would break it — confirm for deployment paths.
    """
    accessions = [str(model_file).split('-')[1]
                  for model_file in path_to_models.glob("*")]
    per_protein = Counter(accessions)
    grouped = {}
    for shared_count in set(per_protein.values()):
        members = sorted(acc for acc, n in per_protein.items()
                         if n == shared_count)
        grouped[';'.join(members)] = shared_count
    return grouped
|
code/calc_pc_property.py
ADDED
@@ -0,0 +1,441 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
def compositionValues(aa1, aa2):
    """Return the signed difference (aa1 - aa2) in composition values,
    rounded to 2 decimal places.

    Raises KeyError if either one-letter code is not one of the 20 standard
    amino acids.
    """
    scale = {
        'A': 0, 'C': 2.75, 'D': 1.38, 'E': 0.92, 'F': 0, 'G': 0.74, 'H': 0.58,
        'I': 0, 'K': 0.33, 'L': 0, 'M': 0, 'N': 1.33, 'P': 0.39, 'Q': 0.89,
        'R': 0.65, 'S': 1.42, 'T': 0.71, 'V': 0, 'W': 0.13, 'Y': 0.20,
    }
    return round(scale[aa1] - scale[aa2], 2)
|
8 |
+
|
9 |
+
|
10 |
+
def polarityValues(aa1, aa2):
    """Return the signed difference (aa1 - aa2) in polarity values,
    rounded to 2 decimal places.

    Raises KeyError if either one-letter code is not one of the 20 standard
    amino acids.
    """
    scale = {
        'A': 8.1, 'C': 5.5, 'D': 13.0, 'E': 12.3, 'F': 5.2, 'G': 9.0,
        'H': 10.4, 'I': 5.2, 'K': 11.3, 'L': 4.9, 'M': 5.7, 'N': 11.6,
        'P': 8.0, 'Q': 10.5, 'R': 10.5, 'S': 9.2, 'T': 8.6, 'V': 5.9,
        'W': 5.4, 'Y': 6.2,
    }
    return round(scale[aa1] - scale[aa2], 2)
|
16 |
+
|
17 |
+
|
18 |
+
def volumeValues(aa1, aa2):
    """Return the signed difference (aa1 - aa2) in residue volume values,
    rounded to 2 decimal places.

    Raises KeyError if either one-letter code is not one of the 20 standard
    amino acids.
    """
    scale = {
        'A': 31, 'C': 55, 'D': 54, 'E': 83, 'F': 132, 'G': 3, 'H': 96,
        'I': 111, 'K': 119, 'L': 111, 'M': 105, 'N': 56, 'P': 32.5, 'Q': 85,
        'R': 124, 'S': 32, 'T': 61, 'V': 84, 'W': 170, 'Y': 136,
    }
    return round(scale[aa1] - scale[aa2], 2)
|
24 |
+
|
25 |
+
|
26 |
+
def add_physicochemical(df):
    """Annotate each row of *df* with physicochemical change scores.

    For every row, reads the wild-type ('wt') and mutant ('mut') one-letter
    residue codes and writes four columns in place:
      - 'composition', 'polarity', 'volume': signed value differences computed
        by the helper functions defined alongside in this module;
      - 'granthamScore': Grantham distance between the two residues, kept as a
        string to match the downstream feature-vector formatting.
    Rows whose residues are not standard amino acids get 'nan' in all four
    columns. Mutates *df* in place and returns it.
    """
    # Grantham (1974) distances for unordered residue pairs (upper triangle,
    # alphabetical order); the full symmetric lookup — both orders plus the
    # zero diagonal — is expanded programmatically below.
    half_matrix = {
        ('A', 'C'): 195, ('A', 'D'): 126, ('A', 'E'): 107, ('A', 'F'): 113,
        ('A', 'G'): 60, ('A', 'H'): 86, ('A', 'I'): 94, ('A', 'K'): 106,
        ('A', 'L'): 96, ('A', 'M'): 84, ('A', 'N'): 111, ('A', 'P'): 27,
        ('A', 'Q'): 91, ('A', 'R'): 112, ('A', 'S'): 99, ('A', 'T'): 58,
        ('A', 'V'): 64, ('A', 'W'): 148, ('A', 'Y'): 112,
        ('C', 'D'): 154, ('C', 'E'): 170, ('C', 'F'): 205, ('C', 'G'): 159,
        ('C', 'H'): 174, ('C', 'I'): 198, ('C', 'K'): 202, ('C', 'L'): 198,
        ('C', 'M'): 196, ('C', 'N'): 139, ('C', 'P'): 169, ('C', 'Q'): 154,
        ('C', 'R'): 180, ('C', 'S'): 112, ('C', 'T'): 149, ('C', 'V'): 192,
        ('C', 'W'): 215, ('C', 'Y'): 194,
        ('D', 'E'): 45, ('D', 'F'): 177, ('D', 'G'): 94, ('D', 'H'): 81,
        ('D', 'I'): 168, ('D', 'K'): 101, ('D', 'L'): 172, ('D', 'M'): 160,
        ('D', 'N'): 23, ('D', 'P'): 108, ('D', 'Q'): 61, ('D', 'R'): 96,
        ('D', 'S'): 65, ('D', 'T'): 85, ('D', 'V'): 152, ('D', 'W'): 181,
        ('D', 'Y'): 160,
        ('E', 'F'): 140, ('E', 'G'): 98, ('E', 'H'): 40, ('E', 'I'): 134,
        ('E', 'K'): 56, ('E', 'L'): 138, ('E', 'M'): 126, ('E', 'N'): 42,
        ('E', 'P'): 93, ('E', 'Q'): 29, ('E', 'R'): 54, ('E', 'S'): 80,
        ('E', 'T'): 65, ('E', 'V'): 121, ('E', 'W'): 152, ('E', 'Y'): 122,
        ('F', 'G'): 153, ('F', 'H'): 100, ('F', 'I'): 21, ('F', 'K'): 102,
        ('F', 'L'): 22, ('F', 'M'): 28, ('F', 'N'): 158, ('F', 'P'): 114,
        ('F', 'Q'): 116, ('F', 'R'): 97, ('F', 'S'): 155, ('F', 'T'): 103,
        ('F', 'V'): 50, ('F', 'W'): 40, ('F', 'Y'): 22,
        ('G', 'H'): 98, ('G', 'I'): 135, ('G', 'K'): 127, ('G', 'L'): 138,
        ('G', 'M'): 127, ('G', 'N'): 80, ('G', 'P'): 42, ('G', 'Q'): 87,
        ('G', 'R'): 125, ('G', 'S'): 56, ('G', 'T'): 59, ('G', 'V'): 109,
        ('G', 'W'): 184, ('G', 'Y'): 147,
        ('H', 'I'): 94, ('H', 'K'): 32, ('H', 'L'): 99, ('H', 'M'): 87,
        ('H', 'N'): 68, ('H', 'P'): 77, ('H', 'Q'): 24, ('H', 'R'): 29,
        ('H', 'S'): 89, ('H', 'T'): 47, ('H', 'V'): 84, ('H', 'W'): 115,
        ('H', 'Y'): 83,
        ('I', 'K'): 102, ('I', 'L'): 5, ('I', 'M'): 10, ('I', 'N'): 149,
        ('I', 'P'): 95, ('I', 'Q'): 109, ('I', 'R'): 97, ('I', 'S'): 142,
        ('I', 'T'): 89, ('I', 'V'): 29, ('I', 'W'): 61, ('I', 'Y'): 33,
        ('K', 'L'): 107, ('K', 'M'): 95, ('K', 'N'): 94, ('K', 'P'): 103,
        ('K', 'Q'): 53, ('K', 'R'): 26, ('K', 'S'): 121, ('K', 'T'): 78,
        ('K', 'V'): 97, ('K', 'W'): 110, ('K', 'Y'): 85,
        ('L', 'M'): 15, ('L', 'N'): 153, ('L', 'P'): 98, ('L', 'Q'): 113,
        ('L', 'R'): 102, ('L', 'S'): 145, ('L', 'T'): 92, ('L', 'V'): 32,
        ('L', 'W'): 61, ('L', 'Y'): 36,
        ('M', 'N'): 142, ('M', 'P'): 87, ('M', 'Q'): 101, ('M', 'R'): 91,
        ('M', 'S'): 135, ('M', 'T'): 81, ('M', 'V'): 21, ('M', 'W'): 67,
        ('M', 'Y'): 36,
        ('N', 'P'): 91, ('N', 'Q'): 46, ('N', 'R'): 86, ('N', 'S'): 46,
        ('N', 'T'): 65, ('N', 'V'): 133, ('N', 'W'): 174, ('N', 'Y'): 143,
        ('P', 'Q'): 76, ('P', 'R'): 103, ('P', 'S'): 74, ('P', 'T'): 38,
        ('P', 'V'): 68, ('P', 'W'): 147, ('P', 'Y'): 110,
        ('Q', 'R'): 43, ('Q', 'S'): 68, ('Q', 'T'): 42, ('Q', 'V'): 96,
        ('Q', 'W'): 130, ('Q', 'Y'): 99,
        ('R', 'S'): 110, ('R', 'T'): 71, ('R', 'V'): 96, ('R', 'W'): 101,
        ('R', 'Y'): 77,
        ('S', 'T'): 58, ('S', 'V'): 124, ('S', 'W'): 177, ('S', 'Y'): 144,
        ('T', 'V'): 69, ('T', 'W'): 128, ('T', 'Y'): 92,
        ('V', 'W'): 88, ('V', 'Y'): 55,
        ('W', 'Y'): 37,
    }
    # Expand to the full symmetric matrix with string values, matching the
    # original hand-written dict exactly.
    grantham_dict = {}
    for (a, b), dist in half_matrix.items():
        grantham_dict[(a, b)] = str(dist)
        grantham_dict[(b, a)] = str(dist)
    for aa in 'ACDEFGHIKLMNPQRSTVWY':
        grantham_dict[(aa, aa)] = '0'

    for i in df.index:
        try:
            df.at[i, 'composition'] = compositionValues(df.at[i, 'wt'], df.at[i, 'mut'])
            df.at[i, 'polarity'] = polarityValues(df.at[i, 'wt'], df.at[i, 'mut'])
            df.at[i, 'volume'] = volumeValues(df.at[i, 'wt'], df.at[i, 'mut'])
            df.at[i, 'granthamScore'] = grantham_dict[df.at[i, 'wt'], df.at[i, 'mut']]
        except KeyError:
            # The original used a bare `except:` followed by a no-op `KeyError`
            # expression statement, silently swallowing *every* exception.
            # Narrowed to KeyError (unknown residue code or missing column) so
            # unrelated errors surface instead of producing 'nan' rows.
            df.at[i, 'composition'] = 'nan'
            df.at[i, 'polarity'] = 'nan'
            df.at[i, 'volume'] = 'nan'
            df.at[i, 'granthamScore'] = 'nan'
    return df
|
code/create_swissmodelSummary.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
0 |
help='Enter the directory where meta-data is found.',
|
1 |
default=1)
|
2 |
os.makedirs('input_files/extract_swissmodel_structures/', exist_ok=True)
|
3 |
all_swissmodel = open('input_files/swissmodel_structures.txt', 'w')
|
4 |
all_swissmodel.write('UniProtKB_ac iso_id uniprot_seq_length uniprot_seq_md5 coordinate_id provider from to template qmeandisco_global seqid url')
|
5 |
all_swissmodel.write('\n')
|
6 |
for f in glob.glob(f'{meta_data}/*.tar.gz'):
|
7 |
name = f.split('/')[-1].split('.')[0]
|
8 |
with tarfile.open(f) as tar:
|
9 |
tar.extractall(f'input_files/extract_swissmodel_structures/{name}')
|
10 |
with open(f'input_files/extract_swissmodel_structures/{name}/SWISS-MODEL_Repository/INDEX') as x:
|
11 |
lines = (x.readlines())[7:]
|
12 |
for line in lines:
|
13 |
all_swissmodel.write(line)
|
14 |
shutil.rmtree('input_files/extract_swissmodel_structures/')
|
15 |
swissmodel_file()
|
|
|
1 |
+
'''
|
2 |
help='Enter the directory where meta-data is found.',
|
3 |
default=1)
|
4 |
os.makedirs('input_files/extract_swissmodel_structures/', exist_ok=True)
|
5 |
all_swissmodel = open('input_files/swissmodel_structures.txt', 'w')
|
6 |
all_swissmodel.write('UniProtKB_ac iso_id uniprot_seq_length uniprot_seq_md5 coordinate_id provider from to template qmeandisco_global seqid url')
|
7 |
all_swissmodel.write('\n')
|
8 |
for f in glob.glob(f'{meta_data}/*.tar.gz'):
|
9 |
name = f.split('/')[-1].split('.')[0]
|
10 |
with tarfile.open(f) as tar:
|
11 |
tar.extractall(f'input_files/extract_swissmodel_structures/{name}')
|
12 |
with open(f'input_files/extract_swissmodel_structures/{name}/SWISS-MODEL_Repository/INDEX') as x:
|
13 |
lines = (x.readlines())[7:]
|
14 |
for line in lines:
|
15 |
all_swissmodel.write(line)
|
16 |
shutil.rmtree('input_files/extract_swissmodel_structures/')
|
17 |
swissmodel_file()
|
code/get_alphafoldStructures.py
ADDED
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import tarfile, glob, os
|
2 |
+
from biopandas.pdb import PandasPdb
|
3 |
+
import argparse
|
4 |
+
import numpy as np
|
5 |
+
|
6 |
+
# Command-line interface: this module is intended to be run as a script.
parser = argparse.ArgumentParser(description='ASCARIS')

# Name of the AlphaFold tar archive (under input_files/) to extract.
# NOTE(review): default=1 is an int placeholder, not a valid file name;
# running without -file_name makes create_file() glob 'input_files/1',
# which matches nothing.
parser.add_argument('-file_name', '--file_name',
                    help='Enter the file tar file name to untar',
                    default=1)

# Parsing happens at import time; importing this module requires a
# parseable sys.argv.
args = parser.parse_args()

# Archive name consumed by create_file() below.
alphafold = args.file_name
|
15 |
+
|
16 |
+
def threeToOne(variant):
    """Convert a three-letter residue code to its one-letter code.

    Unrecognized codes are returned unchanged, matching the fall-through
    behaviour of the if/elif chain this replaces.

    NOTE(review): 'ASX' is mapped to 'O' here (conventionally 'B'; 'O' usually
    denotes pyrrolysine) — preserved as-is, confirm before changing.
    """
    three_to_one = {
        'ALA': 'A', 'ARG': 'R', 'VAL': 'V', 'GLU': 'E', 'PRO': 'P',
        'LEU': 'L', 'GLY': 'G', 'ASN': 'N', 'SER': 'S', 'GLN': 'Q',
        'THR': 'T', 'MET': 'M', 'LYS': 'K', 'ASP': 'D', 'ILE': 'I',
        'PHE': 'F', 'TRP': 'W', 'TYR': 'Y', 'HIS': 'H', 'CYS': 'C',
        'UNK': 'X', 'ASX': 'O',
    }
    return three_to_one.get(variant, variant)
|
62 |
+
# Unzip AlphaFold structures
|
63 |
+
|
64 |
+
def create_file():
    """Extract the AlphaFold tar archive and build the model summary table.

    Unpacks ``input_files/<alphafold>`` into
    ``input_files/alphafold_structures/``, then writes one tab-separated line
    per ``*pdb*`` model file to ``input_files/alphafold_summary.txt`` with
    columns: uniprotID, chain, CA-residue sequence, model number.

    NOTE(review): accession/model parsing splits the whole file path on '-',
    so a '-' in any directory name would break it — confirm deployment paths.
    """
    os.makedirs('input_files/alphafold_structures/', exist_ok=True)
    for archive in glob.glob(f'input_files/{alphafold}'):
        with tarfile.open(archive) as tar:
            tar.extractall('input_files/alphafold_structures/')

    # `with` guarantees the summary file is flushed and closed — the original
    # opened it and never closed it (handle leaked for the process lifetime).
    with open('input_files/alphafold_summary.txt', 'w') as summary:
        summary.write('uniprotID\tchain\tsequence\tmodel_num\n')
        for model_path in glob.glob('input_files/alphafold_structures/*pdb*'):
            atoms = PandasPdb().read_pdb(model_path).df['ATOM']
            atoms = atoms[['alt_loc', 'residue_name', 'residue_number',
                           'atom_name', 'insertion', 'chain_id']]
            # Keep one CA atom per residue: that yields the model's sequence.
            atoms = atoms[atoms.atom_name == 'CA']
            atoms['residue_name'] = atoms['residue_name'].apply(threeToOne)
            atoms['alt_loc'] = atoms['alt_loc'].replace({'': np.NaN})
            atoms = atoms.drop_duplicates(['residue_name', 'residue_number'])
            structure_residues_pdb = ''.join(atoms.residue_name.to_list())
            # Paths look like .../AF-<accession>-F<model>-model_v1.pdb.gz.
            model_no = model_path.split('-')[2].strip()[1:]
            up_name = model_path.split('-')[1].strip()
            chain_id = list(set(atoms.chain_id.to_list()))[0]
            summary.write('\t'.join((up_name, chain_id,
                                     structure_residues_pdb, model_no)) + '\n')
|
94 |
+
|
95 |
+
|
96 |
+
# Script entry point: build the AlphaFold summary file for the archive
# named on the command line.
if __name__ == '__main__':
    create_file()
|
code/main.py
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pdb_featureVector
|
2 |
+
import alphafold_featureVector
|
3 |
+
import argparse
|
4 |
+
|
5 |
+
# Command-line interface for ASCARIS feature-vector generation.
parser = argparse.ArgumentParser(description='ASCARIS')

# Structure source: 1 = PDB structures (default), 2 = AlphaFold structures.
parser.add_argument('-s', '--source_option',
                    help='Selection of input structure data.\n 1: PDB Structures (default), 2: AlphaFold Structures',
                    default=1)
# Query datapoints or a file path. NOTE(review): "idenfiers" typo is in the
# user-visible help string; left untouched here because this is runtime text.
parser.add_argument('-i', '--input_datapoint',
                    help='Input file or query datapoint\n Option 1: Comma-separated list of idenfiers (UniProt ID-wt residue-position-mutated residue (e.g. Q9Y4W6-N-432-T or Q9Y4W6-N-432-T, Q9Y4W6-N-432-T)) \n Option 2: Enter comma-separated file path')

# String flag ('True'/'False') controlling imputation of the final vector.
parser.add_argument('-impute', '--imputation_state', default='True',
                    help='Whether resulting feature vector should be imputed or not. Default True.')

# Parsing happens at import time; this module doubles as the CLI entry point.
args = parser.parse_args()

input_set = args.input_datapoint
mode = args.source_option
impute = args.imputation_state
|
21 |
+
|
22 |
+
def run_featureVector(input_set, mode, impute):
    """Dispatch feature-vector generation to the chosen structure source.

    Parameters
    ----------
    input_set : str
        Comma-separated datapoints or a path to an input file.
    mode : int or str
        1 -> PDB structures, 2 -> AlphaFold structures (coerced with int()).
    impute : str
        'True'/'False' flag forwarded to the generator.

    Raises
    ------
    ValueError
        If *mode* cannot be coerced to int or is not 1 or 2 (the original
        silently did nothing for an unrecognized mode).
    """
    print('*****************************************')
    print('Feature vector generation is in progress. \nPlease check log file for updates..')
    print('*****************************************')
    mode = int(mode)
    if mode == 1:
        pdb_featureVector.pdb(input_set, mode, impute)
    elif mode == 2:
        alphafold_featureVector.alphafold(input_set, mode, impute)
    else:
        raise ValueError(f'Invalid source_option {mode!r}: expected 1 (PDB) or 2 (AlphaFold).')
|
31 |
+
|
32 |
+
# CLI entry point: dispatch with the arguments parsed above.
if __name__ == '__main__':
    run_featureVector(input_set, mode, impute)
|
34 |
+
|
35 |
+
|
code/manage_files.py
ADDED
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
from pathlib import Path


def manage_files(mode):
    """Create the output directory tree and resolve I/O paths for a run.

    Args:
        mode: 1 for the PDB pipeline, 2 for the AlphaFold pipeline.

    Returns:
        mode 1: (path_to_input_files, path_to_output_files, path_to_domains,
                 fisher_path, path_to_interfaces, buffer)
        mode 2: (path_to_input_files, path_to_output_files, path_to_domains,
                 fisher_path, path_to_interfaces, alphafold_path,
                 alphafold_summary)

    Raises:
        ValueError: If `mode` is neither 1 nor 2 (previously returned None
            silently).
    """
    if mode == 1:
        path_to_input_files = Path('input_files')
        path_to_domains = path_to_input_files / 'domains.txt'
        fisher_path = path_to_input_files / 'significant_domains.txt'
        path_to_interfaces = path_to_input_files / 'H_sapiens_interfacesHQ.txt'

        path_to_output_files = Path('out_files/pdb')
        # Pre-create every directory the PDB pipeline writes into.
        for subdir in ('pdb_structures', 'alignment_files',
                       'swissmodel_structures', 'modbase_structures',
                       'modbase_structures_individual', 'freesasa_files',
                       '3D_alignment'):
            os.makedirs(path_to_output_files / subdir, exist_ok=True)
        buffer = path_to_output_files / 'file_buffer.txt'

        return (path_to_input_files, path_to_output_files, path_to_domains,
                fisher_path, path_to_interfaces, buffer)

    elif mode == 2:
        path_to_input_files = Path('input_files')
        path_to_domains = path_to_input_files / 'domains.txt'
        fisher_path = path_to_input_files / 'significant_domains.txt'
        alphafold_summary = path_to_input_files / 'alphafold_summary.txt'
        path_to_interfaces = path_to_input_files / 'H_sapiens_interfacesHQ.txt'
        # Unzip before using
        alphafold_path = path_to_input_files / 'alphafold_structures'

        path_to_output_files = Path('out_files/alphafold')
        os.makedirs(path_to_output_files, exist_ok=True)
        for subdir in ('freesasa_files', 'alignment_files', '3D_alignment'):
            os.makedirs(path_to_output_files / subdir, exist_ok=True)

        return (path_to_input_files, path_to_output_files, path_to_domains,
                fisher_path, path_to_interfaces, alphafold_path,
                alphafold_summary)

    raise ValueError(f'Invalid mode {mode}; expected 1 (PDB) or 2 (AlphaFold).')
code/pdb_featureVector.py
ADDED
The diff for this file is too large to render.
See raw diff
|
|
code/process_input.py
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd


def clean_data(input_set):
    """Parse mutation queries into a standardized DataFrame.

    Accepts either a comma- or tab-separated list of identifiers of the form
    'UniProtID-wt-position-mut' (e.g. 'Q9Y4W6-N-432-T'), or the path to a
    tab-separated .txt file whose columns are uniprotID, wt, pos, mut.

    Returns:
        A string-typed DataFrame with columns
        ['uniprotID', 'wt', 'pos', 'mut', 'datapoint'], restricted to the
        20 standard amino acids, or None if the input cannot be parsed
        (a message is printed in that case).
    """
    try:
        # Check for a file path FIRST: a filename such as 'my-data.txt'
        # contains '-' and would otherwise be misparsed as an identifier.
        if '.txt' in input_set:
            data = pd.read_csv(input_set, sep='\t',
                               names=['uniprotID', 'wt', 'pos', 'mut'])
            data = data[['uniprotID', 'wt', 'pos', 'mut']]
        else:
            if ',' in input_set:
                points = [i.strip() for i in input_set.split(',')]
            elif '\t' in input_set:
                points = [i.strip() for i in input_set.split('\t')]
            else:
                points = [input_set]
            # DataFrame.append was removed in pandas 2.0; build rows in bulk.
            rows = [[j.strip() for j in p.split('-')] for p in points]
            data = pd.DataFrame(rows, columns=['uniprotID', 'wt', 'pos', 'mut'])

        # Exclude termination codons, synonymous mutations and any
        # non-standard residues such as Sec, 4 or 6.
        aa_list = ['A', 'R', 'N', 'D', 'C', 'E', 'Q', 'G', 'H', 'I', 'L', 'K',
                   'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']
        data.wt = data.wt.str.strip()
        data.mut = data.mut.str.strip()
        data = data[data.wt.isin(aa_list)]
        data = data[data.mut.isin(aa_list)]

        for i in data.index:
            data.at[i, 'datapoint'] = (data.at[i, 'uniprotID'] + data.at[i, 'wt']
                                       + str(data.at[i, 'pos']) + data.at[i, 'mut'])

        data = data.astype(str)
        return data
    except (ValueError, KeyError, AttributeError, TypeError, OSError):
        # Previously a bare `except:` with a no-op `ValueError` expression;
        # catch only parsing/IO failures and report them.
        print('Please check the input format.')
code/standard.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
def standardize(df, get_columns):
    """Blank out distance-style feature columns and align column order.

    In the other pipelines these features hold 3D distances; here no
    distance is calculated, so each column is overwritten with the literal
    string 'nan'. The frame is then restricted/reordered to the columns of
    `get_columns`.

    Args:
        df: Feature-vector DataFrame to normalize (modified in place for
            the listed columns).
        get_columns: DataFrame whose column order defines the output layout.

    Returns:
        `df` reindexed to `get_columns.columns`.
    """
    undistanced = ('sasa', 'domaindistance3D', 'disulfide', 'intMet',
                   'intramembrane', 'naturalVariant', 'dnaBinding',
                   'activeSite', 'nucleotideBinding', 'lipidation', 'site',
                   'transmembrane', 'crosslink', 'mutagenesis', 'strand',
                   'helix', 'turn', 'metalBinding', 'repeat', 'caBinding',
                   'topologicalDomain', 'bindingSite', 'region',
                   'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif',
                   'coiledCoil', 'peptide', 'transitPeptide', 'glycosylation',
                   'propeptide')
    # No 3D distance is computed in this pipeline, so these are all 'nan'.
    for feature in undistanced:
        df[feature] = 'nan'
    return df[get_columns.columns]
code/uniprotSequenceMatch.py
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from add_sequence import *
import pandas as pd
import numpy as np


def uniprotSequenceMatch(data):
    """Fetch UniProt canonical and isoform sequences and merge into `data`.

    Args:
        data: DataFrame with at least a 'uniprotID' column.

    Returns:
        Tuple (not_match_in_uniprot, uniprot_matched, canonical_fasta,
        isoform_fasta): rows whose accession had no retrievable canonical
        sequence, rows that did, and the canonical / isoform sequence tables.
    """
    print('Retrieving UniProt sequences...\n')

    canonical_fasta = pd.DataFrame(columns=['uniprotID', 'uniprotSequence'])
    up_list = list(set(data['uniprotID'].to_list()))
    for i in range(len(up_list)):
        canonical_fasta.at[i, 'uniprotSequence'] = get_uniprot_seq(up_list[i])
        canonical_fasta.at[i, 'uniprotID'] = up_list[i]

    canonical_fasta = canonical_fasta.drop_duplicates()
    isoform_fasta = pd.DataFrame(columns=['uniprotID', 'isoformSequence'])
    iso_dict = []
    for i in range(len(up_list)):
        iso_dict.append(get_isoforms(up_list[i]))

    # Flatten the per-accession isoform dicts into one table.
    index = 0
    for i in iso_dict:
        for key, val in i.items():
            isoform_fasta.at[index, 'uniprotID'] = key
            isoform_fasta.at[index, 'isoformSequence'] = val
            index += 1
    isoform_fasta = isoform_fasta.drop_duplicates()

    for i in isoform_fasta.index:
        # NOTE(review): assumes isoform accessions of the form 'XXXXXX-N'
        # (6-char base ID, dash, isoform number) — [7:10] takes the number
        # after the dash; confirm against get_isoforms output.
        isoform_fasta.at[i, 'whichIsoform'] = isoform_fasta.at[i, 'uniprotID'][7:10].strip()
        isoform_fasta.at[i, 'uniprotID'] = isoform_fasta.at[i, 'uniprotID'][0:6]
    print('Sequence files created...\n')

    data = data.merge(canonical_fasta, on='uniprotID', how='left')
    # np.NaN was removed in NumPy 2.0; np.nan is the supported spelling.
    data = data.replace({'': np.nan, 'nan': np.nan})
    data['whichIsoform'] = np.nan
    data['wt_sequence_match'] = np.nan
    not_match_in_uniprot = data[data.uniprotSequence.isna()]
    uniprot_matched = data[~data.uniprotSequence.isna()]

    return not_match_in_uniprot, uniprot_matched, canonical_fasta, isoform_fasta
input_files/H_sapiens_interfacesHQ.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:90fb5f5fe31e20921290e0da588d50d2939feedac80767cdd3b46225ce849b8d
|
3 |
+
size 19252152
|
input_files/alphafold_structures/AF-A0A0A0MRZ7-F1-model_v1.cif.gz
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c5a22037a2ae883cc095f647170271d6a69f38de045206e99c4ac5586658ccb3
|
3 |
+
size 26598
|
input_files/alphafold_structures/AF-A0A0A0MRZ7-F1-model_v1.pdb.gz
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:93e034885f400396df77e65944c65e8d22000f011343a98d8f7727b97b378860
|
3 |
+
size 18469
|
input_files/alphafold_structures/AF-A0A0A0MRZ8-F1-model_v1.cif.gz
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:367a7e9d82ad6a452f643eed923237ed149cc3cf1dabef23304d4e4f5711a191
|
3 |
+
size 25647
|
input_files/alphafold_structures/AF-A0A0A0MRZ8-F1-model_v1.pdb.gz
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:449fa624948266313cdf18a365e11036b6eaa5502395ed88b58f1841ebf70e60
|
3 |
+
size 17763
|
input_files/alphafold_structures/AF-A0A0A0MRZ9-F1-model_v1.cif.gz
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:35de071f52a5644df10d8181b5c6034b04734895e155b68d3e3f5133e98f3ef6
|
3 |
+
size 27026
|
input_files/alphafold_structures/AF-A0A0A0MRZ9-F1-model_v1.pdb.gz
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0a509714d54bdf9b9ad7a9bcdccc4122e256cec371fb04e251f68e2e67ade17a
|
3 |
+
size 18748
|
input_files/alphafold_structures/AF-A0A0A0MS00-F1-model_v1.cif.gz
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d6b9e658af67a6b4ca14f5c960c4629140eb78588c46cfe1fab3bbe2c1c7d17e
|
3 |
+
size 25157
|
input_files/alphafold_structures/AF-A0A0A0MS00-F1-model_v1.pdb.gz
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b840d9a1c9de25dd6484ad2675f26e578e883c277d4e332247cb1f45a7706ffb
|
3 |
+
size 17329
|
input_files/alphafold_structures/AF-A0A0A0MS01-F1-model_v1.cif.gz
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e9077d070c0fea099e5afdc10d4c599367064518be2412088e8f7f2213156f91
|
3 |
+
size 26786
|
input_files/alphafold_structures/AF-A0A0A0MS01-F1-model_v1.pdb.gz
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1abd18dc11f67b8b3a3dd8b30c4a74fec7fefec62c601153401ca5c550c96dbd
|
3 |
+
size 18678
|
input_files/alphafold_structures/AF-A0A0A0MS02-F1-model_v1.cif.gz
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:db309cbaaf7d073230b4ab1a98ecc8213c6cfebfe87cc4f6f3990944feef7059
|
3 |
+
size 26727
|