Erva Ulusoy committed on
Commit
c712316
1 Parent(s): 968c130

initialize space

Browse files
data/finalized_domain2go_mappings.txt ADDED
The diff for this file is too large to render. See raw diff
 
domain2go_app.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import requests
3
+ from io import StringIO
4
+ from Bio import SeqIO
5
+ import os
6
+ import time
7
+ import pandas as pd
8
+
9
+ from run_domain2go_app import *
10
+
11
+
12
def convert_df(df):
    """Serialize a DataFrame to UTF-8 encoded CSV bytes, omitting the index.

    The bytes are what the page hands to ``st.download_button`` as file data.
    """
    csv_text = df.to_csv(index=False)
    return csv_text.encode('utf-8')
14
+
15
+
16
# Streamlit page for Domain2GO.
#
# Layout: the sidebar collects the query (sequence or FASTA file, plus the
# e-mail passed to InterProScan); the "Domains" tab shows the InterProScan
# domain hits; the "Function predictions" tab shows the Domain2GO predictions
# derived from those hits. Results are kept in ``st.session_state`` so they
# survive Streamlit's script reruns.
st.markdown("""
<div style="background-color:#f9f9f9;padding:10px">
<p style="color:#b22d2a;font-size:15px;">Disclaimer</p>
<p style="color:#000000;font-size:14px;">This program is designed to generate predictions for a single protein due to the extended runtime of InterProScan. If you need predictions for multiple UniProtKB/Swiss-Prot proteins, we recommend utilizing our comprehensive protein function prediction dataset available in our <a href="https://github.com/HUBioDataLab/Domain2GO">Github repository</a>.</p>
</div>
""", unsafe_allow_html=True)


domain_tab, pred_tab = st.tabs(['Domains', 'Function predictions'])

with domain_tab:
    st.header('Domains in sequence')

with st.sidebar:

    st.title("Domain2GO: Mutual Annotation-Based Prediction of Protein Domain Functions")
    st.write("[![arXiv](https://img.shields.io/badge/bioRxiv-2022.11.03.514980-b31b1b.svg)](https://www.biorxiv.org/content/10.1101/2022.11.03.514980v1) [![github-repository](https://img.shields.io/badge/GitHub-black?logo=github)](https://github.com/HUBioDataLab/Domain2GO)")

    if 'example_seq_button' not in st.session_state:
        st.session_state.example_seq_button = False

    def click_button():
        """Toggle whether the sequence box is pre-filled with the example."""
        st.session_state.example_seq_button = not st.session_state.example_seq_button

    # Defaults so later code never hits an unbound name when the upload mode
    # is selected but no file (or an empty FASTA file) has been provided.
    name = ''
    sequence = ''

    input_type = st.radio('Select input type', ['Enter sequence', 'Upload FASTA file'])
    if input_type == 'Enter sequence':
        if st.session_state.example_seq_button:
            example_value = (
                '>sp|O18783|PLMN_NOTEU\n'
                'MEYGKVIFLFLLFLKSGQGESLENYIKTEGASLSNSQKKQFVASSTEECEALCEKETEFVCRSFEHYNKEQKCVIMSENSKTSSVERKRDVVLFEKRIYLSDCKSGNGRNYRGTLSKTKSGITCQKWSDLSPHVPNYAPSKYPDAGLEKNYCRNPDDDVKGPWCYTTNPDIRYEYCDVPECEDECMHCSGENYRGTISKTESGIECQPWDSQEPHSHEYIPSKFPSKDLKENYCRNPDGEPRPWCFTSNPEKRWEFCNIPRCSSPPPPPGPMLQCLKGRGENYRGKIAVTKSGHTCQRWNKQTPHKHNRTPENFPCRGLDENYCRNPDGELEPWCYTTNPDVRQEYCAIPSCGTSSPHTDRVEQSPVIQECYEGKGENYRGTTSTTISGKKCQAWSSMTPHQHKKTPDNFPNADLIRNYCRNPDGDKSPWCYTMDPTVRWEFCNLEKCSGTGSTVLNAQTTRVPSVDTTSHPESDCMYGSGKDYRGKRSTTVTGTLCQAWTAQEPHRHTIFTPDTYPRAGLEENYCRNPDGDPNGPWCYTTNPKKLFDYCDIPQCVSPSSFDCGKPRVEPQKCPGRIVGGCYAQPHSWPWQISLRTRFGEHFCGGTLIAPQWVLTAAHCLERSQWPGAYKVILGLHREVNPESYSQEIGVSRLFKGPLAADIALLKLNRPAAINDKVIPACLPSQDFMVPDRTLCHVTGWGDTQGTSPRGLLKQASLPVIDNRVCNRHEYLNGRVKSTELCAGHLVGRGDSCQGDSGGPLICFEDDKYVLQGVTSWGLGCARPNKPGVYVRVSRYISWIEDVMKNN'
            )
        else:
            example_value = ''
        # A multi-line text area is required in both cases: FASTA input has a
        # header line followed by the sequence, which a single-line text
        # input cannot hold.
        sequence = st.text_area('Enter protein sequence in FASTA format.', value=example_value)
        name = sequence.split('\n')[0].strip('>')
        st.button('Use example sequence', on_click=click_button)
    else:
        protein_input = st.file_uploader('Choose file')
        if protein_input:
            protein_input_stringio = StringIO(protein_input.getvalue().decode("utf-8"))
            fasta_sequences = SeqIO.parse(protein_input_stringio, 'fasta')
            # When the file holds several records, the last one wins.
            for fasta in fasta_sequences:
                name, sequence = fasta.id, str(fasta.seq)

    email = st.text_input('Enter your email for InterProScan query: ')

    # prevent user from clicking 'Find domains' button if email or sequence is empty
    domains_submitted = False
    if st.button('Find domains'):
        if email and sequence:
            domains_submitted = True
            st.session_state.disabled = True
        else:
            st.warning('Please enter your email and protein sequence first.')
    else:
        with domain_tab:
            st.warning('Please enter your query and click "Find domains" to see domains in sequence.')

with domain_tab:
    no_domains = False
    error_in_interproscan = False
    if domains_submitted:
        # Drop results of any previous query so a failed or empty search
        # cannot be displayed next to stale tables.
        for stale_key in ('domain_df', 'pred_df', 'query_name'):
            st.session_state.pop(stale_key, None)

        with st.spinner('Finding domains in sequence using InterProScan. This may take a while...'):
            result = find_domains(email, sequence, name)
        result_text = result[0]
        if result_text == 'Domains found.':
            st.success(result_text + ' You can now see function predictions for the sequence in the "Function predictions" tab.')
            st.session_state['domain_df'] = result[1]
            # Remember which protein the stored results belong to.
            st.session_state['query_name'] = name
        elif result_text == 'No domains found.':
            st.warning(result_text)
            no_domains = True
        else:
            st.error(result_text)
            st.write(f'InterProScan job id: {result[1]}')
            st.write(f'InterProScan job response: {result[2]}')
            error_in_interproscan = True

    # Name used for download files: the protein the stored results were
    # computed for, falling back to the current input.
    result_name = st.session_state.get('query_name', name)

    if 'domain_df' in st.session_state:
        with st.expander('Show domains in sequence'):
            st.write(st.session_state.domain_df)
            domains_csv = convert_df(st.session_state.domain_df)
            st.download_button(
                label="Download domains in sequence as CSV",
                data=domains_csv,
                file_name=f"{result_name}_domains.csv",
                mime="text/csv",
            )

with pred_tab:
    st.header('Function predictions')
    if 'domain_df' not in st.session_state:
        if no_domains:
            st.warning('No domains found. Please find domains in sequence first.')
        elif error_in_interproscan:
            st.error('Error in InterProScan. Please check InterProScan job id and response.')
        else:
            st.warning('Please find domains in sequence first.')
    else:
        with st.spinner('Generating function predictions...'):
            # Prefer the data directory shipped next to this script; fall back
            # to the path derived from a "Domain2GO" component of the cwd.
            app_dir = os.path.dirname(os.path.abspath(__file__))
            mapping_path = os.path.join(app_dir, 'data')
            if not os.path.isfile(os.path.join(mapping_path, 'finalized_domain2go_mappings.txt')):
                cwd = os.getcwd()
                mapping_path = "{}Domain2GO/data".format(cwd.split("Domain2GO")[0])
            pred_results = generate_function_predictions(st.session_state.domain_df, mapping_path)
        pred_result_text = pred_results[0]
        if pred_result_text == 'Function predictions found.':
            st.success(pred_result_text)
            st.session_state['pred_df'] = pred_results[1]
        elif pred_result_text == 'No function predictions found.':
            st.warning(pred_result_text)
            # Do not keep showing predictions from an earlier query.
            st.session_state.pop('pred_df', None)

    if 'pred_df' in st.session_state:
        with st.expander('Show function predictions'):
            st.write(st.session_state.pred_df)
            pred_csv = convert_df(st.session_state.pred_df)
            st.download_button(
                label="Download function predictions as CSV",
                data=pred_csv,
                file_name=f"{result_name}_function_predictions.csv",
                mime="text/csv",
            )
134
+
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ # This file may be used to create an environment using:
2
+ # $ conda create --name <env> --file <this file>
3
+
4
+ pandas==1.3.2
5
+ dask==2022.2.1
6
+ numpy==1.20.3
7
+ scipy==1.7.1
8
+ requests==2.31.0
9
+ biopython==1.81
run_domain2go_app.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from io import StringIO
3
+ from Bio import SeqIO
4
+ import os
5
+ import time
6
+ import pandas as pd
7
+
8
def find_domains(email, sequence, name):
    """Submit a protein sequence to the EBI InterProScan 5 REST service and
    collect its domain hits.

    Args:
        email: E-mail address sent with the job submission.
        sequence: Protein sequence text sent as the ``sequence`` form field.
        name: Protein name written to the ``protein_name`` column of the
            result table.

    Returns:
        A list whose first item is a status message:
          * ``['Domains found.', domains_df]`` - ``domains_df`` has the columns
            ``protein_name``, ``accession``, ``name`` and ``locations``
            (``start-end`` ranges joined with ``;``).
          * ``['No domains found.']`` - the job finished but no match had an
            entry of type ``DOMAIN``.
          * ``['InterProScan job failed', job_id, response_text]`` - the
            submission was rejected or no result was available after all
            polling attempts.
    """
    # send request to interproscan api
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'Accept': 'text/plain',
    }

    data = {
        'email': email,
        'stype': 'p',
        'sequence': f'{sequence}',
    }

    job_id_response = requests.post('https://www.ebi.ac.uk/Tools/services/rest/iprscan5/run', headers=headers, data=data)
    job_id = job_id_response.text

    # A rejected submission returns an error body instead of a job id; polling
    # for it would only burn the whole retry budget before failing anyway.
    if not job_id_response.ok:
        return ['InterProScan job failed', job_id, job_id_response.text]

    # get results
    headers = {
        'Accept': 'application/json',
    }

    job_result_url = f'https://www.ebi.ac.uk/Tools/services/rest/iprscan5/result/{job_id}/json'

    json_output = None
    max_attempts = 10
    with requests.Session() as s:
        # poll up to max_attempts times, one minute apart
        for attempt in range(max_attempts):
            job_result_response = s.get(job_result_url, headers=headers)
            if job_result_response.status_code == 200:
                json_output = job_result_response.json()['results'][0]
                print('InterProScan job done')
                break
            # no point in waiting after the final attempt
            if attempt < max_attempts - 1:
                time.sleep(60)

    if json_output is None:
        result_text = 'InterProScan job failed'
        return [result_text, job_id, job_result_response.text]

    # Gather every location of each DOMAIN entry, keyed by accession. The
    # locations stay lists until all matches are processed so that repeated
    # hits for the same accession are merged and de-duplicated correctly.
    entries = dict()
    for elem in json_output['matches']:
        entry = elem['signature']['entry']
        if not isinstance(entry, dict) or entry['type'] != 'DOMAIN':
            continue

        location_list = [f"{i['start']}-{i['end']}" for i in elem['locations']]
        record = entries.setdefault(
            entry['accession'],
            {'name': entry['name'], 'locations': []},
        )
        record['locations'].extend(location_list)

    for record in entries.values():
        # dict.fromkeys drops duplicates while keeping first-seen order, so
        # the output is deterministic.
        record['locations'] = ';'.join(dict.fromkeys(record['locations']))

    if entries:
        result_text = 'Domains found.'

        # create domains dataframe
        domains_df = pd.DataFrame.from_dict(entries, orient='index').reset_index()
        domains_df['protein_name'] = name
        domains_df = domains_df[['protein_name', 'index', 'name', 'locations']]
        domains_df.columns = ['protein_name', 'accession', 'name', 'locations']
        return [result_text, domains_df]

    result_text = 'No domains found.'
    return [result_text]
90
+
91
+ # generate protein function predictions based on domain2go mappings
92
+
93
def generate_function_predictions(domains_df, mapping_path):
    """Join the domains found in a protein with the Domain2GO mappings.

    Args:
        domains_df: DataFrame with at least the columns ``accession``,
            ``name`` and ``locations``.
        mapping_path: Directory containing ``finalized_domain2go_mappings.txt``,
            a CSV file that must provide the columns ``Interpro``, ``GO``
            and ``s``.

    Returns:
        ``['Function predictions found.', predictions_df]`` when at least one
        domain accession matches a mapping row; ``predictions_df`` has the
        columns ``domain_accession``, ``domain_name``, ``domain_locations``,
        ``GO_id`` and ``probability``. Otherwise
        ``['No function predictions found.']``.
    """
    # read domain2go mappings
    domain2go_df = pd.read_csv(os.path.join(mapping_path, 'finalized_domain2go_mappings.txt'))
    print('Domain2GO mappings loaded')

    # merge domain2go mappings with domains found in protein sequence
    merged_df = pd.merge(domains_df, domain2go_df, left_on='accession', right_on='Interpro')

    print('Function predictions generated.')

    # nothing matched: report it without building a result table
    if merged_df.empty:
        result_text = 'No function predictions found.'
        return [result_text]

    # keep only the reported columns and give them their public names
    predictions_df = merged_df[['accession', 'name', 'locations', 'GO', 's']].rename(
        columns={
            'accession': 'domain_accession',
            'name': 'domain_name',
            'locations': 'domain_locations',
            'GO': 'GO_id',
            's': 'probability',
        }
    )

    result_text = 'Function predictions found.'
    return [result_text, predictions_df]