File size: 7,008 Bytes
c712316
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bf55a9d
c712316
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b83c412
c712316
 
 
b83c412
 
c712316
 
 
 
 
 
 
b83c412
c712316
b83c412
c712316
 
eac5d9d
c712316
77b1fab
c712316
77b1fab
c712316
77b1fab
c712316
 
 
b83c412
c712316
 
 
 
 
b83c412
c712316
 
effe2a1
c712316
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b83c412
c712316
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c18585d
 
c712316
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b83c412
c712316
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import streamlit as st
import requests
from io import StringIO
from Bio import SeqIO
import os
import time
import pandas as pd

from run_domain2go_app import *


def convert_df(df):
    """Serialize a DataFrame to CSV (without the index column) as UTF-8 bytes.

    The bytes are what the download buttons in this script pass as ``data``.
    """
    csv_text = df.to_csv(index=False)
    return csv_text.encode('utf-8')


# Disclaimer banner rendered at the top of the page. It is raw HTML, so it
# has to be passed to st.markdown with unsafe_allow_html=True.
_DISCLAIMER_HTML = """
<div style="background-color:#f0f2f6;padding:10px">
<p style="color:#b22d2a;font-size:15px;">Disclaimer</p>
<p style="color:#000000;font-size:14px;">This program is designed to generate predictions for a single protein due to the extended runtime of InterProScan. If you need predictions for multiple UniProtKB/Swiss-Prot proteins, we recommend utilizing our comprehensive protein function prediction dataset available in our <a href="https://github.com/HUBioDataLab/Domain2GO">Github repository</a>.</p>
</div>
"""
st.markdown(_DISCLAIMER_HTML, unsafe_allow_html=True)



# Two result panes: one for the domains found in the query sequence and one
# for the function predictions derived from them.
domain_tab, pred_tab = st.tabs(['Domains', 'Function predictions'])

# Write the heading straight into the tab container instead of opening a
# `with` block for a single element.
domain_tab.header('Domains in sequence')

with st.sidebar:
    # Sidebar: collects the query (protein sequence + e-mail address) and
    # sets `domains_submitted` when the user asks for a domain search with
    # valid-looking input. `domains_submitted` is read by the code below.

    st.title("Domain2GO: Mutual Annotation-Based Prediction of Protein Domain Functions")
    st.write("[![arXiv](https://img.shields.io/badge/bioRxiv-2022.11.03.514980-b31b1b.svg)](https://www.biorxiv.org/content/10.1101/2022.11.03.514980v1) [![github-repository](https://img.shields.io/badge/GitHub-black?logo=github)](https://github.com/HUBioDataLab/Domain2GO)")

    if 'example_seq_button' not in st.session_state:
        st.session_state.example_seq_button = False

    def click_button():
        """Toggle whether the sequence box is pre-filled with the example protein."""
        st.session_state.example_seq_button = not st.session_state.example_seq_button

    input_type = st.radio('Select input type', ['Enter sequence', 'Upload FASTA file'])
    if input_type == 'Enter sequence':
        if st.session_state.example_seq_button:
            st.session_state['sequence'] = st.text_area('Enter protein sequence in FASTA format.', 
                value='>sp|O18783|PLMN_NOTEU\n'
                'MEYGKVIFLFLLFLKSGQGESLENYIKTEGASLSNSQKKQFVASSTEECEALCEKETEFVCRSFEHYNKEQKCVIMSENSKTSSVERKRDVVLFEKRIYLSDCKSGNGRNYRGTLSKTKSGITCQKWSDLSPHVPNYAPSKYPDAGLEKNYCRNPDDDVKGPWCYTTNPDIRYEYCDVPECEDECMHCSGENYRGTISKTESGIECQPWDSQEPHSHEYIPSKFPSKDLKENYCRNPDGEPRPWCFTSNPEKRWEFCNIPRCSSPPPPPGPMLQCLKGRGENYRGKIAVTKSGHTCQRWNKQTPHKHNRTPENFPCRGLDENYCRNPDGELEPWCYTTNPDVRQEYCAIPSCGTSSPHTDRVEQSPVIQECYEGKGENYRGTTSTTISGKKCQAWSSMTPHQHKKTPDNFPNADLIRNYCRNPDGDKSPWCYTMDPTVRWEFCNLEKCSGTGSTVLNAQTTRVPSVDTTSHPESDCMYGSGKDYRGKRSTTVTGTLCQAWTAQEPHRHTIFTPDTYPRAGLEENYCRNPDGDPNGPWCYTTNPKKLFDYCDIPQCVSPSSFDCGKPRVEPQKCPGRIVGGCYAQPHSWPWQISLRTRFGEHFCGGTLIAPQWVLTAAHCLERSQWPGAYKVILGLHREVNPESYSQEIGVSRLFKGPLAADIALLKLNRPAAINDKVIPACLPSQDFMVPDRTLCHVTGWGDTQGTSPRGLLKQASLPVIDNRVCNRHEYLNGRVKSTELCAGHLVGRGDSCQGDSGGPLICFEDDKYVLQGVTSWGLGCARPNKPGVYVRVSRYISWIEDVMKNN')
        else:
            # FASTA input is multi-line (header line, then sequence), so a
            # multi-line text area is required; a single-line text input
            # cannot hold the newline that the name extraction below splits on.
            st.session_state['sequence'] = st.text_area('Enter protein sequence in FASTA format.')
        # The record name is the first line of the input without the '>' marker.
        st.session_state['name'] = st.session_state['sequence'].split('\n')[0].strip('>')
        st.button('Use example sequence', on_click=click_button)
    else:
        protein_input = st.file_uploader('Choose file')
        # Forget any sequence left over from a previous input so that
        # "Find domains" cannot silently submit it when no valid file is
        # uploaded. The stored name is kept: the download buttons of already
        # displayed results still use it.
        if 'sequence' in st.session_state:
            del st.session_state['sequence']
        if protein_input:
            protein_input_stringio = StringIO(protein_input.getvalue().decode("utf-8"))
            fasta_sequences = SeqIO.parse(protein_input_stringio, 'fasta')
            # If the file holds several records, the last one is the one kept.
            for fasta in fasta_sequences:
                st.session_state['name'], st.session_state['sequence'] = fasta.id, str(fasta.seq)

    st.session_state['email'] = st.text_input('Enter your email for InterProScan query: ')

    # Only accept the submission when a non-empty sequence and a plausible
    # e-mail address are present. Checking for key presence alone is not
    # enough, because the widgets above store empty strings.
    domains_submitted = False
    if st.button('Find domains'):
        entered_email = (st.session_state.get('email') or '').strip()
        entered_sequence = (st.session_state.get('sequence') or '').strip()
        if entered_sequence and '@' in entered_email:
            domains_submitted = True
            st.session_state.disabled = True
        else:
            st.warning('Please enter your email and protein sequence first. If you have already entered your email and protein sequence, please check that your email is valid.')
    else:
        with domain_tab:
            st.warning('Please enter your query and click "Find domains" to see domains in sequence.')

with domain_tab:
    # Runs the InterProScan lookup for a freshly submitted query and shows
    # the domain table kept in session state. `no_domains` and
    # `error_in_interproscan` are read by the predictions tab below.
    no_domains = False
    error_in_interproscan = False
    if domains_submitted:
        # A new query invalidates results cached from an earlier one. Without
        # this, a query that finds no domains (or fails) would still display
        # and offer for download the previous protein's results.
        for stale_key in ('domain_df', 'pred_df'):
            if stale_key in st.session_state:
                del st.session_state[stale_key]

        with st.spinner('Finding domains in sequence using InterProScan. This may take a while...'):
            result = find_domains(st.session_state.email, st.session_state.sequence, st.session_state.name)
        # The first element of the result is a status message; the remaining
        # elements depend on the outcome (see the branches below).
        result_text = result[0]
        if result_text == 'Domains found.':
            st.success(result_text + ' You can now see function predictions for the sequence in the "Function predictions" tab.')
            st.session_state['domain_df'] = result[1]
        elif result_text == 'No domains found.':
            st.warning(result_text)
            no_domains = True
        else:
            # Any other status is treated as an InterProScan failure and the
            # job details are shown to help diagnose it.
            st.error(result_text)
            st.write(f'InterProScan job id: {result[1]}')
            st.write(f'InterProScan job response: {result[2]}')
            error_in_interproscan = True


    if 'domain_df' in st.session_state:
        with st.expander('Show domains in sequence'):
            st.write(st.session_state.domain_df)
            domains_csv = convert_df(st.session_state.domain_df)
            st.download_button(
                label="Download domains in sequence as CSV",
                data=domains_csv,
                file_name=f"{st.session_state.name}_domains.csv",
                mime="text/csv",
            )

with pred_tab:
    # Generates and displays function predictions for the domains stored in
    # session state; when there are none, explains why.
    st.header('Function predictions')
    if 'domain_df' not in st.session_state:
        if no_domains:
            st.warning('No domains found. Please find domains in sequence first.')
        elif error_in_interproscan:
            st.error('Error in InterProScan. Please check InterProScan job id and response.')
        else:
            st.warning('Please find domains in sequence first.')
    else:
        with st.spinner('Generating function predictions...'):
            # Directory handed to generate_function_predictions as the
            # location of the mapping data, relative to the working directory.
            mapping_path = './data'
            pred_results = generate_function_predictions(st.session_state.domain_df, mapping_path)
            # The first element of the result is a status message.
            pred_result_text = pred_results[0]
            if pred_result_text == 'Function predictions found.':
                st.success(pred_result_text)
                st.session_state['pred_df'] = pred_results[1]
            else:
                # No usable predictions for the current domains: drop any
                # table kept from an earlier run so it is not shown as if it
                # belonged to this query.
                if 'pred_df' in st.session_state:
                    del st.session_state['pred_df']
                if pred_result_text == 'No function predictions found.':
                    st.warning(pred_result_text)
                else:
                    # Surface unrecognised statuses instead of dropping them.
                    st.error(pred_result_text)

        if 'pred_df' in st.session_state:
            with st.expander('Show function predictions'):
                st.write(st.session_state.pred_df)
                pred_csv = convert_df(st.session_state.pred_df)
                st.download_button(
                    label="Download function predictions as CSV",
                    data=pred_csv,
                    file_name=f"{st.session_state.name}_function_predictions.csv",
                    mime="text/csv",
                )