File size: 2,594 Bytes
d5ba624
3b2ed25
 
f659c90
 
 
3b2ed25
 
 
 
 
d5ba624
3b2ed25
 
0cfb879
 
 
 
ca02cf1
 
 
 
 
 
 
 
 
0cfb879
3b2ed25
d5ba624
0f05186
d5ba624
 
 
 
 
 
0f05186
ca02cf1
 
 
0f05186
1e0d468
 
 
ca02cf1
 
 
 
 
 
 
 
 
 
 
 
 
3b2ed25
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import streamlit as st
import requests

def get_genome_from_ncbi(accession_number, timeout=30):
    """Download the FASTA text for one nucleotide record from NCBI.

    Args:
        accession_number: Identifier passed to NCBI as the ``id`` query
            parameter of the ``nuccore`` database viewer.
        timeout: Seconds to wait for NCBI before giving up. Without a
            timeout ``requests`` may block forever on a stalled connection.

    Returns:
        The response body as text when NCBI answers with HTTP 200,
        otherwise ``None`` (a Streamlit warning is shown in that case).
    """
    url = "https://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi"
    # Let requests build the query string so the accession number is
    # URL-encoded instead of being pasted verbatim into the URL.
    params = {
        "db": "nuccore",
        "id": accession_number,
        "report": "fasta",
        "retmode": "text",
    }

    try:
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException:
        # Connection errors and timeouts are reported the same way as a bad
        # status code so one failing record does not abort the whole batch.
        st.warning(f"Failed to retrieve genome for {accession_number}. Please check the accession number.")
        return None

    if response.status_code == 200:
        return response.text

    st.warning(f"Failed to retrieve genome for {accession_number}. Please check the accession number.")
    return None

def extract_sequences(genome_data, start_length=55, end_length=114):
    """Return the leading and trailing bases of the first FASTA record.

    Header lines are recognised by their ``>`` prefix rather than by
    position, so leading blank lines do not leak the header into the
    sequence. Only the first record is read; parsing stops at the next
    ``>`` line. Surrounding whitespace on each line is discarded.

    Args:
        genome_data: FASTA-formatted text.
        start_length: Number of bases to take from the start (default 55).
        end_length: Number of bases to take from the end (default 114).

    Returns:
        A ``(start_sequence, end_sequence)`` tuple of strings. Each is
        shorter than requested when the sequence itself is shorter.

    Raises:
        ValueError: If ``start_length`` or ``end_length`` is negative.
    """
    if start_length < 0 or end_length < 0:
        raise ValueError("start_length and end_length must be non-negative")

    sequence_parts = []
    seen_header = False
    for raw_line in genome_data.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        if line.startswith(">"):
            if seen_header or sequence_parts:
                # A second header marks the next record; stop here.
                break
            seen_header = True
            continue
        sequence_parts.append(line)

    sequence = "".join(sequence_parts)

    start_sequence = sequence[:start_length]
    # sequence[-0:] would return the whole string, so handle zero explicitly.
    end_sequence = sequence[-end_length:] if end_length else ""

    return start_sequence, end_sequence

def format_fasta(accession_number, sequence):
    """Build a two-line FASTA entry: a ``>`` header line, then the sequence."""
    header_line = f">{accession_number}"
    return f"{header_line}\n{sequence}"

def main():
    """Run the Streamlit page.

    The user uploads a text file with one accession number per line. Each
    record is fetched via ``get_genome_from_ncbi``, its start and end
    sequences are extracted via ``extract_sequences``, and two download
    buttons offer the collected results as FASTA-style text files.
    """
    st.title("NCBI Genome Sequence Extractor")

    # File uploader for the user to upload the input file
    uploaded_file = st.file_uploader("Upload a text file containing accession numbers", type="txt")
    if uploaded_file is None:
        return

    try:
        # "utf-8-sig" drops a leading byte-order mark, which would otherwise
        # become part of the first accession number.
        file_text = uploaded_file.read().decode("utf-8-sig")
    except UnicodeDecodeError:
        st.error("The uploaded file is not valid UTF-8 text.")
        return

    # Ignore blank lines and surrounding whitespace so they are not sent to
    # NCBI as (invalid) accession numbers.
    accession_numbers = [line.strip() for line in file_text.splitlines() if line.strip()]
    if not accession_numbers:
        st.warning("The uploaded file does not contain any accession numbers.")
        return

    # Prepare lists to store formatted sequences
    starting_sequences = []
    ending_sequences = []

    for accession_number in accession_numbers:
        genome_data = get_genome_from_ncbi(accession_number)
        if not genome_data:
            # Nothing usable came back for this record; move on to the next.
            continue
        start_sequence, end_sequence = extract_sequences(genome_data)
        starting_sequences.append(format_fasta(accession_number, start_sequence))
        ending_sequences.append(format_fasta(accession_number, end_sequence))

    if not starting_sequences:
        # Avoid offering empty files for download.
        st.warning("No sequences could be retrieved.")
        return

    # Offer the starting sequences for download
    start_sequences_text = "\n".join(starting_sequences)
    st.download_button("Download Starting Sequences", start_sequences_text, file_name="starting_sequences.txt", mime="text/plain")

    # Offer the ending sequences for download
    end_sequences_text = "\n".join(ending_sequences)
    st.download_button("Download Ending Sequences", end_sequences_text, file_name="ending_sequences.txt", mime="text/plain")

# Start the app only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()