# homorepeat_app.py (HI_SCBL): Streamlit app for protein homorepeat analysis.
# Origin: Hugging Face file view, author Jayesh13,
# commit 5e885f1 ("Create homorepeat_app.py"), 3.91 kB.
import streamlit as st
import pandas as pd
import xlsxwriter
from io import BytesIO
from collections import defaultdict
# Function to find repeated amino acids in the protein sequence
def find_homorepeats(protein):
    """Count runs of identical consecutive residues in a protein sequence.

    The sequence is scanned left to right and split into maximal runs of the
    same character. Every run longer than one character is tallied under the
    run text itself, so ``"AA"`` and ``"AAA"`` are counted as distinct repeats.

    Args:
        protein: Sequence string to scan.

    Returns:
        A ``defaultdict(int)`` mapping each repeat string (e.g. ``"QQQ"``) to
        the number of separate runs of exactly that residue and length.
    """
    counts = defaultdict(int)
    total = len(protein)
    start = 0
    while start < total:
        residue = protein[start]
        # Advance `stop` to the first position that no longer matches.
        stop = start + 1
        while stop < total and protein[stop] == residue:
            stop += 1
        run_length = stop - start
        # Single residues are not repeats.
        if run_length > 1:
            counts[residue * run_length] += 1
        start = stop
    return counts
# Function to process a single CSV file and return its analysis
def process_csv(file):
    """Read a protein CSV and count homorepeats in every sequence.

    The first three columns are taken by position as entry ID, protein name
    and sequence. Column header names are not inspected and any further
    columns are ignored.

    Args:
        file: Path or file-like object accepted by ``pd.read_csv``.

    Returns:
        A ``(homorepeats, sequence_data)`` tuple. ``homorepeats`` is the set
        of distinct repeat strings found in the file; ``sequence_data`` is a
        list of ``(entry_id, protein_name, freq)`` tuples, where ``freq`` is
        the mapping returned by ``find_homorepeats`` for that sequence.
        If the file has fewer than three columns, an error is shown with
        ``st.error`` and ``(None, None)`` is returned, so callers that unpack
        two values do not fail.
    """
    df = pd.read_csv(file)
    if len(df.columns) < 3:
        st.error("Error: The file must have at least three columns: ID, Protein Name, Sequence")
        # Two-element result keeps `a, b = process_csv(...)` safe.
        return None, None

    homorepeats = set()
    sequence_data = []
    # Select columns strictly by position: integer keys such as row[0] on a
    # label-indexed Series rely on deprecated positional fallback. Iterating
    # plain tuples also avoids the per-row dtype upcasting of iterrows().
    first_three = df.iloc[:, :3]
    for raw_id, raw_name, raw_sequence in first_three.itertuples(index=False, name=None):
        # Strip stray quotes and spaces left over from the CSV export.
        sequence = str(raw_sequence).replace('"', '').replace(' ', '')
        freq = find_homorepeats(sequence)
        homorepeats.update(freq)  # Collect unique homorepeats
        sequence_data.append((str(raw_id), str(raw_name), freq))
    return homorepeats, sequence_data
# Function to generate and download Excel workbook
def create_excel(sequences_data, homorepeats):
    """Build an in-memory Excel workbook of homorepeat counts.

    The sheet has one row per sequence. The first two columns hold the entry
    ID and protein name, followed by one column per repeat in sorted order
    containing how often that repeat occurs in the sequence (0 if absent).

    Args:
        sequences_data: Iterable of ``(entry_id, protein_name, freq)`` tuples,
            where ``freq`` maps repeat strings to counts.
        homorepeats: Collection of all repeat strings to use as columns.

    Returns:
        A ``BytesIO`` holding the finished ``.xlsx`` file, positioned at the
        start so it can be read or passed to a download widget directly.
    """
    output = BytesIO()
    workbook = xlsxwriter.Workbook(
        output,
        {
            'in_memory': True,
            # Cell text originates from uploaded files: keep values such as
            # "=..." or "http://..." as literal text instead of letting
            # write() turn them into formulas or hyperlinks.
            'strings_to_formulas': False,
            'strings_to_urls': False,
        },
    )
    worksheet = workbook.add_worksheet()

    # Sort once so the header and every data row share the same column order.
    repeat_columns = sorted(homorepeats)

    # Write the header
    worksheet.write(0, 0, "Entry ID")
    worksheet.write(0, 1, "Protein Name")
    for col, repeat in enumerate(repeat_columns, start=2):
        worksheet.write(0, col, repeat)

    # Write data for each sequence
    for row, (entry_id, protein_name, freq) in enumerate(sequences_data, start=1):
        worksheet.write(row, 0, entry_id)
        worksheet.write(row, 1, protein_name)
        for col, repeat in enumerate(repeat_columns, start=2):
            worksheet.write(row, col, freq.get(repeat, 0))

    workbook.close()
    output.seek(0)
    return output
# Streamlit UI components
# Streamlit page: upload CSV files, analyse them, offer the results.
st.title("Protein Homorepeat Analysis")

# Step 1: Upload CSV Files
uploaded_files = st.file_uploader("Upload CSV files", accept_multiple_files=True, type=["csv"])

# Step 2: Process files and display results
if uploaded_files:
    all_homorepeats = set()
    all_sequences_data = []
    processed_count = 0  # files that were actually analysed

    for file in uploaded_files:
        result = process_csv(file)
        # A rejected file yields None (or a tuple of Nones); skip it instead
        # of failing on the unpack so the remaining files are still handled.
        if result is None:
            continue
        homorepeats, sequence_data = result
        if homorepeats is None:
            continue
        all_homorepeats.update(homorepeats)
        all_sequences_data.extend(sequence_data)
        processed_count += 1

    if all_sequences_data:
        # Report only the files that passed validation.
        st.success(f"Processed {processed_count} files successfully!")

        # Step 3: Generate and download the Excel report
        excel_file = create_excel(all_sequences_data, all_homorepeats)

        # Download the Excel file
        st.download_button(
            label="Download Excel file",
            data=excel_file,
            file_name="protein_homorepeat_results.xlsx",
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        )

        # Step 4: Display summary table
        if st.checkbox("Show Results Table"):
            # Sort once; every row uses the same repeat column order.
            repeat_columns = sorted(all_homorepeats)
            # Convert the sequences data into a DataFrame for easy display
            rows = []
            for entry_id, protein_name, freq in all_sequences_data:
                row = {"Entry ID": entry_id, "Protein Name": protein_name}
                row.update({repeat: freq.get(repeat, 0) for repeat in repeat_columns})
                rows.append(row)
            result_df = pd.DataFrame(rows)
            st.dataframe(result_df)