from collections import defaultdict
from io import BytesIO
from itertools import groupby

import pandas as pd
import streamlit as st
import xlsxwriter


def find_homorepeats(protein):
    """Count homorepeats (runs of one repeated residue) in a sequence.

    A homorepeat is a maximal run of the same character with length > 1.
    Runs are keyed by their literal text, so "AA" and "AAA" are counted
    separately.

    Args:
        protein: The sequence string to scan.

    Returns:
        A ``defaultdict(int)`` mapping each repeat string (e.g. ``"QQQ"``)
        to the number of times a run of exactly that text occurs.
    """
    freq = defaultdict(int)
    # groupby yields one group per maximal run of equal consecutive characters.
    for residue, run in groupby(protein):
        run_length = sum(1 for _ in run)
        # Only consider repeats of length > 1
        if run_length > 1:
            freq[residue * run_length] += 1
    return freq


def process_csv(file):
    """Read one CSV file and analyse the homorepeats of every sequence in it.

    The first three columns are read by position as entry ID, protein name
    and sequence; any further columns are ignored. Double quotes and spaces
    are stripped from the sequence before analysis.

    Args:
        file: A path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        A ``(homorepeats, sequence_data)`` tuple where ``homorepeats`` is the
        set of distinct repeat strings found in the file and
        ``sequence_data`` is a list of ``(entry_id, protein_name, freq)``
        tuples, one per row. If the file cannot be parsed or has fewer than
        three columns, an error is shown with ``st.error`` and
        ``(None, None)`` is returned so callers can always unpack the result.
    """
    try:
        df = pd.read_csv(file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
        file_label = getattr(file, "name", "uploaded file")
        st.error(f"Error: Could not read {file_label}: {exc}")
        return None, None

    if len(df.columns) < 3:
        st.error("Error: The file must have at least three columns: ID, Protein Name, Sequence")
        return None, None

    homorepeats = set()
    sequence_data = []
    # Select the first three columns by position: the header names are not
    # known, and integer keys on a label-indexed row are deprecated in pandas.
    first_three = df.iloc[:, :3]
    for entry_id, protein_name, raw_sequence in first_three.itertuples(index=False, name=None):
        sequence = str(raw_sequence).replace('"', '').replace(' ', '')
        freq = find_homorepeats(sequence)
        homorepeats.update(freq)  # Collect unique homorepeats
        sequence_data.append((str(entry_id), str(protein_name), freq))

    return homorepeats, sequence_data


def create_excel(sequences_data, homorepeats):
    """Build an in-memory Excel workbook of per-sequence homorepeat counts.

    The sheet has one header row ("Entry ID", "Protein Name", then one column
    per repeat in sorted order) followed by one row per sequence. Repeats a
    sequence does not contain are written as 0.

    Args:
        sequences_data: Iterable of ``(entry_id, protein_name, freq)`` tuples,
            where ``freq`` maps repeat strings to counts.
        homorepeats: Collection of all repeat strings to use as columns.

    Returns:
        A ``BytesIO`` containing the .xlsx file, positioned at the start.
    """
    output = BytesIO()
    # Sort once so the header and every data row share the same column order.
    repeat_columns = sorted(homorepeats)

    # The context manager closes (and thereby writes) the workbook even if a
    # row fails to serialise.
    with xlsxwriter.Workbook(output, {'in_memory': True}) as workbook:
        worksheet = workbook.add_worksheet()

        # Write the header
        worksheet.write_row(0, 0, ["Entry ID", "Protein Name", *repeat_columns])

        # Write data for each sequence
        for row, (entry_id, protein_name, freq) in enumerate(sequences_data, start=1):
            counts = [freq.get(repeat, 0) for repeat in repeat_columns]
            worksheet.write_row(row, 0, [entry_id, protein_name, *counts])

    output.seek(0)
    return output


# Streamlit UI components
st.title("Protein Homorepeat Analysis")

# Step 1: Upload CSV files
uploaded_files = st.file_uploader("Upload CSV files", accept_multiple_files=True, type=["csv"])

# Step 2: Process files and display results
if uploaded_files:
    all_homorepeats = set()
    all_sequences_data = []
    processed_count = 0

    for file in uploaded_files:
        result = process_csv(file)
        # process_csv reports its own error and signals failure with None
        # (or a tuple whose first element is None); skip such files instead
        # of crashing on the unpack.
        if result is None:
            continue
        homorepeats, sequence_data = result
        if homorepeats is None:
            continue
        all_homorepeats.update(homorepeats)
        all_sequences_data.extend(sequence_data)
        processed_count += 1

    if all_sequences_data:
        # Report only the files that were actually processed, not every upload.
        st.success(f"Processed {processed_count} files successfully!")

        # Step 3: Generate and download the Excel report
        excel_file = create_excel(all_sequences_data, all_homorepeats)

        # Download the Excel file
        st.download_button(
            label="Download Excel file",
            data=excel_file,
            file_name="protein_homorepeat_results.xlsx",
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        )

        # Step 4: Display summary table
        if st.checkbox("Show Results Table"):
            # Sort once so every row uses the same column order.
            repeat_columns = sorted(all_homorepeats)
            # Convert the sequences data into a DataFrame for easy display
            rows = [
                {
                    "Entry ID": entry_id,
                    "Protein Name": protein_name,
                    **{repeat: freq.get(repeat, 0) for repeat in repeat_columns},
                }
                for entry_id, protein_name, freq in all_sequences_data
            ]
            result_df = pd.DataFrame(rows)
            st.dataframe(result_df)