import streamlit as st
import pandas as pd
import xlsxwriter
from io import BytesIO
from collections import defaultdict

# Function to find repeated amino acids in the protein sequence
def find_homorepeats(protein):
    """Count the maximal runs of identical residues in ``protein``.

    The sequence is scanned left to right and split into maximal runs of one
    repeated character. Runs of length 1 are ignored; every longer run is
    tallied under its full text (for example ``"AAA"``).

    Returns:
        A ``defaultdict(int)`` mapping each run string to the number of times
        that exact run occurs in the sequence.
    """
    counts = defaultdict(int)
    total = len(protein)
    start = 0

    while start < total:
        residue = protein[start]

        # Advance `end` to the first position holding a different residue.
        end = start + 1
        while end < total and protein[end] == residue:
            end += 1

        run_length = end - start
        if run_length >= 2:
            counts[residue * run_length] += 1

        # The next run begins where this one stopped.
        start = end

    return counts

# Function to process a single CSV file and return its analysis
def process_csv(file):
    """Read one CSV file and count the homorepeats in each of its sequences.

    The first three columns are taken, by position, as entry ID, protein name
    and sequence; any further columns are ignored. Double quotes and spaces
    are stripped from each sequence before it is analysed.

    Args:
        file: A path or file-like object accepted by ``pd.read_csv``.

    Returns:
        ``None`` if the file has fewer than three columns (an error is shown
        through ``st.error``). Otherwise a tuple
        ``(homorepeats, sequence_data)`` where ``homorepeats`` is the set of
        every distinct repeat found in the file and ``sequence_data`` is a
        list of ``(entry_id, protein_name, freq)`` tuples, one per row, with
        ``freq`` being the mapping returned by ``find_homorepeats``.
    """
    df = pd.read_csv(file)
    if len(df.columns) < 3:
        st.error("Error: The file must have at least three columns: ID, Protein Name, Sequence")
        return None

    homorepeats = set()
    sequence_data = []

    # Pick the columns positionally with .iloc. Indexing a row Series with a
    # bare integer (row[0]) is resolved against the column *labels*; it only
    # worked through a deprecated positional fallback and fails once pandas
    # treats integer keys strictly as labels.
    first_three = df.iloc[:, :3]
    for entry_id, protein_name, raw_sequence in first_three.itertuples(index=False, name=None):
        sequence = str(raw_sequence).replace('"', '').replace(' ', '')
        freq = find_homorepeats(sequence)
        homorepeats.update(freq)  # Collect unique homorepeats
        sequence_data.append((str(entry_id), str(protein_name), freq))

    return homorepeats, sequence_data

# Function to generate the Excel workbook as an in-memory file
def create_excel(sequences_data, homorepeats):
    """Build an Excel workbook of homorepeat counts and return it in memory.

    The single worksheet has a header row ("Entry ID", "Protein Name", then
    one column per homorepeat in sorted order) followed by one row per
    sequence. A repeat that does not occur in a sequence is written as 0.

    Args:
        sequences_data: Iterable of ``(entry_id, protein_name, freq)`` tuples,
            where ``freq`` maps a repeat string to its count.
        homorepeats: Iterable of the repeat strings to use as columns.

    Returns:
        A ``BytesIO`` holding the finished ``.xlsx`` file, positioned at the
        start so it can be read immediately.
    """
    output = BytesIO()
    workbook = xlsxwriter.Workbook(output, {'in_memory': True})
    worksheet = workbook.add_worksheet()

    # Sort once: the header and every data row must share the same column
    # order, and re-sorting for each row is wasted work.
    repeat_columns = sorted(homorepeats)

    # Write the header
    worksheet.write_row(0, 0, ["Entry ID", "Protein Name", *repeat_columns])

    # Write data for each sequence, starting on the row below the header
    for row, (entry_id, protein_name, freq) in enumerate(sequences_data, start=1):
        counts = [freq.get(repeat, 0) for repeat in repeat_columns]
        worksheet.write_row(row, 0, [entry_id, protein_name, *counts])

    # close() is what actually serialises the workbook into `output`.
    workbook.close()
    output.seek(0)
    return output

# Streamlit UI components
st.title("Protein Homorepeat Analysis")

# Step 1: Upload CSV Files
uploaded_files = st.file_uploader("Upload CSV files", accept_multiple_files=True, type=["csv"])

# Step 2: Process files and display results
if uploaded_files:
    all_homorepeats = set()
    all_sequences_data = []
    processed_count = 0

    for file in uploaded_files:
        # process_csv returns None (after reporting the problem itself) for a
        # file it rejects. Check before unpacking: unpacking None directly
        # raises TypeError and aborts the whole run.
        result = process_csv(file)
        if result is None:
            continue

        homorepeats, sequence_data = result
        all_homorepeats.update(homorepeats)
        all_sequences_data.extend(sequence_data)
        processed_count += 1

    if all_sequences_data:
        # Report only the files that were actually processed, not every
        # uploaded file.
        st.success(f"Processed {processed_count} files successfully!")

        # Step 3: Generate and download the Excel report
        excel_file = create_excel(all_sequences_data, all_homorepeats)

        # Download the Excel file
        st.download_button(
            label="Download Excel file",
            data=excel_file,
            file_name="protein_homorepeat_results.xlsx",
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        )

        # Step 4: Display summary table
        if st.checkbox("Show Results Table"):
            # Sort once so every row uses the same column order.
            repeat_columns = sorted(all_homorepeats)

            # Convert the sequences data into a DataFrame for easy display
            rows = [
                {
                    "Entry ID": entry_id,
                    "Protein Name": protein_name,
                    **{repeat: freq.get(repeat, 0) for repeat in repeat_columns},
                }
                for entry_id, protein_name, freq in all_sequences_data
            ]

            result_df = pd.DataFrame(rows)
            st.dataframe(result_df)