Spaces:

Jayesh13
/

HI_SCBL

Runtime error

App Files Files Community

HI_SCBL / homorepeat_app.py

Jayesh13

Create homorepeat_app.py

5e885f1 verified 5 months ago

raw

history blame

3.91 kB

	import streamlit as st
	import pandas as pd
	import xlsxwriter
	from io import BytesIO
	from collections import defaultdict

	# Function to find repeated amino acids in the protein sequence
	def find_homorepeats(protein):
	    """Count the homorepeats found in a protein sequence.

	    A homorepeat is a maximal run of two or more identical consecutive
	    characters. Runs are keyed by their full text, so ``"AA"`` and
	    ``"AAA"`` are counted as different homorepeats.

	    Args:
	        protein: Sequence string to scan.

	    Returns:
	        A ``defaultdict(int)`` mapping each run's text to the number of
	        runs with exactly that text. Empty when there are no repeats.
	    """
	    freq = defaultdict(int)
	    n = len(protein)
	    i = 0

	    while i < n:
	        residue = protein[i]
	        start = i
	        # Advance to the first position that no longer matches the residue.
	        while i < n and protein[i] == residue:
	            i += 1

	        # Only consider repeats of length > 1.
	        run_length = i - start
	        if run_length > 1:
	            freq[residue * run_length] += 1

	    return freq

	# Function to process a single CSV file and return its analysis
	def process_csv(file):
	    """Read one uploaded CSV file and analyse the homorepeats of each row.

	    The first three columns are taken, by position, as entry ID, protein
	    name and sequence; any further columns are ignored. Double quotes and
	    spaces are stripped from the sequence before it is analysed.

	    Args:
	        file: Path or file-like object accepted by ``pd.read_csv``.

	    Returns:
	        A ``(homorepeats, sequence_data)`` tuple where ``homorepeats`` is
	        the set of distinct homorepeat strings found in the file and
	        ``sequence_data`` is a list of ``(entry_id, protein_name, freq)``
	        tuples, one per row. When the file cannot be used, the problem is
	        reported with ``st.error`` and ``(None, None)`` is returned so
	        that callers which unpack the result do not crash.
	    """
	    try:
	        df = pd.read_csv(file)
	    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
	        st.error(f"Error: The file could not be read as CSV: {exc}")
	        return None, None

	    if len(df.columns) < 3:
	        st.error("Error: The file must have at least three columns: ID, Protein Name, Sequence")
	        return None, None

	    homorepeats = set()
	    sequence_data = []
	    # Select columns by position so the lookup does not depend on the
	    # header labels present in the file.
	    first_three = df.iloc[:, :3]
	    for entry_id, protein_name, raw_sequence in first_three.itertuples(index=False, name=None):
	        sequence = str(raw_sequence).replace('"', '').replace(' ', '')
	        freq = find_homorepeats(sequence)
	        homorepeats.update(freq)  # Collect unique homorepeats
	        sequence_data.append((str(entry_id), str(protein_name), freq))

	    return homorepeats, sequence_data

	# Function to generate and download Excel workbook
	def create_excel(sequences_data, homorepeats):
	    """Build an in-memory Excel workbook of homorepeat counts.

	    The sheet has one header row ("Entry ID", "Protein Name", then one
	    column per homorepeat in sorted order) followed by one row per
	    sequence. A homorepeat that a sequence does not contain is written
	    as 0.

	    Args:
	        sequences_data: Iterable of ``(entry_id, protein_name, freq)``
	            tuples, where ``freq`` maps homorepeat strings to counts.
	        homorepeats: Collection of homorepeat strings to use as columns.

	    Returns:
	        A ``BytesIO`` holding the finished workbook, positioned at the start.
	    """
	    output = BytesIO()
	    workbook = xlsxwriter.Workbook(output, {'in_memory': True})
	    worksheet = workbook.add_worksheet()

	    # Sort once so the header and every data row share one column order.
	    repeat_columns = sorted(homorepeats)

	    # Write the header
	    worksheet.write_row(0, 0, ["Entry ID", "Protein Name", *repeat_columns])

	    # Write data for each sequence, starting below the header row
	    for row_index, (entry_id, protein_name, freq) in enumerate(sequences_data, start=1):
	        counts = [freq.get(repeat, 0) for repeat in repeat_columns]
	        worksheet.write_row(row_index, 0, [entry_id, protein_name, *counts])

	    workbook.close()
	    output.seek(0)
	    return output

	# Streamlit UI components
	st.title("Protein Homorepeat Analysis")

	# Step 1: Upload CSV Files
	uploaded_files = st.file_uploader("Upload CSV files", accept_multiple_files=True, type=["csv"])

	# Step 2: Process files and display results
	if uploaded_files:
	    all_homorepeats = set()
	    all_sequences_data = []
	    processed_count = 0

	    for file in uploaded_files:
	        result = process_csv(file)
	        # A rejected file may come back as a bare None or as a tuple whose
	        # first item is None; skip it either way instead of failing to unpack.
	        if result is None:
	            continue
	        homorepeats, sequence_data = result
	        if homorepeats is None:
	            continue
	        all_homorepeats.update(homorepeats)
	        all_sequences_data.extend(sequence_data)
	        processed_count += 1

	    if all_sequences_data:
	        # Report only the files that were actually processed.
	        st.success(f"Processed {processed_count} files successfully!")

	        # Step 3: Generate and download the Excel report
	        excel_file = create_excel(all_sequences_data, all_homorepeats)

	        # Download the Excel file
	        st.download_button(
	            label="Download Excel file",
	            data=excel_file,
	            file_name="protein_homorepeat_results.xlsx",
	            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
	        )

	        # Step 4: Display summary table
	        if st.checkbox("Show Results Table"):
	            # Sort once so every row uses the same column order.
	            repeat_columns = sorted(all_homorepeats)

	            # Convert the sequences data into a DataFrame for easy display
	            rows = []
	            for entry_id, protein_name, freq in all_sequences_data:
	                row = {"Entry ID": entry_id, "Protein Name": protein_name}
	                row.update({repeat: freq.get(repeat, 0) for repeat in repeat_columns})
	                rows.append(row)

	            result_df = pd.DataFrame(rows)
	            st.dataframe(result_df)