Spaces:

Jayesh13
/

HI_SCBL

Runtime error

App Files Files Community

HI_SCBL / app.py

Jayesh13

Update app.py

ef8ec9c verified 6 months ago

raw

history blame

4.76 kB

	# Install required dependencies if not present.
	# The check matters: without it pip would be launched on every execution
	# of this script, even when all three packages are already installed.
	import os
	import importlib.util
	import subprocess
	import sys

	_missing_packages = [
	    package
	    for package in ("streamlit", "pandas", "xlsxwriter")
	    if importlib.util.find_spec(package) is None
	]
	if _missing_packages:
	    # Invoke pip through the running interpreter so the packages are
	    # installed into the same environment that will import them.
	    # check=False keeps the original best-effort behaviour: a failed
	    # install surfaces later as an ImportError on the imports below.
	    subprocess.run(
	        [sys.executable, "-m", "pip", "install", *_missing_packages],
	        check=False,
	    )

	import streamlit as st
	import pandas as pd
	import xlsxwriter
	from io import BytesIO
	from collections import defaultdict



	# Function to find repeated amino acids in the protein sequence
	def find_homorepeats(protein):
	    """Count the homorepeats (runs of one repeated character) in a sequence.

	    A homorepeat is a maximal run of two or more identical consecutive
	    characters. Each run is keyed by its full text, so ``"AA"`` and
	    ``"AAA"`` are counted under separate keys.

	    Args:
	        protein: The sequence to scan, as a string.

	    Returns:
	        A ``defaultdict(int)`` mapping each repeat string to the number of
	        runs with exactly that text found in ``protein``. Empty when the
	        sequence contains no run longer than one character.
	    """
	    freq = defaultdict(int)
	    n = len(protein)
	    start = 0

	    while start < n:
	        # Move `end` to one past the last character of the current run.
	        end = start + 1
	        while end < n and protein[end] == protein[start]:
	            end += 1

	        # Only consider repeats of length > 1
	        if end - start > 1:
	            # Take the whole run with a single slice instead of growing a
	            # string one character at a time.
	            freq[protein[start:end]] += 1

	        start = end

	    return freq

	# Function to process a single CSV file and return its analysis
	def process_csv(file):
	    """Read one CSV of protein sequences and count homorepeats per entry.

	    The first three columns are taken, by position, as entry ID, protein
	    name and sequence; any further columns are ignored. Double quotes and
	    spaces are removed from each sequence before it is analysed.

	    Args:
	        file: A path or file-like object accepted by ``pandas.read_csv``.

	    Returns:
	        A pair ``(homorepeats, sequence_data)``. ``homorepeats`` is the set
	        of distinct repeat strings found anywhere in the file and
	        ``sequence_data`` is a list of ``(entry_id, protein_name, freq)``
	        tuples, one per CSV row, where ``freq`` is the mapping returned by
	        ``find_homorepeats``. If the file is empty, cannot be parsed, or
	        has fewer than three columns, the problem is reported with
	        ``st.error`` and ``(None, None)`` is returned.
	    """
	    try:
	        df = pd.read_csv(file)
	    except (pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
	        st.error(f"Error: The file could not be read as CSV: {exc}")
	        return None, None

	    if len(df.columns) < 3:
	        st.error("Error: The file must have at least three columns: ID, Protein Name, Sequence")
	        # Return a pair (not a bare None) so callers that unpack the
	        # result into two names do not crash on a rejected file.
	        return None, None

	    homorepeats = set()
	    sequence_data = []
	    for _, row in df.iterrows():
	        # Use .iloc for positional access: the row is indexed by column
	        # label, so plain row[0] would be a label lookup.
	        entry_id = str(row.iloc[0])
	        protein_name = str(row.iloc[1])
	        sequence = str(row.iloc[2]).replace('"', '').replace(' ', '')

	        freq = find_homorepeats(sequence)
	        homorepeats.update(freq.keys())  # Collect unique homorepeats
	        sequence_data.append((entry_id, protein_name, freq))

	    return homorepeats, sequence_data

	# Function to generate and download Excel workbook with file names as separators
	def create_excel(sequences_data, homorepeats, filenames):
	    """Build an in-memory Excel workbook of homorepeat counts, grouped by file.

	    For every file the single worksheet receives, in order: a
	    ``File: <name>`` row, a header row (``Entry ID``, ``Protein Name``,
	    then one column per repeat in sorted order), one row per sequence
	    with its count for each repeat (0 when absent), and one blank row.

	    Args:
	        sequences_data: One list per file of
	            ``(entry_id, protein_name, freq)`` tuples, where ``freq`` maps
	            repeat strings to counts.
	        homorepeats: Collection of all repeat strings to use as columns.
	        filenames: File names, indexed in parallel with ``sequences_data``.

	    Returns:
	        A ``BytesIO`` holding the finished workbook, rewound to offset 0.
	    """
	    output = BytesIO()
	    workbook = xlsxwriter.Workbook(output, {'in_memory': True})
	    worksheet = workbook.add_worksheet()

	    # Sort once: the column order is identical for every header and data
	    # row, so there is no need to re-sort inside the loops.
	    repeat_columns = sorted(homorepeats)
	    first_repeat_col = 2  # columns 0 and 1 hold entry ID and protein name

	    row = 0

	    # Iterate through sequences data grouped by filenames
	    for file_index, file_data in enumerate(sequences_data):
	        filename = filenames[file_index]

	        # Write filename as a separator row
	        worksheet.write(row, 0, f"File: {filename}")
	        row += 1

	        # Write the header for the current file
	        worksheet.write(row, 0, "Entry ID")
	        worksheet.write(row, 1, "Protein Name")
	        for col, repeat in enumerate(repeat_columns, start=first_repeat_col):
	            worksheet.write(row, col, repeat)
	        row += 1

	        # Write data for each sequence in the current file
	        for entry_id, protein_name, freq in file_data:
	            worksheet.write(row, 0, entry_id)
	            worksheet.write(row, 1, protein_name)
	            for col, repeat in enumerate(repeat_columns, start=first_repeat_col):
	                worksheet.write(row, col, freq.get(repeat, 0))
	            row += 1

	        # Add an empty row as a separator between files
	        row += 1

	    # close() is what actually serialises the workbook into `output`.
	    workbook.close()
	    output.seek(0)
	    return output

	# Streamlit UI components
	st.title("Protein Homorepeat Analysis")

	# Step 1: Upload CSV Files
	uploaded_files = st.file_uploader("Upload CSV files", accept_multiple_files=True, type=["csv"])

	# Step 2: Process files and display results
	if uploaded_files:
	    all_homorepeats = set()
	    all_sequences_data = []
	    filenames = []

	    for file in uploaded_files:
	        result = process_csv(file)
	        # A rejected file comes back as None; unpacking it directly would
	        # raise TypeError. A pair whose first item is None is skipped too.
	        if result is None:
	            continue
	        homorepeats, sequence_data = result
	        if homorepeats is None:
	            continue

	        all_homorepeats.update(homorepeats)
	        all_sequences_data.append(sequence_data)
	        filenames.append(file.name)

	    if all_sequences_data:
	        # Count only the files that were actually processed, not every
	        # upload (some may have been rejected above).
	        st.success(f"Processed {len(filenames)} files successfully!")

	        # Step 3: Generate and download the Excel report
	        excel_file = create_excel(all_sequences_data, all_homorepeats, filenames)

	        # Download the Excel file
	        st.download_button(
	            label="Download Excel file",
	            data=excel_file,
	            file_name="protein_homorepeat_results.xlsx",
	            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
	        )

	        # Step 4: Display summary table
	        if st.checkbox("Show Results Table"):
	            # Sort once; every table row uses the same repeat columns.
	            repeat_columns = sorted(all_homorepeats)

	            # Convert the sequences data into a DataFrame for easy display.
	            # filenames and all_sequences_data are appended to together
	            # above, so they always have the same length.
	            rows = []
	            for filename, file_data in zip(filenames, all_sequences_data):
	                for entry_id, protein_name, freq in file_data:
	                    row = {"Filename": filename, "Entry ID": entry_id, "Protein Name": protein_name}
	                    row.update({repeat: freq.get(repeat, 0) for repeat in repeat_columns})
	                    rows.append(row)

	            result_df = pd.DataFrame(rows)
	            st.dataframe(result_df)