Jayesh13 committed on
Commit
5e885f1
·
verified ·
1 Parent(s): 3ed9f75

Create homorepeat_app.py

Browse files
Files changed (1) hide show
  1. homorepeat_app.py +121 -0
homorepeat_app.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import xlsxwriter
4
+ from io import BytesIO
5
+ from collections import defaultdict
6
+
7
# Function to find repeated amino acids in the protein sequence
def find_homorepeats(protein):
    """Count every maximal run of a single repeated character in ``protein``.

    The sequence is scanned left to right and split into maximal runs of
    identical characters. Runs of length 1 are ignored; every longer run is
    counted under its own text (e.g. ``"AAA"``), so runs of the same
    character with different lengths are tallied separately.

    Args:
        protein: Sequence string to scan. Characters are compared with
            ``==``, so the comparison is case-sensitive.

    Returns:
        A ``defaultdict(int)`` mapping each run string to the number of
        times a run with exactly that text occurs in ``protein``.
    """
    n = len(protein)
    freq = defaultdict(int)
    i = 0

    while i < n:
        start = i
        curr = protein[i]
        # Advance to the end of the current run; the run is sliced out once
        # afterwards instead of being built one character at a time.
        while i < n and protein[i] == curr:
            i += 1

        # Only consider repeats of length > 1
        if i - start > 1:
            freq[protein[start:i]] += 1

    return freq
25
+
26
# Function to process a single CSV file and return its analysis
def process_csv(file):
    """Read one CSV file and count the homorepeats of every row.

    The first three columns are taken by position as entry ID, protein name
    and sequence; any further columns are ignored. Double quotes and spaces
    are removed from the sequence before it is analysed.

    Args:
        file: Path or file-like object; passed straight to ``pd.read_csv``.

    Returns:
        A ``(homorepeats, sequence_data)`` pair where ``homorepeats`` is the
        set of distinct repeat strings found in the file and
        ``sequence_data`` is a list of ``(entry_id, protein_name, freq)``
        tuples, ``freq`` being the result of ``find_homorepeats``.
        When the file cannot be used, the problem is reported with
        ``st.error`` and ``(None, None)`` is returned, so callers that
        unpack two values keep working.
    """
    try:
        df = pd.read_csv(file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
        st.error(f"Error: The file could not be read as CSV: {exc}")
        return None, None

    if len(df.columns) < 3:
        st.error("Error: The file must have at least three columns: ID, Protein Name, Sequence")
        return None, None

    # Analyzing homorepeats in the sequences
    homorepeats = set()
    sequence_data = []
    # Plain positional tuples: avoids integer-key lookups on a label-indexed
    # Series, which pandas has deprecated.
    for record in df.itertuples(index=False, name=None):
        entry_id = str(record[0])
        protein_name = str(record[1])
        sequence = str(record[2]).replace('"', '').replace(' ', '')

        freq = find_homorepeats(sequence)
        homorepeats.update(freq)  # Collect unique homorepeats
        sequence_data.append((entry_id, protein_name, freq))

    return homorepeats, sequence_data
50
+
51
# Function to generate the Excel workbook in memory
def create_excel(sequences_data, homorepeats):
    """Build an in-memory ``.xlsx`` workbook of homorepeat counts.

    The sheet has one header row ("Entry ID", "Protein Name", then one
    column per repeat in sorted order) followed by one row per entry holding
    that entry's count for each repeat (0 when the repeat is absent).

    Args:
        sequences_data: Iterable of ``(entry_id, protein_name, freq)``
            tuples, where ``freq`` maps repeat strings to counts.
        homorepeats: Collection of repeat strings used as count columns.

    Returns:
        A ``BytesIO`` holding the finished workbook, rewound to position 0.
    """
    output = BytesIO()
    workbook = xlsxwriter.Workbook(output, {'in_memory': True})
    worksheet = workbook.add_worksheet()

    # Sort once so the header and every data row share the same column order.
    repeat_columns = sorted(homorepeats)

    # Write the header
    worksheet.write_row(0, 0, ["Entry ID", "Protein Name", *repeat_columns])

    # Write data for each sequence
    for row, (entry_id, protein_name, freq) in enumerate(sequences_data, start=1):
        counts = [freq.get(repeat, 0) for repeat in repeat_columns]
        worksheet.write_row(row, 0, [entry_id, protein_name, *counts])

    workbook.close()
    output.seek(0)
    return output
79
+
80
# Streamlit UI components
st.title("Protein Homorepeat Analysis")

# Step 1: Upload CSV Files
uploaded_files = st.file_uploader("Upload CSV files", accept_multiple_files=True, type=["csv"])

# Step 2: Process files and display results
if uploaded_files:
    all_homorepeats = set()
    all_sequences_data = []
    processed_count = 0

    for file in uploaded_files:
        result = process_csv(file)
        # A rejected file may come back as None or as a pair of Nones; in
        # both cases skip it instead of failing while unpacking.
        if result is None:
            continue
        homorepeats, sequence_data = result
        if homorepeats is None:
            continue

        all_homorepeats.update(homorepeats)
        all_sequences_data.extend(sequence_data)
        processed_count += 1

    if all_sequences_data:
        # Report only the files that were actually analysed, not every upload.
        st.success(f"Processed {processed_count} files successfully!")

        # Step 3: Generate and download the Excel report
        excel_file = create_excel(all_sequences_data, all_homorepeats)

        # Download the Excel file
        st.download_button(
            label="Download Excel file",
            data=excel_file,
            file_name="protein_homorepeat_results.xlsx",
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        )

        # Step 4: Display summary table
        if st.checkbox("Show Results Table"):
            # Sort once; every row of the table uses the same column order.
            repeat_columns = sorted(all_homorepeats)

            # Convert the sequences data into a DataFrame for easy display
            rows = []
            for entry_id, protein_name, freq in all_sequences_data:
                row = {"Entry ID": entry_id, "Protein Name": protein_name}
                row.update({repeat: freq.get(repeat, 0) for repeat in repeat_columns})
                rows.append(row)

            result_df = pd.DataFrame(rows)
            st.dataframe(result_df)