Spaces:
Sleeping
Sleeping
new pre-processing function
Browse files
app.py
CHANGED
@@ -201,20 +201,27 @@ def fig_to_img(fig):
|
|
201 |
img = Image.open(buf)
|
202 |
return img
|
203 |
|
204 |
-
|
205 |
-
def process_family_sequence(protein_fasta):
|
206 |
lines = protein_fasta.split('\n')
|
207 |
-
|
208 |
headers = [line for line in lines if line.startswith('>')]
|
209 |
if len(headers) > 1:
|
210 |
-
return None,
|
211 |
|
212 |
protein_sequence = ''.join(line for line in lines if not line.startswith('>'))
|
213 |
-
|
214 |
# Check for invalid characters
|
215 |
valid_characters = set("ACDEFGHIKLMNPQRSTVWYacdefghiklmnpqrstvwy") # the 20 standard amino acids
|
216 |
if not set(protein_sequence).issubset(valid_characters):
|
217 |
-
return None,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
218 |
|
219 |
encoded_input = tokenizer([protein_sequence], padding=True, truncation=True, max_length=512, return_tensors="pt")
|
220 |
input_idsfam = encoded_input["input_ids"]
|
@@ -263,20 +270,7 @@ def process_family_sequence(protein_fasta):
|
|
263 |
|
264 |
|
265 |
def process_single_sequence(protein_fasta): #, protein_file
|
266 |
-
|
267 |
-
lines = protein_fasta.split('\n')
|
268 |
-
|
269 |
-
headers = [line for line in lines if line.startswith('>')]
|
270 |
-
if len(headers) > 1:
|
271 |
-
return None, "Multiple fasta sequences detected. Please upload a fasta file with multiple sequences, otherwise only include one fasta sequence.", None
|
272 |
-
|
273 |
-
protein_sequence = ''.join(line for line in lines if not line.startswith('>'))
|
274 |
-
|
275 |
-
# Check for invalid characters
|
276 |
-
valid_characters = set("ACDEFGHIKLMNPQRSTVWYacdefghiklmnpqrstvwy") # the 20 standard amino acids
|
277 |
-
if not set(protein_sequence).issubset(valid_characters):
|
278 |
-
return None, "Invalid protein sequence. It contains characters that are not one of the 20 standard amino acids. Does your sequence contain gaps?", None
|
279 |
-
|
280 |
|
281 |
encoded_input = tokenizer([protein_sequence], padding=True, truncation=True, max_length=512, return_tensors="pt")
|
282 |
input_ids = encoded_input["input_ids"]
|
@@ -360,18 +354,7 @@ def mask_residue(sequence, position):
|
|
360 |
return sequence[:position] + 'X' + sequence[position+1:]
|
361 |
|
362 |
def generate_heatmap(protein_fasta):
|
363 |
-
|
364 |
-
header = lines[0]
|
365 |
-
protein_sequence = ''.join(lines[1:])
|
366 |
-
|
367 |
-
# Check if the header is valid
|
368 |
-
if not header.startswith('>'):
|
369 |
-
return None, "Invalid FASTA format. Header should start with '>'.", None
|
370 |
-
|
371 |
-
# Check for invalid characters in the sequence
|
372 |
-
valid_characters = set("ACDEFGHIKLMNPQRSTVWYacdefghiklmnpqrstvwy")
|
373 |
-
if not set(protein_sequence).issubset(valid_characters):
|
374 |
-
return None, "Invalid protein sequence. It contains characters that are not one of the 20 standard amino acids.", None
|
375 |
|
376 |
# Tokenize and predict for original sequence
|
377 |
encoded_input = tokenizer([protein_sequence], padding=True, truncation=True, max_length=512, return_tensors="pt")
|
|
|
201 |
img = Image.open(buf)
|
202 |
return img
|
203 |
|
204 |
+
def preprocess_protein_sequence(protein_fasta):
    """Validate single-record FASTA text and extract its residue sequence.

    Lines starting with '>' are treated as headers; all remaining lines are
    concatenated to form the sequence.

    Args:
        protein_fasta: FASTA-formatted text (a header line is optional).
            ``None`` is treated like an empty string.

    Returns:
        A ``(protein_sequence, error_msg)`` tuple. On success
        ``protein_sequence`` is the concatenated sequence and ``error_msg``
        is ``None``. On failure ``protein_sequence`` is ``None`` and
        ``error_msg`` describes the problem.
    """
    # splitlines() understands "\n", "\r\n" and "\r"; a plain split('\n')
    # leaves a stray '\r' on every line of Windows-style input, which then
    # fails the amino-acid check below. strip() also drops stray padding.
    lines = [line.strip() for line in (protein_fasta or "").splitlines()]

    headers = [line for line in lines if line.startswith('>')]
    if len(headers) > 1:
        return None, "Multiple fasta sequences detected. Please upload a fasta file with only one sequence."

    protein_sequence = ''.join(line for line in lines if not line.startswith('>'))

    # An empty sequence would trivially pass the subset check below, so
    # reject it explicitly instead of handing an empty string downstream.
    if not protein_sequence:
        return None, "No protein sequence detected. Please provide a fasta sequence."

    # Check for invalid characters
    valid_characters = set("ACDEFGHIKLMNPQRSTVWYacdefghiklmnpqrstvwy")  # the 20 standard amino acids
    if not set(protein_sequence).issubset(valid_characters):
        return None, "Invalid protein sequence. It contains characters that are not one of the 20 standard amino acids. Does your sequence contain gaps?"

    return protein_sequence, None
|
219 |
+
|
220 |
+
|
221 |
+
def process_family_sequence(protein_fasta):
|
222 |
+
protein_sequence, error_msg = preprocess_protein_sequence(protein_fasta)
|
223 |
+
if error_msg:
|
224 |
+
return None, None, None, error_msg
|
225 |
|
226 |
encoded_input = tokenizer([protein_sequence], padding=True, truncation=True, max_length=512, return_tensors="pt")
|
227 |
input_idsfam = encoded_input["input_ids"]
|
|
|
270 |
|
271 |
|
272 |
def process_single_sequence(protein_fasta): #, protein_file
|
273 |
+
protein_sequence, error_msg = preprocess_protein_sequence(protein_fasta)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
274 |
|
275 |
encoded_input = tokenizer([protein_sequence], padding=True, truncation=True, max_length=512, return_tensors="pt")
|
276 |
input_ids = encoded_input["input_ids"]
|
|
|
354 |
return sequence[:position] + 'X' + sequence[position+1:]
|
355 |
|
356 |
def generate_heatmap(protein_fasta):
|
357 |
+
protein_sequence, error_msg = preprocess_protein_sequence(protein_fasta)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
358 |
|
359 |
# Tokenize and predict for original sequence
|
360 |
encoded_input = tokenizer([protein_sequence], padding=True, truncation=True, max_length=512, return_tensors="pt")
|