NiniCat LfOreVEr committed on
Commit
f174fdf
1 Parent(s): f90af69

Update cas9on.py (#4)

Browse files

- Update cas9on.py (bf083c28f7799d4200a4b99c657ad0f124e70a49)


Co-authored-by: Qingyang Liu <LfOreVEr@users.noreply.huggingface.co>

Files changed (1) hide show
  1. cas9on.py +12 -7
cas9on.py CHANGED
@@ -70,20 +70,24 @@ def fetch_ensembl_sequence(transcript_id):
70
  print(f"Error fetching sequence data from Ensembl: {response.text}")
71
  return None
72
 
73
- def find_crispr_targets(sequence, chr, start, strand, transcript_id, exon_id, pam="NGG", target_length=20):
74
  targets = []
75
  len_sequence = len(sequence)
76
- complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
77
  dnatorna = {'A': 'A', 'T': 'U', 'C': 'C', 'G': 'G'}
78
 
79
- if strand == -1:
80
- sequence = ''.join([complement[base] for base in sequence])
81
  for i in range(len_sequence - len(pam) + 1):
82
  if sequence[i + 1:i + 3] == pam[1:]:
83
  if i >= target_length:
84
  target_seq = sequence[i - target_length:i + 3]
85
- tar_start = start + i - target_length
86
- tar_end = start + i + 3 - 1
 
 
 
 
 
 
87
  gRNA = ''.join([dnatorna[base] for base in sequence[i - target_length:i]])
88
  targets.append([target_seq, gRNA, chr, str(tar_start), str(tar_end), str(strand), transcript_id, exon_id])
89
 
@@ -131,9 +135,10 @@ def process_gene(gene_symbol, model_path):
131
  if gene_sequence:
132
  all_gene_sequences.append(gene_sequence) # Add this gene sequence to the list
133
  start = exon['start']
 
134
  strand = exon['strand']
135
  chr = exon['seq_region_name']
136
- targets = find_crispr_targets(gene_sequence, chr, start, strand, transcript_id, exon_id)
137
  if targets:
138
  # Predict on-target efficiency for each gRNA site
139
  formatted_data = format_prediction_output(targets, model_path)
 
70
  print(f"Error fetching sequence data from Ensembl: {response.text}")
71
  return None
72
 
73
+ def find_crispr_targets(sequence, chr, start, end, strand, transcript_id, exon_id, pam="NGG", target_length=20):
74
  targets = []
75
  len_sequence = len(sequence)
76
+ #complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
77
  dnatorna = {'A': 'A', 'T': 'U', 'C': 'C', 'G': 'G'}
78
 
 
 
79
  for i in range(len_sequence - len(pam) + 1):
80
  if sequence[i + 1:i + 3] == pam[1:]:
81
  if i >= target_length:
82
  target_seq = sequence[i - target_length:i + 3]
83
+ if strand == -1:
84
+ tar_start = end - (i + 2)
85
+ tar_end = end - (i - target_length)
86
+ #seq_in_ref = ''.join([complement[base] for base in target_seq])[::-1]
87
+ else:
88
+ tar_start = start + i - target_length
89
+ tar_end = start + i + 3 - 1
90
+ #seq_in_ref = target_seq
91
  gRNA = ''.join([dnatorna[base] for base in sequence[i - target_length:i]])
92
  targets.append([target_seq, gRNA, chr, str(tar_start), str(tar_end), str(strand), transcript_id, exon_id])
93
 
 
135
  if gene_sequence:
136
  all_gene_sequences.append(gene_sequence) # Add this gene sequence to the list
137
  start = exon['start']
138
+ end = exon['end']
139
  strand = exon['strand']
140
  chr = exon['seq_region_name']
141
+ targets = find_crispr_targets(gene_sequence, chr, start, end, strand, transcript_id, exon_id)
142
  if targets:
143
  # Predict on-target efficiency for each gRNA site
144
  formatted_data = format_prediction_output(targets, model_path)