Spaces:
Sleeping
Sleeping
# Function for when you want to prepare DNA sequence feature for ML applications | |
import numpy as np | |
# Function for when you want to prepare DNA sequence feature for ML applications | |
def dnaseq_features(seq): | |
start=0 | |
n_segs=101 | |
seq_name = 'seq' | |
remaind = len(seq)%n_segs | |
if(remaind != 0): | |
last_id = len(seq) - remaind | |
upd_seq = seq[start:last_id] | |
dic_seq = {} | |
for i in range(0,3): | |
a = int(i*n_segs) ; b = int(i*n_segs)+n_segs | |
identifier = f"{seq_name}_{a}:{b}" | |
dic_seq[identifier] = upd_seq[a:b] | |
lst_seq = dic_seq.values() | |
index = list(dic_seq.keys()) | |
values = list(dic_seq.values()) | |
# One hot encode | |
ii=-1 | |
for data in lst_seq: | |
ii+=1 | |
abc = 'ACGT' | |
char_to_int = dict((c, i) for i, c in enumerate(abc)) | |
int_enc = [char_to_int[char] for char in data] | |
ohe = [] | |
for value in int_enc: | |
base = [0 for _ in range(len(abc))] | |
base[value] = 1 | |
ohe.append(base) | |
np_mat = np.array(ohe) | |
np_mat = np.expand_dims(np_mat,axis=0) | |
if(ii != 0): | |
matrix = np.concatenate([np_mat,matrix],axis=0) | |
else: | |
matrix = np_mat | |
return matrix,index,values |