predict-tabs / utils.py
S Santhosh Kumar
Add application file
df11f35
# Function for when you want to prepare DNA sequence feature for ML applications
import numpy as np
# Function for when you want to prepare DNA sequence feature for ML applications
def dnaseq_features(seq):
start=0
n_segs=101
seq_name = 'seq'
remaind = len(seq)%n_segs
if(remaind != 0):
last_id = len(seq) - remaind
upd_seq = seq[start:last_id]
dic_seq = {}
for i in range(0,3):
a = int(i*n_segs) ; b = int(i*n_segs)+n_segs
identifier = f"{seq_name}_{a}:{b}"
dic_seq[identifier] = upd_seq[a:b]
lst_seq = dic_seq.values()
index = list(dic_seq.keys())
values = list(dic_seq.values())
# One hot encode
ii=-1
for data in lst_seq:
ii+=1
abc = 'ACGT'
char_to_int = dict((c, i) for i, c in enumerate(abc))
int_enc = [char_to_int[char] for char in data]
ohe = []
for value in int_enc:
base = [0 for _ in range(len(abc))]
base[value] = 1
ohe.append(base)
np_mat = np.array(ohe)
np_mat = np.expand_dims(np_mat,axis=0)
if(ii != 0):
matrix = np.concatenate([np_mat,matrix],axis=0)
else:
matrix = np_mat
return matrix,index,values