fatmacankara committed on
Commit
c8c0720
·
1 Parent(s): d9ba5a4

Create add_domains.py

Browse files
Files changed (1) hide show
  1. code/add_domains.py +32 -0
code/add_domains.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import Counter
2
+ import pandas as pd
3
+ import numpy as np
4
+
5
def add_domains(data, path_to_domains):
    """Attach domain annotations to data points and flag in-domain positions.

    The domain table is read from ``path_to_domains`` as a space-delimited
    file and left-joined onto ``data`` (``data.uniprotID`` against the
    table's ``proteinID``), so a data point yields one row per domain listed
    for its protein, or a single row with missing domain fields when the
    protein has no entry.

    Args:
        data: DataFrame with at least the columns ``uniprotID``, ``pos`` and
            ``datapoint``. The caller's frame is not modified.
        path_to_domains: Path (or file-like object accepted by
            ``pandas.read_csv``) of the domain table, which must provide the
            columns ``proteinID``, ``domain``, ``domStart`` and ``domEnd``.

    Returns:
        A new DataFrame with the columns of ``data`` plus ``domain``,
        ``domStart``, ``domEnd`` and ``distance``. Rows where ``pos`` lies in
        the inclusive range ``[domStart, domEnd]`` come first, ordered by
        ``datapoint``, with ``distance`` 0. The remaining rows follow with
        ``distance`` -1000, except that rows whose domain name was hit by any
        row are dropped. Every remaining missing value is replaced by -1.
    """
    domains = pd.read_csv(path_to_domains, delimiter=' ')
    data = data.merge(domains, left_on='uniprotID', right_on='proteinID', how='left')
    # Nullable integers keep the boundaries integral even though the left
    # join introduces missing values for proteins without a domain entry.
    data['domStart'] = data['domStart'].astype('Int64')
    data['domEnd'] = data['domEnd'].astype('Int64')
    data = data.drop(['proteinID'], axis=1)

    # Vectorised inclusive range test. Comparisons against a missing boundary
    # or position evaluate to False, so such rows count as "not inside".
    # NOTE(review): positions are compared numerically and assumed to be
    # integral - confirm against the caller.
    position = pd.to_numeric(data['pos']).astype('float64')
    start = data['domStart'].astype('float64')
    end = data['domEnd'].astype('float64')
    inside = data['domain'].notna() & position.between(start, end)

    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    data['distance'] = np.where(inside, 0.0, np.nan)
    hit_domains = set(data.loc[inside, 'domain'])

    # Sorting puts the zero-distance row(s) of each data point first.
    data = data.sort_values(by=['datapoint', 'distance']).reset_index(drop=True)

    is_hit = data['distance'] == 0.0
    hits = data[is_hit]
    # NOTE(review): the filter below matches on domain name across the whole
    # table, not per data point, so a domain hit by one data point is removed
    # from the non-hit rows of every other data point too - confirm intent.
    misses = data[~is_hit]
    misses = misses[~misses['domain'].isin(hit_domains)]
    # assign() builds a new frame, avoiding chained assignment on a slice.
    misses = misses.assign(distance=-1000)

    result = pd.concat([hits, misses], sort=False).reset_index(drop=True)
    return result.fillna(-1)