Spaces:
Running
Running
Commit
·
c8c0720
1
Parent(s):
d9ba5a4
Create add_domains.py
Browse files- code/add_domains.py +32 -0
code/add_domains.py
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from collections import Counter
|
2 |
+
import pandas as pd
|
3 |
+
import numpy as np
|
4 |
+
|
5 |
+
def add_domains(data, path_to_domains):
    """Attach domain annotations to *data* and flag positions inside a domain.

    The domain table is read from ``path_to_domains`` as a space-delimited
    file and left-joined onto ``data`` (``data.uniprotID`` against
    ``proteinID``), so one input row yields one output row per domain of its
    protein, or a single row with missing domain fields when nothing matches.

    A ``distance`` column is then filled in:

    * ``0`` when ``domStart <= pos <= domEnd`` (bounds inclusive);
    * ``-1000`` for every other row that is kept.

    Rows that are not inside their domain are dropped when their domain name
    is one that contains the position in *any* row of the merged table.
    NOTE(review): this filter is global, not per ``datapoint`` -- confirm that
    is the intended behaviour.

    Args:
        data: DataFrame with at least the columns ``uniprotID``, ``pos`` and
            ``datapoint``. It is not modified.
        path_to_domains: Path (or file-like object accepted by
            ``pandas.read_csv``) of a space-delimited table with the columns
            ``proteinID``, ``domain``, ``domStart`` and ``domEnd``.

    Returns:
        A new DataFrame with the ``proteinID`` column removed, the zero
        distance rows first and the remaining rows after them (each group
        ordered by ``datapoint``), a fresh 0..n-1 index, and every remaining
        missing value replaced by ``-1``.
    """
    domains = pd.read_csv(path_to_domains, delimiter=' ')
    data = data.merge(domains, left_on='uniprotID', right_on='proteinID', how='left')

    # Nullable integers: rows without a matching domain have no bounds, and a
    # plain int column cannot hold missing values.
    data['domStart'] = data['domStart'].astype('Int64')
    data['domEnd'] = data['domEnd'].astype('Int64')
    data = data.drop(['proteinID'], axis=1)

    # Per-row test "position lies inside the domain". The int() casts are kept
    # per row so values are truncated/parsed as integers before comparison,
    # and only rows that actually have a domain are converted.
    in_domain = np.array(
        [
            bool(pd.notna(name) and int(start) <= int(pos) <= int(end))
            for name, start, pos, end in zip(
                data['domain'], data['domStart'], data['pos'], data['domEnd']
            )
        ],
        dtype=bool,
    )

    # np.nan (lower case): the np.NaN alias was removed in NumPy 2.0.
    data['distance'] = np.where(in_domain, 0.0, np.nan)
    zero_distance_domains = data.loc[in_domain, 'domain'].unique()

    # Within each datapoint the zero distances come first (NaN sorts last).
    data = data.sort_values(by=['datapoint', 'distance']).reset_index(drop=True)

    is_zero = data['distance'] == 0.0
    zero_distance = data[is_zero]
    # Explicit copy so the sentinel is written to an independent frame rather
    # than to a slice of ``data``.
    not_zero_distance = data[~is_zero].copy()
    not_zero_distance['distance'] = -1000

    # Drop non-containing rows whose domain already has a zero-distance hit.
    already_hit = not_zero_distance['domain'].isin(zero_distance_domains)
    not_zero_distance = not_zero_distance[~already_hit]

    data = pd.concat([zero_distance, not_zero_distance], sort=False)
    data = data.reset_index(drop=True)
    data = data.fillna(-1)
    return data
|