fatmacankara committed on
Commit
e668ce0
·
1 Parent(s): 97b1b63

Delete code/add_domains_alphafold.py

Browse files
Files changed (1) hide show
  1. code/add_domains_alphafold.py +0 -57
code/add_domains_alphafold.py DELETED
@@ -1,57 +0,0 @@
1
- from collections import Counter
2
- import pandas as pd
3
-
4
def add_domains(data, path_to_domains):
    """Attach at most one domain annotation to every datapoint.

    The domain table is read from ``path_to_domains`` (a space-delimited file
    providing the columns ``proteinID``, ``domain``, ``domStart`` and
    ``domEnd``) and left-joined onto ``data`` via ``uniprotID``.  ``data``
    must also provide the columns ``pos`` and ``datapoint``.

    For every candidate (datapoint, domain) row a ``distance`` is computed:
    0 when ``domStart <= pos <= domEnd``, otherwise the smaller of the
    absolute distances from ``pos`` to the two domain boundaries.  Rows
    without a domain get a missing distance.

    One row per ``datapoint`` is kept:

    * if the position falls inside one or more domains, the domain that
      occurs most often among all in-range rows is kept;
    * otherwise the domain with the smallest distance is kept;
    * datapoints without any domain keep their single unmatched row.

    Returns:
        A DataFrame in which every column has been converted with
        ``astype(str)``.  ``distance`` is rendered as an integer string
        (``'0'``, ``'8'``, ...) or ``'nan'`` when no domain was found.

    Raises:
        ValueError: if ``pos``, ``domStart`` or ``domEnd`` of a row that has
            a domain cannot be converted to an integer.
    """
    domains = pd.read_csv(path_to_domains, delimiter=' ')
    data = data.merge(domains, right_on='proteinID', left_on='uniprotID', how='left')
    data = data.drop(['proteinID'], axis=1)

    # Everything is handled as text, except the domain boundaries which are
    # needed numerically ('nan' strings become real NaN again here).
    data = data.astype('str')
    data['domStart'] = data['domStart'].astype('float')
    data['domEnd'] = data['domEnd'].astype('float')

    # After the string conversion a missing domain is the literal 'nan'.
    has_domain = data['domain'] != 'nan'

    # Only rows that actually have a domain are converted to integers, so an
    # unparsable position on an unmatched row does not raise.
    start = data.loc[has_domain, 'domStart'].astype('int64')
    end = data.loc[has_domain, 'domEnd'].astype('int64')
    pos = data.loc[has_domain, 'pos'].astype('int64')

    to_start = (start - pos).abs()
    to_end = (end - pos).abs()
    nearest_boundary = to_start.where(to_start <= to_end, to_end)
    inside = (start <= pos) & (pos <= end)

    # A single numeric column (NaN = no domain) keeps sorting well defined;
    # mixing ints with the string 'nan' made the ordering depend on how a
    # given pandas version sorts mixed-type object columns.
    data['distance'] = (
        nearest_boundary.where(~inside, 0).reindex(data.index).astype('float64')
    )

    # Smallest distance first within each datapoint; NaN is placed last.
    data = data.sort_values(by=['datapoint', 'distance']).reset_index(drop=True)

    # A datapoint may lie inside several domains (several rows with
    # distance 0), so in-range rows are resolved separately from the
    # out-of-range rows, where simply the closest one is taken.
    in_range = data[data['distance'] == 0]
    out_of_range = data[data['distance'] != 0]

    # Out of range: rows are already sorted, the first one is the closest.
    out_of_range = out_of_range.drop_duplicates(['datapoint'], keep='first')

    # In range: prefer the domain that occurs most often among in-range rows.
    occurrences = in_range['domain'].map(in_range['domain'].value_counts())
    in_range = in_range.assign(_domain_count=occurrences)
    in_range = in_range.sort_values(['datapoint', '_domain_count'])
    in_range = in_range.drop_duplicates(['datapoint'], keep='last')
    in_range = in_range.drop(['_domain_count'], axis=1)

    # A datapoint can appear in both parts (inside one domain, outside
    # another).  Sorting by distance puts the in-range row (distance 0)
    # first, so the out-of-range duplicate is the one that is dropped.
    result = pd.concat([in_range, out_of_range], sort=False)
    result = result.sort_values(['datapoint', 'distance']).reset_index(drop=True)
    result = result.drop_duplicates(['datapoint'], keep='first')

    # Render distances uniformly as integer text instead of a mix of
    # '0', '0.0' and 'nan'.
    result = result.assign(
        distance=result['distance'].map(
            lambda value: 'nan' if pd.isna(value) else str(int(value))
        )
    )
    return result.astype(str)