fadliaulawi
committed on
Commit
•
2bc0f8a
1
Parent(s):
07396d9
Add new rsID handling
Browse files
utils.py
CHANGED
@@ -26,6 +26,9 @@ mistakes = {
|
|
26 |
'P': ['9']
|
27 |
}
|
28 |
|
|
|
|
|
|
|
29 |
def permutate(word):
|
30 |
|
31 |
if len(word) == 0:
|
@@ -52,32 +55,38 @@ def call(url):
|
|
52 |
|
53 |
return res
|
54 |
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
gwas_gene_rsid
|
67 |
-
gwas_gene_rsid.
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
81 |
|
82 |
def integrate(df):
|
83 |
|
|
|
26 |
'P': ['9']
|
27 |
}
|
28 |
|
29 |
+
# Download URL for the raw GWAS data; fetched only when no local copy exists at gwas_path.
raw_url = "https://www.ebi.ac.uk/gwas/api/search/downloads/alternative"
|
30 |
+
# Local tab-separated copy of the GWAS data; read in preference to downloading from raw_url.
gwas_path = "resources/gwas_catalog.tsv"
|
31 |
+
|
32 |
def permutate(word):
|
33 |
|
34 |
if len(word) == 0:
|
|
|
55 |
|
56 |
return res
|
57 |
|
58 |
+
def generate_raw_files():
    """Load the GWAS table and build a two-way gene <-> SNP lookup.

    The table is read from ``gwas_path`` when that file exists; otherwise it
    is downloaded from ``raw_url``. Only the six columns used here are kept.

    Returns:
        tuple: ``(gwas, ground_truth)``. ``gwas`` is the trimmed DataFrame.
        ``ground_truth`` is a ``defaultdict(list)`` that maps every gene token
        to the SNP tokens found on the same row, and every SNP token to the
        gene tokens found on the same row (duplicates are kept, in row order).

    Raises:
        requests.HTTPError: if a download is needed and the server replies
            with an error status.
        KeyError: if one of the expected columns is missing from the table.
    """
    columns = [
        'DISEASE/TRAIT', 'CHR_ID', 'MAPPED_GENE', 'SNPS', 'P-VALUE', 'OR or BETA',
    ]

    # Load Raw GWAS files
    if os.path.exists(gwas_path):
        gwas = pd.read_csv(gwas_path, delimiter='\t')[columns]
    else:
        response = requests.get(raw_url)
        # Fail loudly on an HTTP error instead of parsing an error page as TSV.
        response.raise_for_status()
        data = response.content.decode('utf-8')
        gwas = pd.read_csv(StringIO(data), delimiter='\t')[columns]

    # Load Genes and SNPs from GWAS. Work on derived objects only, so nothing
    # is assigned into (or dropped in place from) a slice of ``gwas``.
    pairs = gwas[['MAPPED_GENE', 'SNPS']].dropna()
    gene_fields = pairs['MAPPED_GENE'].map(lambda x: x.replace(' ', '').upper())

    # NOTE(review): genes are upper-cased before splitting, so the lowercase
    # 'x' in this pattern can only ever match inside SNP strings - confirm
    # whether gene fields were also meant to be split on it.
    separators = re.compile(r"[,x\-]")

    # Generate Genes and SNPs mapping
    ground_truth = defaultdict(list)
    for gene_field, snp_field in zip(gene_fields, pairs['SNPS']):
        # Strip each token and drop empty ones so that a field such as
        # "rs1 x rs2" yields "rs1"/"rs2" rather than "rs1 "/" rs2".
        gene_tokens = [tok.strip() for tok in separators.split(gene_field)]
        snp_tokens = [tok.strip() for tok in separators.split(snp_field)]
        gene_tokens = [tok for tok in gene_tokens if tok]
        snp_tokens = [tok for tok in snp_tokens if tok]

        for gene_token in gene_tokens:
            for snp_token in snp_tokens:
                ground_truth[gene_token].append(snp_token)
                ground_truth[snp_token].append(gene_token)

    return gwas, ground_truth
|
88 |
+
|
89 |
+
# Runs at import time: reads the local GWAS file, or downloads it when the file is missing.
gwas, ground_truth = generate_raw_files()
|
90 |
|
91 |
def integrate(df):
|
92 |
|