fadliaulawi committed on
Commit
2bc0f8a
1 Parent(s): 07396d9

Add new rsID handling

Browse files
Files changed (1) hide show
  1. utils.py +35 -26
utils.py CHANGED
@@ -26,6 +26,9 @@ mistakes = {
26
  'P': ['9']
27
  }
28
 
 
 
 
29
  def permutate(word):
30
 
31
  if len(word) == 0:
@@ -52,32 +55,38 @@ def call(url):
52
 
53
  return res
54
 
55
- raw_url = "https://www.ebi.ac.uk/gwas/api/search/downloads/alternative"
56
- gwas_path = "resources/gwas_catalog.tsv"
57
-
58
- # Load Raw GWAS files
59
- if os.path.exists(gwas_path):
60
- gwas = pd.read_csv(gwas_path, delimiter='\t')[['DISEASE/TRAIT', 'CHR_ID', 'MAPPED_GENE', 'SNPS', 'P-VALUE', 'OR or BETA']]
61
- else:
62
- data = requests.get(raw_url).content.decode('utf-8')
63
- gwas = pd.read_csv(StringIO(data), delimiter='\t')[['DISEASE/TRAIT', 'CHR_ID', 'MAPPED_GENE', 'SNPS', 'P-VALUE', 'OR or BETA']]
64
-
65
- # Load Genes and SNPs from GWAS
66
- gwas_gene_rsid = gwas[['MAPPED_GENE', 'SNPS']]
67
- gwas_gene_rsid.dropna(inplace=True, ignore_index=True)
68
- gwas_gene_rsid['MAPPED_GENE'] = gwas_gene_rsid['MAPPED_GENE'].apply(lambda x: x.replace(' ', '').upper())
69
-
70
- # Generate Genes and SNPs mapping
71
- ground_truth = defaultdict(list)
72
- for i in gwas_gene_rsid.index:
73
- gene = gwas_gene_rsid.loc[i, 'MAPPED_GENE']
74
- snp = gwas_gene_rsid.loc[i, 'SNPS']
75
-
76
- pattern = r"[,\-]"
77
- genes = re.split(pattern, gene)
78
- for gene in genes:
79
- ground_truth[gene].append(snp)
80
- ground_truth[snp].append(gene)
 
 
 
 
 
 
81
 
82
  def integrate(df):
83
 
 
26
  'P': ['9']
27
  }
28
 
29
+ raw_url = "https://www.ebi.ac.uk/gwas/api/search/downloads/alternative"
30
+ gwas_path = "resources/gwas_catalog.tsv"
31
+
32
  def permutate(word):
33
 
34
  if len(word) == 0:
 
55
 
56
  return res
57
 
58
def generate_raw_files():
    """Load the GWAS catalog and build a two-way gene <-> SNP lookup.

    The catalog is read from ``gwas_path`` when that file exists; otherwise
    it is downloaded from ``raw_url``.

    Returns:
        tuple: ``(gwas, ground_truth)``. ``gwas`` is the catalog restricted
        to the six columns selected below. ``ground_truth`` is a
        ``defaultdict(list)`` that maps each gene token to the SNP tokens
        found on the same row, and each SNP token to those gene tokens.
        Repeated pairs are kept, so the lists may contain duplicates.

    Raises:
        requests.HTTPError: if the download answers with an error status.
    """
    columns = ['DISEASE/TRAIT', 'CHR_ID', 'MAPPED_GENE', 'SNPS', 'P-VALUE', 'OR or BETA']

    # Load Raw GWAS files
    if os.path.exists(gwas_path):
        gwas = pd.read_csv(gwas_path, delimiter='\t')[columns]
    else:
        # Fail fast on a hung connection or an HTTP error instead of trying
        # to parse an error page as TSV.
        response = requests.get(raw_url, timeout=60)
        response.raise_for_status()
        data = response.content.decode('utf-8')
        gwas = pd.read_csv(StringIO(data), delimiter='\t')[columns]

    # Load Genes and SNPs from GWAS.
    # dropna() returns a new frame, so nothing is modified in place on a
    # column-subset of `gwas` (avoids the chained-assignment pitfall).
    gwas_gene_rsid = gwas[['MAPPED_GENE', 'SNPS']].dropna()

    # Generate Genes and SNPs mapping
    # NOTE(review): 'x' is also a separator, so any identifier that contains
    # a lowercase 'x' is split as well - confirm this is intended.
    pattern = re.compile(r"[,x\-]")
    ground_truth = defaultdict(list)
    for gene_field, snp_field in zip(gwas_gene_rsid['MAPPED_GENE'], gwas_gene_rsid['SNPS']):
        # Gene names are upper-cased before splitting, so the lowercase 'x'
        # in the pattern can only ever match inside the SNP field.
        gene_field = gene_field.replace(' ', '').upper()
        genes = [token for token in pattern.split(gene_field) if token]

        # Strip the blanks left around separators ("rs1 x rs2") and drop
        # empty tokens so keys are clean identifiers.
        snps = [token.strip() for token in pattern.split(snp_field)]
        snps = [token for token in snps if token]

        for gene_name in genes:
            for snp_id in snps:
                ground_truth[gene_name].append(snp_id)
                ground_truth[snp_id].append(gene_name)

    return gwas, ground_truth
88
+
89
+ gwas, ground_truth = generate_raw_files()
90
 
91
  def integrate(df):
92