fadliaulawi committed on
Commit
9c8e6da
1 Parent(s): 745c0a6

Enable API validation

Browse files
Files changed (1) hide show
  1. process.py +50 -50
process.py CHANGED
@@ -203,7 +203,7 @@ class Process():
203
  df['SNPs'] = df['SNPs'].str.lower()
204
 
205
  # Check if there is two gene names
206
- sym = [',', '-', '/', '|']
207
  for i in df.index:
208
  gene = df.loc[i, 'Genes']
209
  for s in sym:
@@ -238,60 +238,60 @@ class Process():
238
  df_clean = df.copy()
239
 
240
  # # Validate genes and SNPs with APIs
241
- # def permutate(word):
242
 
243
- # if len(word) == 0:
244
- # return ['']
245
 
246
- # change = []
247
- # res = permutate(word[1:])
248
 
249
- # if word[0] in mistakes:
250
- # change = [mistakes[word[0]] + r for r in res]
251
 
252
- # return [word[0] + r for r in res] + change
253
 
254
- # def call(url):
255
-
256
- # while True:
257
- # try:
258
- # res = requests.get(url)
259
- # time.sleep(1)
260
- # break
261
- # except Exception as e:
262
- # print(e)
263
-
264
- # return res
265
-
266
- # mistakes = {'I': '1', 'O': '0'} # Common mistakes need to be maintained
267
- # dbsnp = {}
268
-
269
- # for i in df.index:
270
- # snp = df.loc[i, 'SNPs']
271
- # gene = df.loc[i, 'Genes']
272
-
273
- # if snp not in dbsnp:
274
- # res = call(f'https://www.ebi.ac.uk/gwas/rest/api/singleNucleotidePolymorphisms/{snp}/')
275
- # try:
276
- # res = res.json()
277
- # dbsnp[snp] = [r['gene']['geneName'] for r in res['genomicContexts']]
278
- # except:
279
- # dbsnp[snp] = []
280
-
281
- # res = call(f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=snp&retmode=json&id={snp[2:]}').json()['result'][snp[2:]]
282
- # if 'error' not in res:
283
- # dbsnp[snp].extend([r['name'] for r in res['genes']])
284
-
285
- # dbsnp[snp] = list(set(dbsnp[snp]))
286
-
287
- # if gene not in dbsnp[snp]:
288
- # for other in permutate(gene):
289
- # if other in dbsnp[snp]:
290
- # df.loc[i, 'Genes'] = other
291
- # print(f'{gene} corrected to {other}')
292
- # break
293
- # else:
294
- # df = df.drop(i)
295
 
296
  # df.reset_index(drop=True, inplace=True)
297
  df_no_llm = df.copy()
 
203
  df['SNPs'] = df['SNPs'].str.lower()
204
 
205
  # Check if there is two gene names
206
+ sym = [',', '/', '|']
207
  for i in df.index:
208
  gene = df.loc[i, 'Genes']
209
  for s in sym:
 
238
  df_clean = df.copy()
239
 
240
  # # Validate genes and SNPs with APIs
241
def permutate(word):
    """Return every spelling of ``word`` reachable through ``mistakes``.

    ``mistakes`` is the character-substitution mapping defined in the
    enclosing scope. Each character of ``word`` that is a key of that
    mapping may either stay as written or be swapped for its mapped
    value; all combinations are produced.

    The unchanged ``word`` is always the first element, and an empty
    ``word`` yields ``['']``.
    """
    # Build the spellings from the last character backwards so that, for
    # every position, the original character is tried before its
    # substitute and later positions vary fastest.
    variants = ['']
    for char in reversed(word):
        heads = [char]
        if char in mistakes:
            heads.append(mistakes[char])
        variants = [head + tail for head in heads for tail in variants]

    return variants
253
 
254
def call(url, timeout=30, delay=1):
    """GET ``url`` with ``requests``, retrying until a response is obtained.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before the attempt is
            abandoned and retried. Without it a stalled connection would
            block the whole run indefinitely.
        delay: Seconds to pause after every attempt, successful or not.

    Returns:
        The ``requests`` response object of the first attempt that did
        not raise. The HTTP status is not inspected here.
    """
    while True:
        try:
            res = requests.get(url, timeout=timeout)
        except Exception as e:
            # Best-effort by design: report the failure and try again, but
            # pause first so a failing endpoint is not hit in a tight loop.
            print(e)
            time.sleep(delay)
            continue

        # Throttle successful calls too, to stay polite to the remote API.
        time.sleep(delay)
        return res
265
+
266
+ mistakes = {'I': '1', 'O': '0'} # Common mistakes need to be maintained
267
+ dbsnp = {}
268
+
269
+ for i in df.index:
270
+ snp = df.loc[i, 'SNPs']
271
+ gene = df.loc[i, 'Genes']
272
+
273
+ if snp not in dbsnp:
274
+ res = call(f'https://www.ebi.ac.uk/gwas/rest/api/singleNucleotidePolymorphisms/{snp}/')
275
+ try:
276
+ res = res.json()
277
+ dbsnp[snp] = [r['gene']['geneName'] for r in res['genomicContexts']]
278
+ except:
279
+ dbsnp[snp] = []
280
+
281
+ res = call(f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=snp&retmode=json&id={snp[2:]}').json()['result'][snp[2:]]
282
+ if 'error' not in res:
283
+ dbsnp[snp].extend([r['name'] for r in res['genes']])
284
+
285
+ dbsnp[snp] = list(set(dbsnp[snp]))
286
+
287
+ if gene not in dbsnp[snp]:
288
+ for other in permutate(gene):
289
+ if other in dbsnp[snp]:
290
+ df.loc[i, 'Genes'] = other
291
+ print(f'{gene} corrected to {other}')
292
+ break
293
+ else:
294
+ df = df.drop(i)
295
 
296
  # df.reset_index(drop=True, inplace=True)
297
  df_no_llm = df.copy()