Spaces:

KalbeDigitalLab
/

nutrigenme-paper-extractor

Running

fadliaulawi committed on Sep 9

Commit

2bc0f8a

•

1 Parent(s): 07396d9

Add new rsID handling

Files changed (1) hide show

utils.py CHANGED Viewed

@@ -26,6 +26,9 @@ mistakes = {
     'P': ['9']
 }
 def permutate(word):
     if len(word) == 0:
@@ -52,32 +55,38 @@ def call(url):
     return res
-raw_url = "https://www.ebi.ac.uk/gwas/api/search/downloads/alternative"
-gwas_path = "resources/gwas_catalog.tsv"
-# Load Raw GWAS files
-if os.path.exists(gwas_path):
-    gwas = pd.read_csv(gwas_path, delimiter='\t')[['DISEASE/TRAIT', 'CHR_ID', 'MAPPED_GENE', 'SNPS', 'P-VALUE', 'OR or BETA']]
-else:
-    data = requests.get(raw_url).content.decode('utf-8')
-    gwas = pd.read_csv(StringIO(data), delimiter='\t')[['DISEASE/TRAIT', 'CHR_ID', 'MAPPED_GENE', 'SNPS', 'P-VALUE', 'OR or BETA']]
-# Load Genes and SNPs from GWAS
-gwas_gene_rsid = gwas[['MAPPED_GENE', 'SNPS']]
-gwas_gene_rsid.dropna(inplace=True, ignore_index=True)
-gwas_gene_rsid['MAPPED_GENE'] = gwas_gene_rsid['MAPPED_GENE'].apply(lambda x: x.replace(' ', '').upper())
-# Generate Genes and SNPs mapping
-ground_truth = defaultdict(list)
-for i in gwas_gene_rsid.index:
-    gene = gwas_gene_rsid.loc[i, 'MAPPED_GENE']
-    snp = gwas_gene_rsid.loc[i, 'SNPS']
-    pattern = r"[,\-]"
-    genes = re.split(pattern, gene)
-    for gene in genes:
-        ground_truth[gene].append(snp)
-        ground_truth[snp].append(gene)
 def integrate(df):

     'P': ['9']
 }
+raw_url = "https://www.ebi.ac.uk/gwas/api/search/downloads/alternative"
+gwas_path = "resources/gwas_catalog.tsv"
 def permutate(word):
     if len(word) == 0:
     return res
+def generate_raw_files():
+    # Load Raw GWAS files
+    if os.path.exists(gwas_path):
+        gwas = pd.read_csv(gwas_path, delimiter='\t')[['DISEASE/TRAIT', 'CHR_ID', 'MAPPED_GENE', 'SNPS', 'P-VALUE', 'OR or BETA']]
+    else:
+        data = requests.get(raw_url).content.decode('utf-8')
+        gwas = pd.read_csv(StringIO(data), delimiter='\t')[['DISEASE/TRAIT', 'CHR_ID', 'MAPPED_GENE', 'SNPS', 'P-VALUE', 'OR or BETA']]
+    # Load Genes and SNPs from GWAS
+    gwas_gene_rsid = gwas[['MAPPED_GENE', 'SNPS']]
+    gwas_gene_rsid.dropna(inplace=True, ignore_index=True)
+    gwas_gene_rsid['MAPPED_GENE'] = gwas_gene_rsid['MAPPED_GENE'].apply(lambda x: x.replace(' ', '').upper())
+    # Generate Genes and SNPs mapping
+    ground_truth = defaultdict(list)
+    for i in gwas_gene_rsid.index:
+        gene = gwas_gene_rsid.loc[i, 'MAPPED_GENE']
+        snp = gwas_gene_rsid.loc[i, 'SNPS']
+        pattern = r"[,x\-]"
+        genes = re.split(pattern, gene)
+        snps = re.split(pattern, snp)
+        for gene in genes:
+            for snp in snps:
+                ground_truth[gene].append(snp)
+                ground_truth[snp].append(gene)
+    return gwas, ground_truth
+gwas, ground_truth = generate_raw_files()
 def integrate(df):