fadliaulawi committed on
Commit
1bd03b4
1 Parent(s): 51409c3

Change validation logic

Browse files
Files changed (1) hide show
  1. validate.py +42 -3
validate.py CHANGED
@@ -1,4 +1,3 @@
1
- from collections import defaultdict
2
  from dotenv import load_dotenv
3
  from langchain_google_genai import ChatGoogleGenerativeAI
4
  from langchain_openai import ChatOpenAI
@@ -21,7 +20,7 @@ class Validation():
21
  else:
22
  self.llm = ChatOpenAI(temperature=0, model_name=llm, api_key=os.environ['PERPLEXITY_API_KEY'], base_url="https://api.perplexity.ai")
23
 
24
- def validate(self, df, api):
25
 
26
  df = df.fillna('')
27
  df['Genes'] = df['Genes'].str.replace(' ', '').str.upper()
@@ -83,6 +82,7 @@ class Validation():
83
  df.reset_index(drop=True, inplace=True)
84
  df_clean = df.copy()
85
 
 
86
  # Validate genes and SNPs with APIs
87
  if api:
88
  dbsnp = {}
@@ -110,14 +110,34 @@ class Validation():
110
  else:
111
  df = df.drop(i)
112
 
 
113
  # Check with GWAS ground truth
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
  for i in df.index:
115
  gene = df.loc[i, 'Genes']
116
  snp = df.loc[i, 'rsID']
117
  perms = permutate(gene)
118
 
119
  for perm in perms:
120
- if perm in ground_truth and snp in ground_truth[perm]:
121
  df.loc[i, 'Genes'] = perm
122
  if gene != perm:
123
  print(f'{gene} corrected to {perm} with {snp}')
@@ -128,6 +148,25 @@ class Validation():
128
  print(f'{gene} and {snp} not found')
129
  df = df.drop(i)
130
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  df.reset_index(drop=True, inplace=True)
132
 
133
  return df, df_clean
 
 
1
  from dotenv import load_dotenv
2
  from langchain_google_genai import ChatGoogleGenerativeAI
3
  from langchain_openai import ChatOpenAI
 
20
  else:
21
  self.llm = ChatOpenAI(temperature=0, model_name=llm, api_key=os.environ['PERPLEXITY_API_KEY'], base_url="https://api.perplexity.ai")
22
 
23
+ def validate(self, df, text, api):
24
 
25
  df = df.fillna('')
26
  df['Genes'] = df['Genes'].str.replace(' ', '').str.upper()
 
82
  df.reset_index(drop=True, inplace=True)
83
  df_clean = df.copy()
84
 
85
+ # WARNING: DEPRECATED
86
  # Validate genes and SNPs with APIs
87
  if api:
88
  dbsnp = {}
 
110
  else:
111
  df = df.drop(i)
112
 
113
+ # WARNING: DEPRECATED
114
  # Check with GWAS ground truth
115
+ if False:
116
+ for i in df.index:
117
+ gene = df.loc[i, 'Genes']
118
+ snp = df.loc[i, 'rsID']
119
+ perms = permutate(gene)
120
+
121
+ for perm in perms:
122
+ if perm in ground_truth and snp in ground_truth[perm]:
123
+ df.loc[i, 'Genes'] = perm
124
+ if gene != perm:
125
+ print(f'{gene} corrected to {perm} with {snp}')
126
+ else:
127
+ print(f'{gene} and {snp} safe')
128
+ break
129
+ else:
130
+ print(f'{gene} and {snp} not found')
131
+ df = df.drop(i)
132
+
133
+ # Check with Text
134
  for i in df.index:
135
  gene = df.loc[i, 'Genes']
136
  snp = df.loc[i, 'rsID']
137
  perms = permutate(gene)
138
 
139
  for perm in perms:
140
+ if perm in text and snp in text:
141
  df.loc[i, 'Genes'] = perm
142
  if gene != perm:
143
  print(f'{gene} corrected to {perm} with {snp}')
 
148
  print(f'{gene} and {snp} not found')
149
  df = df.drop(i)
150
 
151
+ # Drop (duplicate) entries with empty values
152
+ genes = []
153
+ snps = []
154
+ for i in df.index:
155
+ gene = df.loc[i, 'Genes']
156
+ snp = df.loc[i, 'rsID']
157
+
158
+ if len(gene) == 0 and len(snp) == 0:
159
+ df = df.drop(i)
160
+ elif len(gene) == 0:
161
+ if snp in snps:
162
+ df = df.drop(i)
163
+ elif len(snp) == 0:
164
+ if gene in genes:
165
+ df = df.drop(i)
166
+ else:
167
+ genes.append(gene)
168
+ snps.append(snp)
169
+
170
  df.reset_index(drop=True, inplace=True)
171
 
172
  return df, df_clean