Dreamsome committed
Commit 4d9ad4a · Parent: 06a91a3

enhance contamination

Files changed (1)
  1. app.py +48 -22
app.py CHANGED
@@ -234,52 +234,78 @@ data was heavily used in their benchmark datasets.
 with metrics:
 
     with st.spinner('Calculating contamination ratio...'):
-
         train_dataset = datasets['train']
         test_dataset = datasets['test']
+
         from nltk import ngrams
-        def generate_ngrams(text, n=8):
-            return set(ngrams(text.split(), n))
-
-        train_dataset['ngrams'] = train_dataset['text'].apply(generate_ngrams)
-        test_dataset['ngrams'] = test_dataset['text'].apply(generate_ngrams)
-
-        # Creating a set of n-grams in the train set
-        train_ngrams = set.union(*train_dataset['ngrams'])
-
-        # Creating a boolean mask marking documents in the test set that have appeared in the train set
-        common_docs = test_dataset['ngrams'].apply(lambda x: not x.isdisjoint(train_ngrams))
-        common_docs_count = common_docs.sum()
+        from datasketch import MinHash, MinHashLSH
+
+        def process_data(df):
+            minhashes = {}
+            for idx, r in df.iterrows():
+                minhash = MinHash(num_perm=128)
+                for d in ngrams(r['text'], 13):
+                    s = "".join(d).encode('utf-8')
+                    minhash.update(s)
+                minhashes[idx] = minhash
+            return minhashes
+
+        train_minhashes = process_data(train_dataset)
+        test_minhashes = process_data(test_dataset)
+
+        lsh = MinHashLSH(threshold=0.8, num_perm=128)
+
+        for idx, minhash in train_minhashes.items():
+            lsh.insert(idx, minhash)
+
+        duplicates_count = 0
+        for idx, minhash in test_minhashes.items():
+            result = lsh.query(minhash)
+            if len(result) > 0:
+                duplicates_count += 1
 
         train_dataset_count = len(train_dataset)
         test_dataset_count = len(test_dataset)
-        contaminate_ratio = common_docs_count / test_dataset_count
+        contaminate_ratio = duplicates_count / test_dataset_count
 
         col1, col2, col3, col4 = st.columns(4)
         col1.metric(label="Train Set Size", value="%d" % train_dataset_count)
         col2.metric(label="Test Set Size", value="%d" % test_dataset_count)
-        col3.metric(label="Overlapped Docs", value="%d" % common_docs_count)
+        col3.metric(label="Overlapped Docs", value="%d" % duplicates_count)
         col4.metric(label="Contaminated Ratio", value="%.2f%%" % (contaminate_ratio * 100))
 with code:
     st.code(
         '''
         from nltk import ngrams
-        def generate_ngrams(text, n=8):
-            return set(ngrams(text.split(), n))
-
-        train_dataset['ngrams'] = train_dataset['text'].apply(generate_ngrams)
-        test_dataset['ngrams'] = test_dataset['text'].apply(generate_ngrams)
-
-        # Creating a set of n-grams in the train set
-        train_ngrams = set.union(*train_dataset['ngrams'])
-
-        # Creating a boolean mask marking documents in the test set that have appeared in the train set
-        common_docs = test_dataset['ngrams'].apply(lambda x: not x.isdisjoint(train_ngrams))
-        common_docs_count = common_docs.sum()
+        from datasketch import MinHash, MinHashLSH
+
+        def process_data(df):
+            minhashes = {}
+            for idx, r in df.iterrows():
+                minhash = MinHash(num_perm=128)
+                for d in ngrams(r['text'], 13):
+                    s = "".join(d).encode('utf-8')
+                    minhash.update(s)
+                minhashes[idx] = minhash
+            return minhashes
+
+        train_minhashes = process_data(train_dataset)
+        test_minhashes = process_data(test_dataset)
+
+        lsh = MinHashLSH(threshold=0.8, num_perm=128)
+
+        for idx, minhash in train_minhashes.items():
+            lsh.insert(idx, minhash)
+
+        duplicates_count = 0
+        for idx, minhash in test_minhashes.items():
+            result = lsh.query(minhash)
+            if len(result) > 0:
+                duplicates_count += 1
 
         train_dataset_count = len(train_dataset)
         test_dataset_count = len(test_dataset)
-        contaminate_ratio = common_docs / test_dataset_count
+        contaminate_ratio = duplicates_count / test_dataset_count
         '''
     )
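
The change replaces the earlier exact-match check, where a test document was flagged if any of its word-level 8-grams appeared anywhere in the training split, with approximate near-duplicate detection: each document is condensed into a 128-permutation MinHash signature built from character-level 13-grams, the training signatures go into a MinHashLSH index, and a test document counts as contaminated when the index returns at least one training document whose estimated Jaccard similarity clears the 0.8 threshold. Below is a minimal, self-contained sketch of that flow on toy data; the helper name minhash_of and the sample sentences are illustrative, not from the commit.

```python
from nltk import ngrams
from datasketch import MinHash, MinHashLSH

def minhash_of(text, n=13, num_perm=128):
    # Build a MinHash signature from character-level n-grams; iterating a
    # Python string yields characters, mirroring ngrams(r['text'], 13) above.
    m = MinHash(num_perm=num_perm)
    for gram in ngrams(text, n):
        m.update("".join(gram).encode("utf-8"))
    return m

train_docs = ["the quick brown fox jumps over the lazy dog",
              "pack my box with five dozen liquor jugs"]
test_docs = ["the quick brown fox jumps over the lazy dog!",  # near-duplicate
             "sphinx of black quartz, judge my vow"]          # unrelated

# Index the training signatures; any hashable value works as a key.
lsh = MinHashLSH(threshold=0.8, num_perm=128)
for i, doc in enumerate(train_docs):
    lsh.insert(i, minhash_of(doc))

# A test document is counted as contaminated if LSH returns any candidate.
duplicates_count = sum(1 for doc in test_docs if lsh.query(minhash_of(doc)))
print("Contaminated Ratio: %.2f%%" % (100.0 * duplicates_count / len(test_docs)))
```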
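Two behavioral consequences worth noting: because iterating a Python string yields characters, ngrams(r['text'], 13) shingles at the character level, so any document shorter than 13 characters gets an empty signature that can never match; and where the old version flagged a test document for sharing a single 8-gram phrase with the training set, LSH at threshold=0.8 only fires on near-verbatim documents, so the reported Contaminated Ratio will generally drop after this change.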