Spaces:
Runtime error
Runtime error
enhance contamination
Browse files
app.py
CHANGED
@@ -234,52 +234,78 @@ data was heavily used in their benchmark datasets.
|
|
234 |
with metrics:
|
235 |
|
236 |
with st.spinner('Calculating contamination ratio...'):
|
237 |
-
|
238 |
train_dataset = datasets['train']
|
239 |
test_dataset = datasets['test']
|
|
|
240 |
from nltk import ngrams
|
241 |
-
|
242 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
243 |
|
244 |
-
|
245 |
-
|
246 |
|
247 |
-
|
248 |
-
train_ngrams = set.union(*train_dataset['ngrams'])
|
249 |
|
250 |
-
|
251 |
-
|
252 |
-
|
|
|
|
|
|
|
|
|
|
|
253 |
|
254 |
train_dataset_count = len(train_dataset)
|
255 |
test_dataset_count = len(test_dataset)
|
256 |
-
contaminate_ratio =
|
257 |
|
258 |
col1, col2, col3, col4 = st.columns(4)
|
259 |
col1.metric(label="Train Set Size", value="%d" % train_dataset_count)
|
260 |
col2.metric(label="Test Set Size", value="%d" % test_dataset_count)
|
261 |
-
col3.metric(label="Overlapped Docs", value="%d" %
|
262 |
col4.metric(label="Contaminated Ratio", value="%.2f%%" % (contaminate_ratio * 100))
|
263 |
with code:
|
264 |
st.code(
|
265 |
'''
|
266 |
from nltk import ngrams
|
267 |
-
|
268 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
269 |
|
270 |
-
|
271 |
-
test_dataset['ngrams'] = test_dataset['text'].apply(generate_ngrams)
|
272 |
|
273 |
-
|
274 |
-
|
275 |
|
276 |
-
|
277 |
-
|
278 |
-
|
|
|
|
|
279 |
|
280 |
train_dataset_count = len(train_dataset)
|
281 |
test_dataset_count = len(test_dataset)
|
282 |
-
contaminate_ratio =
|
283 |
'''
|
284 |
)
|
285 |
|
|
|
234 |
with metrics:
    with st.spinner('Calculating contamination ratio...'):
        train_dataset = datasets['train']
        test_dataset = datasets['test']

        from nltk import ngrams
        from datasketch import MinHash, MinHashLSH

        def process_data(df, n=13, num_perm=128):
            """Build one MinHash signature per row of ``df`` from its 'text' n-grams.

            Each n-gram over the elements of ``r['text']`` (characters, when the
            value is a string) is joined, UTF-8 encoded and fed into the row's
            MinHash.

            Args:
                df: Object exposing ``iterrows()`` whose rows have a 'text' entry.
                n: Size of the n-grams hashed into each signature.
                num_perm: Number of permutations used by every MinHash; must
                    match the ``num_perm`` of the LSH index the signatures are
                    inserted into or queried against.

            Returns:
                Dict mapping row index to its MinHash. Rows whose text yields
                no n-gram (fewer than ``n`` elements) are omitted.
            """
            minhashes = {}
            for idx, r in df.iterrows():
                minhash = MinHash(num_perm=num_perm)
                has_shingle = False
                for d in ngrams(r['text'], n):
                    minhash.update("".join(d).encode('utf-8'))
                    has_shingle = True
                # A MinHash that was never updated is identical to every other
                # never-updated MinHash, so keeping such rows would make all
                # too-short texts look like duplicates of one another.
                if has_shingle:
                    minhashes[idx] = minhash
            return minhashes

        train_minhashes = process_data(train_dataset)
        test_minhashes = process_data(test_dataset)

        # Index the train signatures; a test document is considered
        # contaminated when the index returns at least one candidate whose
        # estimated Jaccard similarity reaches the threshold.
        lsh = MinHashLSH(threshold=0.8, num_perm=128)

        for idx, minhash in train_minhashes.items():
            lsh.insert(idx, minhash)

        # lsh.query returns a list of matching train keys; empty means no match.
        duplicates_count = sum(
            1 for minhash in test_minhashes.values() if lsh.query(minhash)
        )

        train_dataset_count = len(train_dataset)
        test_dataset_count = len(test_dataset)
        # Guard against an empty test split instead of raising ZeroDivisionError.
        if test_dataset_count:
            contaminate_ratio = duplicates_count / test_dataset_count
        else:
            contaminate_ratio = 0.0

        col1, col2, col3, col4 = st.columns(4)
        col1.metric(label="Train Set Size", value="%d" % train_dataset_count)
        col2.metric(label="Test Set Size", value="%d" % test_dataset_count)
        col3.metric(label="Overlapped Docs", value="%d" % duplicates_count)
        col4.metric(label="Contaminated Ratio", value="%.2f%%" % (contaminate_ratio * 100))
|
276 |
with code:
|
277 |
st.code(
|
278 |
'''
|
279 |
from nltk import ngrams
|
280 |
+
from datasketch import MinHash, MinHashLSH
|
281 |
+
|
282 |
+
def process_data(df):
|
283 |
+
minhashes = {}
|
284 |
+
for idx, r in df.iterrows():
|
285 |
+
minhash = MinHash(num_perm=128)
|
286 |
+
for d in ngrams(r['text'], 13):
|
287 |
+
s = "".join(d).encode('utf-8')
|
288 |
+
minhash.update(s)
|
289 |
+
minhashes[idx] = minhash
|
290 |
+
return minhashes
|
291 |
+
|
292 |
+
train_minhashes = process_data(train_dataset)
|
293 |
+
test_minhashes = process_data(test_dataset)
|
294 |
|
295 |
+
lsh = MinHashLSH(threshold=0.8, num_perm=128)
|
|
|
296 |
|
297 |
+
for idx, minhash in train_minhashes.items():
|
298 |
+
lsh.insert(idx, minhash)
|
299 |
|
300 |
+
duplicates_count = 0
|
301 |
+
for idx, minhash in test_minhashes.items():
|
302 |
+
result = lsh.query(minhash)
|
303 |
+
if len(result) > 0:
|
304 |
+
duplicates_count += 1
|
305 |
|
306 |
train_dataset_count = len(train_dataset)
|
307 |
test_dataset_count = len(test_dataset)
|
308 |
+
contaminate_ratio = duplicates_count / test_dataset_count
|
309 |
'''
|
310 |
)
|
311 |
|