Spaces:
Sleeping
Sleeping
modify app.py
Browse files
app.py
CHANGED
@@ -10,6 +10,21 @@ from pathlib import Path
|
|
10 |
# 定义 tiktoken 编码器
|
11 |
encoding = tiktoken.get_encoding("cl100k_base")
|
12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
# MGTHuman 类
|
14 |
class MGTHuman(datasets.GeneratorBasedBuilder):
|
15 |
VERSION = datasets.Version("1.0.0")
|
@@ -23,6 +38,22 @@ class MGTHuman(datasets.GeneratorBasedBuilder):
|
|
23 |
]
|
24 |
DEFAULT_CONFIG_NAME = "human"
|
25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
def truncate_text(self, text, max_tokens=2048):
|
27 |
tokens = encoding.encode(text, allowed_special={'<|endoftext|>'})
|
28 |
if len(tokens) > max_tokens:
|
|
|
10 |
# 定义 tiktoken 编码器
|
11 |
encoding = tiktoken.get_encoding("cl100k_base")
|
12 |
|
13 |
+
_CITATION = """\
|
14 |
+
@InProceedings{huggingface:dataset,
|
15 |
+
title = {MGT detection},
|
16 |
+
author={Trustworthy AI Lab},
|
17 |
+
year={2024}
|
18 |
+
}
|
19 |
+
"""
|
20 |
+
|
21 |
+
_DESCRIPTION = """\
|
22 |
+
For detecting machine generated text.
|
23 |
+
"""
|
24 |
+
|
25 |
+
_HOMEPAGE = ""
|
26 |
+
_LICENSE = ""
|
27 |
+
|
28 |
# MGTHuman 类
|
29 |
class MGTHuman(datasets.GeneratorBasedBuilder):
|
30 |
VERSION = datasets.Version("1.0.0")
|
|
|
38 |
]
|
39 |
DEFAULT_CONFIG_NAME = "human"
|
40 |
|
41 |
+
def _info(self):
|
42 |
+
features = datasets.Features(
|
43 |
+
{
|
44 |
+
"id": datasets.Value("int32"),
|
45 |
+
"text": datasets.Value("string"),
|
46 |
+
"file": datasets.Value("string"),
|
47 |
+
}
|
48 |
+
)
|
49 |
+
return datasets.DatasetInfo(
|
50 |
+
description=_DESCRIPTION,
|
51 |
+
features=features,
|
52 |
+
homepage=_HOMEPAGE,
|
53 |
+
license=_LICENSE,
|
54 |
+
citation=_CITATION,
|
55 |
+
)
|
56 |
+
|
57 |
def truncate_text(self, text, max_tokens=2048):
|
58 |
tokens = encoding.encode(text, allowed_special={'<|endoftext|>'})
|
59 |
if len(tokens) > max_tokens:
|