Evan73 committed on
Commit
36f0c73
·
1 Parent(s): 296f63c

modify app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -0
app.py CHANGED
@@ -10,6 +10,21 @@ from pathlib import Path
10
  # 定义 tiktoken 编码器
11
  encoding = tiktoken.get_encoding("cl100k_base")
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  # MGTHuman 类
14
  class MGTHuman(datasets.GeneratorBasedBuilder):
15
  VERSION = datasets.Version("1.0.0")
@@ -23,6 +38,22 @@ class MGTHuman(datasets.GeneratorBasedBuilder):
23
  ]
24
  DEFAULT_CONFIG_NAME = "human"
25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  def truncate_text(self, text, max_tokens=2048):
27
  tokens = encoding.encode(text, allowed_special={'<|endoftext|>'})
28
  if len(tokens) > max_tokens:
 
10
  # 定义 tiktoken 编码器
11
  encoding = tiktoken.get_encoding("cl100k_base")
12
 
13
+ _CITATION = """\
14
+ @InProceedings{huggingface:dataset,
15
+ title = {MGT detection},
16
+ author={Trustworthy AI Lab},
17
+ year={2024}
18
+ }
19
+ """
20
+
21
+ _DESCRIPTION = """\
22
+ For detecting machine generated text.
23
+ """
24
+
25
+ _HOMEPAGE = ""
26
+ _LICENSE = ""
27
+
28
  # MGTHuman 类
29
  class MGTHuman(datasets.GeneratorBasedBuilder):
30
  VERSION = datasets.Version("1.0.0")
 
38
  ]
39
  DEFAULT_CONFIG_NAME = "human"
40
 
41
+ def _info(self):
42
+ features = datasets.Features(
43
+ {
44
+ "id": datasets.Value("int32"),
45
+ "text": datasets.Value("string"),
46
+ "file": datasets.Value("string"),
47
+ }
48
+ )
49
+ return datasets.DatasetInfo(
50
+ description=_DESCRIPTION,
51
+ features=features,
52
+ homepage=_HOMEPAGE,
53
+ license=_LICENSE,
54
+ citation=_CITATION,
55
+ )
56
+
57
  def truncate_text(self, text, max_tokens=2048):
58
  tokens = encoding.encode(text, allowed_special={'<|endoftext|>'})
59
  if len(tokens) > max_tokens: