songyiliao CoinW commited on
Commit
1244519
·
verified ·
1 Parent(s): e293588

feat: initial cmommit (#1)

Browse files

- feat: initial cmommit (5039b3ff5c805715fb723be2a373267cdf886398)


Co-authored-by: Coin W <CoinW@users.noreply.huggingface.co>

LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Shuhai Zhang
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
MMD.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
+
4
+ def flexible_kernel(X, Y, X_org, Y_org, sigma, sigma0=0.1, epsilon=1e-08):
5
+ """Flexible kernel calculation as in MMDu."""
6
+ Dxy = Pdist2(X, Y)
7
+ Dxy_org = Pdist2(X_org, Y_org)
8
+ L = 1
9
+ Kxy = (1 - epsilon) * torch.exp(
10
+ -((Dxy / sigma0) ** L) - Dxy_org / sigma
11
+ ) + epsilon * torch.exp(-Dxy_org / sigma)
12
+ return Kxy
13
+
14
+
15
+ def MMD_Diff_Var(Kyy, Kzz, Kxy, Kxz, epsilon=1e-08):
16
+ """Compute the variance of the difference statistic MMDXY - MMDXZ."""
17
+ """Referenced from: https://github.com/eugenium/MMD/blob/master/mmd.py"""
18
+ m = Kxy.shape[0]
19
+ n = Kyy.shape[0]
20
+ r = Kzz.shape[0]
21
+
22
+ # Remove diagonal elements
23
+ Kyynd = Kyy - torch.diag(torch.diag(Kyy))
24
+ Kzznd = Kzz - torch.diag(torch.diag(Kzz))
25
+
26
+ u_yy = torch.sum(Kyynd) * (1.0 / (n * (n - 1)))
27
+ u_zz = torch.sum(Kzznd) * (1.0 / (r * (r - 1)))
28
+ u_xy = torch.sum(Kxy) / (m * n)
29
+ u_xz = torch.sum(Kxz) / (m * r)
30
+
31
+ t1 = (1.0 / n**3) * torch.sum(Kyynd.T @ Kyynd) - u_yy**2
32
+ t2 = (1.0 / (n**2 * m)) * torch.sum(Kxy.T @ Kxy) - u_xy**2
33
+ t3 = (1.0 / (n * m**2)) * torch.sum(Kxy @ Kxy.T) - u_xy**2
34
+ t4 = (1.0 / r**3) * torch.sum(Kzznd.T @ Kzznd) - u_zz**2
35
+ t5 = (1.0 / (r * m**2)) * torch.sum(Kxz @ Kxz.T) - u_xz**2
36
+ t6 = (1.0 / (r**2 * m)) * torch.sum(Kxz.T @ Kxz) - u_xz**2
37
+ t7 = (1.0 / (n**2 * m)) * torch.sum(Kyynd @ Kxy.T) - u_yy * u_xy
38
+ t8 = (1.0 / (n * m * r)) * torch.sum(Kxy.T @ Kxz) - u_xz * u_xy
39
+ t9 = (1.0 / (r**2 * m)) * torch.sum(Kzznd @ Kxz.T) - u_zz * u_xz
40
+
41
+ if type(epsilon) == torch.Tensor:
42
+ epsilon_tensor = epsilon.clone().detach()
43
+ else:
44
+ epsilon_tensor = torch.tensor(epsilon, device=Kyy.device)
45
+ zeta1 = torch.max(t1 + t2 + t3 + t4 + t5 + t6 - 2 * (t7 + t8 + t9), epsilon_tensor)
46
+ zeta2 = torch.max(
47
+ (1 / m / (m - 1)) * torch.sum((Kyynd - Kzznd - Kxy.T - Kxy + Kxz + Kxz.T) ** 2)
48
+ - (u_yy - 2 * u_xy - (u_zz - 2 * u_xz)) ** 2,
49
+ epsilon_tensor,
50
+ )
51
+
52
+ data = {
53
+ "t1": t1.item(),
54
+ "t2": t2.item(),
55
+ "t3": t3.item(),
56
+ "t4": t4.item(),
57
+ "t5": t5.item(),
58
+ "t6": t6.item(),
59
+ "t7": t7.item(),
60
+ "t8": t8.item(),
61
+ "t9": t9.item(),
62
+ "zeta1": zeta1.item(),
63
+ "zeta2": zeta2.item(),
64
+ }
65
+
66
+ Var = (4 * (m - 2) / (m * (m - 1))) * zeta1
67
+ Var_z2 = Var + (2.0 / (m * (m - 1))) * zeta2
68
+
69
+ return Var, Var_z2, data
70
+
71
+
72
+ def Pdist2(x, y):
73
+ """compute the paired distance between x and y."""
74
+ x_norm = (x**2).sum(1).view(-1, 1)
75
+ if y is not None:
76
+ y_norm = (y**2).sum(1).view(1, -1)
77
+ else:
78
+ y = x
79
+ y_norm = x_norm.view(1, -1)
80
+ Pdist = x_norm + y_norm - 2.0 * torch.mm(x, torch.transpose(y, 0, 1))
81
+ Pdist[Pdist < 0] = 0
82
+ return Pdist
83
+
84
+
85
+ def MMD_batch2(
86
+ Fea,
87
+ len_s,
88
+ Fea_org,
89
+ sigma,
90
+ sigma0=0.1,
91
+ epsilon=10 ** (-10),
92
+ is_var_computed=True,
93
+ use_1sample_U=True,
94
+ coeff_xy=2,
95
+ ):
96
+ X = Fea[0:len_s, :]
97
+ Y = Fea[len_s:, :]
98
+ L = 1 # generalized Gaussian (if L>1)
99
+
100
+ nx = X.shape[0]
101
+ ny = Y.shape[0]
102
+ Dxx = Pdist2(X, X)
103
+ Dyy = torch.zeros(Fea.shape[0] - len_s, 1).to(Dxx.device)
104
+ # Dyy = Pdist2(Y, Y)
105
+ Dxy = Pdist2(X, Y).transpose(0, 1)
106
+ Kx = torch.exp(-Dxx / sigma0)
107
+ Ky = torch.exp(-Dyy / sigma0)
108
+ Kxy = torch.exp(-Dxy / sigma0)
109
+
110
+ nx = Kx.shape[0]
111
+
112
+ is_unbiased = False
113
+ xx = torch.div((torch.sum(Kx)), (nx * nx))
114
+ yy = Ky.reshape(-1)
115
+ xy = torch.div(torch.sum(Kxy, dim=1), (nx))
116
+
117
+ mmd2 = xx - 2 * xy + yy
118
+ return mmd2
119
+
120
+
121
+ # MMD for three samples
122
+ def MMD_3_Sample_Test(
123
+ ref_fea,
124
+ fea_y,
125
+ fea_z,
126
+ ref_fea_org,
127
+ fea_y_org,
128
+ fea_z_org,
129
+ sigma,
130
+ sigma0,
131
+ epsilon,
132
+ alpha,
133
+ ):
134
+ """Run three-sample test (TST) using deep kernel kernel."""
135
+ X = ref_fea.clone().detach()
136
+ Y = fea_y.clone().detach()
137
+ Z = fea_z.clone().detach()
138
+ X_org = ref_fea_org.clone().detach()
139
+ Y_org = fea_y_org.clone().detach()
140
+ Z_org = fea_z_org.clone().detach()
141
+
142
+ Kyy = flexible_kernel(Y, Y, Y_org, Y_org, sigma, sigma0, epsilon)
143
+ Kzz = flexible_kernel(Z, Z, Z_org, Z_org, sigma, sigma0, epsilon)
144
+ Kxy = flexible_kernel(X, Y, X_org, Y_org, sigma, sigma0, epsilon)
145
+ Kxz = flexible_kernel(X, Z, X_org, Z_org, sigma, sigma0, epsilon)
146
+
147
+ Kyynd = Kyy - torch.diag(torch.diag(Kyy))
148
+ Kzznd = Kzz - torch.diag(torch.diag(Kzz))
149
+
150
+ Diff_Var, _, _ = MMD_Diff_Var(Kyy, Kzz, Kxy, Kxz, epsilon)
151
+
152
+ u_yy = torch.sum(Kyynd) / (Y.shape[0] * (Y.shape[0] - 1))
153
+ u_zz = torch.sum(Kzznd) / (Z.shape[0] * (Z.shape[0] - 1))
154
+ u_xy = torch.sum(Kxy) / (X.shape[0] * Y.shape[0])
155
+ u_xz = torch.sum(Kxz) / (X.shape[0] * Z.shape[0])
156
+
157
+ t = u_yy - 2 * u_xy - (u_zz - 2 * u_xz)
158
+ if Diff_Var.item() <= 0:
159
+ Diff_Var = torch.max(epsilon, torch.tensor(1e-08))
160
+ p_value = torch.distributions.Normal(0, 1).cdf(-t / torch.sqrt((Diff_Var)))
161
+ t = t / torch.sqrt(Diff_Var)
162
+
163
+ if p_value > alpha:
164
+ h = 0
165
+ else:
166
+ h = 1
167
+
168
+ return h, p_value.item(), t.item()
app.py ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
+ from utils import init_random_seeds, config
4
+ from relative_tester import RelativeTester
5
+
6
+ # from two_sample_tester import two_sample_tester
7
+
8
+
9
+ def run_test(input_text):
10
+ if not input_text:
11
+ return "Now that you've built a demo, you'll probably want to share it with others. Gradio demos can be shared in two ways: using a temporary share link or permanent hosting on Spaces."
12
+ # return two_sample_tester.test(input_text.strip())
13
+ return relative_tester.test(input_text.strip())
14
+ return f"Prediction: Human (Mocked for {input_text})"
15
+
16
+
17
+ css = """
18
+ #header { text-align: center; font-size: 3em; margin-bottom: 20px; color: #black; font-weight: bold;}
19
+ #output-text { font-weight: bold; font-size: 1.2em; border-radius: 10px; padding: 10px; background-color: #f4f4f4;}
20
+ .links {
21
+ display: flex;
22
+ justify-content: flex-end;
23
+ gap: 10px;
24
+ margin-right: 10px;
25
+ align-items: center;
26
+ font-size: 0.9em;
27
+ color: #ADD8E6;
28
+ }
29
+ .separator {
30
+ margin: 0 5px;
31
+ color: #000;
32
+ }
33
+
34
+ /* Adjusting layout for Input Text and Inference Result */
35
+ .input-row {
36
+ display: flex;
37
+ width: 100%;
38
+ }
39
+
40
+ .input-text {
41
+ flex: 3; /* 4 parts of the row */
42
+ margin-right: 1px;
43
+ border-radius: 8px;
44
+ padding: 12px;
45
+ border: 2px soild #d1d1d1;
46
+ }
47
+
48
+ .output-text {
49
+ flex: 1; /* 1 part of the row */
50
+ border-radius: 8px;
51
+ padding: 12px;
52
+ border: 2px soild #d1d1d1;
53
+ }
54
+
55
+ /* Set button widths to match the Select Model width */
56
+ .button {
57
+ width: 250px; /* Same as the select box width */
58
+ height: 100px; /* Button height */
59
+ background-color: #ADD8E6;
60
+ color: white;
61
+ font-weight: bold;
62
+ border-radius: 8px;
63
+ }
64
+
65
+ .button:hover {
66
+ background-color: #0000FF;
67
+ }
68
+
69
+ /* Set height for the Select Model dropdown */
70
+ .select {
71
+ height: 100px; /* Set height to 100px */
72
+ }
73
+
74
+ /* Accordion Styling */
75
+ .accordion {
76
+ width: 100%; /* Set the width of the accordion to match the parent */
77
+ max-height: 200px; /* Set a max-height for accordion */
78
+ overflow-y: auto; /* Allow scrolling if the content exceeds max height */
79
+ margin-bottom: 10px; /* Add space below accordion */
80
+ box-sizing: border-box; /* Ensure padding is included in width/height */
81
+ }
82
+
83
+ /* Accordion content max-height */
84
+ .accordion-content {
85
+ max-height: 200px; /* Limit the height of the content */
86
+ overflow-y: auto; /* Add a scrollbar if content overflows */
87
+ }
88
+
89
+ .demo-banner {
90
+ background-color: #f3f4f6;
91
+ padding: 20px;
92
+ border-radius: 10px;
93
+ font-size: 1.1em;
94
+ font-weight: bold;
95
+ text-align: center;
96
+ margin-bottom: 20px;
97
+ color: #ff5722;
98
+ }
99
+ """
100
+
101
+ # Gradio App
102
+ with gr.Blocks(css=css) as app:
103
+ with gr.Row():
104
+ gr.HTML('<div id="header">R-detect On HuggingFace</div>')
105
+ with gr.Row():
106
+ gr.HTML(
107
+ """
108
+ <div class="links">
109
+ <a href="https://openreview.net/forum?id=z9j7wctoGV" target="_blank">Paper</a>
110
+ <span class="separator">|</span>
111
+ <a href="https://github.com/xLearn-AU/R-Detect" target="_blank">Code</a>
112
+ <span class="separator">|</span>
113
+ <a href="mailto:1730421718@qq.com" target="_blank">Contact</a>
114
+ </div>
115
+ """
116
+ )
117
+
118
+ with gr.Row():
119
+ gr.HTML(
120
+ '<div class="demo-banner">This is a demo. For the full version, please refer to the <a href="https://github.com/xLearn-AU/R-Detect" target="_blank">GitHub</a> or the <a href="https://openreview.net/forum?id=z9j7wctoGV" target="_blank">Paper</a>.</div>'
121
+ )
122
+
123
+ with gr.Row():
124
+ input_text = gr.Textbox(
125
+ label="Input Text",
126
+ placeholder="Enter Text Here",
127
+ lines=8,
128
+ elem_classes=["input-text"], # Applying the CSS class
129
+ value="Hugging Face is a company and community that has become one of the leading platforms in the field of natural language processing (NLP). It is best known for developing and maintaining the Transformers library, which simplifies the use of state-of-the-art machine learning models for tasks such as text classification, language generation, translation, and more.",
130
+ )
131
+ output = gr.Textbox(
132
+ label="Inference Result",
133
+ placeholder="Made by Human or AI",
134
+ elem_id="output-text",
135
+ lines=8,
136
+ elem_classes=["output-text"],
137
+ )
138
+ with gr.Row():
139
+ submit_button = gr.Button(
140
+ "Run Detection", variant="primary", elem_classes=["button"]
141
+ )
142
+ clear_button = gr.Button("Clear", variant="secondary", elem_classes=["button"])
143
+
144
+ submit_button.click(run_test, inputs=[input_text], outputs=output)
145
+ clear_button.click(lambda: ("", ""), inputs=[], outputs=[input_text, output])
146
+
147
+ with gr.Accordion("Disclaimer", open=False, elem_classes=["accordion"]):
148
+ gr.Markdown(
149
+ """
150
+ - **Disclaimer**: This tool is for demonstration purposes only. It is not a foolproof AI detector.
151
+ - **Accuracy**: Results may vary based on input length and quality.
152
+ """
153
+ )
154
+
155
+ with gr.Accordion("Citations", open=False, elem_classes=["accordion"]):
156
+ gr.Markdown(
157
+ """
158
+ ```
159
+ @inproceedings{zhangs2024MMDMP,
160
+ title={Detecting Machine-Generated Texts by Multi-Population Aware Optimization for Maximum Mean Discrepancy},
161
+ author={Zhang, Shuhai and Song, Yiliao and Yang, Jiahao and Li, Yuanqing and Han, Bo and Tan, Mingkui},
162
+ booktitle = {International Conference on Learning Representations (ICLR)},
163
+ year={2024}
164
+ }
165
+ ```
166
+ """
167
+ )
168
+
169
+
170
+ if __name__ == "__main__":
171
+ config["use_gpu"] = False
172
+ config["local_model"] = ""
173
+ config["feature_ref_HWT"] = "./feature_ref_HWT_500.pt"
174
+ config["feature_ref_MGT"] = "./feature_ref_MGT_500.pt"
175
+ init_random_seeds()
176
+ relative_tester = RelativeTester()
177
+ app.launch()
data_loader.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+ import tqdm
3
+ import datasets
4
+ import re
5
+ import transformers
6
+ import numpy as np
7
+ from utils import MGT, HWT, config
8
+
9
+ preproc_tokenizer = transformers.AutoTokenizer.from_pretrained(
10
+ "google-t5/t5-small", model_max_length=512
11
+ )
12
+
13
+
14
def process_spaces(text):
    """Clean up tokenizer-style spacing, quotes and newlines in ``text``.

    The replacements are applied in order (later ones see the output of
    earlier ones), the result is stripped, a few newline variants are
    normalized, and runs of newlines are collapsed to one.
    """
    spacing_fixes = (
        (" ,", ","),
        (" .", "."),
        (" ?", "?"),
        (" !", "!"),
        (" ;", ";"),
        (" '", "'"),
        (" ’ ", "'"),
        (" :", ":"),
        ("<newline>", "\n"),
        ("`` ", '"'),
        (" ''", '"'),
        ("''", '"'),
        (".. ", "... "),
        (" )", ")"),
        ("( ", "("),
        (" n't", "n't"),
        (" i ", " I "),
        (" i'", " I'"),
        ("\\'", "'"),
        ("\n ", "\n"),
    )
    for old, new in spacing_fixes:
        text = text.replace(old, new)
    text = text.strip()

    newline_fixes = (("\r\n", "\n"), ("\\n", ""), ("!\n", ""))
    for old, new in newline_fixes:
        text = text.replace(old, new)
    return re.sub("\n+", "\n", text)
40
+
41
+
42
def trim_to_shorter_length(texta, textb):
    """Truncate both texts to the word count of the shorter one.

    Words are the pieces obtained by splitting on single spaces.
    """
    words_a = texta.split(" ")
    words_b = textb.split(" ")
    keep = min(len(words_a), len(words_b))
    return " ".join(words_a[:keep]), " ".join(words_b[:keep])
48
+
49
+
50
def load_HC3():
    """Load the HC3 dataset and flatten it into parallel text/label lists.

    The dataset comes from ``config["local_dataset"]`` when that is set,
    otherwise from the "Hello-SimpleAI/HC3" hub dataset. Only items whose
    first human answer and first ChatGPT answer both have more than five
    words are kept.

    Returns:
        ``{"text": [...], "label": [...]}`` where each kept item contributes
        its first human answer (label ``HWT``) followed by its first ChatGPT
        answer (label ``MGT``), both passed through ``process_spaces``.
    """
    local_path = config["local_dataset"]
    if local_path:
        print("Loading local HC3 dataset", local_path)
        ds = datasets.load_dataset(local_path, name="all", trust_remote_code=True)
    else:
        print("Loading remote HC3 dataset")
        ds = datasets.load_dataset("Hello-SimpleAI/HC3", name="all")
    ds = ds["train"]  # DatasetDict -> Dataset

    def _has_usable_answers(item):
        # Both answer lists must be non-empty, with a first answer > 5 words.
        human, chatgpt = item["human_answers"], item["chatgpt_answers"]
        return (
            len(human) > 0
            and len(chatgpt) > 0
            and len(human[0].split()) > 5
            and len(chatgpt[0].split()) > 5
        )

    filtered_ds = [item for item in ds if _has_usable_answers(item)]

    texts, labels = [], []
    for item in tqdm.tqdm(filtered_ds, desc="Parsing data"):
        texts.append(process_spaces(item["human_answers"][0]))
        labels.append(HWT)
        texts.append(process_spaces(item["chatgpt_answers"][0]))
        labels.append(MGT)
    return {"text": texts, "label": labels}
83
+
84
+
85
def filter_data(data_o, long_train_threshold_low=150, long_train_threshold_high=512):
    """Split texts by label, keep those in a token-length range, and pair them.

    Args:
        data_o: Dict with parallel ``"text"`` and ``"label"`` lists.
        long_train_threshold_low: Minimum token count (inclusive).
        long_train_threshold_high: Maximum token count (inclusive).

    Returns:
        ``{HWT: [...], MGT: [...]}`` with equally long lists. The i-th kept
        HWT text and the i-th kept MGT text are trimmed to the same number
        of words.
    """

    def _texts_labelled(wanted):
        return [
            text for text, label in zip(data_o["text"], data_o["label"]) if label == wanted
        ]

    def _in_token_range(texts):
        # Token counts come from preproc_tokenizer; the bounds are inclusive.
        token_ids = preproc_tokenizer(texts)["input_ids"]
        return [
            text
            for text, ids in zip(texts, token_ids)
            if long_train_threshold_low <= len(ids) <= long_train_threshold_high
        ]

    data_HWT = _texts_labelled(HWT)
    data_MGT = _texts_labelled(MGT)

    long_HWT = _in_token_range(data_HWT)
    long_MGT = _in_token_range(data_MGT)

    # Stats about the remaining data (HWT side only).
    print(f"Total number of samples: {len(long_HWT)}")
    print(f"Average number of words: {np.mean([len(x.split()) for x in long_HWT])}")

    data = {
        HWT: [],
        MGT: [],
    }
    # zip stops at the shorter list, so both outputs end up the same length.
    for human_text, machine_text in zip(long_HWT, long_MGT):
        human_text, machine_text = trim_to_shorter_length(human_text, machine_text)
        data[HWT].append(human_text)
        data[MGT].append(machine_text)

    return data
126
+
127
+
128
+ # Test code
129
+ # data_o = load_HC3()
130
+ # data = filter_data(data_o)
131
+ # real = data[HWT] # [:args.train_real_num] len== n_samples, many sentences of words
132
+ # generated = data[MGT]
133
+ # print(real[:5])
134
+ # print(generated[:5])
demo_text_gpt.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Hugging Face is a company and community that has become one of the leading platforms in the field of natural language processing (NLP). It is best known for developing and maintaining the Transformers library, which simplifies the use of state-of-the-art machine learning models for tasks such as text classification, language generation, translation, and more.
download_model_and_dataset.sh ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # If you are in China, you can set the following mirror endpoint to download the model and dataset
4
+ # export HF_ENDPOINT=https://hf-mirror.com
5
+
6
+ huggingface-cli download --resume-download openai-community/roberta-base-openai-detector --local-dir llm-models/roberta-base
7
+ huggingface-cli download --repo-type dataset --resume-download Hello-SimpleAI/HC3 --local-dir datasets/HC3
8
+
feature_ref_HWT_500.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:63fefc57e0086d1ff5becda59300f8e530677dc6acb2b79e70f2f6255efab318
3
+ size 153601240
feature_ref_MGT_500.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ecf694cd8892975c37a4a008b7fcb1ab5dce3c4d6986e556cd7ab86b886db98
3
+ size 153601240
feature_ref_for_test.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f690f4244c0b245823f847a75d1fd25a151ad96075744156e628c0c1dda65302
3
+ size 42240786
feature_ref_generater.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import tqdm
3
+ import numpy as np
4
+ import nltk
5
+ import argparse
6
+
7
+ from utils import FeatureExtractor, HWT, MGT, config
8
+ from roberta_model_loader import RobertaModelLoader
9
+ from meta_train import net
10
+ from data_loader import load_HC3, filter_data
11
+
12
+
13
+ if __name__ == "__main__":
14
+ parser = argparse.ArgumentParser(description="R-Detect the file content")
15
+ parser.add_argument(
16
+ "--target",
17
+ type=str,
18
+ help="The target of generated feature ref. Default is MGT",
19
+ default=MGT,
20
+ )
21
+ parser.add_argument(
22
+ "--sample_size",
23
+ type=int,
24
+ help="The sample size of generated feature ref. Default is 1000, must bigger than 100 and smaller than 30000",
25
+ default=1000,
26
+ )
27
+ parser.add_argument(
28
+ "--use_gpu",
29
+ action="store_true",
30
+ help="Use GPU or not.",
31
+ )
32
+ parser.add_argument(
33
+ "--local_model",
34
+ type=str,
35
+ help="Use local model or not, you need to download the model first, and set the path. Default is Empty. Script will use remote if this param is empty.",
36
+ default="",
37
+ )
38
+ parser.add_argument(
39
+ "--local_dataset",
40
+ type=str,
41
+ help="Use local dataset or not, you need to download the dataset first, and set the path. Default is Empty",
42
+ default="",
43
+ )
44
+ args = parser.parse_args()
45
+ config["target"] = args.target
46
+ if args.sample_size < 100 or args.sample_size > 30000:
47
+ print("Sample size must be between 100 and 30000, set to 1000")
48
+ config["sample_size"] = 1000
49
+ else:
50
+ config["sample_size"] = args.sample_size
51
+ config["use_gpu"] = args.use_gpu
52
+ config["local_model"] = args.local_model
53
+ config["local_dataset"] = args.local_dataset
54
+ target = HWT if config["target"] == HWT else MGT
55
+ # load model and feature extractor
56
+ roberta_model = RobertaModelLoader()
57
+ feature_extractor = FeatureExtractor(roberta_model, net)
58
+ # load target data
59
+ data_o = load_HC3()
60
+ data = filter_data(data_o)
61
+ data = data[target]
62
+ # print(data[:3])
63
+
64
+ # split with nltk
65
+ nltk.download("punkt", quiet=True)
66
+ nltk.download("punkt_tab", quiet=True)
67
+ paragraphs = [nltk.sent_tokenize(paragraph)[1:-1] for paragraph in data]
68
+ data = [
69
+ sent for paragraph in paragraphs for sent in paragraph if 5 < len(sent.split())
70
+ ]
71
+ # print(data[:3])
72
+
73
+ # extract features
74
+ feature_ref = []
75
+ for i in tqdm.tqdm(
76
+ range(config["sample_size"]), desc=f"Generating feature ref for {target}"
77
+ ):
78
+ feature_ref.append(
79
+ feature_extractor.process(data[i], False).detach()
80
+ ) # detach to save memory
81
+ torch.save(
82
+ torch.cat(feature_ref, dim=0),
83
+ f"feature_ref_{target}_{config["sample_size"]}.pt",
84
+ )
85
+ print(f"Feature ref for {target} generated successfully")
feature_ref_loader.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import numpy as np
3
+
4
+ from utils import get_device, config
5
+
6
+ DEVICE = get_device()
7
+
8
+
9
+ def feature_ref_loader(feature_ref_file_name, num_ref=5000):
10
+ print("Feature Ref Loader load: ", feature_ref_file_name)
11
+ load_ref_data = torch.load(feature_ref_file_name, map_location=DEVICE) # cpu
12
+ load_ref_data = load_ref_data.to(DEVICE)
13
+ feature_ref = load_ref_data[np.random.permutation(load_ref_data.shape[0])][
14
+ :num_ref
15
+ ].to(DEVICE)
16
+ return feature_ref
17
+
18
+
19
+ feature_two_sample_tester_ref = feature_ref_loader("./feature_ref_for_test.pt")
logistic_regression_model.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce9afa92d745bff3bbfb4ae028db369cb35710a9099159a12ac1048fed3760e6
3
+ size 756
main.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import argparse
3
+ from relative_tester import RelativeTester
4
+ from utils import init_random_seeds, config, get_device
5
+
6
+
7
if __name__ == "__main__":
    # Command-line entry point: run the relative test on one text file.
    init_random_seeds()
    parser = argparse.ArgumentParser(description="R-Detect the file content")
    parser.add_argument(
        "--test_file",
        type=str,
        help="The file path of the test file. Default is demo_text_gpt.txt",
        default="./demo_text_gpt.txt",
    )
    parser.add_argument(
        "--use_gpu",
        action="store_true",
        help="Use GPU or not.",
    )
    parser.add_argument(
        "--local_model",
        type=str,
        help="Use local model or not, you need to download the model first, and set the path. Script will use remote if this param is empty.",
        default="",
    )
    parser.add_argument(
        "--feature_ref_HWT",
        type=str,
        help="The feature ref path of HWT. Script will use remote if this param is empty.",
        default="",
    )
    parser.add_argument(
        "--feature_ref_MGT",
        type=str,
        help="The feature ref path of MGT. Default is Empty",
        default="",
    )
    args = parser.parse_args()
    config["test_file"] = args.test_file
    config["use_gpu"] = args.use_gpu
    config["local_model"] = args.local_model
    config["feature_ref_HWT"] = args.feature_ref_HWT
    config["feature_ref_MGT"] = args.feature_ref_MGT
    # Plain string: the original f-string had no placeholders.
    print("Running on device", get_device())
    # Explicit encoding so the result does not depend on the platform locale.
    with open(config["test_file"], "r", encoding="utf-8") as file:
        content = file.read()
    # The tester is created after config is filled in, because it reads the
    # feature_ref_* paths from config when constructed.
    relative_tester = RelativeTester()
    print(relative_tester.test(content))
50
+ # print(content)
meta_train.py ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+ from collections import namedtuple
4
+ import math
5
+ from utils import get_device
6
+ from pytorch_transformers.modeling_bert import (
7
+ BertEncoder,
8
+ BertPreTrainedModel,
9
+ BertConfig,
10
+ )
11
+
12
+ DEVICE = get_device()
13
+
14
+ class GeLU(nn.Module):
15
+ """Implementation of the gelu activation function.
16
+ For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
17
+ 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
18
+ Also see https://arxiv.org/abs/1606.08415
19
+ """
20
+
21
+ def __init__(self):
22
+ super().__init__()
23
+
24
+ def forward(self, x):
25
+ return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
26
+
27
+
28
+ class BertLayerNorm(nn.Module):
29
+ def __init__(self, hidden_size, eps=1e-12):
30
+ """Construct a layernorm module in the TF style (epsilon inside the square root)."""
31
+ super(BertLayerNorm, self).__init__()
32
+ self.weight = nn.Parameter(torch.ones(hidden_size))
33
+ self.bias = nn.Parameter(torch.zeros(hidden_size))
34
+ self.variance_epsilon = eps
35
+
36
+ def forward(self, x):
37
+ u = x.mean(-1, keepdim=True)
38
+ s = (x - u).pow(2).mean(-1, keepdim=True)
39
+ x = (x - u) / torch.sqrt(s + self.variance_epsilon)
40
+ return self.weight * x + self.bias
41
+
42
+
43
+ class mlp_meta(nn.Module):
44
+ def __init__(self, config):
45
+ super().__init__()
46
+ self.mlp = nn.Sequential(
47
+ nn.Linear(config.hid_dim, config.hid_dim),
48
+ GeLU(),
49
+ BertLayerNorm(config.hid_dim, eps=1e-12),
50
+ nn.Dropout(config.dropout),
51
+ )
52
+
53
+ def forward(self, x):
54
+ return self.mlp(x)
55
+
56
+
57
+ class Bert_Transformer_Layer(BertPreTrainedModel):
58
+ def __init__(self, fusion_config):
59
+ super().__init__(BertConfig(**fusion_config))
60
+ bertconfig_fusion = BertConfig(**fusion_config)
61
+ self.encoder = BertEncoder(bertconfig_fusion)
62
+ self.init_weights()
63
+
64
+ def forward(self, input, mask=None):
65
+ """
66
+ input:(bs, 4, dim)
67
+ """
68
+ batch, feats, dim = input.size()
69
+ if mask is not None:
70
+ mask_ = torch.ones(size=(batch, feats), device=mask.device)
71
+ mask_[:, 1:] = mask
72
+ mask_ = torch.bmm(
73
+ mask_.view(batch, 1, -1).transpose(1, 2), mask_.view(batch, 1, -1)
74
+ )
75
+ mask_ = mask_.unsqueeze(1)
76
+
77
+ else:
78
+ mask = torch.Tensor([1.0]).to(input.device)
79
+ mask_ = mask.repeat(batch, 1, feats, feats)
80
+
81
+ extend_mask = (1 - mask_) * -10000
82
+ assert not extend_mask.requires_grad
83
+ head_mask = [None] * self.config.num_hidden_layers
84
+
85
+ enc_output = self.encoder(input, extend_mask, head_mask=head_mask)
86
+ output = enc_output[0]
87
+ all_attention = enc_output[1]
88
+
89
+ return output, all_attention
90
+
91
+
92
+ class mmdPreModel(nn.Module):
93
+ def __init__(
94
+ self,
95
+ config,
96
+ num_mlp=0,
97
+ transformer_flag=False,
98
+ num_hidden_layers=1,
99
+ mlp_flag=True,
100
+ ):
101
+ super(mmdPreModel, self).__init__()
102
+ self.num_mlp = num_mlp
103
+ self.transformer_flag = transformer_flag
104
+ self.mlp_flag = mlp_flag
105
+ token_num = config.token_num
106
+ self.mlp = nn.Sequential(
107
+ nn.Linear(config.in_dim, config.hid_dim),
108
+ GeLU(),
109
+ BertLayerNorm(config.hid_dim, eps=1e-12),
110
+ nn.Dropout(config.dropout),
111
+ # nn.Linear(config.hid_dim, config.out_dim),
112
+ )
113
+ self.fusion_config = {
114
+ "hidden_size": config.in_dim,
115
+ "num_hidden_layers": num_hidden_layers,
116
+ "num_attention_heads": 4,
117
+ "output_attentions": True,
118
+ }
119
+ if self.num_mlp > 0:
120
+ self.mlp2 = nn.ModuleList([mlp_meta(config) for _ in range(self.num_mlp)])
121
+ if self.transformer_flag:
122
+ self.transformer = Bert_Transformer_Layer(self.fusion_config)
123
+ self.feature = nn.Linear(config.hid_dim * token_num, config.out_dim)
124
+
125
+ def forward(self, features):
126
+ """
127
+ input: [batch, token_num, hidden_size], output: [batch, token_num * config.out_dim]
128
+ """
129
+
130
+ if self.transformer_flag:
131
+ features, _ = self.transformer(features)
132
+ if self.mlp_flag:
133
+ features = self.mlp(features)
134
+
135
+ if self.num_mlp > 0:
136
+ # features = self.mlp2(features)
137
+ for _ in range(1):
138
+ for mlp in self.mlp2:
139
+ features = mlp(features)
140
+
141
+ features = self.feature(features.view(features.shape[0], -1))
142
+ return features # features.view(features.shape[0], -1)
143
+
144
+
145
+ class NetLoader:
146
+ def __init__(self):
147
+ token_num, hidden_size = 100, 768
148
+ Config = namedtuple(
149
+ "Config", ["in_dim", "hid_dim", "dropout", "out_dim", "token_num"]
150
+ )
151
+ config = Config(
152
+ in_dim=hidden_size,
153
+ token_num=token_num,
154
+ hid_dim=512,
155
+ dropout=0.2,
156
+ out_dim=300,
157
+ )
158
+ self.config = config
159
+ self.net = mmdPreModel(
160
+ config=config, num_mlp=0, transformer_flag=True, num_hidden_layers=1
161
+ )
162
+ checkpoint_filename = "./net.pt"
163
+ checkpoint = torch.load(checkpoint_filename, map_location=DEVICE)
164
+ self.net.load_state_dict(checkpoint["net"])
165
+ self.sigma, self.sigma0_u, self.ep = (
166
+ checkpoint["sigma"],
167
+ checkpoint["sigma0_u"],
168
+ checkpoint["ep"],
169
+ )
170
+ self.net = self.net.to(DEVICE)
171
+ self.sigma, self.sigma0_u, self.ep = (
172
+ self.sigma.to(DEVICE),
173
+ self.sigma0_u.to(DEVICE),
174
+ self.ep.to(DEVICE),
175
+ )
176
+
177
+
178
+ net = NetLoader()
net.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51eddf445c758f5d42e8f4783b1e8ecc5833db5a9bc170203478d73a761ddd6a
3
+ size 91379438
regression_model_loader.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import joblib
2
+ from utils import get_device
3
+
4
+ DEVICE = get_device()
5
+
6
+
7
class RegressionModelLoader:
    """Load the bundled regression model and expose it as ``self.model``."""

    def __init__(self):
        print("Regression Model init")
        self.model = self.load("./logistic_regression_model.pkl")

    def load(self, regression_model_file_name):
        """Deserialize and return the model stored at the given path."""
        loaded_model = joblib.load(regression_model_file_name)
        return loaded_model
14
+
15
+
16
+ regression_model = RegressionModelLoader()
relative_tester.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import nltk
3
+ from roberta_model_loader import RobertaModelLoader
4
+ from feature_ref_loader import feature_ref_loader
5
+ from meta_train import net
6
+ from regression_model_loader import regression_model
7
+ from MMD import MMD_3_Sample_Test
8
+ from utils import FeatureExtractor, HWT, MGT, config
9
+
10
+
11
+ class RelativeTester:
12
+ def __init__(self):
13
+ print("Relative Tester init")
14
+ self.feature_extractor = FeatureExtractor(RobertaModelLoader(), net)
15
+ self.feature_hwt_ref = feature_ref_loader(config["feature_ref_HWT"])
16
+ self.feature_mgt_ref = feature_ref_loader(config["feature_ref_MGT"])
17
+
18
+ def sents_split(self, text):
19
+ nltk.download("punkt", quiet=True)
20
+ nltk.download("punkt_tab", quiet=True)
21
+ sents = nltk.sent_tokenize(text)
22
+ return [sent for sent in sents if 5 < len(sent.split())]
23
+
24
+ def test(self, input_text, threshold=0.2, round=20):
25
+ print("Relative Tester test")
26
+ # Split the input text
27
+ sents = self.sents_split(input_text)
28
+ print("DEBUG: sents:", len(sents))
29
+ # Extract features
30
+ feature_for_sents = self.feature_extractor.process_sents(sents, False)
31
+ if len(feature_for_sents) <= 1:
32
+ # print("DEBUG: tooshort")
33
+ return "Too short to test! Please input more than 2 sentences."
34
+ # Cutoff the features
35
+ min_len = min(
36
+ len(feature_for_sents),
37
+ len(self.feature_hwt_ref),
38
+ len(self.feature_mgt_ref),
39
+ )
40
+ # Calculate MMD
41
+ h_u_list = []
42
+ p_value_list = []
43
+ t_list = []
44
+
45
+ for i in range(round):
46
+ feature_for_sents_sample = feature_for_sents[
47
+ torch.randperm(len(feature_for_sents))[:min_len]
48
+ ]
49
+ feature_hwt_ref_sample = self.feature_hwt_ref[
50
+ torch.randperm(len(self.feature_hwt_ref))[:min_len]
51
+ ]
52
+ feature_mgt_ref_sample = self.feature_mgt_ref[
53
+ torch.randperm(len(self.feature_mgt_ref))[:min_len]
54
+ ]
55
+ h_u, p_value, t, *rest = MMD_3_Sample_Test(
56
+ net.net(feature_for_sents_sample),
57
+ net.net(feature_hwt_ref_sample),
58
+ net.net(feature_mgt_ref_sample),
59
+ feature_for_sents_sample.view(feature_for_sents_sample.shape[0], -1),
60
+ feature_hwt_ref_sample.view(feature_hwt_ref_sample.shape[0], -1),
61
+ feature_mgt_ref_sample.view(feature_mgt_ref_sample.shape[0], -1),
62
+ net.sigma,
63
+ net.sigma0_u,
64
+ net.ep,
65
+ 0.05,
66
+ )
67
+
68
+ h_u_list.append(h_u)
69
+ p_value_list.append(p_value)
70
+ t_list.append(t)
71
+
72
+ power = sum(h_u_list) / len(h_u_list)
73
+ print("DEBUG: power:", power)
74
+ print("DEBUG: power list:", h_u_list)
75
+ # Return the result
76
+ return "Human" if power <= threshold else "AI"
requirements.txt ADDED
Binary file (184 Bytes). View file
 
roberta_model_loader.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import RobertaTokenizer, RobertaModel
2
+ import torch
3
+ from utils import config
4
+
5
+
6
class RobertaModelLoader:
    """Load a RoBERTa tokenizer and model, preferring a configured local path.

    Args:
        model_name: Model identifier used when no local model is configured.
        cache_dir: Directory passed to ``from_pretrained`` as ``cache_dir``.

    After construction the pair is available as ``self.tokenizer`` and
    ``self.model``.
    """

    def __init__(
        self,
        model_name="roberta-base-openai-detector",
        cache_dir=".cache",
    ):
        self.model_name = model_name
        self.cache_dir = cache_dir
        self.tokenizer, self.model = self.load_base_model_and_tokenizer()

    def load_base_model_and_tokenizer(self):
        """Return ``(tokenizer, model)`` loaded from the local path or by name.

        ``config["local_model"]`` takes precedence when it is set to a truthy
        value; otherwise ``self.model_name`` is used.
        """
        # .get() so a config without the key falls back to the remote model
        # instead of raising KeyError.
        local_model = config.get("local_model")
        if local_model:  # load model from local
            print("Load model from local: ", self.model_name, local_model)
            source = local_model
        else:
            print("Load model from remote: ", self.model_name)
            source = self.model_name

        tokenizer = RobertaTokenizer.from_pretrained(
            source, cache_dir=self.cache_dir
        )
        model = RobertaModel.from_pretrained(
            source,
            output_hidden_states=True,
            cache_dir=self.cache_dir,
        )
        return tokenizer, model
two_sample_tester.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from roberta_model_loader import RobertaModelLoader
3
+ from feature_ref_loader import feature_two_sample_tester_ref
4
+ from meta_train import net
5
+ from regression_model_loader import regression_model
6
+ from MMD import MMD_batch2
7
+ from utils import DEVICE, FeatureExtractor
8
+
9
+
10
class TwoSampleTester:
    """Label a text by feeding its ``MMD_batch2`` value to the regression model."""

    def __init__(self):
        print("TwoSample Tester init")
        self.net = net
        self.feature_extractor = FeatureExtractor(RobertaModelLoader(), net)

    def test(self, input_text):
        """Return "Human" for predicted label 0, "AI" for 1, otherwise ``None``."""
        print("TwoSample Tester test")
        # Feature of the input, appended after the reference features.
        input_feature = self.feature_extractor.process(input_text)
        stacked = torch.cat(
            [feature_two_sample_tester_ref, input_feature], dim=0
        )
        ref_count = feature_two_sample_tester_ref.shape[0]

        # MMD value, moved to the CPU for the regression model.
        mmd_value = MMD_batch2(
            stacked,
            ref_count,
            0,
            self.net.sigma,
            self.net.sigma0_u,
            self.net.ep,
        ).to("cpu")

        # The regression model expects a single-feature column.
        mmd_column = mmd_value.detach().numpy().reshape(-1, 1)
        label = int(regression_model.model.predict(mmd_column)[0])
        return {0: "Human", 1: "AI"}.get(label)
utils.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import random
3
+ import numpy as np
4
+
5
+ config = {}
6
+
7
+
8
def get_device():
    """Return ``cuda:0`` when ``config["use_gpu"]`` is truthy, else the CPU device."""
    if config.get("use_gpu", False):
        return torch.device("cuda:0")
    return torch.device("cpu")
12
+
13
+
14
+ HWT = "HWT"
15
+ MGT = "MGT"
16
+
17
+
18
def init_random_seeds(seed=0):
    """Seed the Python, NumPy and PyTorch RNGs and pin the cuDNN flags.

    Args:
        seed: Integer seed applied to every generator. Defaults to 0, the
            value that used to be hard-coded.
    """
    print("Init random seeds")
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Turn off cuDNN benchmarking and request deterministic cuDNN behaviour.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
27
+
28
+
29
class FeatureExtractor:
    """Turn text into masked hidden states, optionally passed through ``net``.

    Args:
        model: Object exposing ``tokenizer`` and ``model`` attributes.
        net: Optional object whose ``net`` attribute is applied to the masked
            hidden states when ``net_required`` is true.
    """

    def __init__(self, model, net=None):
        self.llm_model = model  # TODO: support different models
        self.net = net

    def process(self, text, net_required=True):
        """Return the feature tensor for a single ``text``.

        The text is tokenized with padding/truncation to 100 tokens and run
        through the language model. The last hidden state is multiplied by
        the attention mask so padding positions become zero. When
        ``net_required`` is true and a net was supplied, the result is passed
        through ``self.net.net``.
        """
        device = get_device()
        encoded = self.llm_model.tokenizer(
            [text],
            padding="max_length",
            truncation=True,
            max_length=100,
            return_tensors="pt",
        )
        encoded = encoded.to(device)

        model_out = self.llm_model.model(**encoded)

        # Broadcast the mask over the hidden dimension to zero the padding.
        mask = encoded["attention_mask"].unsqueeze(-1)
        masked_hidden = model_out.last_hidden_state * mask

        if not net_required or self.net is None:
            return masked_hidden
        return self.net.net(masked_hidden)

    def process_sents(self, sents, net_required=True):
        """Return the features of all ``sents`` concatenated along dim 0.

        An empty tensor is returned when ``sents`` yields nothing.
        """
        per_sentence = [self.process(sent, net_required) for sent in sents]
        if per_sentence:
            return torch.cat(per_sentence, dim=0)
        return torch.tensor([])