feat: initial commit (#1)
- feat: initial commit (5039b3ff5c805715fb723be2a373267cdf886398)
Co-authored-by: Coin W <CoinW@users.noreply.huggingface.co>
- LICENSE +21 -0
- MMD.py +168 -0
- app.py +177 -0
- data_loader.py +134 -0
- demo_text_gpt.txt +1 -0
- download_model_and_dataset.sh +8 -0
- feature_ref_HWT_500.pt +3 -0
- feature_ref_MGT_500.pt +3 -0
- feature_ref_for_test.pt +3 -0
- feature_ref_generater.py +85 -0
- feature_ref_loader.py +19 -0
- logistic_regression_model.pkl +3 -0
- main.py +50 -0
- meta_train.py +178 -0
- net.pt +3 -0
- regression_model_loader.py +16 -0
- relative_tester.py +76 -0
- requirements.txt +0 -0
- roberta_model_loader.py +32 -0
- two_sample_tester.py +43 -0
- utils.py +63 -0
LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2024 Shuhai Zhang

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
MMD.py
ADDED
@@ -0,0 +1,168 @@
import torch


def flexible_kernel(X, Y, X_org, Y_org, sigma, sigma0=0.1, epsilon=1e-08):
    """Flexible kernel calculation as in MMDu."""
    Dxy = Pdist2(X, Y)
    Dxy_org = Pdist2(X_org, Y_org)
    L = 1
    Kxy = (1 - epsilon) * torch.exp(
        -((Dxy / sigma0) ** L) - Dxy_org / sigma
    ) + epsilon * torch.exp(-Dxy_org / sigma)
    return Kxy


def MMD_Diff_Var(Kyy, Kzz, Kxy, Kxz, epsilon=1e-08):
    """Compute the variance of the difference statistic MMDXY - MMDXZ.

    Referenced from: https://github.com/eugenium/MMD/blob/master/mmd.py
    """
    m = Kxy.shape[0]
    n = Kyy.shape[0]
    r = Kzz.shape[0]

    # Remove diagonal elements
    Kyynd = Kyy - torch.diag(torch.diag(Kyy))
    Kzznd = Kzz - torch.diag(torch.diag(Kzz))

    u_yy = torch.sum(Kyynd) * (1.0 / (n * (n - 1)))
    u_zz = torch.sum(Kzznd) * (1.0 / (r * (r - 1)))
    u_xy = torch.sum(Kxy) / (m * n)
    u_xz = torch.sum(Kxz) / (m * r)

    t1 = (1.0 / n**3) * torch.sum(Kyynd.T @ Kyynd) - u_yy**2
    t2 = (1.0 / (n**2 * m)) * torch.sum(Kxy.T @ Kxy) - u_xy**2
    t3 = (1.0 / (n * m**2)) * torch.sum(Kxy @ Kxy.T) - u_xy**2
    t4 = (1.0 / r**3) * torch.sum(Kzznd.T @ Kzznd) - u_zz**2
    t5 = (1.0 / (r * m**2)) * torch.sum(Kxz @ Kxz.T) - u_xz**2
    t6 = (1.0 / (r**2 * m)) * torch.sum(Kxz.T @ Kxz) - u_xz**2
    t7 = (1.0 / (n**2 * m)) * torch.sum(Kyynd @ Kxy.T) - u_yy * u_xy
    t8 = (1.0 / (n * m * r)) * torch.sum(Kxy.T @ Kxz) - u_xz * u_xy
    t9 = (1.0 / (r**2 * m)) * torch.sum(Kzznd @ Kxz.T) - u_zz * u_xz

    if isinstance(epsilon, torch.Tensor):
        epsilon_tensor = epsilon.clone().detach()
    else:
        epsilon_tensor = torch.tensor(epsilon, device=Kyy.device)
    zeta1 = torch.max(t1 + t2 + t3 + t4 + t5 + t6 - 2 * (t7 + t8 + t9), epsilon_tensor)
    zeta2 = torch.max(
        (1 / m / (m - 1)) * torch.sum((Kyynd - Kzznd - Kxy.T - Kxy + Kxz + Kxz.T) ** 2)
        - (u_yy - 2 * u_xy - (u_zz - 2 * u_xz)) ** 2,
        epsilon_tensor,
    )

    data = {
        "t1": t1.item(),
        "t2": t2.item(),
        "t3": t3.item(),
        "t4": t4.item(),
        "t5": t5.item(),
        "t6": t6.item(),
        "t7": t7.item(),
        "t8": t8.item(),
        "t9": t9.item(),
        "zeta1": zeta1.item(),
        "zeta2": zeta2.item(),
    }

    Var = (4 * (m - 2) / (m * (m - 1))) * zeta1
    Var_z2 = Var + (2.0 / (m * (m - 1))) * zeta2

    return Var, Var_z2, data


def Pdist2(x, y):
    """Compute the paired distance between x and y."""
    x_norm = (x**2).sum(1).view(-1, 1)
    if y is not None:
        y_norm = (y**2).sum(1).view(1, -1)
    else:
        y = x
        y_norm = x_norm.view(1, -1)
    Pdist = x_norm + y_norm - 2.0 * torch.mm(x, torch.transpose(y, 0, 1))
    Pdist[Pdist < 0] = 0
    return Pdist


def MMD_batch2(
    Fea,
    len_s,
    Fea_org,
    sigma,
    sigma0=0.1,
    epsilon=10 ** (-10),
    is_var_computed=True,
    use_1sample_U=True,
    coeff_xy=2,
):
    X = Fea[0:len_s, :]
    Y = Fea[len_s:, :]
    L = 1  # generalized Gaussian (if L > 1)

    nx = X.shape[0]
    ny = Y.shape[0]
    Dxx = Pdist2(X, X)
    Dyy = torch.zeros(Fea.shape[0] - len_s, 1).to(Dxx.device)
    # Dyy = Pdist2(Y, Y)
    Dxy = Pdist2(X, Y).transpose(0, 1)
    Kx = torch.exp(-Dxx / sigma0)
    Ky = torch.exp(-Dyy / sigma0)
    Kxy = torch.exp(-Dxy / sigma0)

    nx = Kx.shape[0]

    is_unbiased = False
    xx = torch.div((torch.sum(Kx)), (nx * nx))
    yy = Ky.reshape(-1)
    xy = torch.div(torch.sum(Kxy, dim=1), (nx))

    mmd2 = xx - 2 * xy + yy
    return mmd2


# MMD for three samples
def MMD_3_Sample_Test(
    ref_fea,
    fea_y,
    fea_z,
    ref_fea_org,
    fea_y_org,
    fea_z_org,
    sigma,
    sigma0,
    epsilon,
    alpha,
):
    """Run the relative (three-sample) test using the deep kernel."""
    X = ref_fea.clone().detach()
    Y = fea_y.clone().detach()
    Z = fea_z.clone().detach()
    X_org = ref_fea_org.clone().detach()
    Y_org = fea_y_org.clone().detach()
    Z_org = fea_z_org.clone().detach()

    Kyy = flexible_kernel(Y, Y, Y_org, Y_org, sigma, sigma0, epsilon)
    Kzz = flexible_kernel(Z, Z, Z_org, Z_org, sigma, sigma0, epsilon)
    Kxy = flexible_kernel(X, Y, X_org, Y_org, sigma, sigma0, epsilon)
    Kxz = flexible_kernel(X, Z, X_org, Z_org, sigma, sigma0, epsilon)

    Kyynd = Kyy - torch.diag(torch.diag(Kyy))
    Kzznd = Kzz - torch.diag(torch.diag(Kzz))

    Diff_Var, _, _ = MMD_Diff_Var(Kyy, Kzz, Kxy, Kxz, epsilon)

    u_yy = torch.sum(Kyynd) / (Y.shape[0] * (Y.shape[0] - 1))
    u_zz = torch.sum(Kzznd) / (Z.shape[0] * (Z.shape[0] - 1))
    u_xy = torch.sum(Kxy) / (X.shape[0] * Y.shape[0])
    u_xz = torch.sum(Kxz) / (X.shape[0] * Z.shape[0])

    t = u_yy - 2 * u_xy - (u_zz - 2 * u_xz)
    if Diff_Var.item() <= 0:
        Diff_Var = torch.max(epsilon, torch.tensor(1e-08))
    p_value = torch.distributions.Normal(0, 1).cdf(-t / torch.sqrt(Diff_Var))
    t = t / torch.sqrt(Diff_Var)

    if p_value > alpha:
        h = 0
    else:
        h = 1

    return h, p_value.item(), t.item()
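For orientation, here is a minimal sketch of calling `MMD_3_Sample_Test` on random tensors. The shapes and the `sigma`/`sigma0`/`epsilon` values below are illustrative assumptions, not the trained bandwidths stored in `net.pt`:

```
import torch
from MMD import MMD_3_Sample_Test

# three populations of 30 "deep" features plus their flattened originals (toy shapes)
X, Y, Z = (torch.randn(30, 300) for _ in range(3))
X_org, Y_org, Z_org = (torch.randn(30, 768) for _ in range(3))

# placeholder kernel hyperparameters; the real ones come from the checkpoint
sigma, sigma0, epsilon = torch.tensor(1.0), torch.tensor(0.5), torch.tensor(1e-8)

h, p_value, t = MMD_3_Sample_Test(
    X, Y, Z, X_org, Y_org, Z_org, sigma, sigma0, epsilon, alpha=0.05
)
# h == 1 rejects the null at level alpha, i.e. X looks significantly
# closer to Z than to Y under the kernel
print(h, p_value, t)
```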
app.py
ADDED
@@ -0,0 +1,177 @@
import gradio as gr

from utils import init_random_seeds, config
from relative_tester import RelativeTester

# from two_sample_tester import two_sample_tester


def run_test(input_text):
    if not input_text:
        return "Now that you've built a demo, you'll probably want to share it with others. Gradio demos can be shared in two ways: using a temporary share link or permanent hosting on Spaces."
    # return two_sample_tester.test(input_text.strip())
    return relative_tester.test(input_text.strip())
    # return f"Prediction: Human (Mocked for {input_text})"  # earlier mock, kept for reference


css = """
#header { text-align: center; font-size: 3em; margin-bottom: 20px; color: black; font-weight: bold;}
#output-text { font-weight: bold; font-size: 1.2em; border-radius: 10px; padding: 10px; background-color: #f4f4f4;}
.links {
    display: flex;
    justify-content: flex-end;
    gap: 10px;
    margin-right: 10px;
    align-items: center;
    font-size: 0.9em;
    color: #ADD8E6;
}
.separator {
    margin: 0 5px;
    color: #000;
}

/* Adjusting layout for Input Text and Inference Result */
.input-row {
    display: flex;
    width: 100%;
}

.input-text {
    flex: 3; /* 3 parts of the row */
    margin-right: 1px;
    border-radius: 8px;
    padding: 12px;
    border: 2px solid #d1d1d1;
}

.output-text {
    flex: 1; /* 1 part of the row */
    border-radius: 8px;
    padding: 12px;
    border: 2px solid #d1d1d1;
}

/* Set button widths to match the Select Model width */
.button {
    width: 250px; /* Same as the select box width */
    height: 100px; /* Button height */
    background-color: #ADD8E6;
    color: white;
    font-weight: bold;
    border-radius: 8px;
}

.button:hover {
    background-color: #0000FF;
}

/* Set height for the Select Model dropdown */
.select {
    height: 100px; /* Set height to 100px */
}

/* Accordion Styling */
.accordion {
    width: 100%; /* Set the width of the accordion to match the parent */
    max-height: 200px; /* Set a max-height for accordion */
    overflow-y: auto; /* Allow scrolling if the content exceeds max height */
    margin-bottom: 10px; /* Add space below accordion */
    box-sizing: border-box; /* Ensure padding is included in width/height */
}

/* Accordion content max-height */
.accordion-content {
    max-height: 200px; /* Limit the height of the content */
    overflow-y: auto; /* Add a scrollbar if content overflows */
}

.demo-banner {
    background-color: #f3f4f6;
    padding: 20px;
    border-radius: 10px;
    font-size: 1.1em;
    font-weight: bold;
    text-align: center;
    margin-bottom: 20px;
    color: #ff5722;
}
"""

# Gradio App
with gr.Blocks(css=css) as app:
    with gr.Row():
        gr.HTML('<div id="header">R-detect On HuggingFace</div>')
    with gr.Row():
        gr.HTML(
            """
            <div class="links">
                <a href="https://openreview.net/forum?id=z9j7wctoGV" target="_blank">Paper</a>
                <span class="separator">|</span>
                <a href="https://github.com/xLearn-AU/R-Detect" target="_blank">Code</a>
                <span class="separator">|</span>
                <a href="mailto:1730421718@qq.com" target="_blank">Contact</a>
            </div>
            """
        )

    with gr.Row():
        gr.HTML(
            '<div class="demo-banner">This is a demo. For the full version, please refer to the <a href="https://github.com/xLearn-AU/R-Detect" target="_blank">GitHub</a> or the <a href="https://openreview.net/forum?id=z9j7wctoGV" target="_blank">Paper</a>.</div>'
        )

    with gr.Row():
        input_text = gr.Textbox(
            label="Input Text",
            placeholder="Enter Text Here",
            lines=8,
            elem_classes=["input-text"],  # Applying the CSS class
            value="Hugging Face is a company and community that has become one of the leading platforms in the field of natural language processing (NLP). It is best known for developing and maintaining the Transformers library, which simplifies the use of state-of-the-art machine learning models for tasks such as text classification, language generation, translation, and more.",
        )
        output = gr.Textbox(
            label="Inference Result",
            placeholder="Made by Human or AI",
            elem_id="output-text",
            lines=8,
            elem_classes=["output-text"],
        )
    with gr.Row():
        submit_button = gr.Button(
            "Run Detection", variant="primary", elem_classes=["button"]
        )
        clear_button = gr.Button("Clear", variant="secondary", elem_classes=["button"])

    submit_button.click(run_test, inputs=[input_text], outputs=output)
    clear_button.click(lambda: ("", ""), inputs=[], outputs=[input_text, output])

    with gr.Accordion("Disclaimer", open=False, elem_classes=["accordion"]):
        gr.Markdown(
            """
            - **Disclaimer**: This tool is for demonstration purposes only. It is not a foolproof AI detector.
            - **Accuracy**: Results may vary based on input length and quality.
            """
        )

    with gr.Accordion("Citations", open=False, elem_classes=["accordion"]):
        gr.Markdown(
            """
            ```
            @inproceedings{zhangs2024MMDMP,
                title={Detecting Machine-Generated Texts by Multi-Population Aware Optimization for Maximum Mean Discrepancy},
                author={Zhang, Shuhai and Song, Yiliao and Yang, Jiahao and Li, Yuanqing and Han, Bo and Tan, Mingkui},
                booktitle = {International Conference on Learning Representations (ICLR)},
                year={2024}
            }
            ```
            """
        )


if __name__ == "__main__":
    config["use_gpu"] = False
    config["local_model"] = ""
    config["feature_ref_HWT"] = "./feature_ref_HWT_500.pt"
    config["feature_ref_MGT"] = "./feature_ref_MGT_500.pt"
    init_random_seeds()
    relative_tester = RelativeTester()
    app.launch()
data_loader.py
ADDED
@@ -0,0 +1,134 @@
import random
import tqdm
import datasets
import re
import transformers
import numpy as np
from utils import MGT, HWT, config

preproc_tokenizer = transformers.AutoTokenizer.from_pretrained(
    "google-t5/t5-small", model_max_length=512
)


def process_spaces(text):
    text = (
        text.replace(" ,", ",")
        .replace(" .", ".")
        .replace(" ?", "?")
        .replace(" !", "!")
        .replace(" ;", ";")
        .replace(" '", "'")
        .replace(" ’ ", "'")
        .replace(" :", ":")
        .replace("<newline>", "\n")
        .replace("`` ", '"')
        .replace(" ''", '"')
        .replace("''", '"')
        .replace(".. ", "... ")
        .replace(" )", ")")
        .replace("( ", "(")
        .replace(" n't", "n't")
        .replace(" i ", " I ")
        .replace(" i'", " I'")
        .replace("\\'", "'")
        .replace("\n ", "\n")
        .strip()
    )
    text = text.replace("\r\n", "\n").replace("\\n", "").replace("!\n", "")
    return re.sub("\n+", "\n", text)


def trim_to_shorter_length(texta, textb):
    # truncate both texts to the shorter of the two
    shorter_length = min(len(texta.split(" ")), len(textb.split(" ")))
    texta = " ".join(texta.split(" ")[:shorter_length])
    textb = " ".join(textb.split(" ")[:shorter_length])
    return texta, textb


def load_HC3():
    if config["local_dataset"]:
        print("Loading local HC3 dataset", config["local_dataset"])
    else:
        print("Loading remote HC3 dataset")
    ds = (
        datasets.load_dataset(
            config["local_dataset"], name="all", trust_remote_code=True
        )
        if config["local_dataset"]
        else datasets.load_dataset("Hello-SimpleAI/HC3", name="all")
    )
    ds = ds["train"]  # DatasetDict -> Dataset
    filtered_ds = [
        item
        for item in ds
        if (
            len(item["human_answers"]) > 0
            and len(item["chatgpt_answers"]) > 0
            and len(item["human_answers"][0].split()) > 5
            and len(item["chatgpt_answers"][0].split()) > 5
        )
    ]
    # print("DEBUG: filtered_ds[0]:", filtered_ds[0])

    data_new = {"text": [], "label": []}

    for i in tqdm.tqdm(range(len(filtered_ds)), desc="Parsing data"):
        data_new["text"].append(process_spaces(filtered_ds[i]["human_answers"][0]))
        data_new["label"].append(HWT)
        data_new["text"].append(process_spaces(filtered_ds[i]["chatgpt_answers"][0]))
        data_new["label"].append(MGT)
    return data_new


def filter_data(data_o, long_train_threshold_low=150, long_train_threshold_high=512):
    data_HWT = [
        text for text, label in zip(data_o["text"], data_o["label"]) if label == HWT
    ]
    data_MGT = [
        text for text, label in zip(data_o["text"], data_o["label"]) if label == MGT
    ]

    # keep only examples within the token-count thresholds according to preproc_tokenizer
    # this step has the extra effect of removing examples with low-quality/garbage content
    tokenized_data = preproc_tokenizer(data_HWT)
    long_HWT = [
        x
        for x, y in zip(data_HWT, tokenized_data["input_ids"])
        if long_train_threshold_low <= len(y) <= long_train_threshold_high
    ]
    tokenized_data = preproc_tokenizer(data_MGT)
    long_MGT = [
        x
        for x, y in zip(data_MGT, tokenized_data["input_ids"])
        if long_train_threshold_low <= len(y) <= long_train_threshold_high
    ]

    # print stats about the remaining data
    print(f"Total number of samples: {len(long_HWT)}")
    print(f"Average number of words: {np.mean([len(x.split()) for x in long_HWT])}")

    data = {
        HWT: [],
        MGT: [],
    }

    # print(len(long_HWT), len(long_MGT))
    for o, s in zip(long_HWT, long_MGT):
        o, s = trim_to_shorter_length(o, s)

        # add to the data
        data[HWT].append(o)
        data[MGT].append(s)

    return data


# Test code
# data_o = load_HC3()
# data = filter_data(data_o)
# real = data[HWT]  # [:args.train_real_num]  len == n_samples, many sentences of words
# generated = data[MGT]
# print(real[:5])
# print(generated[:5])
demo_text_gpt.txt
ADDED
@@ -0,0 +1 @@
Hugging Face is a company and community that has become one of the leading platforms in the field of natural language processing (NLP). It is best known for developing and maintaining the Transformers library, which simplifies the use of state-of-the-art machine learning models for tasks such as text classification, language generation, translation, and more.
download_model_and_dataset.sh
ADDED
@@ -0,0 +1,8 @@
#!/bin/bash

# If you are in China, you can set the following mirror link to download the model and dataset
# export HF_ENDPOINT=https://hf-mirror.com

huggingface-cli download --resume-download openai-community/roberta-base-openai-detector --local-dir llm-models/roberta-base
huggingface-cli download --repo-type dataset --resume-download Hello-SimpleAI/HC3 --local-dir datasets/HC3
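Once the script finishes, the local copies can be wired into the loaders. A minimal sketch, assuming the `--local-dir` paths above and the `config` dict from `utils.py` (these keys are read by `roberta_model_loader.py` and `data_loader.py`):

```
from utils import config

# point the model/dataset loaders at the local downloads (paths match the
# --local-dir arguments used in the script above)
config["local_model"] = "./llm-models/roberta-base"
config["local_dataset"] = "./datasets/HC3"
```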
feature_ref_HWT_500.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:63fefc57e0086d1ff5becda59300f8e530677dc6acb2b79e70f2f6255efab318
size 153601240
feature_ref_MGT_500.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3ecf694cd8892975c37a4a008b7fcb1ab5dce3c4d6986e556cd7ab86b886db98
size 153601240
feature_ref_for_test.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f690f4244c0b245823f847a75d1fd25a151ad96075744156e628c0c1dda65302
size 42240786
feature_ref_generater.py
ADDED
@@ -0,0 +1,85 @@
import torch
import tqdm
import numpy as np
import nltk
import argparse

from utils import FeatureExtractor, HWT, MGT, config
from roberta_model_loader import RobertaModelLoader
from meta_train import net
from data_loader import load_HC3, filter_data


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="R-Detect the file content")
    parser.add_argument(
        "--target",
        type=str,
        help="The target of the generated feature ref. Default is MGT",
        default=MGT,
    )
    parser.add_argument(
        "--sample_size",
        type=int,
        help="The sample size of the generated feature ref. Default is 1000; must be bigger than 100 and smaller than 30000",
        default=1000,
    )
    parser.add_argument(
        "--use_gpu",
        action="store_true",
        help="Use GPU or not.",
    )
    parser.add_argument(
        "--local_model",
        type=str,
        help="Use a local model or not; you need to download the model first and set the path. Default is empty. The script will use the remote model if this param is empty.",
        default="",
    )
    parser.add_argument(
        "--local_dataset",
        type=str,
        help="Use a local dataset or not; you need to download the dataset first and set the path. Default is empty.",
        default="",
    )
    args = parser.parse_args()
    config["target"] = args.target
    if args.sample_size < 100 or args.sample_size > 30000:
        print("Sample size must be between 100 and 30000, set to 1000")
        config["sample_size"] = 1000
    else:
        config["sample_size"] = args.sample_size
    config["use_gpu"] = args.use_gpu
    config["local_model"] = args.local_model
    config["local_dataset"] = args.local_dataset
    target = HWT if config["target"] == HWT else MGT
    # load model and feature extractor
    roberta_model = RobertaModelLoader()
    feature_extractor = FeatureExtractor(roberta_model, net)
    # load target data
    data_o = load_HC3()
    data = filter_data(data_o)
    data = data[target]
    # print(data[:3])

    # split with nltk
    nltk.download("punkt", quiet=True)
    nltk.download("punkt_tab", quiet=True)
    paragraphs = [nltk.sent_tokenize(paragraph)[1:-1] for paragraph in data]
    data = [
        sent for paragraph in paragraphs for sent in paragraph if 5 < len(sent.split())
    ]
    # print(data[:3])

    # extract features
    feature_ref = []
    for i in tqdm.tqdm(
        range(config["sample_size"]), desc=f"Generating feature ref for {target}"
    ):
        feature_ref.append(
            feature_extractor.process(data[i], False).detach()
        )  # detach to save memory
    torch.save(
        torch.cat(feature_ref, dim=0),
        f"feature_ref_{target}_{config['sample_size']}.pt",  # single quotes avoid a quoting error inside the f-string
    )
    print(f"Feature ref for {target} generated successfully")
feature_ref_loader.py
ADDED
@@ -0,0 +1,19 @@
import torch
import numpy as np

from utils import get_device, config

DEVICE = get_device()


def feature_ref_loader(feature_ref_file_name, num_ref=5000):
    print("Feature Ref Loader load: ", feature_ref_file_name)
    load_ref_data = torch.load(feature_ref_file_name, map_location=DEVICE)  # cpu
    load_ref_data = load_ref_data.to(DEVICE)
    feature_ref = load_ref_data[np.random.permutation(load_ref_data.shape[0])][
        :num_ref
    ].to(DEVICE)
    return feature_ref


feature_two_sample_tester_ref = feature_ref_loader("./feature_ref_for_test.pt")
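A quick sanity check of the loader against one of the committed reference tensors. The expected shape is inferred from the file size (500 × 100 × 768 float32 ≈ 153,600,000 bytes), so treat it as an assumption; note also that importing the module loads `./feature_ref_for_test.pt` at module level, so that file must be present too:

```
from feature_ref_loader import feature_ref_loader

# subsample 200 of the 500 stored reference features
ref = feature_ref_loader("./feature_ref_HWT_500.pt", num_ref=200)
print(ref.shape)  # expected: torch.Size([200, 100, 768]) (assumed layout)
```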
logistic_regression_model.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ce9afa92d745bff3bbfb4ae028db369cb35710a9099159a12ac1048fed3760e6
size 756
main.py
ADDED
@@ -0,0 +1,50 @@
import sys
import argparse
from relative_tester import RelativeTester
from utils import init_random_seeds, config, get_device


if __name__ == "__main__":
    init_random_seeds()
    parser = argparse.ArgumentParser(description="R-Detect the file content")
    parser.add_argument(
        "--test_file",
        type=str,
        help="The file path of the test file. Default is demo_text_gpt.txt",
        default="./demo_text_gpt.txt",
    )
    parser.add_argument(
        "--use_gpu",
        action="store_true",
        help="Use GPU or not.",
    )
    parser.add_argument(
        "--local_model",
        type=str,
        help="Use a local model or not; you need to download the model first and set the path. The script will use the remote model if this param is empty.",
        default="",
    )
    parser.add_argument(
        "--feature_ref_HWT",
        type=str,
        help="The feature ref path of HWT. The script will use the remote file if this param is empty.",
        default="",
    )
    parser.add_argument(
        "--feature_ref_MGT",
        type=str,
        help="The feature ref path of MGT. Default is empty.",
        default="",
    )
    args = parser.parse_args()
    config["test_file"] = args.test_file
    config["use_gpu"] = args.use_gpu
    config["local_model"] = args.local_model
    config["feature_ref_HWT"] = args.feature_ref_HWT
    config["feature_ref_MGT"] = args.feature_ref_MGT
    print("Running on device:", get_device())
    with open(config["test_file"], "r") as file:
        content = file.read()
    relative_tester = RelativeTester()
    print(relative_tester.test(content))
    # print(content)
meta_train.py
ADDED
@@ -0,0 +1,178 @@
import torch
from torch import nn
from collections import namedtuple
import math
from utils import get_device
from pytorch_transformers.modeling_bert import (
    BertEncoder,
    BertPreTrainedModel,
    BertConfig,
)

DEVICE = get_device()


class GeLU(nn.Module):
    """Implementation of the gelu activation function.

    For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
    0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
    Also see https://arxiv.org/abs/1606.08415
    """

    def __init__(self):
        super().__init__()

    def forward(self, x):
        return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))


class BertLayerNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-12):
        """Construct a layernorm module in the TF style (epsilon inside the square root)."""
        super(BertLayerNorm, self).__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.bias = nn.Parameter(torch.zeros(hidden_size))
        self.variance_epsilon = eps

    def forward(self, x):
        u = x.mean(-1, keepdim=True)
        s = (x - u).pow(2).mean(-1, keepdim=True)
        x = (x - u) / torch.sqrt(s + self.variance_epsilon)
        return self.weight * x + self.bias


class mlp_meta(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(config.hid_dim, config.hid_dim),
            GeLU(),
            BertLayerNorm(config.hid_dim, eps=1e-12),
            nn.Dropout(config.dropout),
        )

    def forward(self, x):
        return self.mlp(x)


class Bert_Transformer_Layer(BertPreTrainedModel):
    def __init__(self, fusion_config):
        super().__init__(BertConfig(**fusion_config))
        bertconfig_fusion = BertConfig(**fusion_config)
        self.encoder = BertEncoder(bertconfig_fusion)
        self.init_weights()

    def forward(self, input, mask=None):
        """
        input: (bs, 4, dim)
        """
        batch, feats, dim = input.size()
        if mask is not None:
            mask_ = torch.ones(size=(batch, feats), device=mask.device)
            mask_[:, 1:] = mask
            mask_ = torch.bmm(
                mask_.view(batch, 1, -1).transpose(1, 2), mask_.view(batch, 1, -1)
            )
            mask_ = mask_.unsqueeze(1)

        else:
            mask = torch.Tensor([1.0]).to(input.device)
            mask_ = mask.repeat(batch, 1, feats, feats)

        extend_mask = (1 - mask_) * -10000
        assert not extend_mask.requires_grad
        head_mask = [None] * self.config.num_hidden_layers

        enc_output = self.encoder(input, extend_mask, head_mask=head_mask)
        output = enc_output[0]
        all_attention = enc_output[1]

        return output, all_attention


class mmdPreModel(nn.Module):
    def __init__(
        self,
        config,
        num_mlp=0,
        transformer_flag=False,
        num_hidden_layers=1,
        mlp_flag=True,
    ):
        super(mmdPreModel, self).__init__()
        self.num_mlp = num_mlp
        self.transformer_flag = transformer_flag
        self.mlp_flag = mlp_flag
        token_num = config.token_num
        self.mlp = nn.Sequential(
            nn.Linear(config.in_dim, config.hid_dim),
            GeLU(),
            BertLayerNorm(config.hid_dim, eps=1e-12),
            nn.Dropout(config.dropout),
            # nn.Linear(config.hid_dim, config.out_dim),
        )
        self.fusion_config = {
            "hidden_size": config.in_dim,
            "num_hidden_layers": num_hidden_layers,
            "num_attention_heads": 4,
            "output_attentions": True,
        }
        if self.num_mlp > 0:
            self.mlp2 = nn.ModuleList([mlp_meta(config) for _ in range(self.num_mlp)])
        if self.transformer_flag:
            self.transformer = Bert_Transformer_Layer(self.fusion_config)
        self.feature = nn.Linear(config.hid_dim * token_num, config.out_dim)

    def forward(self, features):
        """
        input: [batch, token_num, hidden_size], output: [batch, config.out_dim]
        """

        if self.transformer_flag:
            features, _ = self.transformer(features)
        if self.mlp_flag:
            features = self.mlp(features)

        if self.num_mlp > 0:
            # features = self.mlp2(features)
            for _ in range(1):
                for mlp in self.mlp2:
                    features = mlp(features)

        features = self.feature(features.view(features.shape[0], -1))
        return features  # features.view(features.shape[0], -1)


class NetLoader:
    def __init__(self):
        token_num, hidden_size = 100, 768
        Config = namedtuple(
            "Config", ["in_dim", "hid_dim", "dropout", "out_dim", "token_num"]
        )
        config = Config(
            in_dim=hidden_size,
            token_num=token_num,
            hid_dim=512,
            dropout=0.2,
            out_dim=300,
        )
        self.config = config
        self.net = mmdPreModel(
            config=config, num_mlp=0, transformer_flag=True, num_hidden_layers=1
        )
        checkpoint_filename = "./net.pt"
        checkpoint = torch.load(checkpoint_filename, map_location=DEVICE)
        self.net.load_state_dict(checkpoint["net"])
        self.sigma, self.sigma0_u, self.ep = (
            checkpoint["sigma"],
            checkpoint["sigma0_u"],
            checkpoint["ep"],
        )
        self.net = self.net.to(DEVICE)
        self.sigma, self.sigma0_u, self.ep = (
            self.sigma.to(DEVICE),
            self.sigma0_u.to(DEVICE),
            self.ep.to(DEVICE),
        )


net = NetLoader()
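As a shape sanity check: `NetLoader` wires a `[batch, 100, 768]` input through the one-layer BERT encoder and the MLP, then projects the flattened `512 * 100` activations down to a 300-dim feature. A minimal sketch (it requires `./net.pt`, since `NetLoader` loads the checkpoint at import time):

```
import torch
from meta_train import net  # instantiates NetLoader and loads ./net.pt

x = torch.randn(4, 100, 768)  # [batch, token_num, hidden_size]
x = x.to(next(net.net.parameters()).device)
out = net.net(x)
print(out.shape)  # torch.Size([4, 300]) -> config.out_dim
```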
net.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:51eddf445c758f5d42e8f4783b1e8ecc5833db5a9bc170203478d73a761ddd6a
size 91379438
regression_model_loader.py
ADDED
@@ -0,0 +1,16 @@
import joblib
from utils import get_device

DEVICE = get_device()


class RegressionModelLoader:
    def __init__(self):
        print("Regression Model init")
        self.model = self.load("./logistic_regression_model.pkl")

    def load(self, regression_model_file_name):
        return joblib.load(regression_model_file_name)


regression_model = RegressionModelLoader()
relative_tester.py
ADDED
@@ -0,0 +1,76 @@
import torch
import nltk
from roberta_model_loader import RobertaModelLoader
from feature_ref_loader import feature_ref_loader
from meta_train import net
from regression_model_loader import regression_model
from MMD import MMD_3_Sample_Test
from utils import FeatureExtractor, HWT, MGT, config


class RelativeTester:
    def __init__(self):
        print("Relative Tester init")
        self.feature_extractor = FeatureExtractor(RobertaModelLoader(), net)
        self.feature_hwt_ref = feature_ref_loader(config["feature_ref_HWT"])
        self.feature_mgt_ref = feature_ref_loader(config["feature_ref_MGT"])

    def sents_split(self, text):
        nltk.download("punkt", quiet=True)
        nltk.download("punkt_tab", quiet=True)
        sents = nltk.sent_tokenize(text)
        return [sent for sent in sents if 5 < len(sent.split())]

    def test(self, input_text, threshold=0.2, round=20):
        print("Relative Tester test")
        # Split the input text
        sents = self.sents_split(input_text)
        print("DEBUG: sents:", len(sents))
        # Extract features
        feature_for_sents = self.feature_extractor.process_sents(sents, False)
        if len(feature_for_sents) <= 1:
            # print("DEBUG: too short")
            return "Too short to test! Please input more than 2 sentences."
        # Cut the features down to a common length
        min_len = min(
            len(feature_for_sents),
            len(self.feature_hwt_ref),
            len(self.feature_mgt_ref),
        )
        # Calculate MMD
        h_u_list = []
        p_value_list = []
        t_list = []

        for i in range(round):
            feature_for_sents_sample = feature_for_sents[
                torch.randperm(len(feature_for_sents))[:min_len]
            ]
            feature_hwt_ref_sample = self.feature_hwt_ref[
                torch.randperm(len(self.feature_hwt_ref))[:min_len]
            ]
            feature_mgt_ref_sample = self.feature_mgt_ref[
                torch.randperm(len(self.feature_mgt_ref))[:min_len]
            ]
            h_u, p_value, t, *rest = MMD_3_Sample_Test(
                net.net(feature_for_sents_sample),
                net.net(feature_hwt_ref_sample),
                net.net(feature_mgt_ref_sample),
                feature_for_sents_sample.view(feature_for_sents_sample.shape[0], -1),
                feature_hwt_ref_sample.view(feature_hwt_ref_sample.shape[0], -1),
                feature_mgt_ref_sample.view(feature_mgt_ref_sample.shape[0], -1),
                net.sigma,
                net.sigma0_u,
                net.ep,
                0.05,
            )

            h_u_list.append(h_u)
            p_value_list.append(p_value)
            t_list.append(t)

        power = sum(h_u_list) / len(h_u_list)
        print("DEBUG: power:", power)
        print("DEBUG: power list:", h_u_list)
        # Return the result
        return "Human" if power <= threshold else "AI"
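The verdict is a vote over `round=20` resampled three-sample tests: each `h_u` is 1 when the test rejects at alpha=0.05 (the input's features look significantly closer to the MGT reference than to the HWT reference), and the rejection rate (`power`) is compared against `threshold`. A toy illustration with hypothetical outcomes:

```
# 20 hypothetical per-round outcomes (1 = rejected, i.e. "closer to MGT")
h_u_list = [1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
power = sum(h_u_list) / len(h_u_list)  # 0.15
print("Human" if power <= 0.2 else "AI")  # -> Human
```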
requirements.txt
ADDED
Binary file (184 Bytes).
roberta_model_loader.py
ADDED
@@ -0,0 +1,32 @@
from transformers import RobertaTokenizer, RobertaModel
import torch
from utils import config


class RobertaModelLoader:
    def __init__(
        self,
        model_name="roberta-base-openai-detector",
        cache_dir=".cache",
    ):
        self.model_name = model_name
        self.cache_dir = cache_dir
        self.tokenizer, self.model = self.load_base_model_and_tokenizer()

    def load_base_model_and_tokenizer(self):
        if config["local_model"]:  # load model from local
            print("Load model from local: ", self.model_name, config["local_model"])
            return RobertaTokenizer.from_pretrained(
                config["local_model"], cache_dir=self.cache_dir
            ), RobertaModel.from_pretrained(
                config["local_model"],
                output_hidden_states=True,
                cache_dir=self.cache_dir,
            )

        print("Load model from remote: ", self.model_name)
        return RobertaTokenizer.from_pretrained(
            self.model_name, cache_dir=self.cache_dir
        ), RobertaModel.from_pretrained(
            self.model_name, output_hidden_states=True, cache_dir=self.cache_dir
        )
two_sample_tester.py
ADDED
@@ -0,0 +1,43 @@
import torch
from roberta_model_loader import RobertaModelLoader
from feature_ref_loader import feature_two_sample_tester_ref
from meta_train import net
from regression_model_loader import regression_model
from MMD import MMD_batch2
from utils import FeatureExtractor  # utils exposes get_device(), not a DEVICE constant


class TwoSampleTester:
    def __init__(self):
        print("TwoSample Tester init")
        self.net = net
        self.feature_extractor = FeatureExtractor(RobertaModelLoader(), net)

    def test(self, input_text):
        print("TwoSample Tester test")
        # Get the feature for input text
        feature_for_input_text = self.feature_extractor.process(input_text)
        # print(
        #     "DEBUG: feature_for_input_text:",
        #     feature_for_input_text.shape,
        #     feature_two_sample_tester_ref.shape,
        # )
        # Calculate MMD
        mmd_feature_for_input_text = MMD_batch2(
            torch.cat([feature_two_sample_tester_ref, feature_for_input_text], dim=0),
            feature_two_sample_tester_ref.shape[0],
            0,
            self.net.sigma,
            self.net.sigma0_u,
            self.net.ep,
        ).to("cpu")
        # Use the regression model to get the 2-sample test result
        y_pred_loaded = regression_model.model.predict(
            mmd_feature_for_input_text.detach().numpy().reshape(-1, 1)
        )

        prediction = int(y_pred_loaded[0])
        if prediction == 0:
            return "Human"
        elif prediction == 1:
            return "AI"
utils.py
ADDED
@@ -0,0 +1,63 @@
import torch
import random
import numpy as np

config = {}


def get_device():
    return (
        torch.device("cuda:0") if config.get("use_gpu", False) else torch.device("cpu")
    )


HWT = "HWT"
MGT = "MGT"


def init_random_seeds():
    print("Init random seeds")
    random.seed(0)
    np.random.seed(0)
    torch.manual_seed(0)
    torch.cuda.manual_seed(0)
    torch.cuda.manual_seed_all(0)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


class FeatureExtractor:
    def __init__(self, model, net=None):
        self.llm_model = model  # TODO: support different models
        self.net = net

    def process(self, text, net_required=True):
        DEVICE = get_device()
        # Tokenize
        tokens = self.llm_model.tokenizer(
            [text],
            padding="max_length",
            truncation=True,
            max_length=100,
            return_tensors="pt",
        ).to(DEVICE)
        # Predict
        outputs = self.llm_model.model(**tokens)
        # Get the feature for input text
        attention_mask = tokens["attention_mask"].unsqueeze(-1)
        hidden_states_masked = (
            outputs.last_hidden_state * attention_mask
        )  # Ignore the padding tokens
        if net_required and self.net is not None:
            feature = self.net.net(hidden_states_masked)
            return feature
        else:
            return hidden_states_masked

    def process_sents(self, sents, net_required=True):
        features = []
        for sent in sents:
            features.append(self.process(sent, net_required))
        if not features:
            return torch.tensor([])
        return torch.cat(features, dim=0)
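To tie the pieces together, a minimal end-to-end sketch of feature extraction; it assumes the repo's checkpoints are present (`./net.pt`) and downloads `roberta-base-openai-detector` on first run:

```
from utils import config, init_random_seeds, FeatureExtractor
from roberta_model_loader import RobertaModelLoader
from meta_train import net  # needs ./net.pt

config["use_gpu"] = False
config["local_model"] = ""
init_random_seeds()

extractor = FeatureExtractor(RobertaModelLoader(), net)
# raw masked hidden states: [1, 100, 768] (max_length=100, RoBERTa hidden size 768)
raw = extractor.process("A single sentence to embed for the detector.", False)
# projected deep-kernel feature: [1, 300]
deep = extractor.process("A single sentence to embed for the detector.")
print(raw.shape, deep.shape)
```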