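"""Gradio demo that flags whether a text was likely generated by a language model.

Per-token statistics (entropy, negative log-likelihood, and rank of the true
token) are computed with the HuggingFaceTB/cosmo-1b causal LM and combined
into a simple threshold-based heuristic.
"""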
import spaces
import gradio as gr

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch


class ModelProcessor:
    def __init__(self, repo_id="HuggingFaceTB/cosmo-1b"):
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        # Initialize the tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(repo_id, use_fast=True)

        # Initialize and configure the model
        self.model = AutoModelForCausalLM.from_pretrained(
            repo_id,
            torch_dtype=torch.float16,
            device_map={"": self.device},
            trust_remote_code=True,
        )
        self.model.eval()  # Set the model to evaluation mode

        # Set padding token as end-of-sequence token
        self.tokenizer.pad_token = self.tokenizer.eos_token

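    # Run the forward pass without building autograd graphs (faster, less memory)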
    @torch.inference_mode()
    def process_data_and_compute_statistics(self, prompt):
        # Tokenize the prompt and move to the device
        tokens = self.tokenizer(
            prompt, return_tensors="pt", truncation=True, max_length=512
        ).to(self.model.device)

        # Get the model outputs and logits
        outputs = self.model(**tokens)
        logits = outputs.logits

        # Shift right to align with logits' prediction position
        shifted_labels = tokens["input_ids"][..., 1:].contiguous()
        shifted_logits = logits[..., :-1, :].contiguous()

        # Calculate entropy
        shifted_probs = torch.softmax(shifted_logits, dim=-1)
        shifted_log_probs = torch.log_softmax(shifted_logits, dim=-1)
        entropy = -torch.sum(shifted_probs * shifted_log_probs, dim=-1).squeeze()

        # Flatten the logits and labels
        logits_flat = shifted_logits.view(-1, shifted_logits.size(-1))
        labels_flat = shifted_labels.view(-1)

        # Calculate the negative log-likelihood loss
        probabilities_flat = torch.softmax(logits_flat, dim=-1)
        true_class_probabilities = probabilities_flat.gather(
            1, labels_flat.unsqueeze(1)
        ).squeeze(1)
        nll = -torch.log(
            true_class_probabilities.clamp(min=1e-9)
        )  # Clamp to prevent log(0)

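        # Rank of the true token within the descending-sorted logits
        # (rank 0 means the model's top prediction matched the actual token)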
        ranks = (
            shifted_logits.argsort(dim=-1, descending=True)
            == shifted_labels.unsqueeze(-1)
        ).nonzero()[:, -1]

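        # Heuristic decision: low median entropy, or a low combined
        # rank-times-NLL score, is treated as machine-generated (returns 1)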
        if entropy.clamp(max=4).median() < 2.0:
            return 1

        return 1 if (ranks.clamp(max=4) * nll.clamp(max=4)).mean() < 5.2 else 0


processor = ModelProcessor()

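# ZeroGPU: request a GPU for up to 180 seconds per call on Hugging Face Spaces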
@spaces.GPU(duration=180)
def detect(prompt):
    prediction = processor.process_data_and_compute_statistics(prompt)
    if prediction == 1:
        return "The text is likely **generated** by a language model."
    else:
        return "The text is likely **not generated** by a language model."


with gr.Blocks(
    css="""
    .gradio-container {
        max-width: 800px;
        margin: 0 auto;
    }
    .gr-box {
        box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
        padding: 20px;
        border-radius: 4px;
    }
    .gr-button {
        background-color: #007bff;
        color: white;
        padding: 10px 20px;
        border-radius: 4px;
    }
    .gr-button:hover {
        background-color: #0056b3; /* assumption: darker shade of the base #007bff */
    }
    .hyperlinks a {
        margin-right: 10px;
    }
"""
) as demo:
    with gr.Row():
        with gr.Column(scale=3):
            gr.Markdown("# ENTELL Model Detection")
            gr.Markdown("Please visit my website for better detection quality [svenska-detektor.se](https://svenska-detektor.se)")
        with gr.Column(scale=1):
            gr.HTML(
                """
            <p>
            <a href="" target="_blank">paper</a>
                
            <a href="" target="_blank">code</a>
                
            <a href="mailto:mohamad.jaallouk@gmail.com" target="_blank">contact</a>
            """,
                elem_classes="hyperlinks",
            )
    with gr.Row():
        with gr.Column():
            prompt = gr.Textbox(
                lines=8,
                placeholder="Type your prompt here...",
                label="Prompt",
            )
            submit_btn = gr.Button("Submit", variant="primary")
            output = gr.Markdown()

    submit_btn.click(fn=detect, inputs=prompt, outputs=output)

demo.launch()