Niansuh committed
Commit
349ceab
1 Parent(s): ad0b426

Update app.py

Files changed (1):
  app.py  +5 -24
app.py CHANGED
@@ -6,59 +6,40 @@ import torch
 class ModelProcessor:
     def __init__(self, repo_id="HuggingFaceTB/cosmo-1b"):
         self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
-        # Initialize the tokenizer
         self.tokenizer = AutoTokenizer.from_pretrained(repo_id, use_fast=True)
-
-        # Initialize and configure the model
         self.model = AutoModelForCausalLM.from_pretrained(
             repo_id, torch_dtype=torch.float16, device_map={"": self.device}, trust_remote_code=True
         )
-        self.model.eval()  # Set the model to evaluation mode
-
-        # Set padding token as end-of-sequence token
+        self.model.eval()
         self.tokenizer.pad_token = self.tokenizer.eos_token
 
     @torch.inference_mode()
     def process_data_and_compute_statistics(self, prompt):
-        # Tokenize the prompt and move to the device
         tokens = self.tokenizer(
             prompt, return_tensors="pt", truncation=True, max_length=512
         ).to(self.model.device)
-
-        # Get the model outputs and logits
         outputs = self.model(tokens["input_ids"])
         logits = outputs.logits
-
-        # Shift right to align with logits' prediction position
         shifted_labels = tokens["input_ids"][..., 1:].contiguous()
         shifted_logits = logits[..., :-1, :].contiguous()
-
-        # Calculate entropy
         shifted_probs = torch.softmax(shifted_logits, dim=-1)
         shifted_log_probs = torch.log_softmax(shifted_logits, dim=-1)
         entropy = -torch.sum(shifted_probs * shifted_log_probs, dim=-1).squeeze()
-
-        # Flatten the logits and labels
         logits_flat = shifted_logits.view(-1, shifted_logits.size(-1))
         labels_flat = shifted_labels.view(-1)
-
-        # Calculate the negative log-likelihood loss
         probabilities_flat = torch.softmax(logits_flat, dim=-1)
         true_class_probabilities = probabilities_flat.gather(
             1, labels_flat.unsqueeze(1)
         ).squeeze(1)
         nll = -torch.log(
             true_class_probabilities.clamp(min=1e-9)
-        )  # Clamp to prevent log(0)
-
+        )
         ranks = (
             shifted_logits.argsort(dim=-1, descending=True)
             == shifted_labels.unsqueeze(-1)
         ).nonzero()[:, -1]
-
         if entropy.clamp(max=4).median() < 2.0:
             return 1
-
         return 1 if (ranks.clamp(max=4) * nll.clamp(max=4)).mean() < 5.2 else 0
 
 processor = ModelProcessor()
@@ -67,9 +48,9 @@ processor = ModelProcessor()
 def detect(prompt):
     prediction = processor.process_data_and_compute_statistics(prompt)
     if prediction == 1:
-        return "The text is likely **generated** by a language model."
+        return "<div class='output-text'>The text is likely <b>generated</b> by a language model.</div>"
     else:
-        return "The text is likely **not generated** by a language model."
+        return "<div class='output-text'>The text is likely <b>not generated</b> by a language model.</div>"
 
 with gr.Blocks(
     css="""
@@ -118,7 +99,7 @@ with gr.Blocks(
         label="Prompt",
     )
     submit_btn = gr.Button("Submit", variant="primary")
-    output = gr.Markdown(elem_id="output-text")
+    output = gr.HTML()  # Changed to gr.HTML() to support custom HTML
 
     submit_btn.click(fn=detect, inputs=prompt, outputs=output)
 
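For reference, the statistics this script derives from the model's logits (per-token entropy, negative log-likelihood of the observed token, and the rank of the observed token among the model's predictions) can be reproduced on a toy tensor without downloading cosmo-1b. A minimal sketch, assuming only torch is installed; the random logits here stand in for real model outputs:

import torch

# Illustrative sketch (not part of the commit): toy stand-ins for model outputs.
torch.manual_seed(0)
vocab_size, seq_len = 10, 6
logits = torch.randn(1, seq_len, vocab_size)
input_ids = torch.randint(0, vocab_size, (1, seq_len))

# Shift so the logits at position t are scored against the token at t+1.
shifted_labels = input_ids[..., 1:].contiguous()
shifted_logits = logits[..., :-1, :].contiguous()

# Per-token predictive entropy: H_t = -sum_v p_t(v) * log p_t(v).
probs = torch.softmax(shifted_logits, dim=-1)
log_probs = torch.log_softmax(shifted_logits, dim=-1)
entropy = -torch.sum(probs * log_probs, dim=-1).squeeze()

# Negative log-likelihood of each observed next token (clamped to avoid log(0)).
true_probs = probs.view(-1, vocab_size).gather(
    1, shifted_labels.view(-1, 1)
).squeeze(1)
nll = -torch.log(true_probs.clamp(min=1e-9))

# Rank of each observed token in the model's sorted predictions
# (rank 0 means the observed token was the model's top-1 guess).
ranks = (
    shifted_logits.argsort(dim=-1, descending=True)
    == shifted_labels.unsqueeze(-1)
).nonzero()[:, -1]

print(entropy, nll, ranks)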
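The final verdict combines those statistics in two stages. A sketch of the same rule as a standalone function, with the heuristic thresholds (2.0 on median clamped entropy, 5.2 on mean clamped rank-times-NLL) taken directly from app.py:

# Illustrative sketch of the commit's two-stage decision rule; reuses the
# entropy / nll / ranks tensors from the snippet above.
def decide(entropy, nll, ranks):
    # Stage 1: very predictable text (low median per-token entropy) -> "generated".
    if entropy.clamp(max=4).median() < 2.0:
        return 1
    # Stage 2: low mean rank-times-surprisal -> "generated" (1), else "human" (0).
    return 1 if (ranks.clamp(max=4) * nll.clamp(max=4)).mean() < 5.2 else 0

print(decide(entropy, nll, ranks))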
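On the UI side, the commit replaces gr.Markdown(elem_id="output-text") with gr.HTML(), since detect() now returns an HTML-wrapped verdict rather than Markdown. A minimal standalone sketch of the same pattern, assuming gradio is installed; echo() is a hypothetical stand-in for detect():

import gradio as gr

def echo(prompt):
    # Hypothetical stand-in for detect(): returns an HTML string.
    return f"<div class='output-text'>You typed: <b>{prompt}</b></div>"

with gr.Blocks() as demo:
    prompt = gr.Textbox(label="Prompt")
    submit_btn = gr.Button("Submit", variant="primary")
    output = gr.HTML()  # renders the returned string as raw HTML
    submit_btn.click(fn=echo, inputs=prompt, outputs=output)

demo.launch()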