article_writer / gptzero_free.py
eljanmahammadli's picture
added RAG
03fd59b
"""
This code is a slight modification of the perplexity example by Hugging Face:
https://huggingface.co/docs/transformers/perplexity
Both this code and the original code are published under the MIT license.
by Burhan Ul tayyab and Nicholas Chua
"""
import torch
import re
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
from collections import OrderedDict
class GPT2PPL:
    """Estimate whether text is human- or AI-written from GPT-2 perplexity.

    The instance is callable: ``detector(text)`` computes the perplexity of
    the whole text and of each sentence/line, then converts the mean
    per-line perplexity into ``{"HUMAN": ..., "AI": ...}`` scores.
    """

    # Runs of ASCII letters/digits. Used both to count "valid" characters and
    # to skip lines that contain nothing worth scoring.
    _ALNUM_RE = re.compile("[a-zA-Z0-9]+")
    # Split after ". ", "? ", "! " (or the same punctuation followed by an
    # opening "[" / "("), and after every newline.
    _LINE_SPLIT_RE = re.compile(r'(?<=[.?!][ \[\(])|(?<=\n)\s*')

    def __init__(self, device="cpu", model_id="gpt2"):
        """Load the GPT-2 model and tokenizer.

        Args:
            device: Torch device the model and input tensors are moved to.
            model_id: Name or path handed to ``from_pretrained`` for both the
                model and the tokenizer.
        """
        self.device = device
        self.model_id = model_id
        self.model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
        self.tokenizer = GPT2TokenizerFast.from_pretrained(model_id)
        # Largest window (in tokens) fed to the model in one forward pass.
        self.max_length = self.model.config.n_positions
        # Step (in tokens) between the starts of consecutive windows.
        self.stride = 512

    def getResults(self, threshold):
        """Map a perplexity value to human/AI scores.

        The perplexity is rescaled linearly so that 55 maps to 0 and 85 maps
        to 1, then clamped to the range [0, 1].

        Args:
            threshold: Perplexity value to convert (despite the name, this is
                the measured perplexity, not a cut-off).

        Returns:
            ``{"HUMAN": score, "AI": 1 - score}`` with ``score`` in [0, 1].
        """
        normalized_score = (threshold - 55) / 30
        # Clamp to [0, 1].
        human_score = max(0, min(1, normalized_score))
        return {"HUMAN": human_score, "AI": 1 - human_score}

    def __call__(self, sentence):
        """Score ``sentence`` as human- or AI-written.

        The text is split into sentences/lines, the perplexity of each one is
        computed, and the mean per-line perplexity is passed to
        ``getResults``. The whole-text perplexity, the mean per-line
        perplexity and the burstiness (maximum per-line perplexity) are
        printed along the way.

        Args:
            sentence: The text to analyse.

        Returns:
            The dict produced by ``getResults``. If the text contains fewer
            than 100 ASCII alphanumeric characters, a 2-tuple
            ``({"status": message}, message)`` is returned instead; this
            shape differs from the success case and is kept for backward
            compatibility.
        """
        valid_char_count = sum(
            len(run) for run in self._ALNUM_RE.findall(sentence)
        )
        if valid_char_count < 100:
            return {"status": "Please input more text (min 100 characters)"}, "Please input more text (min 100 characters)"

        lines = [piece for piece in self._LINE_SPLIT_RE.split(sentence) if piece]

        document_ppl = self.getPPL(sentence)
        print(f"Perplexity {document_ppl}")

        line_ppls = []
        # An opening bracket split off the end of one line is carried over
        # and prepended to the next scored line.
        offset = ""
        for line in lines:
            if self._ALNUM_RE.search(line) is None:
                continue
            if offset:
                line = offset + line
                offset = ""
            # Drop a single leading newline/space if present.
            if line[0] in ("\n", " "):
                line = line[1:]
            # Drop a single trailing newline/space, or move a trailing
            # opening bracket to the next line.
            if line[-1] in ("\n", " "):
                line = line[:-1]
            elif line[-1] in ("[", "("):
                offset = line[-1]
                line = line[:-1]
            line_ppl = self._perplexity_or_none(line)
            if line_ppl is None:
                # Too few tokens to score (e.g. a one-word heading). Skip it
                # rather than abort the whole analysis.
                continue
            line_ppls.append(line_ppl)

        if not line_ppls:
            # No individual line was scorable; fall back to the whole-text
            # perplexity so the mean and max below are well defined.
            line_ppls = [document_ppl]

        mean_line_ppl = sum(line_ppls) / len(line_ppls)
        print(f"Perplexity per line {mean_line_ppl}")
        print(f"Burstiness {max(line_ppls)}")
        return self.getResults(mean_line_ppl)

    def getPPL(self, sentence):
        """Return the GPT-2 perplexity of ``sentence``, truncated to an int.

        Args:
            sentence: The text to score.

        Returns:
            The perplexity as an ``int``.

        Raises:
            ValueError: If ``sentence`` tokenizes to fewer than two tokens,
                in which case there is no next-token prediction to score.
        """
        ppl = self._perplexity_or_none(sentence)
        if ppl is None:
            raise ValueError(
                "Cannot compute perplexity: text must contain at least two tokens"
            )
        return ppl

    def _perplexity_or_none(self, text):
        """Compute the perplexity of ``text``, or ``None`` if it is too short.

        Windows of at most ``self.max_length`` tokens are scored, with window
        starts ``self.stride`` tokens apart. In each window only the tokens
        not already scored by the previous window contribute to the loss.
        """
        encodings = self.tokenizer(text, return_tensors="pt")
        seq_len = encodings.input_ids.size(1)
        if seq_len < 2:
            # With a single token there is no shifted label to predict, so the
            # loss would be NaN and int() of it would raise. With zero tokens
            # there is nothing to stack.
            return None

        nlls = []
        prev_end_loc = 0
        for begin_loc in range(0, seq_len, self.stride):
            end_loc = min(begin_loc + self.max_length, seq_len)
            # Number of tokens in this window not covered by the previous one.
            trg_len = end_loc - prev_end_loc
            input_ids = encodings.input_ids[:, begin_loc:end_loc].to(self.device)
            target_ids = input_ids.clone()
            # Mask the overlapping context tokens with -100 so they are left
            # out of the loss (per the Hugging Face perplexity guide).
            target_ids[:, :-trg_len] = -100
            with torch.no_grad():
                outputs = self.model(input_ids, labels=target_ids)
                # The loss is a mean over scored tokens; scale it back up to
                # an (approximate) summed negative log-likelihood.
                neg_log_likelihood = outputs.loss * trg_len
            nlls.append(neg_log_likelihood)
            prev_end_loc = end_loc
            if end_loc == seq_len:
                break
        return int(torch.exp(torch.stack(nlls).sum() / end_loc))