Spaces:

polygraf-ai
/

article_writer

Runtime error

App Files Files Community

article_writer / gptzero_free.py

eljanmahammadli

added RAG

03fd59b 8 months ago

raw

history blame contribute delete

4.55 kB

	"""
	This code is a slight modification of the perplexity example by Hugging Face
	https://huggingface.co/docs/transformers/perplexity

	Both this code and the original code are published under the MIT license.

	by Burhan Ul tayyab and Nicholas Chua
	"""

	import torch
	import re
	from transformers import GPT2LMHeadModel, GPT2TokenizerFast
	from collections import OrderedDict


	class GPT2PPL:
	    """Score text as HUMAN vs AI from GPT-2 perplexity.

	    The instance is callable: ``GPT2PPL()(text)`` computes the perplexity of
	    the whole text and of each sentence/line, then maps the average
	    per-line perplexity onto a HUMAN/AI score pair via ``getResults``.
	    """

	    def __init__(self, device="cpu", model_id="gpt2"):
	        """Load the language model and its tokenizer.

	        Args:
	            device: Torch device string the model is moved to.
	            model_id: Identifier handed to ``from_pretrained`` for both the
	                model and the tokenizer.
	        """
	        self.device = device
	        self.model_id = model_id
	        self.model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
	        self.tokenizer = GPT2TokenizerFast.from_pretrained(model_id)

	        # Sliding-window parameters used by getPPL: each window holds at most
	        # `max_length` tokens and consecutive windows start `stride` apart.
	        self.max_length = self.model.config.n_positions
	        self.stride = 512

	    def getResults(self, threshold):
	        """Convert a perplexity value into HUMAN / AI scores.

	        The value is rescaled linearly so that 55 maps to 0 and 85 maps to 1,
	        then clamped to the range 0..1.

	        Args:
	            threshold: Perplexity value to convert (``__call__`` passes the
	                average per-line perplexity).

	        Returns:
	            ``{"HUMAN": score, "AI": 1 - score}`` with ``score`` in 0..1.
	        """
	        normalized_score = (threshold - 55) / 30
	        # Clamp the score to the closed interval [0, 1].
	        human_score = max(0, min(1, normalized_score))
	        return {"HUMAN": human_score, "AI": 1 - human_score}

	    @staticmethod
	    def _split_lines(text):
	        """Split text after sentence-ending punctuation and after newlines."""
	        # First alternative: an empty split point right after ". ", ".[" or
	        # ".(" (likewise for "?" and "!"). Second alternative: just after a
	        # newline, swallowing any whitespace that follows it.
	        pieces = re.split(r'(?<=[.?!][ \[\(])|(?<=\n)\s*', text)
	        return [piece for piece in pieces if piece]

	    @staticmethod
	    def _clean_lines(lines):
	        """Drop lines without alphanumerics and trim one edge character."""
	        cleaned = []
	        carry = ""  # opening bracket cut from the previous line's end
	        for line in lines:
	            if re.search("[a-zA-Z0-9]+", line) is None:
	                continue
	            if carry:
	                # Give the bracket back to the sentence it actually opens.
	                line = carry + line
	                carry = ""
	            # Remove a single leading newline/space if present.
	            if line[0] in "\n ":
	                line = line[1:]
	            if line[-1] in "\n ":
	                line = line[:-1]
	            elif line[-1] in "[(":
	                # The split point sits after ".(" / ".[", so the bracket
	                # belongs to the next line.
	                carry = line[-1]
	                line = line[:-1]
	            cleaned.append(line)
	        return cleaned

	    def __call__(self, sentence):
	        """Score a text as human- or AI-written.

	        Computes the perplexity of the whole text, the average perplexity of
	        its individual lines and the maximum per-line perplexity
	        ("Burstiness"), printing each value. The average per-line perplexity
	        is then converted with ``getResults``.

	        Args:
	            sentence: The text to analyse.

	        Returns:
	            The dict produced by ``getResults``. If the text contains fewer
	            than 100 ASCII letters/digits, a ``(status_dict, message)`` tuple
	            asking for more text is returned instead.
	        """
	        results = OrderedDict()

	        # Count only ASCII letters and digits towards the minimum length.
	        total_valid_char = sum(
	            len(run) for run in re.findall("[a-zA-Z0-9]+", sentence)
	        )
	        if total_valid_char < 100:
	            message = "Please input more text (min 100 characters)"
	            return {"status": message}, message

	        ppl = self.getPPL(sentence)
	        print(f"Perplexity {ppl}")
	        results["Perplexity"] = ppl

	        perplexity_per_line = []
	        for line in self._clean_lines(self._split_lines(sentence)):
	            try:
	                line_ppl = self.getPPL(line)
	            except ValueError:
	                # Line is too short (fewer than two tokens) to be scored.
	                continue
	            perplexity_per_line.append(line_ppl)
	        if not perplexity_per_line:
	            # No individual line could be scored; use the whole-text value
	            # so the average and maximum below stay defined.
	            perplexity_per_line.append(ppl)

	        average_ppl = sum(perplexity_per_line) / len(perplexity_per_line)
	        print(f"Perplexity per line {average_ppl}")
	        results["Perplexity per line"] = average_ppl

	        burstiness = max(perplexity_per_line)
	        print(f"Burstiness {burstiness}")
	        results["Burstiness"] = burstiness

	        return self.getResults(results["Perplexity per line"])

	    def getPPL(self, sentence):
	        """Compute the perplexity of a text with a strided sliding window.

	        Args:
	            sentence: Text to score.

	        Returns:
	            The perplexity truncated to an ``int``.

	        Raises:
	            ValueError: If the text tokenizes to fewer than two tokens, in
	                which case there is no next-token target to score.
	        """
	        encodings = self.tokenizer(sentence, return_tensors="pt")
	        seq_len = encodings.input_ids.size(1)
	        if seq_len < 2:
	            raise ValueError(
	                "at least two tokens are required to compute perplexity"
	            )

	        nlls = []
	        prev_end_loc = 0
	        for begin_loc in range(0, seq_len, self.stride):
	            end_loc = min(begin_loc + self.max_length, seq_len)
	            # Number of tokens in this window not covered by the previous one.
	            trg_len = end_loc - prev_end_loc
	            input_ids = encodings.input_ids[:, begin_loc:end_loc].to(self.device)
	            target_ids = input_ids.clone()
	            # Label -100 excludes the overlapping context tokens from the loss.
	            target_ids[:, :-trg_len] = -100

	            with torch.no_grad():
	                outputs = self.model(input_ids, labels=target_ids)
	                # Scale the window loss by the number of new tokens.
	                neg_log_likelihood = outputs.loss * trg_len

	            nlls.append(neg_log_likelihood)

	            prev_end_loc = end_loc
	            if end_loc == seq_len:
	                break
	        return int(torch.exp(torch.stack(nlls).sum() / end_loc))