Spaces:

polygraf-ai
/

copyright_checker

Running

App Files Files Community

copyright_checker / plagiarism.py

aliasgerovs

Update plagiarism.py

2bc427a 10 months ago

raw

history blame

9.54 kB

	import time
	from nltk.tokenize import sent_tokenize
	from googleapiclient.discovery import build
	from collections import Counter
	import re, math
	from sentence_transformers import SentenceTransformer, util
	import asyncio
	import httpx
	from bs4 import BeautifulSoup
	import numpy as np
	import concurrent
	from multiprocessing import Pool


	# Matches runs of word characters; text_to_vector uses it to tokenize text.
	WORD = re.compile(r"\w+")
	# Sentence-embedding model created once at module level and shared by
	# embed_text and sentence_similarity.
	model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")


	def get_cosine(vec1, vec2):
	    """Return the cosine similarity of two sparse term-weight vectors.

	    Both arguments are mappings from term to numeric weight (for example the
	    ``Counter`` objects produced by ``text_to_vector``). The result is a
	    float; 0.0 is returned when either vector has zero magnitude so that the
	    division below can never divide by zero.
	    """
	    shared_terms = set(vec1.keys()) & set(vec2.keys())

	    # Dot product only needs the terms present in both vectors.
	    dot_product = sum(vec1[term] * vec2[term] for term in shared_terms)

	    # Product of the two Euclidean norms.
	    norm1 = math.sqrt(sum(weight ** 2 for weight in vec1.values()))
	    norm2 = math.sqrt(sum(weight ** 2 for weight in vec2.values()))
	    magnitude = norm1 * norm2

	    if magnitude == 0:
	        return 0.0
	    return float(dot_product) / magnitude


	def text_to_vector(text):
	    """Turn *text* into a bag-of-words count vector.

	    Tokens are the matches of the module-level ``WORD`` pattern; the returned
	    ``Counter`` maps each token to the number of times it occurs in *text*.
	    """
	    return Counter(WORD.findall(text))


	def cosineSim(text1, text2):
	    """Return the bag-of-words cosine similarity between two texts.

	    Each text is converted with ``text_to_vector`` and the two resulting
	    count vectors are compared with ``get_cosine``.
	    """
	    return get_cosine(text_to_vector(text1), text_to_vector(text2))


	def cos_sim_torch(embedding_1, embedding_2):
	    """Return the cosine similarity of two embeddings as a plain number.

	    The value comes from ``util.pytorch_cos_sim`` and is unwrapped with
	    ``.item()``, so presumably each argument is a single embedding rather
	    than a batch -- confirm against callers.
	    """
	    similarity = util.pytorch_cos_sim(embedding_1, embedding_2)
	    return similarity.item()


	def embed_text(text):
	    """Encode *text* with the shared module-level ``model``.

	    ``convert_to_tensor=True`` is passed to ``model.encode`` so the embedding
	    is requested as a tensor.
	    """
	    embedding = model.encode(text, convert_to_tensor=True)
	    return embedding


	def sentence_similarity(text1, text2):
	    """Return the embedding-based cosine similarity of two texts.

	    Both texts are encoded with the shared ``model`` (first *text1*, then
	    *text2*) and compared with ``util.pytorch_cos_sim``; the result is
	    unwrapped to a plain number with ``.item()``.
	    """
	    first_embedding, second_embedding = (
	        model.encode(passage, convert_to_tensor=True)
	        for passage in (text1, text2)
	    )
	    return util.pytorch_cos_sim(first_embedding, second_embedding).item()


	def _clean_snippet(snippet):
	    """Strip a short leading prefix and a trailing ellipsis from a snippet.

	    If the first "..." starts at index 10-19, everything up to and including
	    the ellipsis and the character after it is dropped (presumably the date
	    prefix the search engine prepends -- confirm). If the first remaining
	    "..." begins within the last four characters, it and what follows are
	    cut off.
	    """
	    lead = snippet.find("...")
	    if 9 < lead < 20:
	        snippet = snippet[lead + len("... ") :]
	    tail = snippet.find("...")
	    # Require tail != -1: otherwise find()'s "not found" value passes the
	    # comparison for snippets shorter than four characters and the slice
	    # below would silently chop their last character.
	    if tail != -1 and tail > len(snippet) - 5:
	        snippet = snippet[:tail]
	    return snippet


	def google_search(
	    plag_option,
	    sentences,
	    urlCount,
	    scoreArray,
	    urlList,
	    sorted_date,
	    domains_to_skip,
	    api_key,
	    cse_id,
	    **kwargs,
	):
	    """Search Google Custom Search for each sentence and score the hits.

	    For every entry of *sentences* one query is issued; at most the first
	    four results of each query are considered.

	    Args:
	        plag_option: "Standard" scores with ``cosineSim``; any other value
	            scores with ``sentence_similarity``.
	        sentences: texts to search for, one query per entry.
	        urlCount: dict mapping URL to the number of times it was returned;
	            updated in place.
	        scoreArray: list of rows, one per URL in *urlList*, each holding one
	            score per sentence; updated in place.
	        urlList: list of URLs seen so far, parallel to *scoreArray*; updated
	            in place.
	        sorted_date: value forwarded as the ``sort`` parameter of the query.
	        domains_to_skip: results whose link contains "." + domain for any of
	            these are ignored.
	        api_key: developer key used to build the search service.
	        cse_id: search engine id passed as ``cx``.
	        **kwargs: forwarded unchanged to ``service.cse().list``.

	    Returns:
	        The ``(urlCount, scoreArray)`` pair (the same objects passed in).
	    """
	    max_results_per_query = 4
	    similarity = cosineSim if plag_option == "Standard" else sentence_similarity

	    service = build("customsearch", "v1", developerKey=api_key)
	    for i, sentence in enumerate(sentences):
	        results = (
	            service.cse()
	            .list(q=sentence, cx=cse_id, sort=sorted_date, **kwargs)
	            .execute()
	        )
	        # "items" is absent when the query produced no hits.
	        items = results.get("items") or []
	        # Only the first four results are considered; skipped domains still
	        # count towards that limit.
	        for link in items[:max_results_per_query]:
	            url = link["link"]
	            # Skip user-selected domains.
	            if any(("." + domain) in url for domain in domains_to_skip):
	                continue

	            # Not every result carries a snippet; treat a missing one as
	            # empty text instead of raising KeyError.
	            snippet = _clean_snippet(link.get("snippet", ""))

	            # Register the URL on first sight, keeping urlList and
	            # scoreArray aligned row for row.
	            if url not in urlList:
	                urlList.append(url)
	                scoreArray.append([0] * len(sentences))
	            urlCount[url] = urlCount.get(url, 0) + 1
	            scoreArray[urlList.index(url)][i] = similarity(sentence, snippet)
	    return urlCount, scoreArray


	def split_sentence_blocks(text, block_size=4):
	    """Split *text* into blocks of consecutive sentences.

	    The text is tokenized with ``sent_tokenize`` and every run of
	    *block_size* sentences is joined with single spaces into one string; the
	    last block may hold fewer sentences.

	    Args:
	        text: the text to split.
	        block_size: number of sentences per block (default 4, the value that
	            was previously hard-coded).

	    Returns:
	        A list of strings, one per block; empty when no sentence is found.

	    Raises:
	        ValueError: if *block_size* is smaller than 1.
	    """
	    if block_size < 1:
	        raise ValueError("block_size must be a positive integer")

	    sents = sent_tokenize(text)
	    return [
	        " ".join(sents[start : start + block_size])
	        for start in range(0, len(sents), block_size)
	    ]


	# Maps English month names to their zero-padded two-digit month number.
	months = {
	    "January": "01",
	    "February": "02",
	    "March": "03",
	    "April": "04",
	    "May": "05",
	    "June": "06",
	    "July": "07",
	    "August": "08",
	    "September": "09",
	    "October": "10",
	    "November": "11",
	    "December": "12",
	}


	def build_date(year=2024, month="March", day=1):
	    """Return the date as a compact ``YYYYMMDD`` string.

	    Args:
	        year: the year, inserted as given.
	        month: English month name; must be a key of ``months``.
	        day: day of the month; converted with ``int`` and zero-padded to two
	            digits so that single-digit days do not produce a seven-digit,
	            ambiguous date.

	    Raises:
	        KeyError: if *month* is not a key of ``months``.
	        ValueError: if *day* cannot be converted to an integer.
	    """
	    return f"{year}{months[month]}{int(day):02d}"


	async def get_url_data(url, client):
	    """Fetch *url* with *client* and parse the response body, best-effort.

	    Returns a ``BeautifulSoup`` tree built with ``html.parser`` when the
	    response status is 200. Any other status, and any exception raised
	    while fetching or parsing, yields ``None`` so that one bad URL never
	    aborts the surrounding batch.
	    """
	    try:
	        response = await client.get(url)
	        if response.status_code != 200:
	            return None
	        return BeautifulSoup(response.content, "html.parser")
	    except Exception:
	        return None


	def remove_punc(text):
	    """Return *text* without characters that are neither word nor whitespace."""
	    return re.sub(r"[^\w\s]", "", text)


	def split_ngrams(text, n):
	    """Return every run of *n* consecutive whitespace-separated words.

	    Each n-gram is a list of words; the result is empty when *text* holds
	    fewer than *n* words.
	    """
	    tokens = text.split()
	    window_count = len(tokens) - n + 1
	    grams = []
	    for start in range(window_count):
	        grams.append(tokens[start : start + n])
	    return grams


	async def parallel_scrap(urls):
	    """Fetch all *urls* concurrently and return one result per URL, in order.

	    A single ``httpx.AsyncClient`` with a 30 second timeout is shared by all
	    requests. Each entry is whatever ``get_url_data`` produced for that URL;
	    because ``gather`` is called with ``return_exceptions=True`` an entry may
	    also be an exception object instead of being raised here.
	    """
	    async with httpx.AsyncClient(timeout=30) as client:
	        fetches = [get_url_data(url=url, client=client) for url in urls]
	        return await asyncio.gather(*fetches, return_exceptions=True)



	def matching_score(sentence_content_tuple):
	    """Score how much of a sentence appears verbatim in a page's text.

	    *sentence_content_tuple* is a ``(sentence, content)`` pair (a single
	    argument so the function can be used with ``Pool.map``). Returns 1 when
	    the whole sentence is a substring of the content; otherwise the fraction
	    of the sentence's 5-word n-grams that occur in the content, or 0 when the
	    sentence is too short to form any 5-gram.
	    """
	    sentence, content = sentence_content_tuple
	    if sentence in content:
	        return 1

	    ngram_size = 5
	    ngrams = split_ngrams(sentence, ngram_size)
	    if not ngrams:
	        return 0
	    hits = sum(1 for gram in ngrams if " ".join(gram) in content)
	    return hits / len(ngrams)

	def process_with_multiprocessing(input_data):
	    """Apply ``matching_score`` to every item of *input_data* in a process pool.

	    A pool of four worker processes is created for the duration of the call.
	    The list returned by ``Pool.map`` is passed straight back to the caller.
	    """
	    with Pool(processes=4) as worker_pool:
	        return worker_pool.map(matching_score, input_data)

	def plagiarism_check(
	    plag_option,
	    input,
	    year_from,
	    month_from,
	    day_from,
	    year_to,
	    month_to,
	    day_to,
	    domains_to_skip,
	):
	    """Find likely web sources for *input* and attribute each block to one.

	    The text is split into sentence blocks, each block is searched with
	    ``google_search`` (restricted to the given date range), the resulting
	    URLs are scraped and every block is scored against every scraped page
	    with ``matching_score``. Each block is then attributed to its
	    best-scoring URL, with a 0.1 bonus for staying on the URL chosen for the
	    previous block.

	    Args:
	        plag_option: scoring mode forwarded to ``google_search``.
	        input: the text to check (name kept for caller compatibility even
	            though it shadows the builtin).
	        year_from, month_from, day_from: start of the date range, in the
	            form accepted by ``build_date``.
	        year_to, month_to, day_to: end of the date range.
	        domains_to_skip: domains whose results are ignored.

	    Returns:
	        A list for display: one ``(block, "[rank]")`` tuple per sentence
	        block, a bare ``"\n\n\n"`` separator, then for every attributed URL a
	        ``("<url> --- Matching Score: <pct>%", "[rank]")`` tuple followed by
	        ``("\n", None)``. When the search yields no URL at all, the blocks
	        are returned as ``(block, None)`` tuples only.
	    """
	    import os  # local import: only the credential lookup needs it

	    # NOTE(security): credentials were hard-coded in this function. They are
	    # now taken from the environment first; the previously effective values
	    # remain only as a fallback so existing deployments keep working. The
	    # key has been published with the source and should be rotated and the
	    # fallback removed.
	    api_key = (
	        os.environ.get("GOOGLE_API_KEY")
	        or "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
	    )
	    cse_id = os.environ.get("GOOGLE_CSE_ID") or "851813e81162b4ed4"

	    sentences = split_sentence_blocks(input)
	    urlCount = {}
	    ScoreArray = []
	    urlList = []
	    date_from = build_date(year_from, month_from, day_from)
	    date_to = build_date(year_to, month_to, day_to)
	    sort_date = f"date:r:{date_from}:{date_to}"

	    # Get the list of URLs to check (also fills ScoreArray with snippet
	    # scores, one row per URL in urlList).
	    urlCount, ScoreArray = google_search(
	        plag_option,
	        sentences,
	        urlCount,
	        ScoreArray,
	        urlList,
	        sort_date,
	        domains_to_skip,
	        api_key,
	        cse_id,
	    )

	    # Without any candidate URL there is nothing to attribute; returning
	    # here avoids indexing into the empty ScoreArray below.
	    if not ScoreArray:
	        return [(sent, None) for sent in sentences]

	    # Scrape the URLs; soups[i] belongs to urlList[i].
	    soups = asyncio.run(parallel_scrap(urlList))

	    # Keep only pages that were actually fetched. parallel_scrap may hand
	    # back exception objects, which are truthy but have no usable text.
	    scraped = [
	        (i, soup.text)
	        for i, soup in enumerate(soups)
	        if soup and not isinstance(soup, BaseException)
	    ]

	    # One (block, page text) job per scraped page and sentence block, laid
	    # out page by page so the flat result can be sliced back into rows.
	    input_data = [
	        (sent, page_content) for _, page_content in scraped for sent in sentences
	    ]
	    scores = process_with_multiprocessing(input_data) if input_data else []
	    row_len = len(sentences)
	    for k, (i, _) in enumerate(scraped):
	        # Page-content scores replace the snippet scores for scraped pages.
	        ScoreArray[i][:] = scores[k * row_len : (k + 1) * row_len]

	    # Attribute every block to the index of its best URL.
	    sentenceToMaxURL = [-1] * len(sentences)
	    for j in range(len(sentences)):
	        if j > 0:
	            # Start from the previous block's URL so consecutive blocks tend
	            # to share a source.
	            maxScore = ScoreArray[sentenceToMaxURL[j - 1]][j]
	            sentenceToMaxURL[j] = sentenceToMaxURL[j - 1]
	        else:
	            maxScore = -1

	        for i in range(len(ScoreArray)):
	            # Another URL must beat the inherited one by more than 0.1; once
	            # a different URL has taken over, any improvement wins.
	            margin = (
	                0.1
	                if (j > 0 and sentenceToMaxURL[j] == sentenceToMaxURL[j - 1])
	                else 0
	            )
	            if ScoreArray[i][j] - maxScore > margin:
	                maxScore = ScoreArray[i][j]
	                sentenceToMaxURL[j] = i

	    # Average score of each attributed URL over the blocks assigned to it.
	    # Iterating in ascending index order keeps ties in the same order as
	    # before when the URLs are ranked.
	    urlScore = {}
	    for url in sorted(set(sentenceToMaxURL)):
	        s = [
	            ScoreArray[url][sen]
	            for sen, best in enumerate(sentenceToMaxURL)
	            if best == url
	        ]
	        urlScore[url] = sum(s) / len(s)

	    index_descending = sorted(urlScore, key=urlScore.get, reverse=True)

	    # Rank 1 is the URL with the highest average score.
	    urlMap = {url: rank for rank, url in enumerate(index_descending, start=1)}

	    formatted_tokens = [
	        (sent, "[" + str(urlMap[best]) + "]")
	        for sent, best in zip(sentences, sentenceToMaxURL)
	    ]

	    # NOTE(review): this separator is a bare string while every other entry
	    # is a tuple; kept as-is because callers may depend on it -- confirm.
	    formatted_tokens.append("\n\n\n")

	    for ind in index_descending:
	        formatted_tokens.append(
	            (
	                urlList[ind]
	                + " --- Matching Score: "
	                + f"{str(round(urlScore[ind] * 100, 2))}%",
	                "[" + str(urlMap[ind]) + "]",
	            )
	        )
	        formatted_tokens.append(("\n", None))

	    return formatted_tokens