refactored plagiarism checker
- app.py +1 -187
- plagiarism.py +340 -0
app.py
CHANGED
@@ -32,6 +32,7 @@ from utils import cos_sim_torch, embed_text
 import multiprocessing
 from functools import partial
 import concurrent.futures
+from plagiarism import plagiarism_check

 nltk.download("punkt")

@@ -50,193 +51,6 @@ from writing_analysis import (
 np.set_printoptions(suppress=True)


-def plagiarism_check(
-    plag_option,
-    input,
-    year_from,
-    month_from,
-    day_from,
-    year_to,
-    month_to,
-    day_to,
-    domains_to_skip,
-):
-    api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
-    api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
-    api_key = "AIzaSyCB61O70B8AC3l5Kk3KMoLb6DN37B7nqIk"
-    # api_key = "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg"
-    api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
-
-    cse_id = "851813e81162b4ed4"
-
-    time1 = time.perf_counter()
-    start = time.perf_counter()
-    sentences = getSentences(input)
-    urlCount = {}
-    ScoreArray = []
-    urlList = []
-
-    date_from = build_date(year_from, month_from, day_from)
-    date_to = build_date(year_to, month_to, day_to)
-    sort_date = f"date:r:{date_from}:{date_to}"
-
-    # get list of URLS to check
-    urlCount, ScoreArray = googleSearch(
-        plag_option,
-        sentences,
-        urlCount,
-        ScoreArray,
-        urlList,
-        sort_date,
-        domains_to_skip,
-        api_key,
-        cse_id,
-    )
-    print(f"Time for google search: {time.perf_counter()-time1}")
-    time1 = time.perf_counter()
-
-    print("Number of URLs: ", len(urlCount))
-    print(urlList)
-
-    # Scrape URLs in list
-    formatted_tokens = []
-    soups = asyncio.run(parallel_scrap(urlList))
-
-    print(f"Time for scraping: {time.perf_counter()-time1}")
-    time1 = time.perf_counter()
-    print(len(soups))
-    print(
-        "Successful scraping: "
-        + str(len([x for x in soups if x is not None]))
-        + "out of "
-        + str(len(urlList))
-    )
-
-    source_embeddings = []
-    for i, soup in enumerate(soups):
-        if soup:
-            page_content = soup.text
-            source_embeddings.append(embed_text(page_content))
-        else:
-            source_embeddings.append(None)
-
-    # Populate matching scores for scrapped pages
-    # for i, soup in enumerate(soups):
-    #     print(f"Analyzing {i+1} of {len(soups)} soups........................")
-    #     if soup:
-    #         page_content = soup.text
-    #         for j, sent in enumerate(sentences):
-    #             # score = matchingScore(sent, page_content)
-    #             # score = matchingScoreWithTimeout(sent, page_content)
-    #             score = cos_sim_torch(embed_text(sent), source_embeddings[i])
-    #             ScoreArray[i][j] = score
-
-    def compute_cosine_similarity(args):
-        sent, source_embedding, i, j = args
-        score = cos_sim_torch(embed_text(sent), source_embedding)
-        return i, j, score
-
-    def main(soups, sentences):
-        source_embeddings = [preprocess(soup) for soup in soups]
-        ScoreArray = [[0 for _ in sentences] for _ in soups]
-        args_list = []
-        for i, soup in enumerate(soups):
-            if soup:
-                for j, sent in enumerate(sentences):
-                    args_list.append((sent, source_embeddings[i], i, j))
-        with concurrent.futures.ProcessPoolExecutor() as executor:
-            results = executor.map(compute_cosine_similarity, args_list)
-        for i, j, score in results:
-            ScoreArray[i][j] = score
-        return ScoreArray
-
-    ScoreArray = main(soups, sentences)
-
-    print(f"Time for matching score: {time.perf_counter()-time1}")
-    time1 = time.perf_counter()
-
-    # ScoreArray = asyncio.run(parallel_analyze_2(soups, sentences, ScoreArray))
-    # print("New Score Array:\n")
-    # print2D(ScoreArray)
-
-    # Gradio formatting section
-    sentencePlag = [False] * len(sentences)
-    sentenceToMaxURL = [-1] * len(sentences)
-    for j in range(len(sentences)):
-        if j > 0:
-            maxScore = ScoreArray[sentenceToMaxURL[j - 1]][j]
-            sentenceToMaxURL[j] = sentenceToMaxURL[j - 1]
-        else:
-            maxScore = -1
-        for i in range(len(ScoreArray)):
-            margin = (
-                0.1
-                if (j > 0 and sentenceToMaxURL[j] == sentenceToMaxURL[j - 1])
-                else 0
-            )
-            if ScoreArray[i][j] - maxScore > margin:
-                maxScore = ScoreArray[i][j]
-                sentenceToMaxURL[j] = i
-        if maxScore > 0.5:
-            sentencePlag[j] = True
-
-    if (
-        (len(sentences) > 1)
-        and (sentenceToMaxURL[1] != sentenceToMaxURL[0])
-        and (
-            ScoreArray[sentenceToMaxURL[0]][0]
-            - ScoreArray[sentenceToMaxURL[1]][0]
-            < 0.1
-        )
-    ):
-        sentenceToMaxURL[0] = sentenceToMaxURL[1]
-
-    index = np.unique(sentenceToMaxURL)
-
-    urlScore = {}
-    for url in index:
-        s = [
-            ScoreArray[url][sen]
-            for sen in range(len(sentences))
-            if sentenceToMaxURL[sen] == url
-        ]
-        urlScore[url] = sum(s) / len(s)
-
-    index_descending = sorted(urlScore, key=urlScore.get, reverse=True)
-
-    urlMap = {}
-    for count, i in enumerate(index_descending):
-        urlMap[i] = count + 1
-    for i, sent in enumerate(sentences):
-        formatted_tokens.append(
-            (sent, "[" + str(urlMap[sentenceToMaxURL[i]]) + "]")
-        )
-
-    formatted_tokens.append(("\n", None))
-    formatted_tokens.append(("\n", None))
-    formatted_tokens.append(("\n", None))
-
-    print(formatted_tokens)
-    print(index_descending)
-
-    for ind in index_descending:
-        formatted_tokens.append(
-            (
-                urlList[ind]
-                + " --- Matching Score: "
-                + f"{str(round(urlScore[ind] * 100, 2))}%",
-                "[" + str(urlMap[ind]) + "]",
-            )
-        )
-        formatted_tokens.append(("\n", None))
-
-    print(f"Formatted Tokens: {formatted_tokens}")
-
-    print(f"Time for plagiarism check: {time.perf_counter()-start}")
-
-    return formatted_tokens
-
-
 """
 AI DETECTION SECTION
 """
plagiarism.py
CHANGED
@@ -0,0 +1,340 @@
+import time
+from nltk.tokenize import sent_tokenize
+from googleapiclient.discovery import build
+from collections import Counter
+import re, math
+from sentence_transformers import SentenceTransformer, util
+import asyncio
+import httpx
+from bs4 import BeautifulSoup
+import numpy as np
+
+
+WORD = re.compile(r"\w+")
+model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
+
+
+# returns cosine similarity of two vectors
+# input: two vectors
+# output: integer between 0 and 1.
+def get_cosine(vec1, vec2):
+    intersection = set(vec1.keys()) & set(vec2.keys())
+
+    # calculating numerator
+    numerator = sum([vec1[x] * vec2[x] for x in intersection])
+
+    # calculating denominator
+    sum1 = sum([vec1[x] ** 2 for x in vec1.keys()])
+    sum2 = sum([vec2[x] ** 2 for x in vec2.keys()])
+    denominator = math.sqrt(sum1) * math.sqrt(sum2)
+
+    # checking for divide by zero
+    if denominator == 0:
+        return 0.0
+    else:
+        return float(numerator) / denominator
+
+
+# converts given text into a vector
+def text_to_vector(text):
+    # uses the Regular expression above and gets all words
+    words = WORD.findall(text)
+    # returns a counter of all the words (count of number of occurences)
+    return Counter(words)
+
+
+# returns cosine similarity of two words
+# uses: text_to_vector(text) and get_cosine(v1,v2)
+def cosineSim(text1, text2):
+    vector1 = text_to_vector(text1)
+    vector2 = text_to_vector(text2)
+    # print vector1,vector2
+    cosine = get_cosine(vector1, vector2)
+    return cosine
+
+
+def cos_sim_torch(embedding_1, embedding_2):
+    return util.pytorch_cos_sim(embedding_1, embedding_2).item()
+
+
+def embed_text(text):
+    return model.encode(text, convert_to_tensor=True)
+
+
+def sentence_similarity(text1, text2):
+    embedding_1 = model.encode(text1, convert_to_tensor=True)
+    embedding_2 = model.encode(text2, convert_to_tensor=True)
+
+    o = util.pytorch_cos_sim(embedding_1, embedding_2)
+    return o.item()
+
+
+def google_search(
+    plag_option,
+    sentences,
+    urlCount,
+    scoreArray,
+    urlList,
+    sorted_date,
+    domains_to_skip,
+    api_key,
+    cse_id,
+    **kwargs,
+):
+    service = build("customsearch", "v1", developerKey=api_key)
+    for i, sentence in enumerate(sentences):
+        results = (
+            service.cse()
+            .list(q=sentence, cx=cse_id, sort=sorted_date, **kwargs)
+            .execute()
+        )
+        if "items" in results and len(results["items"]) > 0:
+            for count, link in enumerate(results["items"]):
+                # stop after 3 pages
+                if count >= 3:
+                    break
+                # skip user selected domains
+                if any(
+                    ("." + domain) in link["link"] for domain in domains_to_skip
+                ):
+                    continue
+                # clean up snippet of '...'
+                snippet = link["snippet"]
+                ind = snippet.find("...")
+                if ind < 20 and ind > 9:
+                    snippet = snippet[ind + len("... ") :]
+                ind = snippet.find("...")
+                if ind > len(snippet) - 5:
+                    snippet = snippet[:ind]
+
+                # update cosine similarity between snippet and given text
+                url = link["link"]
+                if url not in urlList:
+                    urlList.append(url)
+                    scoreArray.append([0] * len(sentences))
+                urlCount[url] = urlCount[url] + 1 if url in urlCount else 1
+                if plag_option == "Standard":
+                    scoreArray[urlList.index(url)][i] = cosineSim(
+                        sentence, snippet
+                    )
+                else:
+                    scoreArray[urlList.index(url)][i] = sentence_similarity(
+                        sentence, snippet
+                    )
+    return urlCount, scoreArray
+
+
+def split_sentence_blocks(text):
+
+    sents = sent_tokenize(text)
+    two_sents = []
+    for i in range(len(sents)):
+        if (i % 2) == 0:
+            two_sents.append(sents[i])
+        else:
+            two_sents[len(two_sents) - 1] += " " + sents[i]
+    return two_sents
+
+
+months = {
+    "January": "01",
+    "February": "02",
+    "March": "03",
+    "April": "04",
+    "May": "05",
+    "June": "06",
+    "July": "07",
+    "August": "08",
+    "September": "09",
+    "October": "10",
+    "November": "11",
+    "December": "12",
+}
+
+
+def build_date(year=2024, month="March", day=1):
+    return f"{year}{months[month]}{day}"
+
+
+async def get_url_data(url, client):
+    try:
+        r = await client.get(url)
+        # print(r.status_code)
+        if r.status_code == 200:
+            # print("in")
+            soup = BeautifulSoup(r.content, "html.parser")
+            return soup
+    except Exception:
+        return None
+
+
+def remove_punc(text):
+    res = re.sub(r"[^\w\s]", "", text)
+    return res
+
+
+def split_ngrams(text, n):
+    # return n-grams of size n
+    words = text.split()
+    return [words[i : i + n] for i in range(len(words) - n + 1)]
+
+
+async def parallel_scrap(urls):
+    async with httpx.AsyncClient(timeout=30) as client:
+        tasks = []
+        for url in urls:
+            tasks.append(get_url_data(url=url, client=client))
+        results = await asyncio.gather(*tasks, return_exceptions=True)
+    return results
+
+
+def matching_score(sentence, content):
+    sentence = remove_punc(sentence)
+    content = remove_punc(content)
+    if sentence in content:
+        return 1
+    else:
+        n = 5
+        ngrams = split_ngrams(sentence, n)
+        if len(ngrams) == 0:
+            return 0
+        matched = [x for x in ngrams if " ".join(x) in content]
+        return len(matched) / len(ngrams)
+
+
+def plagiarism_check(
+    plag_option,
+    input,
+    year_from,
+    month_from,
+    day_from,
+    year_to,
+    month_to,
+    day_to,
+    domains_to_skip,
+):
+    api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
+    api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
+    api_key = "AIzaSyCB61O70B8AC3l5Kk3KMoLb6DN37B7nqIk"
+    # api_key = "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg"
+    api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
+    cse_id = "851813e81162b4ed4"
+
+    sentences = split_sentence_blocks(input)
+    urlCount = {}
+    ScoreArray = []
+    urlList = []
+    date_from = build_date(year_from, month_from, day_from)
+    date_to = build_date(year_to, month_to, day_to)
+    sort_date = f"date:r:{date_from}:{date_to}"
+    # get list of URLS to check
+    urlCount, ScoreArray = google_search(
+        plag_option,
+        sentences,
+        urlCount,
+        ScoreArray,
+        urlList,
+        sort_date,
+        domains_to_skip,
+        api_key,
+        cse_id,
+    )
+
+    # Scrape URLs in list
+    formatted_tokens = []
+    soups = asyncio.run(parallel_scrap(urlList))
+
+    # Populate matching scores for scrapped pages
+    for i, soup in enumerate(soups):
+        print(f"Analyzing {i+1} of {len(soups)} soups........................")
+        if soup:
+            page_content = soup.text
+            for j, sent in enumerate(sentences):
+                score = matching_score(sent, page_content)
+                score = matching_score(sent, page_content)
+                # score = cos_sim_torch(embed_text(sent), source_embeddings[i])
+                ScoreArray[i][j] = score
+
+    # *****IF THIS IS TO BE USED, PLEASE PROVIDE "preprocess()" FUNCTION IN LINE 248**************
+    # source_embeddings = []
+    # for i, soup in enumerate(soups):
+    #     if soup:
+    #         page_content = soup.text
+    #         source_embeddings.append(embed_text(page_content))
+    #     else:
+    #         source_embeddings.append(None)
+
+    # def compute_cosine_similarity(args):
+    #     sent, source_embedding, i, j = args
+    #     score = cos_sim_torch(embed_text(sent), source_embedding)
+    #     return i, j, score
+
+    # def main(soups, sentences):
+    #     source_embeddings = [preprocess(soup) for soup in soups]
+    #     ScoreArray = [[0 for _ in sentences] for _ in soups]
+    #     args_list = []
+    #     for i, soup in enumerate(soups):
+    #         if soup:
+    #             for j, sent in enumerate(sentences):
+    #                 args_list.append((sent, source_embeddings[i], i, j))
+    #     with concurrent.futures.ProcessPoolExecutor() as executor:
+    #         results = executor.map(compute_cosine_similarity, args_list)
+    #     for i, j, score in results:
+    #         ScoreArray[i][j] = score
+    #     return ScoreArray
+
+    # # Populate matching scores for scrapped pages
+    # ScoreArray = main(soups, sentences)
+    # *******************************************************************************************
+
+    # Calculate URL of max matching score for each sentence chunk
+    sentenceToMaxURL = [-1] * len(sentences)
+    for j in range(len(sentences)):
+        if j > 0:
+            maxScore = ScoreArray[sentenceToMaxURL[j - 1]][j]
+            sentenceToMaxURL[j] = sentenceToMaxURL[j - 1]
+        else:
+            maxScore = -1
+
+        for i in range(len(ScoreArray)):
+            margin = (
+                0.1
+                if (j > 0 and sentenceToMaxURL[j] == sentenceToMaxURL[j - 1])
+                else 0
+            )
+            if ScoreArray[i][j] - maxScore > margin:
+                maxScore = ScoreArray[i][j]
+                sentenceToMaxURL[j] = i
+
+    index = np.unique(sentenceToMaxURL)
+
+    urlScore = {}
+    for url in index:
+        s = [
+            ScoreArray[url][sen]
+            for sen in range(len(sentences))
+            if sentenceToMaxURL[sen] == url
+        ]
+        urlScore[url] = sum(s) / len(s)
+
+    index_descending = sorted(urlScore, key=urlScore.get, reverse=True)
+
+    urlMap = {}
+    for count, i in enumerate(index_descending):
+        urlMap[i] = count + 1
+    for i, sent in enumerate(sentences):
+        formatted_tokens.append(
+            (sent, "[" + str(urlMap[sentenceToMaxURL[i]]) + "]")
+        )
+    for ind in index_descending:
+        formatted_tokens.append(
+            (
+                urlList[ind]
+                + " --- Matching Score: "
+                + f"{str(round(urlScore[ind] * 100, 2))}%",
+                "[" + str(urlMap[ind]) + "]",
+            )
+        )
+        formatted_tokens.append(("\n", None))
+
+    return formatted_tokens
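
The scoring helpers in the new module are pure functions, so their behaviour is easy to sanity-check in isolation. Below is a small illustrative exercise of cosineSim (the "Standard" snippet scorer) and matching_score (the scraped-page scorer); the strings are made-up examples rather than fixtures from the Space, and note that importing plagiarism loads the MiniLM SentenceTransformer at import time.

# Illustrative check of the two scoring paths, with assumed example strings.
from plagiarism import cosineSim, matching_score

page = "The quick brown fox jumps over the lazy dog near the river bank"
sentence = "quick brown fox jumps over the lazy dog"

# Term-frequency cosine between the two texts (used for search snippets when
# plag_option == "Standard"); prints a value between 0 and 1.
print(cosineSim(sentence, page))

# Scraped-page score: 1 if the punctuation-stripped sentence is a substring
# of the page, otherwise the fraction of its 5-grams found verbatim.
print(matching_score(sentence, page))                                # 1 (contained)
print(matching_score("quick brown fox leaps over the dogs", page))   # 0.0 (no shared 5-gram)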