aliasgerovs committed
Commit 1be431a
Parent(s): 9d99259
Updated
Files changed:
- app.py +401 -0
- requirements.txt +19 -0
- utils.py +250 -0
app.py
ADDED
@@ -0,0 +1,401 @@
from utils import cosineSim, googleSearch, getSentences, parallel_scrap, matchingScore
import gradio as gr
from urllib.request import urlopen, Request
from googleapiclient.discovery import build
import requests
import httpx
import re
from bs4 import BeautifulSoup
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import asyncio
from scipy.special import softmax
from evaluate import load
from datetime import date
import nltk

np.set_printoptions(suppress=True)


def plagiarism_check(
    input,
    year_from,
    month_from,
    day_from,
    year_to,
    month_to,
    day_to,
    domains_to_skip,
):
    api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
    api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
    api_key = "AIzaSyCB61O70B8AC3l5Kk3KMoLb6DN37B7nqIk"
    api_key = "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg"
    # api_key = "AIzaSyBrx_pgb6A64wPFQXSGQRgGtukoxVV_0Fk"
    cse_id = "851813e81162b4ed4"

    sentences = getSentences(input)
    urlCount = {}
    ScoreArray = []
    urlList = []

    date_from = build_date(year_from, month_from, day_from)
    date_to = build_date(year_to, month_to, day_to)
    sort_date = f"date:r:{date_from}:{date_to}"
    # get list of URLs to check
    urlCount, ScoreArray = googleSearch(
        sentences,
        urlCount,
        ScoreArray,
        urlList,
        sort_date,
        domains_to_skip,
        api_key,
        cse_id,
    )
    print("Number of URLs: ", len(urlCount))
    # print("Old Score Array:\n")
    # print2D(ScoreArray)

    # Scrape URLs in list
    formatted_tokens = []
    soups = asyncio.run(parallel_scrap(urlList))
    print(len(soups))
    print(
        "Successful scraping: "
        + str(len([x for x in soups if x is not None]))
        + " out of "
        + str(len(urlList))
    )

    # Populate matching scores for scraped pages
    for i, soup in enumerate(soups):
        print(f"Analyzing {i+1} of {len(soups)} soups........................")
        if soup:
            page_content = soup.text
            for j, sent in enumerate(sentences):
                score = matchingScore(sent, page_content)
                ScoreArray[i][j] = score

    # ScoreArray = asyncio.run(parallel_analyze_2(soups, sentences, ScoreArray))
    # print("New Score Array:\n")
    # print2D(ScoreArray)

    # Gradio formatting section
    sentencePlag = [False] * len(sentences)
    sentenceToMaxURL = [-1] * len(sentences)
    for j in range(len(sentences)):
        if j > 0:
            maxScore = ScoreArray[sentenceToMaxURL[j - 1]][j]
            sentenceToMaxURL[j] = sentenceToMaxURL[j - 1]
        else:
            maxScore = -1
        for i in range(len(ScoreArray)):
            margin = (
                0.1
                if (j > 0 and sentenceToMaxURL[j] == sentenceToMaxURL[j - 1])
                else 0
            )
            if ScoreArray[i][j] - maxScore > margin:
                maxScore = ScoreArray[i][j]
                sentenceToMaxURL[j] = i
        if maxScore > 0.5:
            sentencePlag[j] = True

    if (
        (len(sentences) > 1)
        and (sentenceToMaxURL[1] != sentenceToMaxURL[0])
        and (
            ScoreArray[sentenceToMaxURL[0]][0]
            - ScoreArray[sentenceToMaxURL[1]][0]
            < 0.1
        )
    ):
        sentenceToMaxURL[0] = sentenceToMaxURL[1]

    index = np.unique(sentenceToMaxURL)

    urlMap = {}
    for count, i in enumerate(index):
        urlMap[i] = count + 1
    for i, sent in enumerate(sentences):
        formatted_tokens.append(
            (sent, "[" + str(urlMap[sentenceToMaxURL[i]]) + "]")
        )

    formatted_tokens.append(("\n", None))
    formatted_tokens.append(("\n", None))
    formatted_tokens.append(("\n", None))

    urlScore = {}
    for url in index:
        s = [
            ScoreArray[url][sen]
            for sen in range(len(sentences))
            if sentenceToMaxURL[sen] == url
        ]
        urlScore[url] = sum(s) / len(s)

    for ind in index:
        formatted_tokens.append(
            (
                urlList[ind] + " --- Matching Score: " + str(urlScore[ind]),
                "[" + str(urlMap[ind]) + "]",
            )
        )
        formatted_tokens.append(("\n", None))

    print(f"Formatted Tokens: {formatted_tokens}")

    return formatted_tokens


"""
AI DETECTION SECTION
"""

text_bc_model_path = "polygraf-ai/ai-text-bc-bert-1-4m"
text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)
text_bc_model = AutoModelForSequenceClassification.from_pretrained(
    text_bc_model_path
)

text_mc_model_path = "polygraf-ai/ai-text-mc-v5-lighter-spec"
text_mc_tokenizer = AutoTokenizer.from_pretrained(text_mc_model_path)
text_mc_model = AutoModelForSequenceClassification.from_pretrained(
    text_mc_model_path
)


def remove_special_characters(text):
    cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return cleaned_text


def predict_bc(model, tokenizer, text):
    tokens = tokenizer(
        text, padding=True, truncation=True, return_tensors="pt"
    )["input_ids"]
    output = model(tokens)
    output_norm = softmax(output.logits.detach().numpy(), 1)[0]
    print("BC Score: ", output_norm)
    bc_score = {"AI": output_norm[1].item(), "HUMAN": output_norm[0].item()}
    return bc_score


def predict_mc(model, tokenizer, text):
    tokens = tokenizer(
        text, padding=True, truncation=True, return_tensors="pt"
    )["input_ids"]
    output = model(tokens)
    output_norm = softmax(output.logits.detach().numpy(), 1)[0]
    print("MC Score: ", output_norm)
    mc_score = {}
    label_map = ["GPT 3.5", "GPT 4", "CLAUDE", "BARD", "LLAMA 2"]
    for score, label in zip(output_norm, label_map):
        mc_score[label.upper()] = score.item()
    return mc_score


def ai_generated_test(input, models):
    cleaned_text = remove_special_characters(input)
    bc_score = predict_bc(text_bc_model, text_bc_tokenizer, cleaned_text)
    mc_score = predict_mc(text_mc_model, text_mc_tokenizer, cleaned_text)

    sum_prob = 1 - bc_score["HUMAN"]
    for key, value in mc_score.items():
        mc_score[key] = value * sum_prob

    return bc_score, mc_score


# COMBINED
def main(
    input,
    models,
    year_from,
    month_from,
    day_from,
    year_to,
    month_to,
    day_to,
    domains_to_skip,
):
    bc_score, mc_score = ai_generated_test(input, models)
    formatted_tokens = plagiarism_check(
        input,
        year_from,
        month_from,
        day_from,
        year_to,
        month_to,
        day_to,
        domains_to_skip,
    )
    return (
        bc_score,
        mc_score,
        formatted_tokens,
    )


def build_date(year, month, day):
    return f"{year}{months[month]}{day}"


# START OF GRADIO

title = "Plagiarism Demo"
months = {
    "January": "01",
    "February": "02",
    "March": "03",
    "April": "04",
    "May": "05",
    "June": "06",
    "July": "07",
    "August": "08",
    "September": "09",
    "October": "10",
    "November": "11",
    "December": "12",
}


with gr.Blocks() as demo:
    today = date.today()
    # dd/mm/YY
    d1 = today.strftime("%d/%B/%Y")
    d1 = d1.split("/")

    model_list = ["GPT 3.5", "GPT 4", "CLAUDE", "BARD", "LLAMA2"]
    domain_list = ["com", "org", "net", "int", "edu", "gov", "mil"]
    gr.Markdown(
        """
        # Plagiarism Detection Demo
        """
    )
    input_text = gr.Textbox(label="Input text", lines=5, placeholder="")

    with gr.Row():
        with gr.Column():
            only_ai_btn = gr.Button("AI Check")
        with gr.Column():
            only_plagiarism_btn = gr.Button("Plagiarism Check")
        with gr.Column():
            submit_btn = gr.Button("Full Check")
    gr.Markdown(
        """
        ## Output
        """
    )

    with gr.Row():
        models = gr.Dropdown(
            model_list,
            value=model_list,
            multiselect=True,
            label="Models to test against",
        )

    with gr.Row():
        with gr.Column():
            bcLabel = gr.Label(label="Source")
        with gr.Column():
            mcLabel = gr.Label(label="Creator")

    with gr.Group():
        with gr.Row():
            month_from = gr.Dropdown(
                choices=months,
                label="From Month",
                value="January",
                interactive=True,
            )
            day_from = gr.Textbox(label="From Day", value="01")
            year_from = gr.Textbox(label="From Year", value="2000")
            # from_date_button = gr.Button("Submit")
        with gr.Row():
            month_to = gr.Dropdown(
                choices=months,
                label="To Month",
                value=d1[1],
                interactive=True,
            )
            day_to = gr.Textbox(label="To Day", value=d1[0])
            year_to = gr.Textbox(label="To Year", value=d1[2])
            # to_date_button = gr.Button("Submit")
        with gr.Row():
            domains_to_skip = gr.Dropdown(
                domain_list,
                multiselect=True,
                label="Domain To Skip",
            )

    with gr.Row():
        with gr.Column():
            sentenceBreakdown = gr.HighlightedText(
                label="Plagiarism Sentence Breakdown",
                combine_adjacent=True,
                color_map={
                    "[1]": "red",
                    "[2]": "orange",
                    "[3]": "yellow",
                    "[4]": "green",
                },
            )

    submit_btn.click(
        fn=main,
        inputs=[
            input_text,
            models,
            year_from,
            month_from,
            day_from,
            year_to,
            month_to,
            day_to,
            domains_to_skip,
        ],
        outputs=[
            bcLabel,
            mcLabel,
            sentenceBreakdown,
        ],
        api_name="main",
    )

    only_ai_btn.click(
        fn=ai_generated_test,
        inputs=[input_text, models],
        outputs=[
            bcLabel,
            mcLabel,
        ],
        api_name="ai_check",
    )

    only_plagiarism_btn.click(
        fn=plagiarism_check,
        inputs=[
            input_text,
            year_from,
            month_from,
            day_from,
            year_to,
            month_to,
            day_to,
            domains_to_skip,
        ],
        outputs=[
            sentenceBreakdown,
        ],
        api_name="plagiarism_check",
    )

    date_from = ""
    date_to = ""

demo.launch()
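A minimal sketch of exercising the AI-detection path in app.py directly, without launching the Gradio UI. It assumes the module-level models above have loaded; the sample string is made up, and models is passed as None because ai_generated_test currently ignores that argument:

# hypothetical quick check, run after the tokenizers/models above are loaded
sample_text = "This is a short made-up paragraph used only for illustration."
bc, mc = ai_generated_test(sample_text, models=None)
print(bc)  # {"AI": ..., "HUMAN": ...} from the binary classifier
print(mc)  # per-model scores scaled by 1 - P(HUMAN)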
requirements.txt
ADDED
@@ -0,0 +1,19 @@
gradio
python-docx
google-api-python-client
nltk
BeautifulSoup4
scrapingbee
requests
numpy
torch==1.13.0
transformers==4.25.1
transformers-interpret
textstat
scipy
scikit-learn
joblib
evaluate
tensorflow
keras
spacy
utils.py
ADDED
@@ -0,0 +1,250 @@
from urllib.request import urlopen, Request
from googleapiclient.discovery import build
import requests
import httpx
import re
from bs4 import BeautifulSoup
import math
from collections import Counter
import numpy as np
import asyncio
import nltk

nltk.download('punkt')

WORD = re.compile(r"\w+")


# returns cosine similarity of two vectors
# input: two vectors
# output: float between 0 and 1
def get_cosine(vec1, vec2):
    intersection = set(vec1.keys()) & set(vec2.keys())

    # calculating numerator
    numerator = sum([vec1[x] * vec2[x] for x in intersection])

    # calculating denominator
    sum1 = sum([vec1[x] ** 2 for x in vec1.keys()])
    sum2 = sum([vec2[x] ** 2 for x in vec2.keys()])
    denominator = math.sqrt(sum1) * math.sqrt(sum2)

    # checking for divide by zero
    if denominator == 0:
        return 0.0
    else:
        return float(numerator) / denominator


# converts given text into a vector
def text_to_vector(text):
    # uses the regular expression above and gets all words
    words = WORD.findall(text)
    # returns a counter of all the words (count of number of occurrences)
    return Counter(words)


# returns cosine similarity of two texts
# uses: text_to_vector(text) and get_cosine(v1, v2)
def cosineSim(text1, text2):
    vector1 = text_to_vector(text1)
    vector2 = text_to_vector(text2)
    # print vector1, vector2
    cosine = get_cosine(vector1, vector2)
    return cosine


def get_soup_requests(url):
    page = requests.get(url)
    if page.status_code == 200:
        soup = BeautifulSoup(page.content, "html.parser")
        return soup
    print("HTML soup failed")
    return None


def get_soup_httpx(url):
    client = httpx.Client(timeout=30)
    try:
        page = client.get(url)
        if page.status_code == httpx.codes.OK:
            soup = BeautifulSoup(page.content, "html.parser")
            return soup
    except Exception:
        print("HTTPx soup failed")
        return None


def getSentences(text):
    from nltk.tokenize import sent_tokenize

    sents = sent_tokenize(text)
    two_sents = []
    for i in range(len(sents)):
        if (i % 2) == 0:
            two_sents.append(sents[i])
        else:
            two_sents[len(two_sents) - 1] += " " + sents[i]
    return two_sents


def googleSearch(
    sentences,
    urlCount,
    scoreArray,
    urlList,
    sorted_date,
    domains_to_skip,
    api_key,
    cse_id,
    **kwargs,
):
    service = build("customsearch", "v1", developerKey=api_key)
    for i, sentence in enumerate(sentences):
        results = (
            service.cse()
            .list(q=sentence, cx=cse_id, sort=sorted_date, **kwargs)
            .execute()
        )
        if "items" in results and len(results["items"]) > 0:
            for count, link in enumerate(results["items"]):
                # stop after 5 pages
                if count > 4:
                    break
                # skip user-selected domains
                if any(
                    ("." + domain) in link["link"]
                    for domain in domains_to_skip
                ):
                    continue
                # clean up snippet of '...'
                snippet = link["snippet"]
                ind = snippet.find("...")
                if ind < 20 and ind > 9:
                    snippet = snippet[ind + len("... ") :]
                ind = snippet.find("...")
                if ind > len(snippet) - 5:
                    snippet = snippet[:ind]

                # update cosine similarity between snippet and given text
                url = link["link"]
                if url not in urlList:
                    urlList.append(url)
                    scoreArray.append([0] * len(sentences))
                urlCount[url] = urlCount[url] + 1 if url in urlCount else 1
                scoreArray[urlList.index(url)][i] = cosineSim(
                    sentence, snippet
                )
        else:
            print("Google Search failed")
    return urlCount, scoreArray


def getQueries(text, n):
    # return n-grams of size n
    finalq = []
    words = text.split()
    l = len(words)

    for i in range(0, l - n + 1):
        finalq.append(words[i : i + n])

    return finalq


def print2D(array):
    print(np.array(array))


def removePunc(text):
    res = re.sub(r"[^\w\s]", "", text)
    return res


async def get_url_data(url, client):
    try:
        r = await client.get(url)
        # print(r.status_code)
        if r.status_code == 200:
            # print("in")
            soup = BeautifulSoup(r.content, "html.parser")
            return soup
    except Exception:
        print("HTTPx parallel soup failed")
    return None


async def parallel_scrap(urls):
    async with httpx.AsyncClient(timeout=30) as client:
        tasks = []
        for url in urls:
            tasks.append(get_url_data(url=url, client=client))
        results = await asyncio.gather(*tasks, return_exceptions=True)
    return results


def matchingScore(sentence, content):
    if sentence in content:
        return 1
    sentence = removePunc(sentence)
    content = removePunc(content)
    if sentence in content:
        return 1
    else:
        n = 5
        ngrams = getQueries(sentence, n)
        if len(ngrams) == 0:
            return 0
        matched = [x for x in ngrams if " ".join(x) in content]
        return len(matched) / len(ngrams)


async def matchingScoreAsync(sentences, content, content_idx, ScoreArray):
    content = removePunc(content)
    for j, sentence in enumerate(sentences):
        sentence = removePunc(sentence)
        if sentence in content:
            ScoreArray[content_idx][j] = 1
        else:
            n = 5
            ngrams = getQueries(sentence, n)
            if len(ngrams) == 0:
                # too short to form n-grams; score 0 and continue with the next sentence
                ScoreArray[content_idx][j] = 0
                continue
            matched = [x for x in ngrams if " ".join(x) in content]
            ScoreArray[content_idx][j] = len(matched) / len(ngrams)
    print(
        f"Analyzed {content_idx+1} of soups (SOUP SUCCEEDED)........................"
    )
    return ScoreArray


async def parallel_analyze(soups, sentences, ScoreArray):
    tasks = []
    for i, soup in enumerate(soups):
        if soup:
            page_content = soup.text
            tasks.append(
                matchingScoreAsync(sentences, page_content, i, ScoreArray)
            )
        else:
            print(
                f"Analyzed {i+1} of soups (SOUP FAILED)........................"
            )
    ScoreArray = await asyncio.gather(*tasks, return_exceptions=True)
    return ScoreArray


async def parallel_analyze_2(soups, sentences, ScoreArray):
    scores = [[0] * len(ScoreArray[0]) for i in range(len(ScoreArray))]
    for i, soup in enumerate(soups):
        if soup:
            page_content = soup.text
            for j, sent in enumerate(sentences):
                print(
                    f"Analyzing {i+1} of {len(soups)} soups with {j+1} of {len(sentences)} sentences........................"
                )
                scores[i][j] = matchingScore(sent, page_content)
        else:
            print(
                f"Analyzed {i+1} of soups (SOUP FAILED)........................"
            )
    # scores are filled in synchronously above, so return them directly
    return scores
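A minimal sketch of how the scoring helpers in utils.py fit together, assuming the module has been imported; both strings below are made-up examples:

sentence = "the quick brown fox jumps over the lazy dog near the river"
page_text = "witnesses saw the quick brown fox jump over the lazy dog near the river"

print(cosineSim(sentence, page_text))      # bag-of-words cosine similarity in [0, 1]
print(matchingScore(sentence, page_text))  # 1 for a verbatim match, else the fraction of matching 5-grams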