Update predictors.py

predictors.py  CHANGED  (+24 -564)
@@ -1,17 +1,4 @@
-import
-from nltk.tokenize import sent_tokenize
-from googleapiclient.discovery import build
-from collections import Counter
-import re, math
-from sentence_transformers import SentenceTransformer, util
-import asyncio
-import httpx
-from bs4 import BeautifulSoup
-import numpy as np
-import concurrent
-from multiprocessing import Pool
-from const import url_types
-from collections import defaultdict
+import torch
 import numpy as np
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 import nltk
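After this change the module's import surface is just torch, numpy, transformers, and nltk, which matches the sequence-classification path that predict_mc and predict_mc_scores use further down the file. For orientation only, a minimal sketch of how the retained torch/transformers imports are typically wired together; the checkpoint name is a placeholder, not the model this Space actually loads:

    import torch
    from transformers import AutoTokenizer, AutoModelForSequenceClassification

    checkpoint = "some-org/some-sequence-classifier"  # placeholder, not the Space's model
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

    def predict_probs(text):
        # Tokenize, run one forward pass, and return class probabilities.
        inputs = tokenizer(text, return_tensors="pt", truncation=True)
        with torch.no_grad():
            logits = model(**inputs).logits
        return torch.softmax(logits, dim=-1).squeeze().tolist()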
@@ -307,555 +294,28 @@ def predict_mc_scores(input):
     mc_scores = []
     segments_mc = split_text_allow_complete_sentences_nltk(
         input, type_det="mc"
-
-
-WORD = re.compile(r"\w+")
-model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
-
-
-months = {
-    "January": "01",
-    "February": "02",
-    "March": "03",
-    "April": "04",
-    "May": "05",
-    "June": "06",
-    "July": "07",
-    "August": "08",
-    "September": "09",
-    "October": "10",
-    "November": "11",
-    "December": "12",
-}
-
-color_map = [
-    "#cf2323",
-    "#d65129",
-    "#d66329",
-    "#d67129",
-    "#eb9d59",
-    "#c2ad36",
-    "#d6ae29",
-    "#d6b929",
-    "#e1ed72",
-    "#c2db76",
-    "#a2db76",
-]
-
-
-def text_to_vector(text):
-    words = WORD.findall(text)
-    return Counter(words)
-
-
-def cosineSim(text1, text2):
-    vector1 = text_to_vector(text1)
-    vector2 = text_to_vector(text2)
-    # print vector1,vector2
-    cosine = get_cosine(vector1, vector2)
-    return cosine
-
-
-def get_cosine(vec1, vec2):
-    intersection = set(vec1.keys()) & set(vec2.keys())
-    numerator = sum([vec1[x] * vec2[x] for x in intersection])
-    sum1 = sum([vec1[x] ** 2 for x in vec1.keys()])
-    sum2 = sum([vec2[x] ** 2 for x in vec2.keys()])
-    denominator = math.sqrt(sum1) * math.sqrt(sum2)
-    if denominator == 0:
-        return 0.0
-    else:
-        return float(numerator) / denominator
-
-
-def split_sentence_blocks(text, size):
-    if size == "Paragraph":
-        blocks = text.strip().split("\n")
-        return blocks
-    else:
-        sents = sent_tokenize(text.strip())
-        return sents
-
-
-def build_date(year=2024, month="March", day=1):
-    return f"{year}{months[month]}{day}"
-
-
-def split_ngrams(text, n):
-    words = text.split()
-    return [tuple(words[i : i + n]) for i in range(len(words) - n + 1)]
-
-
-def sentence_similarity(text1, text2):
-    embedding_1 = model.encode(text1, convert_to_tensor=True)
-    embedding_2 = model.encode(text2, convert_to_tensor=True)
-    o = util.pytorch_cos_sim(embedding_1, embedding_2)
-    return o.item()
-
-
-async def get_url_data(url, client):
-    try:
-        r = await client.get(url)
-        if r.status_code == 200:
-            soup = BeautifulSoup(r.content, "html.parser")
-            return soup
-    except Exception:
-        return None
-
-
-async def parallel_scrap(urls):
-    async with httpx.AsyncClient(timeout=30) as client:
-        tasks = []
-        for url in urls:
-            tasks.append(get_url_data(url=url, client=client))
-        results = await asyncio.gather(*tasks, return_exceptions=True)
-        return results
-
-
-def merge_ngrams_into_sentence(ngrams):
-    if ngrams == None:
-        return ""
-    if len(ngrams) > 20:
-        ngrams = ngrams[:20]
-    merged_sentence = []
-    i = 0
-    for ngram in ngrams:
-        overlap = len(set(ngram) & set(merged_sentence[-len(ngram) :]))
-        if overlap == 0:
-            merged_sentence.extend(ngram)
-        elif overlap < len(ngram):
-            merged_sentence.extend(ngram[overlap:])
-    return " ".join(merged_sentence)
-
-
-def remove_ngrams_after(ngrams, target_ngram):
-    try:
-        index = ngrams.index(target_ngram)
-        return ngrams[: index + 1]
-    except ValueError:
-        return None
-
-
-def matching_score(sentence_content_tuple):
-    sentence, content, score = sentence_content_tuple
-    if sentence in content:
-        return 1, sentence
-    # if score > 0.9:
-    # return score
-    else:
-        n = 5
-
-        # ngrams = split_ngrams(sentence, n)
-        # if len(ngrams) == 0:
-        # return 0
-        # matched = [x for x in ngrams if " ".join(x) in content]
-        # return len(matched) / len(ngrams)
-
-        # list comprehension matching
-        # ngrams_sentence = split_ngrams(sentence, n)
-        # ngrams_content = [tuple(ngram) for ngram in split_ngrams(content, n)]
-        # if len(ngrams_sentence) == 0:
-        # return 0, ""
-        # matched_ngrams = [
-        # 1 for ngram in ngrams_sentence if tuple(ngram) in ngrams_content
-        # ]
-        # matched_count = sum(matched_ngrams)
-
-        # set intersection matching
-        ngrams_sentence = set(split_ngrams(sentence, n))
-        ngrams_content = set(split_ngrams(content, n))
-        if len(ngrams_sentence) == 0:
-            return 0, ""
-        matched_ngrams = ngrams_sentence.intersection(ngrams_content)
-        matched_count = len(matched_ngrams)
-
-        # matched content
-        matched_content_ngrams = []
-        found = False
-        last_found = None
-        for ngram in ngrams_sentence:
-            for ngram_content in ngrams_content:
-                if tuple(ngram) == ngram_content:
-                    found = True
-                    last_found = ngram_content
-                if found:
-                    matched_content_ngrams.append(ngram_content)
-        matched_content_ngrams = remove_ngrams_after(
-            matched_content_ngrams, last_found
-        )
-        matched_content = merge_ngrams_into_sentence(matched_content_ngrams)
-
-        return matched_count / len(ngrams_sentence), matched_content
-
-
-def process_with_multiprocessing(input_data):
-    with Pool(processes=1) as pool:
-        scores = pool.map(matching_score, input_data)
-    return scores
-
-
-def map_sentence_url(sentences, score_array):
-    sentenceToMaxURL = [-1] * len(sentences)
-    for j in range(len(sentences)):
-        if j > 0:
-            maxScore = score_array[sentenceToMaxURL[j - 1]][j]
-            sentenceToMaxURL[j] = sentenceToMaxURL[j - 1]
-        else:
-            maxScore = -1
-        for i in range(len(score_array)):
-            margin = (
-                0.05
-                if (j > 0 and sentenceToMaxURL[j] == sentenceToMaxURL[j - 1])
-                else 0
-            )
-            if score_array[i][j] - maxScore > margin:
-                maxScore = score_array[i][j]
-                sentenceToMaxURL[j] = i
-    return sentenceToMaxURL
-
-
-def check_url_category(url):
-    for category, urls in url_types.items():
-        for u in urls:
-            if u in url:
-                return category
-    return "Internet Source"
-
-
-def google_search(
-    plag_option,
-    sentences,
-    url_count,
-    score_array,
-    url_list,
-    snippets,
-    sorted_date,
-    domains_to_skip,
-    api_key,
-    cse_id,
-    **kwargs,
-):
-    service = build("customsearch", "v1", developerKey=api_key)
-    num_pages = 1
-    for i, sentence in enumerate(sentences):
-        results = (
-            service.cse()
-            .list(q=sentence, cx=cse_id, sort=sorted_date, **kwargs)
-            .execute()
-        )
-        if "items" in results and len(results["items"]) > 0:
-            for count, link in enumerate(results["items"]):
-                if count >= num_pages:
-                    break
-                # skip user selected domains
-                if (domains_to_skip is not None) and any(
-                    ("." + domain) in link["link"] for domain in domains_to_skip
-                ):
-                    continue
-                # clean up snippet of '...'
-                snippet = link["snippet"]
-                ind = snippet.find("...")
-                if ind < 20 and ind > 9:
-                    snippet = snippet[ind + len("... ") :]
-                ind = snippet.find("...")
-                if ind > len(snippet) - 5:
-                    snippet = snippet[:ind]
-
-                # update cosine similarity between snippet and given text
-                url = link["link"]
-                if url not in url_list:
-                    url_list.append(url)
-                    score_array.append([0] * len(sentences))
-                    snippets.append([""] * len(sentences))
-                url_count[url] = url_count[url] + 1 if url in url_count else 1
-                snippets[url_list.index(url)][i] = snippet
-                if plag_option == "Standard":
-                    score_array[url_list.index(url)][i] = cosineSim(
-                        sentence, snippet
-                    )
-                else:
-                    score_array[url_list.index(url)][i] = sentence_similarity(
-                        sentence, snippet
-                    )
-    return url_count, score_array
-
-
-def plagiarism_check(
-    plag_option,
-    input,
-    year_from,
-    month_from,
-    day_from,
-    year_to,
-    month_to,
-    day_to,
-    domains_to_skip,
-    source_block_size,
-):
-    # api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
-    # api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
-    # api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
-    api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
-    # api_key = "AIzaSyCB61O70B8AC3l5Kk3KMoLb6DN37B7nqIk"
-    # api_key = "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg"
-    # api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
-    cse_id = "851813e81162b4ed4"
-
-    url_scores = []
-    sentence_scores = []
-    sentences = split_sentence_blocks(input, source_block_size)
-    url_count = {}
-    score_array = []
-    url_list = []
-    snippets = []
-    date_from = build_date(year_from, month_from, day_from)
-    date_to = build_date(year_to, month_to, day_to)
-    sort_date = f"date:r:{date_from}:{date_to}"
-    # get list of URLS to check
-    start_time = time.perf_counter()
-    url_count, score_array = google_search(
-        plag_option,
-        sentences,
-        url_count,
-        score_array,
-        url_list,
-        snippets,
-        sort_date,
-        domains_to_skip,
-        api_key,
-        cse_id,
     )
-
-
-    start_time = time.perf_counter()
-    soups = asyncio.run(parallel_scrap(url_list))
-    print("SCRAPING PROCESSING TIME: ", time.perf_counter() - start_time)
-    input_data = []
-    for i, soup in enumerate(soups):
-        if soup:
-            page_content = soup.text
-            for j, sent in enumerate(sentences):
-                input_data.append((sent, page_content, score_array[i][j]))
-    start_time = time.perf_counter()
-    # scores = process_with_multiprocessing(input_data)
-    scores = []
-    for i in input_data:
-        scores.append(matching_score(i))
-    print("MATCHING SCORE PROCESSING TIME: ", time.perf_counter() - start_time)
-    matched_sentence_array = [
-        ["" for _ in range(len(score_array[0]))]
-        for _ in range(len(score_array))
-    ]
-
-    k = 0
-    # Update score array for each (soup, sentence)
-    for i, soup in enumerate(soups):
-        if soup:
-            for j, _ in enumerate(sentences):
-                score_array[i][j] = scores[k][0]
-                matched_sentence_array[i][j] = scores[k][1]
-                k += 1
-
-    sentenceToMaxURL = map_sentence_url(sentences, score_array)
-    index = np.unique(sentenceToMaxURL)
-
-    url_source = {}
-    for url in index:
-        s = [
-            score_array[url][sen]
-            for sen in range(len(sentences))
-            if sentenceToMaxURL[sen] == url
-        ]
-        url_source[url] = sum(s) / len(s)
-    index_descending = sorted(url_source, key=url_source.get, reverse=True)
-    urlMap = {}
-    for count, i in enumerate(index_descending):
-        urlMap[i] = count + 1
-
-    # build results
-    for i, sent in enumerate(sentences):
-        ind = sentenceToMaxURL[i]
-        if url_source[ind] > 0.1:
-            sentence_scores.append(
-                [
-                    sent,
-                    round(url_source[ind] * 100, 2),
-                    url_list[ind],
-                    urlMap[ind],
-                ]
-            )
-        else:
-            sentence_scores.append([sent, None, url_list[ind], -1])
-    print("SNIPPETS: ", snippets)
-    snippets = [[item for item in sublist if item] for sublist in snippets]
-    for ind in index_descending:
-        if url_source[ind] > 0.1:
-            matched_sentence_array = [
-                [item for item in sublist if item]
-                for sublist in matched_sentence_array
-            ]
-            matched_sentence = "...".join(
-                [sent for sent in matched_sentence_array[ind]]
-            )
-            if matched_sentence == "":
-                matched_sentence = "...".join([sent for sent in snippets[ind]])
-            url_scores.append(
-                [
-                    url_list[ind],
-                    round(url_source[ind] * 100, 2),
-                    urlMap[ind],
-                    matched_sentence,
-                ]
-            )
-
-    return sentence_scores, url_scores
-
-
-def html_highlight(
-    plag_option,
-    input,
-    year_from,
-    month_from,
-    day_from,
-    year_to,
-    month_to,
-    day_to,
-    domains_to_skip,
-    source_block_size,
-):
-    start_time = time.perf_counter()
-    sentence_scores, url_scores = plagiarism_check(
-        plag_option,
-        input,
-        year_from,
-        month_from,
-        day_from,
-        year_to,
-        month_to,
-        day_to,
-        domains_to_skip,
-        source_block_size,
+    samples_len_mc = len(
+        split_text_allow_complete_sentences_nltk(input, type_det="mc")
     )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-}
-.details {
-    display: none;
-    padding: 10px;
-}
-.url-link {
-    font-size: 1.2em;
-}
-.url-link span {
-    margin-right: 10px;
-}
-.toggle-button {
-    color: #333;
-    border: none;
-    padding: 5px 10px;
-    text-align: center;
-    text-decoration: none;
-    display: inline-block;
-    cursor: pointer;
-}
-</style>
-</head>
-"""
-
-    prev_idx = None
-    combined_sentence = ""
-    total_score = 0
-    total_count = 0
-    category_scores = defaultdict(set)
-    for sentence, score, url, idx in sentence_scores:
-        category = check_url_category(url)
-        if score is None:
-            total_score += 0
-        else:
-            total_score += score
-            category_scores[category].add(score)
-        total_count += 1
-
-        if idx != prev_idx and prev_idx is not None:
-            color = color_map[prev_idx - 1]
-            index_part = f"<span>[{prev_idx}]</span>"
-            formatted_sentence = f'<p style="background-color: {color}; padding: 2px;">{combined_sentence} {index_part}</p>'
-            html_content += formatted_sentence
-            combined_sentence = ""
-        combined_sentence += " " + sentence
-        prev_idx = idx
-
-    print(category_scores)
-    total_average_score = round(total_score / total_count, 2)
-    category_averages = {
-        category: round((sum(scores) / len(scores)), 2)
-        for category, scores in category_scores.items()
-    }
-
-    if combined_sentence:
-        color = color_map[prev_idx - 1]
-        index_part = ""
-        if prev_idx != -1:
-            index_part = f"<span>[{prev_idx}]</span>"
-        formatted_sentence = f'<p style="background-color: {color}; padding: 2px;">{combined_sentence} {index_part}</p>'
-        html_content += formatted_sentence
-
-    html_content += "<hr>"
-
-    html_content += f"""
-    <div class="score-container">
-        <div class="score-item">
-            <h3>Overall Similarity</h3>
-            <p>{total_average_score}%</p>
-        </div>
-    """
-    for category, score in category_averages.items():
-        html_content += f"""
-        <div class="score-item"><h3>{category}</h3><p>{score}%</p></div>
-        """
-    html_content += "</div>"
-
-    for url, score, idx, sentence in url_scores:
-        url_category = check_url_category(url)
-        color = color_map[idx - 1]
-        formatted_url = f"""
-        <p style="background-color: {color}; padding: 5px; font-size: 1.2em">[{idx}] <b>{url}</b></p><p><i>{url_category}</i></p>
-        <p> --- <b>Matching Score: </b>{score}%</p>
-        <p> --- <b>Original Source Content: </b>{sentence}</p>
-        """
-        # formatted_url = f"""
-        # <div class="url-link">
-        # <p style="background-color: {color}; padding: 5px; font-size: 1.2em">[{idx}] <b>{url}</b></p><p>{url_category}</p>
-        # <a href="#" onclick="toggleDetails(event)" class="toggle-button">></a>
-        # </div>
-        # <div id="detailsContainer" class="details">
-        # <p> --- <b>Matching Score: </b>{score}%</p>
-        # <p> --- <b>Original Source Content: </b>{sentence}</p>
-        # </div>
-        # """
-        html_content += formatted_url
-
-    html_content += "</html>"
-
-    print("PLAGIARISM PROCESSING TIME: ", time.perf_counter() - start_time)
-
-    return html_content
+    for i in range(samples_len_mc):
+        cleaned_text_mc = remove_special_characters(segments_mc[i])
+        mc_score = predict_mc(
+            text_mc_model, text_mc_tokenizer, cleaned_text_mc
+        )
+        mc_scores.append(mc_score)
+    mc_scores_array = np.array(mc_scores)
+    average_mc_scores = np.mean(mc_scores_array, axis=0)
+    mc_score_list = average_mc_scores.tolist()
+    mc_score = {}
+    for score, label in zip(mc_score_list, mc_label_map):
+        mc_score[label.upper()] = score
+
+    sum_prob = 1 - bc_score["HUMAN"]
+    for key, value in mc_score.items():
+        mc_score[key] = value * sum_prob
+    print("MC Score:", mc_score)
+    if sum_prob < 0.01:
+        mc_score = {}
+
+    return mc_score
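The added predict_mc_scores body above scores each segment with predict_mc, averages the per-segment probability vectors, and then rescales every class by 1 - bc_score["HUMAN"], discarding the result when that remaining probability mass falls below 0.01. A standalone sketch of just that aggregation step, with hypothetical stand-ins for mc_label_map and bc_score (both are defined elsewhere in predictors.py and are not reproduced here):

    import numpy as np

    # Hypothetical stand-ins for module-level objects the diff assumes.
    mc_label_map = ["model_a", "model_b", "model_c"]  # assumed label order
    bc_score = {"HUMAN": 0.12}                        # assumed binary-classifier output

    def aggregate_mc_scores(per_segment_scores):
        # Average per-segment class probabilities, then scale by 1 - P(HUMAN).
        average = np.mean(np.array(per_segment_scores), axis=0).tolist()
        mc_score = {label.upper(): s for s, label in zip(average, mc_label_map)}
        sum_prob = 1 - bc_score["HUMAN"]
        mc_score = {k: v * sum_prob for k, v in mc_score.items()}
        return {} if sum_prob < 0.01 else mc_score

    # Example: three segments, three classes each.
    print(aggregate_mc_scores([[0.7, 0.2, 0.1],
                               [0.6, 0.3, 0.1],
                               [0.8, 0.1, 0.1]]))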
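Among the deleted helpers, cosineSim and get_cosine implemented the "Standard" plagiarism option: cosine similarity between raw word-count vectors of a sentence and a search snippet. A condensed, self-contained version of that removed logic, kept here purely as a reference:

    import math
    import re
    from collections import Counter

    WORD = re.compile(r"\w+")

    def cosine_sim(text1, text2):
        # Cosine similarity over word-count vectors, as in the removed cosineSim/get_cosine.
        v1, v2 = Counter(WORD.findall(text1)), Counter(WORD.findall(text2))
        numerator = sum(v1[w] * v2[w] for w in set(v1) & set(v2))
        denominator = math.sqrt(sum(c * c for c in v1.values())) * math.sqrt(
            sum(c * c for c in v2.values())
        )
        return numerator / denominator if denominator else 0.0

    print(cosine_sim("the quick brown fox", "the quick red fox"))  # 0.75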
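The deleted parallel_scrap/get_url_data pair fetched every candidate URL concurrently with httpx and parsed each 200 response with BeautifulSoup. The core of that removed pattern, reduced to a sketch (the example URL is only a placeholder):

    import asyncio
    import httpx
    from bs4 import BeautifulSoup

    async def fetch(url, client):
        # Mirrors the removed get_url_data: parsed HTML on HTTP 200, None otherwise.
        try:
            r = await client.get(url)
            if r.status_code == 200:
                return BeautifulSoup(r.content, "html.parser")
        except Exception:
            return None

    async def parallel_fetch(urls):
        async with httpx.AsyncClient(timeout=30) as client:
            return await asyncio.gather(
                *(fetch(u, client) for u in urls), return_exceptions=True
            )

    # soups = asyncio.run(parallel_fetch(["https://example.com"]))  # placeholder URL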