minko186 committed
Commit 7ec48d6 · 1 Parent(s): c0a6bc9

update HTML viewer
Files changed (2):
  1. app.py +0 -10
  2. plagiarism.py +55 -83
app.py CHANGED
@@ -224,16 +224,6 @@ with gr.Blocks() as demo:
 
     with gr.Row():
         with gr.Column():
-            # sentenceBreakdown = gr.HighlightedText(
-            #     label="Source Detection Sentence Breakdown",
-            #     combine_adjacent=True,
-            #     color_map={
-            #         "[1]": "red",
-            #         "[2]": "orange",
-            #         "[3]": "yellow",
-            #         "[4]": "green",
-            #     },
-            # )
             sentenceBreakdown = gr.HTML(
                 label="Source Detection Sentence Breakdown",
                 value="Source Detection Sentence Breakdown",
plagiarism.py CHANGED
@@ -19,8 +19,6 @@ model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
 # returns cosine similarity of two vectors
 # input: two vectors
 # output: float between 0 and 1.
-
-
 def get_cosine(vec1, vec2):
     intersection = set(vec1.keys()) & set(vec2.keys())
 
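Only the first line of get_cosine survives in the hunk context. Here is a minimal, self-contained sketch of the standard term-frequency cosine that this opening line suggests; everything after `intersection` is an assumption, not code from the file:

import math

def get_cosine_sketch(vec1, vec2):
    # Overlapping terms contribute to the dot product; the result is a
    # float in [0, 1] for non-negative term-frequency vectors.
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum(vec1[x] * vec2[x] for x in intersection)
    denominator = math.sqrt(sum(v * v for v in vec1.values())) * math.sqrt(
        sum(v * v for v in vec2.values())
    )
    return numerator / denominator if denominator else 0.0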
@@ -129,14 +127,14 @@ def google_search(
 
 
 def split_sentence_blocks(text):
-
-    sents = sent_tokenize(text)
     two_sents = []
-    for i in range(len(sents)):
-        if (i % 2) == 0:
-            two_sents.append(sents[i])
-        else:
-            two_sents[len(two_sents) - 1] += " " + sents[i]
+    for para in text.split("\n\n"):
+        sents = sent_tokenize(para)
+        for i in range(len(sents)):
+            if (i % 2) == 0:
+                two_sents.append(sents[i])
+            else:
+                two_sents[len(two_sents) - 1] += " " + sents[i]
     return two_sents
 
 
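The rewritten split_sentence_blocks tokenizes each paragraph on its own, so a two-sentence block can no longer straddle a paragraph break as it could when the whole text went through sent_tokenize at once. A quick usage sketch (assumes NLTK's punkt tokenizer data is installed):

from nltk.tokenize import sent_tokenize  # requires nltk.download("punkt")

text = "One. Two. Three.\n\nFour. Five."
# split_sentence_blocks(text) pairs sentences within each paragraph:
#   ["One. Two.", "Three.", "Four. Five."]
# The old version tokenized across the break and would have paired
# "Three." with "Four.".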
@@ -216,6 +214,26 @@ def print2d(array):
         print(row)
 
 
+def map_sentence_url(sentences, score_array):
+    sentenceToMaxURL = [-1] * len(sentences)
+    for j in range(len(sentences)):
+        if j > 0:
+            maxScore = score_array[sentenceToMaxURL[j - 1]][j]
+            sentenceToMaxURL[j] = sentenceToMaxURL[j - 1]
+        else:
+            maxScore = -1
+        for i in range(len(score_array)):
+            margin = (
+                0.05
+                if (j > 0 and sentenceToMaxURL[j] == sentenceToMaxURL[j - 1])
+                else 0
+            )
+            if score_array[i][j] - maxScore > margin:
+                maxScore = score_array[i][j]
+                sentenceToMaxURL[j] = i
+    return sentenceToMaxURL
+
+
 def html_highlight(
     plag_option,
     input,
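A toy walk-through of map_sentence_url (values invented): rows of score_array are URLs, columns are sentences, and the 0.05 margin keeps consecutive sentences attached to the previous sentence's URL unless another URL wins clearly.

score_array = [
    [0.90, 0.52, 0.10],  # URL 0
    [0.20, 0.55, 0.80],  # URL 1
]
# Sentence 0: URL 0 wins outright (0.90).
# Sentence 1: URL 1 leads by only 0.03, under the 0.05 margin, so URL 0 sticks.
# Sentence 2: URL 1 leads by 0.70, well over the margin, so it takes over.
# map_sentence_url(["s0", "s1", "s2"], score_array) -> [0, 0, 1]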
@@ -239,24 +257,38 @@ def html_highlight(
         domains_to_skip,
     )
     color_map = [
-        "#e06b63",
+        "#cf2323",
         "#eb9d59",
         "#c2ad36",
         "#e1ed72",
         "#c2db76",
         "#a2db76",
     ]
-    html_content = "<div style='font-family: Roboto; border: 2px solid black; background-color: #333333; padding: 10px; color: #FFFFFF;'>"
+    font = "Roboto"
+    html_content = f"<link href='https://fonts.googleapis.com/css?family=Roboto' rel='stylesheet'>\n<div style='font-family: {font}; border: 2px solid black; background-color: #333333; padding: 10px; color: #FFFFFF;'>"
+    prev_idx = None
+    combined_sentence = ""
     for sentence, _, _, idx in sentence_scores:
-        color = color_map[idx - 1]
-        formatted_sentence = f'<p style="background-color: {color}; padding: 5px;">{sentence} [{idx}]</p>'
+        if idx != prev_idx and prev_idx is not None:
+            color = color_map[prev_idx - 1]
+            index_part = f'<span style="background-color: {color}; padding: 2px;">[{prev_idx}]</span>'
+            formatted_sentence = f"<p>{combined_sentence} {index_part}</p>"
+            html_content += formatted_sentence
+            combined_sentence = ""
+        combined_sentence += " " + sentence
+        prev_idx = idx
+
+    if combined_sentence:
+        color = color_map[prev_idx - 1]
+        index_part = f'<span style="background-color: {color}; padding: 2px;">[{prev_idx}]</span>'
+        formatted_sentence = f"<p>{combined_sentence} {index_part}</p>"
         html_content += formatted_sentence
 
     html_content += "<hr>"
     for url, score, idx in url_scores:
         color = color_map[idx - 1]
-        formatted_name = f'<p style="background-color: {color}; padding: 5px;">({idx}) {url} --- Matching Score:{score}</p>'
-        html_content += formatted_name
+        formatted_url = f'<p style="background-color: {color}; padding: 5px;">({idx}) <b>{url}</b></p><p> --- Matching Score: {score}%</p>'
+        html_content += formatted_url
 
     html_content += "</div>"
 
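Unlike the old one-paragraph-per-sentence loop, the new loop batches consecutive sentences that share a source index and emits a single highlighted [n] badge per run; the trailing `if combined_sentence:` block flushes the final run. A worked example (data invented, not from the file):

sentence_scores = [
    ["Alpha.", 0.9, "https://a.example", 1],
    ["Beta.", 0.8, "https://a.example", 1],
    ["Gamma.", 0.7, "https://b.example", 2],
]
# "Alpha." and "Beta." share idx 1, so they are merged into one <p> ending
# in a single highlighted [1]; "Gamma." is flushed after the loop with its
# own [2] badge.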
@@ -278,13 +310,11 @@ def plagiarism_check(
     api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
     api_key = "AIzaSyCB61O70B8AC3l5Kk3KMoLb6DN37B7nqIk"
     # api_key = "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg"
-    api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
+    # api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
     cse_id = "851813e81162b4ed4"
 
     url_scores = []
     sentence_scores = []
-    # for input in input.split("\n\n"):
-    print(input)
     sentences = split_sentence_blocks(input)
     url_count = {}
     score_array = []
@@ -305,21 +335,7 @@ def plagiarism_check(
         cse_id,
     )
     # Scrape URLs in list
-    formatted_tokens = []
     soups = asyncio.run(parallel_scrap(url_list))
-
-    # # Populate matching scores for scrapped pages
-    # for i, soup in enumerate(soups):
-    #     print(f"Analyzing {i+1} of {len(soups)} soups........................")
-    #     if soup:
-    #         page_content = soup.text
-
-    #         for j, sent in enumerate(sentences):
-    #             args_list = (sent, page_content)
-    #             score = matching_score(args_list)
-    #             # score = cos_sim_torch(embed_text(sent), source_embeddings[i])
-    #             score_array[i][j] = score
-
     input_data = []
     for i, soup in enumerate(soups):
         if soup:
@@ -336,29 +352,7 @@ def plagiarism_check(
             score_array[i][j] = scores[k]
             k += 1
 
-    # Map sentence with max URL with small margin to keep consider same URL
-    # for consecutive sentences
-    sentenceToMaxURL = [-1] * len(sentences)
-    for j in range(len(sentences)):
-        if j > 0:
-            maxScore = score_array[sentenceToMaxURL[j - 1]][j]
-            sentenceToMaxURL[j] = sentenceToMaxURL[j - 1]
-        else:
-            maxScore = -1
-
-        for i in range(len(score_array)):
-            margin = (
-                0.05
-                if (j > 0 and sentenceToMaxURL[j] == sentenceToMaxURL[j - 1])
-                else 0
-            )
-            if score_array[i][j] - maxScore > margin:
-                maxScore = score_array[i][j]
-                sentenceToMaxURL[j] = i
-            # if score_array[i][j] > maxScore:
-            #     maxScore = score_array[i][j]
-            #     sentenceToMaxURL[j] = i
-
+    sentenceToMaxURL = map_sentence_url(sentences, score_array)
     index = np.unique(sentenceToMaxURL)
 
     url_source = {}
@@ -369,13 +363,12 @@ def plagiarism_check(
             if sentenceToMaxURL[sen] == url
         ]
         url_source[url] = sum(s) / len(s)
-
     index_descending = sorted(url_source, key=url_source.get, reverse=True)
-
     urlMap = {}
     for count, i in enumerate(index_descending):
        urlMap[i] = count + 1
 
+    # build results
     for i, sent in enumerate(sentences):
         ind = sentenceToMaxURL[i]
         if url_source[ind] > 0.1:
@@ -383,32 +376,11 @@ def plagiarism_check(
                 [sent, url_source[ind], url_list[ind], urlMap[ind]]
             )
         else:
-            sentence_scores.append([sent, None, url_list[ind], urlMap[ind]])
+            sentence_scores.append([sent, None, url_list[ind], -1])
     for ind in index_descending:
-        url_scores.append(
-            [url_list[ind], round(url_source[ind] * 100, 2), urlMap[ind]]
-        )
+        if url_source[ind] > 0.1:
+            url_scores.append(
+                [url_list[ind], round(url_source[ind] * 100, 2), urlMap[ind]]
+            )
 
     return sentence_scores, url_scores
-
-    # for i, sent in enumerate(sentences):
-    #     formatted_tokens.append(
-    #         (sent, "[" + str(urlMap[sentenceToMaxURL[i]]) + "]")
-    #     )
-
-    # formatted_tokens.append(("\n", None))
-    # formatted_tokens.append(("\n", None))
-    # formatted_tokens.append(("\n", None))
-
-    # for ind in index_descending:
-    #     formatted_tokens.append(
-    #         (
-    #             url_list[ind]
-    #             + " --- Matching Score: "
-    #             + f"{str(round(url_source[ind] * 100, 2))}%",
-    #             "[" + str(urlMap[ind]) + "]",
-    #         )
-    #     )
-    #     formatted_tokens.append(("\n", None))
-
-    # return formatted_tokens
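Finally, how the ranking and the 0.1 cutoff interact (values invented for illustration): each URL's score is the mean of its assigned sentences' scores, URLs are ranked by that mean, and anything at or below 0.1 is dropped from url_scores while its sentences are reported with score None and index -1.

url_source = {0: 0.62, 1: 0.45, 2: 0.04}  # URL index -> mean sentence score
index_descending = sorted(url_source, key=url_source.get, reverse=True)  # [0, 1, 2]
urlMap = {i: rank + 1 for rank, i in enumerate(index_descending)}  # {0: 1, 1: 2, 2: 3}
# URL 2 falls under the 0.1 cutoff: it is omitted from url_scores, and its
# sentences go into sentence_scores as [sent, None, url, -1].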