aliasgerovs committed
Commit c78ec74
Parent: c55931d

Update plagiarism.py

Files changed (1): plagiarism.py (+33 −47)
plagiarism.py CHANGED
@@ -9,6 +9,7 @@ import httpx
 from bs4 import BeautifulSoup
 import numpy as np
 import concurrent
+from multiprocessing import Pool
 
 
 WORD = re.compile(r"\w+")
@@ -189,9 +190,9 @@ async def parallel_scrap(urls):
     return results
 
 
-def matching_score(args_list):
-    sentence = remove_punc(args_list[0])
-    content = remove_punc(args_list[1])
+
+def matching_score(sentence_content_tuple):
+    sentence, content = sentence_content_tuple
     if sentence in content:
         return 1
     else:
@@ -200,9 +201,13 @@ def matching_score(args_list):
         if len(ngrams) == 0:
             return 0
         matched = [x for x in ngrams if " ".join(x) in content]
-    return len(matched) / len(ngrams)
-
+        return len(matched) / len(ngrams)
 
+
+def process_with_multiprocessing(input_data):
+    with Pool(processes=4) as pool:
+        scores = pool.map(matching_score, input_data)
+    return scores
+
 def plagiarism_check(
     plag_option,
     input,
@@ -244,55 +249,36 @@ def plagiarism_check(
     # Scrape URLs in list
     formatted_tokens = []
     soups = asyncio.run(parallel_scrap(urlList))
-
-    # Populate matching scores for scrapped pages
-    for i, soup in enumerate(soups):
-        print(f"Analyzing {i+1} of {len(soups)} soups........................")
-        if soup:
-            page_content = soup.text
-            for j, sent in enumerate(sentences):
-                args_list = (sent, page_content)
-                score = matching_score(args_list)
-                # score = cos_sim_torch(embed_text(sent), source_embeddings[i])
-                ScoreArray[i][j] = score
-
-    # with concurrent.futures.ProcessPoolExecutor() as executor:
-    #     results = executor.map(matching_score, args_list)
-
-    # *****IF THIS IS TO BE USED, PLEASE PROVIDE "preprocess()" FUNCTION IN LINE 248**************
-    # source_embeddings = []
+
+    # # Populate matching scores for scrapped pages
     # for i, soup in enumerate(soups):
+    #     print(f"Analyzing {i+1} of {len(soups)} soups........................")
     #     if soup:
     #         page_content = soup.text
-    #         source_embeddings.append(embed_text(page_content))
-    #     else:
-    #         source_embeddings.append(None)
-
-    # def compute_cosine_similarity(args):
-    #     sent, source_embedding, i, j = args
-    #     score = cos_sim_torch(embed_text(sent), source_embedding)
-    #     return i, j, score
-
-    # def main(soups, sentences):
-    #     source_embeddings = [preprocess(soup) for soup in soups]
-    #     ScoreArray = [[0 for _ in sentences] for _ in soups]
-    #     args_list = []
-    #     for i, soup in enumerate(soups):
-    #         if soup:
-    #             for j, sent in enumerate(sentences):
-    #                 args_list.append((sent, source_embeddings[i], i, j))
-    #     with concurrent.futures.ProcessPoolExecutor() as executor:
-    #         results = executor.map(compute_cosine_similarity, args_list)
-    #     for i, j, score in results:
+
+    #         for j, sent in enumerate(sentences):
+    #             args_list = (sent, page_content)
+    #             score = matching_score(args_list)
+    #             # score = cos_sim_torch(embed_text(sent), source_embeddings[i])
     #             ScoreArray[i][j] = score
-    #     return ScoreArray
 
-    # # Populate matching scores for scrapped pages
-    # ScoreArray = main(soups, sentences)
-    # *******************************************************************************************
+    input_data = []
+    for i, soup in enumerate(soups):
+        if soup:
+            page_content = soup.text
+            for j, sent in enumerate(sentences):
+                input_data.append((sent, page_content))
 
-    # Calculate URL of max matching score for each sentence chunk
+    scores = process_with_multiprocessing(input_data)
+    k = 0
+    for i, soup in enumerate(soups):
+        if soup:
+            for j, _ in enumerate(sentences):
+                ScoreArray[i][j] = scores[k]
+                k += 1
+
     sentenceToMaxURL = [-1] * len(sentences)
+
    for j in range(len(sentences)):
         if j > 0:
             maxScore = ScoreArray[sentenceToMaxURL[j - 1]][j]
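
In short, this commit replaces the serial loop that scored every (sentence, page) pair in-process with a four-worker multiprocessing.Pool fed a flat list of (sentence, page_content) tuples; matching_score now unpacks that tuple directly and, unlike the removed version, no longer passes its inputs through remove_punc. A minimal, self-contained sketch of the new scoring path follows; remove_punc and the n-gram construction live outside this diff, so the 5-gram helper below is an assumed stand-in:

import re
from multiprocessing import Pool

WORD = re.compile(r"\w+")

def _word_ngrams(sentence, n=5):
    # Assumed stand-in for the n-gram helper plagiarism.py builds outside this diff.
    words = WORD.findall(sentence)
    return [tuple(words[i:i + n]) for i in range(len(words) - n + 1)]

def matching_score(sentence_content_tuple):
    # New signature from this commit: one picklable tuple per task,
    # so the function can be passed straight to pool.map().
    sentence, content = sentence_content_tuple
    if sentence in content:
        return 1
    # Otherwise: fraction of the sentence's 5-grams found verbatim in the page.
    ngrams = _word_ngrams(sentence, 5)
    if len(ngrams) == 0:
        return 0
    matched = [x for x in ngrams if " ".join(x) in content]
    return len(matched) / len(ngrams)

def process_with_multiprocessing(input_data):
    # Four worker processes, as hard-coded in the commit.
    with Pool(processes=4) as pool:
        scores = pool.map(matching_score, input_data)
    return scores

if __name__ == "__main__":  # guard matters: Pool re-imports this module on spawn
    page = "the quick brown fox jumps over the lazy dog again and again"
    sentences = ["quick brown fox jumps over the lazy dog",
                 "nothing in common here at all"]
    print(process_with_multiprocessing([(s, page) for s in sentences]))
    # -> [1, 0.0]: the first sentence is a verbatim substring of the page

Note that pool.map preserves input order, which is what lets plagiarism_check unflatten the returned scores back into ScoreArray with a single running counter.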
 
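
One consequence of the flat input_data list: the commit writes the scores back by replaying the same enumerate(soups)/enumerate(sentences) iteration with a counter k, which stays correct only while both passes skip exactly the same empty soups. A sketch of an alternative that carries the (i, j) indices through the pool instead, under the same assumptions about soups, sentences, and ScoreArray; fill_score_array and matching_score_indexed are hypothetical names, and matching_score is the function this commit defines:

from multiprocessing import Pool

def matching_score_indexed(args):
    # Wraps matching_score so each result names its own ScoreArray cell.
    i, j, sentence, content = args
    return i, j, matching_score((sentence, content))

def fill_score_array(soups, sentences, ScoreArray):
    # Build one task per (page, sentence) pair, skipping failed scrapes.
    tasks = [
        (i, j, sent, soup.text)
        for i, soup in enumerate(soups) if soup
        for j, sent in enumerate(sentences)
    ]
    with Pool(processes=4) as pool:
        for i, j, score in pool.map(matching_score_indexed, tasks):
            ScoreArray[i][j] = score
    return ScoreArray

This removes the bookkeeping coupling between the two loops at the cost of shipping two integers with each task; the page text is serialized once per (sentence, page) pair in either version.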