Spaces: Running
aliasgerovs committed • c78ec74
Parent(s): c55931d
Update plagiarism.py
plagiarism.py: +33 -47
plagiarism.py
CHANGED
@@ -9,6 +9,7 @@ import httpx
 from bs4 import BeautifulSoup
 import numpy as np
 import concurrent
+from multiprocessing import Pool
 
 
 WORD = re.compile(r"\w+")
@@ -189,9 +190,9 @@ async def parallel_scrap(urls):
     return results
 
 
-def matching_score(args_list):
-    sentence, content = args_list
-    content = 
+
+def matching_score(sentence_content_tuple):
+    sentence, content = sentence_content_tuple
     if sentence in content:
         return 1
     else:
@@ -200,9 +201,13 @@ def matching_score(args_list):
         if len(ngrams) == 0:
             return 0
         matched = [x for x in ngrams if " ".join(x) in content]
-
-
+        return len(matched) / len(ngrams)
 
+
+def process_with_multiprocessing(input_data):
+    with Pool(processes=4) as pool:
+        scores = pool.map(matching_score, input_data)
+    return scores
+
 def plagiarism_check(
     plag_option,
     input,
@@ -244,55 +249,36 @@ def plagiarism_check(
     # Scrape URLs in list
     formatted_tokens = []
     soups = asyncio.run(parallel_scrap(urlList))
-
-    # Populate matching scores for scrapped pages
-    for i, soup in enumerate(soups):
-        print(f"Analyzing {i+1} of {len(soups)} soups........................")
-        if soup:
-            page_content = soup.text
-            for j, sent in enumerate(sentences):
-                args_list = (sent, page_content)
-                score = matching_score(args_list)
-                # score = cos_sim_torch(embed_text(sent), source_embeddings[i])
-                ScoreArray[i][j] = score
-
-    # with concurrent.futures.ProcessPoolExecutor() as executor:
-    #     results = executor.map(matching_score, args_list)
-
-    # *****IF THIS IS TO BE USED, PLEASE PROVIDE "preprocess()" FUNCTION IN LINE 248**************
-    # source_embeddings = []
+
+    # # Populate matching scores for scrapped pages
     # for i, soup in enumerate(soups):
+    #     print(f"Analyzing {i+1} of {len(soups)} soups........................")
     #     if soup:
     #         page_content = soup.text
-
-    #
-    #
-
-    # def compute_cosine_similarity(args):
-    #     sent, source_embedding, i, j = args
-    #     score = cos_sim_torch(embed_text(sent), source_embedding)
-    #     return i, j, score
-
-    # def main(soups, sentences):
-    #     source_embeddings = [preprocess(soup) for soup in soups]
-    #     ScoreArray = [[0 for _ in sentences] for _ in soups]
-    #     args_list = []
-    #     for i, soup in enumerate(soups):
-    #         if soup:
-    #             for j, sent in enumerate(sentences):
-    #                 args_list.append((sent, source_embeddings[i], i, j))
-    #     with concurrent.futures.ProcessPoolExecutor() as executor:
-    #         results = executor.map(compute_cosine_similarity, args_list)
-    #         for i, j, score in results:
+
+    #         for j, sent in enumerate(sentences):
+    #             args_list = (sent, page_content)
+    #             score = matching_score(args_list)
+    #             # score = cos_sim_torch(embed_text(sent), source_embeddings[i])
     #             ScoreArray[i][j] = score
-    #     return ScoreArray
 
-
-
-
+    input_data = []
+    for i, soup in enumerate(soups):
+        if soup:
+            page_content = soup.text
+            for j, sent in enumerate(sentences):
+                input_data.append((sent, page_content))
 
-
+    scores = process_with_multiprocessing(input_data)
+    k = 0
+    for i, soup in enumerate(soups):
+        if soup:
+            for j, _ in enumerate(sentences):
+                ScoreArray[i][j] = scores[k]
+                k += 1
+
     sentenceToMaxURL = [-1] * len(sentences)
+
    for j in range(len(sentences)):
         if j > 0:
             maxScore = ScoreArray[sentenceToMaxURL[j - 1]][j]
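For context, here is a minimal, self-contained sketch of the scoring path this commit lands on. matching_score and process_with_multiprocessing mirror the committed code; get_ngrams and its n-gram length are hypothetical stand-ins for the n-gram query builder the file actually uses, since those lines fall outside the diff.

import re
from multiprocessing import Pool

WORD = re.compile(r"\w+")

def get_ngrams(sentence, n=4):
    # Hypothetical stand-in for the repo's own n-gram builder (not shown in this diff).
    words = WORD.findall(sentence.lower())
    return [tuple(words[i:i + n]) for i in range(len(words) - n + 1)]

def matching_score(sentence_content_tuple):
    # Same shape as the committed function: an exact substring hit scores 1,
    # otherwise the score is the fraction of the sentence's n-grams found in the page.
    sentence, content = sentence_content_tuple
    if sentence in content:
        return 1
    ngrams = get_ngrams(sentence)
    if len(ngrams) == 0:
        return 0
    matched = [x for x in ngrams if " ".join(x) in content]
    return len(matched) / len(ngrams)

def process_with_multiprocessing(input_data):
    # As in the commit: four worker processes map matching_score
    # over (sentence, page_content) tuples.
    with Pool(processes=4) as pool:
        scores = pool.map(matching_score, input_data)
    return scores

if __name__ == "__main__":
    sentences = ["the quick brown fox jumps over the lazy dog"]
    pages = [
        "a page where the quick brown fox jumps over the lazy dog appears verbatim",
        "an unrelated page about container orchestration",
    ]
    input_data = [(sent, page) for page in pages for sent in sentences]
    print(process_with_multiprocessing(input_data))  # -> [1, 0.0]

Note that Pool.map returns results in the same order as input_data, which is why the committed code can rebuild ScoreArray afterwards by re-walking soups and sentences in the same order with a running index k.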