aliasgerovs committed
Commit c78ec74
Parent: c55931d

Update plagiarism.py

Files changed (1): plagiarism.py (+33 −47)
plagiarism.py CHANGED
@@ -9,6 +9,7 @@ import httpx
 from bs4 import BeautifulSoup
 import numpy as np
 import concurrent
+from multiprocessing import Pool
 
 
 WORD = re.compile(r"\w+")
@@ -189,9 +190,9 @@ async def parallel_scrap(urls):
     return results
 
 
-def matching_score(args_list):
-    sentence = remove_punc(args_list[0])
-    content = remove_punc(args_list[1])
+
+def matching_score(sentence_content_tuple):
+    sentence, content = sentence_content_tuple
     if sentence in content:
         return 1
     else:
@@ -200,9 +201,13 @@ def matching_score(args_list):
         if len(ngrams) == 0:
             return 0
         matched = [x for x in ngrams if " ".join(x) in content]
-    return len(matched) / len(ngrams)
-
+        return len(matched) / len(ngrams)
 
+
+def process_with_multiprocessing(input_data):
+    with Pool(processes=4) as pool:
+        scores = pool.map(matching_score, input_data)
+    return scores
+
 def plagiarism_check(
     plag_option,
     input,
@@ -244,55 +249,36 @@ def plagiarism_check(
     # Scrape URLs in list
     formatted_tokens = []
     soups = asyncio.run(parallel_scrap(urlList))
-
-    # Populate matching scores for scrapped pages
-    for i, soup in enumerate(soups):
-        print(f"Analyzing {i+1} of {len(soups)} soups........................")
-        if soup:
-            page_content = soup.text
-            for j, sent in enumerate(sentences):
-                args_list = (sent, page_content)
-                score = matching_score(args_list)
-                # score = cos_sim_torch(embed_text(sent), source_embeddings[i])
-                ScoreArray[i][j] = score
-
-    # with concurrent.futures.ProcessPoolExecutor() as executor:
-    #     results = executor.map(matching_score, args_list)
-
-    # *****IF THIS IS TO BE USED, PLEASE PROVIDE "preprocess()" FUNCTION IN LINE 248**************
-    # source_embeddings = []
+
+    # # Populate matching scores for scrapped pages
     # for i, soup in enumerate(soups):
+    #     print(f"Analyzing {i+1} of {len(soups)} soups........................")
     #     if soup:
     #         page_content = soup.text
-    #         source_embeddings.append(embed_text(page_content))
-    #     else:
-    #         source_embeddings.append(None)
-
-    # def compute_cosine_similarity(args):
-    #     sent, source_embedding, i, j = args
-    #     score = cos_sim_torch(embed_text(sent), source_embedding)
-    #     return i, j, score
-
-    # def main(soups, sentences):
-    #     source_embeddings = [preprocess(soup) for soup in soups]
-    #     ScoreArray = [[0 for _ in sentences] for _ in soups]
-    #     args_list = []
-    #     for i, soup in enumerate(soups):
-    #         if soup:
-    #             for j, sent in enumerate(sentences):
-    #                 args_list.append((sent, source_embeddings[i], i, j))
-    #     with concurrent.futures.ProcessPoolExecutor() as executor:
-    #         results = executor.map(compute_cosine_similarity, args_list)
-    #     for i, j, score in results:
+
+    #         for j, sent in enumerate(sentences):
+    #             args_list = (sent, page_content)
+    #             score = matching_score(args_list)
+    #             # score = cos_sim_torch(embed_text(sent), source_embeddings[i])
     #             ScoreArray[i][j] = score
-    #     return ScoreArray
 
-    # # Populate matching scores for scrapped pages
-    # ScoreArray = main(soups, sentences)
-    # *******************************************************************************************
+    input_data = []
+    for i, soup in enumerate(soups):
+        if soup:
+            page_content = soup.text
+            for j, sent in enumerate(sentences):
+                input_data.append((sent, page_content))
 
-    # Calculate URL of max matching score for each sentence chunk
+    scores = process_with_multiprocessing(input_data)
+    k = 0
+    for i, soup in enumerate(soups):
+        if soup:
+            for j, _ in enumerate(sentences):
+                ScoreArray[i][j] = scores[k]
+                k += 1
+
     sentenceToMaxURL = [-1] * len(sentences)
+
    for j in range(len(sentences)):
         if j > 0:
             maxScore = ScoreArray[sentenceToMaxURL[j - 1]][j]
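
In short, this commit replaces the serial loop that scored every (sentence, page) pair in-process with a four-worker multiprocessing.Pool fed a flat list of (sentence, page_content) tuples; matching_score now unpacks that tuple directly and, unlike the removed version, no longer passes its inputs through remove_punc. A minimal, self-contained sketch of the new scoring path follows; remove_punc and the n-gram construction live outside this diff, so the 5-gram helper below is an assumed stand-in:

import re
from multiprocessing import Pool

WORD = re.compile(r"\w+")

def _word_ngrams(sentence, n=5):
    # Assumed stand-in for the n-gram helper plagiarism.py builds outside this diff.
    words = WORD.findall(sentence)
    return [tuple(words[i:i + n]) for i in range(len(words) - n + 1)]

def matching_score(sentence_content_tuple):
    # New signature from this commit: one picklable tuple per task,
    # so the function can be passed straight to pool.map().
    sentence, content = sentence_content_tuple
    if sentence in content:
        return 1
    # Otherwise: fraction of the sentence's 5-grams found verbatim in the page.
    ngrams = _word_ngrams(sentence, 5)
    if len(ngrams) == 0:
        return 0
    matched = [x for x in ngrams if " ".join(x) in content]
    return len(matched) / len(ngrams)

def process_with_multiprocessing(input_data):
    # Four worker processes, as hard-coded in the commit.
    with Pool(processes=4) as pool:
        scores = pool.map(matching_score, input_data)
    return scores

if __name__ == "__main__":  # guard matters: Pool re-imports this module on spawn
    page = "the quick brown fox jumps over the lazy dog again and again"
    sentences = ["quick brown fox jumps over the lazy dog",
                 "nothing in common here at all"]
    print(process_with_multiprocessing([(s, page) for s in sentences]))
    # -> [1, 0.0]: the first sentence is a verbatim substring of the page

Note that pool.map preserves input order, which is what lets plagiarism_check unflatten the returned scores back into ScoreArray with a single running counter.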
 
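
One consequence of the flat input_data list: the commit writes the scores back by replaying the same enumerate(soups)/enumerate(sentences) iteration with a counter k, which stays correct only while both passes skip exactly the same empty soups. A sketch of an alternative that carries the (i, j) indices through the pool instead, under the same assumptions about soups, sentences, and ScoreArray; fill_score_array and matching_score_indexed are hypothetical names, and matching_score is the function this commit defines:

from multiprocessing import Pool

def matching_score_indexed(args):
    # Wraps matching_score so each result names its own ScoreArray cell.
    i, j, sentence, content = args
    return i, j, matching_score((sentence, content))

def fill_score_array(soups, sentences, ScoreArray):
    # Build one task per (page, sentence) pair, skipping failed scrapes.
    tasks = [
        (i, j, sent, soup.text)
        for i, soup in enumerate(soups) if soup
        for j, sent in enumerate(sentences)
    ]
    with Pool(processes=4) as pool:
        for i, j, score in pool.map(matching_score_indexed, tasks):
            ScoreArray[i][j] = score
    return ScoreArray

This removes the bookkeeping coupling between the two loops at the cost of shipping two integers with each task; the page text is serialized once per (sentence, page) pair in either version.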