minko186 committed (verified)
Commit 8a482d3 · Parent: 227a8b5

Update predictors.py

Files changed (1): predictors.py +24 -564
predictors.py CHANGED
@@ -1,17 +1,4 @@
- import time
- from nltk.tokenize import sent_tokenize
- from googleapiclient.discovery import build
- from collections import Counter
- import re, math
- from sentence_transformers import SentenceTransformer, util
- import asyncio
- import httpx
- from bs4 import BeautifulSoup
- import numpy as np
- import concurrent
- from multiprocessing import Pool
- from const import url_types
- from collections import defaultdict
+ import torch
  import numpy as np
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
  import nltk
@@ -307,555 +294,28 @@ def predict_mc_scores(input):
      mc_scores = []
      segments_mc = split_text_allow_complete_sentences_nltk(
          input, type_det="mc"
-
-
- WORD = re.compile(r"\w+")
- model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
-
-
- months = {
-     "January": "01",
-     "February": "02",
-     "March": "03",
-     "April": "04",
-     "May": "05",
-     "June": "06",
-     "July": "07",
-     "August": "08",
-     "September": "09",
-     "October": "10",
-     "November": "11",
-     "December": "12",
- }
-
- color_map = [
-     "#cf2323",
-     "#d65129",
-     "#d66329",
-     "#d67129",
-     "#eb9d59",
-     "#c2ad36",
-     "#d6ae29",
-     "#d6b929",
-     "#e1ed72",
-     "#c2db76",
-     "#a2db76",
- ]
-
-
- def text_to_vector(text):
-     words = WORD.findall(text)
-     return Counter(words)
-
-
- def cosineSim(text1, text2):
-     vector1 = text_to_vector(text1)
-     vector2 = text_to_vector(text2)
-     # print vector1,vector2
-     cosine = get_cosine(vector1, vector2)
-     return cosine
-
-
- def get_cosine(vec1, vec2):
-     intersection = set(vec1.keys()) & set(vec2.keys())
-     numerator = sum([vec1[x] * vec2[x] for x in intersection])
-     sum1 = sum([vec1[x] ** 2 for x in vec1.keys()])
-     sum2 = sum([vec2[x] ** 2 for x in vec2.keys()])
-     denominator = math.sqrt(sum1) * math.sqrt(sum2)
-     if denominator == 0:
-         return 0.0
-     else:
-         return float(numerator) / denominator
-
-
- def split_sentence_blocks(text, size):
-     if size == "Paragraph":
-         blocks = text.strip().split("\n")
-         return blocks
-     else:
-         sents = sent_tokenize(text.strip())
-         return sents
-
-
- def build_date(year=2024, month="March", day=1):
-     return f"{year}{months[month]}{day}"
-
-
- def split_ngrams(text, n):
-     words = text.split()
-     return [tuple(words[i : i + n]) for i in range(len(words) - n + 1)]
-
-
- def sentence_similarity(text1, text2):
-     embedding_1 = model.encode(text1, convert_to_tensor=True)
-     embedding_2 = model.encode(text2, convert_to_tensor=True)
-     o = util.pytorch_cos_sim(embedding_1, embedding_2)
-     return o.item()
-
-
- async def get_url_data(url, client):
-     try:
-         r = await client.get(url)
-         if r.status_code == 200:
-             soup = BeautifulSoup(r.content, "html.parser")
-             return soup
-     except Exception:
-         return None
-
-
- async def parallel_scrap(urls):
-     async with httpx.AsyncClient(timeout=30) as client:
-         tasks = []
-         for url in urls:
-             tasks.append(get_url_data(url=url, client=client))
-         results = await asyncio.gather(*tasks, return_exceptions=True)
-         return results
-
-
- def merge_ngrams_into_sentence(ngrams):
-     if ngrams == None:
-         return ""
-     if len(ngrams) > 20:
-         ngrams = ngrams[:20]
-     merged_sentence = []
-     i = 0
-     for ngram in ngrams:
-         overlap = len(set(ngram) & set(merged_sentence[-len(ngram) :]))
-         if overlap == 0:
-             merged_sentence.extend(ngram)
-         elif overlap < len(ngram):
-             merged_sentence.extend(ngram[overlap:])
-     return " ".join(merged_sentence)
-
-
- def remove_ngrams_after(ngrams, target_ngram):
-     try:
-         index = ngrams.index(target_ngram)
-         return ngrams[: index + 1]
-     except ValueError:
-         return None
-
-
- def matching_score(sentence_content_tuple):
-     sentence, content, score = sentence_content_tuple
-     if sentence in content:
-         return 1, sentence
-     # if score > 0.9:
-     #     return score
-     else:
-         n = 5
-
-         # ngrams = split_ngrams(sentence, n)
-         # if len(ngrams) == 0:
-         #     return 0
-         # matched = [x for x in ngrams if " ".join(x) in content]
-         # return len(matched) / len(ngrams)
-
-         # list comprehension matching
-         # ngrams_sentence = split_ngrams(sentence, n)
-         # ngrams_content = [tuple(ngram) for ngram in split_ngrams(content, n)]
-         # if len(ngrams_sentence) == 0:
-         #     return 0, ""
-         # matched_ngrams = [
-         #     1 for ngram in ngrams_sentence if tuple(ngram) in ngrams_content
-         # ]
-         # matched_count = sum(matched_ngrams)
-
-         # set intersection matching
-         ngrams_sentence = set(split_ngrams(sentence, n))
-         ngrams_content = set(split_ngrams(content, n))
-         if len(ngrams_sentence) == 0:
-             return 0, ""
-         matched_ngrams = ngrams_sentence.intersection(ngrams_content)
-         matched_count = len(matched_ngrams)
-
-         # matched content
-         matched_content_ngrams = []
-         found = False
-         last_found = None
-         for ngram in ngrams_sentence:
-             for ngram_content in ngrams_content:
-                 if tuple(ngram) == ngram_content:
-                     found = True
-                     last_found = ngram_content
-                 if found:
-                     matched_content_ngrams.append(ngram_content)
-         matched_content_ngrams = remove_ngrams_after(
-             matched_content_ngrams, last_found
-         )
-         matched_content = merge_ngrams_into_sentence(matched_content_ngrams)
-
-         return matched_count / len(ngrams_sentence), matched_content
-
-
- def process_with_multiprocessing(input_data):
-     with Pool(processes=1) as pool:
-         scores = pool.map(matching_score, input_data)
-     return scores
-
-
- def map_sentence_url(sentences, score_array):
-     sentenceToMaxURL = [-1] * len(sentences)
-     for j in range(len(sentences)):
-         if j > 0:
-             maxScore = score_array[sentenceToMaxURL[j - 1]][j]
-             sentenceToMaxURL[j] = sentenceToMaxURL[j - 1]
-         else:
-             maxScore = -1
-         for i in range(len(score_array)):
-             margin = (
-                 0.05
-                 if (j > 0 and sentenceToMaxURL[j] == sentenceToMaxURL[j - 1])
-                 else 0
-             )
-             if score_array[i][j] - maxScore > margin:
-                 maxScore = score_array[i][j]
-                 sentenceToMaxURL[j] = i
-     return sentenceToMaxURL
-
-
- def check_url_category(url):
-     for category, urls in url_types.items():
-         for u in urls:
-             if u in url:
-                 return category
-     return "Internet Source"
-
-
- def google_search(
-     plag_option,
-     sentences,
-     url_count,
-     score_array,
-     url_list,
-     snippets,
-     sorted_date,
-     domains_to_skip,
-     api_key,
-     cse_id,
-     **kwargs,
- ):
-     service = build("customsearch", "v1", developerKey=api_key)
-     num_pages = 1
-     for i, sentence in enumerate(sentences):
-         results = (
-             service.cse()
-             .list(q=sentence, cx=cse_id, sort=sorted_date, **kwargs)
-             .execute()
-         )
-         if "items" in results and len(results["items"]) > 0:
-             for count, link in enumerate(results["items"]):
-                 if count >= num_pages:
-                     break
-                 # skip user selected domains
-                 if (domains_to_skip is not None) and any(
-                     ("." + domain) in link["link"] for domain in domains_to_skip
-                 ):
-                     continue
-                 # clean up snippet of '...'
-                 snippet = link["snippet"]
-                 ind = snippet.find("...")
-                 if ind < 20 and ind > 9:
-                     snippet = snippet[ind + len("... ") :]
-                 ind = snippet.find("...")
-                 if ind > len(snippet) - 5:
-                     snippet = snippet[:ind]
-
-                 # update cosine similarity between snippet and given text
-                 url = link["link"]
-                 if url not in url_list:
-                     url_list.append(url)
-                     score_array.append([0] * len(sentences))
-                     snippets.append([""] * len(sentences))
-                 url_count[url] = url_count[url] + 1 if url in url_count else 1
-                 snippets[url_list.index(url)][i] = snippet
-                 if plag_option == "Standard":
-                     score_array[url_list.index(url)][i] = cosineSim(
-                         sentence, snippet
-                     )
-                 else:
-                     score_array[url_list.index(url)][i] = sentence_similarity(
-                         sentence, snippet
-                     )
-     return url_count, score_array
-
-
- def plagiarism_check(
-     plag_option,
-     input,
-     year_from,
-     month_from,
-     day_from,
-     year_to,
-     month_to,
-     day_to,
-     domains_to_skip,
-     source_block_size,
- ):
-     # api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
-     # api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
-     # api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
-     api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
-     # api_key = "AIzaSyCB61O70B8AC3l5Kk3KMoLb6DN37B7nqIk"
-     # api_key = "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg"
-     # api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
-     cse_id = "851813e81162b4ed4"
-
-     url_scores = []
-     sentence_scores = []
-     sentences = split_sentence_blocks(input, source_block_size)
-     url_count = {}
-     score_array = []
-     url_list = []
-     snippets = []
-     date_from = build_date(year_from, month_from, day_from)
-     date_to = build_date(year_to, month_to, day_to)
-     sort_date = f"date:r:{date_from}:{date_to}"
-     # get list of URLS to check
-     start_time = time.perf_counter()
-     url_count, score_array = google_search(
-         plag_option,
-         sentences,
-         url_count,
-         score_array,
-         url_list,
-         snippets,
-         sort_date,
-         domains_to_skip,
-         api_key,
-         cse_id,
      )
- print("GOOGLE SEARCH PROCESSING TIME: ", time.perf_counter() - start_time)
629
- # Scrape URLs in list
630
- start_time = time.perf_counter()
631
- soups = asyncio.run(parallel_scrap(url_list))
632
- print("SCRAPING PROCESSING TIME: ", time.perf_counter() - start_time)
633
- input_data = []
634
- for i, soup in enumerate(soups):
635
- if soup:
636
- page_content = soup.text
637
- for j, sent in enumerate(sentences):
638
- input_data.append((sent, page_content, score_array[i][j]))
639
- start_time = time.perf_counter()
640
- # scores = process_with_multiprocessing(input_data)
641
- scores = []
642
- for i in input_data:
643
- scores.append(matching_score(i))
644
- print("MATCHING SCORE PROCESSING TIME: ", time.perf_counter() - start_time)
645
- matched_sentence_array = [
646
- ["" for _ in range(len(score_array[0]))]
647
- for _ in range(len(score_array))
648
- ]
649
-
650
- k = 0
651
- # Update score array for each (soup, sentence)
652
- for i, soup in enumerate(soups):
653
- if soup:
654
- for j, _ in enumerate(sentences):
655
- score_array[i][j] = scores[k][0]
656
- matched_sentence_array[i][j] = scores[k][1]
657
- k += 1
658
-
659
- sentenceToMaxURL = map_sentence_url(sentences, score_array)
660
- index = np.unique(sentenceToMaxURL)
661
-
662
- url_source = {}
663
- for url in index:
664
- s = [
665
- score_array[url][sen]
666
- for sen in range(len(sentences))
667
- if sentenceToMaxURL[sen] == url
668
- ]
669
- url_source[url] = sum(s) / len(s)
670
- index_descending = sorted(url_source, key=url_source.get, reverse=True)
671
- urlMap = {}
672
- for count, i in enumerate(index_descending):
673
- urlMap[i] = count + 1
674
-
675
- # build results
676
- for i, sent in enumerate(sentences):
677
- ind = sentenceToMaxURL[i]
678
- if url_source[ind] > 0.1:
679
- sentence_scores.append(
680
- [
681
- sent,
682
- round(url_source[ind] * 100, 2),
683
- url_list[ind],
684
- urlMap[ind],
685
- ]
686
- )
687
- else:
688
- sentence_scores.append([sent, None, url_list[ind], -1])
689
- print("SNIPPETS: ", snippets)
690
- snippets = [[item for item in sublist if item] for sublist in snippets]
691
- for ind in index_descending:
692
- if url_source[ind] > 0.1:
693
- matched_sentence_array = [
694
- [item for item in sublist if item]
695
- for sublist in matched_sentence_array
696
- ]
697
- matched_sentence = "...".join(
698
- [sent for sent in matched_sentence_array[ind]]
699
- )
700
- if matched_sentence == "":
701
- matched_sentence = "...".join([sent for sent in snippets[ind]])
702
- url_scores.append(
703
- [
704
- url_list[ind],
705
- round(url_source[ind] * 100, 2),
706
- urlMap[ind],
707
- matched_sentence,
708
- ]
709
- )
710
-
711
- return sentence_scores, url_scores
712
-
713
-
714
- def html_highlight(
715
- plag_option,
716
- input,
717
- year_from,
718
- month_from,
719
- day_from,
720
- year_to,
721
- month_to,
722
- day_to,
723
- domains_to_skip,
724
- source_block_size,
725
- ):
726
- start_time = time.perf_counter()
727
- sentence_scores, url_scores = plagiarism_check(
728
- plag_option,
729
- input,
730
- year_from,
731
- month_from,
732
- day_from,
733
- year_to,
734
- month_to,
735
- day_to,
736
- domains_to_skip,
737
- source_block_size,
738
  )
739
-
-     html_content = """
-     <link href='https://fonts.googleapis.com/css?family=Roboto' rel='stylesheet'>
-     <div style='font-family: {font}; border: 2px solid black; padding: 10px; color: #FFFFFF;'>
-     <html>
-     <head>
-     <title>Toggle Details</title>
-     <style>
-         .score-container {
-             display: flex;
-             justify-content: space-around;
-             align-items: left;
-             padding: 20px;
-         }
-         .score-item {
-             text-align: center;
-             padding: 10px;
-             background-color: #636362;
-             border-radius: 5px;
-             flex-grow: 1;
-             margin: 0 5px;
-         }
-         .details {
-             display: none;
-             padding: 10px;
-         }
-         .url-link {
-             font-size: 1.2em;
-         }
-         .url-link span {
-             margin-right: 10px;
-         }
-         .toggle-button {
-             color: #333;
-             border: none;
-             padding: 5px 10px;
-             text-align: center;
-             text-decoration: none;
-             display: inline-block;
-             cursor: pointer;
-         }
-     </style>
-     </head>
-     """
-
-     prev_idx = None
-     combined_sentence = ""
-     total_score = 0
-     total_count = 0
-     category_scores = defaultdict(set)
-     for sentence, score, url, idx in sentence_scores:
-         category = check_url_category(url)
-         if score is None:
-             total_score += 0
-         else:
-             total_score += score
-             category_scores[category].add(score)
-         total_count += 1
-
-         if idx != prev_idx and prev_idx is not None:
-             color = color_map[prev_idx - 1]
-             index_part = f"<span>[{prev_idx}]</span>"
-             formatted_sentence = f'<p style="background-color: {color}; padding: 2px;">{combined_sentence} {index_part}</p>'
-             html_content += formatted_sentence
-             combined_sentence = ""
-         combined_sentence += " " + sentence
-         prev_idx = idx
-
-     print(category_scores)
-     total_average_score = round(total_score / total_count, 2)
-     category_averages = {
-         category: round((sum(scores) / len(scores)), 2)
-         for category, scores in category_scores.items()
-     }
-
-     if combined_sentence:
-         color = color_map[prev_idx - 1]
-         index_part = ""
-         if prev_idx != -1:
-             index_part = f"<span>[{prev_idx}]</span>"
-         formatted_sentence = f'<p style="background-color: {color}; padding: 2px;">{combined_sentence} {index_part}</p>'
-         html_content += formatted_sentence
-
-     html_content += "<hr>"
-
-     html_content += f"""
-     <div class="score-container">
-         <div class="score-item">
-             <h3>Overall Similarity</h3>
-             <p>{total_average_score}%</p>
-         </div>
-     """
-     for category, score in category_averages.items():
-         html_content += f"""
-         <div class="score-item"><h3>{category}</h3><p>{score}%</p></div>
-         """
-     html_content += "</div>"
-
-     for url, score, idx, sentence in url_scores:
-         url_category = check_url_category(url)
-         color = color_map[idx - 1]
-         formatted_url = f"""
-         <p style="background-color: {color}; padding: 5px; font-size: 1.2em">[{idx}] <b>{url}</b></p><p><i>{url_category}</i></p>
-         <p> --- <b>Matching Score: </b>{score}%</p>
-         <p> --- <b>Original Source Content: </b>{sentence}</p>
-         """
-         # formatted_url = f"""
-         # <div class="url-link">
-         #     <p style="background-color: {color}; padding: 5px; font-size: 1.2em">[{idx}] <b>{url}</b></p><p>{url_category}</p>
-         #     <a href="#" onclick="toggleDetails(event)" class="toggle-button">&gt;</a>
-         # </div>
-         # <div id="detailsContainer" class="details">
-         #     <p> --- <b>Matching Score: </b>{score}%</p>
-         #     <p> --- <b>Original Source Content: </b>{sentence}</p>
-         # </div>
-         # """
-         html_content += formatted_url
-
-     html_content += "</html>"
-
-     print("PLAGIARISM PROCESSING TIME: ", time.perf_counter() - start_time)
-
-     return html_content
+     for i in range(samples_len_mc):
+         cleaned_text_mc = remove_special_characters(segments_mc[i])
+         mc_score = predict_mc(
+             text_mc_model, text_mc_tokenizer, cleaned_text_mc
+         )
+         mc_scores.append(mc_score)
+     mc_scores_array = np.array(mc_scores)
+     average_mc_scores = np.mean(mc_scores_array, axis=0)
+     mc_score_list = average_mc_scores.tolist()
+     mc_score = {}
+     for score, label in zip(mc_score_list, mc_label_map):
+         mc_score[label.upper()] = score
+
+     sum_prob = 1 - bc_score["HUMAN"]
+     for key, value in mc_score.items():
+         mc_score[key] = value * sum_prob
+     print("MC Score:", mc_score)
+     if sum_prob < 0.01:
+         mc_score = {}
+
+     return mc_score
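
For context, the tail of the new hunk averages the per-segment multi-class (MC) probabilities and then rescales each label by the probability mass the binary classifier assigns to "not human" (1 - bc_score["HUMAN"]). Below is a minimal standalone sketch of that weighting step; the label names, bc_score values, and per-segment probabilities are made-up illustrations, not values from this repo.

import numpy as np

# Hypothetical inputs; in predictors.py these come from the MC/BC model predictions.
mc_label_map = ["openai", "mistral", "llama"]        # assumed label order
bc_score = {"HUMAN": 0.30, "AI": 0.70}               # assumed binary-classifier output
segment_probs = [[0.5, 0.3, 0.2], [0.7, 0.2, 0.1]]   # assumed per-segment MC probabilities

# Average the per-label probabilities over all segments.
average_mc_scores = np.mean(np.array(segment_probs), axis=0)
mc_score = {label.upper(): p for label, p in zip(mc_label_map, average_mc_scores.tolist())}

# Scale each label by the non-human probability mass from the binary classifier.
sum_prob = 1 - bc_score["HUMAN"]
mc_score = {key: value * sum_prob for key, value in mc_score.items()}

# If essentially all mass is judged human, report no MC breakdown.
if sum_prob < 0.01:
    mc_score = {}

print(mc_score)  # approximately {'OPENAI': 0.42, 'MISTRAL': 0.175, 'LLAMA': 0.105}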