aliasgerovs committed on
Commit
d53b62d
1 Parent(s): 3d16af9
Files changed (1) hide show
  1. app.py +28 -7
app.py CHANGED
@@ -17,6 +17,7 @@ import fitz
17
  from transformers import GPT2LMHeadModel, GPT2TokenizerFast
18
  import nltk, spacy, subprocess, torch
19
  import plotly.graph_objects as go
 
20
  import nltk
21
  from unidecode import unidecode
22
 
@@ -104,7 +105,6 @@ def plagiarism_check(
104
  # print("New Score Array:\n")
105
  # print2D(ScoreArray)
106
 
107
-
108
  # Gradio formatting section
109
  sentencePlag = [False] * len(sentences)
110
  sentenceToMaxURL = [-1] * len(sentences)
@@ -192,9 +192,11 @@ text_mc_model_path = "polygraf-ai/ai-text-detection-mc-robert-open-ai-detector-v
192
  text_mc_tokenizer = AutoTokenizer.from_pretrained(text_mc_model_path)
193
  text_mc_model = AutoModelForSequenceClassification.from_pretrained(text_mc_model_path).to(device)
194
 
 
 
 
 
195
  def remove_accents(input_str):
196
- # nfkd_form = unicodedata.normalize('NFKD', input_str)
197
- # return "".join([char for char in nfkd_form if not unicodedata.combining(char)])
198
  text_no_accents = unidecode(input_str)
199
  return text_no_accents
200
 
@@ -266,12 +268,17 @@ def split_text_allow_complete_sentences_nltk(text, max_length=256, tolerance=30,
266
  decoded_segments.append(decoded_segment)
267
  return decoded_segments
268
 
 
 
 
 
 
 
269
 
270
  def predict_bc(model, tokenizer, text):
271
  tokens = text_bc_tokenizer(
272
  text, padding='max_length', truncation=True, max_length=256, return_tensors="pt"
273
  ).to(device)["input_ids"]
274
-
275
  output = model(tokens)
276
  output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
277
  print("BC Score: ", output_norm)
@@ -360,12 +367,14 @@ def main(
360
  )
361
  depth_analysis_plot = depth_analysis(input)
362
  bc_score, mc_score = ai_generated_test(ai_option,input)
 
363
 
364
  return (
365
  bc_score,
366
  mc_score,
367
  formatted_tokens,
368
  depth_analysis_plot,
 
369
  )
370
 
371
 
@@ -520,8 +529,11 @@ with gr.Blocks() as demo:
520
  only_ai_btn = gr.Button("AI Check")
521
 
522
  with gr.Column():
523
- only_plagiarism_btn = gr.Button("Source Detection")
524
-
 
 
 
525
  with gr.Row():
526
  depth_analysis_btn = gr.Button("Detailed Writing Analysis")
527
 
@@ -546,7 +558,8 @@ with gr.Blocks() as demo:
546
  bcLabel = gr.Label(label="Source")
547
  with gr.Column():
548
  mcLabel = gr.Label(label="Creator")
549
-
 
550
  with gr.Group():
551
  with gr.Row():
552
  month_from = gr.Dropdown(
@@ -615,6 +628,7 @@ with gr.Blocks() as demo:
615
  mcLabel,
616
  sentenceBreakdown,
617
  writing_analysis_plot,
 
618
  ],
619
  api_name="main",
620
  )
@@ -629,6 +643,13 @@ with gr.Blocks() as demo:
629
  api_name="ai_check",
630
  )
631
 
 
 
 
 
 
 
 
632
  only_plagiarism_btn.click(
633
  fn=plagiarism_check,
634
  inputs=[
 
17
  from transformers import GPT2LMHeadModel, GPT2TokenizerFast
18
  import nltk, spacy, subprocess, torch
19
  import plotly.graph_objects as go
20
+ import torch.nn.functional as F
21
  import nltk
22
  from unidecode import unidecode
23
 
 
105
  # print("New Score Array:\n")
106
  # print2D(ScoreArray)
107
 
 
108
  # Gradio formatting section
109
  sentencePlag = [False] * len(sentences)
110
  sentenceToMaxURL = [-1] * len(sentences)
 
192
  text_mc_tokenizer = AutoTokenizer.from_pretrained(text_mc_model_path)
193
  text_mc_model = AutoModelForSequenceClassification.from_pretrained(text_mc_model_path).to(device)
194
 
195
+ quillbot_labels = ["Original", "QuillBot"]
196
+ quillbot_tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
197
+ quillbot_model = AutoModelForSequenceClassification.from_pretrained("polygraf-ai/quillbot-detector-roberta-base-28K").to(device)
198
+
199
def remove_accents(input_str):
    """Transliterate *input_str* into plain ASCII.

    Uses ``unidecode`` to replace accented characters with their closest
    ASCII equivalents (e.g. "café" -> "cafe").

    Parameters
    ----------
    input_str : str
        Text that may contain accented / non-ASCII characters.

    Returns
    -------
    str
        The ASCII-transliterated text.
    """
    return unidecode(input_str)
202
 
 
268
  decoded_segments.append(decoded_segment)
269
  return decoded_segments
270
 
271
def predict_quillbot(text):
    """Classify *text* as original writing vs. QuillBot-style paraphrase.

    Tokenizes the input with the module-level ``quillbot_tokenizer``
    (fixed 256-token padded/truncated window), runs the module-level
    ``quillbot_model`` on ``device``, and softmax-normalizes the logits.

    Parameters
    ----------
    text : str
        The text to score.

    Returns
    -------
    dict
        ``{"QuillBot": p1, "Original": p0}`` — class probabilities,
        where index 1 of the model output is the QuillBot class.
    """
    input_ids = quillbot_tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=256,
        return_tensors="pt",
    ).to(device)["input_ids"]
    # Detach and move to CPU before converting logits to probabilities.
    logits = quillbot_model(input_ids).logits.detach().cpu().numpy()
    probs = softmax(logits, 1)[0]
    return {"QuillBot": probs[1].item(), "Original": probs[0].item()}
277
 
278
  def predict_bc(model, tokenizer, text):
279
  tokens = text_bc_tokenizer(
280
  text, padding='max_length', truncation=True, max_length=256, return_tensors="pt"
281
  ).to(device)["input_ids"]
 
282
  output = model(tokens)
283
  output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
284
  print("BC Score: ", output_norm)
 
367
  )
368
  depth_analysis_plot = depth_analysis(input)
369
  bc_score, mc_score = ai_generated_test(ai_option,input)
370
+ quilscore = predict_quillbot(input)
371
 
372
  return (
373
  bc_score,
374
  mc_score,
375
  formatted_tokens,
376
  depth_analysis_plot,
377
+ quilscore
378
  )
379
 
380
 
 
529
  only_ai_btn = gr.Button("AI Check")
530
 
531
  with gr.Column():
532
+ only_plagiarism_btn = gr.Button("Source Check")
533
+
534
+ with gr.Row():
535
+ quillbot_check = gr.Button("Humanized Text Check (Quillbot)")
536
+
537
  with gr.Row():
538
  depth_analysis_btn = gr.Button("Detailed Writing Analysis")
539
 
 
558
  bcLabel = gr.Label(label="Source")
559
  with gr.Column():
560
  mcLabel = gr.Label(label="Creator")
561
+ with gr.Row():
562
+ QLabel = gr.Label(label="Humanized")
563
  with gr.Group():
564
  with gr.Row():
565
  month_from = gr.Dropdown(
 
628
  mcLabel,
629
  sentenceBreakdown,
630
  writing_analysis_plot,
631
+ QLabel
632
  ],
633
  api_name="main",
634
  )
 
643
  api_name="ai_check",
644
  )
645
 
646
+ quillbot_check.click(
647
+ fn=predict_quillbot,
648
+ inputs=[input_text],
649
+ outputs=[QLabel],
650
+ api_name="quillbot_check",
651
+ )
652
+
653
  only_plagiarism_btn.click(
654
  fn=plagiarism_check,
655
  inputs=[