Spaces:
Running
Running
from utils import ( | |
cosineSim, | |
googleSearch, | |
getSentences, | |
parallel_scrap, | |
matchingScore, | |
) | |
import gradio as gr | |
from urllib.request import urlopen, Request | |
from googleapiclient.discovery import build | |
import requests | |
import httpx | |
import torch | |
import re | |
from bs4 import BeautifulSoup | |
import numpy as np | |
from transformers import AutoTokenizer, AutoModelForSequenceClassification | |
import asyncio | |
from scipy.special import softmax | |
from evaluate import load | |
from datetime import date | |
import nltk | |
import fitz | |
from transformers import GPT2LMHeadModel, GPT2TokenizerFast | |
import nltk, spacy, subprocess, torch | |
import plotly.graph_objects as go | |
import torch.nn.functional as F | |
import nltk | |
from unidecode import unidecode | |
import time | |
from utils import cos_sim_torch, embed_text | |
import multiprocessing | |
from functools import partial | |
import concurrent.futures | |
from plagiarism import plagiarism_check | |
nltk.download("punkt") | |
from writing_analysis import ( | |
normalize, | |
preprocess_text1, | |
preprocess_text2, | |
vocabulary_richness_ttr, | |
calculate_gunning_fog, | |
calculate_average_sentence_length, | |
calculate_average_word_length, | |
calculate_syntactic_tree_depth, | |
calculate_perplexity, | |
) | |
np.set_printoptions(suppress=True)

# ---------------------------------------------------------------------------
# AI DETECTION SECTION
# ---------------------------------------------------------------------------

# Run every model on the GPU when torch can see one, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"


def _load_classifier(checkpoint):
    """Load a sequence-classification checkpoint and move it onto ``device``."""
    classifier = AutoModelForSequenceClassification.from_pretrained(checkpoint)
    return classifier.to(device)


# "bc" model: its two outputs are reported as HUMAN / AI by ai_generated_test.
text_bc_model_path = "polygraf-ai/text-detect-bc-v11-4m"
text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)
text_bc_model = _load_classifier(text_bc_model_path)

# "mc" model: its outputs are mapped onto source-model labels by
# ai_generated_test.
text_mc_model_path = (
    "polygraf-ai/ai-text-detection-mc-robert-open-ai-detector-v4"
)
text_mc_tokenizer = AutoTokenizer.from_pretrained(text_mc_model_path)
text_mc_model = _load_classifier(text_mc_model_path)

# QuillBot detector: classifier weights come from the polygraf checkpoint,
# the tokenizer from the stock roberta-base checkpoint.
quillbot_labels = ["Original", "QuillBot"]
quillbot_tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
quillbot_model = _load_classifier("polygraf-ai/quillbot-detector-28k")
def remove_accents(input_str):
    """Return ``input_str`` passed through ``unidecode``.

    ``unidecode`` transliterates the text to ASCII, which is what removes
    accented characters.
    """
    return unidecode(input_str)
def remove_special_characters(text):
    """Strip accents from ``text`` and delete characters outside the allowed set.

    The text is first run through ``remove_accents``; every run of characters
    not matched by the allow-list pattern below is then removed.
    """
    # NOTE(review): inside the character class, ``)-;`` is a *range* (from
    # ")" to ";"), so "*", "+", "/", ":" and the digits are kept as well as
    # "-". This may have been meant as a literal hyphen - confirm before
    # changing, since the detectors are fed text cleaned by this exact pattern.
    disallowed_run = r'[^\w\s\d.,!?\'"()-;]+'
    return re.sub(disallowed_run, "", remove_accents(text))
def remove_special_characters_2(text):
    """Delete every character that is not an ASCII letter, a digit or a space."""
    return re.sub(r"[^a-zA-Z0-9 ]+", "", text)
def update_character_count(text):
    """Return the length of ``text`` formatted as ``"<n> characters"``."""
    return f"{len(text)} characters"
def split_text_allow_complete_sentences_nltk(
    text,
    max_length=256,
    tolerance=30,
    min_last_segment_length=100,
    type_det="bc",
):
    """Split ``text`` into sentence-aligned segments sized for a detector.

    Sentences (from ``nltk.sent_tokenize``) are greedily packed into segments
    whose token count stays within ``max_length + tolerance``. If the final
    segment is shorter than ``min_last_segment_length`` tokens it is merged
    into the previous segment, but only when the merged text still fits.

    Args:
        text: Text to split.
        max_length: Kept for interface compatibility. The value is always
            overridden by ``type_det`` (333 for ``"bc"``, 256 for ``"mc"``).
        tolerance: Extra tokens allowed on top of ``max_length``.
        min_last_segment_length: Token count below which the last segment is
            considered too short to stand alone.
        type_det: ``"bc"`` to size segments with ``text_bc_tokenizer`` or
            ``"mc"`` to size them with ``text_mc_tokenizer``.

    Returns:
        A list of strings, one per segment, each produced by encoding the
        segment (truncated to the token budget) and decoding it again.

    Raises:
        ValueError: If ``type_det`` is neither ``"bc"`` nor ``"mc"``.
    """
    if type_det == "bc":
        tokenizer = text_bc_tokenizer
        max_length = 333
    elif type_det == "mc":
        tokenizer = text_mc_tokenizer
        max_length = 256
    else:
        # Previously this fell through and failed later with an
        # UnboundLocalError on ``tokenizer``.
        raise ValueError(f"type_det must be 'bc' or 'mc', got {type_det!r}")

    token_budget = max_length + tolerance

    def _encode(sentence_group, truncate=True):
        """Encode the joined sentences, optionally truncating to the budget."""
        joined = " ".join(sentence_group)
        if truncate:
            return tokenizer.encode(
                joined,
                add_special_tokens=True,
                max_length=token_budget,
                truncation=True,
            )
        return tokenizer.encode(joined, add_special_tokens=True)

    # Pass 1: greedily pack whole sentences into segments.
    segments = []
    current_segment = []
    current_length = 0
    for sentence in nltk.sent_tokenize(text):
        sentence_length = len(tokenizer.tokenize(sentence))
        # The "- 2" presumably reserves room for the two special tokens that
        # encode() adds around the segment.
        if current_length + sentence_length <= token_budget - 2:
            current_segment.append(sentence)
            current_length += sentence_length
        else:
            if current_segment:
                segments.append((current_segment, len(_encode(current_segment))))
            current_segment = [sentence]
            current_length = sentence_length
    if current_segment:
        segments.append((current_segment, len(_encode(current_segment))))

    # Pass 2: fold a too-short trailing segment into its predecessor.
    final_segments = list(segments)
    if len(final_segments) > 1:
        last_seg, last_length = final_segments[-1]
        if last_length < min_last_segment_length:
            prev_seg, _ = final_segments[-2]
            combined = prev_seg + last_seg
            # Measure WITHOUT truncation: a truncated encoding can never
            # exceed the budget, which made this check always true and let an
            # oversized merge silently drop the tail of the text.
            combined_length = len(_encode(combined, truncate=False))
            if combined_length <= token_budget:
                final_segments[-2:] = [(combined, combined_length)]

    # NOTE(review): decode() is called without skip_special_tokens, so the
    # returned strings include the tokenizer's special-token text. Kept as-is
    # because downstream cleaning/prediction already runs on that format.
    return [tokenizer.decode(_encode(seg)) for seg, _ in final_segments]
def predict_quillbot(text):
    """Score ``text`` with the QuillBot detector.

    Returns:
        A dict with keys ``"QuillBot"`` and ``"Original"`` holding the softmax
        probabilities of class 1 and class 0 respectively.
    """
    quillbot_model.eval()
    with torch.no_grad():
        encoded = quillbot_tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=256,
            return_tensors="pt",
        ).to(device)
        logits = quillbot_model(**encoded).logits
    # Softmax over the class axis; [0] selects the single input row.
    probabilities = softmax(logits.detach().cpu().numpy(), 1)[0]
    return {
        "QuillBot": probabilities[1].item(),
        "Original": probabilities[0].item(),
    }
def predict_bc(model, tokenizer, text):
    """Return the binary-classifier class probabilities for ``text``.

    Args:
        model: Sequence-classification model to run (switched to eval mode).
        tokenizer: Tokenizer used to encode ``text``. The previous
            implementation ignored this argument and always used the global
            ``text_bc_tokenizer``.
        text: Text to classify; padded/truncated to 333 tokens.

    Returns:
        A 1-D numpy array with the softmax probabilities of the first (only)
        input row.
    """
    model.eval()
    with torch.no_grad():
        tokens = tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=333,
            return_tensors="pt",
        ).to(device)
        output = model(**tokens)
    # Softmax over the class axis; [0] selects the single input row.
    output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
    print("BC Score: ", output_norm)
    return output_norm
def predict_mc(model, tokenizer, text):
    """Return the multi-class classifier probabilities for ``text``.

    Args:
        model: Sequence-classification model to run (switched to eval mode).
        tokenizer: Tokenizer used to encode ``text``. The previous
            implementation ignored this argument and always used the global
            ``text_mc_tokenizer``.
        text: Text to classify; padded/truncated to 256 tokens.

    Returns:
        A 1-D numpy array with the softmax probabilities of the first (only)
        input row.
    """
    model.eval()
    with torch.no_grad():
        tokens = tokenizer(
            text,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
            max_length=256,
        ).to(device)
        output = model(**tokens)
    # Softmax over the class axis; [0] selects the single input row.
    output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
    print("MC Score: ", output_norm)
    return output_norm
def ai_generated_test(ai_option, input):
    """Run the human-vs-AI and source-model detectors on ``input``.

    The text is split into segments once per detector, each segment is cleaned
    with ``remove_special_characters`` and scored, and the per-segment
    probabilities are averaged.

    Args:
        ai_option: UI choice. ``"Human vs AI"`` suppresses the per-model
            breakdown.
        input: Text to analyse.

    Returns:
        ``(bc_score, mc_score)`` where ``bc_score`` is
        ``{"AI": p, "HUMAN": p}`` and ``mc_score`` maps upper-cased model
        labels to their probability scaled by ``1 - HUMAN``. ``mc_score`` is
        empty when ``ai_option`` is ``"Human vs AI"`` or when the AI
        probability mass is below 0.01.

    Raises:
        ValueError: If ``input`` yields no segments (e.g. empty text).
    """
    # Split once per detector. The mc segments previously reused the "bc"
    # segmentation while being indexed with the "mc" segment count, which
    # fed wrongly sized segments to the mc model and could raise IndexError.
    segments_bc = split_text_allow_complete_sentences_nltk(input, type_det="bc")
    segments_mc = split_text_allow_complete_sentences_nltk(input, type_det="mc")
    if not segments_bc or not segments_mc:
        # Averaging an empty score list would otherwise end in an opaque
        # TypeError further down.
        raise ValueError("Input text contains no sentences to analyse.")

    bc_scores = [
        predict_bc(
            text_bc_model, text_bc_tokenizer, remove_special_characters(segment)
        )
        for segment in segments_bc
    ]
    mc_scores = [
        predict_mc(
            text_mc_model, text_mc_tokenizer, remove_special_characters(segment)
        )
        for segment in segments_mc
    ]

    # Average the per-segment probability vectors.
    bc_score_list = np.mean(np.array(bc_scores), axis=0).tolist()
    mc_score_list = np.mean(np.array(mc_scores), axis=0).tolist()

    bc_score = {"AI": bc_score_list[1], "HUMAN": bc_score_list[0]}

    # Scale the per-model probabilities so they sum to the overall AI share.
    sum_prob = 1 - bc_score["HUMAN"]
    label_map = ["OpenAI GPT", "Mistral", "CLAUDE", "Gemini", "LLAMA 2"]
    mc_score = {
        label.upper(): score * sum_prob
        for score, label in zip(mc_score_list, label_map)
    }

    if ai_option == "Human vs AI" or sum_prob < 0.01:
        mc_score = {}
    return bc_score, mc_score
# COMBINED
def main(
    ai_option,
    plag_option,
    input,
    year_from,
    month_from,
    day_from,
    year_to,
    month_to,
    day_to,
    domains_to_skip,
):
    """Run every check on ``input`` and return all results for the UI.

    Runs, in order: the plagiarism/source check, the writing (depth)
    analysis, the AI detectors and the QuillBot detector.

    Returns:
        A 5-tuple ``(bc_score, mc_score, formatted_tokens,
        depth_analysis_plot, quilscore)``.
    """
    date_range = (year_from, month_from, day_from, year_to, month_to, day_to)
    source_matches = plagiarism_check(
        plag_option, input, *date_range, domains_to_skip
    )
    radar_figure = depth_analysis(input)
    human_vs_ai, model_breakdown = ai_generated_test(ai_option, input)
    humanized = predict_quillbot(input)
    return human_vs_ai, model_breakdown, source_matches, radar_figure, humanized
def build_date(year, month, day):
    """Concatenate ``year``, the number of ``month`` and ``day`` into a string.

    ``month`` is a key of the module-level ``months`` mapping (a month name);
    an unknown name raises ``KeyError``. ``year`` and ``day`` are inserted
    as given, without padding.
    """
    return f"{year}{months[month]}{day}"
def len_validator(text):
    """Report whether ``text`` has enough tokens for a reliable check.

    The length is measured in ``text_bc_tokenizer`` tokens against a minimum
    of 200.

    Returns:
        A warning message when the text is too short, otherwise a
        confirmation message. Both include the measured token count.
    """
    min_tokens = 200
    # tokenize() returns a list of token strings; the former
    # ``return_tensors="pt"`` argument had no bearing on that count.
    lengt = len(text_bc_tokenizer.tokenize(text=text))
    if lengt < min_tokens:
        return f"Warning! Input length is {lengt}. Please input a text that is greater than {min_tokens} tokens long. Recommended length {min_tokens*2} tokens."
    # Typo fixed in the user-facing message ("satisified" -> "satisfied").
    return f"Input length ({lengt}) is satisfied."
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of the PDF at ``pdf_path``.

    The document is opened with PyMuPDF (``fitz``) and closed again before
    returning; previously the handle was left open.
    """
    with fitz.open(pdf_path) as doc:
        # join() avoids rebuilding the string once per page.
        return "".join(page.get_text() for page in doc)
# DEPTH ANALYSIS
print("loading depth analysis")

# NLTK resources fetched at import time.
for _nltk_resource in ("stopwords", "punkt"):
    nltk.download(_nltk_resource)

# Download the small English spaCy pipeline via the spaCy CLI, then load it.
# The subprocess exit status is not checked; a failed download surfaces in
# spacy.load below.
command = ["python3", "-m", "spacy", "download", "en_core_web_sm"]
subprocess.run(command)
nlp = spacy.load("en_core_web_sm")

# GPT-2 model and tokenizer, passed to calculate_perplexity by depth_analysis.
model_id = "gpt2"
gpt2_model = GPT2LMHeadModel.from_pretrained(model_id)
gpt2_model = gpt2_model.to(device)
gpt2_tokenizer = GPT2TokenizerFast.from_pretrained(model_id)
def depth_analysis(input_text):
    """Compute writing-style features for ``input_text`` and plot them.

    Six features are computed with the ``writing_analysis`` helpers:
    readability (Gunning fog), syntactic tree depth, vocabulary richness
    (TTR), perplexity, average sentence length and average word length.
    All but the TTR value are passed through ``normalize`` with a fixed
    ``min_value``/``max_value`` range. The feature dict is printed and drawn
    as a filled radar (polar) chart.

    Returns:
        The plotly ``Figure`` holding the radar chart.
    """
    # Vocabulary richness (type-token ratio); used as returned.
    ttr = vocabulary_richness_ttr(preprocess_text1(input_text))

    # Readability.
    fog_score = normalize(
        calculate_gunning_fog(input_text), min_value=0, max_value=20
    )

    # Average sentence and word length.
    words, sentences = preprocess_text2(input_text)
    mean_sentence_len = calculate_average_sentence_length(sentences)
    mean_word_len = calculate_average_word_length(words)
    sentence_len_score = normalize(mean_sentence_len, min_value=0, max_value=40)
    word_len_score = normalize(mean_word_len, min_value=0, max_value=8)

    # Syntactic tree depth.
    tree_depth_score = normalize(
        calculate_syntactic_tree_depth(nlp, input_text),
        min_value=0,
        max_value=10,
    )

    # Perplexity under GPT-2.
    perplexity_score = normalize(
        calculate_perplexity(input_text, gpt2_model, gpt2_tokenizer, device),
        min_value=0,
        max_value=30,
    )

    # Insertion order fixes the order of the axes around the chart.
    features = {
        "readability": fog_score,
        "syntactic tree depth": tree_depth_score,
        "vocabulary richness": ttr,
        "perplexity": perplexity_score,
        "average sentence length": sentence_len_score,
        "average word length": word_len_score,
    }
    print(features)

    radar_trace = go.Scatterpolar(
        r=list(features.values()),
        theta=list(features.keys()),
        fill="toself",
        name="Radar Plot",
    )
    fig = go.Figure()
    fig.add_trace(radar_trace)
    # The radial axis is fixed to 0-100; normalize() presumably maps onto
    # that scale - confirm in writing_analysis.
    fig.update_layout(
        polar={"radialaxis": {"visible": True, "range": [0, 100]}},
        showlegend=False,
        margin={"l": 10, "r": 20, "b": 10, "t": 10},
    )
    return fig
# START OF GRADIO

# Application title string. The heading shown on the page is set separately
# through gr.Markdown in the UI block below.
title = "Copyright Checker"

# English month name -> zero-padded two-digit month number. build_date()
# looks months up here, and the keys are the options of the month dropdowns.
months = {
    "January": "01",
    "February": "02",
    "March": "03",
    "April": "04",
    "May": "05",
    "June": "06",
    "July": "07",
    "August": "08",
    "September": "09",
    "October": "10",
    "November": "11",
    "December": "12",
}
# Gradio UI: input area, check buttons, result widgets and their wiring.
with gr.Blocks() as demo:
    today = date.today()
    # Split today's date into [day, month name, year]; used below to pre-fill
    # the "To" date fields.
    d1 = today.strftime("%d/%B/%Y")
    d1 = d1.split("/")

    # model_list is only referenced by the commented-out "models" dropdown.
    model_list = ["OpenAI GPT", "Mistral", "CLAUDE", "Gemini", "LLAMA2"]
    domain_list = ["com", "org", "net", "int", "edu", "gov", "mil"]

    gr.Markdown(
        """
        # Copyright Checker
        """
    )

    # --- Input: free text, or a PDF whose extracted text fills the textbox.
    with gr.Row():
        input_text = gr.Textbox(label="Input text", lines=6, placeholder="")
        file_input = gr.File(label="Upload PDF")
        file_input.change(
            fn=extract_text_from_pdf, inputs=file_input, outputs=input_text
        )

    # Live token-length feedback, refreshed on every change of the text.
    # (Label typo "Minumum" corrected.)
    char_count = gr.Textbox(label="Minimum Character Limit Check")
    input_text.change(fn=len_validator, inputs=input_text, outputs=char_count)

    # --- Options for the AI check and the source (plagiarism) check.
    with gr.Row():
        with gr.Column():
            ai_option = gr.Radio(
                ["Human vs AI", "Human vs AI Source Models"],
                label="Choose an option please.",
            )
        with gr.Column():
            plag_option = gr.Radio(
                ["Standard", "Advanced"], label="Choose an option please."
            )

    # --- Action buttons.
    with gr.Row():
        with gr.Column():
            only_ai_btn = gr.Button("AI Check")
        with gr.Column():
            only_plagiarism_btn = gr.Button("Source Check")
    with gr.Row():
        quillbot_check = gr.Button("Humanized Text Check (Quillbot)")
    with gr.Row():
        depth_analysis_btn = gr.Button("Detailed Writing Analysis")
    with gr.Row():
        full_check_btn = gr.Button("Full Check")

    gr.Markdown(
        """
        ## Output
        """
    )

    # models = gr.Dropdown(
    #     model_list,
    #     value=model_list,
    #     multiselect=True,
    #     label="Models to test against",
    # )

    # --- Classifier outputs.
    with gr.Row():
        with gr.Column():
            bcLabel = gr.Label(label="Source")
        with gr.Column():
            mcLabel = gr.Label(label="Creator")
    with gr.Row():
        QLabel = gr.Label(label="Humanized")

    # --- Date range and domain filter forwarded to plagiarism_check.
    with gr.Group():
        with gr.Row():
            month_from = gr.Dropdown(
                # Pass the month names as a list; the dict itself was passed
                # before and only worked because iterating it yields its keys.
                choices=list(months),
                label="From Month",
                value="January",
                interactive=True,
            )
            day_from = gr.Textbox(label="From Day", value="01")
            year_from = gr.Textbox(label="From Year", value="2000")
            # from_date_button = gr.Button("Submit")
        with gr.Row():
            month_to = gr.Dropdown(
                choices=list(months),
                label="To Month",
                value=d1[1],
                interactive=True,
            )
            day_to = gr.Textbox(label="To Day", value=d1[0])
            year_to = gr.Textbox(label="To Year", value=d1[2])
            # to_date_button = gr.Button("Submit")
        with gr.Row():
            domains_to_skip = gr.Dropdown(
                domain_list,
                multiselect=True,
                label="Domain To Skip",
            )

    # --- Source-check and writing-analysis outputs.
    with gr.Row():
        with gr.Column():
            sentenceBreakdown = gr.HighlightedText(
                label="Source Detection Sentence Breakdown",
                combine_adjacent=True,
                color_map={
                    "[1]": "red",
                    "[2]": "orange",
                    "[3]": "yellow",
                    "[4]": "green",
                },
            )
    with gr.Row():
        with gr.Column():
            writing_analysis_plot = gr.Plot(label="Writing Analysis Plot")

    # --- Event wiring. The order of each outputs list matches the order of
    # the values returned by the bound function.
    full_check_btn.click(
        fn=main,
        inputs=[
            ai_option,
            plag_option,
            input_text,
            # models,
            year_from,
            month_from,
            day_from,
            year_to,
            month_to,
            day_to,
            domains_to_skip,
        ],
        outputs=[
            bcLabel,
            mcLabel,
            sentenceBreakdown,
            writing_analysis_plot,
            QLabel,
        ],
        api_name="main",
    )

    only_ai_btn.click(
        fn=ai_generated_test,
        inputs=[ai_option, input_text],
        outputs=[
            bcLabel,
            mcLabel,
        ],
        api_name="ai_check",
    )

    quillbot_check.click(
        fn=predict_quillbot,
        inputs=[input_text],
        outputs=[QLabel],
        api_name="quillbot_check",
    )

    only_plagiarism_btn.click(
        fn=plagiarism_check,
        inputs=[
            plag_option,
            input_text,
            year_from,
            month_from,
            day_from,
            year_to,
            month_to,
            day_to,
            domains_to_skip,
        ],
        outputs=[
            sentenceBreakdown,
        ],
        api_name="plagiarism_check",
    )

    depth_analysis_btn.click(
        fn=depth_analysis,
        inputs=[input_text],
        outputs=[writing_analysis_plot],
        api_name="depth_analysis",
    )

    # Not read anywhere in the visible code; kept so the module-level names
    # still exist.
    date_from = ""
    date_to = ""
# Start the app: listen on all interfaces, request a public share link and
# protect the UI with a single username/password pair.
# NOTE(review): the credentials are hard-coded in source while share=True
# publishes a public URL - consider loading them from the environment or a
# secret store instead.
demo.launch(
    share=True, server_name="0.0.0.0", auth=("polygraf-admin", "test@aisd")
)