aliasgerovs committed
Merge branch 'demo'
Changed files:
- .gitignore +1 -1
- app.py +14 -10
- highlighter.py +1 -1
- isotonic_regression_model.joblib +0 -0
- plagiarism.py +2 -0
- predictors.py +66 -5
- requirements.txt +3 -0
- utils.py +20 -5
.gitignore
CHANGED
@@ -1,3 +1,3 @@
 __pycache__/
-
+venv/
 copy_check/
app.py
CHANGED
@@ -1,7 +1,8 @@
 import gradio as gr
 import numpy as np
 from datetime import date
-from predictors import predict_bc_scores, predict_mc_scores
+from predictors import predict_bc_scores, predict_mc_scores
+from predictors import update, correct_text, split_text
 from analysis import depth_analysis
 from predictors import predict_quillbot
 from plagiarism import plagiarism_check, build_date, html_highlight
@@ -29,7 +30,7 @@ def ai_generated_test(option, input, models):
     if option == "Human vs AI":
         return predict_bc_scores(input), None
     elif option == "Human vs AI Source Models":
-        return predict_bc_scores(input),
+        return predict_bc_scores(input), predict_mc_scores(input, models)
     return None, None
@@ -74,7 +75,7 @@ def main(
     )
     depth_analysis_plot = depth_analysis(input)
     bc_score = predict_bc_scores(input)
-    mc_score =
+    mc_score = predict_mc_scores(input, models)
     quilscore = predict_quillbot(input)

     return (
@@ -88,7 +89,7 @@

 # START OF GRADIO

-title = "
+title = "AI Detection and Source Analysis"
 months = {
     "January": "01",
     "February": "02",
@@ -114,7 +115,7 @@ with gr.Blocks() as demo:
     domain_list = ["com", "org", "net", "int", "edu", "gov", "mil"]
     gr.Markdown(
         """
-        #
+        # AI Detection and Source Analysis
         """
     )
     with gr.Row():
@@ -127,6 +128,12 @@
         char_count = gr.Textbox(label="Minumum Character Limit Check")
         input_text.change(fn=len_validator, inputs=input_text, outputs=char_count)

+    with gr.Row():
+        btn = gr.Button("Bias Buster")
+        out = gr.Textbox(label="Bias Corrected Full Input", interactive=False)
+        corrections_output = gr.Textbox(label="Bias Corrections", interactive=False)
+        btn.click(fn=update, inputs=input_text, outputs=[out, corrections_output])
+
     with gr.Row():
         models = gr.Dropdown(
             model_list,
@@ -382,8 +389,5 @@
     date_from = ""
     date_to = ""

-
-
-demo.launch(
-    share=True, server_name="0.0.0.0", auth=("polygraf-admin", "test@aisd")
-)
+if __name__ == "__main__":
+    demo.launch(share=True, server_name="0.0.0.0", server_port=80, auth=("polygraf-admin", "test@aisd"))
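Note (not part of the commit): the new "Bias Buster" row reduces to the following self-contained Gradio wiring. update() is stubbed here; the real one in predictors.py runs the bias checker/corrector models.

import gradio as gr

def update(text: str):
    # stub for predictors.update(): the real function cleans the text and
    # runs the bias checker/corrector pipelines, returning (corrected, log)
    return text, "no corrections (stub)"

with gr.Blocks() as demo:
    input_text = gr.Textbox(label="Input text")
    with gr.Row():
        btn = gr.Button("Bias Buster")
        out = gr.Textbox(label="Bias Corrected Full Input", interactive=False)
        corrections_output = gr.Textbox(label="Bias Corrections", interactive=False)
    btn.click(fn=update, inputs=input_text, outputs=[out, corrections_output])

if __name__ == "__main__":
    demo.launch()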
highlighter.py
CHANGED
@@ -14,7 +14,7 @@ def explainer(text, model_type):
     sentences = [sent for sent in sent_tokenize(text)]
     num_sentences = len(sentences)
     exp = explainer_.explain_instance(
-        text, predictor_wrapper, num_features=num_sentences, num_samples=
+        text, predictor_wrapper, num_features=num_sentences, num_samples=2000
     )
     weights_mapping = exp.as_map()[1]
     sentences_weights = {sentence: 0 for sentence in sentences}
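Note (not part of the commit): this hunk pins LIME's previously truncated num_samples to 2000 — the number of perturbed copies of the text LIME classifies to fit its local surrogate; fewer samples is faster but noisier. A minimal sketch with a dummy classifier standing in for predictor_wrapper:

import numpy as np
from lime.lime_text import LimeTextExplainer

def classifier_fn(texts):
    # stand-in for predictor_wrapper: one [P(HUMAN), P(AI)] row per input text
    return np.tile([0.3, 0.7], (len(texts), 1))

explainer_ = LimeTextExplainer(class_names=["HUMAN", "AI"])
exp = explainer_.explain_instance(
    "One sentence. Another sentence.",
    classifier_fn,
    num_features=2,
    num_samples=2000,  # the value this commit fills in
)
print(exp.as_map()[1])  # (feature_id, weight) pairs for class 1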
isotonic_regression_model.joblib
CHANGED
Binary files a/isotonic_regression_model.joblib and b/isotonic_regression_model.joblib differ
plagiarism.py
CHANGED
@@ -224,6 +224,8 @@ def plagiarism_check(
     domains_to_skip,
     source_block_size,
 ):
+    api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
+    api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
     # api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
     # api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
     # api_key = "AIzaSyCB61O70B8AC3l5Kk3KMoLb6DN37B7nqIk"
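Note (not part of the commit): this hunk checks two plaintext Google API keys into the repo, with the second assignment immediately overwriting the first; keys committed to a public Space should be treated as leaked and rotated. A sketch of the usual alternative, reading a hypothetical GOOGLE_API_KEY variable from the environment (python-dotenv is already in requirements.txt):

import os
from dotenv import load_dotenv  # provided by python-dotenv

load_dotenv()  # pull variables from a local .env file into os.environ
api_key = os.getenv("GOOGLE_API_KEY")  # hypothetical variable name
if not api_key:
    raise RuntimeError("GOOGLE_API_KEY is not set")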
predictors.py
CHANGED
@@ -8,12 +8,23 @@ from scipy.special import softmax
 import yaml
 from utils import *
 import joblib
+from optimum.bettertransformer import BetterTransformer
+import gc
+from cleantext import clean
+import gradio as gr
+from tqdm.auto import tqdm
+from transformers import pipeline
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+import nltk
+from nltk.tokenize import sent_tokenize
+from optimum.pipelines import pipeline

 with open("config.yaml", "r") as file:
     params = yaml.safe_load(file)
 nltk.download("punkt")
 nltk.download("stopwords")
-
+device_needed = "cuda" if torch.cuda.is_available() else "cpu"
+device = 'cpu'
 text_bc_model_path = params["TEXT_BC_MODEL_PATH"]
 text_mc_model_path = params["TEXT_MC_MODEL_PATH"]
 text_quillbot_model_path = params["TEXT_QUILLBOT_MODEL_PATH"]
@@ -23,6 +34,8 @@ mc_label_map = params["MC_OUTPUT_LABELS"]
 text_1on1_label_map = params["1ON1_OUTPUT_LABELS"]
 mc_token_size = int(params["MC_TOKEN_SIZE"])
 bc_token_size = int(params["BC_TOKEN_SIZE"])
+bias_checker_model_name = params['BIAS_CHECKER_MODEL_PATH']
+bias_corrector_model_name = params['BIAS_CORRECTOR_MODEL_PATH']
 text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)
 text_bc_model = AutoModelForSequenceClassification.from_pretrained(
     text_bc_model_path
@@ -43,24 +56,71 @@ for model_name, model in zip(mc_label_map, text_1on1_models):
     AutoModelForSequenceClassification.from_pretrained(model).to(device)
 )

+
+
 # proxy models for explainability
 mini_bc_model_name = "polygraf-ai/bc-model-bert-mini"
 bc_tokenizer_mini = AutoTokenizer.from_pretrained(mini_bc_model_name)
 bc_model_mini = AutoModelForSequenceClassification.from_pretrained(
     mini_bc_model_name
-).to(
+).to(device_needed)
 mini_humanizer_model_name = "polygraf-ai/quillbot-detector-bert-mini-9K"
 humanizer_tokenizer_mini = AutoTokenizer.from_pretrained(
     mini_humanizer_model_name
 )
 humanizer_model_mini = AutoModelForSequenceClassification.from_pretrained(
     mini_humanizer_model_name
-).to(
+).to(device_needed)
+
+bc_model_mini = BetterTransformer.transform(bc_model_mini)
+humanizer_model_mini = BetterTransformer.transform(humanizer_model_mini)
+text_bc_model = BetterTransformer.transform(text_bc_model)
+text_mc_model = BetterTransformer.transform(text_mc_model)
+quillbot_model = BetterTransformer.transform(quillbot_model)
+
+bias_model_checker = AutoModelForSequenceClassification.from_pretrained(bias_checker_model_name)
+tokenizer = AutoTokenizer.from_pretrained(bias_checker_model_name)
+bias_model_checker = BetterTransformer.transform(bias_model_checker, keep_original_model=False)
+bias_checker = pipeline(
+    "text-classification",
+    model=bias_checker_model_name,
+    tokenizer=bias_checker_model_name,
+)
+gc.collect()
+bias_corrector = pipeline("text2text-generation", model=bias_corrector_model_name, accelerator="ort")

 # model score calibration
 iso_reg = joblib.load("isotonic_regression_model.joblib")


+def split_text(text: str) -> list:
+    sentences = sent_tokenize(text)
+    return [[sentence] for sentence in sentences]
+
+def correct_text(text: str, bias_checker, bias_corrector, separator: str = " ") -> tuple:
+    sentence_batches = split_text(text)
+    corrected_text = []
+    corrections = []
+    for batch in tqdm(sentence_batches, total=len(sentence_batches), desc="correcting text.."):
+        raw_text = " ".join(batch)
+        results = bias_checker(raw_text)
+        if results[0]["label"] != "LABEL_1" or (results[0]["label"] == "LABEL_1" and results[0]["score"] < 0.9):
+            corrected_batch = bias_corrector(raw_text)
+            corrected_version = corrected_batch[0]["generated_text"]
+            corrected_text.append(corrected_version)
+            corrections.append((raw_text, corrected_version))
+        else:
+            corrected_text.append(raw_text)
+    corrected_text = separator.join(corrected_text)
+    return corrected_text, corrections
+
+def update(text: str):
+    text = clean(text, lower=False)
+    corrected_text, corrections = correct_text(text, bias_checker, bias_corrector)
+    corrections_display = "\n\n".join([f"Original: {orig}\nCorrected: {corr}" for orig, corr in corrections])
+    return corrected_text, corrections_display
+
+
 def split_text_allow_complete_sentences_nltk(
     text,
     max_length=256,
@@ -181,7 +241,7 @@ def predict_for_explainanility(text, model_type=None):
         padding="max_length",
         truncation=True,
         max_length=max_length,
-    ).to(
+    ).to(device_needed)
     outputs = model(**tokenized_text)
     tensor_logits = outputs[0]
     probas = F.softmax(tensor_logits).detach().cpu().numpy()
@@ -279,6 +339,7 @@ def predict_bc_scores(input):
     human_score = 1 - ai_score
     bc_score = {"AI": ai_score, "HUMAN": human_score}
     print(f"Calibration BC scores: AI: {ai_score}, HUMAN: {human_score}")
+    print(f"Input Text: {cleaned_text_bc}")
     return bc_score


@@ -313,7 +374,7 @@ def predict_1on1_single(input, model):
     return predictions


-def
+def predict_mc_scores(input, models):

     if len(models) == 0:
         return {}
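Note (not part of the commit): the new bias path is split_text() → per-sentence bias_checker → bias_corrector for any sentence not confidently labeled LABEL_1 (score ≥ 0.9); which polarity LABEL_1 encodes depends on the checker model's config. A usage sketch with stubbed pipelines — note that importing predictors also loads the real models at module import time:

from predictors import correct_text  # the function added in this commit

def fake_checker(text):
    # mimics a transformers text-classification pipeline result
    return [{"label": "LABEL_0", "score": 0.99}]

def fake_corrector(text):
    # mimics a text2text-generation pipeline result
    return [{"generated_text": text.replace("obviously ", "")}]

corrected, corrections = correct_text(
    "This is obviously the best tool. It has two modes.",
    fake_checker,
    fake_corrector,
)
print(corrected)             # bias-corrected full input
for orig, corr in corrections:
    print(orig, "->", corr)  # sentence-level audit trail shown in the UI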
requirements.txt
CHANGED
@@ -26,6 +26,9 @@ Unidecode
 python-dotenv
 lime
 joblib
+optimum
+clean-text
+optimum[onnxruntime]
 emoji==1.6.1
 matplotlib
 seaborn
utils.py
CHANGED
@@ -14,13 +14,28 @@ def remove_accents(input_str):
 
 
 def remove_special_characters(text):
-    text =
-
-
-
+    text = re.sub(r'https?://\S+|www\.\S+', '', text)
+    emoji_pattern = re.compile("["
+        u"\U0001F600-\U0001F64F"  # emoticons
+        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
+        u"\U0001F680-\U0001F6FF"  # transport & map symbols
+        u"\U0001F700-\U0001F77F"  # alchemical symbols
+        u"\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
+        u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
+        u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
+        u"\U0001FA00-\U0001FA6F"  # Chess Symbols
+        u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
+        u"\U00002702-\U000027B0"  # Dingbats
+        u"\U000024C2-\U0001F251"
+        "]+", flags=re.UNICODE)
+    text = emoji_pattern.sub('', text)
+    text = re.sub(r'#\w+', '', text)
+    text = re.sub(r'[^\w\s\d.,!?\'"()-;]', '', text)
+    text = re.sub(r'\s+([.,!?;])', r'\1', text)
+    text = re.sub(r'([.,!?;])(\S)', r'\1 \2', text)
+    text = re.sub(r'\s+', ' ', text).strip()
     return text
 
-
 def remove_special_characters_2(text):
     pattern = r"[^a-zA-Z0-9 ]+"
     text = re.sub(pattern, "", text)
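Note (not part of the commit): a quick check of what the rewritten remove_special_characters() now does — strip URLs, emoji, and hashtags, then normalize punctuation spacing. Expected output assumes the regexes as reconstructed above:

from utils import remove_special_characters

sample = "Check https://example.com 🚀 #launch , right?"
print(remove_special_characters(sample))
# -> "Check, right?"  (URL, emoji, and hashtag removed; space before comma fixed)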