Spaces:
Sleeping
Sleeping
aliasgerovs
committed on
Merge branch 'main' into demo
Browse files- app.py +26 -10
- requirements.txt +2 -1
app.py
CHANGED
@@ -18,6 +18,8 @@ from transformers import GPT2LMHeadModel, GPT2TokenizerFast
|
|
18 |
import nltk, spacy, subprocess, torch
|
19 |
import plotly.graph_objects as go
|
20 |
import nltk
|
|
|
|
|
21 |
|
22 |
nltk.download('punkt')
|
23 |
tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-uncased')
|
@@ -180,17 +182,30 @@ AI DETECTION SECTION
|
|
180 |
"""
|
181 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
182 |
|
183 |
-
text_bc_model_path = "polygraf-ai/v3-bert-3-2m-trun-bc"
|
184 |
text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)
|
185 |
text_bc_model = AutoModelForSequenceClassification.from_pretrained(text_bc_model_path).to(device)
|
186 |
|
187 |
-
text_mc_model_path = "polygraf-ai/text-detect-mc-bert-base-uncased-v1-bert-429k"
|
188 |
text_mc_tokenizer = AutoTokenizer.from_pretrained(text_mc_model_path)
|
189 |
text_mc_model = AutoModelForSequenceClassification.from_pretrained(text_mc_model_path).to(device)
|
190 |
|
|
|
|
|
|
|
|
|
|
|
|
|
191 |
def remove_special_characters(text):
|
192 |
-
|
193 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
194 |
|
195 |
def update_character_count(text):
|
196 |
return f"{len(text)} characters"
|
@@ -256,7 +271,7 @@ def predict_bc(model, tokenizer, text):
|
|
256 |
|
257 |
def predict_mc(model, tokenizer, text):
|
258 |
tokens = tokenizer(
|
259 |
-
text, padding='max_length', truncation=True, return_tensors="pt", max_length=
|
260 |
).to(device)["input_ids"]
|
261 |
output = model(tokens)
|
262 |
output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
|
@@ -271,9 +286,10 @@ def ai_generated_test(ai_option, input):
|
|
271 |
segments = split_text_allow_complete_sentences_nltk(input)
|
272 |
|
273 |
for i in range(samples_len):
|
274 |
-
|
275 |
-
|
276 |
-
|
|
|
277 |
bc_scores.append(bc_score)
|
278 |
mc_scores.append(mc_score)
|
279 |
|
@@ -633,5 +649,5 @@ with gr.Blocks() as demo:
|
|
633 |
|
634 |
date_from = ""
|
635 |
date_to = ""
|
636 |
-
|
637 |
-
demo.launch(share=True, server_name="0.0.0.0",
|
|
|
18 |
import nltk, spacy, subprocess, torch
|
19 |
import plotly.graph_objects as go
|
20 |
import nltk
|
21 |
+
from unidecode import unidecode
|
22 |
+
|
23 |
|
24 |
nltk.download('punkt')
|
25 |
tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-uncased')
|
|
|
182 |
"""
|
183 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
184 |
|
185 |
+
text_bc_model_path = "polygraf-ai/v3-bert-3-2m-trun-bc-lighter-spec"
|
186 |
text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)
|
187 |
text_bc_model = AutoModelForSequenceClassification.from_pretrained(text_bc_model_path).to(device)
|
188 |
|
189 |
+
text_mc_model_path = "polygraf-ai/text-detect-mc-bert-base-uncased-v1-bert-429k-256"
|
190 |
text_mc_tokenizer = AutoTokenizer.from_pretrained(text_mc_model_path)
|
191 |
text_mc_model = AutoModelForSequenceClassification.from_pretrained(text_mc_model_path).to(device)
|
192 |
|
193 |
+
def remove_accents(input_str):
    """Return ``input_str`` after passing it through ``unidecode``.

    Args:
        input_str: Text to convert.

    Returns:
        Whatever ``unidecode`` produces for the input (per the Unidecode
        package, an ASCII transliteration of the text).
    """
    return unidecode(input_str)
|
198 |
+
|
199 |
def remove_special_characters(text):
    """Strip accents, then delete every character outside the allowed set.

    The allowed set is: word characters (letters, digits, underscore),
    whitespace, and the punctuation ``. , ! ? ' " ( ) * + - / : ;``.

    Args:
        text: Input string.

    Returns:
        The string with accents removed via ``remove_accents`` and every run
        of disallowed characters deleted.
    """
    text = remove_accents(text)
    # The previous pattern ended its character class with `()-;`, which the
    # regex engine reads as "(" followed by the range ")" through ";". That
    # accidental range is what admitted * + , - . / : ; and the digits. The
    # class below spells those characters out (and drops `\d`, which `\w`
    # already covers), so the accepted set is unchanged but explicit.
    # NOTE(review): if * + / : were never meant to survive, delete them from
    # the class - but first confirm against the preprocessing the BC model
    # was trained with, since changing it alters the model's input.
    pattern = r"[^\w\s.,!?'\"()*+\-/:;]+"
    return re.sub(pattern, '', text)
|
204 |
+
|
205 |
+
def remove_special_characters_2(text):
    """Delete every character that is not an ASCII letter, digit or space.

    Args:
        text: Input string.

    Returns:
        ``text`` with all other characters (punctuation, newlines, tabs,
        non-ASCII letters) removed.
    """
    return re.sub(r'[^a-zA-Z0-9 ]+', '', text)
|
209 |
|
210 |
def update_character_count(text):
    """Return the length of ``text`` formatted as ``"<n> characters"``."""
    return f"{len(text)} characters"
|
|
|
271 |
|
272 |
def predict_mc(model, tokenizer, text):
|
273 |
tokens = tokenizer(
|
274 |
+
text, padding='max_length', truncation=True, return_tensors="pt", max_length=256
|
275 |
).to(device)["input_ids"]
|
276 |
output = model(tokens)
|
277 |
output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
|
|
|
286 |
segments = split_text_allow_complete_sentences_nltk(input)
|
287 |
|
288 |
for i in range(samples_len):
|
289 |
+
cleaned_text_bc = remove_special_characters(segments[i])
|
290 |
+
cleaned_text_mc = remove_special_characters_2(segments[i])
|
291 |
+
bc_score = predict_bc(text_bc_model, text_bc_tokenizer,cleaned_text_bc )
|
292 |
+
mc_score = predict_mc(text_mc_model, text_mc_tokenizer, cleaned_text_mc)
|
293 |
bc_scores.append(bc_score)
|
294 |
mc_scores.append(mc_score)
|
295 |
|
|
|
649 |
|
650 |
date_from = ""
|
651 |
date_to = ""
|
652 |
+
|
653 |
+
import os

# Login credentials are read from the environment (e.g. Space secrets) instead
# of being committed in source, where anyone who can read the repository can
# read them. A missing variable raises KeyError at startup, so the app never
# comes up with a default or empty login.
# NOTE(review): the username/password previously hard-coded on this line are
# in the commit history and should be rotated.
demo.launch(
    share=True,
    server_name="0.0.0.0",
    auth=(os.environ["DEMO_AUTH_USERNAME"], os.environ["DEMO_AUTH_PASSWORD"]),
)
|
requirements.txt
CHANGED
@@ -21,4 +21,5 @@ textstat
|
|
21 |
plotly
|
22 |
tqdm
|
23 |
pymupdf
|
24 |
-
sentence-transformers
|
|
|
|
21 |
plotly
|
22 |
tqdm
|
23 |
pymupdf
|
24 |
+
sentence-transformers
|
25 |
+
Unidecode
|