aliasgerovs committed on
Commit
5da39de
·
2 Parent(s): d2b718a a1420b8

Merge branch 'main' into demo

Browse files
Files changed (2) hide show
  1. app.py +26 -10
  2. requirements.txt +2 -1
app.py CHANGED
@@ -18,6 +18,8 @@ from transformers import GPT2LMHeadModel, GPT2TokenizerFast
18
  import nltk, spacy, subprocess, torch
19
  import plotly.graph_objects as go
20
  import nltk
 
 
21
 
22
  nltk.download('punkt')
23
  tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-uncased')
@@ -180,17 +182,30 @@ AI DETECTION SECTION
180
  """
181
  device = "cuda" if torch.cuda.is_available() else "cpu"
182
 
183
- text_bc_model_path = "polygraf-ai/v3-bert-3-2m-trun-bc"
184
  text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)
185
  text_bc_model = AutoModelForSequenceClassification.from_pretrained(text_bc_model_path).to(device)
186
 
187
- text_mc_model_path = "polygraf-ai/text-detect-mc-bert-base-uncased-v1-bert-429k"
188
  text_mc_tokenizer = AutoTokenizer.from_pretrained(text_mc_model_path)
189
  text_mc_model = AutoModelForSequenceClassification.from_pretrained(text_mc_model_path).to(device)
190
 
 
 
 
 
 
 
191
  def remove_special_characters(text):
192
- cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
193
- return cleaned_text
 
 
 
 
 
 
 
194
 
195
  def update_character_count(text):
196
  return f"{len(text)} characters"
@@ -256,7 +271,7 @@ def predict_bc(model, tokenizer, text):
256
 
257
  def predict_mc(model, tokenizer, text):
258
  tokens = tokenizer(
259
- text, padding='max_length', truncation=True, return_tensors="pt", max_length=512
260
  ).to(device)["input_ids"]
261
  output = model(tokens)
262
  output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
@@ -271,9 +286,10 @@ def ai_generated_test(ai_option, input):
271
  segments = split_text_allow_complete_sentences_nltk(input)
272
 
273
  for i in range(samples_len):
274
- cleaned_text = remove_special_characters(segments[i])
275
- bc_score = predict_bc(text_bc_model, text_bc_tokenizer,cleaned_text )
276
- mc_score = predict_mc(text_mc_model, text_mc_tokenizer, cleaned_text)
 
277
  bc_scores.append(bc_score)
278
  mc_scores.append(mc_score)
279
 
@@ -633,5 +649,5 @@ with gr.Blocks() as demo:
633
 
634
  date_from = ""
635
  date_to = ""
636
-
637
- demo.launch(share=True, server_name="0.0.0.0", server_port = 80, auth=("polygraf-admin", "test@aisd"))
 
18
  import nltk, spacy, subprocess, torch
19
  import plotly.graph_objects as go
20
  import nltk
21
+ from unidecode import unidecode
22
+
23
 
24
  nltk.download('punkt')
25
  tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-uncased')
 
182
  """
183
  device = "cuda" if torch.cuda.is_available() else "cpu"
184
 
185
+ text_bc_model_path = "polygraf-ai/v3-bert-3-2m-trun-bc-lighter-spec"
186
  text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)
187
  text_bc_model = AutoModelForSequenceClassification.from_pretrained(text_bc_model_path).to(device)
188
 
189
+ text_mc_model_path = "polygraf-ai/text-detect-mc-bert-base-uncased-v1-bert-429k-256"
190
  text_mc_tokenizer = AutoTokenizer.from_pretrained(text_mc_model_path)
191
  text_mc_model = AutoModelForSequenceClassification.from_pretrained(text_mc_model_path).to(device)
192
 
193
def remove_accents(input_str):
    """Return *input_str* after passing it through ``unidecode``.

    ``unidecode`` produces an ASCII transliteration of the text, so
    accented letters come back as their closest plain-ASCII spelling.

    Args:
        input_str: The text to transliterate.

    Returns:
        The transliterated string returned by ``unidecode``.
    """
    return unidecode(input_str)
198
+
199
  def remove_special_characters(text):
200
+ text = remove_accents(text)
201
+ pattern = r'[^\w\s\d.,!?\'"()-;]+'
202
+ text = re.sub(pattern, '', text)
203
+ return text
204
+
205
def remove_special_characters_2(text):
    """Keep only ASCII letters, digits and the space character.

    Every run of any other character is deleted. That includes
    punctuation, accented letters, and whitespace other than a plain
    space, such as tabs and newlines.

    Args:
        text: The raw text segment to clean.

    Returns:
        The cleaned string.
    """
    return re.sub(r'[^a-zA-Z0-9 ]+', '', text)
209
 
210
  def update_character_count(text):
211
  return f"{len(text)} characters"
 
271
 
272
  def predict_mc(model, tokenizer, text):
273
  tokens = tokenizer(
274
+ text, padding='max_length', truncation=True, return_tensors="pt", max_length=256
275
  ).to(device)["input_ids"]
276
  output = model(tokens)
277
  output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
 
286
  segments = split_text_allow_complete_sentences_nltk(input)
287
 
288
  for i in range(samples_len):
289
+ cleaned_text_bc = remove_special_characters(segments[i])
290
+ cleaned_text_mc = remove_special_characters_2(segments[i])
291
+ bc_score = predict_bc(text_bc_model, text_bc_tokenizer,cleaned_text_bc )
292
+ mc_score = predict_mc(text_mc_model, text_mc_tokenizer, cleaned_text_mc)
293
  bc_scores.append(bc_score)
294
  mc_scores.append(mc_score)
295
 
 
649
 
650
  date_from = ""
651
  date_to = ""
652
+
653
+ demo.launch(share=True, server_name="0.0.0.0", auth=("polygraf-admin", "test@aisd"))
requirements.txt CHANGED
@@ -21,4 +21,5 @@ textstat
21
  plotly
22
  tqdm
23
  pymupdf
24
- sentence-transformers
 
 
21
  plotly
22
  tqdm
23
  pymupdf
24
+ sentence-transformers
25
+ Unidecode