aliasgerovs committed on
Commit
dd9b08a
·
1 Parent(s): ca39c04

Added latest updates related to highlighter fix

Browse files
Files changed (3) hide show
  1. app.py +3 -3
  2. highlighter.py +19 -11
  3. predictors.py +47 -116
app.py CHANGED
@@ -6,7 +6,7 @@ from predictors import update,update_main, correct_text, split_text
6
  from analysis import depth_analysis
7
  from predictors import predict_quillbot
8
  from plagiarism import plagiarism_check, build_date, html_highlight
9
- from highlighter import analyze_and_highlight
10
  from utils import extract_text_from_pdf, len_validator
11
  import yaml
12
  from functools import partial
@@ -20,9 +20,9 @@ with open("config.yaml", "r") as file:
20
  model_list = params["MC_OUTPUT_LABELS"]
21
 
22
 
23
- analyze_and_highlight_bc = partial(analyze_and_highlight, model_type="bc")
24
  analyze_and_highlight_quillbot = partial(
25
- analyze_and_highlight, model_type="quillbot"
26
  )
27
 
28
 
 
6
  from analysis import depth_analysis
7
  from predictors import predict_quillbot
8
  from plagiarism import plagiarism_check, build_date, html_highlight
9
+ from highlighter import segmented_higlighter
10
  from utils import extract_text_from_pdf, len_validator
11
  import yaml
12
  from functools import partial
 
20
  model_list = params["MC_OUTPUT_LABELS"]
21
 
22
 
23
+ analyze_and_highlight_bc = partial(segmented_higlighter, model_type="bc")
24
  analyze_and_highlight_quillbot = partial(
25
+ segmented_higlighter, model_type="quillbot"
26
  )
27
 
28
 
highlighter.py CHANGED
@@ -2,7 +2,7 @@ from lime.lime_text import LimeTextExplainer
2
  from nltk.tokenize import sent_tokenize
3
  from predictors import predict_for_explainanility
4
  from predictors import update, correct_text, split_text
5
-
6
 
7
  def explainer(text, model_type):
8
  def predictor_wrapper(text):
@@ -15,7 +15,7 @@ def explainer(text, model_type):
15
  sentences = [sent for sent in sent_tokenize(text)]
16
  num_sentences = len(sentences)
17
  exp = explainer_.explain_instance(
18
- text, predictor_wrapper, num_features=num_sentences, num_samples=2000
19
  )
20
  weights_mapping = exp.as_map()[1]
21
  sentences_weights = {sentence: 0 for sentence in sentences}
@@ -23,15 +23,12 @@ def explainer(text, model_type):
23
  if 0 <= idx < len(sentences):
24
  sentences_weights[sentences[idx]] = weight
25
  print(sentences_weights, model_type)
26
- return sentences_weights, exp
27
 
28
 
29
  def analyze_and_highlight(text, bias_buster_selected, model_type):
30
- if bias_buster_selected:
31
- text = update(text)
32
-
33
  highlighted_text = ""
34
- sentences_weights, _ = explainer(text, model_type)
35
  positive_weights = [weight for weight in sentences_weights.values() if weight >= 0]
36
  negative_weights = [weight for weight in sentences_weights.values() if weight < 0]
37
 
@@ -44,7 +41,8 @@ def analyze_and_highlight(text, bias_buster_selected, model_type):
44
  max_positive_weight += smoothing_factor
45
  min_negative_weight -= smoothing_factor
46
 
47
- for sentence, weight in sentences_weights.items():
 
48
  sentence = sentence.strip()
49
  if not sentence:
50
  continue
@@ -67,6 +65,17 @@ def analyze_and_highlight(text, bias_buster_selected, model_type):
67
  )
68
  highlighted_text += highlighted_sentence
69
 
 
 
 
 
 
 
 
 
 
 
 
70
  if model_type == "bc":
71
  gradient_labels = ["HUMAN", "AI"]
72
  elif model_type == "quillbot":
@@ -76,7 +85,7 @@ def analyze_and_highlight(text, bias_buster_selected, model_type):
76
 
77
  highlighted_text = (
78
  "<div>"
79
- + highlighted_text
80
  + "<div style='margin-top: 20px; text-align: center;'>"
81
  + "<div style='position: relative; display: inline-block; width: 60%; height: 20px; background: linear-gradient(to right, #00FF00, #FFFFFF, #FF0000); font-family: \"Segoe UI\", Tahoma, Geneva, Verdana, sans-serif; font-size: 10px; font-weight: 600; color: #222; border-radius: 10px; box-shadow: 0px 2px 5px rgba(0, 0, 0, 0.1);'>"
82
  + f"<span style='position: absolute; left: 5px; top: 50%; transform: translateY(-50%); color: #000; font-weight: 600;'>{gradient_labels[0]}</span>"
@@ -85,5 +94,4 @@ def analyze_and_highlight(text, bias_buster_selected, model_type):
85
  + "</div>"
86
  + "</div>"
87
  )
88
-
89
- return highlighted_text
 
2
  from nltk.tokenize import sent_tokenize
3
  from predictors import predict_for_explainanility
4
  from predictors import update, correct_text, split_text
5
+ from predictors import split_text_allow_complete_sentences_nltk, get_token_length
6
 
7
  def explainer(text, model_type):
8
  def predictor_wrapper(text):
 
15
  sentences = [sent for sent in sent_tokenize(text)]
16
  num_sentences = len(sentences)
17
  exp = explainer_.explain_instance(
18
+ text, predictor_wrapper, num_features=num_sentences, num_samples=100
19
  )
20
  weights_mapping = exp.as_map()[1]
21
  sentences_weights = {sentence: 0 for sentence in sentences}
 
23
  if 0 <= idx < len(sentences):
24
  sentences_weights[sentences[idx]] = weight
25
  print(sentences_weights, model_type)
26
+ return sentences_weights, sentences, exp
27
 
28
 
29
  def analyze_and_highlight(text, bias_buster_selected, model_type):
 
 
 
30
  highlighted_text = ""
31
+ sentences_weights, sentences, _ = explainer(text, model_type)
32
  positive_weights = [weight for weight in sentences_weights.values() if weight >= 0]
33
  negative_weights = [weight for weight in sentences_weights.values() if weight < 0]
34
 
 
41
  max_positive_weight += smoothing_factor
42
  min_negative_weight -= smoothing_factor
43
 
44
+ for sentence in sentences:
45
+ weight = sentences_weights[sentence]
46
  sentence = sentence.strip()
47
  if not sentence:
48
  continue
 
65
  )
66
  highlighted_text += highlighted_sentence
67
 
68
+ return highlighted_text
69
+
70
+ def segmented_higlighter(text, bias_buster_selected, model_type):
71
+ if bias_buster_selected:
72
+ text = update(text)
73
+ result = ""
74
+ segmented_results = split_text_allow_complete_sentences_nltk(text)
75
+ for segment in segmented_results:
76
+ chunk = analyze_and_highlight(segment, model_type)
77
+ result = result + " " + chunk
78
+ print(result)
79
  if model_type == "bc":
80
  gradient_labels = ["HUMAN", "AI"]
81
  elif model_type == "quillbot":
 
85
 
86
  highlighted_text = (
87
  "<div>"
88
+ + result
89
  + "<div style='margin-top: 20px; text-align: center;'>"
90
  + "<div style='position: relative; display: inline-block; width: 60%; height: 20px; background: linear-gradient(to right, #00FF00, #FFFFFF, #FF0000); font-family: \"Segoe UI\", Tahoma, Geneva, Verdana, sans-serif; font-size: 10px; font-weight: 600; color: #222; border-radius: 10px; box-shadow: 0px 2px 5px rgba(0, 0, 0, 0.1);'>"
91
  + f"<span style='position: absolute; left: 5px; top: 50%; transform: translateY(-50%); color: #000; font-weight: 600;'>{gradient_labels[0]}</span>"
 
94
  + "</div>"
95
  + "</div>"
96
  )
97
+ return highlighted_text
 
predictors.py CHANGED
@@ -24,7 +24,6 @@ with open("config.yaml", "r") as file:
24
  nltk.download("punkt")
25
  nltk.download("stopwords")
26
  device_needed = "cuda" if torch.cuda.is_available() else "cpu"
27
- device = 'cpu'
28
  text_bc_model_path = params["TEXT_BC_MODEL_PATH"]
29
  text_mc_model_path = params["TEXT_MC_MODEL_PATH"]
30
  text_quillbot_model_path = params["TEXT_QUILLBOT_MODEL_PATH"]
@@ -50,12 +49,12 @@ quillbot_model = AutoModelForSequenceClassification.from_pretrained(
50
 
51
 
52
  # proxy models for explainability
53
- mini_bc_model_name = "polygraf-ai/bc-model-bert-mini"
54
  bc_tokenizer_mini = AutoTokenizer.from_pretrained(mini_bc_model_name)
55
  bc_model_mini = AutoModelForSequenceClassification.from_pretrained(
56
  mini_bc_model_name
57
  ).to(device_needed)
58
- mini_humanizer_model_name = "polygraf-ai/quillbot-detector-bert-mini-9K"
59
  humanizer_tokenizer_mini = AutoTokenizer.from_pretrained(
60
  mini_humanizer_model_name
61
  )
@@ -119,83 +118,58 @@ def update_main(text: str):
119
  corrections_display = "\n\n".join([f"Original: {orig}\nCorrected: {corr}" for orig, corr in corrections])
120
  return corrected_text, corrections_display
121
 
122
- def split_text_allow_complete_sentences_nltk(
123
- text,
124
- max_length=256,
125
- tolerance=30,
126
- min_last_segment_length=100,
127
- type_det="bc",
128
- ):
129
- sentences = nltk.sent_tokenize(text)
130
- segments = []
131
- current_segment = []
 
132
  current_length = 0
133
  if type_det == "bc":
134
  tokenizer = text_bc_tokenizer
135
- max_length = bc_token_size
136
  elif type_det == "mc":
137
  tokenizer = text_mc_tokenizer
138
- max_length = mc_token_size
139
- for sentence in sentences:
140
- tokens = tokenizer.tokenize(sentence)
141
- sentence_length = len(tokens)
 
 
 
 
 
 
 
 
 
 
 
142
 
143
- if current_length + sentence_length <= max_length + tolerance - 2:
144
- current_segment.append(sentence)
145
- current_length += sentence_length
146
- else:
147
- if current_segment:
148
- encoded_segment = tokenizer.encode(
149
- " ".join(current_segment),
150
- add_special_tokens=True,
151
- max_length=max_length + tolerance,
152
- truncation=True,
153
- )
154
- segments.append((current_segment, len(encoded_segment)))
155
- current_segment = [sentence]
156
- current_length = sentence_length
157
-
158
- if current_segment:
159
- encoded_segment = tokenizer.encode(
160
- " ".join(current_segment),
161
- add_special_tokens=True,
162
- max_length=max_length + tolerance,
163
- truncation=True,
164
- )
165
- segments.append((current_segment, len(encoded_segment)))
166
-
167
- final_segments = []
168
- for i, (seg, length) in enumerate(segments):
169
- if i == len(segments) - 1:
170
- if length < min_last_segment_length and len(final_segments) > 0:
171
- prev_seg, prev_length = final_segments[-1]
172
- combined_encoded = tokenizer.encode(
173
- " ".join(prev_seg + seg),
174
- add_special_tokens=True,
175
- max_length=max_length + tolerance,
176
- truncation=True,
177
- )
178
- if len(combined_encoded) <= max_length + tolerance:
179
- final_segments[-1] = (prev_seg + seg, len(combined_encoded))
180
- else:
181
- final_segments.append((seg, length))
182
  else:
183
- final_segments.append((seg, length))
 
184
  else:
185
- final_segments.append((seg, length))
186
-
187
- decoded_segments = []
188
- encoded_segments = []
189
- for seg, _ in final_segments:
190
- encoded_segment = tokenizer.encode(
191
- " ".join(seg),
192
- add_special_tokens=True,
193
- max_length=max_length + tolerance,
194
- truncation=True,
195
- )
196
- decoded_segment = tokenizer.decode(encoded_segment)
197
- decoded_segments.append(decoded_segment)
198
- return decoded_segments
199
 
200
 
201
  def predict_quillbot(text, bias_buster_selected):
@@ -227,7 +201,7 @@ def predict_for_explainanility(text, model_type=None):
227
  tokenizer = humanizer_tokenizer_mini
228
  elif model_type == "bc":
229
  cleaning = True
230
- max_length = 512
231
  model = bc_model_mini
232
  tokenizer = bc_tokenizer_mini
233
  else:
@@ -278,46 +252,6 @@ def predict_mc(model, tokenizer, text):
278
  return output_norm
279
 
280
 
281
- def predict_mc_scores(input):
282
- bc_scores = []
283
- mc_scores = []
284
-
285
- samples_len_bc = len(
286
- split_text_allow_complete_sentences_nltk(input, type_det="bc")
287
- )
288
- segments_bc = split_text_allow_complete_sentences_nltk(input, type_det="bc")
289
- for i in range(samples_len_bc):
290
- cleaned_text_bc = remove_special_characters(segments_bc[i])
291
- bc_score = predict_bc(text_bc_model, text_bc_tokenizer, cleaned_text_bc)
292
- bc_scores.append(bc_score)
293
- bc_scores_array = np.array(bc_scores)
294
- average_bc_scores = np.mean(bc_scores_array, axis=0)
295
- bc_score_list = average_bc_scores.tolist()
296
- bc_score = {"AI": bc_score_list[1], "HUMAN": bc_score_list[0]}
297
- segments_mc = split_text_allow_complete_sentences_nltk(input, type_det="mc")
298
- samples_len_mc = len(
299
- split_text_allow_complete_sentences_nltk(input, type_det="mc")
300
- )
301
- for i in range(samples_len_mc):
302
- cleaned_text_mc = remove_special_characters(segments_mc[i])
303
- mc_score = predict_mc(text_mc_model, text_mc_tokenizer, cleaned_text_mc)
304
- mc_scores.append(mc_score)
305
- mc_scores_array = np.array(mc_scores)
306
- average_mc_scores = np.mean(mc_scores_array, axis=0)
307
- mc_score_list = average_mc_scores.tolist()
308
- mc_score = {}
309
- for score, label in zip(mc_score_list, mc_label_map):
310
- mc_score[label.upper()] = score
311
-
312
- sum_prob = 1 - bc_score["HUMAN"]
313
- for key, value in mc_score.items():
314
- mc_score[key] = value * sum_prob
315
- if sum_prob < 0.01:
316
- mc_score = {}
317
-
318
- return mc_score
319
-
320
-
321
  def predict_bc_scores(input):
322
  bc_scores = []
323
  samples_len_bc = len(
@@ -385,9 +319,6 @@ def predict_mc_scores(input):
385
  for score, label in zip(mc_score_list, mc_label_map):
386
  mc_score[label.upper()] = score
387
 
388
- total = sum(mc_score.values())
389
- # Normalize each value by dividing it by the total
390
- mc_score = {key: value / total for key, value in mc_score.items()}
391
  sum_prob = 1 - bc_score["HUMAN"]
392
  for key, value in mc_score.items():
393
  mc_score[key] = value * sum_prob
 
24
  nltk.download("punkt")
25
  nltk.download("stopwords")
26
  device_needed = "cuda" if torch.cuda.is_available() else "cpu"
 
27
  text_bc_model_path = params["TEXT_BC_MODEL_PATH"]
28
  text_mc_model_path = params["TEXT_MC_MODEL_PATH"]
29
  text_quillbot_model_path = params["TEXT_QUILLBOT_MODEL_PATH"]
 
49
 
50
 
51
  # proxy models for explainability
52
+ mini_bc_model_name = "polygraf-ai/bc-model"
53
  bc_tokenizer_mini = AutoTokenizer.from_pretrained(mini_bc_model_name)
54
  bc_model_mini = AutoModelForSequenceClassification.from_pretrained(
55
  mini_bc_model_name
56
  ).to(device_needed)
57
+ mini_humanizer_model_name = "polygraf-ai/humanizer-model"
58
  humanizer_tokenizer_mini = AutoTokenizer.from_pretrained(
59
  mini_humanizer_model_name
60
  )
 
118
  corrections_display = "\n\n".join([f"Original: {orig}\nCorrected: {corr}" for orig, corr in corrections])
119
  return corrected_text, corrections_display
120
 
121
+ def split_text(text: str) -> list:
122
+ sentences = sent_tokenize(text)
123
+ return [[sentence] for sentence in sentences]
124
+
125
+ def get_token_length(tokenizer, sentence):
126
+ return len(tokenizer.tokenize(sentence))
127
+
128
+ def split_text_allow_complete_sentences_nltk(text, type_det="bc"):
129
+ sentences = sent_tokenize(text)
130
+ chunks = []
131
+ current_chunk = []
132
  current_length = 0
133
  if type_det == "bc":
134
  tokenizer = text_bc_tokenizer
135
+ max_tokens = bc_token_size
136
  elif type_det == "mc":
137
  tokenizer = text_mc_tokenizer
138
+ max_tokens = mc_token_size
139
+
140
+ elif type_det == "quillbot":
141
+ tokenizer = quillbot_tokenizer
142
+ max_tokens = 256
143
+
144
+ def add_sentence_to_chunk(sentence):
145
+ nonlocal current_chunk, current_length
146
+ sentence_length = get_token_length(tokenizer, sentence)
147
+ if current_length + sentence_length > max_tokens:
148
+ chunks.append((current_chunk, current_length))
149
+ current_chunk = []
150
+ current_length = 0
151
+ current_chunk.append(sentence)
152
+ current_length += sentence_length
153
 
154
+ for sentence in sentences:
155
+ add_sentence_to_chunk(sentence)
156
+ if current_chunk:
157
+ chunks.append((current_chunk, current_length))
158
+ adjusted_chunks = []
159
+ while chunks:
160
+ chunk = chunks.pop(0)
161
+ if len(chunks) > 0 and chunk[1] < max_tokens / 2:
162
+ next_chunk = chunks.pop(0)
163
+ combined_length = chunk[1] + next_chunk[1]
164
+ if combined_length <= max_tokens:
165
+ adjusted_chunks.append((chunk[0] + next_chunk[0], combined_length))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
  else:
167
+ adjusted_chunks.append(chunk)
168
+ chunks.insert(0, next_chunk)
169
  else:
170
+ adjusted_chunks.append(chunk)
171
+ result_chunks = [" ".join(chunk[0]) for chunk in adjusted_chunks]
172
+ return result_chunks
 
 
 
 
 
 
 
 
 
 
 
173
 
174
 
175
  def predict_quillbot(text, bias_buster_selected):
 
201
  tokenizer = humanizer_tokenizer_mini
202
  elif model_type == "bc":
203
  cleaning = True
204
+ max_length = bc_token_size
205
  model = bc_model_mini
206
  tokenizer = bc_tokenizer_mini
207
  else:
 
252
  return output_norm
253
 
254
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
255
  def predict_bc_scores(input):
256
  bc_scores = []
257
  samples_len_bc = len(
 
319
  for score, label in zip(mc_score_list, mc_label_map):
320
  mc_score[label.upper()] = score
321
 
 
 
 
322
  sum_prob = 1 - bc_score["HUMAN"]
323
  for key, value in mc_score.items():
324
  mc_score[key] = value * sum_prob