minko186 committed
Commit 7ec48d6 · 1 Parent(s): c0a6bc9

update HTML viewer
Files changed (2):
  1. app.py +0 -10
  2. plagiarism.py +55 -83
app.py CHANGED
@@ -224,16 +224,6 @@ with gr.Blocks() as demo:
 
     with gr.Row():
         with gr.Column():
-            # sentenceBreakdown = gr.HighlightedText(
-            #     label="Source Detection Sentence Breakdown",
-            #     combine_adjacent=True,
-            #     color_map={
-            #         "[1]": "red",
-            #         "[2]": "orange",
-            #         "[3]": "yellow",
-            #         "[4]": "green",
-            #     },
-            # )
             sentenceBreakdown = gr.HTML(
                 label="Source Detection Sentence Breakdown",
                 value="Source Detection Sentence Breakdown",
plagiarism.py CHANGED
@@ -19,8 +19,6 @@ model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
 # returns cosine similarity of two vectors
 # input: two vectors
 # output: float between 0 and 1.
-
-
 def get_cosine(vec1, vec2):
     intersection = set(vec1.keys()) & set(vec2.keys())
 
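Only the first line of get_cosine survives in the hunk context. Here is a minimal, self-contained sketch of the standard term-frequency cosine that this opening line suggests; everything after `intersection` is an assumption, not code from the file:

import math

def get_cosine_sketch(vec1, vec2):
    # Overlapping terms contribute to the dot product; the result is a
    # float in [0, 1] for non-negative term-frequency vectors.
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum(vec1[x] * vec2[x] for x in intersection)
    denominator = math.sqrt(sum(v * v for v in vec1.values())) * math.sqrt(
        sum(v * v for v in vec2.values())
    )
    return numerator / denominator if denominator else 0.0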
@@ -129,14 +127,14 @@ def google_search(
 
 
 def split_sentence_blocks(text):
-
-    sents = sent_tokenize(text)
     two_sents = []
-    for i in range(len(sents)):
-        if (i % 2) == 0:
-            two_sents.append(sents[i])
-        else:
-            two_sents[len(two_sents) - 1] += " " + sents[i]
+    for para in text.split("\n\n"):
+        sents = sent_tokenize(para)
+        for i in range(len(sents)):
+            if (i % 2) == 0:
+                two_sents.append(sents[i])
+            else:
+                two_sents[len(two_sents) - 1] += " " + sents[i]
     return two_sents
 
 
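The rewritten split_sentence_blocks tokenizes each paragraph on its own, so a two-sentence block can no longer straddle a paragraph break as it could when the whole text went through sent_tokenize at once. A quick usage sketch (assumes NLTK's punkt tokenizer data is installed):

from nltk.tokenize import sent_tokenize  # requires nltk.download("punkt")

text = "One. Two. Three.\n\nFour. Five."
# split_sentence_blocks(text) pairs sentences within each paragraph:
#   ["One. Two.", "Three.", "Four. Five."]
# The old version tokenized across the break and would have paired
# "Three." with "Four.".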
@@ -216,6 +214,26 @@ def print2d(array):
         print(row)
 
 
+def map_sentence_url(sentences, score_array):
+    sentenceToMaxURL = [-1] * len(sentences)
+    for j in range(len(sentences)):
+        if j > 0:
+            maxScore = score_array[sentenceToMaxURL[j - 1]][j]
+            sentenceToMaxURL[j] = sentenceToMaxURL[j - 1]
+        else:
+            maxScore = -1
+        for i in range(len(score_array)):
+            margin = (
+                0.05
+                if (j > 0 and sentenceToMaxURL[j] == sentenceToMaxURL[j - 1])
+                else 0
+            )
+            if score_array[i][j] - maxScore > margin:
+                maxScore = score_array[i][j]
+                sentenceToMaxURL[j] = i
+    return sentenceToMaxURL
+
+
 def html_highlight(
     plag_option,
     input,
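A toy walk-through of map_sentence_url (values invented): rows of score_array are URLs, columns are sentences, and the 0.05 margin keeps consecutive sentences attached to the previous sentence's URL unless another URL wins clearly.

score_array = [
    [0.90, 0.52, 0.10],  # URL 0
    [0.20, 0.55, 0.80],  # URL 1
]
# Sentence 0: URL 0 wins outright (0.90).
# Sentence 1: URL 1 leads by only 0.03, under the 0.05 margin, so URL 0 sticks.
# Sentence 2: URL 1 leads by 0.70, well over the margin, so it takes over.
# map_sentence_url(["s0", "s1", "s2"], score_array) -> [0, 0, 1]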
@@ -239,24 +257,38 @@ def html_highlight(
         domains_to_skip,
     )
     color_map = [
-        "#e06b63",
+        "#cf2323",
         "#eb9d59",
         "#c2ad36",
         "#e1ed72",
         "#c2db76",
         "#a2db76",
     ]
-    html_content = "<div style='font-family: Roboto; border: 2px solid black; background-color: #333333; padding: 10px; color: #FFFFFF;'>"
+    font = "Roboto"
+    html_content = f"<link href='https://fonts.googleapis.com/css?family=Roboto' rel='stylesheet'>\n<div style='font-family: {font}; border: 2px solid black; background-color: #333333; padding: 10px; color: #FFFFFF;'>"
+    prev_idx = None
+    combined_sentence = ""
     for sentence, _, _, idx in sentence_scores:
-        color = color_map[idx - 1]
-        formatted_sentence = f'<p style="background-color: {color}; padding: 5px;">{sentence} [{idx}]</p>'
+        if idx != prev_idx and prev_idx is not None:
+            color = color_map[prev_idx - 1]
+            index_part = f'<span style="background-color: {color}; padding: 2px;">[{prev_idx}]</span>'
+            formatted_sentence = f"<p>{combined_sentence} {index_part}</p>"
+            html_content += formatted_sentence
+            combined_sentence = ""
+        combined_sentence += " " + sentence
+        prev_idx = idx
+
+    if combined_sentence:
+        color = color_map[prev_idx - 1]
+        index_part = f'<span style="background-color: {color}; padding: 2px;">[{prev_idx}]</span>'
+        formatted_sentence = f"<p>{combined_sentence} {index_part}</p>"
         html_content += formatted_sentence
 
     html_content += "<hr>"
     for url, score, idx in url_scores:
         color = color_map[idx - 1]
-        formatted_name = f'<p style="background-color: {color}; padding: 5px;">({idx}) {url} --- Matching Score:{score}</p>'
-        html_content += formatted_name
+        formatted_url = f'<p style="background-color: {color}; padding: 5px;">({idx}) <b>{url}</b></p><p> --- Matching Score: {score}%</p>'
+        html_content += formatted_url
 
     html_content += "</div>"
 
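Unlike the old one-paragraph-per-sentence loop, the new loop batches consecutive sentences that share a source index and emits a single highlighted [n] badge per run; the trailing `if combined_sentence:` block flushes the final run. A worked example (data invented, not from the file):

sentence_scores = [
    ["Alpha.", 0.9, "https://a.example", 1],
    ["Beta.", 0.8, "https://a.example", 1],
    ["Gamma.", 0.7, "https://b.example", 2],
]
# "Alpha." and "Beta." share idx 1, so they are merged into one <p> ending
# in a single highlighted [1]; "Gamma." is flushed after the loop with its
# own [2] badge.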
@@ -278,13 +310,11 @@ def plagiarism_check(
     api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
     api_key = "AIzaSyCB61O70B8AC3l5Kk3KMoLb6DN37B7nqIk"
     # api_key = "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg"
-    api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
+    # api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
     cse_id = "851813e81162b4ed4"
 
     url_scores = []
     sentence_scores = []
-    # for input in input.split("\n\n"):
-    print(input)
     sentences = split_sentence_blocks(input)
     url_count = {}
     score_array = []
@@ -305,21 +335,7 @@ def plagiarism_check(
         cse_id,
     )
     # Scrape URLs in list
-    formatted_tokens = []
     soups = asyncio.run(parallel_scrap(url_list))
-
-    # # Populate matching scores for scrapped pages
-    # for i, soup in enumerate(soups):
-    #     print(f"Analyzing {i+1} of {len(soups)} soups........................")
-    #     if soup:
-    #         page_content = soup.text
-
-    #         for j, sent in enumerate(sentences):
-    #             args_list = (sent, page_content)
-    #             score = matching_score(args_list)
-    #             # score = cos_sim_torch(embed_text(sent), source_embeddings[i])
-    #             score_array[i][j] = score
-
     input_data = []
     for i, soup in enumerate(soups):
         if soup:
@@ -336,29 +352,7 @@ def plagiarism_check(
             score_array[i][j] = scores[k]
             k += 1
 
-    # Map sentence with max URL with small margin to keep consider same URL
-    # for consecutive sentences
-    sentenceToMaxURL = [-1] * len(sentences)
-    for j in range(len(sentences)):
-        if j > 0:
-            maxScore = score_array[sentenceToMaxURL[j - 1]][j]
-            sentenceToMaxURL[j] = sentenceToMaxURL[j - 1]
-        else:
-            maxScore = -1
-
-        for i in range(len(score_array)):
-            margin = (
-                0.05
-                if (j > 0 and sentenceToMaxURL[j] == sentenceToMaxURL[j - 1])
-                else 0
-            )
-            if score_array[i][j] - maxScore > margin:
-                maxScore = score_array[i][j]
-                sentenceToMaxURL[j] = i
-            # if score_array[i][j] > maxScore:
-            #     maxScore = score_array[i][j]
-            #     sentenceToMaxURL[j] = i
-
+    sentenceToMaxURL = map_sentence_url(sentences, score_array)
     index = np.unique(sentenceToMaxURL)
 
     url_source = {}
@@ -369,13 +363,12 @@ def plagiarism_check(
             if sentenceToMaxURL[sen] == url
         ]
         url_source[url] = sum(s) / len(s)
-
     index_descending = sorted(url_source, key=url_source.get, reverse=True)
-
     urlMap = {}
     for count, i in enumerate(index_descending):
        urlMap[i] = count + 1
 
+    # build results
     for i, sent in enumerate(sentences):
         ind = sentenceToMaxURL[i]
         if url_source[ind] > 0.1:
@@ -383,32 +376,11 @@ def plagiarism_check(
                 [sent, url_source[ind], url_list[ind], urlMap[ind]]
             )
         else:
-            sentence_scores.append([sent, None, url_list[ind], urlMap[ind]])
+            sentence_scores.append([sent, None, url_list[ind], -1])
     for ind in index_descending:
-        url_scores.append(
-            [url_list[ind], round(url_source[ind] * 100, 2), urlMap[ind]]
-        )
+        if url_source[ind] > 0.1:
+            url_scores.append(
+                [url_list[ind], round(url_source[ind] * 100, 2), urlMap[ind]]
+            )
 
     return sentence_scores, url_scores
-
-    # for i, sent in enumerate(sentences):
-    #     formatted_tokens.append(
-    #         (sent, "[" + str(urlMap[sentenceToMaxURL[i]]) + "]")
-    #     )
-
-    # formatted_tokens.append(("\n", None))
-    # formatted_tokens.append(("\n", None))
-    # formatted_tokens.append(("\n", None))
-
-    # for ind in index_descending:
-    #     formatted_tokens.append(
-    #         (
-    #             url_list[ind]
-    #             + " --- Matching Score: "
-    #             + f"{str(round(url_source[ind] * 100, 2))}%",
-    #             "[" + str(urlMap[ind]) + "]",
-    #         )
-    #     )
-    #     formatted_tokens.append(("\n", None))
-
-    # return formatted_tokens
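Finally, how the ranking and the 0.1 cutoff interact (values invented for illustration): each URL's score is the mean of its assigned sentences' scores, URLs are ranked by that mean, and anything at or below 0.1 is dropped from url_scores while its sentences are reported with score None and index -1.

url_source = {0: 0.62, 1: 0.45, 2: 0.04}  # URL index -> mean sentence score
index_descending = sorted(url_source, key=url_source.get, reverse=True)  # [0, 1, 2]
urlMap = {i: rank + 1 for rank, i in enumerate(index_descending)}  # {0: 1, 1: 2, 2: 3}
# URL 2 falls under the 0.1 cutoff: it is omitted from url_scores, and its
# sentences go into sentence_scores as [sent, None, url, -1].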