jgyasu committed on
Commit
02d0f22
1 Parent(s): 41fe9d1

Delete text_paraphraser.py

Files changed (1)
  1. text_paraphraser.py +0 -503
text_paraphraser.py DELETED
@@ -1,503 +0,0 @@
# -*- coding: utf-8 -*-
"""text-paraphraser.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1pFGR4uvXMMWVJFQeFmn--arumSxqa5Yy
"""

# !pip install gradio  # notebook shell command; run in Colab, not as plain Python

import gradio as gr

# import streamlit as st
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import plotly.graph_objects as go
from transformers import pipeline
import re
import time
import requests
from PIL import Image
import itertools
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from matplotlib.colors import ListedColormap, rgb2hex
import ipywidgets as widgets
from IPython.display import display, HTML
import pandas as pd
from pprint import pprint
from tenacity import retry
from tqdm import tqdm
# import tiktoken
import scipy.stats
import torch
from transformers import GPT2LMHeadModel
import seaborn as sns
# from colorama import Fore, Style
# import openai
import random
from nltk.corpus import stopwords
from termcolor import colored
import nltk
from nltk.translate.bleu_score import sentence_bleu
from transformers import BertTokenizer, BertModel

nltk.download('stopwords')

# Function to Initialize the Model
def init_model():
    para_tokenizer = AutoTokenizer.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")
    para_model = AutoModelForSeq2SeqLM.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")
    return para_tokenizer, para_model

# Function to Paraphrase the Text
def paraphrase(question, para_tokenizer, para_model, num_beams=5, num_beam_groups=5, num_return_sequences=5, repetition_penalty=10.0, diversity_penalty=3.0, no_repeat_ngram_size=2, temperature=0.7, max_length=64):
    input_ids = para_tokenizer(
        f'paraphrase: {question}',
        return_tensors="pt", padding="longest",
        max_length=max_length,
        truncation=True,
    ).input_ids
    outputs = para_model.generate(
        input_ids, temperature=temperature, repetition_penalty=repetition_penalty,
        num_return_sequences=num_return_sequences, no_repeat_ngram_size=no_repeat_ngram_size,
        num_beams=num_beams, num_beam_groups=num_beam_groups,
        max_length=max_length, diversity_penalty=diversity_penalty
    )
    res = para_tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return res

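# Usage sketch (kept commented out so importing this module stays side-effect
# free): with the defaults above, each call returns num_return_sequences=5
# paraphrases produced by diverse beam search.
# para_tokenizer, para_model = init_model()
# print(paraphrase("What are the keys to success?", para_tokenizer, para_model))
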
# Function to find the longest common subsequence of words (stopwords removed)
def longest_common_subss(original_sentence, paraphrased_sentences):
    stop_words = set(stopwords.words('english'))
    original_sentence_lower = original_sentence.lower()
    paraphrased_sentences_lower = [s.lower() for s in paraphrased_sentences]
    paraphrased_sentences_no_stopwords = []

    for sentence in paraphrased_sentences_lower:
        words = re.findall(r'\b\w+\b', sentence)
        filtered_sentence = ' '.join([word for word in words if word not in stop_words])
        paraphrased_sentences_no_stopwords.append(filtered_sentence)

    results = []
    for sentence in paraphrased_sentences_no_stopwords:
        common_words = set(original_sentence_lower.split()) & set(sentence.split())
        for word in common_words:
            # Note: str.replace also recolors matches inside longer words
            sentence = sentence.replace(word, colored(word, 'green'))
        results.append({
            "Original Sentence": original_sentence_lower,
            "Paraphrased Sentence": sentence,
            "Substrings Word Pair": common_words
        })
    return results

# Function to find the common words between the original and each paraphrased sentence
def common_substring_word(original_sentence, paraphrased_sentences):
    stop_words = set(stopwords.words('english'))
    original_sentence_lower = original_sentence.lower()
    paraphrased_sentences_lower = [s.lower() for s in paraphrased_sentences]
    paraphrased_sentences_no_stopwords = []

    for sentence in paraphrased_sentences_lower:
        words = re.findall(r'\b\w+\b', sentence)
        filtered_sentence = ' '.join([word for word in words if word not in stop_words])
        paraphrased_sentences_no_stopwords.append(filtered_sentence)

    results = []
    for idx, sentence in enumerate(paraphrased_sentences_no_stopwords):
        common_words = set(original_sentence_lower.split()) & set(sentence.split())
        common_substrings = ', '.join(sorted(common_words))
        for word in common_words:
            sentence = sentence.replace(word, colored(word, 'green'))
        results.append({
            f"Paraphrased Sentence {idx+1}": sentence,
            "Common Substrings": common_substrings
        })
    return results

# Function to watermark a word chosen at random between LCS points (random sampling)
def random_sampling(original_sentence, paraphrased_sentences):
    stop_words = set(stopwords.words('english'))
    original_sentence_lower = original_sentence.lower()
    paraphrased_sentences_lower = [s.lower() for s in paraphrased_sentences]
    paraphrased_sentences_no_stopwords = []

    for sentence in paraphrased_sentences_lower:
        words = re.findall(r'\b\w+\b', sentence)
        filtered_sentence = ' '.join([word for word in words if word not in stop_words])
        paraphrased_sentences_no_stopwords.append(filtered_sentence)

    results = []
    for idx, sentence in enumerate(paraphrased_sentences_no_stopwords):
        common_words = set(original_sentence_lower.split()) & set(sentence.split())
        common_substrings = ', '.join(sorted(common_words))

        words_to_replace = [word for word in sentence.split() if word not in common_words]
        if words_to_replace:
            word_to_mark = random.choice(words_to_replace)
            sentence = sentence.replace(word_to_mark, colored(word_to_mark, 'red'))

        for word in common_words:
            sentence = sentence.replace(word, colored(word, 'green'))

        results.append({
            f"Paraphrased Sentence {idx+1}": sentence,
            "Common Substrings": common_substrings
        })
    return results

# Function for Inverse Transform Sampling
def inverse_transform_sampling(original_sentence, paraphrased_sentences):
    stop_words = set(stopwords.words('english'))
    original_sentence_lower = original_sentence.lower()
    paraphrased_sentences_lower = [s.lower() for s in paraphrased_sentences]
    paraphrased_sentences_no_stopwords = []

    for sentence in paraphrased_sentences_lower:
        words = re.findall(r'\b\w+\b', sentence)
        filtered_sentence = ' '.join([word for word in words if word not in stop_words])
        paraphrased_sentences_no_stopwords.append(filtered_sentence)

    results = []
    for idx, sentence in enumerate(paraphrased_sentences_no_stopwords):
        common_words = set(original_sentence_lower.split()) & set(sentence.split())
        common_substrings = ', '.join(sorted(common_words))

        words_to_replace = [word for word in sentence.split() if word not in common_words]
        if words_to_replace:
            # Uniform weights make this equivalent to random.choice; a non-uniform
            # distribution would be needed for true inverse transform sampling
            probabilities = [1 / len(words_to_replace)] * len(words_to_replace)
            chosen_word = random.choices(words_to_replace, weights=probabilities)[0]
            sentence = sentence.replace(chosen_word, colored(chosen_word, 'magenta'))

        for word in common_words:
            sentence = sentence.replace(word, colored(word, 'green'))

        results.append({
            f"Paraphrased Sentence {idx+1}": sentence,
            "Common Substrings": common_substrings
        })
    return results

# Function for Contextual Sampling
def contextual_sampling(original_sentence, paraphrased_sentences):
    stop_words = set(stopwords.words('english'))
    original_sentence_lower = original_sentence.lower()
    paraphrased_sentences_lower = [s.lower() for s in paraphrased_sentences]
    paraphrased_sentences_no_stopwords = []

    for sentence in paraphrased_sentences_lower:
        words = re.findall(r'\b\w+\b', sentence)
        filtered_sentence = ' '.join([word for word in words if word not in stop_words])
        paraphrased_sentences_no_stopwords.append(filtered_sentence)

    results = []
    for idx, sentence in enumerate(paraphrased_sentences_no_stopwords):
        common_words = set(original_sentence_lower.split()) & set(sentence.split())
        common_substrings = ', '.join(sorted(common_words))

        words_to_replace = [word for word in sentence.split() if word not in common_words]
        if words_to_replace:
            # `context` is built but not yet used to bias the word choice
            context = " ".join([word for word in sentence.split() if word not in common_words])
            chosen_word = random.choice(words_to_replace)
            sentence = sentence.replace(chosen_word, colored(chosen_word, 'red'))

        for word in common_words:
            sentence = sentence.replace(word, colored(word, 'green'))

        results.append({
            f"Paraphrased Sentence {idx+1}": sentence,
            "Common Substrings": common_substrings
        })
    return results

# Function for Exponential Minimum Sampling
def exponential_minimum_sampling(original_sentence, paraphrased_sentences):
    stop_words = set(stopwords.words('english'))
    original_sentence_lower = original_sentence.lower()
    paraphrased_sentences_lower = [s.lower() for s in paraphrased_sentences]
    paraphrased_sentences_no_stopwords = []

    for sentence in paraphrased_sentences_lower:
        words = re.findall(r'\b\w+\b', sentence)
        filtered_sentence = ' '.join([word for word in words if word not in stop_words])
        paraphrased_sentences_no_stopwords.append(filtered_sentence)

    results = []
    for idx, sentence in enumerate(paraphrased_sentences_no_stopwords):
        common_words = set(original_sentence_lower.split()) & set(sentence.split())
        common_substrings = ', '.join(sorted(common_words))

        words_to_replace = [word for word in sentence.split() if word not in common_words]
        if words_to_replace:
            num_words = len(words_to_replace)
            probabilities = [2 ** (-i) for i in range(num_words)]
            chosen_word = random.choices(words_to_replace, weights=probabilities)[0]
            sentence = sentence.replace(chosen_word, colored(chosen_word, 'red'))

        for word in common_words:
            sentence = sentence.replace(word, colored(word, 'green'))

        results.append({
            f"Paraphrased Sentence {idx+1}": sentence,
            "Common Substrings": common_substrings
        })
    return results

# Function to Calculate the BLEU score
def calculate_bleu(reference, candidate):
    return sentence_bleu([reference], candidate)

# Function to calculate a BERT similarity score
# (cosine similarity of [CLS] embeddings; not the BERTScore metric)
def calculate_bert(reference, candidate):
    # Reloading BERT on every call is slow; cache these in real use
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')

    reference_ids = tokenizer.encode(reference, add_special_tokens=True, max_length=512, truncation=True, return_tensors="pt")
    candidate_ids = tokenizer.encode(candidate, add_special_tokens=True, max_length=512, truncation=True, return_tensors="pt")

    with torch.no_grad():
        reference_outputs = model(reference_ids)
        candidate_outputs = model(candidate_ids)

    reference_embeddings = reference_outputs[0][:, 0, :].numpy()
    candidate_embeddings = candidate_outputs[0][:, 0, :].numpy()

    cosine_similarity = np.dot(reference_embeddings, candidate_embeddings.T) / (np.linalg.norm(reference_embeddings) * np.linalg.norm(candidate_embeddings))
    return np.mean(cosine_similarity)

# Function to calculate minimum edit distance
def min_edit_distance(reference, candidate):
    m = len(reference)
    n = len(candidate)

    dp = [[0] * (n + 1) for _ in range(m + 1)]

    for i in range(m + 1):
        for j in range(n + 1):
            if i == 0:
                dp[i][j] = j
            elif j == 0:
                dp[i][j] = i
            elif reference[i - 1] == candidate[j - 1]:
                dp[i][j] = dp[i - 1][j - 1]
            else:
                dp[i][j] = 1 + min(dp[i][j - 1],      # Insert
                                   dp[i - 1][j],      # Remove
                                   dp[i - 1][j - 1])  # Replace

    return dp[m][n]

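# Worked example: min_edit_distance("kitten", "sitting") == 3
# (substitute k->s, substitute e->i, append g), i.e. the classic Levenshtein distance.
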
def generate_paraphrase(question):
    para_tokenizer, para_model = init_model()
    res = paraphrase(question, para_tokenizer, para_model)
    return res

question = "Following the declaration of the State of Israel in 1948, neighboring Arab states invaded. The war ended with Israel controlling a significant portion of the territory. Many Palestinians became refugees."

nltk.download('punkt')
from nltk.tokenize import word_tokenize

def find_common_subsequences(sentence, str_list):
    stop_words = set(stopwords.words('english'))
    sentence = sentence.lower()

    str_list = [s.lower() for s in str_list]

    def is_present(lcs, str_list):
        for string in str_list:
            if lcs not in string:
                return False
        return True

    def remove_stop_words_and_special_chars(sentence):
        sentence = re.sub(r'[^\w\s]', '', sentence)
        words = sentence.split()
        filtered_words = [word for word in words if word.lower() not in stop_words]
        return " ".join(filtered_words)

    sentence = remove_stop_words_and_special_chars(sentence)
    str_list = [remove_stop_words_and_special_chars(s) for s in str_list]

    words = sentence.split(" ")
    common_grams = []
    added_phrases = set()

    def is_covered(subseq, added_phrases):
        for phrase in added_phrases:
            if subseq in phrase:
                return True
        return False

    # Collect common n-grams longest-first (5-grams down to unigrams),
    # skipping any n-gram already covered by a longer match
    for i in range(len(words) - 4):
        penta = " ".join(words[i:i+5])
        if is_present(penta, str_list):
            common_grams.append(penta)
            added_phrases.add(penta)

    for i in range(len(words) - 3):
        quad = " ".join(words[i:i+4])
        if is_present(quad, str_list) and not is_covered(quad, added_phrases):
            common_grams.append(quad)
            added_phrases.add(quad)

    for i in range(len(words) - 2):
        tri = " ".join(words[i:i+3])
        if is_present(tri, str_list) and not is_covered(tri, added_phrases):
            common_grams.append(tri)
            added_phrases.add(tri)

    for i in range(len(words) - 1):
        bi = " ".join(words[i:i+2])
        if is_present(bi, str_list) and not is_covered(bi, added_phrases):
            common_grams.append(bi)
            added_phrases.add(bi)

    for i in range(len(words)):
        uni = words[i]
        if is_present(uni, str_list) and not is_covered(uni, added_phrases):
            common_grams.append(uni)
            added_phrases.add(uni)

    return common_grams

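# Example (sketch): with the stopword "a" stripped, the trigram survives in
# every string and its shorter sub-grams are skipped as covered:
# find_common_subsequences("big red apple", ["a big red apple pie", "big red apple sauce"])
# -> ['big red apple']
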
question = '''the colorado republican party sent a mass email last week with the subject line "god hates pride"'''
res = generate_paraphrase(question)
print(res)

common_grams = find_common_subsequences(question, res[0:3])
print(common_grams)

common_gram_words = [word for gram in common_grams for word in gram.split()]
print(common_gram_words)

def llm_output(prompt):
    # The text-generation backend is disabled; this stub simply echoes the prompt
    # sequences = text_generator(prompt)
    # gen_text = sequences[0]["generated_text"]
    # sentences = gen_text.split('.')
    # # first_sentence = get_first_sentence(gen_text[len(prompt):])
    # return gen_text, sentences[-3]
    return prompt, prompt

import html

def highlight_phrases_with_colors(sentences, phrases):
    color_map = {}   # Dictionary to store color assignments for each phrase
    color_index = 0  # Index to assign colors sequentially

    # Generate HTML for highlighting each sentence
    highlighted_html = []
    idx = 1
    for sentence in sentences:
        sentence_with_idx = f"{idx}. {sentence}"
        idx += 1
        highlighted_sentence = html.escape(sentence_with_idx)
        phrase_count = 0

        # Split sentence into words to apply numbering
        words = re.findall(r'\b\w+\b', sentence)
        word_index = 1  # Index to track words

        # Highlight each phrase with a unique color and number
        for phrase in phrases:
            if phrase not in color_map:
                # Assign a new color if the phrase hasn't been encountered before
                color_map[phrase] = f'hsl({color_index * 60 % 360}, 70%, 80%)'
                color_index += 1

            escaped_phrase = re.escape(phrase)
            pattern = rf'\b{escaped_phrase}\b'
            highlighted_sentence, num_replacements = re.subn(
                pattern,
                lambda m, count=phrase_count, color=color_map[phrase], index=word_index: (
                    f'<span style="background-color: {color}; font-weight: bold;'
                    f' padding: 2px 4px; border-radius: 2px; position: relative;">'
                    f'<span style="background-color: black; color: white; border-radius: 50%;'
                    f' padding: 2px 5px; margin-right: 5px;">{index}</span>'
                    f'{m.group(0)}'
                    f'</span>'
                ),
                highlighted_sentence,
                flags=re.IGNORECASE
            )
            if num_replacements > 0:
                phrase_count += 1
                word_index += 1  # Increment word index after each replacement

        highlighted_html.append(highlighted_sentence)

    # Join sentences with line breaks
    final_html = "<br><br>".join(highlighted_html)

    # Wrap in a container div for styling
    return f'''
    <div style="border: 1px solid; padding: 16px; background-color: #FFFFFF; color: #374151; box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); border-radius: 12px;">
        <h3 style="margin-top: 0; font-size: 1.25em; color: #111827;">Paraphrased And Highlighted Text</h3>
        <div style="background-color: #F5F5F5; line-height: 1.6; padding: 15px; border-radius: 12px;">{final_html}</div>
    </div>
    '''

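# Example (sketch): returns an HTML fragment suitable for gr.HTML, e.g.
# highlight_phrases_with_colors(["the cat sat on the mat"], ["cat"])
# wraps "cat" in a numbered, colored <span> inside the styled container div.
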
def model(prompt):
    generated, sentence = llm_output(prompt)
    res = generate_paraphrase(sentence)
    common_subs = longest_common_subss(sentence, res)
    # non_melting = non_melting_points(sentence, res)
    common_grams = find_common_subsequences(sentence, res)
    # common_gram_words = [word for gram in common_grams for word in gram.split()]
    for i in range(len(common_subs)):
        common_subs[i]["Paraphrased Sentence"] = res[i]
    result = highlight_phrases_with_colors(res, common_grams)
    return generated, result

# model(question)

with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
    gr.Markdown("# Paraphrases the Text and Highlights the Non-melting Points")

    with gr.Row():
        user_input = gr.Textbox(label="User Prompt")

    with gr.Row():
        submit_button = gr.Button("Submit")
        clear_button = gr.Button("Clear")

    with gr.Row():
        ai_output = gr.Textbox(label="AI-generated Text (Llama3)")

    with gr.Row():
        selected_sentence = gr.Textbox(label="Selected Sentence")

    with gr.Row():
        html_output = gr.HTML()

    submit_button.click(model, inputs=user_input, outputs=[ai_output, html_output])
    clear_button.click(lambda: "", inputs=None, outputs=user_input)
    # Two outputs require two return values; a single "" would not clear both
    clear_button.click(lambda: ("", ""), inputs=None, outputs=[ai_output, html_output])

# Launch the demo
demo.launch()

# from pyngrok import ngrok, conf
# conf.get_default().auth_token = 'YOUR_NGROK_AUTH_TOKEN'
# public_url = ngrok.connect(7861).public_url
# print(public_url)

# demo.queue().launch(server_port=7861, inline=False, share=False, debug=True)
# demo.launch(share=True, debug=True, inline=False)