Towhidul committed on
Commit
1416b93
1 Parent(s): 26f648a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +149 -38
app.py CHANGED
@@ -27,6 +27,8 @@ import seaborn as sns
27
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
28
  from colorama import Fore, Style
29
  # import openai
 
 
30
 
31
 
32
  para_tokenizer = AutoTokenizer.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")
@@ -62,28 +64,118 @@ def paraphrase(
62
  return res
63
 
64
 
65
def find_longest_common_sequences(main_sentence, paraphrases):
    """Return word runs of two or more words shared with any paraphrase.

    Both the main sentence and each paraphrase are split on whitespace. For
    every pair of start positions the match is extended word by word for as
    long as the tokens are equal; runs of at least two words are candidates.

    Args:
        main_sentence: The reference sentence.
        paraphrases: Iterable of sentences compared against ``main_sentence``.

    Returns:
        A list of space-joined word runs, longest first (ties broken
        alphabetically). A run that is textually contained in a longer
        retained run is dropped.
    """
    main_tokens = main_sentence.split()
    candidates = set()

    for paraphrase in paraphrases:
        paraphrase_tokens = paraphrase.split()
        for i in range(len(main_tokens)):
            for j in range(len(paraphrase_tokens)):
                # Extend the match for as long as both sides agree.
                m, n = i, j
                while (
                    m < len(main_tokens)
                    and n < len(paraphrase_tokens)
                    and main_tokens[m] == paraphrase_tokens[n]
                ):
                    m += 1
                    n += 1
                # Single-word matches are not reported.
                if m - i > 1:
                    candidates.add(' '.join(main_tokens[i:m]))

    # Filter longest-first so the result does not depend on the order in
    # which paraphrases happened to produce the runs, and sort with a
    # secondary key so equal-length runs come out in a stable order.
    common_sequences = []
    for sequence in sorted(candidates, key=lambda s: (-len(s), s)):
        if not any(sequence in kept for kept in common_sequences):
            common_sequences.append(sequence)

    return common_sequences
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
 
88
 
89
 
@@ -123,26 +215,45 @@ longest_common_sequences = find_longest_common_sequences(main_sentence, paraphra
123
  color_palette = ["#FF0000", "#008000", "#0000FF", "#FF00FF", "#00FFFF"]
124
  highlighted_sentences = []
125
 
126
- # Highlighting sequences in main sentence and paraphrases
127
- for sentence in [main_sentence] + paraphrases:
128
- highlighted_sentence = sentence
129
- for i, sequence in enumerate(longest_common_sequences):
130
- color = color_palette[i % len(color_palette)]
131
- highlighted_sentence = highlighted_sentence.replace(sequence, f"<span style='color:{color}'>{sequence}</span>")
132
- highlighted_sentences.append(highlighted_sentence)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
 
134
- # Display paraphrases with numbers
135
- st.markdown("**Paraphrases**:")
136
- for i, para in enumerate(paraphrases, 1):
137
- st.write(f"Paraphrase {i}:")
138
- st.write(para)
139
 
140
 
141
- # Displaying the main sentence with highlighted longest common sequences
142
- st.markdown("**Main sentence with highlighted longest common sequences**:")
143
- st.markdown(highlighted_sentences[0], unsafe_allow_html=True)
144
 
145
 
146
- st.markdown("**Paraphrases with highlighted longest common sequences**:")
147
- for paraphrase in highlighted_sentences[1:]:
148
- st.markdown(paraphrase, unsafe_allow_html=True)
 
27
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
28
  from colorama import Fore, Style
29
  # import openai
30
+ import re
31
+ from termcolor import colored
32
 
33
 
34
  para_tokenizer = AutoTokenizer.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")
 
64
  return res
65
 
66
 
 
 
 
67
 
68
def remove_punctuations(text):
    """Strip punctuation from *text* while keeping hyphens inside words.

    Every character that is not a word character, whitespace or a hyphen is
    deleted. A hyphen that is not flanked by word characters on both sides
    is then replaced with a single space.
    """
    without_punctuation = re.sub(r'[^\w\s-]', '', text)
    return re.sub(r'(?<!\w)-|-(?!\w)', ' ', without_punctuation)
71
+
72
def tokenize(sentence):
    """Return the whitespace-separated words of *sentence*.

    The sentence is first passed through ``remove_punctuations`` and the
    result is split on runs of whitespace.
    """
    return remove_punctuations(sentence).split()
76
+
77
+
78
def generate_bigrams(words):
    """Return the list of adjacent ``(word, next_word)`` pairs in *words*.

    A sequence with fewer than two items yields an empty list.
    """
    return list(zip(words, words[1:]))
81
+
82
def hash_bigram(bigram):
    """Return the built-in hash of *bigram* viewed as a tuple.

    Converting to a tuple first lets lists and tuples holding the same
    items hash to the same value.
    """
    return hash(tuple(bigram))
85
+
86
def find_matching_words(sentence1, sentence2):
    """Return the bigrams of *sentence2* that also occur in *sentence1*.

    Both sentences are tokenized with ``tokenize`` and turned into bigrams
    with ``generate_bigrams``.

    Args:
        sentence1: Sentence whose bigrams form the lookup set.
        sentence2: Sentence whose bigrams are tested against that set.

    Returns:
        A list of ``(word, next_word)`` tuples in the order they appear in
        ``sentence2``; a bigram that occurs several times is reported each
        time.
    """
    # Bigram tuples are hashable, so they go into the set directly; comparing
    # the tuples themselves (rather than their hash values) rules out false
    # positives from hash collisions.
    reference_bigrams = set(generate_bigrams(tokenize(sentence1)))

    # The matched bigram is returned as-is. Re-locating its words in the raw
    # sentence with str.find can only reproduce the same words when it
    # succeeds, and produces empty or wrong slices when a cleaned token
    # (e.g. one that lost an apostrophe) is not present in the raw text.
    return [
        bigram
        for bigram in generate_bigrams(tokenize(sentence2))
        if bigram in reference_bigrams
    ]
107
+
108
+
109
+
110
def combine_matching_bigrams(matching_bigrams):
    """Merge chained bigrams into space-joined phrases.

    Consecutive bigrams are chained when the first word of one equals the
    second word of the one before it; each chain becomes one phrase.

    Args:
        matching_bigrams: Sequence of word pairs, in sentence order.

    Returns:
        A list of phrases. An empty input yields an empty list.
    """
    if not matching_bigrams:
        return []

    combined_words = []
    current_phrase = list(matching_bigrams[0])

    for previous, bigram in zip(matching_bigrams, matching_bigrams[1:]):
        if bigram[0] == previous[1]:
            # The bigram continues the current chain: add only its new word.
            current_phrase.append(bigram[1])
        else:
            combined_words.append(' '.join(current_phrase))
            current_phrase = list(bigram)

    # Flush the chain that was still open when the input ran out.
    combined_words.append(' '.join(current_phrase))

    return combined_words


matching_bigrams_list = []
combined_words_list = []

for paraphrase in paraphrases:
    # Find matching words
    matching_words = find_matching_words(main_sentence, paraphrase)
    matching_bigrams_list.append(matching_words)

    # Combine matching bigrams into single words
    combined_words = combine_matching_bigrams(matching_words)
    combined_words_list.append(combined_words)
139
+
140
def remove_overlapping(input_set):
    """Return the strings of *input_set* not contained in a longer kept one.

    Strings are visited from longest to shortest; a string is kept only if
    it is not a substring of any string kept so far.

    Args:
        input_set: Iterable of strings.

    Returns:
        A set of the retained strings.
    """
    output_set = set()

    for word in sorted(input_set, key=len, reverse=True):
        if not any(word in existing_word for existing_word in output_set):
            output_set.add(word)

    return output_set
149
+
150
+
151
def find_longest_match(string1, string2):
    """Return the longest run of whole words of *string1* found in *string2*.

    Both strings are split on whitespace, so a match always starts and ends
    on a word boundary in both of them.

    Args:
        string1: Text whose word runs are the candidates.
        string2: Text searched for each candidate.

    Returns:
        The candidate with the most characters, joined with single spaces;
        the earliest one in ``string1`` wins a tie. An empty string is
        returned when no word is shared.
    """
    words = string1.split()
    # Pad with spaces so a candidate can only match whole words.
    haystack = ' ' + ' '.join(string2.split()) + ' '
    longest_match = ''

    for start in range(len(words)):
        for end in range(start + 1, len(words) + 1):
            candidate = ' '.join(words[start:end])
            if ' ' + candidate + ' ' not in haystack:
                # A longer run from the same start cannot match either.
                break
            if len(candidate) > len(longest_match):
                longest_match = candidate

    return longest_match
164
+
165
from functools import reduce
from itertools import product

common_substrings = set()
highlighted_text = []

# Take one phrase from every paraphrase's list and narrow it down, list by
# list, to the part shared by all of them. Iterating over the product of the
# lists works for any number of paraphrases instead of exactly five.
if combined_words_list:
    for phrases in product(*combined_words_list):
        matching_portion = reduce(find_longest_match, phrases)
        if matching_portion:
            common_substrings.add(matching_portion)
179
 
180
 
181
 
 
215
  color_palette = ["#FF0000", "#008000", "#0000FF", "#FF00FF", "#00FFFF"]
216
  highlighted_sentences = []
217
 
218
+
219
highlighted_sentence = main_sentence

overlap_free_substrings = remove_overlapping(common_substrings)
highlighted_text.extend(overlap_free_substrings)

if overlap_free_substrings:
    # Highlight in a single pass so that text inserted for one substring can
    # never be matched again by another; longer alternatives are listed
    # first so they win where two substrings start at the same position.
    highlight_pattern = "|".join(
        re.escape(substring)
        for substring in sorted(overlap_free_substrings, key=len, reverse=True)
    )
    # Markup is emitted as an HTML span (white on blue) because the text is
    # rendered by st.markdown; terminal colour escape codes are not markup.
    highlighted_sentence = re.sub(
        highlight_pattern,
        lambda match: (
            "<span style='color:white;background-color:blue'>"
            f"{match.group(0)}</span>"
        ),
        main_sentence,
    )

st.markdown("Common substrings that occur in all five lists:")
for substring in highlighted_text:
    st.markdown(substring)

st.markdown("\nHighlighted Main Sentence:")
# NOTE(review): unsafe_allow_html renders main_sentence as raw HTML; confirm
# the sentence comes from a trusted source or escape it before this point.
st.markdown(highlighted_sentence, unsafe_allow_html=True)
231
+
232
+
233
+
234
+
235
+
236
+
237
+ # # Highlighting sequences in main sentence and paraphrases
238
+ # for sentence in [main_sentence] + paraphrases:
239
+ # highlighted_sentence = sentence
240
+ # for i, sequence in enumerate(longest_common_sequences):
241
+ # color = color_palette[i % len(color_palette)]
242
+ # highlighted_sentence = highlighted_sentence.replace(sequence, f"<span style='color:{color}'>{sequence}</span>")
243
+ # highlighted_sentences.append(highlighted_sentence)
244
 
245
+ # # Display paraphrases with numbers
246
+ # st.markdown("**Paraphrases**:")
247
+ # for i, para in enumerate(paraphrases, 1):
248
+ # st.write(f"Paraphrase {i}:")
249
+ # st.write(para)
250
 
251
 
252
+ # # Displaying the main sentence with highlighted longest common sequences
253
+ # st.markdown("**Main sentence with highlighted longest common sequences**:")
254
+ # st.markdown(highlighted_sentences[0], unsafe_allow_html=True)
255
 
256
 
257
+ # st.markdown("**Paraphrases with highlighted longest common sequences**:")
258
+ # for paraphrase in highlighted_sentences[1:]:
259
+ # st.markdown(paraphrase, unsafe_allow_html=True)