sashtech committed
Commit 858fdd3
1 Parent(s): 8220ab9

Update app.py

Files changed (1):
  1. app.py +122 -156

app.py CHANGED
@@ -5,7 +5,28 @@ import spacy
 import subprocess
 import nltk
 from nltk.corpus import wordnet
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
 from spellchecker import SpellChecker
+import re
+import string
+import random
+
+# Download necessary NLTK data
+nltk.download('punkt')
+nltk.download('stopwords')
+nltk.download('averaged_perceptron_tagger')
+nltk.download('averaged_perceptron_tagger_eng')
+nltk.download('wordnet')
+nltk.download('omw-1.4')
+nltk.download('punkt_tab')
+
+# Initialize stopwords
+stop_words = set(stopwords.words("english"))
+
+# Words we don't want to replace
+exclude_tags = {'PRP', 'PRP$', 'MD', 'VBZ', 'VBP', 'VBD', 'VBG', 'VBN', 'TO', 'IN', 'DT', 'CC'}
+exclude_words = {'is', 'am', 'are', 'was', 'were', 'have', 'has', 'do', 'does', 'did', 'will', 'shall', 'should', 'would', 'could', 'can', 'may', 'might'}
 
 # Initialize the English text classification pipeline for AI detection
 pipeline_en = pipeline(task="text-classification", model="Hello-SimpleAI/chatgpt-detector-roberta")
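Note: the seven unconditional nltk.download() calls added above run on every app start. A minimal guard, assuming the standard nltk.data layout (the ensure_nltk helper is illustrative, not part of the commit):

    import nltk

    def ensure_nltk(resource, path):
        # download only if the resource is not already on disk
        try:
            nltk.data.find(path)
        except LookupError:
            nltk.download(resource)

    ensure_nltk('punkt', 'tokenizers/punkt')
    ensure_nltk('stopwords', 'corpora/stopwords')
    ensure_nltk('wordnet', 'corpora/wordnet')
    # extend the same pattern to the remaining resources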
@@ -13,10 +34,6 @@ pipeline_en = pipeline(task="text-classification", model="Hello-SimpleAI/chatgpt
 # Initialize the spell checker
 spell = SpellChecker()
 
-# Ensure necessary NLTK data is downloaded
-nltk.download('wordnet')
-nltk.download('omw-1.4')
-
 # Ensure the SpaCy model is installed
 try:
     nlp = spacy.load("en_core_web_sm")
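Note: the fallback below shells out to a bare "python", which may resolve to a different interpreter than the one running the app. A safer variant (a sketch, not what the commit does):

    import subprocess, sys

    # target the interpreter that is actually running this script
    subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_sm"], check=True)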
@@ -24,27 +41,74 @@ except OSError:
     subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
     nlp = spacy.load("en_core_web_sm")
 
-# Function to predict the label and score for English text (AI Detection)
+def plagiarism_removal(text):
+    def plagiarism_remover(word):
+        if word.lower() in stop_words or word.lower() in exclude_words or word in string.punctuation:
+            return word
+
+        # Find synonyms
+        synonyms = set()
+        for syn in wordnet.synsets(word):
+            for lemma in syn.lemmas():
+                if "_" not in lemma.name() and lemma.name().isalpha() and lemma.name().lower() != word.lower():
+                    synonyms.add(lemma.name())
+
+        pos_tag_word = nltk.pos_tag([word])[0]
+
+        if pos_tag_word[1] in exclude_tags:
+            return word
+
+        filtered_synonyms = [syn for syn in synonyms if nltk.pos_tag([syn])[0][1] == pos_tag_word[1]]
+
+        if not filtered_synonyms:
+            return word
+
+        synonym_choice = random.choice(filtered_synonyms)
+
+        if word.istitle():
+            return synonym_choice.title()
+        return synonym_choice
+
+    para_split = word_tokenize(text)
+    final_text = [plagiarism_remover(word) for word in para_split]
+
+    corrected_text = []
+    for i in range(len(final_text)):
+        if final_text[i] in string.punctuation and i > 0:
+            corrected_text[-1] += final_text[i]
+        else:
+            corrected_text.append(final_text[i])
+
+    return " ".join(corrected_text)
+
 def predict_en(text):
     res = pipeline_en(text)[0]
     return res['label'], res['score']
 
-# Function to get synonyms using NLTK WordNet
-def get_synonyms_nltk(word, pos):
-    synsets = wordnet.synsets(word, pos=pos)
-    if synsets:
-        lemmas = synsets[0].lemmas()
-        return [lemma.name() for lemma in lemmas]
-    return []
-
-# Function to remove redundant and meaningless words
 def remove_redundant_words(text):
     doc = nlp(text)
     meaningless_words = {"actually", "basically", "literally", "really", "very", "just"}
     filtered_text = [token.text for token in doc if token.text.lower() not in meaningless_words]
     return ' '.join(filtered_text)
 
-# Function to capitalize the first letter of sentences and proper nouns
+def fix_punctuation_spacing(text):
+    words = text.split(' ')
+    cleaned_words = []
+    punctuation_marks = {',', '.', "'", '!', '?', ':'}
+
+    for word in words:
+        if cleaned_words and word and word[0] in punctuation_marks:
+            cleaned_words[-1] += word
+        else:
+            cleaned_words.append(word)
+
+    return ' '.join(cleaned_words).replace(' ,', ',').replace(' .', '.').replace(" '", "'") \
+                                  .replace(' !', '!').replace(' ?', '?').replace(' :', ':')
+
+def fix_possessives(text):
+    text = re.sub(r'(\w)\s\'\s?s', r"\1's", text)
+    return text
+
 def capitalize_sentences_and_nouns(text):
     doc = nlp(text)
     corrected_text = []
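Note: the new plagiarism_remover tags each word in isolation, so nltk.pos_tag([word]) never sees sentence context and will often mislabel ambiguous tokens ("saw", "run"). A sketch of tagging the sentence once and reusing the pairs (the helper name is illustrative):

    import nltk
    from nltk.tokenize import word_tokenize

    def tagged_tokens(text):
        # one pos_tag call over the full token list keeps contextual information
        return nltk.pos_tag(word_tokenize(text))  # [(token, tag), ...]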
@@ -52,9 +116,9 @@ def capitalize_sentences_and_nouns(text):
     for sent in doc.sents:
         sentence = []
         for token in sent:
-            if token.i == sent.start:  # First word of the sentence
+            if token.i == sent.start:
                 sentence.append(token.text.capitalize())
-            elif token.pos_ == "PROPN":  # Proper noun
+            elif token.pos_ == "PROPN":
                 sentence.append(token.text.capitalize())
             else:
                 sentence.append(token.text)
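Note: capitalize_sentences_and_nouns reassembles tokens with plain spaces, which detaches punctuation ("hello , world .") and is what the new fix_punctuation_spacing pass later has to undo. A spaCy-native alternative that preserves the original spacing (a sketch, assuming doc = nlp(text) as in the function, not part of the commit):

    corrected = ''.join(
        (tok.text.capitalize() if tok.i == tok.sent.start or tok.pos_ == "PROPN" else tok.text)
        + tok.whitespace_  # each token carries its own trailing whitespace
        for tok in doc
    )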
@@ -62,13 +126,19 @@ def capitalize_sentences_and_nouns(text):
 
     return ' '.join(corrected_text)
 
-# Function to force capitalization of the first letter of every sentence (NEW)
 def force_first_letter_capital(text):
-    sentences = text.split(". ")  # Split by period to get each sentence
-    capitalized_sentences = [sentence[0].capitalize() + sentence[1:] if sentence else "" for sentence in sentences]
-    return ". ".join(capitalized_sentences)
+    sentences = re.split(r'(?<=\w[.!?])\s+', text)
+    capitalized_sentences = []
+
+    for sentence in sentences:
+        if sentence:
+            capitalized_sentence = sentence[0].capitalize() + sentence[1:]
+            if not re.search(r'[.!?]$', capitalized_sentence):
+                capitalized_sentence += '.'
+            capitalized_sentences.append(capitalized_sentence)
+
+    return " ".join(capitalized_sentences)
 
-# Function to correct tense errors in a sentence
 def correct_tense_errors(text):
     doc = nlp(text)
     corrected_text = []
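The lookbehind split now honours '.', '!' and '?' while keeping each terminator attached to its sentence, e.g.:

    import re
    print(re.split(r'(?<=\w[.!?])\s+', "hello there. how are you? fine!"))
    # ['hello there.', 'how are you?', 'fine!']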
@@ -80,29 +150,6 @@
         corrected_text.append(token.text)
     return ' '.join(corrected_text)
 
-# Function to correct singular/plural errors
-def correct_singular_plural_errors(text):
-    doc = nlp(text)
-    corrected_text = []
-
-    for token in doc:
-        if token.pos_ == "NOUN":
-            if token.tag_ == "NN":  # Singular noun
-                if any(child.text.lower() in ['many', 'several', 'few'] for child in token.head.children):
-                    corrected_text.append(token.lemma_ + 's')
-                else:
-                    corrected_text.append(token.text)
-            elif token.tag_ == "NNS":  # Plural noun
-                if any(child.text.lower() in ['a', 'one'] for child in token.head.children):
-                    corrected_text.append(token.lemma_)
-                else:
-                    corrected_text.append(token.text)
-        else:
-            corrected_text.append(token.text)
-
-    return ' '.join(corrected_text)
-
-# Function to check and correct article errors
 def correct_article_errors(text):
     doc = nlp(text)
     corrected_text = []
@@ -119,131 +166,52 @@ def correct_article_errors(text):
         corrected_text.append(token.text)
     return ' '.join(corrected_text)
 
-# Function to get the correct synonym while maintaining verb form
-def replace_with_synonym(token):
-    pos = None
-    if token.pos_ == "VERB":
-        pos = wordnet.VERB
-    elif token.pos_ == "NOUN":
-        pos = wordnet.NOUN
-    elif token.pos_ == "ADJ":
-        pos = wordnet.ADJ
-    elif token.pos_ == "ADV":
-        pos = wordnet.ADV
-
-    synonyms = get_synonyms_nltk(token.lemma_, pos)
-
-    if synonyms:
-        synonym = synonyms[0]
-        if token.tag_ == "VBG":  # Present participle (e.g., running)
-            synonym = synonym + 'ing'
-        elif token.tag_ == "VBD" or token.tag_ == "VBN":  # Past tense or past participle
-            synonym = synonym + 'ed'
-        elif token.tag_ == "VBZ":  # Third-person singular present
-            synonym = synonym + 's'
-        return synonym
-    return token.text
-
-# Function to check for and avoid double negatives
-def correct_double_negatives(text):
-    doc = nlp(text)
-    corrected_text = []
-    for token in doc:
-        if token.text.lower() == "not" and any(child.text.lower() == "never" for child in token.head.children):
-            corrected_text.append("always")
-        else:
-            corrected_text.append(token.text)
-    return ' '.join(corrected_text)
-
-# Function to ensure subject-verb agreement
 def ensure_subject_verb_agreement(text):
     doc = nlp(text)
     corrected_text = []
     for token in doc:
         if token.dep_ == "nsubj" and token.head.pos_ == "VERB":
-            if token.tag_ == "NN" and token.head.tag_ != "VBZ":  # Singular noun, should use singular verb
+            if token.tag_ == "NN" and token.head.tag_ != "VBZ":
                 corrected_text.append(token.head.lemma_ + "s")
-            elif token.tag_ == "NNS" and token.head.tag_ == "VBZ":  # Plural noun, should not use singular verb
+            elif token.tag_ == "NNS" and token.head.tag_ == "VBZ":
                 corrected_text.append(token.head.lemma_)
         corrected_text.append(token.text)
     return ' '.join(corrected_text)
 
-# Function to correct spelling errors
 def correct_spelling(text):
-    words = text.split()
+    words = word_tokenize(text)
     corrected_words = []
-    for word in words:
-        corrected_word = spell.correction(word)
-        if corrected_word is not None:
-            corrected_words.append(corrected_word)
-        else:
-            corrected_words.append(word)  # Keep the original word if correction is None
-    return ' '.join(corrected_words)
+
+    for word in words:
+        corrected_word = spell.candidates(word)
+        if corrected_word:
+            corrected_words.append(spell.candidates(word).pop())  # Pop an arbitrary candidate (candidate sets are unordered)
+        else:
+            corrected_words.append(word)  # No candidates found: keep the original word
+
+    return ' '.join(corrected_words)
 
-# Function to rephrase text and replace words with their synonyms while maintaining form
-def rephrase_with_synonyms(text):
-    doc = nlp(text)
-    rephrased_text = []
-
-    for token in doc:
-        pos_tag = None
-        if token.pos_ == "NOUN":
-            pos_tag = wordnet.NOUN
-        elif token.pos_ == "VERB":
-            pos_tag = wordnet.VERB
-        elif token.pos_ == "ADJ":
-            pos_tag = wordnet.ADJ
-        elif token.pos_ == "ADV":
-            pos_tag = wordnet.ADV
-
-        if pos_tag:
-            synonyms = get_synonyms_nltk(token.text, pos_tag)
-            if synonyms:
-                synonym = synonyms[0]  # Just using the first synonym for simplicity
-                if token.pos_ == "VERB":
-                    if token.tag_ == "VBG":  # Present participle (e.g., running)
-                        synonym = synonym + 'ing'
-                    elif token.tag_ == "VBD" or token.tag_ == "VBN":  # Past tense or past participle
-                        synonym = synonym + 'ed'
-                    elif token.tag_ == "VBZ":  # Third-person singular present
-                        synonym = synonym + 's'
-                elif token.pos_ == "NOUN" and token.tag_ == "NNS":  # Plural nouns
-                    synonym += 's' if not synonym.endswith('s') else ""
-                rephrased_text.append(synonym)
-            else:
-                rephrased_text.append(token.text)
-        else:
-            rephrased_text.append(token.text)
-
-    return ' '.join(rephrased_text)
-
-# Function to paraphrase and correct grammar with enhanced accuracy
 def paraphrase_and_correct(text):
-    # Remove meaningless or redundant words first
-    cleaned_text = remove_redundant_words(text)
-
-    # Capitalize sentences and nouns
-    paraphrased_text = capitalize_sentences_and_nouns(cleaned_text)
-
-    # Ensure first letter of each sentence is capitalized
-    paraphrased_text = force_first_letter_capital(paraphrased_text)
-
-    # Apply grammatical corrections
-    paraphrased_text = correct_article_errors(paraphrased_text)
-    paraphrased_text = correct_singular_plural_errors(paraphrased_text)
-    paraphrased_text = correct_tense_errors(paraphrased_text)
-    paraphrased_text = correct_double_negatives(paraphrased_text)
-    paraphrased_text = ensure_subject_verb_agreement(paraphrased_text)
-
-    # Rephrase with synonyms while maintaining grammatical forms
-    paraphrased_text = rephrase_with_synonyms(paraphrased_text)
-
-    # Correct spelling errors
-    paraphrased_text = correct_spelling(paraphrased_text)
-
-    return paraphrased_text
-
-# Gradio app setup with two tabs
+    paragraphs = text.split("\n\n")  # Split by paragraphs
+
+    # Process each paragraph separately
+    processed_paragraphs = []
+    for paragraph in paragraphs:
+        cleaned_text = remove_redundant_words(paragraph)
+        plag_removed = plagiarism_removal(cleaned_text)
+        paraphrased_text = capitalize_sentences_and_nouns(plag_removed)
+        paraphrased_text = force_first_letter_capital(paraphrased_text)
+        paraphrased_text = correct_article_errors(paraphrased_text)
+        paraphrased_text = correct_tense_errors(paraphrased_text)
+        paraphrased_text = ensure_subject_verb_agreement(paraphrased_text)
+        paraphrased_text = fix_possessives(paraphrased_text)
+        paraphrased_text = correct_spelling(paraphrased_text)  # Spelling correction
+        paraphrased_text = fix_punctuation_spacing(paraphrased_text)
+        processed_paragraphs.append(paraphrased_text)
+
+    return "\n\n".join(processed_paragraphs)  # Reassemble the text with paragraphs
+
+# Gradio app setup
 with gr.Blocks() as demo:
     with gr.Tab("AI Detection"):
         t1 = gr.Textbox(lines=5, label='Text')
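Note: spell.candidates() returns an unordered set, so set.pop() does not pick "the first" or best candidate, and repeated runs can yield different corrections. The spell.correction() API used by the old code already returns the single most probable fix; a deterministic sketch, assuming the module's spell and word_tokenize are in scope:

    def correct_spelling(text):
        corrected = []
        for word in word_tokenize(text):
            fix = spell.correction(word)  # most probable correction, or None for unknown tokens
            corrected.append(fix if fix else word)
        return ' '.join(corrected)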
@@ -251,7 +219,6 @@ with gr.Blocks() as demo:
         label1 = gr.Textbox(lines=1, label='Predicted Label 🎃')
         score1 = gr.Textbox(lines=1, label='Prob')
 
-        # Connect the prediction function to the button
         button1.click(fn=predict_en, inputs=t1, outputs=[label1, score1])
 
     with gr.Tab("Paraphrasing & Grammar Correction"):
@@ -259,7 +226,6 @@ with gr.Blocks() as demo:
         button2 = gr.Button("🔄 Paraphrase and Correct")
         result2 = gr.Textbox(lines=5, label='Corrected Text')
 
-        # Connect the paraphrasing and correction function to the button
         button2.click(fn=paraphrase_and_correct, inputs=t2, outputs=result2)
 
-demo.launch(share=True)  # Share=True to create a public link
+demo.launch(share=True)
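A quick way to exercise the new pipeline without the Gradio UI (the sample string is illustrative only):

    sample = "actually the cats is running. the dog dont like it ' s food"
    print(paraphrase_and_correct(sample))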
 