NoaiGPT committed on
Commit
0363ba6
1 Parent(s): 9ea1b86
Files changed (1) hide show
  1. app.py +200 -5
app.py CHANGED
@@ -1,9 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import json
3
  import gradio as gr
4
  import spaces
5
  import torch
6
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification
7
  from sentence_splitter import SentenceSplitter
8
  from itertools import product
9
 
@@ -24,6 +207,11 @@ classifier_model_name = "andreas122001/roberta-mixed-detector"
24
  classifier_tokenizer = AutoTokenizer.from_pretrained(classifier_model_name)
25
  classifier_model = AutoModelForSequenceClassification.from_pretrained(classifier_model_name).to(device)
26
 
 
 
 
 
 
27
  # Initialize sentence splitter
28
  splitter = SentenceSplitter(language='en')
29
 
@@ -37,6 +225,12 @@ def classify_text(text):
37
  main_score = probabilities[0][predicted_class].item()
38
  return main_label, main_score
39
 
 
 
 
 
 
 
40
  @spaces.GPU
41
  def generate_paraphrases(text, setting, output_format):
42
  sentences = splitter.split(text)
@@ -88,7 +282,7 @@ def generate_paraphrases(text, setting, output_format):
88
  }
89
 
90
  for i, sentence in enumerate(sentences):
91
- inputs = paraphraser_tokenizer(f'{sentence}', return_tensors="pt", padding="longest", truncation=True, max_length=max_length).to(device)
92
 
93
  # Generate paraphrases using the specified parameters
94
  outputs = paraphraser_model.generate(
@@ -133,11 +327,12 @@ def generate_paraphrases(text, setting, output_format):
133
  # Classify combined versions
134
  human_versions = []
135
  for i, version in enumerate(combined_versions, 1):
136
- label, score = classify_text(version)
137
- formatted_output += f"Version {i}:\n{version}\n"
 
138
  formatted_output += f"Classification: {label} (confidence: {score:.2%})\n\n"
139
  if label == "human-produced" or (label == "machine-generated" and score < 0.98):
140
- human_versions.append((version, label, score))
141
 
142
  formatted_output += "\nHuman-like or Less Confident Machine-generated versions:\n"
143
  for i, (version, label, score) in enumerate(human_versions, 1):
 
1
+ # import os
2
+ # import json
3
+ # import gradio as gr
4
+ # import spaces
5
+ # import torch
6
+ # from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification
7
+ # from sentence_splitter import SentenceSplitter
8
+ # from itertools import product
9
+
10
+ # # Get the Hugging Face token from environment variable
11
+ # hf_token = os.getenv('HF_TOKEN')
12
+
13
+ # cuda_available = torch.cuda.is_available()
14
+ # device = torch.device("cuda" if cuda_available else "cpu")
15
+ # print(f"Using device: {device}")
16
+
17
+ # # Initialize paraphraser model and tokenizer
18
+ # paraphraser_model_name = "NoaiGPT/777"
19
+ # paraphraser_tokenizer = AutoTokenizer.from_pretrained(paraphraser_model_name, use_auth_token=hf_token)
20
+ # paraphraser_model = AutoModelForSeq2SeqLM.from_pretrained(paraphraser_model_name, use_auth_token=hf_token).to(device)
21
+
22
+ # # Initialize classifier model and tokenizer
23
+ # classifier_model_name = "andreas122001/roberta-mixed-detector"
24
+ # classifier_tokenizer = AutoTokenizer.from_pretrained(classifier_model_name)
25
+ # classifier_model = AutoModelForSequenceClassification.from_pretrained(classifier_model_name).to(device)
26
+
27
+ # # Initialize sentence splitter
28
+ # splitter = SentenceSplitter(language='en')
29
+
30
+ # def classify_text(text):
31
+ # inputs = classifier_tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(device)
32
+ # with torch.no_grad():
33
+ # outputs = classifier_model(**inputs)
34
+ # probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
35
+ # predicted_class = torch.argmax(probabilities, dim=-1).item()
36
+ # main_label = classifier_model.config.id2label[predicted_class]
37
+ # main_score = probabilities[0][predicted_class].item()
38
+ # return main_label, main_score
39
+
40
+ # @spaces.GPU
41
+ # def generate_paraphrases(text, setting, output_format):
42
+ # sentences = splitter.split(text)
43
+ # all_sentence_paraphrases = []
44
+
45
+ # if setting == 1:
46
+ # num_return_sequences = 5
47
+ # repetition_penalty = 1.1
48
+ # no_repeat_ngram_size = 2
49
+ # temperature = 1.0
50
+ # max_length = 128
51
+ # elif setting == 2:
52
+ # num_return_sequences = 10
53
+ # repetition_penalty = 1.2
54
+ # no_repeat_ngram_size = 3
55
+ # temperature = 1.2
56
+ # max_length = 192
57
+ # elif setting == 3:
58
+ # num_return_sequences = 15
59
+ # repetition_penalty = 1.3
60
+ # no_repeat_ngram_size = 4
61
+ # temperature = 1.4
62
+ # max_length = 256
63
+ # elif setting == 4:
64
+ # num_return_sequences = 20
65
+ # repetition_penalty = 1.4
66
+ # no_repeat_ngram_size = 5
67
+ # temperature = 1.6
68
+ # max_length = 320
69
+ # else:
70
+ # num_return_sequences = 25
71
+ # repetition_penalty = 1.5
72
+ # no_repeat_ngram_size = 6
73
+ # temperature = 1.8
74
+ # max_length = 384
75
+
76
+ # top_k = 50
77
+ # top_p = 0.95
78
+ # length_penalty = 1.0
79
+
80
+ # formatted_output = "Original text:\n" + text + "\n\n"
81
+ # formatted_output += "Paraphrased versions:\n"
82
+
83
+ # json_output = {
84
+ # "original_text": text,
85
+ # "paraphrased_versions": [],
86
+ # "combined_versions": [],
87
+ # "human_like_versions": []
88
+ # }
89
+
90
+ # for i, sentence in enumerate(sentences):
91
+ # inputs = paraphraser_tokenizer(f'{sentence}', return_tensors="pt", padding="longest", truncation=True, max_length=max_length).to(device)
92
+
93
+ # # Generate paraphrases using the specified parameters
94
+ # outputs = paraphraser_model.generate(
95
+ # inputs.input_ids,
96
+ # attention_mask=inputs.attention_mask,
97
+ # num_return_sequences=num_return_sequences,
98
+ # repetition_penalty=repetition_penalty,
99
+ # no_repeat_ngram_size=no_repeat_ngram_size,
100
+ # temperature=temperature,
101
+ # max_length=max_length,
102
+ # top_k=top_k,
103
+ # top_p=top_p,
104
+ # do_sample=True,
105
+ # early_stopping=False,
106
+ # length_penalty=length_penalty
107
+ # )
108
+
109
+ # paraphrases = paraphraser_tokenizer.batch_decode(outputs, skip_special_tokens=True)
110
+
111
+ # formatted_output += f"Original sentence {i+1}: {sentence}\n"
112
+ # for j, paraphrase in enumerate(paraphrases, 1):
113
+ # formatted_output += f" Paraphrase {j}: {paraphrase}\n"
114
+
115
+ # json_output["paraphrased_versions"].append({
116
+ # f"original_sentence_{i+1}": sentence,
117
+ # "paraphrases": paraphrases
118
+ # })
119
+
120
+ # all_sentence_paraphrases.append(paraphrases)
121
+ # formatted_output += "\n"
122
+
123
+ # all_combinations = list(product(*all_sentence_paraphrases))
124
+
125
+ # formatted_output += "\nCombined paraphrased versions:\n"
126
+ # combined_versions = []
127
+ # for i, combination in enumerate(all_combinations[:50], 1): # Limit to 50 combinations
128
+ # combined_paraphrase = " ".join(combination)
129
+ # combined_versions.append(combined_paraphrase)
130
+
131
+ # json_output["combined_versions"] = combined_versions
132
+
133
+ # # Classify combined versions
134
+ # human_versions = []
135
+ # for i, version in enumerate(combined_versions, 1):
136
+ # label, score = classify_text(version)
137
+ # formatted_output += f"Version {i}:\n{version}\n"
138
+ # formatted_output += f"Classification: {label} (confidence: {score:.2%})\n\n"
139
+ # if label == "human-produced" or (label == "machine-generated" and score < 0.98):
140
+ # human_versions.append((version, label, score))
141
+
142
+ # formatted_output += "\nHuman-like or Less Confident Machine-generated versions:\n"
143
+ # for i, (version, label, score) in enumerate(human_versions, 1):
144
+ # formatted_output += f"Version {i}:\n{version}\n"
145
+ # formatted_output += f"Classification: {label} (confidence: {score:.2%})\n\n"
146
+
147
+ # json_output["human_like_versions"] = [
148
+ # {"version": version, "label": label, "confidence_score": score}
149
+ # for version, label, score in human_versions
150
+ # ]
151
+
152
+ # # If no human-like versions, include the top 5 least confident machine-generated versions
153
+ # if not human_versions:
154
+ # human_versions = sorted([(v, l, s) for v, l, s in zip(combined_versions, [classify_text(v)[0] for v in combined_versions], [classify_text(v)[1] for v in combined_versions])], key=lambda x: x[2])[:5]
155
+ # formatted_output += "\nNo human-like versions found. Showing top 5 least confident machine-generated versions:\n"
156
+ # for i, (version, label, score) in enumerate(human_versions, 1):
157
+ # formatted_output += f"Version {i}:\n{version}\n"
158
+ # formatted_output += f"Classification: {label} (confidence: {score:.2%})\n\n"
159
+
160
+ # if output_format == "text":
161
+ # return formatted_output, "\n\n".join([v[0] for v in human_versions])
162
+ # else:
163
+ # return json.dumps(json_output, indent=2), "\n\n".join([v[0] for v in human_versions])
164
+
165
+ # # Define the Gradio interface
166
+ # iface = gr.Interface(
167
+ # fn=generate_paraphrases,
168
+ # inputs=[
169
+ # gr.Textbox(lines=5, label="Input Text"),
170
+ # gr.Slider(minimum=1, maximum=5, step=1, label="Readability to Human-like Setting"),
171
+ # gr.Radio(["text", "json"], label="Output Format")
172
+ # ],
173
+ # outputs=[
174
+ # gr.Textbox(lines=20, label="Detailed Paraphrases and Classifications"),
175
+ # gr.Textbox(lines=10, label="Human-like or Less Confident Machine-generated Paraphrases")
176
+ # ],
177
+ # title="Advanced Diverse Paraphraser with Human-like Filter",
178
+ # description="Enter a text, select a setting from readable to human-like, and choose the output format to generate diverse paraphrased versions. Combined versions are classified, and those detected as human-produced or less confidently machine-generated are presented in the final output."
179
+ # )
180
+
181
+ # # Launch the interface
182
+ # iface.launch()
183
+
184
  import os
185
  import json
186
  import gradio as gr
187
  import spaces
188
  import torch
189
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification, T5ForConditionalGeneration
190
  from sentence_splitter import SentenceSplitter
191
  from itertools import product
192
 
 
207
  classifier_tokenizer = AutoTokenizer.from_pretrained(classifier_model_name)
208
  classifier_model = AutoModelForSequenceClassification.from_pretrained(classifier_model_name).to(device)
209
 
210
+ # Initialize grammar correction model and tokenizer
211
+ grammar_model_name = "grammarly/coedit-large"
212
+ grammar_tokenizer = AutoTokenizer.from_pretrained(grammar_model_name)
213
+ grammar_model = T5ForConditionalGeneration.from_pretrained(grammar_model_name).to(device)
214
+
215
  # Initialize sentence splitter
216
  splitter = SentenceSplitter(language='en')
217
 
 
225
  main_score = probabilities[0][predicted_class].item()
226
  return main_label, main_score
227
 
228
def correct_grammar(text, max_length=256):
    """Return *text* with grammatical errors fixed by the grammar model.

    The text is wrapped in the instruction prompt expected by the grammar
    correction model, run through ``grammar_model.generate`` and decoded
    back to a plain string.

    Args:
        text: The text to correct.
        max_length: Upper bound (in tokens) passed to ``generate`` for the
            corrected output. Defaults to 256, the value previously
            hard-coded here; raise it for long multi-sentence inputs.

    Returns:
        The corrected text. Empty or whitespace-only input is returned
        unchanged without invoking the model.
    """
    # Nothing to correct: skip an expensive generation on a bare prompt,
    # which would otherwise return arbitrary model output.
    if not text or not text.strip():
        return text

    prompt = f'Fix grammatical errors in this sentence: {text}'
    encoded = grammar_tokenizer(prompt, return_tensors="pt").to(device)

    # Pass the attention mask explicitly rather than letting generate() infer it.
    outputs = grammar_model.generate(
        encoded.input_ids,
        attention_mask=encoded.attention_mask,
        max_length=max_length,
    )
    return grammar_tokenizer.decode(outputs[0], skip_special_tokens=True)
233
+
234
  @spaces.GPU
235
  def generate_paraphrases(text, setting, output_format):
236
  sentences = splitter.split(text)
 
282
  }
283
 
284
  for i, sentence in enumerate(sentences):
285
+ inputs = paraphraser_tokenizer(f'paraphraser: {sentence}', return_tensors="pt", padding="longest", truncation=True, max_length=max_length).to(device)
286
 
287
  # Generate paraphrases using the specified parameters
288
  outputs = paraphraser_model.generate(
 
327
  # Classify combined versions
328
  human_versions = []
329
  for i, version in enumerate(combined_versions, 1):
330
+ corrected_version = correct_grammar(version)
331
+ label, score = classify_text(corrected_version)
332
+ formatted_output += f"Version {i}:\n{corrected_version}\n"
333
  formatted_output += f"Classification: {label} (confidence: {score:.2%})\n\n"
334
  if label == "human-produced" or (label == "machine-generated" and score < 0.98):
335
+ human_versions.append((corrected_version, label, score))
336
 
337
  formatted_output += "\nHuman-like or Less Confident Machine-generated versions:\n"
338
  for i, (version, label, score) in enumerate(human_versions, 1):