# Spaces: Build error
# # # import os | |
# # # import json | |
# # # import gradio as gr | |
# # # import spaces | |
# # # import torch | |
# # # from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification | |
# # # from sentence_splitter import SentenceSplitter | |
# # # from itertools import product | |
# # # # Get the Hugging Face token from environment variable | |
# # # hf_token = os.getenv('HF_TOKEN') | |
# # # cuda_available = torch.cuda.is_available() | |
# # # device = torch.device("cpu" if cuda_available else "cpu") | |
# # # print(f"Using device: {device}") | |
# # # # Initialize paraphraser model and tokenizer | |
# # # paraphraser_model_name = "NoaiGPT/777" | |
# # # paraphraser_tokenizer = AutoTokenizer.from_pretrained(paraphraser_model_name, use_auth_token=hf_token) | |
# # # paraphraser_model = AutoModelForSeq2SeqLM.from_pretrained(paraphraser_model_name, use_auth_token=hf_token).to(device) | |
# # # # Initialize classifier model and tokenizer | |
# # # classifier_model_name = "andreas122001/roberta-mixed-detector" | |
# # # classifier_tokenizer = AutoTokenizer.from_pretrained(classifier_model_name) | |
# # # classifier_model = AutoModelForSequenceClassification.from_pretrained(classifier_model_name).to(device) | |
# # # # Initialize sentence splitter | |
# # # splitter = SentenceSplitter(language='en') | |
# # # def classify_text(text): | |
# # # inputs = classifier_tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(device) | |
# # # with torch.no_grad(): | |
# # # outputs = classifier_model(**inputs) | |
# # # probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1) | |
# # # predicted_class = torch.argmax(probabilities, dim=-1).item() | |
# # # main_label = classifier_model.config.id2label[predicted_class] | |
# # # main_score = probabilities[0][predicted_class].item() | |
# # # return main_label, main_score | |
# # # # @spaces.GPU | |
# # # def generate_paraphrases(text, setting, output_format): | |
# # # sentences = splitter.split(text) | |
# # # all_sentence_paraphrases = [] | |
# # # if setting == 1: | |
# # # num_return_sequences = 5 | |
# # # repetition_penalty = 1.1 | |
# # # no_repeat_ngram_size = 2 | |
# # # temperature = 1.0 | |
# # # max_length = 128 | |
# # # elif setting == 2: | |
# # # num_return_sequences = 10 | |
# # # repetition_penalty = 1.2 | |
# # # no_repeat_ngram_size = 3 | |
# # # temperature = 1.2 | |
# # # max_length = 192 | |
# # # elif setting == 3: | |
# # # num_return_sequences = 15 | |
# # # repetition_penalty = 1.3 | |
# # # no_repeat_ngram_size = 4 | |
# # # temperature = 1.4 | |
# # # max_length = 256 | |
# # # elif setting == 4: | |
# # # num_return_sequences = 20 | |
# # # repetition_penalty = 1.4 | |
# # # no_repeat_ngram_size = 5 | |
# # # temperature = 1.6 | |
# # # max_length = 320 | |
# # # else: | |
# # # num_return_sequences = 25 | |
# # # repetition_penalty = 1.5 | |
# # # no_repeat_ngram_size = 6 | |
# # # temperature = 1.8 | |
# # # max_length = 384 | |
# # # top_k = 50 | |
# # # top_p = 0.95 | |
# # # length_penalty = 1.0 | |
# # # formatted_output = "Original text:\n" + text + "\n\n" | |
# # # formatted_output += "Paraphrased versions:\n" | |
# # # json_output = { | |
# # # "original_text": text, | |
# # # "paraphrased_versions": [], | |
# # # "combined_versions": [], | |
# # # "human_like_versions": [] | |
# # # } | |
# # # for i, sentence in enumerate(sentences): | |
# # # inputs = paraphraser_tokenizer(f'paraphraser: {sentence}', return_tensors="pt", padding="longest", truncation=True, max_length=max_length).to(device) | |
# # # # Generate paraphrases using the specified parameters | |
# # # outputs = paraphraser_model.generate( | |
# # # inputs.input_ids, | |
# # # attention_mask=inputs.attention_mask, | |
# # # num_return_sequences=num_return_sequences, | |
# # # repetition_penalty=repetition_penalty, | |
# # # no_repeat_ngram_size=no_repeat_ngram_size, | |
# # # temperature=temperature, | |
# # # max_length=max_length, | |
# # # top_k=top_k, | |
# # # top_p=top_p, | |
# # # do_sample=True, | |
# # # early_stopping=False, | |
# # # length_penalty=length_penalty | |
# # # ) | |
# # # paraphrases = paraphraser_tokenizer.batch_decode(outputs, skip_special_tokens=True) | |
# # # formatted_output += f"Original sentence {i+1}: {sentence}\n" | |
# # # for j, paraphrase in enumerate(paraphrases, 1): | |
# # # formatted_output += f" Paraphrase {j}: {paraphrase}\n" | |
# # # json_output["paraphrased_versions"].append({ | |
# # # f"original_sentence_{i+1}": sentence, | |
# # # "paraphrases": paraphrases | |
# # # }) | |
# # # all_sentence_paraphrases.append(paraphrases) | |
# # # formatted_output += "\n" | |
# # # all_combinations = list(product(*all_sentence_paraphrases)) | |
# # # formatted_output += "\nCombined paraphrased versions:\n" | |
# # # combined_versions = [] | |
# # # for i, combination in enumerate(all_combinations[:50], 1): # Limit to 50 combinations | |
# # # combined_paraphrase = " ".join(combination) | |
# # # combined_versions.append(combined_paraphrase) | |
# # # json_output["combined_versions"] = combined_versions | |
# # # # Classify combined versions | |
# # # human_versions = [] | |
# # # for i, version in enumerate(combined_versions, 1): | |
# # # label, score = classify_text(version) | |
# # # formatted_output += f"Version {i}:\n{version}\n" | |
# # # formatted_output += f"Classification: {label} (confidence: {score:.2%})\n\n" | |
# # # if label == "human-produced" or (label == "machine-generated" and score < 0.98): | |
# # # human_versions.append((version, label, score)) | |
# # # formatted_output += "\nHuman-like or Less Confident Machine-generated versions:\n" | |
# # # for i, (version, label, score) in enumerate(human_versions, 1): | |
# # # formatted_output += f"Version {i}:\n{version}\n" | |
# # # formatted_output += f"Classification: {label} (confidence: {score:.2%})\n\n" | |
# # # json_output["human_like_versions"] = [ | |
# # # {"version": version, "label": label, "confidence_score": score} | |
# # # for version, label, score in human_versions | |
# # # ] | |
# # # # If no human-like versions, include the top 5 least confident machine-generated versions | |
# # # if not human_versions: | |
# # # human_versions = sorted([(v, l, s) for v, l, s in zip(combined_versions, [classify_text(v)[0] for v in combined_versions], [classify_text(v)[1] for v in combined_versions])], key=lambda x: x[2])[:5] | |
# # # formatted_output += "\nNo human-like versions found. Showing top 5 least confident machine-generated versions:\n" | |
# # # for i, (version, label, score) in enumerate(human_versions, 1): | |
# # # formatted_output += f"Version {i}:\n{version}\n" | |
# # # formatted_output += f"Classification: {label} (confidence: {score:.2%})\n\n" | |
# # # if output_format == "text": | |
# # # return formatted_output, "\n\n".join([v[0] for v in human_versions]) | |
# # # else: | |
# # # return json.dumps(json_output, indent=2), "\n\n".join([v[0] for v in human_versions]) | |
# # # # Define the Gradio interface | |
# # # iface = gr.Interface( | |
# # # fn=generate_paraphrases, | |
# # # inputs=[ | |
# # # gr.Textbox(lines=5, label="Input Text"), | |
# # # gr.Slider(minimum=1, maximum=5, step=1, label="Readability to Human-like Setting"), | |
# # # gr.Radio(["text", "json"], label="Output Format") | |
# # # ], | |
# # # outputs=[ | |
# # # gr.Textbox(lines=20, label="Detailed Paraphrases and Classifications"), | |
# # # gr.Textbox(lines=10, label="Human-like or Less Confident Machine-generated Paraphrases") | |
# # # ], | |
# # # title="Advanced Diverse Paraphraser with Human-like Filter", | |
# # # description="Enter a text, select a setting from readable to human-like, and choose the output format to generate diverse paraphrased versions. Combined versions are classified, and those detected as human-produced or less confidently machine-generated are presented in the final output." | |
# # # ) | |
# # # # Launch the interface | |
# # # iface.launch() | |
# # import os | |
# # import json | |
# # import gradio as gr | |
# # import spaces | |
# # import torch | |
# # from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification | |
# # from sentence_splitter import SentenceSplitter | |
# # from itertools import product | |
# # # Get the Hugging Face token from environment variable | |
# # hf_token = os.getenv('HF_TOKEN') | |
# # cuda_available = torch.cuda.is_available() | |
# # device = torch.device("cuda" if cuda_available else "cpu") | |
# # print(f"Using device: {device}") | |
# # # Initialize paraphraser model and tokenizer | |
# # paraphraser_model_name = "sharad/ParaphraseGPT" | |
# # paraphraser_tokenizer = AutoTokenizer.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base") | |
# # paraphraser_model = AutoModelForSeq2SeqLM.from_pretrained(paraphraser_model_name).to(device) | |
# # paraphrase_pipeline = pipeline("text2text-generation", model=paraphraser_model, tokenizer=paraphraser_tokenizer, device=0 if cuda_available else -1) | |
# # # Initialize classifier model and tokenizer | |
# # classifier_model_name = "andreas122001/roberta-mixed-detector" | |
# # classifier_tokenizer = AutoTokenizer.from_pretrained(classifier_model_name) | |
# # classifier_model = AutoModelForSequenceClassification.from_pretrained(classifier_model_name).to(device) | |
# # # Initialize sentence splitter | |
# # splitter = SentenceSplitter(language='en') | |
# # def classify_text(text): | |
# # inputs = classifier_tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(device) | |
# # with torch.no_grad(): | |
# # outputs = classifier_model(**inputs) | |
# # probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1) | |
# # predicted_class = torch.argmax(probabilities, dim=-1).item() | |
# # main_label = classifier_model.config.id2label[predicted_class] | |
# # main_score = probabilities[0][predicted_class].item() | |
# # return main_label, main_score | |
# # @spaces.GPU | |
# # def generate_paraphrases(text, setting, output_format): | |
# # sentences = splitter.split(text) | |
# # all_sentence_paraphrases = [] | |
# # if setting == 1: | |
# # num_return_sequences = 5 | |
# # repetition_penalty = 1.1 | |
# # no_repeat_ngram_size = 2 | |
# # temperature = 0.9 | |
# # max_length = 128 | |
# # elif setting == 2: | |
# # num_return_sequences = 5 | |
# # repetition_penalty = 1.2 | |
# # no_repeat_ngram_size = 3 | |
# # temperature = 0.95 | |
# # max_length = 192 | |
# # elif setting == 3: | |
# # num_return_sequences = 5 | |
# # repetition_penalty = 1.3 | |
# # no_repeat_ngram_size = 4 | |
# # temperature = 1.0 | |
# # max_length = 256 | |
# # elif setting == 4: | |
# # num_return_sequences = 5 | |
# # repetition_penalty = 1.4 | |
# # no_repeat_ngram_size = 5 | |
# # temperature = 1.05 | |
# # max_length = 320 | |
# # else: | |
# # num_return_sequences = 5 | |
# # repetition_penalty = 1.5 | |
# # no_repeat_ngram_size = 6 | |
# # temperature = 1.1 | |
# # max_length = 384 | |
# # top_k = 50 | |
# # top_p = 0.95 | |
# # length_penalty = 1.0 | |
# # formatted_output = "Original text:\n" + text + "\n\n" | |
# # formatted_output += "Paraphrased versions:\n" | |
# # json_output = { | |
# # "original_text": text, | |
# # "paraphrased_versions": [], | |
# # "combined_versions": [], | |
# # "human_like_versions": [] | |
# # } | |
# # for i, sentence in enumerate(sentences): | |
# # paraphrases = paraphrase_pipeline( | |
# # sentence, | |
# # num_return_sequences=num_return_sequences, | |
# # do_sample=True, | |
# # top_k=top_k, | |
# # top_p=top_p, | |
# # temperature=temperature, | |
# # no_repeat_ngram_size=no_repeat_ngram_size, | |
# # repetition_penalty=repetition_penalty, | |
# # max_length=max_length | |
# # ) | |
# # paraphrases_texts = [p['generated_text'] for p in paraphrases] | |
# # formatted_output += f"Original sentence {i+1}: {sentence}\n" | |
# # for j, paraphrase in enumerate(paraphrases_texts, 1): | |
# # formatted_output += f" Paraphrase {j}: {paraphrase}\n" | |
# # json_output["paraphrased_versions"].append({ | |
# # f"original_sentence_{i+1}": sentence, | |
# # "paraphrases": paraphrases_texts | |
# # }) | |
# # all_sentence_paraphrases.append(paraphrases_texts) | |
# # formatted_output += "\n" | |
# # all_combinations = list(product(*all_sentence_paraphrases)) | |
# # formatted_output += "\nCombined paraphrased versions:\n" | |
# # combined_versions = [] | |
# # for i, combination in enumerate(all_combinations[:50], 1): # Limit to 50 combinations | |
# # combined_paraphrase = " ".join(combination) | |
# # combined_versions.append(combined_paraphrase) | |
# # json_output["combined_versions"] = combined_versions | |
# # # Classify combined versions | |
# # human_versions = [] | |
# # for i, version in enumerate(combined_versions, 1): | |
# # label, score = classify_text(version) | |
# # formatted_output += f"Version {i}:\n{version}\n" | |
# # formatted_output += f"Classification: {label} (confidence: {score:.2%})\n\n" | |
# # if label == "human-produced" or (label == "machine-generated" and score < 0.98): | |
# # human_versions.append((version, label, score)) | |
# # formatted_output += "\nHuman-like or Less Confident Machine-generated versions:\n" | |
# # for i, (version, label, score) in enumerate(human_versions, 1): | |
# # formatted_output += f"Version {i}:\n{version}\n" | |
# # formatted_output += f"Classification: {label} (confidence: {score:.2%})\n\n" | |
# # json_output["human_like_versions"] = [ | |
# # {"version": version, "label": label, "confidence_score": score} | |
# # for version, label, score in human_versions | |
# # ] | |
# # # If no human-like versions, include the top 5 least confident machine-generated versions | |
# # if not human_versions: | |
# # human_versions = sorted([(v, l, s) for v, l, s in zip(combined_versions, [classify_text(v)[0] for v in combined_versions], [classify_text(v)[1] for v in combined_versions])], key=lambda x: x[2])[:5] | |
# # formatted_output += "\nNo human-like versions found. Showing top 5 least confident machine-generated versions:\n" | |
# # for i, (version, label, score) in enumerate(human_versions, 1): | |
# # formatted_output += f"Version {i}:\n{version}\n" | |
# # formatted_output += f"Classification: {label} (confidence: {score:.2%})\n\n" | |
# # if output_format == "text": | |
# # return formatted_output, "\n\n".join([v[0] for v in human_versions]) | |
# # else: | |
# # return json.dumps(json_output, indent=2), "\n\n".join([v[0] for v in human_versions]) | |
# # # Define the Gradio interface | |
# # iface = gr.Interface( | |
# # fn=generate_paraphrases, | |
# # inputs=[ | |
# # gr.Textbox(lines=5, label="Input Text"), | |
# # gr.Slider(minimum=1, maximum=5, step=1, label="Readability to Human-like Setting"), | |
# # gr.Radio(["text", "json"], label="Output Format") | |
# # ], | |
# # outputs=[ | |
# # gr.Textbox(lines=20, label="Detailed Paraphrases and Classifications"), | |
# # gr.Textbox(lines=10, label="Human-like or Less Confident Machine-generated Paraphrases") | |
# # ], | |
# # title="Advanced Diverse Paraphraser with Human-like Filter", | |
# # description="Enter a text, select a setting from readable to human-like, and choose the output format to generate diverse paraphrased versions. Combined versions are classified, and those detected as human-produced or less confidently machine-generated are presented in the final output." | |
# # ) | |
# # # Launch the interface | |
# # iface.launch() | |
# import os | |
# import json | |
# import gradio as gr | |
# import spaces | |
# import torch | |
# import sys | |
# import subprocess | |
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification | |
# from sentence_splitter import SentenceSplitter | |
# from itertools import product | |
# # Ensure sentencepiece is installed | |
# try: | |
# import sentencepiece | |
# except ImportError: | |
# subprocess.check_call([sys.executable, "-m", "pip", "install", "sentencepiece"]) | |
# # Get the Hugging Face token from environment variable | |
# hf_token = os.getenv('HF_TOKEN') | |
# cuda_available = torch.cuda.is_available() | |
# device = torch.device("cuda" if cuda_available else "cpu") | |
# print(f"Using device: {device}") | |
# # Initialize paraphraser model and tokenizer | |
# paraphraser_model_name = "ramsrigouthamg/t5-large-paraphraser-diverse-high-quality" | |
# paraphraser_tokenizer = AutoTokenizer.from_pretrained(paraphraser_model_name, use_fast=False) | |
# paraphraser_model = AutoModelForSeq2SeqLM.from_pretrained(paraphraser_model_name).to(device) | |
# # Initialize classifier model and tokenizer | |
# classifier_model_name = "andreas122001/roberta-mixed-detector" | |
# classifier_tokenizer = AutoTokenizer.from_pretrained(classifier_model_name) | |
# classifier_model = AutoModelForSequenceClassification.from_pretrained(classifier_model_name).to(device) | |
# # Initialize sentence splitter | |
# splitter = SentenceSplitter(language='en') | |
# def classify_text(text): | |
# inputs = classifier_tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(device) | |
# with torch.no_grad(): | |
# outputs = classifier_model(**inputs) | |
# probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1) | |
# predicted_class = torch.argmax(probabilities, dim=-1).item() | |
# main_label = classifier_model.config.id2label[predicted_class] | |
# main_score = probabilities[0][predicted_class].item() | |
# return main_label, main_score | |
# @spaces.GPU | |
# def generate_paraphrases(text, setting, output_format): | |
# sentences = splitter.split(text) | |
# all_sentence_paraphrases = [] | |
# if setting == 1: | |
# num_return_sequences = 3 | |
# num_beams = 5 | |
# max_length = 128 | |
# elif setting == 2: | |
# num_return_sequences = 3 | |
# num_beams = 7 | |
# max_length = 192 | |
# elif setting == 3: | |
# num_return_sequences = 3 | |
# num_beams = 9 | |
# max_length = 256 | |
# elif setting == 4: | |
# num_return_sequences = 3 | |
# num_beams = 11 | |
# max_length = 320 | |
# else: | |
# num_return_sequences = 3 | |
# num_beams = 15 | |
# max_length = 384 | |
# formatted_output = "Original text:\n" + text + "\n\n" | |
# formatted_output += "Paraphrased versions:\n" | |
# json_output = { | |
# "original_text": text, | |
# "paraphrased_versions": [], | |
# "combined_versions": [], | |
# "human_like_versions": [] | |
# } | |
# for i, sentence in enumerate(sentences): | |
# text = "paraphrase: " + sentence + " </s>" | |
# encoding = paraphraser_tokenizer.encode_plus(text, max_length=max_length, padding=True, return_tensors="pt") | |
# input_ids, attention_mask = encoding["input_ids"].to(device), encoding["attention_mask"].to(device) | |
# paraphraser_model.eval() | |
# beam_outputs = paraphraser_model.generate( | |
# input_ids=input_ids, | |
# attention_mask=attention_mask, | |
# max_length=max_length, | |
# early_stopping=True, | |
# num_beams=num_beams, | |
# num_return_sequences=num_return_sequences | |
# ) | |
# paraphrases_texts = [paraphraser_tokenizer.decode(beam_output, skip_special_tokens=True, clean_up_tokenization_spaces=True) for beam_output in beam_outputs] | |
# formatted_output += f"Original sentence {i+1}: {sentence}\n" | |
# for j, paraphrase in enumerate(paraphrases_texts, 1): | |
# formatted_output += f" Paraphrase {j}: {paraphrase}\n" | |
# json_output["paraphrased_versions"].append({ | |
# f"original_sentence_{i+1}": sentence, | |
# "paraphrases": paraphrases_texts | |
# }) | |
# all_sentence_paraphrases.append(paraphrases_texts) | |
# formatted_output += "\n" | |
# all_combinations = list(product(*all_sentence_paraphrases)) | |
# formatted_output += "\nCombined paraphrased versions:\n" | |
# combined_versions = [] | |
# for i, combination in enumerate(all_combinations[:50], 1): # Limit to 50 combinations | |
# combined_paraphrase = " ".join(combination) | |
# combined_versions.append(combined_paraphrase) | |
# json_output["combined_versions"] = combined_versions | |
# # Classify combined versions | |
# human_versions = [] | |
# for i, version in enumerate(combined_versions, 1): | |
# label, score = classify_text(version) | |
# formatted_output += f"Version {i}:\n{version}\n" | |
# formatted_output += f"Classification: {label} (confidence: {score:.2%})\n\n" | |
# if label == "human-produced" or (label == "machine-generated" and score < 0.90): # Adjusted threshold | |
# human_versions.append((version, label, score)) | |
# formatted_output += "\nHuman-like or Less Confident Machine-generated versions:\n" | |
# for i, (version, label, score) in enumerate(human_versions, 1): | |
# formatted_output += f"Version {i}:\n{version}\n" | |
# formatted_output += f"Classification: {label} (confidence: {score:.2%})\n\n" | |
# json_output["human_like_versions"] = [ | |
# {"version": version, "label": label, "confidence_score": score} | |
# for version, label, score in human_versions | |
# ] | |
# # If no human-like versions, include the top 5 least confident machine-generated versions | |
# if not human_versions: | |
# human_versions = sorted([(v, l, s) for v, l, s in zip(combined_versions, [classify_text(v)[0] for v in combined_versions], [classify_text(v)[1] for v in combined_versions])], key=lambda x: x[2])[:5] | |
# formatted_output += "\nNo human-like versions found. Showing top 5 least confident machine-generated versions:\n" | |
# for i, (version, label, score) in enumerate(human_versions, 1): | |
# formatted_output += f"Version {i}:\n{version}\n" | |
# formatted_output += f"Classification: {label} (confidence: {score:.2%})\n\n" | |
# if output_format == "text": | |
# return formatted_output, "\n\n".join([v[0] for v in human_versions]) | |
# else: | |
# return json.dumps(json_output, indent=2), "\n\n".join([v[0] for v in human_versions]) | |
# # Define the Gradio interface | |
# iface = gr.Interface( | |
# fn=generate_paraphrases, | |
# inputs=[ | |
# gr.Textbox(lines=5, label="Input Text"), | |
# gr.Slider(minimum=1, maximum=5, step=1, label="Readability to Human-like Setting"), | |
# gr.Radio(["text", "json"], label="Output Format") | |
# ], | |
# outputs=[ | |
# gr.Textbox(lines=20, label="Detailed Paraphrases and Classifications"), | |
# gr.Textbox(lines=10, label="Human-like or Less Confident Machine-generated Paraphrases") | |
# ], | |
# title="Advanced Diverse Paraphraser with Human-like Filter", | |
# description="Enter a text, select a setting from readable to human-like, and choose the output format to generate diverse paraphrased versions. Combined versions are classified, and those detected as human-produced or less confidently machine-generated are presented in the final output." | |
# ) | |
# # Launch the interface | |
# iface.launch() | |
import json
import os
from itertools import islice, product

import gradio as gr
from sentence_splitter import SentenceSplitter
import spaces
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification
# Hugging Face access token, read from the environment (None when unset).
hf_token = os.environ.get('HF_TOKEN')

# Prefer the GPU when CUDA is usable; otherwise run everything on the CPU.
cuda_available = torch.cuda.is_available()
if cuda_available:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Paraphraser: seq2seq model plus its tokenizer, placed on the chosen device.
paraphraser_model_name = "ramsrigouthamg/t5-large-paraphraser-diverse-high-quality"
paraphraser_tokenizer = AutoTokenizer.from_pretrained(paraphraser_model_name)
paraphraser_model = AutoModelForSeq2SeqLM.from_pretrained(paraphraser_model_name)
paraphraser_model = paraphraser_model.to(device)

# Classifier: sequence-classification model used to label generated text.
classifier_model_name = "andreas122001/roberta-mixed-detector"
classifier_tokenizer = AutoTokenizer.from_pretrained(classifier_model_name)
classifier_model = AutoModelForSequenceClassification.from_pretrained(classifier_model_name)
classifier_model = classifier_model.to(device)

# English sentence splitter used to break the input text into sentences.
splitter = SentenceSplitter(language='en')
def classify_text(text):
    """Classify *text* with the detector model and return its top label.

    The text is tokenized (truncated to at most 512 tokens), run through
    ``classifier_model`` without gradient tracking, and the logits are
    converted to probabilities with a softmax over the last dimension.

    Returns:
        A ``(label, score)`` tuple: the ``id2label`` name of the most
        probable class and that class's probability as a Python float.
    """
    encoded = classifier_tokenizer(
        text, return_tensors="pt", truncation=True, max_length=512
    )
    encoded = encoded.to(device)

    with torch.no_grad():
        logits = classifier_model(**encoded).logits

    probs = torch.softmax(logits, dim=-1)
    best = probs.argmax(dim=-1).item()
    return classifier_model.config.id2label[best], probs[0][best].item()
def clean_text(text):
    """Return *text* with every ``"paraphrasedoutput: "`` marker removed."""
    marker = "paraphrasedoutput: "
    # Splitting on the marker and re-joining drops each occurrence.
    pieces = text.split(marker)
    return "".join(pieces)
def _sampling_params(setting):
    """Map the UI setting to ``(num_return_sequences, temperature, max_length)``."""
    presets = {
        1: (5, 1.0, 128),
        2: (7, 1.2, 192),
        3: (10, 1.4, 256),
        4: (15, 1.6, 320),
    }
    # Any value other than 1-4 selects the most aggressive preset.
    return presets.get(setting, (20, 1.8, 384))


def _paraphrase_sentence(sentence, num_return_sequences, temperature, max_length,
                         top_k=50, top_p=0.95):
    """Sample paraphrases of one sentence and return them as cleaned strings."""
    prompt = "paraphrase: " + sentence + " </s>"
    # truncation=True is required for max_length to actually bound the input;
    # without it the tokenizer ignores max_length for over-long sentences.
    encoding = paraphraser_tokenizer(
        prompt,
        max_length=max_length,
        padding=True,
        truncation=True,
        return_tensors="pt",
    )
    outputs = paraphraser_model.generate(
        input_ids=encoding["input_ids"].to(device),
        attention_mask=encoding["attention_mask"].to(device),
        max_length=max_length,
        num_return_sequences=num_return_sequences,
        do_sample=True,
        top_k=top_k,
        top_p=top_p,
        temperature=temperature,
    )
    return [
        clean_text(
            paraphraser_tokenizer.decode(
                output,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True,
            )
        )
        for output in outputs
    ]


def _format_versions(entries):
    """Render ``(version, label, score)`` tuples as numbered report chunks."""
    chunks = []
    for number, (version, label, score) in enumerate(entries, 1):
        chunks.append(f"Version {number}:\n{version}\n")
        chunks.append(f"Classification: {label} (confidence: {score:.2%})\n\n")
    return chunks


def generate_paraphrases(text, setting, output_format):
    """Paraphrase *text* sentence by sentence and filter the results.

    Each sentence is paraphrased several times by sampling from the
    paraphraser model. Up to 50 combinations of per-sentence paraphrases are
    joined into full "combined versions", each of which is scored once with
    ``classify_text``. Versions labelled ``"human-produced"``, or labelled
    ``"machine-generated"`` with a score below 0.90, are selected. When none
    qualify, the five lowest-scoring versions are selected instead.

    Args:
        text: Input text to paraphrase. ``None`` is treated as empty.
        setting: Preset selector; 1-4 pick increasingly aggressive sampling
            presets and any other value picks the strongest one.
        output_format: ``"text"`` for the plain-text report; any other value
            yields the JSON document.

    Returns:
        A ``(details, selected)`` tuple of strings: the detailed report (text
        or JSON) and the selected versions joined by blank lines.
    """
    text = text or ""
    num_return_sequences, temperature, max_length = _sampling_params(setting)

    # Blank entries carry nothing to paraphrase, so they are dropped up front.
    sentences = [s for s in splitter.split(text) if s.strip()]

    report = ["Original text:\n" + text + "\n\n", "Paraphrased versions:\n"]
    json_output = {
        "original_text": text,
        "paraphrased_versions": [],
        "combined_versions": [],
        "human_like_versions": [],
    }

    # Switching to inference mode is loop-invariant, so do it once.
    paraphraser_model.eval()

    all_sentence_paraphrases = []
    for index, sentence in enumerate(sentences, 1):
        paraphrases_texts = _paraphrase_sentence(
            sentence, num_return_sequences, temperature, max_length
        )
        report.append(f"Original sentence {index}: {sentence}\n")
        report.extend(
            f" Paraphrase {j}: {paraphrase}\n"
            for j, paraphrase in enumerate(paraphrases_texts, 1)
        )
        report.append("\n")
        json_output["paraphrased_versions"].append({
            f"original_sentence_{index}": sentence,
            "paraphrases": paraphrases_texts,
        })
        all_sentence_paraphrases.append(paraphrases_texts)

    # Take the first 50 combinations lazily: materializing the whole Cartesian
    # product grows as (paraphrases per sentence) ** (number of sentences).
    # With no sentences there is nothing to combine (product() of no
    # iterables would otherwise yield one empty combination).
    if all_sentence_paraphrases:
        combined_versions = [
            " ".join(combination)
            for combination in islice(product(*all_sentence_paraphrases), 50)
        ]
    else:
        combined_versions = []
    json_output["combined_versions"] = combined_versions

    # Classify every combined version exactly once and reuse the results for
    # the report, the selection, and the fallback ranking.
    classified = [
        (version, *classify_text(version)) for version in combined_versions
    ]

    report.append("\nCombined paraphrased versions:\n")
    report.extend(_format_versions(classified))

    human_versions = [
        (version, label, score)
        for version, label, score in classified
        if label == "human-produced"
        or (label == "machine-generated" and score < 0.90)  # Adjusted threshold
    ]

    report.append("\nHuman-like or Less Confident Machine-generated versions:\n")
    report.extend(_format_versions(human_versions))

    # NOTE(review): as in the original, the JSON "human_like_versions" list is
    # filled before the fallback below and therefore stays empty when the
    # fallback is used - confirm whether the JSON should include the fallback.
    json_output["human_like_versions"] = [
        {"version": version, "label": label, "confidence_score": score}
        for version, label, score in human_versions
    ]

    # If no human-like versions, include the top 5 least confident versions.
    if not human_versions:
        human_versions = sorted(classified, key=lambda entry: entry[2])[:5]
        report.append(
            "\nNo human-like versions found. Showing top 5 least confident machine-generated versions:\n"
        )
        report.extend(_format_versions(human_versions))

    selected_text = "\n\n".join(version for version, _, _ in human_versions)
    if output_format == "text":
        return "".join(report), selected_text
    return json.dumps(json_output, indent=2), selected_text
# Build the Gradio UI: three inputs feed generate_paraphrases, whose two
# returned strings fill the two output text boxes.
_input_components = [
    gr.Textbox(lines=5, label="Input Text"),
    gr.Slider(minimum=1, maximum=5, step=1, label="Readability to Human-like Setting"),
    gr.Radio(["text", "json"], label="Output Format"),
]
_output_components = [
    gr.Textbox(lines=20, label="Detailed Paraphrases and Classifications"),
    gr.Textbox(lines=10, label="Human-like or Less Confident Machine-generated Paraphrases"),
]
_description = "Enter a text, select a setting from readable to human-like, and choose the output format to generate diverse paraphrased versions. Combined versions are classified, and those detected as human-produced or less confidently machine-generated are presented in the final output."

iface = gr.Interface(
    fn=generate_paraphrases,
    inputs=_input_components,
    outputs=_output_components,
    title="Advanced Diverse Paraphraser with Human-like Filter",
    description=_description,
)

# Start serving the interface.
iface.launch()