Spaces:

polygraf-ai
/

article_writer

Runtime error

App Files Files Community

article_writer / humanize.py

aliasgerovs

Updated with latest

d994b45 6 months ago

raw

history blame

3.82 kB

	import torch
	from nltk import sent_tokenize
	import nltk
	from tqdm import tqdm
	import gradio as gr
	from transformers import T5ForConditionalGeneration, T5Tokenizer

	# Fetch the Punkt sentence-tokenizer data that sent_tokenize relies on.
	# Older NLTK releases load the "punkt" resource, while newer releases load
	# "punkt_tab" instead, so request both to keep sentence splitting working
	# regardless of the installed NLTK version.
	for _punkt_resource in ("punkt", "punkt_tab"):
	    nltk.download(_punkt_resource)
	# Autodetect the available device: use GPU number GPU_IDX when it exists,
	# otherwise the first GPU, otherwise the CPU.
	GPU_IDX = 1  # which GPU to use
	if torch.cuda.is_available():
	    num_gpus = torch.cuda.device_count()
	    print(f"Number of available GPUs: {num_gpus}")
	    # Check the index explicitly rather than with `assert` (asserts are
	    # stripped under `python -O`). On hosts with fewer GPUs than expected,
	    # fall back to GPU 0 instead of aborting the import.
	    if GPU_IDX >= num_gpus:
	        print(f"GPU index {GPU_IDX} not available. Falling back to GPU 0.")
	        GPU_IDX = 0
	    device = torch.device(f"cuda:{GPU_IDX}")
	    print(f"Using GPU: {GPU_IDX}")
	else:
	    print("CUDA is not available. Using CPU instead.")
	    device = torch.device("cpu")

	# Configuration for models and their adapters.
	# Each key is the display name under which the model and tokenizer are
	# cached. A value is either a checkpoint path string, or a dict holding the
	# checkpoint under "path" plus an optional "adapters" mapping of
	# adapter name -> adapter path (the loading loop accepts both shapes).
	model_config = {
	"Base Model": "polygraf-ai/poly-humanizer-base",
	"Large Model": "polygraf-ai/poly-humanizer-large",
	# Disabled XL entry, kept as an example of the dict-shaped value. Note that
	# paraphrase_text routes any model name containing "XL" to an "XL Model"
	# key, so that branch cannot work while this entry stays commented out.
	# "XL Model": {
	# "path": "google/flan-t5-xl",
	# "adapters": {
	# "XL Model Adapter": "polygraf-ai/poly-humanizer-XL-adapter",
	# "XL Law Model Adapter": "polygraf-ai/poly-humanizer-XL-law-adapter",
	# "XL Marketing Model Adapter": "polygraf-ai/marketing-cleaned-13K-grad-acum-4-full",
	# "XL Child Style Model Adapter": "polygraf-ai/poly-humanizer-XL-children-adapter-checkpoint-4000",
	# },
	# },
	}

	# Load every configured model once at import time and cache it, together
	# with its tokenizer, under the display name from model_config.
	models, tokenizers = {}, {}
	for name, config in model_config.items():
	    # A plain string is the checkpoint path itself; a dict stores it under "path".
	    if isinstance(config, str):
	        path = config
	    else:
	        path = config["path"]

	    # Instantiate the weights in bfloat16, then move them to the chosen device.
	    model = T5ForConditionalGeneration.from_pretrained(path, torch_dtype=torch.bfloat16)
	    model = model.to(device)
	    models[name] = model
	    tokenizers[name] = T5Tokenizer.from_pretrained(path)

	    # Only dict-shaped entries can declare adapters; nothing more to do otherwise.
	    if not (isinstance(config, dict) and "adapters" in config):
	        continue

	    # Load all available adapters (roughly 150M extra parameters each,
	    # per the original author's note).
	    for adapter_name, adapter_path in config["adapters"].items():
	        model.load_adapter(adapter_path, adapter_name=adapter_name)
	        print(f"Loaded adapter: {adapter_name}, Num. params: {model.num_parameters()}")


	def paraphrase_text(
	    text,
	    progress=gr.Progress(),
	    model_name="Base Model",
	    temperature=1.2,
	    repetition_penalty=1.0,
	    top_k=50,
	    length_penalty=1.0,
	):
	    """Paraphrase ``text`` sentence by sentence and return the joined result.

	    The text is split with NLTK's ``sent_tokenize``; every non-empty sentence
	    is sent to the selected model with a fixed paraphrasing prompt, and the
	    sampled outputs are joined with single spaces.

	    Args:
	        text: Input text to rewrite.
	        progress: Gradio progress tracker; it is called for the initial
	            status updates and wraps the sentence loop.
	        model_name: Key into the module-level ``models`` / ``tokenizers``
	            caches. A name containing "XL" is served by the "XL Model" base
	            with the adapter named ``"<model_name> Adapter"`` activated.
	        temperature: Forwarded to ``model.generate``.
	        repetition_penalty: Forwarded to ``model.generate``.
	        top_k: Forwarded to ``model.generate``.
	        length_penalty: Forwarded to ``model.generate``.

	    Returns:
	        The paraphrased sentences joined by a single space (an empty string
	        when no sentence was produced).

	    Raises:
	        KeyError: If the requested model (or "XL Model" for XL names) is not
	            present in the caches.
	    """
	    progress(0, desc="Starting to Humanize")
	    progress(0.05)

	    # Resolve the tokenizer/model pair for the request.
	    if "XL" not in model_name:
	        tokenizer, model = tokenizers[model_name], models[model_name]
	    else:
	        # Every XL variant shares the XL base model; only the adapter differs.
	        tokenizer = tokenizers["XL Model"]
	        model = models["XL Model"]
	        adapter = f"{model_name} Adapter"
	        # Switch adapters only when a different one is currently active.
	        if model.active_adapters() != [adapter]:
	            model.set_adapter(adapter)
	            print(f"Using adapter: {adapter}")

	    # Sampling options are identical for every sentence, so build them once.
	    generation_options = {
	        "do_sample": True,
	        "temperature": temperature,
	        "repetition_penalty": repetition_penalty,
	        "max_length": 128,
	        "top_k": top_k,
	        "length_penalty": length_penalty,
	    }

	    paraphrases = []
	    # sent_tokenize performs the sentence boundary detection.
	    for raw_sentence in progress.tqdm(sent_tokenize(text), desc="Humanizing"):
	        sentence = raw_sentence.strip()
	        if not sentence:
	            continue
	        prompt = "Please paraphrase this sentence: " + sentence
	        encoded = tokenizer(prompt, return_tensors="pt").to(device)
	        generated = model.generate(**encoded, **generation_options)
	        rewritten = tokenizer.decode(generated[0], skip_special_tokens=True)
	        paraphrases.append(rewritten)
	        print(f"\nOriginal: {sentence}")
	        print(f"Paraphrased: {rewritten}")

	    return " ".join(paraphrases)