""" constrained_generation.py - use constrained beam search to generate text from a model with entered constraints """ import copy import logging logging.basicConfig(level=logging.INFO) import time from pathlib import Path import yake from transformers import AutoTokenizer, PhrasalConstraint def get_tokenizer(model_name="gpt2", verbose=False): """ get_tokenizer - returns a tokenizer object :param model_name: name of the model to use, default gpt2 :param verbose: verbosity """ tokenizer = AutoTokenizer.from_pretrained( model_name, add_special_tokens=False, padding=True, truncation=True ) tokenizer.pad_token = tokenizer.eos_token if verbose: print(f"loaded tokenizer {model_name}") return tokenizer def unique_words(list_of_strings): """ unique_words - return a list of unique words from a list of strings. Uses set to remove duplicates. """ unique_words = [] output_list = [] for string in list_of_strings: # split string into words words = string.split() # check if word is unique unique_status = True for word in words: if word not in unique_words: unique_words.append(word) else: unique_status = False break if unique_status: output_list.append(string) return output_list def create_kw_extractor( language="en", max_ngram_size=3, deduplication_algo="seqm", windowSize=10, numOfKeywords=10, ddpt=0.7, ): """ creates a keyword extractor object :param language: language of the text :param max_ngram_size: max ngram size :param deduplication_algo: deduplication algorithm :param windowSize: window size :param numOfKeywords: number of keywords :param ddpt: Deduplication Percentage Threshold :return: keyword extractor object """ assert ddpt >= 0 and ddpt <= 1, f"need 0