# cluster-summ / summarize.py
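"""
Long-document summarization via sentence-embedding clustering.

A sketch of the pipeline as implemented below: articles that fit within the
summarizer's context window are summarized directly; longer articles are
split into sentence clusters, each cluster is summarized separately, and the
concatenated cluster summaries are summarized once more to produce the final
output.
"""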
from utils.sentence_embedding import *
from utils.clustering import *
from models.summarizers import *
from nltk.tokenize import sent_tokenize, word_tokenize
import numpy as np
import math
from time import perf_counter


def get_summary(model_name, article, max_length, min_length, increment):
    start_time = perf_counter()
    summarization_model, summarization_tokenizer = load_summarizer(model_name)
    summarizer_token_limit = summarization_tokenizer.model_max_length
    print("Summarizer token limit:", summarizer_token_limit)
    input_word_toks = word_tokenize(article)
    num_words = len(input_word_toks)

    if num_words <= summarizer_token_limit and model_name == "t5":
        # Short inputs go straight to the summarizer (this direct path is
        # enabled for t5 only).
        pred_summary = summarize_input(article, summarization_model, summarization_tokenizer)
        end_time = perf_counter()
        time_taken = end_time - start_time
    else:
        # The article exceeds the context window: cluster sentences by
        # embedding similarity, summarize each cluster, then summarize the
        # concatenated cluster summaries.
        input_sent_toks = sent_tokenize(article)
        embeddings = make_embeddings(input_sent_toks, mean_pooling)
        embeddings = embeddings.numpy()
        increment[0] = 20  # progress tracker, mutated in place for the caller

        # Estimate how many chunks are needed to fit the token limit.
        n_clusters_estimate = math.ceil(num_words / summarizer_token_limit)
        clemb = ClusterEmbeddings(
            cluster_estimate=n_clusters_estimate,
            cluster_fn="agglo",  # agglomerative clustering works much better here
            embeddings=embeddings,
            sentences=np.array(input_sent_toks),
            words=np.array(input_word_toks),
        )
        increment[0] = 50
        sentence_clusters = clemb.get_sentence_clusters()
        n = len(sentence_clusters)
summs = ""
for cluster in sentence_clusters:
cluster_summary = summarize_input(
cluster,
summarization_model,
summarization_tokenizer,
max_length=250,
min_length=50,
)
if type(cluster_summary) == list:
cluster_summary = cluster_summary[0]
summs += cluster_summary + " "
increment[0] += 40 / n
        # Compress the concatenated cluster summaries into the final summary.
        pred_summary = summarize_input(
            summs,
            summarization_model,
            summarization_tokenizer,
            max_length=max_length,
            min_length=min_length,
        )
        increment[0] = 100  # progress complete
        end_time = perf_counter()
        time_taken = end_time - start_time

    return pred_summary, time_taken


def test():
    article = """Recent text-to-image matching models apply contrastive learning to large corpora of uncurated pairs of images and sentences. While such models can provide a powerful score for matching and subsequent zero-shot tasks, they are not capable of generating caption given an image. In this work, we repurpose such models to generate a descriptive text given an image at inference time, without any further training or tuning step. This is done by combining the visual-semantic model with a large language model, benefiting from the knowledge in both web-scale models. The resulting captions are much less restrictive than those obtained by supervised captioning methods. Moreover, as a zero-shot learning method, it is extremely flexible and we demonstrate its ability to perform image arithmetic in which the inputs can be either images or text and the output is a sentence."""
    model_name = "BART"
    # get_summary mutates the increment list in place to report progress,
    # so a single-element list is passed even when progress is not displayed.
    summ, time_taken = get_summary(model_name, article, 250, 150, increment=[0])
    print(summ)
    print(time_taken)
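

# Minimal usage sketch: running this file directly exercises the full
# pipeline on the sample abstract above.
if __name__ == "__main__":
    test()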