Spaces:
Runtime error
Runtime error
from utils.sentence_embedding import * | |
from utils.clustering import * | |
from models.summarizers import * | |
from nltk.tokenize import sent_tokenize, word_tokenize | |
import math | |
from time import perf_counter | |
import time | |
def get_summary(model_name, article, max_length, min_length, increment=None):
    """Summarize ``article``, clustering it first unless the short "t5" path applies.

    The NLTK word count of the article is compared against the tokenizer's
    ``model_max_length``. When the article is within that limit *and*
    ``model_name == "t5"``, the article is summarized in a single call.
    Otherwise the sentences are embedded, grouped into clusters, each cluster
    is summarized on its own, and the concatenated cluster summaries are
    summarized once more to produce the final result.

    Args:
        model_name: Identifier handed to ``load_summarizer``.
        article: Text to summarize.
        max_length: ``max_length`` passed to the final summarization call on
            the clustered path. It is not used on the direct "t5" path.
        min_length: ``min_length`` passed to the final summarization call on
            the clustered path. It is not used on the direct "t5" path.
        increment: Optional one-element mutable list used as a progress
            counter; ``increment[0]`` is updated in place as the stages
            complete. When omitted, a throwaway counter is used so callers
            that do not track progress can leave it out.
            NOTE(review): presumably polled by a UI progress bar -- confirm
            against the caller. On the clustered path the final value
            exceeds 100 (50 + 40 + 100), so the consumer must clamp.

    Returns:
        A ``(pred_summary, time_taken)`` tuple, where ``time_taken`` is the
        elapsed ``perf_counter`` time in seconds for the whole call,
        including model loading.
    """
    start_time = perf_counter()

    if increment is None:
        # Progress reporting is optional; use a private counter when the
        # caller does not supply one.
        increment = [0]

    summarization_model, summarization_tokenizer = load_summarizer(model_name)
    summarizer_token_limit = summarization_tokenizer.model_max_length
    print("Going Beyong Token limit:", summarizer_token_limit)

    # The word count is used as a stand-in for the model's token count.
    input_word_toks = word_tokenize(article)
    num_words = len(input_word_toks)

    if num_words <= summarizer_token_limit and model_name == "t5":
        # Short input on the "t5" model: summarize the article directly.
        pred_summary = summarize_input(
            article, summarization_model, summarization_tokenizer
        )
        end_time = perf_counter()
        print("Time taken: ", end_time - start_time)
    else:
        input_sent_toks = sent_tokenize(article)
        embeddings = make_embeddings(input_sent_toks, mean_pooling)
        embeddings = embeddings.numpy()
        increment[0] = 20

        # One cluster per "token limit" worth of words, rounded up.
        n_clusters_estimate = math.ceil(num_words / summarizer_token_limit)
        clemb = ClusterEmbeddings(
            cluster_estimate=n_clusters_estimate,
            cluster_fn="agglo",  # much better
            embeddings=embeddings,
            sentences=np.array(input_sent_toks),
            words=np.array(input_word_toks),
        )
        increment[0] = 50

        sentence_clusters = clemb.get_sentence_clusters()
        n = len(sentence_clusters)

        cluster_summaries = []
        for cluster in sentence_clusters:
            cluster_summary = summarize_input(
                cluster,
                summarization_model,
                summarization_tokenizer,
                max_length=250,
                min_length=50,
            )
            # summarize_input may hand back a list; keep its first entry.
            if isinstance(cluster_summary, list):
                cluster_summary = cluster_summary[0]
            cluster_summaries.append(cluster_summary + " ")
            # The clustering stage contributes 40 progress units in total.
            increment[0] += 40 / n

        # Every partial summary keeps its trailing space, so the text fed to
        # the final pass is the same as a running "+=" would have produced.
        summs = "".join(cluster_summaries)
        pred_summary = summarize_input(
            summs,
            summarization_model,
            summarization_tokenizer,
            max_length=max_length,
            min_length=min_length,
        )

    increment[0] += 100
    end_time = perf_counter()
    time_taken = end_time - start_time
    return pred_summary, time_taken
def test():
    """Run ``get_summary`` on a sample abstract and print the summary and timing."""
    article = """Recent text-to-image matching models apply contrastive learning to large corpora of uncurated pairs of images and sentences. While such models can provide a powerful score for matching and subsequent zero-shot tasks, they are not capable of generating caption given an image. In this work, we repurpose such models to generate a descriptive text given an image at inference time, without any further training or tuning step. This is done by combining the visual-semantic model with a large language model, benefiting from the knowledge in both web-scale models. The resulting captions are much less restrictive than those obtained by supervised captioning methods. Moreover, as a zero-shot learning method, it is extremely flexible and wedemonstrate its ability to perform image arithmetic in which the inputs can be either images or text and the output is a sentence."""
    model_name = "BART"
    # get_summary updates a one-element progress list in place; supply a
    # fresh counter so the call has every argument it needs.
    progress = [0]
    summ, time_taken = get_summary(model_name, article, 250, 150, progress)
    print(summ)
    print(time_taken)