# NOTE: removed non-code residue from the hosting page (Spaces header,
# "Runtime error" status, file size, and a gutter of line numbers) that
# was captured along with the source and would break parsing.
"""
Shared utility methods for this module.
"""
import datetime
import re
from ctypes import Array
from typing import Optional

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    MBartForConditionalGeneration,
    MBartTokenizer,
    pipeline,
)
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
def lowercase_string(string: str) -> Optional[str]:
    """Lowercase a string, tolerating non-string input.

    Args:
        string: Value to lowercase.

    Returns:
        The lowercased string, or ``None`` when ``string`` is not a
        ``str`` (the original annotation claimed ``str`` but the code
        has always returned ``None`` on that path).
    """
    if isinstance(string, str):
        return string.lower()
    return None
from functools import lru_cache
@lru_cache
def get_sentiment_pipeline():
    """Return a multilingual sentiment-analysis pipeline.

    Memoized with ``lru_cache`` so the model and tokenizer are only
    downloaded/loaded on the first call.
    """
    checkpoint = "nlptown/bert-base-multilingual-uncased-sentiment"
    return pipeline(
        "sentiment-analysis",
        model=AutoModelForSequenceClassification.from_pretrained(checkpoint),
        tokenizer=AutoTokenizer.from_pretrained(checkpoint),
    )
def score_sentiment(input: str):
    """Score sentiment of an input string with a pretrained Transformers Pipeline

    The text is lowercased before scoring to match the uncased model.

    Args:
        input (str): Text to be scored

    Returns:
        tuple: (label, score) of the pipeline's top prediction
    """
    pipe = get_sentiment_pipeline()
    top = pipe(input.lower())[0]
    return top["label"], top["score"]
@lru_cache
def get_summarization_pipeline_nl():
    """Return a Dutch summarization pipeline (loaded once).

    Pairs an MBart model fine-tuned on a Dutch CNN/DailyMail dataset
    with the base MBart tokenizer, then forces decoding to start from
    the Dutch language token.
    """
    tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-cc25")
    undisputed_best_model = MBartForConditionalGeneration.from_pretrained(
        "ml6team/mbart-large-cc25-cnn-dailymail-nl"
    )
    nl_pipeline = pipeline(
        task="summarization",
        model=undisputed_best_model,
        tokenizer=tokenizer,
    )
    # Generation must begin with the Dutch language code token.
    nl_pipeline.model.config.decoder_start_token_id = tokenizer.lang_code_to_id["nl_XX"]
    return nl_pipeline
def summarize_nl(input: str) -> str:
    """Summarize Dutch text with the cached MBart summarization pipeline.

    NOTE: ``do_sample=True`` means repeated calls can yield different
    summaries for the same input.

    Args:
        input (str): Dutch text to summarize.

    Returns:
        str: The generated summary.
    """
    nl_pipeline = get_summarization_pipeline_nl()
    outputs = nl_pipeline(
        input,
        do_sample=True,
        top_p=0.75,
        top_k=50,
        min_length=50,
        early_stopping=True,
        truncation=True,
    )
    return outputs[0]["summary_text"]
@lru_cache
def get_pegasus():
    """Load the Pegasus-XSum model and tokenizer (once, via lru_cache).

    Returns:
        tuple: (model, tokenizer) for ``google/pegasus-xsum``.
    """
    checkpoint = "google/pegasus-xsum"
    tokenizer = PegasusTokenizer.from_pretrained(checkpoint)
    model = PegasusForConditionalGeneration.from_pretrained(checkpoint)
    return model, tokenizer
def summarize_en(input: str) -> str:
    """Summarize English text with Pegasus fine-tuned on XSum.

    Args:
        input (str): English text to summarize.

    Returns:
        str: The generated summary (XSum-style, typically one sentence).
    """
    model, tokenizer = get_pegasus()
    # FIX: pass truncation=True — max_length alone does NOT truncate
    # (the tokenizer only warns), so inputs longer than the model's
    # limit previously reached generate() untrimmed and could fail.
    inputs = tokenizer(input, max_length=1024, truncation=True, return_tensors="pt")
    # Generate Summary
    summary_ids = model.generate(inputs["input_ids"])
    result = tokenizer.batch_decode(
        summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    return result