import spacy
import pytextrank
from spacy.tokens import Span


# Register a scrubber function that lemmatizes key phrases,
# so plural and singular forms of a term are merged
@spacy.registry.misc("plural_scrubber")
def plural_scrubber():
    def scrubber_func(span: Span) -> str:
        return span.lemma_
    return scrubber_func


def model_selector(target_language: str):
    # Subset of supported non-English models
    language_model = {
        "spa": "es_core_news_sm",
        "fra": "fr_core_news_sm",
        "pol": "pl_core_news_sm",
        "deu": "de_core_news_sm",
        "ita": "it_core_news_sm",
        "por": "pt_core_news_sm",
        "nld": "nl_core_news_sm",
        "fin": "fi_core_news_sm",
        "ron": "ro_core_news_sm",
        "rus": "ru_core_news_sm"
    }

    try:
        nlp = spacy.load(language_model[target_language])
    except KeyError:
        # Fall back to a spaCy English model
        nlp = spacy.load("en_core_web_lg")

    # Add TextRank component to the pipeline with stopwords and scrubber
    nlp.add_pipe("textrank", config={
        "stopwords": {token: ["NOUN"] for token in nlp.Defaults.stop_words},
        "scrubber": {"@misc": "plural_scrubber"}})

    return nlp


def extract_terms(text, target_language, length):
    nlp = model_selector(target_language)

    # Perform fact extraction on overall summary and segment summaries
    doc = nlp(text)

    if length < 100:
        # Get the single most used key term
        phrases = {phrase.text for phrase in doc._.phrases[:1]}
    elif length < 300:
        # Create unique set from top 2 ranked phrases
        phrases = {phrase.text for phrase in doc._.phrases[:2]}
    else:
        # Create unique set from top 3 ranked phrases
        phrases = {phrase.text for phrase in doc._.phrases[:3]}

    return list(phrases)
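

# Hypothetical usage sketch: the sample text, language code, and length
# value below are illustrative assumptions, not part of the pipeline above.
# Requires the English model to be installed first, e.g.:
#   python -m spacy download en_core_web_lg
if __name__ == "__main__":
    sample_text = (
        "Solar panels convert sunlight into electricity. "
        "Falling panel prices have made solar power one of the "
        "cheapest sources of new electricity generation."
    )
    # "eng" is not a key in language_model, so model_selector falls back
    # to the English model; length < 300 selects the top 2 ranked phrases
    terms = extract_terms(sample_text, target_language="eng", length=len(sample_text))
    print(terms)  # e.g. ["solar panel", "electricity generation"] (ranking-dependent)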