import spacy
import pytextrank  # noqa: F401 -- importing registers the "textrank" pipeline factory
from spacy.tokens import Span

# Register a scrubber that maps each phrase span to its lemma, so that
# singular and plural variants of a term (e.g. "network" / "networks")
# collapse to a single key phrase
@spacy.registry.misc("plural_scrubber")
def plural_scrubber():
    def scrubber_func(span: Span) -> str:
        return span.lemma_
    return scrubber_func


def model_selector(target_language: str):

    # Map ISO 639-3 language codes to a subset of non-English spaCy models
    language_model = {
        "spa": "es_core_news_sm",
        "fra": "fr_core_news_sm",
        "pol": "pl_core_news_sm",
        "deu": "de_core_news_sm",
        "ita": "it_core_news_sm",
        "por": "pt_core_news_sm",
        "nld": "nl_core_news_sm",
        "fin": "fi_core_news_sm",
        "ron": "ro_core_news_sm",
        "rus": "ru_core_news_sm"
    }

    try:
        nlp = spacy.load(language_model[target_language])

    except KeyError:
        # Fall back to the large English model for unsupported language codes
        nlp = spacy.load("en_core_web_lg")

    # Add the TextRank component, excluding spaCy's stop words when they
    # appear as nouns and normalising phrases via the registered scrubber
    nlp.add_pipe("textrank", config={
        "stopwords": {token: ["NOUN"] for token in nlp.Defaults.stop_words},
        "scrubber": {"@misc": "plural_scrubber"},
    })

    return nlp


def extract_terms(text, target_language, length):
    nlp = model_selector(target_language)

    # Rank key phrases in the supplied text (e.g. an overall or segment summary)
    doc = nlp(text)

    if length < 100:
        # Keep only the single highest-ranked key term
        phrases = {phrase.text for phrase in doc._.phrases[:1]}
    elif length < 300:
        # Create a unique set from the top 2 ranked phrases
        phrases = {phrase.text for phrase in doc._.phrases[:2]}
    else:
        # Create a unique set from the top 3 ranked phrases
        phrases = {phrase.text for phrase in doc._.phrases[:3]}

    return list(phrases)
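

# Minimal usage sketch, assuming the referenced spaCy models are installed
# (e.g. `python -m spacy download en_core_web_lg`) and that `length` is the
# caller-defined word count of the input; the sample text below is invented
# for illustration.
if __name__ == "__main__":
    sample = (
        "Natural language pipelines often extract key terms to summarise "
        "documents. TextRank ranks candidate phrases by building a graph "
        "of co-occurring words and scoring them with PageRank."
    )
    # An unmapped code such as "eng" triggers the English fallback model
    print(extract_terms(sample, "eng", length=len(sample.split())))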