# backend/utils/text_rank.py
import spacy
import pytextrank
from spacy.tokens import Span
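# (importing pytextrank registers the "textrank" pipeline factory with spaCy)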

# Register a scrubber that normalizes ranked phrases to their lemma form,
# collapsing plural and inflected variants of the same term
@spacy.registry.misc("plural_scrubber")
def plural_scrubber():
    def scrubber_func(span: Span) -> str:
        return span.lemma_
    return scrubber_func
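
# For example, the spans "neural networks" and "neural network" both scrub
# to "neural network", so they count as one key phrase in the results below.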

def model_selector(target_language: str):
    # Map ISO 639-3 codes to the supported subset of non-English models
    language_model = {
        "spa": "es_core_news_sm",
        "fra": "fr_core_news_sm",
        "pol": "pl_core_news_sm",
        "deu": "de_core_news_sm",
        "ita": "it_core_news_sm",
        "por": "pt_core_news_sm",
        "nld": "nl_core_news_sm",
        "fin": "fi_core_news_sm",
        "ron": "ro_core_news_sm",
        "rus": "ru_core_news_sm"
    }

    try:
        nlp = spacy.load(language_model[target_language])
    except KeyError:
        # Unsupported language code: fall back to the large English model
        nlp = spacy.load("en_core_web_lg")

    # Add the TextRank component, excluding stop words (as noun candidates)
    # from phrase ranking and lemmatizing phrases via the scrubber above
    nlp.add_pipe("textrank", config={
        "stopwords": {token: ["NOUN"] for token in nlp.Defaults.stop_words},
        "scrubber": {"@misc": "plural_scrubber"}})

    return nlp
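
# Note: each model above must be installed before it can be loaded, e.g. via
# `python -m spacy download es_core_news_sm`. A missing package makes
# spacy.load() raise OSError, which the KeyError fallback does not catch.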

def extract_terms(text, target_language, length):
    nlp = model_selector(target_language)

    # Rank key phrases in the overall summary or segment summaries
    doc = nlp(text)

    if length < 100:
        # Short text: keep only the single top-ranked key phrase
        phrases = {phrase.text for phrase in doc._.phrases[:1]}
    elif length < 300:
        # Medium text: unique set of the top 2 ranked phrases
        phrases = {phrase.text for phrase in doc._.phrases[:2]}
    else:
        # Long text: unique set of the top 3 ranked phrases
        phrases = {phrase.text for phrase in doc._.phrases[:3]}

    return list(phrases)
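
# Minimal usage sketch. The sample text is illustrative, and `length` is
# assumed here to be the character count of `text`; callers may pass a
# different measure (e.g. word count).
if __name__ == "__main__":
    sample = (
        "Machine translation systems convert speech or text from one "
        "language into another, and modern systems rely on neural networks "
        "trained on large parallel corpora."
    )
    # "eng" is not in language_model, so the English fallback is used
    print(extract_terms(sample, "eng", len(sample)))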