Adding key term labeller
backend/utils/text_rank.py (ADDED, +40 -0)
@@ -0,0 +1,40 @@
import spacy
import pytextrank  # noqa: F401 -- import registers the "textrank" pipeline component
from spacy.tokens import Span


# Register a scrubber that lemmatizes phrases, collapsing plural/singular variants
@spacy.registry.misc("plural_scrubber")
def plural_scrubber():
    def scrubber_func(span: Span) -> str:
        return span.lemma_
    return scrubber_func


# Load a spaCy model
nlp = spacy.load("en_core_web_lg")

# Exclude additional stopwords common in conversational text
nlp.Defaults.stop_words |= {"okay", "like"}

# Add TextRank component to pipeline with stopwords and the scrubber
nlp.add_pipe("textrank", config={
    "stopwords": {token: ["NOUN"] for token in nlp.Defaults.stop_words},
    "scrubber": {"@misc": "plural_scrubber"},
})


def extract_terms(text, length):
    # Perform key term extraction on overall summary and segment summaries
    doc = nlp(text)

    # Note: the original branches (length < 200, 200 < length < 400, length > 400)
    # left lengths of exactly 200 or 400 unhandled, raising a NameError on
    # `phrases`; the elif/else chain below covers all cases.
    if length < 200:
        # Get single highest-ranked key term
        phrases = {phrase.text for phrase in doc._.phrases[:1]}
    elif length < 400:
        # Create unique set from top 2 ranked phrases
        phrases = {phrase.text for phrase in doc._.phrases[:2]}
    else:
        # Create unique set from top 3 ranked phrases
        phrases = {phrase.text for phrase in doc._.phrases[:3]}

    return list(phrases)
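A minimal usage sketch, not part of the commit: it assumes `length` is the character count of the input text, which matches the 200/400 thresholds in extract_terms, and the `summary` string and printed output are purely illustrative.

# Hypothetical caller: label a summary with its top-ranked key terms
summary = (
    "The patient reports persistent headaches and occasional dizziness. "
    "An MRI scan was scheduled to rule out any underlying neurological causes."
)
terms = extract_terms(summary, len(summary))
print(terms)  # e.g. ['mri scan'] -- a short text (< 200 chars) yields one term

Because the scrubber returns span.lemma_, phrases that differ only in number ("MRI scans" vs. "MRI scan") collapse to a single lemmatized entry before the set deduplicates them.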