benjolo committed on
Commit
9828ebf
1 Parent(s): 4167c02

Adding key term labeller

Files changed (1)
  1. backend/utils/text_rank.py +40 -0
backend/utils/text_rank.py ADDED
@@ -0,0 +1,40 @@
+ import spacy
+ import pytextrank  # registers the "textrank" pipeline component factory
+ from spacy.tokens import Span
+
+ # Register a scrubber that lemmatizes extracted phrases, collapsing plural/singular variants
+ @spacy.registry.misc("plural_scrubber")
+ def plural_scrubber():
+     def scrubber_func(span: Span) -> str:
+         return span.lemma_
+     return scrubber_func
+
+
+ # Load a spaCy model
+ nlp = spacy.load("en_core_web_lg")
+
+
+ # Exclude additional conversational stopwords
+ nlp.Defaults.stop_words |= {"okay", "like"}
+
+ # Add TextRank component to pipeline with stopwords and scrubber
+ nlp.add_pipe("textrank", config={
+     "stopwords": {token: ["NOUN"] for token in nlp.Defaults.stop_words},
+     "scrubber": {"@misc": "plural_scrubber"}})
+
+
+ def extract_terms(text, length):
+     # Perform key term extraction on overall summary and segment summaries
+     doc = nlp(text)
+
+     if length < 200:
+         # Get single highest-ranked key term
+         phrases = {phrase.text for phrase in doc._.phrases[:1]}
+     elif length < 400:
+         # Create unique set from top 2 ranked phrases
+         phrases = {phrase.text for phrase in doc._.phrases[:2]}
+     else:
+         # Create unique set from top 3 ranked phrases
+         phrases = {phrase.text for phrase in doc._.phrases[:3]}
+
+     return list(phrases)
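A minimal usage sketch of the new helper (illustrative only, not part of this commit; it assumes `length` is the character count of the input text, which is how the thresholds above read, and the example output is hypothetical):

    from backend.utils.text_rank import extract_terms

    summary = "The committee approved the new budget proposals after several rounds of review."
    key_terms = extract_terms(summary, length=len(summary))
    print(key_terms)  # e.g. ["budget proposal"] for a short text (< 200 chars)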