prashant committed on
Commit
53e0cf4
·
1 Parent(s): c8b3108

keyword extraction update

Browse files
paramconfig.cfg CHANGED
@@ -33,4 +33,3 @@ SPLIT_OVERLAP_SENTENCE = 1
33
 
34
  [tfidf]
35
  TOP_N = 20
36
- TEXTRANK_WORDS = 20
 
33
 
34
  [tfidf]
35
  TOP_N = 20
 
utils/keyword_extraction.py CHANGED
@@ -27,14 +27,31 @@ except Exception:
27
 
28
 
29
  def sort_coo(coo_matrix):
 
 
 
 
30
  tuples = zip(coo_matrix.col, coo_matrix.data)
31
  return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)
32
 
33
- def extract_topn_from_vector(feature_names, sorted_items, topn=10):
34
- """get the feature names and tf-idf score of top n items"""
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
  #use only topn items from vector
37
- sorted_items = sorted_items[:topn]
38
  score_vals = []
39
  feature_vals = []
40
 
@@ -53,6 +70,20 @@ def extract_topn_from_vector(feature_names, sorted_items, topn=10):
53
  return results
54
 
55
  def keywordExtraction(sdg:int,sdgdata:List[Text]):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  model_path = "docStore/sdg{}/".format(sdg)
57
  vectorizer = pickle.load(open(model_path+'vectorizer.pkl', 'rb'))
58
  tfidfmodel = pickle.load(open(model_path+'tfidfmodel.pkl', 'rb'))
@@ -64,7 +95,21 @@ def keywordExtraction(sdg:int,sdgdata:List[Text]):
64
  keywords = [keyword for keyword in results]
65
  return keywords
66
 
67
- def textrank(textdata, ratio = 0.1, words = 0):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  if words == 0:
69
  try:
70
  words = int(config.get('sdg','TOP_KEY'))
 
27
 
28
 
29
  def sort_coo(coo_matrix):
30
+ """
31
+ Takes a scipy sparse matrix in Coordinate (COO) format and extracts info from it.\
32
+ 1. https://kavita-ganesan.com/python-keyword-extraction/#.Y2-TFHbMJPb
33
+ """
34
  tuples = zip(coo_matrix.col, coo_matrix.data)
35
  return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)
36
 
37
+ def extract_topn_from_vector(feature_names, sorted_items, top_n=10):
38
+ """get the feature names and tf-idf score of top n items
39
+
40
+ Params
41
+ ---------
42
+ feature_names: list of words from vectorizer
43
+ sorted_items: tuple returned by sort_coo function defined in \
44
+ keyword_extraction.py
45
+ top_n: number of top words to be extracted using tfidf
46
+
47
+ Return
48
+ ----------
49
+ results: top extracted keywords
50
+
51
+ """
52
 
53
  #use only topn items from vector
54
+ sorted_items = sorted_items[:top_n]
55
  score_vals = []
56
  feature_vals = []
57
 
 
70
  return results
71
 
72
  def keywordExtraction(sdg:int,sdgdata:List[Text]):
73
+ """
74
+ TFIDF based keywords extraction
75
+
76
+ Params
77
+ ---------
78
+ sdg: which sdg tfidf model to be used
79
+ sdgdata: text data from which keywords are to be extracted
80
+
81
+
82
+ Return
83
+ ----------
84
+ keywords: top extracted keywords
85
+
86
+ """
87
  model_path = "docStore/sdg{}/".format(sdg)
88
  vectorizer = pickle.load(open(model_path+'vectorizer.pkl', 'rb'))
89
  tfidfmodel = pickle.load(open(model_path+'tfidfmodel.pkl', 'rb'))
 
95
  keywords = [keyword for keyword in results]
96
  return keywords
97
 
98
+ def textrank(textdata:Text, ratio:float = 0.1, words = 0):
99
+ """
100
+ wrapper function to perform textrank, uses either ratio or wordcount to
101
+ extract top keywords limited by words or ratio.
102
+
103
+ Params
104
+ --------
105
+ textdata: text data to perform the textrank.
106
+ ratio: float to limit the number of keywords as proportion of total token \
107
+ in textdata
108
+ words: number of keywords to be extracted. Takes priority over ratio if \
109
+ non-zero. However, in case the pagerank returns fewer keywords than \
110
+ the fixed value, then ratio is used.
111
+
112
+ """
113
  if words == 0:
114
  try:
115
  words = int(config.get('sdg','TOP_KEY'))
utils/uploadAndExample.py CHANGED
@@ -31,11 +31,3 @@ def add_upload(choice):
31
  file_name = file_path = files[option]
32
  st.session_state['filename'] = file_name
33
  st.session_state['filepath'] = file_path
34
- # if option is 'South Africa:Low Emission strategy':
35
- # file_name = file_path = 'docStore/sample/South Africa_s Low Emission Development Strategy.txt'
36
- # st.session_state['filename'] = file_name
37
- # st.session_state['filepath'] = file_path
38
- # else:
39
- # file_name = file_path = 'docStore/sample/Ethiopia_s_2021_10 Year Development Plan.txt'
40
- # st.session_state['filename'] = file_name
41
- # st.session_state['filepath'] = file_path
 
31
  file_name = file_path = files[option]
32
  st.session_state['filename'] = file_name
33
  st.session_state['filepath'] = file_path