rdose committed
Commit 0b55d23 · 1 Parent(s): 2b1c9f1

Update app.py

Files changed (1):
  1. app.py +88 -88

app.py CHANGED
@@ -13,20 +13,20 @@ from transformers import pipeline
  import itertools
  import pandas as pd
 
- from bertopic import BERTopic
- from huggingface_hub import hf_hub_url, cached_download
+ # from bertopic import BERTopic
+ # from huggingface_hub import hf_hub_url, cached_download
 
- import nltk
- nltk.download('stopwords')
- nltk.download('wordnet')
- nltk.download('omw-1.4')
- from nltk.corpus import stopwords
- from nltk.stem import WordNetLemmatizer
- from nltk.stem import PorterStemmer
+ # import nltk
+ # nltk.download('stopwords')
+ # nltk.download('wordnet')
+ # nltk.download('omw-1.4')
+ # from nltk.corpus import stopwords
+ # from nltk.stem import WordNetLemmatizer
+ # from nltk.stem import PorterStemmer
 
- from unicodedata import normalize
+ # from unicodedata import normalize
 
- import re
+ # import re
 
 
  OUT_HEADERS = ['E','S','G']
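Note on the imports commented out above: newer releases of `huggingface_hub` deprecate (and eventually remove) the `cached_download` helper in favor of `hf_hub_download`. If the BERTopic path is ever revived, a minimal sketch of an equivalent model load, assuming the same repo and filename constants as in the block commented out below, could look like:

```python
from bertopic import BERTopic
from huggingface_hub import hf_hub_download  # replaces the hf_hub_url + cached_download pair

# Same repo/filename as the commented-out BERTOPIC_* constants below.
model_path = hf_hub_download(
    repo_id="oMateos2020/BERTopic-paraphrase-MiniLM-L3-v2-51topics-guided-model3",
    filename="BERTopic-paraphrase-MiniLM-L3-v2-51topics-guided-model3",
)
bertopic_model = BERTopic.load(model_path, embedding_model="paraphrase-MiniLM-L3-v2")
```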
@@ -37,80 +37,80 @@ MODEL_ONNX_FNAME = "ESG_classifier_batch.onnx"
  MODEL_SENTIMENT_ANALYSIS = "ProsusAI/finbert"
 
 
- BERTOPIC_REPO_ID = "oMateos2020/BERTopic-paraphrase-MiniLM-L3-v2-51topics-guided-model3"
- BERTOPIC_FILENAME = "BERTopic-paraphrase-MiniLM-L3-v2-51topics-guided-model3"
- bertopic_model = BERTopic.load(cached_download(hf_hub_url(BERTOPIC_REPO_ID, BERTOPIC_FILENAME)), embedding_model="paraphrase-MiniLM-L3-v2")
-
- def _topic_sanitize_word(text):
-     """First cleaning/normalization pass over the text using regular expressions"""
-     text = re.sub(r'@[\w_]+|#[\w_]+|https?://[\w_./]+', '', text)  # Remove mentions, hashtags and URLs; more of a tweet concern, but kept in case the web reviews contain any
-     text = re.sub(r'\S*@\S*\s?', '', text)  # Remove email addresses
-     text = re.sub(r'\((\d+)\)', '', text)  # Remove numbers in parentheses
-     text = re.sub(r'^\d+', '', text)  # Remove leading stray numbers
-     text = re.sub(r'\n', '', text)  # Remove line breaks
-     text = re.sub(r'\s+', ' ', text)  # Collapse extra whitespace
-     text = re.sub(r'[“”]', '', text)  # Remove curly quote characters
-     text = re.sub(r'[()]', '', text)  # Remove parentheses
-     text = re.sub(r'\.', '', text)  # Remove periods
-     text = re.sub(r'\,', '', text)  # Remove commas
-     text = re.sub(r'’s', '', text)  # Remove possessives
-     #text = re.sub(r'-+', '', text)  # Strip hyphens to join compound words (would normalize some cases, e.g. 'exmujer' and 'ex-mujer' both to 'exmujer')
-     text = re.sub(r'\.{3}', ' ', text)  # Replace ellipses
-     # This regex was added "by hand" after seeing it was needed for some examples
-     text = re.sub(r"([\.\?])", r"\1 ", text)  # Insert a space after periods and question marks
-     # -> NFD (Normalization Form Canonical Decomposition) and strip diacritics
-     text = re.sub(r"([^n\u0300-\u036f]|n(?!\u0303(?![\u0300-\u036f])))[\u0300-\u036f]+", r"\1",
-                   normalize("NFD", text), 0, re.I)  # Strip diacritics (accented character variants reduced to their plain form, except 'ñ')
-     # -> NFC (Normalization Form Canonical Composition)
-     text = normalize('NFC', text)
-
-     return text.lower().strip()
-
- def _topic_clean_text(text, lemmatize=True, stem=True):
-     words = text.split()
-     non_stopwords = [word for word in words if word not in stopwords.words('english')]
-     clean_text = [_topic_sanitize_word(word) for word in non_stopwords]
-     if lemmatize:
-         lemmatizer = WordNetLemmatizer()
-         clean_text = [lemmatizer.lemmatize(word) for word in clean_text]
-     if stem:
-         ps = PorterStemmer()
-         clean_text = [ps.stem(word) for word in clean_text]
-
-     return ' '.join(clean_text).strip()
-
-
- #SECTOR_LIST = list(DF_SP500.Sector.unique())
- SECTOR_LIST = ['Industry',
-                'Health',
-                'Technology',
-                'Communication',
-                'Consumer Staples',
-                'Consumer Discretionary',
-                'Utilities',
-                'Financials',
-                'Materials',
-                'Real Estate',
-                'Energy']
-
- SECTOR_TOPICS = []
- for sector in SECTOR_LIST:
-     topics, _ = bertopic_model.find_topics(_topic_clean_text(sector), top_n=5)
-     SECTOR_TOPICS.append(topics)
-
- def _topic2sector(pred_topics):
-     out = []
-     for pred_topic in pred_topics:
-         relevant_sectors = []
-         for i in range(len(SECTOR_LIST)):
-             if pred_topic in SECTOR_TOPICS[i]:
-                 relevant_sectors.append(list(DF_SP500.Sector.unique())[i])
-         out.append(relevant_sectors)
-     return out
-
- def _inference_topic_match(text):
-     out, _ = bertopic_model.transform([_topic_clean_text(t) for t in text])
-     return out
+ # BERTOPIC_REPO_ID = "oMateos2020/BERTopic-paraphrase-MiniLM-L3-v2-51topics-guided-model3"
+ # BERTOPIC_FILENAME = "BERTopic-paraphrase-MiniLM-L3-v2-51topics-guided-model3"
+ # bertopic_model = BERTopic.load(cached_download(hf_hub_url(BERTOPIC_REPO_ID, BERTOPIC_FILENAME)), embedding_model="paraphrase-MiniLM-L3-v2")
+
+ # def _topic_sanitize_word(text):
+ #     """First cleaning/normalization pass over the text using regular expressions"""
+ #     text = re.sub(r'@[\w_]+|#[\w_]+|https?://[\w_./]+', '', text)  # Remove mentions, hashtags and URLs; more of a tweet concern, but kept in case the web reviews contain any
+ #     text = re.sub(r'\S*@\S*\s?', '', text)  # Remove email addresses
+ #     text = re.sub(r'\((\d+)\)', '', text)  # Remove numbers in parentheses
+ #     text = re.sub(r'^\d+', '', text)  # Remove leading stray numbers
+ #     text = re.sub(r'\n', '', text)  # Remove line breaks
+ #     text = re.sub(r'\s+', ' ', text)  # Collapse extra whitespace
+ #     text = re.sub(r'[“”]', '', text)  # Remove curly quote characters
+ #     text = re.sub(r'[()]', '', text)  # Remove parentheses
+ #     text = re.sub(r'\.', '', text)  # Remove periods
+ #     text = re.sub(r'\,', '', text)  # Remove commas
+ #     text = re.sub(r'’s', '', text)  # Remove possessives
+ #     #text = re.sub(r'-+', '', text)  # Strip hyphens to join compound words (would normalize some cases, e.g. 'exmujer' and 'ex-mujer' both to 'exmujer')
+ #     text = re.sub(r'\.{3}', ' ', text)  # Replace ellipses
+ #     # This regex was added "by hand" after seeing it was needed for some examples
+ #     text = re.sub(r"([\.\?])", r"\1 ", text)  # Insert a space after periods and question marks
+ #     # -> NFD (Normalization Form Canonical Decomposition) and strip diacritics
+ #     text = re.sub(r"([^n\u0300-\u036f]|n(?!\u0303(?![\u0300-\u036f])))[\u0300-\u036f]+", r"\1",
+ #                   normalize("NFD", text), 0, re.I)  # Strip diacritics (accented character variants reduced to their plain form, except 'ñ')
+ #     # -> NFC (Normalization Form Canonical Composition)
+ #     text = normalize('NFC', text)
+
+ #     return text.lower().strip()
+
+ # def _topic_clean_text(text, lemmatize=True, stem=True):
+ #     words = text.split()
+ #     non_stopwords = [word for word in words if word not in stopwords.words('english')]
+ #     clean_text = [_topic_sanitize_word(word) for word in non_stopwords]
+ #     if lemmatize:
+ #         lemmatizer = WordNetLemmatizer()
+ #         clean_text = [lemmatizer.lemmatize(word) for word in clean_text]
+ #     if stem:
+ #         ps = PorterStemmer()
+ #         clean_text = [ps.stem(word) for word in clean_text]
+
+ #     return ' '.join(clean_text).strip()
+
+
+ # #SECTOR_LIST = list(DF_SP500.Sector.unique())
+ # SECTOR_LIST = ['Industry',
+ #                'Health',
+ #                'Technology',
+ #                'Communication',
+ #                'Consumer Staples',
+ #                'Consumer Discretionary',
+ #                'Utilities',
+ #                'Financials',
+ #                'Materials',
+ #                'Real Estate',
+ #                'Energy']
+
+ # SECTOR_TOPICS = []
+ # for sector in SECTOR_LIST:
+ #     topics, _ = bertopic_model.find_topics(_topic_clean_text(sector), top_n=5)
+ #     SECTOR_TOPICS.append(topics)
+
+ # def _topic2sector(pred_topics):
+ #     out = []
+ #     for pred_topic in pred_topics:
+ #         relevant_sectors = []
+ #         for i in range(len(SECTOR_LIST)):
+ #             if pred_topic in SECTOR_TOPICS[i]:
+ #                 relevant_sectors.append(list(DF_SP500.Sector.unique())[i])
+ #         out.append(relevant_sectors)
+ #     return out
+
+ # def _inference_topic_match(text):
+ #     out, _ = bertopic_model.transform([_topic_clean_text(t) for t in text])
+ #     return out
 
  def get_company_sectors(extracted_names, threshold=0.95):
      '''
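The least obvious step in `_topic_sanitize_word` above is the NFD/NFC round-trip used to strip accents while keeping 'ñ'. A minimal standalone sketch of the same idiom (the example string is illustrative, not from the app):

```python
import re
from unicodedata import normalize

def strip_diacritics(text):
    # NFD decomposes accented characters into base char + combining marks
    # (U+0300..U+036F). Drop the marks, except the tilde that forms 'ñ',
    # then NFC recomposes whatever survived.
    decomposed = normalize("NFD", text)
    stripped = re.sub(
        r"([^n\u0300-\u036f]|n(?!\u0303(?![\u0300-\u036f])))[\u0300-\u036f]+",
        r"\1", decomposed, 0, re.I)
    return normalize("NFC", stripped)

print(strip_diacritics("crítica económica de mañana"))
# -> "critica economica de mañana"  ('í'/'ó' lose accents, 'ñ' survives)
```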
@@ -287,8 +287,8 @@ def inference(input_batch,isurl,use_archive,limit_companies=10):
      sentiment = _inference_sentiment_model_pipeline(input_batch_content)
      print("[i] Running NER using custom spancat inference...")
      ner_labels = _inference_ner_spancat(input_batch_content, limit_outputs=limit_companies)
-     print("[i] BERTopic...")
-     topics = _inference_topic_match(input_batch_content)
+     # print("[i] BERTopic...")
+     # topics = _inference_topic_match(input_batch_content)
 
      df = pd.DataFrame(prob_outs, columns=['E','S','G'])
      if isurl:
@@ -297,7 +297,7 @@ def inference(input_batch,isurl,use_archive,limit_companies=10):
      df['content_id'] = range(1, len(input_batch_r)+1)
      df['sent_lbl'] = [d['label'] for d in sentiment]
      df['sent_score'] = [d['score'] for d in sentiment]
-     df['sector_pred'] = pd.DataFrame(_topic2sector(topics)).iloc[:, 0]
+     # df['sector_pred'] = pd.DataFrame(_topic2sector(topics)).iloc[:, 0]
      print("[i] Pandas output shape:", df.shape)
 
      #[[], [('Nvidia', 'Information Technology')], [('Twitter', 'Communication Services'), ('Apple', 'Information Technology')], [], [], [], [], [], []]
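For context on the `df['sector_pred']` line commented out in the last hunk: `_topic2sector` returns one possibly-empty list of matching sectors per document, and `pd.DataFrame(...).iloc[:, 0]` keeps only the first match per row. A small illustration with made-up sector lists:

```python
import pandas as pd

# Hypothetical _topic2sector output for three documents.
topic_sectors = [[], ['Information Technology'], ['Communication Services', 'Utilities']]

# Ragged lists become None/NaN-padded columns; .iloc[:, 0] keeps the first match.
sector_pred = pd.DataFrame(topic_sectors).iloc[:, 0]
print(sector_pred.tolist())
# -> [None, 'Information Technology', 'Communication Services']
```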
 