rdose committed on
Commit 01c98ec · 1 Parent(s): 7660882

Update app.py

Files changed (1)
  1. app.py +92 -5
app.py CHANGED
@@ -13,6 +13,21 @@ from transformers import pipeline
 import itertools
 import pandas as pd
 
+from bertopic import BERTopic
+from huggingface_hub import hf_hub_url, cached_download
+
+import nltk
+nltk.download('stopwords')
+nltk.download('wordnet')
+nltk.download('omw-1.4')
+from nltk.corpus import stopwords
+from nltk.stem import WordNetLemmatizer
+from nltk.stem import PorterStemmer
+
+from unicodedata import normalize
+
+import re
+
 
 OUT_HEADERS = ['E','S','G']
 DF_SP500 = pd.read_csv('SP500_constituents.zip',compression=dict(method='zip'))
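
Note: the three nltk.download(...) calls re-run on every app start. They are cheap no-ops once the corpora are cached, but a quiet variant keeps the Space logs cleaner; a minimal sketch, assuming default NLTK data paths:

import nltk
for pkg in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(pkg, quiet=True)  # no-op when already cached; quiet=True suppresses the per-package log line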
@@ -20,11 +35,82 @@ DF_SP500 = pd.read_csv('SP500_constituents.zip',compression=dict(method='zip'))
 MODEL_TRANSFORMER_BASED = "distilbert-base-uncased"
 MODEL_ONNX_FNAME = "ESG_classifier_batch.onnx"
 MODEL_SENTIMENT_ANALYSIS = "ProsusAI/finbert"
-#MODEL_SUMMARY_PEGASUS = "oMateos2020/pegasus-newsroom-cnn_full-adafactor-bs6"
 
 
-#API_HF_SENTIMENT_URL = "https://api-inference.huggingface.co/models/cardiffnlp/twitter-roberta-base-sentiment"
+BERTOPIC_REPO_ID = "oMateos2020/BERTopic-paraphrase-MiniLM-L3-v2-51topics-guided-model3"
+BERTOPIC_FILENAME = "BERTopic-paraphrase-MiniLM-L3-v2-51topics-guided-model3"
+bertopic_model = BERTopic.load(cached_download(hf_hub_url(BERTOPIC_REPO_ID, BERTOPIC_FILENAME)), embedding_model="paraphrase-MiniLM-L3-v2")
+
+def _topic_sanitize_word(text):
+    """Performs a first cleaning/normalisation pass over the text using regular expressions."""
+    text = re.sub(r'@[\w_]+|#[\w_]+|https?://[\w_./]+', '', text)  # strip mentions and URLs; more of a tweet concern, but kept in case any appear in web articles
+    text = re.sub(r'\S*@\S*\s?', '', text)  # strip e-mail addresses
+    text = re.sub(r'\((\d+)\)', '', text)  # strip numbers in parentheses
+    text = re.sub(r'^\d+', '', text)  # strip leading bare numbers
+    text = re.sub(r'\n', '', text)  # strip line breaks
+    text = re.sub(r'\s+', ' ', text)  # collapse extra whitespace
+    text = re.sub(r'[“”]', '', text)  # strip quotation characters
+    text = re.sub(r'[()]', '', text)  # strip parentheses
+    text = re.sub(r'\.', '', text)  # strip periods
+    text = re.sub(r',', '', text)  # strip commas
+    text = re.sub(r'’s', '', text)  # strip possessives
+    #text = re.sub(r'-+', '', text)  # drop hyphens to merge compound words (would normalise variants such as ex-mujer/exmujer to a single form)
+    text = re.sub(r'\.{3}', ' ', text)  # replace ellipses
+    # This regex was added by hand after seeing some examples needed it
+    text = re.sub(r"([\.\?])", r"\1 ", text)  # insert a space after periods and question marks
+    # -> NFD (Normalization Form Canonical Decomposition), then drop diacritics
+    text = re.sub(r"([^n\u0300-\u036f]|n(?!\u0303(?![\u0300-\u036f])))[\u0300-\u036f]+", r"\1",
+                  normalize("NFD", text), 0, re.I)  # reduce accented character variants to their plain form, keeping 'ñ'
+    # -> NFC (Normalization Form Canonical Composition)
+    text = normalize('NFC', text)
+
+    return text.lower().strip()
+
+def _topic_clean_text(text, lemmatize=True, stem=True):
+    words = text.split()
+    non_stopwords = [word for word in words if word not in stopwords.words('english')]
+    clean_text = [_topic_sanitize_word(word) for word in non_stopwords]
+    if lemmatize:
+        lemmatizer = WordNetLemmatizer()
+        clean_text = [lemmatizer.lemmatize(word) for word in clean_text]
+    if stem:
+        ps = PorterStemmer()
+        clean_text = [ps.stem(word) for word in clean_text]
+
+    return ' '.join(clean_text).strip()
+
+
+#SECTOR_LIST = list(DF_SP500.Sector.unique())
+SECTOR_LIST = ['Industry',
+               'Health',
+               'Technology',
+               'Communication',
+               'Consumer Staples',
+               'Consumer Discretionary',
+               'Utilities',
+               'Financials',
+               'Materials',
+               'Real Estate',
+               'Energy']
+
+SECTOR_TOPICS = []
+for sector in SECTOR_LIST:
+    topics, _ = bertopic_model.find_topics(_topic_clean_text(sector), top_n=5)
+    SECTOR_TOPICS.append(topics)
+
+def _topic2sector(pred_topics):
+    out = []
+    for pred_topic in pred_topics:
+        relevant_sectors = []
+        for i in range(len(SECTOR_LIST)):
+            if pred_topic in SECTOR_TOPICS[i]:
+                relevant_sectors.append(list(DF_SP500.Sector.unique())[i])
+        out.append(relevant_sectors)
+    return out
+
+def _inference_topic_match(text):
+    out, _ = bertopic_model.transform([_topic_clean_text(t) for t in text])
+    return out
 
 def get_company_sectors(extracted_names, threshold=0.95):
     '''
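
Note: a quick sanity check of the new cleaning helpers. The input is a made-up headline and the output shown is a hand-written assumption of roughly what stopword removal, lemmatisation and Porter stemming produce, not a captured run:

print(_topic_clean_text("Apple’s new data center cuts emissions by 30%"))
# expected along the lines of: 'appl new data center cut emiss 30%'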
@@ -198,12 +284,12 @@ def inference(input_batch,isurl,use_archive,limit_companies=10):
     prob_outs = _inference_classifier(input_batch_content)
     print("[i] Classifier output shape:",prob_outs.shape)
     print("[i] Running sentiment using",MODEL_SENTIMENT_ANALYSIS ,"inference...")
-    #sentiment = _inference_sentiment_model_via_api_query({"inputs": extracted['content']})
     sentiment = _inference_sentiment_model_pipeline(input_batch_content )
     print("[i] Running NER using custom spancat inference...")
-    #summary = _inference_summary_model_pipeline(input_batch_content )[0]['generated_text']
     ner_labels = _inference_ner_spancat(input_batch_content ,limit_outputs=limit_companies)
-    print(ner_labels)
+    print("[i] BERTopic...")
+    topics = _inference_topic_match(input_batch_content)
+
     df = pd.DataFrame(prob_outs,columns =['E','S','G'])
     if isurl:
         df['URL'] = url_list
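
Note: for reference, _inference_topic_match returns one BERTopic topic id per input text (BERTopic uses -1 for outlier documents), and _topic2sector widens that into a list of matching sector names per text. A hedged sketch of the expected shapes, with invented inputs and illustrative ids:

topics = _inference_topic_match(["Solar farm output hits record", "Bank lifts dividend"])
# topics -> e.g. [12, 7]  (plain ints, one per input text; -1 would mark an outlier)
print(_topic2sector(topics))
# -> e.g. [['Energy'], ['Financials']]  (empty inner list when no sector topic matches)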
@@ -211,6 +297,7 @@ def inference(input_batch,isurl,use_archive,limit_companies=10):
     df['content_id'] = range(1, len(input_batch_r)+1)
     df['sent_lbl'] = [d['label'] for d in sentiment ]
     df['sent_score'] = [d['score'] for d in sentiment ]
+    df['sector_pred'] = pd.DataFrame(_topic2sector(topics)).iloc[:, 0]
     print("[i] Pandas output shape:",df.shape)
 
     #[[], [('Nvidia', 'Information Technology')], [('Twitter', 'Communication Services'), ('Apple', 'Information Technology')], [], [], [], [], [], []]
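
Note: one detail on the new sector_pred column: pd.DataFrame over the ragged lists from _topic2sector pads short rows with None, so .iloc[:, 0] keeps only the first matched sector and leaves a missing value where nothing matched (and if every row were empty, the frame would have no columns and .iloc[:, 0] would raise an IndexError). A small illustration with made-up matches:

import pandas as pd
matches = [['Energy', 'Utilities'], [], ['Financials']]
print(pd.DataFrame(matches).iloc[:, 0])
# 0        Energy
# 1          None
# 2    Financials
# secondary matches such as 'Utilities' are dropped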
 