Spaces: Build error
Update app.py
app.py CHANGED
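This commit works around the build error by commenting out the whole BERTopic topic-modelling path: the BERTopic / huggingface_hub / NLTK imports, the topic-model download, the text-cleaning helpers, the topic-to-sector mapping, and the sector_pred output column.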
@@ -13,20 +13,20 @@ from transformers import pipeline
 import itertools
 import pandas as pd
 
-from bertopic import BERTopic
-from huggingface_hub import hf_hub_url, cached_download
+# from bertopic import BERTopic
+# from huggingface_hub import hf_hub_url, cached_download
 
-import nltk
-nltk.download('stopwords')
-nltk.download('wordnet')
-nltk.download('omw-1.4')
-from nltk.corpus import stopwords
-from nltk.stem import WordNetLemmatizer
-from nltk.stem import PorterStemmer
+# import nltk
+# nltk.download('stopwords')
+# nltk.download('wordnet')
+# nltk.download('omw-1.4')
+# from nltk.corpus import stopwords
+# from nltk.stem import WordNetLemmatizer
+# from nltk.stem import PorterStemmer
 
-from unicodedata import normalize
+# from unicodedata import normalize
 
-import re
+# import re
 
 
 OUT_HEADERS = ['E','S','G']
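Note: the commented cached_download import is a plausible culprit for the build error, since cached_download and the hf_hub_url download pattern were deprecated and later removed from huggingface_hub. If that is the cause, a sketch of an alternative to disabling the feature (not the author's fix; it assumes the pickled topic model still loads under the installed BERTopic version) would be the maintained hf_hub_download API:

    from bertopic import BERTopic
    from huggingface_hub import hf_hub_download  # maintained replacement for hf_hub_url + cached_download

    # hf_hub_download fetches the file into the local cache and returns its filesystem path
    model_path = hf_hub_download(repo_id=BERTOPIC_REPO_ID, filename=BERTOPIC_FILENAME)
    bertopic_model = BERTopic.load(model_path, embedding_model="paraphrase-MiniLM-L3-v2")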
@@ -37,80 +37,80 @@ MODEL_ONNX_FNAME = "ESG_classifier_batch.onnx"
 MODEL_SENTIMENT_ANALYSIS = "ProsusAI/finbert"
 
 
-BERTOPIC_REPO_ID = "oMateos2020/BERTopic-paraphrase-MiniLM-L3-v2-51topics-guided-model3"
-BERTOPIC_FILENAME = "BERTopic-paraphrase-MiniLM-L3-v2-51topics-guided-model3"
-bertopic_model = BERTopic.load(cached_download(hf_hub_url(BERTOPIC_REPO_ID , BERTOPIC_FILENAME )), embedding_model="paraphrase-MiniLM-L3-v2")
-
-def _topic_sanitize_word(text):
-    """Performs a first cleaning/normalization pass over the text using regular expressions"""
-    text = re.sub(r'@[\w_]+|#[\w_]+|https?://[\w_./]+', '', text) # Removes mentions and URLs; more of a tweet concern, but kept in case a mention or URL appears in the web reviews
-    text = re.sub('\S*@\S*\s?', '', text) # Removes email addresses
-    text = re.sub(r'\((\d+)\)', '', text) # Removes numbers in parentheses
-    text = re.sub(r'^\d+', '', text) # Removes stray numbers
-    text = re.sub(r'\n', '', text) # Removes line breaks
-    text = re.sub('\s+', ' ', text) # Removes extra whitespace
-    text = re.sub(r'[“”]', '', text) # Removes quotation characters
-    text = re.sub(r'[()]', '', text) # Removes parentheses
-    text = re.sub('\.', '', text) # Removes periods
-    text = re.sub('\,', '', text) # Removes commas
-    text = re.sub('’s', '', text) # Removes possessives
-    #text = re.sub(r'-+', '', text) # Removes hyphens to join compound words (would normalize some cases, exmujer and ex-mujer, all to exmujer)
-    text = re.sub(r'\.{3}', ' ', text) # Replaces ellipses
-    # This regex was added "by hand" after seeing it was needed for some examples
-    text = re.sub(r"([\.\?])", r"\1 ", text) # Inserts a space after periods and question marks
-    # -> NFD (Normalization Form Canonical Decomposition) and strip diacritics
-    text = re.sub(r"([^n\u0300-\u036f]|n(?!\u0303(?![\u0300-\u036f])))[\u0300-\u036f]+", r"\1",
-                  normalize( "NFD", text), 0, re.I) # Removes diacritics (accented character variants reduced to their simple form, except 'ñ')
-    # -> NFC (Normalization Form Canonical Composition)
-    text = normalize( 'NFC', text)
-
-    return text.lower().strip()
-
-def _topic_clean_text(text, lemmatize=True, stem=True):
-    words = text.split()
-    non_stopwords = [word for word in words if word not in stopwords.words('english')]
-    clean_text = [_topic_sanitize_word(word) for word in non_stopwords]
-    if lemmatize:
-        lemmatizer = WordNetLemmatizer()
-        clean_text = [lemmatizer.lemmatize(word) for word in clean_text]
-    if stem:
-        ps =PorterStemmer()
-        clean_text = [ps.stem(word) for word in clean_text]
-
-    return ' '.join(clean_text).strip()
-
-
-#SECTOR_LIST = list(DF_SP500.Sector.unique())
-SECTOR_LIST = ['Industry',
-               'Health',
-               'Technology',
-               'Communication',
-               'Consumer Staples',
-               'Consumer Discretionary',
-               'Utilities',
-               'Financials',
-               'Materials',
-               'Real Estate',
-               'Energy']
-
-SECTOR_TOPICS = []
-for sector in SECTOR_LIST:
-    topics, _ = bertopic_model.find_topics(_topic_clean_text(sector), top_n=5)
-    SECTOR_TOPICS.append(topics)
-
-def _topic2sector(pred_topics):
-    out = []
-    for pred_topic in pred_topics:
-        relevant_sectors = []
-        for i in range(len(SECTOR_LIST)):
-            if pred_topic in SECTOR_TOPICS[i]:
-                relevant_sectors.append(list(DF_SP500.Sector.unique())[i])
-        out.append(relevant_sectors)
-    return out
-
-def _inference_topic_match(text):
-    out, _ = bertopic_model.transform([_topic_clean_text(t) for t in text])
-    return out
+# BERTOPIC_REPO_ID = "oMateos2020/BERTopic-paraphrase-MiniLM-L3-v2-51topics-guided-model3"
+# BERTOPIC_FILENAME = "BERTopic-paraphrase-MiniLM-L3-v2-51topics-guided-model3"
+# bertopic_model = BERTopic.load(cached_download(hf_hub_url(BERTOPIC_REPO_ID , BERTOPIC_FILENAME )), embedding_model="paraphrase-MiniLM-L3-v2")
+
+# def _topic_sanitize_word(text):
+#     """Performs a first cleaning/normalization pass over the text using regular expressions"""
+#     text = re.sub(r'@[\w_]+|#[\w_]+|https?://[\w_./]+', '', text) # Removes mentions and URLs; more of a tweet concern, but kept in case a mention or URL appears in the web reviews
+#     text = re.sub('\S*@\S*\s?', '', text) # Removes email addresses
+#     text = re.sub(r'\((\d+)\)', '', text) # Removes numbers in parentheses
+#     text = re.sub(r'^\d+', '', text) # Removes stray numbers
+#     text = re.sub(r'\n', '', text) # Removes line breaks
+#     text = re.sub('\s+', ' ', text) # Removes extra whitespace
+#     text = re.sub(r'[“”]', '', text) # Removes quotation characters
+#     text = re.sub(r'[()]', '', text) # Removes parentheses
+#     text = re.sub('\.', '', text) # Removes periods
+#     text = re.sub('\,', '', text) # Removes commas
+#     text = re.sub('’s', '', text) # Removes possessives
+#     #text = re.sub(r'-+', '', text) # Removes hyphens to join compound words (would normalize some cases, exmujer and ex-mujer, all to exmujer)
+#     text = re.sub(r'\.{3}', ' ', text) # Replaces ellipses
+#     # This regex was added "by hand" after seeing it was needed for some examples
+#     text = re.sub(r"([\.\?])", r"\1 ", text) # Inserts a space after periods and question marks
+#     # -> NFD (Normalization Form Canonical Decomposition) and strip diacritics
+#     text = re.sub(r"([^n\u0300-\u036f]|n(?!\u0303(?![\u0300-\u036f])))[\u0300-\u036f]+", r"\1",
+#                   normalize( "NFD", text), 0, re.I) # Removes diacritics (accented character variants reduced to their simple form, except 'ñ')
+#     # -> NFC (Normalization Form Canonical Composition)
+#     text = normalize( 'NFC', text)
+
+#     return text.lower().strip()
+
+# def _topic_clean_text(text, lemmatize=True, stem=True):
+#     words = text.split()
+#     non_stopwords = [word for word in words if word not in stopwords.words('english')]
+#     clean_text = [_topic_sanitize_word(word) for word in non_stopwords]
+#     if lemmatize:
+#         lemmatizer = WordNetLemmatizer()
+#         clean_text = [lemmatizer.lemmatize(word) for word in clean_text]
+#     if stem:
+#         ps =PorterStemmer()
+#         clean_text = [ps.stem(word) for word in clean_text]
+
+#     return ' '.join(clean_text).strip()
+
+
+# #SECTOR_LIST = list(DF_SP500.Sector.unique())
+# SECTOR_LIST = ['Industry',
+#                'Health',
+#                'Technology',
+#                'Communication',
+#                'Consumer Staples',
+#                'Consumer Discretionary',
+#                'Utilities',
+#                'Financials',
+#                'Materials',
+#                'Real Estate',
+#                'Energy']
+
+# SECTOR_TOPICS = []
+# for sector in SECTOR_LIST:
+#     topics, _ = bertopic_model.find_topics(_topic_clean_text(sector), top_n=5)
+#     SECTOR_TOPICS.append(topics)
+
+# def _topic2sector(pred_topics):
+#     out = []
+#     for pred_topic in pred_topics:
+#         relevant_sectors = []
+#         for i in range(len(SECTOR_LIST)):
+#             if pred_topic in SECTOR_TOPICS[i]:
+#                 relevant_sectors.append(list(DF_SP500.Sector.unique())[i])
+#         out.append(relevant_sectors)
+#     return out
+
+# def _inference_topic_match(text):
+#     out, _ = bertopic_model.transform([_topic_clean_text(t) for t in text])
+#     return out
 
 def get_company_sectors(extracted_names, threshold=0.95):
     '''
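The least obvious step in the commented-out _topic_sanitize_word is the NFD/NFC pass. As a standalone sketch (same regex as above, hypothetical helper name not in app.py), it strips accents while keeping 'ñ' intact:

    from unicodedata import normalize
    import re

    def strip_accents_keep_enye(text):  # hypothetical name for illustration
        # NFD decomposes accented characters into base char + combining mark;
        # the pattern drops combining marks except the tilde on 'n' ('ñ')
        text = re.sub(r"([^n\u0300-\u036f]|n(?!\u0303(?![\u0300-\u036f])))[\u0300-\u036f]+",
                      r"\1", normalize("NFD", text), flags=re.I)
        return normalize("NFC", text)  # recompose the remaining sequences

    print(strip_accents_keep_enye("críticas de cine en español"))
    # -> "criticas de cine en español"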
@@ -287,8 +287,8 @@ def inference(input_batch,isurl,use_archive,limit_companies=10):
     sentiment = _inference_sentiment_model_pipeline(input_batch_content )
     print("[i] Running NER using custom spancat inference...")
     ner_labels = _inference_ner_spancat(input_batch_content ,limit_outputs=limit_companies)
-    print("[i] BERTopic...")
-    topics = _inference_topic_match(input_batch_content)
+    # print("[i] BERTopic...")
+    # topics = _inference_topic_match(input_batch_content)
 
     df = pd.DataFrame(prob_outs,columns =['E','S','G'])
     if isurl:
@@ -297,7 +297,7 @@ def inference(input_batch,isurl,use_archive,limit_companies=10):
     df['content_id'] = range(1, len(input_batch_r)+1)
     df['sent_lbl'] = [d['label'] for d in sentiment ]
     df['sent_score'] = [d['score'] for d in sentiment ]
-    df['sector_pred'] = pd.DataFrame(_topic2sector(topics)).iloc[:, 0]
+    #df['sector_pred'] = pd.DataFrame(_topic2sector(topics)).iloc[:, 0]
     print("[i] Pandas output shape:",df.shape)
 
     #[[], [('Nvidia', 'Information Technology')], [('Twitter', 'Communication Services'), ('Apple', 'Information Technology')], [], [], [], [], [], []]
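With these two hunks applied, the output frame no longer carries a sector_pred column until the BERTopic path is restored. For reference, a sketch of what the disabled line computed, with invented values (real inputs come from _topic2sector(topics)):

    import pandas as pd

    # Hypothetical _topic2sector output: one list of matching sectors per document
    mapped = [['Information Technology'], [], ['Energy', 'Utilities']]
    # .iloc[:, 0] keeps only the first matching sector per document (NaN when none matched)
    sector_pred = pd.DataFrame(mapped).iloc[:, 0]
    print(sector_pred.tolist())  # ['Information Technology', nan, 'Energy']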