rdose committed
Commit 0b55d23 · 1 Parent(s): 2b1c9f1

Update app.py

Files changed (1):
  1. app.py +88 -88

app.py CHANGED
@@ -13,20 +13,20 @@ from transformers import pipeline
  import itertools
  import pandas as pd
 
- from bertopic import BERTopic
- from huggingface_hub import hf_hub_url, cached_download
+ # from bertopic import BERTopic
+ # from huggingface_hub import hf_hub_url, cached_download
 
- import nltk
- nltk.download('stopwords')
- nltk.download('wordnet')
- nltk.download('omw-1.4')
- from nltk.corpus import stopwords
- from nltk.stem import WordNetLemmatizer
- from nltk.stem import PorterStemmer
+ # import nltk
+ # nltk.download('stopwords')
+ # nltk.download('wordnet')
+ # nltk.download('omw-1.4')
+ # from nltk.corpus import stopwords
+ # from nltk.stem import WordNetLemmatizer
+ # from nltk.stem import PorterStemmer
 
- from unicodedata import normalize
+ # from unicodedata import normalize
 
- import re
+ # import re
 
 
  OUT_HEADERS = ['E','S','G']
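Note on the imports commented out above: newer releases of `huggingface_hub` deprecate (and eventually remove) the `cached_download` helper in favor of `hf_hub_download`. If the BERTopic path is ever revived, a minimal sketch of an equivalent model load, assuming the same repo and filename constants as in the block commented out below, could look like:

```python
from bertopic import BERTopic
from huggingface_hub import hf_hub_download  # replaces the hf_hub_url + cached_download pair

# Same repo/filename as the commented-out BERTOPIC_* constants below.
model_path = hf_hub_download(
    repo_id="oMateos2020/BERTopic-paraphrase-MiniLM-L3-v2-51topics-guided-model3",
    filename="BERTopic-paraphrase-MiniLM-L3-v2-51topics-guided-model3",
)
bertopic_model = BERTopic.load(model_path, embedding_model="paraphrase-MiniLM-L3-v2")
```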
@@ -37,80 +37,80 @@ MODEL_ONNX_FNAME = "ESG_classifier_batch.onnx"
  MODEL_SENTIMENT_ANALYSIS = "ProsusAI/finbert"
 
 
- BERTOPIC_REPO_ID = "oMateos2020/BERTopic-paraphrase-MiniLM-L3-v2-51topics-guided-model3"
- BERTOPIC_FILENAME = "BERTopic-paraphrase-MiniLM-L3-v2-51topics-guided-model3"
- bertopic_model = BERTopic.load(cached_download(hf_hub_url(BERTOPIC_REPO_ID, BERTOPIC_FILENAME)), embedding_model="paraphrase-MiniLM-L3-v2")
-
- def _topic_sanitize_word(text):
-     """First cleaning/normalization pass over the text using regular expressions"""
-     text = re.sub(r'@[\w_]+|#[\w_]+|https?://[\w_./]+', '', text)  # Remove mentions, hashtags and URLs; more of a tweet concern, but kept in case the web reviews contain any
-     text = re.sub(r'\S*@\S*\s?', '', text)  # Remove email addresses
-     text = re.sub(r'\((\d+)\)', '', text)  # Remove numbers in parentheses
-     text = re.sub(r'^\d+', '', text)  # Remove leading stray numbers
-     text = re.sub(r'\n', '', text)  # Remove line breaks
-     text = re.sub(r'\s+', ' ', text)  # Collapse extra whitespace
-     text = re.sub(r'[“”]', '', text)  # Remove curly quote characters
-     text = re.sub(r'[()]', '', text)  # Remove parentheses
-     text = re.sub(r'\.', '', text)  # Remove periods
-     text = re.sub(r'\,', '', text)  # Remove commas
-     text = re.sub(r'’s', '', text)  # Remove possessives
-     #text = re.sub(r'-+', '', text)  # Strip hyphens to join compound words (would normalize some cases, e.g. 'exmujer' and 'ex-mujer' both to 'exmujer')
-     text = re.sub(r'\.{3}', ' ', text)  # Replace ellipses
-     # This regex was added "by hand" after seeing it was needed for some examples
-     text = re.sub(r"([\.\?])", r"\1 ", text)  # Insert a space after periods and question marks
-     # -> NFD (Normalization Form Canonical Decomposition) and strip diacritics
-     text = re.sub(r"([^n\u0300-\u036f]|n(?!\u0303(?![\u0300-\u036f])))[\u0300-\u036f]+", r"\1",
-                   normalize("NFD", text), 0, re.I)  # Strip diacritics (accented character variants reduced to their plain form, except 'ñ')
-     # -> NFC (Normalization Form Canonical Composition)
-     text = normalize('NFC', text)
-
-     return text.lower().strip()
-
- def _topic_clean_text(text, lemmatize=True, stem=True):
-     words = text.split()
-     non_stopwords = [word for word in words if word not in stopwords.words('english')]
-     clean_text = [_topic_sanitize_word(word) for word in non_stopwords]
-     if lemmatize:
-         lemmatizer = WordNetLemmatizer()
-         clean_text = [lemmatizer.lemmatize(word) for word in clean_text]
-     if stem:
-         ps = PorterStemmer()
-         clean_text = [ps.stem(word) for word in clean_text]
-
-     return ' '.join(clean_text).strip()
-
-
- #SECTOR_LIST = list(DF_SP500.Sector.unique())
- SECTOR_LIST = ['Industry',
-                'Health',
-                'Technology',
-                'Communication',
-                'Consumer Staples',
-                'Consumer Discretionary',
-                'Utilities',
-                'Financials',
-                'Materials',
-                'Real Estate',
-                'Energy']
-
- SECTOR_TOPICS = []
- for sector in SECTOR_LIST:
-     topics, _ = bertopic_model.find_topics(_topic_clean_text(sector), top_n=5)
-     SECTOR_TOPICS.append(topics)
-
- def _topic2sector(pred_topics):
-     out = []
-     for pred_topic in pred_topics:
-         relevant_sectors = []
-         for i in range(len(SECTOR_LIST)):
-             if pred_topic in SECTOR_TOPICS[i]:
-                 relevant_sectors.append(list(DF_SP500.Sector.unique())[i])
-         out.append(relevant_sectors)
-     return out
-
- def _inference_topic_match(text):
-     out, _ = bertopic_model.transform([_topic_clean_text(t) for t in text])
-     return out
+ # BERTOPIC_REPO_ID = "oMateos2020/BERTopic-paraphrase-MiniLM-L3-v2-51topics-guided-model3"
+ # BERTOPIC_FILENAME = "BERTopic-paraphrase-MiniLM-L3-v2-51topics-guided-model3"
+ # bertopic_model = BERTopic.load(cached_download(hf_hub_url(BERTOPIC_REPO_ID, BERTOPIC_FILENAME)), embedding_model="paraphrase-MiniLM-L3-v2")
+
+ # def _topic_sanitize_word(text):
+ #     """First cleaning/normalization pass over the text using regular expressions"""
+ #     text = re.sub(r'@[\w_]+|#[\w_]+|https?://[\w_./]+', '', text)  # Remove mentions, hashtags and URLs; more of a tweet concern, but kept in case the web reviews contain any
+ #     text = re.sub(r'\S*@\S*\s?', '', text)  # Remove email addresses
+ #     text = re.sub(r'\((\d+)\)', '', text)  # Remove numbers in parentheses
+ #     text = re.sub(r'^\d+', '', text)  # Remove leading stray numbers
+ #     text = re.sub(r'\n', '', text)  # Remove line breaks
+ #     text = re.sub(r'\s+', ' ', text)  # Collapse extra whitespace
+ #     text = re.sub(r'[“”]', '', text)  # Remove curly quote characters
+ #     text = re.sub(r'[()]', '', text)  # Remove parentheses
+ #     text = re.sub(r'\.', '', text)  # Remove periods
+ #     text = re.sub(r'\,', '', text)  # Remove commas
+ #     text = re.sub(r'’s', '', text)  # Remove possessives
+ #     #text = re.sub(r'-+', '', text)  # Strip hyphens to join compound words (would normalize some cases, e.g. 'exmujer' and 'ex-mujer' both to 'exmujer')
+ #     text = re.sub(r'\.{3}', ' ', text)  # Replace ellipses
+ #     # This regex was added "by hand" after seeing it was needed for some examples
+ #     text = re.sub(r"([\.\?])", r"\1 ", text)  # Insert a space after periods and question marks
+ #     # -> NFD (Normalization Form Canonical Decomposition) and strip diacritics
+ #     text = re.sub(r"([^n\u0300-\u036f]|n(?!\u0303(?![\u0300-\u036f])))[\u0300-\u036f]+", r"\1",
+ #                   normalize("NFD", text), 0, re.I)  # Strip diacritics (accented character variants reduced to their plain form, except 'ñ')
+ #     # -> NFC (Normalization Form Canonical Composition)
+ #     text = normalize('NFC', text)
+
+ #     return text.lower().strip()
+
+ # def _topic_clean_text(text, lemmatize=True, stem=True):
+ #     words = text.split()
+ #     non_stopwords = [word for word in words if word not in stopwords.words('english')]
+ #     clean_text = [_topic_sanitize_word(word) for word in non_stopwords]
+ #     if lemmatize:
+ #         lemmatizer = WordNetLemmatizer()
+ #         clean_text = [lemmatizer.lemmatize(word) for word in clean_text]
+ #     if stem:
+ #         ps = PorterStemmer()
+ #         clean_text = [ps.stem(word) for word in clean_text]
+
+ #     return ' '.join(clean_text).strip()
+
+
+ # #SECTOR_LIST = list(DF_SP500.Sector.unique())
+ # SECTOR_LIST = ['Industry',
+ #                'Health',
+ #                'Technology',
+ #                'Communication',
+ #                'Consumer Staples',
+ #                'Consumer Discretionary',
+ #                'Utilities',
+ #                'Financials',
+ #                'Materials',
+ #                'Real Estate',
+ #                'Energy']
+
+ # SECTOR_TOPICS = []
+ # for sector in SECTOR_LIST:
+ #     topics, _ = bertopic_model.find_topics(_topic_clean_text(sector), top_n=5)
+ #     SECTOR_TOPICS.append(topics)
+
+ # def _topic2sector(pred_topics):
+ #     out = []
+ #     for pred_topic in pred_topics:
+ #         relevant_sectors = []
+ #         for i in range(len(SECTOR_LIST)):
+ #             if pred_topic in SECTOR_TOPICS[i]:
+ #                 relevant_sectors.append(list(DF_SP500.Sector.unique())[i])
+ #         out.append(relevant_sectors)
+ #     return out
+
+ # def _inference_topic_match(text):
+ #     out, _ = bertopic_model.transform([_topic_clean_text(t) for t in text])
+ #     return out
 
  def get_company_sectors(extracted_names, threshold=0.95):
      '''
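The least obvious step in `_topic_sanitize_word` above is the NFD/NFC round-trip used to strip accents while keeping 'ñ'. A minimal standalone sketch of the same idiom (the example string is illustrative, not from the app):

```python
import re
from unicodedata import normalize

def strip_diacritics(text):
    # NFD decomposes accented characters into base char + combining marks
    # (U+0300..U+036F). Drop the marks, except the tilde that forms 'ñ',
    # then NFC recomposes whatever survived.
    decomposed = normalize("NFD", text)
    stripped = re.sub(
        r"([^n\u0300-\u036f]|n(?!\u0303(?![\u0300-\u036f])))[\u0300-\u036f]+",
        r"\1", decomposed, 0, re.I)
    return normalize("NFC", stripped)

print(strip_diacritics("crítica económica de mañana"))
# -> "critica economica de mañana"  ('í'/'ó' lose accents, 'ñ' survives)
```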
@@ -287,8 +287,8 @@ def inference(input_batch,isurl,use_archive,limit_companies=10):
      sentiment = _inference_sentiment_model_pipeline(input_batch_content)
      print("[i] Running NER using custom spancat inference...")
      ner_labels = _inference_ner_spancat(input_batch_content, limit_outputs=limit_companies)
-     print("[i] BERTopic...")
-     topics = _inference_topic_match(input_batch_content)
+     # print("[i] BERTopic...")
+     # topics = _inference_topic_match(input_batch_content)
 
      df = pd.DataFrame(prob_outs, columns=['E','S','G'])
      if isurl:
@@ -297,7 +297,7 @@ def inference(input_batch,isurl,use_archive,limit_companies=10):
      df['content_id'] = range(1, len(input_batch_r)+1)
      df['sent_lbl'] = [d['label'] for d in sentiment]
      df['sent_score'] = [d['score'] for d in sentiment]
-     df['sector_pred'] = pd.DataFrame(_topic2sector(topics)).iloc[:, 0]
+     # df['sector_pred'] = pd.DataFrame(_topic2sector(topics)).iloc[:, 0]
      print("[i] Pandas output shape:", df.shape)
 
      #[[], [('Nvidia', 'Information Technology')], [('Twitter', 'Communication Services'), ('Apple', 'Information Technology')], [], [], [], [], [], []]
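For context on the `df['sector_pred']` line commented out in the last hunk: `_topic2sector` returns one possibly-empty list of matching sectors per document, and `pd.DataFrame(...).iloc[:, 0]` keeps only the first match per row. A small illustration with made-up sector lists:

```python
import pandas as pd

# Hypothetical _topic2sector output for three documents.
topic_sectors = [[], ['Information Technology'], ['Communication Services', 'Utilities']]

# Ragged lists become None/NaN-padded columns; .iloc[:, 0] keeps the first match.
sector_pred = pd.DataFrame(topic_sectors).iloc[:, 0]
print(sector_pred.tolist())
# -> [None, 'Information Technology', 'Communication Services']
```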
 