Update app.py

app.py CHANGED
@@ -17,6 +17,8 @@ MODEL_ONNX_FNAME = "ESG_classifier.onnx"
 MODEL_SENTIMENT_ANALYSIS = "ProsusAI/finbert"
 MODEL_SUMMARY_PEGASUS = "oMateos2020/pegasus-newsroom-cnn_full-adafactor-bs6"
 
+
+
 #API_HF_SENTIMENT_URL = "https://api-inference.huggingface.co/models/cardiffnlp/twitter-roberta-base-sentiment"
 
 def _inference_ner_spancat(text, summary, penalty=0.5, normalise=True, limit_outputs=10):
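For context, the checkpoints named by these constants are Hugging Face model IDs. The lines that actually construct the pipelines fall outside this hunk, so the following is only a sketch of how such constants are typically consumed with the transformers pipeline API, not the code of this commit:

```python
# Sketch only: the actual pipeline construction in app.py is outside this hunk.
from transformers import pipeline

sentiment_pipe = pipeline("sentiment-analysis", model="ProsusAI/finbert")
summary_pipe = pipeline("summarization",
                        model="oMateos2020/pegasus-newsroom-cnn_full-adafactor-bs6")

print(sentiment_pipe(["Shares fell sharply after the emissions scandal."]))
# illustrative output: [{'label': 'negative', 'score': 0.9...}]
```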
@@ -51,32 +53,24 @@ def _inference_sentiment_model_pipeline(text):
 # response = requests.post(API_HF_SENTIMENT_URL , headers={"Authorization": os.environ['hf_api_token']}, json=payload)
 # return response.json()
 
-def
-                #and not token.like_num
-                and not token.pos_ == "CONJ"):
-
-                list_word.append(token.lemma_)
-
-        return convert_listwords_text(list_words=list_word)
-    else:
-        return -1
+def _lematise_text(text):
+    nlp = spacy.load("en_core_web_sm", disable=['ner'])
+    text_out = []
+    for doc in nlp.pipe(text): #see https://spacy.io/models#design
+        new_text = ""
+        for token in doc:
+            if (not token.is_punct
+                and not token.is_stop
+                and not token.like_url
+                and not token.is_space
+                and not token.like_email
+                #and not token.like_num
+                and not token.pos_ == "CONJ"):
+
+                new_text = new_text + " " + token.lemma_
+
+        text_out.append( new_text )
+    return text_out
 
 def sigmoid(x):
     return 1 / (1 + np.exp(-x))
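The rewritten `_lematise_text` now takes a list of texts and returns a list of lemmatised strings, which is what lets `_inference_classifier` pass a whole batch through it (the removed version apparently processed one text and returned -1 on failure). A hypothetical usage check, assuming the `en_core_web_sm` model is installed:

```python
# Hypothetical usage; requires: python -m spacy download en_core_web_sm
texts = ["The companies are reducing their carbon emissions."]
print(_lematise_text(texts))
# plausible output: [' company reduce carbon emission']
# (punctuation, stop words and conjunctions are filtered out; the leading
# space comes from the  " " + token.lemma_  concatenation)
```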
@@ -103,7 +97,7 @@ def is_in_archive(url):
 
 def _inference_classifier(text):
     tokenizer = AutoTokenizer.from_pretrained(MODEL_TRANSFORMER_BASED)
-    inputs = tokenizer(
+    inputs = tokenizer(_lematise_text(text), return_tensors="np", padding="max_length", truncation=True) #this assumes head-only!
     ort_session = onnxruntime.InferenceSession(MODEL_ONNX_FNAME)
     onnx_model = onnx.load(MODEL_ONNX_FNAME)
     onnx.checker.check_model(onnx_model)
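The new tokenizer call hands lemmatised text to the ONNX session as NumPy arrays (`return_tensors="np"`). The lines that actually produce `ort_outs` are outside this hunk, so the feed construction below is an assumption; it is a minimal sketch of how an onnxruntime session is typically run with tokenizer output:

```python
# Sketch: running the exported classifier. Input names are read from the
# graph itself, since this diff does not show the actual feed construction.
import numpy as np
import onnxruntime

def run_onnx_classifier(inputs, model_path="ESG_classifier.onnx"):
    session = onnxruntime.InferenceSession(model_path)
    # Feed only the tensors the exported graph declares as inputs.
    feed = {arg.name: np.asarray(inputs[arg.name]) for arg in session.get_inputs()}
    ort_outs = session.run(None, feed)  # list of output arrays
    return ort_outs
```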
@@ -113,20 +107,27 @@ def _inference_classifier(text):
 
     return sigmoid(ort_outs[0])[0]
 
-def inference(
+def inference(input_batch,isurl,use_archive,limit_companies=10):
+    input_batch_content = []
+    if isurl:
+        for url in input_batch:
+            if use_archive:
+                archive = is_in_archive(url)
+                if archive['archived']:
+                    url = archive['url']
+            #Extract the data from url
+            extracted = Extractor().extract(requests.get(url).text)
+            input_batch_content.append(extracted['content'])
+    else:
+        input_batch_content = input_batch
+
+    prob_outs = _inference_classifier(input_batch_content)
 #sentiment = _inference_sentiment_model_via_api_query({"inputs": extracted['content']})
-    sentiment = _inference_sentiment_model_pipeline(
-    summary = _inference_summary_model_pipeline(
-    ner_labels = _inference_ner_spancat(
+    #sentiment = _inference_sentiment_model_pipeline(input_batch_content )[0]
+    #summary = _inference_summary_model_pipeline(input_batch_content )[0]['generated_text']
+    #ner_labels = _inference_ner_spancat(input_batch_content ,summary, penalty = 0.8, limit_outputs=limit_companies)
 
-    return ner_labels, {'E':float(prob_outs[0]),"S":float(prob_outs[1]),"G":float(prob_outs[2])},{sentiment['label']:float(sentiment['score'])},"**Summary:**\n\n" + summary
+    return prob_outs #ner_labels, {'E':float(prob_outs[0]),"S":float(prob_outs[1]),"G":float(prob_outs[2])},{sentiment['label']:float(sentiment['score'])},"**Summary:**\n\n" + summary
 
 title = "ESG API Demo"
 description = """This is a demonstration of the full ESG pipeline backend where given a URL (english, news) the news contents are extracted, using extractnet, and fed to three models:
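`is_in_archive` is defined earlier in app.py and not shown in this diff. From the way the new `inference` consumes it (an `archived` flag plus a snapshot `url`), a plausible sketch using the public Wayback Machine availability endpoint would be:

```python
# Hypothetical reconstruction; the real is_in_archive is outside this diff.
import requests

def is_in_archive_sketch(url):
    resp = requests.get("https://archive.org/wayback/available",
                        params={"url": url})
    closest = resp.json().get("archived_snapshots", {}).get("closest", {})
    return {"archived": closest.get("available", False),
            "url": closest.get("url", url)}
```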
@@ -141,14 +142,25 @@ API input parameters:
 - `limit_companies`: integer. Number of found relevant companies to report.
 
 """
-examples = [['https://www.bbc.com/news/uk-62732447',False,5],
-            ['https://www.bbc.com/news/business-62747401',False,5],
-            ['https://www.bbc.com/news/technology-62744858',False,5],
-            ['https://www.bbc.com/news/science-environment-62758811',False,5],
-            ['https://www.theguardian.com/business/2022/sep/02/nord-stream-1-gazprom-announces-indefinite-shutdown-of-pipeline',False,5],
-            ['https://www.bbc.com/news/world-europe-62766867',False,5],
-            ['https://www.bbc.com/news/business-62524031',False,5],
-            ['https://www.bbc.com/news/business-62728621',False,5],
-            ['https://www.bbc.com/news/science-environment-62680423',False,5]]
-demo = gr.Interface(fn=inference,
+#examples = [['https://www.bbc.com/news/uk-62732447',False,5],
+#            ['https://www.bbc.com/news/business-62747401',False,5],
+#            ['https://www.bbc.com/news/technology-62744858',False,5],
+#            ['https://www.bbc.com/news/science-environment-62758811',False,5],
+#            ['https://www.theguardian.com/business/2022/sep/02/nord-stream-1-gazprom-announces-indefinite-shutdown-of-pipeline',False,5],
+#            ['https://www.bbc.com/news/world-europe-62766867',False,5],
+#            ['https://www.bbc.com/news/business-62524031',False,5],
+#            ['https://www.bbc.com/news/business-62728621',False,5],
+#            ['https://www.bbc.com/news/science-environment-62680423',False,5]]
+demo = gr.Interface(fn=inference,
+                    inputs=[gr.Dataframe(label='input batch', col_count=1, datatype='str', type='array', wrap=True),
+                            gr.Dropdown(label='data type', choices=['text','url'], type='index'),
+                            gr.Checkbox(label='if url parse cached in archive.org'),
+                            gr.Slider(minimum=1, maximum=10, step=1, label='Limit NER output')],
+                    outputs=[gr.Dataframe(label='output raw', col_count=1, datatype='number', type='array', wrap=True)],
+                            #gr.Label(label='Company'),
+                            #gr.Label(label='ESG'),
+                            #gr.Label(label='Sentiment'),
+                            #gr.Markdown()],
+                    title=title,
+                    description=description)#, examples=examples)
 demo.launch()
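With the examples disabled and the structured outputs commented out, the interface now returns only the raw classifier probabilities. A hypothetical smoke test of the new batch signature, calling `inference` directly rather than through Gradio (note the Dropdown uses `type='index'`, so through the UI `isurl` arrives as 0 for 'text' or 1 for 'url'):

```python
# Hypothetical call; inference() is the function added in this commit.
articles = ["https://www.bbc.com/news/science-environment-62758811",
            "https://www.bbc.com/news/business-62524031"]
probs = inference(articles, isurl=1, use_archive=False, limit_companies=5)
print(probs)  # expected: E/S/G sigmoid scores for the batch
```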