Spaces:

CyberPeace-Institute
/

SecureBERT-NER-Space

Sleeping

App Files Files Community

cpi-connect committed on Jun 23, 2023

Commit

dc4e268

1 Parent(s): 63cd75b

Update app.py

Browse files

Files changed (1) hide show

app.py +31 -10

app.py CHANGED Viewed

@@ -2,6 +2,7 @@ import streamlit as st
 from annotated_text import annotated_text
 from transformers import AutoModelForTokenClassification
 from transformers import AutoTokenizer
 import requests
 import random
 import justext
@@ -12,15 +13,6 @@ import jsonlines
 st.sidebar.markdown("Enter the URLs to be processed!")
-model_checkpoint = "../SecureBERT-finetuned-ner/"
-device = "cpu"
-tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)
-model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=40).to(device)
-with open("../ner_classes.pkl", "rb") as f:
-    ner_classes = pickle.load(f)
 query_input = st.text_input("URL:")
 if query_input:
     headers = {
@@ -40,4 +32,33 @@ if query_input:
         if not paragraph.is_boilerplate:
             text += paragraph.text + "\n"
-    text = text.split("\n")

 from annotated_text import annotated_text
 from transformers import AutoModelForTokenClassification
 from transformers import AutoTokenizer
+from transformers import pipeline
 import requests
 import random
 import justext
 st.sidebar.markdown("Enter the URLs to be processed!")
 query_input = st.text_input("URL:")
 if query_input:
     headers = {
         if not paragraph.is_boilerplate:
             text += paragraph.text + "\n"
+    text = text.split("\n")
+    text = [text_block for text_block in text if text_block != ""]
+    pipe = pipeline("token-classification", model="cpi-connect/SecureBERT-NER", grouped_entities=True)
+    for text_block in text:
+        entities = pipe(text_block)
+        annotated = []
+        last_entity, last_idx = None, None
+        for entity in entities:
+            if last_entity is None and last_idx is None:
+                annotated.append(text_block[:entity["start"]])
+                annotated.append((text_block[entity["start"] : entity["end"]], entity["entity_group"]))
+                last_entity = entity["entity_group"]
+                last_idx = entity["end"]
+            elif last_entity == entity["entity_group"] and last_idx == entity["start"]:
+                new_text = annotated[-1][0] + text_block[entity["start"] : entity["end"]]
+                label = annotated[-1][1]
+                annotated[-1] = (new_text, label)
+                last_entity = entity["entity_group"]
+                last_idx = entity["end"]
+            else:
+                annotated.append(text_block[last_idx : entity["start"]])
+                annotated.append((text_block[entity["start"] : entity["end"]], entity["entity_group"]))
+                last_entity = entity["entity_group"]
+                last_idx = entity["end"]
+        annotated.append(text_block[last_idx : ])
+        annotated_text(annotated)