cpi-connect committed on
Commit
dc4e268
·
1 Parent(s): 63cd75b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -10
app.py CHANGED
@@ -2,6 +2,7 @@ import streamlit as st
2
  from annotated_text import annotated_text
3
  from transformers import AutoModelForTokenClassification
4
  from transformers import AutoTokenizer
 
5
  import requests
6
  import random
7
  import justext
@@ -12,15 +13,6 @@ import jsonlines
12
 
13
  st.sidebar.markdown("Enter the URLs to be processed!")
14
 
15
- model_checkpoint = "../SecureBERT-finetuned-ner/"
16
- device = "cpu"
17
-
18
- tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)
19
- model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=40).to(device)
20
-
21
- with open("../ner_classes.pkl", "rb") as f:
22
- ner_classes = pickle.load(f)
23
-
24
  query_input = st.text_input("URL:")
25
  if query_input:
26
  headers = {
@@ -40,4 +32,33 @@ if query_input:
40
  if not paragraph.is_boilerplate:
41
  text += paragraph.text + "\n"
42
 
43
- text = text.split("\n")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  from annotated_text import annotated_text
3
  from transformers import AutoModelForTokenClassification
4
  from transformers import AutoTokenizer
5
+ from transformers import pipeline
6
  import requests
7
  import random
8
  import justext
 
13
 
14
  st.sidebar.markdown("Enter the URLs to be processed!")
15
 
 
 
 
 
 
 
 
 
 
16
  query_input = st.text_input("URL:")
17
  if query_input:
18
  headers = {
 
32
  if not paragraph.is_boilerplate:
33
  text += paragraph.text + "\n"
34
 
35
# Split the boilerplate-stripped page text into non-empty line blocks so the
# NER model sees one paragraph/line at a time.
text = [text_block for text_block in text.split("\n") if text_block != ""]

# Cybersecurity NER pipeline. aggregation_strategy="simple" is the supported
# replacement for the deprecated grouped_entities=True: it merges sub-word
# tokens into whole entity spans carrying "start"/"end"/"entity_group" keys.
pipe = pipeline(
    "token-classification",
    model="cpi-connect/SecureBERT-NER",
    aggregation_strategy="simple",
)
40
+ for text_block in text:
41
+ entities = pipe(text_block)
42
+ annotated = []
43
+
44
+ last_entity, last_idx = None, None
45
+ for entity in entities:
46
+ if last_entity is None and last_idx is None:
47
+ annotated.append(text_block[:entity["start"]])
48
+ annotated.append((text_block[entity["start"] : entity["end"]], entity["entity_group"]))
49
+ last_entity = entity["entity_group"]
50
+ last_idx = entity["end"]
51
+ elif last_entity == entity["entity_group"] and last_idx == entity["start"]:
52
+ new_text = annotated[-1][0] + text_block[entity["start"] : entity["end"]]
53
+ label = annotated[-1][1]
54
+ annotated[-1] = (new_text, label)
55
+ last_entity = entity["entity_group"]
56
+ last_idx = entity["end"]
57
+ else:
58
+ annotated.append(text_block[last_idx : entity["start"]])
59
+ annotated.append((text_block[entity["start"] : entity["end"]], entity["entity_group"]))
60
+ last_entity = entity["entity_group"]
61
+ last_idx = entity["end"]
62
+
63
+ annotated.append(text_block[last_idx : ])
64
+ annotated_text(annotated)