Spaces:
Sleeping
Sleeping
File size: 1,608 Bytes
6a804e1 c6e6478 6a804e1 c6e6478 07067dd c6e6478 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 |
import html

import streamlit as st
from transformers import pipeline, BigBirdTokenizerFast, AutoModelForTokenClassification
def _load_pii_classifier():
    """Build the BigBird token-classification pipeline used to detect PII.

    Returns:
        A ``transformers`` token-classification pipeline that groups tokens
        into entities using the "average" aggregation strategy.
    """
    tokenizer = BigBirdTokenizerFast.from_pretrained("google/bigbird-roberta-base", block_size=2)
    model = AutoModelForTokenClassification.from_pretrained("vedantM/BigBird-PII")
    return pipeline(
        task="token-classification",
        model=model,
        aggregation_strategy="average",
        tokenizer=tokenizer,
    )


def pii_app():
    """Render the PII detection page.

    Shows a text area, runs the PII classifier on the entered paragraph,
    lists every detected entity with its type, and finally renders the
    paragraph with the detected spans highlighted.
    """
    st.title('PII Data Detection')
    text_input = st.text_area('Enter a Paragraph below to get list of PII in your text.')

    # Nothing to analyse yet (first render, or the box was cleared): skip the
    # model entirely instead of running inference on an empty string.
    if not text_input.strip():
        return

    # Streamlit re-executes the script on every interaction; cache_resource
    # keeps the loaded pipeline in memory so the tokenizer and model are not
    # rebuilt on each rerun.
    big_bird_classifier = st.cache_resource(_load_pii_classifier)()
    output = big_bird_classifier(text_input)

    st.header('List of Entities:')
    for entity in output:
        st.write(f"Entity: {entity['word']}, Type: {entity['entity_group']}")

    highlighted_text = highlight_pii(text_input, output)
    st.header('\nPII Detected Output:')
    # unsafe_allow_html is required so the <span> highlight markup is rendered.
    st.markdown(highlighted_text, unsafe_allow_html=True)
def highlight_pii(text, entities):
    """Return *text* as HTML with every detected entity wrapped in a highlight span.

    Args:
        text: The raw paragraph the entities were detected in.
        entities: Iterable of dicts carrying ``"start"`` and ``"end"``
            character offsets into *text*. Entries whose offsets are missing
            or ``None`` are ignored.

    Returns:
        An HTML string in which each entity span is wrapped in
        ``<span style="background-color: blue">...</span>``. All text taken
        from *text* is HTML-escaped (``&``, ``<``, ``>``) because the result
        is meant to be rendered as raw HTML.
    """
    open_tag = '<span style="background-color: blue">'
    close_tag = '</span>'

    # Sort by position so the result does not depend on the order in which
    # the entities were supplied.
    spans = sorted(
        (entity["start"], entity["end"])
        for entity in entities
        if entity.get("start") is not None and entity.get("end") is not None
    )

    parts = []
    cursor = 0  # index in `text` up to which output has already been emitted
    for start, end in spans:
        # Clamp overlapping spans to the part not yet emitted so tags never
        # nest or cut through each other.
        start = max(start, cursor)
        if end <= start:
            continue
        parts.append(html.escape(text[cursor:start], quote=False))
        parts.append(open_tag + html.escape(text[start:end], quote=False) + close_tag)
        cursor = end
    parts.append(html.escape(text[cursor:], quote=False))
    return "".join(parts)
# Entry point: render the app only when this file is run as the main script,
# not when it is imported as a module.
if __name__ == "__main__":
    pii_app()
|