import streamlit as st
from transformers import pipeline, BigBirdTokenizerFast, AutoModelForTokenClassification
def pii_app():
    st.title('PII Data Detection')
    text_input = st.text_area('Enter a paragraph below to get a list of PII in your text.')

    # Nothing to classify until the user enters some text.
    if not text_input:
        return

    # Load the tokenizer and the fine-tuned BigBird token-classification model.
    tokenizer = BigBirdTokenizerFast.from_pretrained("google/bigbird-roberta-base", block_size=2)
    model = AutoModelForTokenClassification.from_pretrained("vedantM/BigBird-PII")
    big_bird_classifier = pipeline(task="token-classification",
                                   model=model,
                                   aggregation_strategy="average",
                                   tokenizer=tokenizer)

    # Each result is a dict with 'word', 'entity_group', 'score', 'start', and 'end'.
    output = big_bird_classifier(text_input)

    st.header('List of Entities:')
    for entity in output:
        st.write(f"Entity: {entity['word']}, Type: {entity['entity_group']}")

    highlighted_text = highlight_pii(text_input, output)
    st.header('PII Detected Output:')
    st.markdown(highlighted_text, unsafe_allow_html=True)

def highlight_pii(text, entities):
    highlighted_text = text
    offset = 0
    for entity in entities:
        # Character offsets shift as highlight tags are inserted into the string.
        start_idx = entity["start"] + offset
        end_idx = entity["end"] + offset
        highlighted_text = (
            highlighted_text[:start_idx]
            + f'<mark>{highlighted_text[start_idx:end_idx]}</mark>'
            + highlighted_text[end_idx:]
        )
        offset += len('<mark></mark>')  # Adjust offset for the inserted HTML tags
    return highlighted_text

if __name__ == "__main__":
    pii_app()
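# To try the app locally (assuming the script is saved as, e.g., pii_app.py and
# streamlit, transformers, and a backend such as torch are installed):
#
#   streamlit run pii_app.py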