Spaces:
Sleeping
Sleeping
import streamlit as st | |
from transformers import pipeline, BigBirdTokenizerFast, AutoModelForTokenClassification | |
def _load_pii_classifier():
    """Build the BigBird token-classification pipeline used for PII detection.

    Loading the tokenizer and model weights is expensive, so callers should
    wrap this function with ``st.cache_resource`` rather than call it on
    every Streamlit rerun.
    """
    tokenizer = BigBirdTokenizerFast.from_pretrained("google/bigbird-roberta-base", block_size=2)
    model = AutoModelForTokenClassification.from_pretrained("vedantM/BigBird-PII")
    return pipeline(
        task="token-classification",
        model=model,
        aggregation_strategy="average",
        tokenizer=tokenizer,
    )


def pii_app():
    """Render the PII detection page.

    Shows a text area, runs the token-classification pipeline on the entered
    paragraph, lists every detected entity with its type, and renders the
    paragraph with the detected spans highlighted.
    """
    st.title('PII Data Detection')
    text_input = st.text_area('Enter a Paragraph below to get list of PII in your text.')

    # Streamlit re-executes the script on every interaction, including the
    # first page load when the text area is still empty. Skip model loading
    # and inference until there is something to analyse.
    if not text_input or not text_input.strip():
        return

    # st.cache_resource keeps one pipeline instance alive across reruns
    # instead of re-reading the model weights each time the text changes.
    big_bird_classifier = st.cache_resource(_load_pii_classifier)()
    output = big_bird_classifier(text_input)

    st.header('List of Entities:')
    for entity in output:
        st.write(f"Entity: {entity['word']}, Type: {entity['entity_group']}")

    highlighted_text = highlight_pii(text_input, output)
    st.header('\nPII Detected Output:')
    # highlight_pii returns HTML markup, hence unsafe_allow_html.
    st.markdown(highlighted_text, unsafe_allow_html=True)
def highlight_pii(text, entities):
    """Return *text* as HTML with every entity span wrapped in a highlight.

    Args:
        text: The original paragraph that was analysed.
        entities: Iterable of mappings, each with integer ``"start"`` and
            ``"end"`` character offsets into *text* (end exclusive).

    Returns:
        An HTML string in which each entity span is wrapped in
        ``<span style="background-color: blue">...</span>``. All text taken
        from *text* is HTML-escaped, because the caller renders the result
        with raw HTML enabled.
    """
    import html  # local import keeps this block self-contained

    open_tag = '<span style="background-color: blue">'
    close_tag = '</span>'

    pieces = []
    cursor = 0  # index in `text` up to which output has been emitted
    # Process spans left to right regardless of the order they arrive in.
    for entity in sorted(entities, key=lambda item: item["start"]):
        start_idx, end_idx = entity["start"], entity["end"]
        # Skip spans that overlap an already highlighted region or are
        # inverted; wrapping them would produce broken, interleaved markup.
        if start_idx < cursor or end_idx < start_idx:
            continue
        pieces.append(html.escape(text[cursor:start_idx], quote=False))
        pieces.append(
            open_tag + html.escape(text[start_idx:end_idx], quote=False) + close_tag
        )
        cursor = end_idx

    pieces.append(html.escape(text[cursor:], quote=False))
    return "".join(pieces)
# Script entry point: render the app when this file is executed directly.
if "__main__" == __name__:
    pii_app()