Spaces:

vedantM
/

PII-Data-Detection

Sleeping

Vedant Mahangade

removed unused imports

07067dd 7 months ago

1.61 kB

	import streamlit as st
	from transformers import pipeline, BigBirdTokenizerFast, AutoModelForTokenClassification


	@st.cache_resource
	def _load_classifier():
	    """Build the BigBird token-classification pipeline, cached by Streamlit across reruns.

	    Streamlit re-executes the whole script on every widget interaction;
	    without caching, the tokenizer and model would be reloaded each time.
	    """
	    # NOTE(review): block_size is normally a BigBird *model* config option;
	    # confirm it has any effect when passed to the tokenizer. Kept as-is.
	    tokenizer = BigBirdTokenizerFast.from_pretrained("google/bigbird-roberta-base", block_size=2)
	    model = AutoModelForTokenClassification.from_pretrained("vedantM/BigBird-PII")
	    return pipeline(
	        task="token-classification",
	        model=model,
	        aggregation_strategy="average",
	        tokenizer=tokenizer,
	    )


	def pii_app():
	    """Render the PII-detection page.

	    Shows a text area, runs the token-classification pipeline on the
	    entered paragraph, lists every detected entity with its type, and
	    renders the paragraph again with the detected spans highlighted.
	    Nothing is classified or rendered below the text area while the
	    input is blank.
	    """
	    st.title('PII Data Detection')
	    text_input = st.text_area('Enter a Paragraph below to get list of PII in your text.')

	    # Load (or fetch from cache) before the blank-input guard so the model
	    # is already warm by the time the user submits text.
	    big_bird_classifier = _load_classifier()

	    # Skip inference and the result sections when there is nothing to analyse
	    # (e.g. on first page load).
	    if not (text_input and text_input.strip()):
	        return

	    output = big_bird_classifier(text_input)

	    st.header('List of Entities:')
	    for entity in output:
	        st.write(f"Entity: {entity['word']}, Type: {entity['entity_group']}")

	    highlighted_text = highlight_pii(text_input, output)
	    st.header('\nPII Detected Output:')
	    # highlighted_text contains inline <span> markup, hence unsafe_allow_html.
	    st.markdown(highlighted_text, unsafe_allow_html=True)


	def highlight_pii(text: str, entities) -> str:
	    """Return *text* as HTML with every entity span wrapped in a highlight ``<span>``.

	    Args:
	        text: The raw paragraph that was classified.
	        entities: Iterable of mappings, each with integer ``"start"`` and
	            ``"end"`` character offsets into *text* (end exclusive). They
	            may arrive in any order.

	    Returns:
	        An HTML string. Text inside and outside the highlighted spans is
	        HTML-escaped, so markup typed by the user is displayed literally
	        instead of being interpreted when rendered with
	        ``unsafe_allow_html=True``.
	    """
	    # Function-scope import keeps this block self-contained.
	    import html

	    span_open = '<span style="background-color: blue">'
	    span_close = '</span>'

	    pieces = []
	    cursor = 0  # index in *text* up to which output has already been emitted
	    for entity in sorted(entities, key=lambda item: item["start"]):
	        # Clamp to the unprocessed region so overlapping or out-of-range
	        # spans can never duplicate text or break the markup.
	        start = max(entity["start"], cursor)
	        end = min(entity["end"], len(text))
	        if end <= start:
	            continue
	        pieces.append(html.escape(text[cursor:start], quote=False))
	        pieces.append(span_open + html.escape(text[start:end], quote=False) + span_close)
	        cursor = end

	    # Trailing text after the last entity (or the whole text if none matched).
	    pieces.append(html.escape(text[cursor:], quote=False))
	    return "".join(pieces)


	# Script entry point: build the page only when this file is run directly, not when it is imported.
	if __name__ == "__main__":
	pii_app()