"""Gradio app that samples documents from the PII datamix and highlights PII spans."""

import json
import os
import random
import subprocess
import sys

import gradio as gr
import spacy
from huggingface_hub import snapshot_download
from spacy import displacy
from spacy.tokens import Span

# download spacy model --
# Use the running interpreter (not whatever `python` is on PATH) and avoid a
# shell string. Best-effort: a failed download surfaces later in `spacy.load`.
subprocess.run(
    [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
    check=False,
)

# set up colors for PII types ---
# Earlier, more saturated palette, kept for reference:
# options = {
#     "colors": {
#         "NAME_STUDENT": "#7FDBFF",  # Soft blue
#         "EMAIL": "#008080",  # Dark cyan
#         "USERNAME": "#C3B1E1",  # Pastel violet
#         "ID_NUM": "#2ECC40",  # Medium green
#         "PHONE_NUM": "#FF851B",  # Deep orange
#         "URL_PERSONAL": "#4682B4",  # Steel blue
#         "STREET_ADDRESS": "#808000",  # Muted olive
#     }
# }

# displaCy render options: one highlight color per PII entity label.
options = {
    "colors": {
        "NAME_STUDENT": "#6EB5FF",  # Lighter blue
        "EMAIL": "#42D4B5",  # Light teal
        "USERNAME": "#D8B4E2",  # Light lavender
        "ID_NUM": "#7AE88F",  # Light green
        "PHONE_NUM": "#FFB87D",  # Light peach
        "URL_PERSONAL": "#C9B4E2",  # Pale purple
        "STREET_ADDRESS": "#B4B77F"  # Light olive
    }
}


# download datamix ---
def download_data():
    """Download the `rbiswasfc/pii-datamix` dataset snapshot into ``./data``."""
    snapshot_download(
        repo_id="rbiswasfc/pii-datamix",
        repo_type="dataset",
        local_dir="./data",
    )
    print("Data downloaded!")


download_data()

# load data ---
# Explicit encoding so loading does not depend on the platform's locale default.
with open("./data/datamix.json", encoding="utf-8") as f:
    data = json.load(f)

# Top-level keys of the datamix are offered as selectable subsets.
subsets = list(data.keys())

# PII choices are the colored labels plus a "Random" (no filter) option.
pii_types = list(options["colors"].keys())
pii_types.append("Random")

nlp = spacy.load("en_core_web_sm")


def _bio_spans(labels):
    """Convert a sequence of BIO tags into ``(start, end, label)`` token spans.

    ``end`` is exclusive. The label is taken from the opening ``B-`` tag with
    its two-character prefix removed. An ``I-`` tag that does not follow an
    open entity is ignored, and an ``I-`` tag always extends the open entity
    regardless of its own type (same behavior as the original inline loop).
    """
    spans = []
    in_entity = False
    start, end = 0, 0
    for index, label in enumerate(labels):
        if label.startswith("B-"):
            if in_entity:
                # A new entity begins right after another: close the previous one.
                spans.append((start, end, labels[start][2:]))
            start, end = index, index + 1
            in_entity = True
        elif label.startswith("I-") and in_entity:
            end = index + 1  # Continue the entity
        elif in_entity:
            # Any other tag ends the current entity.
            spans.append((start, end, labels[start][2:]))
            in_entity = False

    # Add the last entity if the labels end while still inside one.
    if in_entity:
        spans.append((start, end, labels[start][2:]))
    return spans


# render sample --
def render_sample(subset, pii_type):
    """Pick a random sample from ``subset`` and render its PII entities as HTML.

    Args:
        subset: Key into the loaded datamix selecting the pool of samples.
        pii_type: PII label the sample must contain, or ``"Random"`` for no
            filtering.

    Returns:
        A ``(info, html)`` pair: ``info`` is ``{'document': <sample document>}``
        and ``html`` is the displaCy entity markup for the sample.

    Raises:
        gr.Error: If the subset has no sample containing ``pii_type``. The
            original retried ``random.choice`` forever in that case, hanging
            the request.
    """
    candidates = data[subset]
    if pii_type != "Random":
        # Filter up front instead of rejection sampling: same uniform
        # distribution over matching samples, but guaranteed to terminate.
        candidates = [item for item in candidates if pii_type in item['piis']]
    if not candidates:
        raise gr.Error(
            f"No samples containing '{pii_type}' found in subset '{subset}'."
        )
    sample = random.choice(candidates)

    print("---" * 10)
    print(sample['document'])
    print("---" * 10)

    # render
    # Build the Doc from the dataset's own tokenization so label indices line
    # up with token indices.
    doc = spacy.tokens.Doc(
        nlp.vocab,
        words=sample['tokens'],
        spaces=sample['trailing_whitespace'],
    )

    doc.ents = [
        Span(doc, start, end, label=label)
        for start, end, label in _bio_spans(sample['labels'])
    ]

    output = displacy.render(doc, style="ent", jupyter=False, options=options)
    return {'document': sample['document']}, output


# app layout & callback ---
# with gr.Blocks(theme=gr.themes.Soft()) as demo:
with gr.Blocks() as demo:
    with gr.Row():
        subset_dropdown = gr.Dropdown(
            subsets,
            value=subsets[0],
            label="Subset",
            info="Select data subset..."
        )

        focus_pii = gr.Dropdown(
            pii_types,
            value="Random",
            label="PII Focus",
            info="Select a PII type to focus on..."
        )

    sample_btn = gr.Button("Sample")
    document_id_display = gr.JSON(label="Document ID")
    sample_display = gr.HTML(label="Example")

    # callback ---
    sample_btn.click(
        fn=render_sample,
        inputs=[subset_dropdown, focus_pii],
        outputs=[document_id_display, sample_display],
    )

# launch app ---
demo.launch()