import re from pathlib import Path import gradio as gr import matplotlib.pyplot as plt import pandas as pd import seaborn as sns import yaml import pteredactyl as pt from pteredactyl.defaults import change_model sample_text = """ 1. Dr. Huntington (Patient No: 1234567890) diagnosed Ms. Alzheimer with Alzheimer's disease during her last visit to the Huntington Medical Center on 12/12/2023. The prognosis was grim, but Dr. Huntington assured Ms. Alzheimer that the facility was well-equipped to handle her condition despite the lack of a cure for Alzheimer's. 2. Paget Brewster (Patient No: 0987654321), a 45-year-old woman, was recently diagnosed with Paget's disease of bone by her physician, Dr. Graves at St. Jenny's Hospital on 01/06/2026 Postcode: JE30 6YN. Paget's disease is a chronic di[PERSON]der that affects bone remodeling, leading to weakened and deformed bones. Brewster's case is not related to Grave's disease, an autoimmune disorder affecting the thyroid gland. 3. Crohn Marshall (Patient No: 943 476 5918), a 28-year-old man, has been battling Crohn's disease for the past five years. Crohn's disease is a type of inflammatory bowel disease (IBD) that causes inflammation of the digestive tract. Marshall's condition is managed by his gastroenterologist, Dr. Ulcerative Colitis, who specializes in treating IBD patients at the Royal Free Hospital. 4. Addison Montgomery (NHS No: 5566778899), a 32-year-old woman, was rushed to University College Hospital on 18/09/2023 after experiencing severe abdominal pain and fatigue. After a series of tests, Dr. Cushing diagnosed Montgomery with Addison's disease, a rare disorder of the adrenal glands. Postcode is NH77 9AF. Montgomery's condition is not related to Cushing's syndrome, which is caused by excessive cortisol production. 5. Lou Gehrig (NHS No: 943 476 5919), a renowned baseball player, was diagnosed with amyotrophic lateral sclerosis (ALS) in 1939 at the Mayo Clinic. ALS, also known as Lou Gehrig's disease, is a progressive neurodegenerative disorder that affects nerve cells in the brain and spinal cord. Gehrig's diagnosis was confirmed by his neurologist, Dr. Bell, who noted that the condition was not related to Bell's palsy, a temporary facial paralysis. 6. Parkinson Brown (Patient No No: 3344556677), a 62-year-old man, has been living with Parkinson's disease for the past decade. Parkinson's disease is a neurodegenerative disorder that affects movement and balance. Brown's condition is managed by his neurologist, Dr. Lewy Body, who noted that Brown's symptoms were not related to Lewy body dementia, another neurodegenerative disorder, at King's College Hospital. 7. Kaposi Sarcoma (Patient No: 9988776655), a 35-year-old man, was recently diagnosed with Kaposi's sarcoma, a type of cancer that develops from the cells that line lymph or blood vessels, at Guy's Hospital on 17/04/2023. Sarcoma's diagnosis was confirmed by his oncologist, Dr. Burkitt Lymphoma, who noted that the condition was not related to Burkitt's lymphoma, an aggressive form of non-Hodgkin's lymphoma. He died on 17/04/2023. 8. Dr. Kawasaki (Patient No No: 2233445566) treated young Henoch Schonlein for Henoch-Schönlein purpura, a rare disorder that causes inflammation of the blood vessels, at Great Ormond Street Hospital on 05/05/2024. Schonlein's case was not related to Kawasaki disease, a condition that primarily affects children and causes inflammation in the walls of medium-sized arteries. 9. Wilson Menkes (NHS No: 943 476 5916), a 42-year-old man, was diagnosed with Wilson's disease, a rare genetic disorder that causes copper to accumulate in the body. Menkes' diagnosis was confirmed by his geneticist, Dr. Niemann Pick, at Addenbrooke's Hospital on 02/02/2025, who noted that the condition was not related to Niemann-Pick disease, another rare genetic disorder that affects lipid storage. Postcode was GH75 3HF. 10. Dr. Marfan (Patient No No: 4455667788) treated Ms. Ehlers Danlos for Ehlers-Danlos syndrome, a group of inherited disorders that affect the connective tissues, at the Royal Brompton Hospital on 30/11/2024. Danlos' case was not related to Marfan syndrome, another genetic disorder that affects connective tissue development and leads to abnormalities in the bones, eyes, and cardiovascular system. Dr Jab's username is: jabba """ # Gold Standard Text reference_text = """ 1. [PERSON] (Patient No: [ID]) diagnosed [PERSON] with Alzheimer's disease during her last visit to the [LOCATION] on [DATE_TIME]. The prognosis was grim, but [PERSON] assured [PERSON] that the facility was well-equipped to handle her condition despite the lack of a cure for Alzheimer's. 2. [PERSON] (Patient No: [ID]), a 45-year-old woman, was recently diagnosed with Paget's disease of bone by her physician, [PERSON] at [LOCATION] on [DATE_TIME] Postcode: [POSTCODE]. Paget's disease is a chronic disorder that affects bone remodeling, leading to weakened and deformed bones. [PERSON]'s case is not related to Grave's disease, an autoimmune disorder affecting the thyroid gland. 3. [PERSON] ([LOCATION] No: [NHS_NUMBER]), a 28-year-old man, has been battling Crohn's disease for the past five years. Crohn's disease is a type of inflammatory bowel disease (IBD) that causes inflammation of the digestive tract. [PERSON]'s condition is managed by his gastroenterologist, [PERSON] Ulcerative Colitis, who specializes in treating IBD patients at the [LOCATION]. 4. [PERSON] (Patient No: [ID]), a 32-year-old woman, was rushed to [LOCATION] on [DATE_TIME] after experiencing severe abdominal pain and fatigue. After a series of tests, [PERSON] diagnosed [PERSON] with Addison's disease, a rare disorder of the adrenal glands. Postcode is [POSTCODE]. [PERSON]'s condition is not related to Cushing's syndrome, which is caused by excessive cortisol production. 5. [PERSON] ([LOCATION] No: [NHS_NUMBER]), a renowned baseball player, was diagnosed with amyotrophic lateral sclerosis (ALS) in 1939 at the [LOCATION]. ALS, also known as Lou Gehrig's disease, is a progressive neurodegenerative disorder that affects nerve cells in the brain and spinal cord. Gehrig's diagnosis was confirmed by his neurologist, [PERSON], who noted that the condition was not related to Bell's palsy, a temporary facial paralysis. 6. [PERSON] (Patient No: [ID]), a 62-year-old man, has been living with Parkinson's disease for the past decade. Parkinson's disease is a neurodegenerative disorder that affects movement and balance. [PERSON]'s condition is managed by his neurologist, [PERSON], who noted that [PERSON]'s symptoms were not related to Lewy body dementia, another neurodegenerative disorder, at [LOCATION]. 7. [PERSON] (Patient No: [ID]), a 35-year-old man, was recently diagnosed with Kaposi's sarcoma, a type of cancer that develops from the cells that line lymph or blood vessels, at [LOCATION] on [DATE_TIME]. Sarcoma's diagnosis was confirmed by his oncologist, [PERSON] Lymphoma, who noted that the condition was not related to Burkitt's lymphoma, an aggressive form of non-Hodgkin's lymphoma. He died on [DATE_TIME]. 8. [PERSON] (Patient No: [ID]) treated young Henoch Schonlein for Henoch-Schönlein purpura, a rare disorder that causes inflammation of the blood vessels, at [LOCATION] on [DATE_TIME]. [PERSON]'s case was not related to Kawasaki disease, a condition that primarily affects children and causes inflammation in the walls of medium-sized arteries. 9. [PERSON] ([LOCATION] No: [NHS_NUMBER]), a 42-year-old man, was diagnosed with Wilson's disease, a rare genetic disorder that causes copper to accumulate in the body. [PERSON]'s diagnosis was confirmed by his geneticist, [PERSON], at [LOCATION] on [DATE_TIME], who noted that the condition was not related to Niemann-Pick disease, another rare genetic disorder that affects lipid storage. Postcode was [POSTCODE]. 10. [PERSON] (Patient No: [ID]) treated [PERSON] for Ehlers-Danlos syndrome, a group of inherited disorders that affect the connective tissues, at the [LOCATION] on [DATE_TIME]. [PERSON]'s case was not related to Marfan syndrome, another genetic disorder that affects connective tissue development and leads to abnormalities in the bones, eyes, and cardiovascular system. Dr [PERSON]'s username is: [USERNAME] """ def redact(text: str, model_name: str): model_paths = { "Stanford Base De-Identifier": "StanfordAIMI/stanford-deidentifier-base", # "Stanford with Radiology and i2b2": "StanfordAIMI/stanford-deidentifier-with-radiology-reports-and-i2b2", "Deberta PII": "lakshyakh93/deberta_finetuned_pii", # "Gliner PII": "urchade/gliner_multi_pii-v1", # "Spacy PII": "beki/en_spacy_pii_distilbert", "Nikhilrk De-Identify": "nikhilrk/de-identify", } model_path = model_paths.get(model_name, "StanfordAIMI/stanford-deidentifier-base") if model_path: change_model(model_path) else: raise ValueError("No valid model path provided.") anonymized_text = pt.anonymise(text, model_path=model_path) # Pass model_path anonymized_text = anonymized_text.replace("<", "[").replace(">", "]") return anonymized_text def extract_tokens(text): tokens = re.findall(r"\[(.*?)\]", text) return tokens def compare_tokens(reference_tokens, redacted_tokens): tp = 0 fn = 0 fp = 0 reference_count = { token: reference_tokens.count(token) for token in set(reference_tokens) } redacted_count = { token: redacted_tokens.count(token) for token in set(redacted_tokens) } for token in reference_count: if token in redacted_count: tp += min(reference_count[token], redacted_count[token]) fn += max(reference_count[token] - redacted_count[token], 0) fp += max(redacted_count[token] - reference_count[token], 0) else: fn += reference_count[token] for token in redacted_count: if token not in reference_count: fp += redacted_count[token] return tp, fn, fp def calculate_true_negatives(total_tokens, total_entities, tp, fn, fp): tn = total_tokens - (total_entities + fp + fn + tp) return tn def count_entities_and_compute_metrics(reference_text: str, redacted_text: str): reference_tokens = extract_tokens(reference_text) redacted_tokens = extract_tokens(redacted_text) tp_count, fn_count, fp_count = compare_tokens(reference_tokens, redacted_tokens) total_tokens = len(reference_text.split()) total_entities = len(reference_tokens) tn_count = calculate_true_negatives( total_tokens, total_entities, tp_count, fn_count, fp_count ) return tp_count, fn_count, fp_count, tn_count def flag_errors(reference_text: str, redacted_text: str): fn_count = 0 fp_count = 0 reference_tokens = [ (match.group(1), match.start()) for match in re.finditer(r"\[(.*?)\]", reference_text) ] redacted_tokens = [ (match.group(1), match.start()) for match in re.finditer(r"\[(.*?)\]", redacted_text) ] reference_set = set(token for token, _ in reference_tokens) redacted_set = set(token for token, _ in redacted_tokens) flagged_reference_text = reference_text flagged_redacted_text = redacted_text for token, _ in reference_tokens: if token not in redacted_set: fn_count += 1 flagged_reference_text = flagged_reference_text.replace( f"[{token}]", f"[FALSE_NEGATIVE]{token}[/FALSE_NEGATIVE]" ) for token, _ in redacted_tokens: if token not in reference_set and token not in [ "FALSE_NEGATIVE", "/FALSE_NEGATIVE", ]: fp_count += 1 flagged_redacted_text = flagged_redacted_text.replace( f"[{token}]", f"[FALSE_POSITIVE]{token}[/FALSE_POSITIVE]" ) return flagged_reference_text, flagged_redacted_text, fn_count, fp_count def visualize_entities(redacted_text: str): colors = { "PERSON": "linear-gradient(90deg, #aa9cfc, #fc9ce7)", "ID": "linear-gradient(90deg, #ff9a9e, #fecfef)", "GPE": "linear-gradient(90deg, #fccb90, #d57eeb)", "NHS_NUMBER": "linear-gradient(90deg, #ff9a9e, #fecfef)", "DATE_TIME": "linear-gradient(90deg, #fddb92, #d1fdff)", "LOCATION": "linear-gradient(90deg, #a1c4fd, #c2e9fb)", "EVENT": "linear-gradient(90deg, #a6c0fe, #f68084)", "POSTCODE": "linear-gradient(90deg, #c2e59c, #64b3f4)", "USERNAME": "linear-gradient(90deg, #aa9cfc, #fc9ce7)", "FALSE_NEGATIVE": "linear-gradient(90deg, #ff6b6b, #ff9a9e)", # Red for false negatives "/FALSE_NEGATIVE": "linear-gradient(90deg, #ff6b6b, #ff9a9e)", # Red for false negatives "FALSE_POSITIVE": "linear-gradient(90deg, #ffcccb, #ff6666)", # Light red for false positives "/FALSE_POSITIVE": "linear-gradient(90deg, #ffcccb, #ff6666)", # Light red for false positives } token_colors = { "[PERSON]": "PERSON", "[LOCATION]": "LOCATION", "[ID]": "ID", "[NHS_NUMBER]": "NHS_NUMBER", "[DATE_TIME]": "DATE_TIME", "[EVENT]": "EVENT", "[POSTCODE]": "POSTCODE", "[USERNAME]": "USERNAME", "[FALSE_NEGATIVE]": "FALSE_NEGATIVE", "[/FALSE_NEGATIVE]": "/FALSE_NEGATIVE", "[FALSE_POSITIVE]": "FALSE_POSITIVE", "[/FALSE_POSITIVE]": "/FALSE_POSITIVE", } def wrap_token_in_html(text, token, color): parts = text.split(token) wrapped_token = f'{token}' return wrapped_token.join(parts) for token, color_class in token_colors.items(): redacted_text = wrap_token_in_html(redacted_text, token, colors[color_class]) return f'
Metric | Value |
---|---|
Accuracy: [(TP+TN) / (TP + FN + FP + TN)] | {accuracy:.2f} |
Precision: [TP / (TP + FP)] | {precision:.2f} |
Recall: [TP / (TP + FN)] | {recall:.2f} |
F1 Score: [2 * Precision * Recall / (Precision + Recall)] | {f1_score:.2f} |