File size: 6,886 Bytes
893ed02
 
 
 
6e8dad4
 
c069834
95a0a03
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6e8dad4
95a0a03
 
6e8dad4
95a0a03
 
 
 
 
6e8dad4
 
95a0a03
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import streamlit as st
import pandas as pd
import numpy as np

# Seed NumPy's global random state so the np.random.* draws made by the
# generator functions below are reproducible from one script run to the next.
# NOTE(review): if this script is re-executed on every UI interaction (the
# usual Streamlit model), the seed is reset each time, so identical slider
# values will always produce identical tables — confirm that is intended.
np.random.seed(42)

# Function to generate synthetic Enrollments
def generate_enrollments(num_members):
    """Generate a synthetic Enrollments table with one row per member.

    All random columns are drawn independently from NumPy's global random
    state, so seed it with ``np.random.seed`` for reproducible output.

    Args:
        num_members: Number of rows to generate. Members are keyed
            ``PPK_00001`` .. ``PPK_<num_members>`` (zero-padded to 5 digits).

    Returns:
        pandas.DataFrame with ``num_members`` rows. ``MEMBER_ID`` and
        ``PRIMARY_PERSON_KEY`` carry the same key, and ``YEARMO`` is a valid
        ``YYYYMM`` integer between 202201 and 202412 inclusive.
    """
    primary_keys = [f"PPK_{i + 1:05d}" for i in range(num_members)]
    enrollments_data = {
        "MEM_AGE": np.random.randint(18, 80, num_members),
        "MEM_MSA_NAME": np.random.choice(["DETROIT", "HONOLULU", "LOS ANGELES"], num_members),
        "MEM_STAT": np.random.choice(["ACTIVE", "INACTIVE"], num_members),
        "MEMBER_ID": primary_keys,
        "PRIMARY_PERSON_KEY": primary_keys,
        "PAYER_LOB": np.random.choice(["MEDICAID", "COMMERCIAL", "MEDICARE"], num_members),
        "PAYER_TYPE": np.random.choice(["PPO", "HMO"], num_members),
        "PRIMARY_CHRONIC_CONDITION_ROLLUP_DESC": np.random.choice(["Cancer", "Diabetes", "Hypertension"], num_members),
        "Count of PRIMARY_CHRONIC_CONDITION_ROLLUP_ID": np.random.randint(1, 5, num_members),
        "PROD_TYPE": np.random.choice(["DENTAL", "VISION", "MEDICAL"], num_members),
        "RELATION": np.random.choice(["SUBSCRIBER", "DEPENDENT"], num_members),
        # Build YYYYMM from a year and a month drawn separately. Sampling the
        # raw integer range 202201..202412 would mostly produce impossible
        # months (e.g. 202257) and, because randint's upper bound is
        # exclusive, could never produce December 2024.
        "YEARMO": (
            np.random.randint(2022, 2025, num_members) * 100
            + np.random.randint(1, 13, num_members)
        ),
    }
    return pd.DataFrame(enrollments_data)

# Function to generate synthetic Members
def generate_members(num_members):
    """Generate a synthetic Members (demographics) table.

    Random values come from NumPy's global random state; seed it with
    ``np.random.seed`` for reproducible output.

    Args:
        num_members: Number of rows to generate. Members are keyed
            ``PPK_00001`` .. ``PPK_<num_members>`` (zero-padded to 5 digits).

    Returns:
        pandas.DataFrame with ``num_members`` rows. ``MEM_ETHNICITY`` and
        ``MEM_RACE`` may contain ``None``; ``MEM_GENDER`` is always ``"F"``;
        ``MEM_STATE`` is always the state of the row's ``MEM_MSA_NAME``.
    """
    primary_keys = [f"PPK_{i + 1:05d}" for i in range(num_members)]

    # Draw one location index per member and use it for both the metro area
    # and the state, so a row can never pair e.g. DETROIT with HI.
    msa_names = np.array(["DETROIT", "HONOLULU", "LOS ANGELES"])
    msa_states = np.array(["MI", "HI", "CA"])
    location_idx = np.random.randint(0, len(msa_names), num_members)

    members_data = {
        "MEM_ETHNICITY": np.random.choice(["Hispanic", "Non-Hispanic", None], num_members),
        "MEM_GENDER": ["F"] * num_members,
        "MEM_MSA_NAME": msa_names[location_idx],
        "MEM_RACE": np.random.choice(["White", "Black", "Asian", None], num_members),
        "MEM_STATE": msa_states[location_idx],
        "MEM_ZIP3": np.random.randint(100, 999, num_members),
        "MEMBER_ID": primary_keys,
        "PRIMARY_PERSON_KEY": primary_keys,
    }
    return pd.DataFrame(members_data)

# Function to generate synthetic Providers
def generate_providers(num_providers):
    """Build a synthetic Providers table.

    Values are drawn from NumPy's global random state, in the same order as
    the output columns, so a seeded call is reproducible.

    Args:
        num_providers: Number of provider rows to create. Providers are keyed
            ``PK_00001`` .. ``PK_<num_providers>`` (zero-padded to 5 digits).

    Returns:
        pandas.DataFrame with ``num_providers`` rows and six columns.
    """
    size = num_providers

    clinic_states = np.random.choice(["MI", "HI", "CA"], size)
    clinic_zips = np.random.randint(10000, 99999, size)
    npi_org_sums = np.random.randint(1, 50, size)
    taxonomies = np.random.choice(["208100000X", "207RE0101X"], size)
    provider_types = np.random.choice(["Type1", "Type2"], size)

    # Sequential keys starting at 1.
    provider_keys = [f"PK_{number:05d}" for number in range(1, size + 1)]

    columns = {}
    columns["PROV_CLINIC_STATE"] = clinic_states
    columns["PROV_CLINIC_ZIP"] = clinic_zips
    columns["PROV_KEY"] = provider_keys
    columns["Sum of PROV_NPI_ORG"] = npi_org_sums
    columns["PROV_TAXONOMY"] = taxonomies
    columns["PROV_TYPE"] = provider_types
    return pd.DataFrame(columns)

# Function to generate synthetic Services
def generate_services(num_services, primary_keys, provider_keys=None):
    """Generate a synthetic Services (claims) table.

    Random values come from NumPy's global random state; seed it with
    ``np.random.seed`` for reproducible output.

    Args:
        num_services: Number of service rows to generate.
        primary_keys: Member keys to sample ``PRIMARY_PERSON_KEY`` from.
            Must be non-empty when ``num_services`` is positive.
        provider_keys: Optional provider keys to sample ``ATT_PROV_KEY`` and
            ``BILL_PROV_KEY`` from. Pass the ``PROV_KEY`` values of the
            providers table so every service references a real provider.
            When omitted, keys ``PK_00001`` .. ``PK_<len(primary_keys)>`` are
            used, which keeps the previous call signature working.

    Returns:
        pandas.DataFrame with ``num_services`` rows; ``YEARMO`` is a valid
        ``YYYYMM`` integer between 202201 and 202412 inclusive.
    """
    if provider_keys is None:
        # Backward-compatible fallback sized by the member list. The range
        # starts at 1 so PK_00001 can be drawn, and a single-member list no
        # longer makes the random draw fail with an empty interval.
        provider_pool = [f"PK_{i + 1:05d}" for i in range(len(primary_keys))]
    else:
        provider_pool = list(provider_keys)

    services_data = {
        "PRIMARY_PERSON_KEY": np.random.choice(primary_keys, num_services),
        "Sum of AMT_ALLOWED": np.random.uniform(1000, 10000, num_services),
        "Sum of AMT_BILLED": np.random.uniform(1000, 15000, num_services),
        "Count of AMT_PAID": np.random.randint(1, 5, num_services),
        "ATT_PROV_KEY": np.random.choice(provider_pool, num_services),
        "BILL_PROV_KEY": np.random.choice(provider_pool, num_services),
        "CLAIM_IN_NETWORK": np.random.choice(["Y", "N", None], num_services),
        "RELATION": np.random.choice(["SUBSCRIBER", "DEPENDENT"], num_services),
        "SERVICE_SETTING": np.random.choice(["OUTPATIENT", "INPATIENT"], num_services),
        "Sum of SERVICE_LINE": np.random.randint(1, 10, num_services),
        "Sum of SV_UNITS": np.random.randint(1, 100, num_services),
        # Year and month are drawn separately so the result is always a real
        # calendar month; sampling the raw range 202201..202412 would mostly
        # yield impossible months such as 202257.
        "YEARMO": (
            np.random.randint(2022, 2025, num_services) * 100
            + np.random.randint(1, 13, num_services)
        ),
    }
    return pd.DataFrame(services_data)

# Function to generate synthetic BreastCancer data
def generate_breast_cancer_data(num_patients):
    """Generate a synthetic breast-cancer clinical table.

    Random values come from NumPy's global random state; seed it with
    ``np.random.seed`` for reproducible output.

    Args:
        num_patients: Number of patient rows to generate. Patients are keyed
            ``PPK_00001`` .. ``PPK_<num_patients>`` (zero-padded to 5 digits).

    Returns:
        pandas.DataFrame with ``num_patients`` rows. ``TNBC Status`` is
        ``"Positive"`` exactly when ``ER Status``, ``PR Status`` and
        ``HER2 Status`` are all ``"Negative"``; every other column is drawn
        independently.
    """
    patient_ids = [f"PPK_{i + 1:05d}" for i in range(num_patients)]
    receptor_levels = ["Positive", "Negative"]

    # Draw the three receptor statuses first: triple-negative status is
    # defined by them, so it is derived rather than sampled on its own
    # (an independent draw produced contradictory rows such as ER-positive
    # patients flagged as triple-negative).
    er_status = np.random.choice(receptor_levels, num_patients)
    pr_status = np.random.choice(receptor_levels, num_patients)
    her2_status = np.random.choice(receptor_levels, num_patients)
    all_negative = (
        (er_status == "Negative")
        & (pr_status == "Negative")
        & (her2_status == "Negative")
    )
    tnbc_status = np.where(all_negative, "Positive", "Negative")

    breast_cancer_data = {
        "Patient ID": patient_ids,
        "Age": np.random.randint(30, 80, num_patients),
        "Menopausal Status": np.random.choice(["Post-menopausal", "Pre-menopausal"], num_patients),
        "Tumor Size (cm)": np.round(np.random.lognormal(mean=0.7, sigma=0.5, size=num_patients), 2),
        "Lymph Node Involvement": np.random.choice(["Positive", "Negative"], num_patients),
        "Tumor Grade": np.random.choice([1, 2, 3], num_patients),
        "Tumor Stage": np.random.choice(["I", "II", "III", "IV"], num_patients),
        "ER Status": er_status,
        "PR Status": pr_status,
        "HER2 Status": her2_status,
        "Ki-67 Level": np.random.choice(["High", "Low"], num_patients),
        "TNBC Status": tnbc_status,
        "BRCA Mutation": np.random.choice(["Positive", "Negative"], num_patients),
        "Overall Health": np.random.choice(["Good", "Poor"], num_patients),
        "Genomic Recurrence Score": np.random.choice(["Low", "Intermediate", "High", "N/A"], num_patients),
        "Treatment": np.random.choice(["Surgery", "Chemotherapy", "Radiation Therapy"], num_patients),
    }
    return pd.DataFrame(breast_cancer_data)

# Main Streamlit App
st.title("Synthetic Medical Data Generator")

# Input parameters: each slider is (label, min, max, default).
num_members = st.slider("Number of Members to Generate", 10, 1000, 100)
num_providers = st.slider("Number of Providers to Generate", 10, 500, 100)
num_services = st.slider("Number of Services to Generate", 10, 2000, 500)
num_patients = st.slider("Number of Breast Cancer Patients to Generate", 10, 500, 100)

if st.button("Generate Data"):
    enrollments_df = generate_enrollments(num_members)
    members_df = generate_members(num_members)
    providers_df = generate_providers(num_providers)
    services_df = generate_services(num_services, enrollments_df["PRIMARY_PERSON_KEY"].tolist())
    breast_cancer_df = generate_breast_cancer_data(num_patients)

    # generate_services is called without any provider information, so point
    # its provider-key columns at providers that were actually generated;
    # otherwise service rows can reference PROV_KEY values that do not exist
    # in the providers table.
    provider_pool = providers_df["PROV_KEY"].to_numpy()
    for key_column in ("ATT_PROV_KEY", "BILL_PROV_KEY"):
        services_df[key_column] = np.random.choice(provider_pool, len(services_df))

    # st.button is only True during the rerun triggered by its own click.
    # Clicking any download button triggers another rerun, so the tables are
    # kept in session state to stay visible and downloadable afterwards.
    # Each entry: (section heading, button label, file name, table).
    st.session_state["generated_tables"] = [
        ("Enrollments Data", "Download Enrollments", "enrollments.csv", enrollments_df),
        ("Members Data", "Download Members", "members.csv", members_df),
        ("Providers Data", "Download Providers", "providers.csv", providers_df),
        ("Services Data", "Download Services", "services.csv", services_df),
        ("Breast Cancer Data", "Download Breast Cancer Data", "breast_cancer.csv", breast_cancer_df),
    ]

# Display the most recently generated tables (if any): a five-row preview
# plus a download button carrying the full table as CSV.
if "generated_tables" in st.session_state:
    for heading, button_label, file_name, table in st.session_state["generated_tables"]:
        st.subheader(heading)
        st.dataframe(table.head())
        st.download_button(button_label, table.to_csv(index=False), file_name, mime="text/csv")