# NOTE(review): page-scrape residue, not part of the program: "Spaces: Sleeping Sleeping"
import streamlit as st | |
import pandas as pd | |
import numpy as np | |
# Seed NumPy's global random generator with a fixed value so the np.random
# draws made by the generator functions below repeat on every script run.
np.random.seed(42)
def generate_enrollments(num_members):
    """Build a synthetic enrollment table with one row per member.

    Args:
        num_members: Number of rows to generate.

    Returns:
        pandas.DataFrame with ``num_members`` rows. ``MEMBER_ID`` and
        ``PRIMARY_PERSON_KEY`` carry the same sequential ``PPK_00001``-style
        key; every other column is drawn independently from NumPy's global
        random generator. ``YEARMO`` is a valid YYYYMM period between
        202201 and 202412 inclusive.
    """
    primary_keys = [f"PPK_{i + 1:05d}" for i in range(num_members)]

    # YEARMO encodes year and month as YYYYMM. Sampling a raw integer range
    # would produce impossible months (202213, 202300, ...) and, because
    # randint's upper bound is exclusive, never December 2024. Sample from
    # the real calendar months instead.
    valid_yearmo = [
        year * 100 + month
        for year in (2022, 2023, 2024)
        for month in range(1, 13)
    ]

    enrollments_data = {
        "MEM_AGE": np.random.randint(18, 80, num_members),
        "MEM_MSA_NAME": np.random.choice(
            ["DETROIT", "HONOLULU", "LOS ANGELES"], num_members
        ),
        "MEM_STAT": np.random.choice(["ACTIVE", "INACTIVE"], num_members),
        "MEMBER_ID": primary_keys,
        "PRIMARY_PERSON_KEY": primary_keys,
        "PAYER_LOB": np.random.choice(
            ["MEDICAID", "COMMERCIAL", "MEDICARE"], num_members
        ),
        "PAYER_TYPE": np.random.choice(["PPO", "HMO"], num_members),
        "PRIMARY_CHRONIC_CONDITION_ROLLUP_DESC": np.random.choice(
            ["Cancer", "Diabetes", "Hypertension"], num_members
        ),
        "Count of PRIMARY_CHRONIC_CONDITION_ROLLUP_ID": np.random.randint(
            1, 5, num_members
        ),
        "PROD_TYPE": np.random.choice(
            ["DENTAL", "VISION", "MEDICAL"], num_members
        ),
        "RELATION": np.random.choice(["SUBSCRIBER", "DEPENDENT"], num_members),
        "YEARMO": np.random.choice(valid_yearmo, num_members),
    }
    return pd.DataFrame(enrollments_data)
def generate_members(num_members):
    """Build a synthetic member demographics table with one row per member.

    Args:
        num_members: Number of rows to generate.

    Returns:
        pandas.DataFrame with ``num_members`` rows. ``MEMBER_ID`` and
        ``PRIMARY_PERSON_KEY`` carry the same sequential ``PPK_00001``-style
        key, ``MEM_GENDER`` is the constant ``"F"``, and ``MEM_STATE`` is
        the state of the row's ``MEM_MSA_NAME``. ``MEM_ETHNICITY`` and
        ``MEM_RACE`` may be ``None``.
    """
    primary_keys = [f"PPK_{i + 1:05d}" for i in range(num_members)]

    # The three metro areas map one-to-one onto the three states used here.
    # Drawing the state independently would pair e.g. DETROIT with HI, so
    # the state is derived from the sampled MSA instead.
    msa_to_state = {"DETROIT": "MI", "HONOLULU": "HI", "LOS ANGELES": "CA"}

    ethnicity = np.random.choice(
        ["Hispanic", "Non-Hispanic", None], num_members
    )
    msa_names = np.random.choice(list(msa_to_state), num_members)
    race = np.random.choice(["White", "Black", "Asian", None], num_members)
    zip3 = np.random.randint(100, 999, num_members)

    members_data = {
        "MEM_ETHNICITY": ethnicity,
        "MEM_GENDER": ["F"] * num_members,
        "MEM_MSA_NAME": msa_names,
        "MEM_RACE": race,
        "MEM_STATE": [msa_to_state[msa] for msa in msa_names],
        "MEM_ZIP3": zip3,
        "MEMBER_ID": primary_keys,
        "PRIMARY_PERSON_KEY": primary_keys,
    }
    return pd.DataFrame(members_data)
def generate_providers(num_providers):
    """Build a synthetic provider table.

    Args:
        num_providers: Number of provider rows to generate.

    Returns:
        pandas.DataFrame with ``num_providers`` rows keyed by a sequential
        ``PROV_KEY`` (``PK_00001``, ``PK_00002``, ...); the remaining
        columns are drawn independently from NumPy's global generator.
    """
    # Draws are made in column order so the random stream is consumed in
    # the same sequence as the table layout.
    clinic_states = np.random.choice(["MI", "HI", "CA"], num_providers)
    clinic_zips = np.random.randint(10000, 99999, num_providers)
    provider_keys = [
        "PK_{:05d}".format(number) for number in range(1, num_providers + 1)
    ]
    npi_org_sums = np.random.randint(1, 50, num_providers)
    taxonomies = np.random.choice(["208100000X", "207RE0101X"], num_providers)
    provider_types = np.random.choice(["Type1", "Type2"], num_providers)

    return pd.DataFrame(
        {
            "PROV_CLINIC_STATE": clinic_states,
            "PROV_CLINIC_ZIP": clinic_zips,
            "PROV_KEY": provider_keys,
            "Sum of PROV_NPI_ORG": npi_org_sums,
            "PROV_TAXONOMY": taxonomies,
            "PROV_TYPE": provider_types,
        }
    )
def generate_services(num_services, primary_keys, provider_keys=None):
    """Build a synthetic service (claim line) table.

    Args:
        num_services: Number of service rows to generate.
        primary_keys: Member keys to sample ``PRIMARY_PERSON_KEY`` from.
        provider_keys: Optional provider keys to sample ``ATT_PROV_KEY`` and
            ``BILL_PROV_KEY`` from, e.g. the ``PROV_KEY`` column of the
            provider table. When omitted, keys ``PK_00001`` ..
            ``PK_<len(primary_keys)>`` are used, which keeps the pool size
            the two-argument call has always implied.

    Returns:
        pandas.DataFrame with ``num_services`` rows. ``YEARMO`` is a valid
        YYYYMM period between 202201 and 202412 inclusive.

    Raises:
        ValueError: Raised by NumPy when a key pool is empty while
            ``num_services`` is greater than zero.
    """
    if provider_keys is None:
        # The previous randint(1, n) followed by i+1 produced PK_00002..PK_n:
        # it could never emit PK_00001 and raised for a single-member pool.
        # Enumerate the full 1..n range instead.
        provider_keys = [
            f"PK_{i:05d}" for i in range(1, len(primary_keys) + 1)
        ]
    else:
        provider_keys = list(provider_keys)

    # YEARMO is YYYYMM; sample real calendar months rather than a raw
    # integer range that includes values such as 202213 or 202300.
    valid_yearmo = [
        year * 100 + month
        for year in (2022, 2023, 2024)
        for month in range(1, 13)
    ]

    services_data = {
        "PRIMARY_PERSON_KEY": np.random.choice(primary_keys, num_services),
        "Sum of AMT_ALLOWED": np.random.uniform(1000, 10000, num_services),
        "Sum of AMT_BILLED": np.random.uniform(1000, 15000, num_services),
        "Count of AMT_PAID": np.random.randint(1, 5, num_services),
        "ATT_PROV_KEY": np.random.choice(provider_keys, num_services),
        "BILL_PROV_KEY": np.random.choice(provider_keys, num_services),
        "CLAIM_IN_NETWORK": np.random.choice(["Y", "N", None], num_services),
        "RELATION": np.random.choice(
            ["SUBSCRIBER", "DEPENDENT"], num_services
        ),
        "SERVICE_SETTING": np.random.choice(
            ["OUTPATIENT", "INPATIENT"], num_services
        ),
        "Sum of SERVICE_LINE": np.random.randint(1, 10, num_services),
        "Sum of SV_UNITS": np.random.randint(1, 100, num_services),
        "YEARMO": np.random.choice(valid_yearmo, num_services),
    }
    return pd.DataFrame(services_data)
def generate_breast_cancer_data(num_patients):
    """Build a synthetic breast-cancer clinical table, one row per patient.

    Args:
        num_patients: Number of patient rows to generate.

    Returns:
        pandas.DataFrame with ``num_patients`` rows keyed by a sequential
        ``Patient ID`` (``PPK_00001``, ...). ``TNBC Status`` is derived from
        the receptor columns; every other clinical column is drawn
        independently from NumPy's global random generator.
    """
    patient_ids = [f"PPK_{i + 1:05d}" for i in range(num_patients)]

    status_options = ["Positive", "Negative"]
    er_status = np.random.choice(status_options, num_patients)
    pr_status = np.random.choice(status_options, num_patients)
    her2_status = np.random.choice(status_options, num_patients)

    # Triple-negative breast cancer means ER, PR and HER2 are all negative.
    # Sampling this flag on its own would contradict the receptor columns,
    # so it is computed from them.
    tnbc_status = [
        "Positive"
        if all(marker == "Negative" for marker in receptors)
        else "Negative"
        for receptors in zip(er_status, pr_status, her2_status)
    ]

    breast_cancer_data = {
        "Patient ID": patient_ids,
        "Age": np.random.randint(30, 80, num_patients),
        "Menopausal Status": np.random.choice(
            ["Post-menopausal", "Pre-menopausal"], num_patients
        ),
        "Tumor Size (cm)": np.round(
            np.random.lognormal(mean=0.7, sigma=0.5, size=num_patients), 2
        ),
        "Lymph Node Involvement": np.random.choice(
            status_options, num_patients
        ),
        "Tumor Grade": np.random.choice([1, 2, 3], num_patients),
        "Tumor Stage": np.random.choice(
            ["I", "II", "III", "IV"], num_patients
        ),
        "ER Status": er_status,
        "PR Status": pr_status,
        "HER2 Status": her2_status,
        "Ki-67 Level": np.random.choice(["High", "Low"], num_patients),
        "TNBC Status": tnbc_status,
        "BRCA Mutation": np.random.choice(status_options, num_patients),
        "Overall Health": np.random.choice(["Good", "Poor"], num_patients),
        "Genomic Recurrence Score": np.random.choice(
            ["Low", "Intermediate", "High", "N/A"], num_patients
        ),
        "Treatment": np.random.choice(
            ["Surgery", "Chemotherapy", "Radiation Therapy"], num_patients
        ),
    }
    return pd.DataFrame(breast_cancer_data)
# Main Streamlit App
st.title("Synthetic Medical Data Generator")

# Input parameters: each slider is (label, minimum, maximum, default).
num_members = st.slider("Number of Members to Generate", 10, 1000, 100)
num_providers = st.slider("Number of Providers to Generate", 10, 500, 100)
num_services = st.slider("Number of Services to Generate", 10, 2000, 500)
num_patients = st.slider(
    "Number of Breast Cancer Patients to Generate", 10, 500, 100
)

# st.button is True only on the script run triggered by its own click, and
# clicking a download button re-runs the script. Rendering the results inside
# the `if` would therefore wipe every table and the remaining download
# buttons after the first download. The generated tables are kept in session
# state and rendered from there instead; they stay on screen (even if the
# sliders are moved) until "Generate Data" is clicked again.
if st.button("Generate Data"):
    enrollments_df = generate_enrollments(num_members)
    members_df = generate_members(num_members)
    providers_df = generate_providers(num_providers)
    services_df = generate_services(
        num_services, enrollments_df["PRIMARY_PERSON_KEY"].tolist()
    )
    breast_cancer_df = generate_breast_cancer_data(num_patients)

    # (section heading, download button label, file name, table)
    st.session_state["generated_datasets"] = [
        (
            "Enrollments Data",
            "Download Enrollments",
            "enrollments.csv",
            enrollments_df,
        ),
        ("Members Data", "Download Members", "members.csv", members_df),
        (
            "Providers Data",
            "Download Providers",
            "providers.csv",
            providers_df,
        ),
        ("Services Data", "Download Services", "services.csv", services_df),
        (
            "Breast Cancer Data",
            "Download Breast Cancer Data",
            "breast_cancer.csv",
            breast_cancer_df,
        ),
    ]

# Display data: a five-row preview plus a full CSV download per table.
for heading, button_label, file_name, frame in st.session_state.get(
    "generated_datasets", []
):
    st.subheader(heading)
    st.dataframe(frame.head())
    st.download_button(
        button_label,
        frame.to_csv(index=False),
        file_name,
        mime="text/csv",
    )