Spaces:
Sleeping
Sleeping
File size: 6,886 Bytes
893ed02 6e8dad4 c069834 95a0a03 6e8dad4 95a0a03 6e8dad4 95a0a03 6e8dad4 95a0a03 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 |
import streamlit as st
import pandas as pd
import numpy as np
# Pin the global NumPy random state so each script run yields the same draws
np.random.seed(seed=42)
# Function to generate synthetic Enrollments
def generate_enrollments(num_members):
    """Build a synthetic enrollments table with one row per member.

    Args:
        num_members: Number of rows to generate.

    Returns:
        A pandas DataFrame with ``num_members`` rows. ``MEMBER_ID`` and
        ``PRIMARY_PERSON_KEY`` both hold the sequential keys ``PPK_00001``,
        ``PPK_00002``, ...; every other column is drawn independently from
        the global NumPy random state.
    """
    primary_keys = [f"PPK_{i+1:05d}" for i in range(num_members)]
    # YEARMO is treated as a YYYYMM integer. Sampling a raw integer range
    # between 202201 and 202412 mostly yields impossible months (202213,
    # 202300, ...), so draw from the explicit list of calendar months from
    # January 2022 through December 2024 instead.
    valid_yearmo = [
        year * 100 + month
        for year in range(2022, 2025)
        for month in range(1, 13)
    ]
    enrollments_data = {
        "MEM_AGE": np.random.randint(18, 80, num_members),
        "MEM_MSA_NAME": np.random.choice(["DETROIT", "HONOLULU", "LOS ANGELES"], num_members),
        "MEM_STAT": np.random.choice(["ACTIVE", "INACTIVE"], num_members),
        "MEMBER_ID": primary_keys,
        "PRIMARY_PERSON_KEY": primary_keys,
        "PAYER_LOB": np.random.choice(["MEDICAID", "COMMERCIAL", "MEDICARE"], num_members),
        "PAYER_TYPE": np.random.choice(["PPO", "HMO"], num_members),
        "PRIMARY_CHRONIC_CONDITION_ROLLUP_DESC": np.random.choice(["Cancer", "Diabetes", "Hypertension"], num_members),
        "Count of PRIMARY_CHRONIC_CONDITION_ROLLUP_ID": np.random.randint(1, 5, num_members),
        "PROD_TYPE": np.random.choice(["DENTAL", "VISION", "MEDICAL"], num_members),
        "RELATION": np.random.choice(["SUBSCRIBER", "DEPENDENT"], num_members),
        "YEARMO": np.random.choice(valid_yearmo, num_members),
    }
    return pd.DataFrame(enrollments_data)
# Function to generate synthetic Members
def generate_members(num_members):
    """Build a synthetic member demographics table with one row per member.

    Args:
        num_members: Number of rows to generate.

    Returns:
        A pandas DataFrame with ``num_members`` rows. ``MEMBER_ID`` and
        ``PRIMARY_PERSON_KEY`` both hold the sequential keys ``PPK_00001``,
        ``PPK_00002``, ...; ``MEM_GENDER`` is always ``"F"``;
        ``MEM_ETHNICITY`` and ``MEM_RACE`` may contain ``None``.
    """
    primary_keys = [f"PPK_{i+1:05d}" for i in range(num_members)]
    # Draw the metro area and its state together. Sampling the two columns
    # independently produced contradictory rows such as DETROIT in HI.
    msa_state_pairs = [("DETROIT", "MI"), ("HONOLULU", "HI"), ("LOS ANGELES", "CA")]
    msa_names = np.array([msa for msa, _ in msa_state_pairs])
    state_codes = np.array([state for _, state in msa_state_pairs])
    location_idx = np.random.randint(0, len(msa_state_pairs), num_members)
    members_data = {
        "MEM_ETHNICITY": np.random.choice(["Hispanic", "Non-Hispanic", None], num_members),
        "MEM_GENDER": ["F"] * num_members,
        "MEM_MSA_NAME": msa_names[location_idx],
        "MEM_RACE": np.random.choice(["White", "Black", "Asian", None], num_members),
        "MEM_STATE": state_codes[location_idx],
        "MEM_ZIP3": np.random.randint(100, 999, num_members),
        "MEMBER_ID": primary_keys,
        "PRIMARY_PERSON_KEY": primary_keys,
    }
    return pd.DataFrame(members_data)
# Function to generate synthetic Providers
def generate_providers(num_providers):
    """Build a synthetic provider table with ``num_providers`` rows.

    ``PROV_KEY`` holds the sequential keys ``PK_00001``, ``PK_00002``, ...;
    every other column is drawn from the global NumPy random state.
    """
    # The draws below happen in column order so that a seeded run consumes
    # the random stream in the same sequence as the columns appear.
    clinic_states = np.random.choice(["MI", "HI", "CA"], num_providers)
    clinic_zips = np.random.randint(10000, 99999, num_providers)
    provider_keys = ["PK_{:05d}".format(seq) for seq in range(1, num_providers + 1)]
    npi_org_sums = np.random.randint(1, 50, num_providers)
    taxonomies = np.random.choice(["208100000X", "207RE0101X"], num_providers)
    provider_types = np.random.choice(["Type1", "Type2"], num_providers)

    return pd.DataFrame(
        {
            "PROV_CLINIC_STATE": clinic_states,
            "PROV_CLINIC_ZIP": clinic_zips,
            "PROV_KEY": provider_keys,
            "Sum of PROV_NPI_ORG": npi_org_sums,
            "PROV_TAXONOMY": taxonomies,
            "PROV_TYPE": provider_types,
        }
    )
# Function to generate synthetic Services
def generate_services(num_services, primary_keys, provider_keys=None):
    """Build a synthetic services table with ``num_services`` rows.

    Args:
        num_services: Number of rows to generate.
        primary_keys: Member keys that ``PRIMARY_PERSON_KEY`` is sampled from.
        provider_keys: Optional provider keys that ``ATT_PROV_KEY`` and
            ``BILL_PROV_KEY`` are sampled from. When omitted, the keys
            ``PK_00001`` .. ``PK_<len(primary_keys)>`` are synthesized, which
            keeps the provider-key range tied to the number of member keys.

    Returns:
        A pandas DataFrame with ``num_services`` rows.

    Raises:
        ValueError: Raised by NumPy when rows are requested but
            ``primary_keys`` (or the provider key pool) is empty.
    """
    if provider_keys is None:
        # Start at PK_00001. Drawing indices from randint(1, len) and then
        # adding 1 could never produce the first key, and raised ValueError
        # when only a single member key was supplied.
        provider_keys = [f"PK_{i+1:05d}" for i in range(len(primary_keys))]
    # YEARMO is treated as a YYYYMM integer; sample only real calendar months
    # (January 2022 through December 2024) rather than a raw integer range
    # that also contains values such as 202213 or 202300.
    valid_yearmo = [
        year * 100 + month
        for year in range(2022, 2025)
        for month in range(1, 13)
    ]
    services_data = {
        "PRIMARY_PERSON_KEY": np.random.choice(primary_keys, num_services),
        "Sum of AMT_ALLOWED": np.random.uniform(1000, 10000, num_services),
        "Sum of AMT_BILLED": np.random.uniform(1000, 15000, num_services),
        "Count of AMT_PAID": np.random.randint(1, 5, num_services),
        "ATT_PROV_KEY": np.random.choice(provider_keys, num_services),
        "BILL_PROV_KEY": np.random.choice(provider_keys, num_services),
        "CLAIM_IN_NETWORK": np.random.choice(["Y", "N", None], num_services),
        "RELATION": np.random.choice(["SUBSCRIBER", "DEPENDENT"], num_services),
        "SERVICE_SETTING": np.random.choice(["OUTPATIENT", "INPATIENT"], num_services),
        "Sum of SERVICE_LINE": np.random.randint(1, 10, num_services),
        "Sum of SV_UNITS": np.random.randint(1, 100, num_services),
        "YEARMO": np.random.choice(valid_yearmo, num_services),
    }
    return pd.DataFrame(services_data)
# Function to generate synthetic BreastCancer data
def generate_breast_cancer_data(num_patients):
    """Build a synthetic breast-cancer patient table.

    Args:
        num_patients: Number of rows to generate.

    Returns:
        A pandas DataFrame with ``num_patients`` rows. ``Patient ID`` holds
        the sequential keys ``PPK_00001``, ``PPK_00002``, ...; ``TNBC Status``
        is derived from the receptor columns; the remaining columns are drawn
        independently from the global NumPy random state.
    """
    patient_ids = [f"PPK_{i+1:05d}" for i in range(num_patients)]
    breast_cancer_data = {
        "Patient ID": patient_ids,
        "Age": np.random.randint(30, 80, num_patients),
        "Menopausal Status": np.random.choice(["Post-menopausal", "Pre-menopausal"], num_patients),
        # Log-normal draw rounded to two decimals.
        "Tumor Size (cm)": np.round(np.random.lognormal(mean=0.7, sigma=0.5, size=num_patients), 2),
        "Lymph Node Involvement": np.random.choice(["Positive", "Negative"], num_patients),
        "Tumor Grade": np.random.choice([1, 2, 3], num_patients),
        "Tumor Stage": np.random.choice(["I", "II", "III", "IV"], num_patients),
        "ER Status": np.random.choice(["Positive", "Negative"], num_patients),
        "PR Status": np.random.choice(["Positive", "Negative"], num_patients),
        "HER2 Status": np.random.choice(["Positive", "Negative"], num_patients),
        "Ki-67 Level": np.random.choice(["High", "Low"], num_patients),
        "BRCA Mutation": np.random.choice(["Positive", "Negative"], num_patients),
        "Overall Health": np.random.choice(["Good", "Poor"], num_patients),
        "Genomic Recurrence Score": np.random.choice(["Low", "Intermediate", "High", "N/A"], num_patients),
        "Treatment": np.random.choice(["Surgery", "Chemotherapy", "Radiation Therapy"], num_patients),
    }
    patients = pd.DataFrame(breast_cancer_data)

    # Triple-negative breast cancer means ER, PR and HER2 are all negative,
    # so the flag is computed from those columns. Drawing it independently
    # produced rows marked TNBC-positive alongside a positive receptor.
    receptor_columns = ["ER Status", "PR Status", "HER2 Status"]
    all_receptors_negative = (patients[receptor_columns] == "Negative").all(axis=1)
    tnbc_status = np.where(all_receptors_negative, "Positive", "Negative")
    # Insert right after "Ki-67 Level" to keep the established column order.
    patients.insert(patients.columns.get_loc("Ki-67 Level") + 1, "TNBC Status", tnbc_status)
    return patients
# Main Streamlit App
st.title("Synthetic Medical Data Generator")

# Input parameters
num_members = st.slider("Number of Members to Generate", 10, 1000, 100)
num_providers = st.slider("Number of Providers to Generate", 10, 500, 100)
num_services = st.slider("Number of Services to Generate", 10, 2000, 500)
num_patients = st.slider("Number of Breast Cancer Patients to Generate", 10, 500, 100)

# st.button is only True on the rerun triggered by its own click. Clicking a
# download button triggers another rerun in which it is False again, so
# output rendered inside the `if` would vanish after the first download.
# Keep the generated tables in session state and render them outside the `if`.
if st.button("Generate Data"):
    enrollments_df = generate_enrollments(num_members)
    members_df = generate_members(num_members)
    providers_df = generate_providers(num_providers)
    services_df = generate_services(num_services, enrollments_df["PRIMARY_PERSON_KEY"].tolist())
    breast_cancer_df = generate_breast_cancer_data(num_patients)
    # Each entry: (section heading, download button label, file name, table)
    st.session_state["generated_tables"] = [
        ("Enrollments Data", "Download Enrollments", "enrollments.csv", enrollments_df),
        ("Members Data", "Download Members", "members.csv", members_df),
        ("Providers Data", "Download Providers", "providers.csv", providers_df),
        ("Services Data", "Download Services", "services.csv", services_df),
        ("Breast Cancer Data", "Download Breast Cancer Data", "breast_cancer.csv", breast_cancer_df),
    ]

# Display data: a preview of the first rows plus a full CSV download for each
# table from the most recent "Generate Data" click (nothing before the first).
for heading, button_label, file_name, table in st.session_state.get("generated_tables", []):
    st.subheader(heading)
    st.dataframe(table.head())
    st.download_button(button_label, table.to_csv(index=False), file_name)
|