Spaces:
Sleeping
Sleeping
eaglelandsonce
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -2,121 +2,145 @@ import streamlit as st
|
|
2 |
import pandas as pd
|
3 |
import numpy as np
|
4 |
|
5 |
-
#
|
6 |
-
|
7 |
-
unique_ids = [f"MEM_{i:05d}" for i in range(1, num_members + 1)]
|
8 |
-
primary_keys = [f"PPK_{i:05d}" for i in range(1, num_members + 1)]
|
9 |
-
|
10 |
-
# Synthetic Enrollments
|
11 |
-
enrollments_data = {
|
12 |
-
"MEM_AGE": np.random.randint(18, 80, num_members),
|
13 |
-
"MEM_MSA_NAME": np.random.choice(["DETROIT", "HONOLULU", "LOS ANGELES"], num_members),
|
14 |
-
"MEM_STAT": np.random.choice(["ACTIVE", "INACTIVE"], num_members),
|
15 |
-
"MEMBER_ID": unique_ids,
|
16 |
-
"PRIMARY_PERSON_KEY": primary_keys,
|
17 |
-
"PAYER_LOB": np.random.choice(["MEDICAID", "COMMERCIAL", "MEDICARE"], num_members),
|
18 |
-
"PAYER_TYPE": np.random.choice(["PPO", "HMO"], num_members),
|
19 |
-
"PRIMARY_CHRONIC_CONDITION_ROLLUP_DESC": np.random.choice(["Cancer", "Diabetes", "Hypertension"], num_members),
|
20 |
-
"Count of PRIMARY_CHRONIC_CONDITION_ROLLUP_ID": np.random.randint(1, 5, num_members),
|
21 |
-
"PROD_TYPE": np.random.choice(["DENTAL", "VISION", "MEDICAL"], num_members),
|
22 |
-
"RELATION": np.random.choice(["SUBSCRIBER", "DEPENDENT"], num_members),
|
23 |
-
"Sum of QTY_MM_DN": np.random.randint(0, 10, num_members),
|
24 |
-
"Sum of QTY_MM_MD": np.random.randint(0, 10, num_members),
|
25 |
-
"Sum of QTY_MM_RX": np.random.randint(0, 10, num_members),
|
26 |
-
"Sum of QTY_MM_VS": np.random.randint(0, 10, num_members),
|
27 |
-
"YEARMO": np.random.randint(202201, 202412, num_members),
|
28 |
-
}
|
29 |
-
enrollments_df = pd.DataFrame(enrollments_data)
|
30 |
-
|
31 |
-
# Synthetic Members
|
32 |
-
members_data = {
|
33 |
-
"MEM_ETHNICITY": np.random.choice(["Hispanic", "Non-Hispanic", None], num_members),
|
34 |
-
"MEM_GENDER": ["F"] * num_members, # Ensuring all members are female
|
35 |
-
"MEM_MSA_NAME": enrollments_data["MEM_MSA_NAME"],
|
36 |
-
"MEM_RACE": np.random.choice(["White", "Black", "Asian", None], num_members),
|
37 |
-
"MEM_STATE": np.random.choice(["MI", "HI", "CA"], num_members),
|
38 |
-
"MEM_ZIP3": np.random.randint(100, 999, num_members),
|
39 |
-
"MEMBER_ID": unique_ids,
|
40 |
-
"PRIMARY_PERSON_KEY": primary_keys,
|
41 |
-
}
|
42 |
-
members_df = pd.DataFrame(members_data)
|
43 |
-
|
44 |
-
# Synthetic Providers
|
45 |
-
providers_data = {
|
46 |
-
"PROV_CLINIC_STATE": np.random.choice(["MI", "HI", "CA"], num_members),
|
47 |
-
"PROV_CLINIC_ZIP": np.random.randint(10000, 99999, num_members),
|
48 |
-
"PROV_KEY": [f"PK_{i:05d}" for i in range(1, num_members + 1)],
|
49 |
-
"PROV_NPI_ORG": np.random.randint(1, 50, num_members),
|
50 |
-
"PROV_TAXONOMY": np.random.choice(["208100000X", "207RE0101X"], num_members),
|
51 |
-
"PROV_TYPE": np.random.choice(["Type1", "Type2"], num_members),
|
52 |
-
}
|
53 |
-
providers_df = pd.DataFrame(providers_data)
|
54 |
-
|
55 |
-
# Synthetic Services
|
56 |
-
services_data = {
|
57 |
-
"MEMBER_ID": unique_ids,
|
58 |
-
"PRIMARY_PERSON_KEY": primary_keys,
|
59 |
-
"Sum of AMT_ALLOWED": np.random.uniform(1000, 10000, num_members),
|
60 |
-
"Sum of AMT_BILLED": np.random.uniform(1000, 15000, num_members),
|
61 |
-
"Count of AMT_PAID": np.random.randint(1, 5, num_members),
|
62 |
-
"ATT_PROV_KEY": np.random.choice(providers_data["PROV_KEY"], num_members),
|
63 |
-
"BILL_PROV_KEY": np.random.choice(providers_data["PROV_KEY"], num_members),
|
64 |
-
"CLAIM_IN_NETWORK": np.random.choice(["Y", "N", None], num_members),
|
65 |
-
"RELATION": enrollments_data["RELATION"],
|
66 |
-
"SERVICE_SETTING": np.random.choice(["OUTPATIENT", "INPATIENT"], num_members),
|
67 |
-
"Sum of SERVICE_LINE": np.random.randint(1, 10, num_members),
|
68 |
-
"Sum of SV_UNITS": np.random.randint(1, 100, num_members),
|
69 |
-
"YEARMO": enrollments_data["YEARMO"],
|
70 |
-
}
|
71 |
-
services_df = pd.DataFrame(services_data)
|
72 |
-
|
73 |
-
return enrollments_df, members_df, providers_df, services_df
|
74 |
-
|
75 |
-
|
76 |
-
# Streamlit App
|
77 |
-
st.title("Synthetic Medical Billing Data Generator")
|
78 |
-
|
79 |
-
# Slider for number of members
|
80 |
-
num_members = st.slider("Select number of unique members:", min_value=10, max_value=1000, step=10, value=100)
|
81 |
-
|
82 |
-
# Generate synthetic data
|
83 |
-
enrollments_df, members_df, providers_df, services_df = generate_synthetic_data(num_members)
|
84 |
-
|
85 |
-
# Display dataframes
|
86 |
-
st.subheader("Preview of Generated Data")
|
87 |
-
st.write("Enrollments Data")
|
88 |
-
st.dataframe(enrollments_df.head())
|
89 |
-
st.write("Members Data")
|
90 |
-
st.dataframe(members_df.head())
|
91 |
-
st.write("Providers Data")
|
92 |
-
st.dataframe(providers_df.head())
|
93 |
-
st.write("Services Data")
|
94 |
-
st.dataframe(services_df.head())
|
95 |
-
|
96 |
-
# Allow downloading the generated files
|
97 |
-
st.subheader("Download Synthetic Data")
|
98 |
-
st.download_button(
|
99 |
-
label="Download Enrollments Data",
|
100 |
-
data=enrollments_df.to_csv(index=False),
|
101 |
-
file_name="Synthetic_Enrollments.csv",
|
102 |
-
mime="text/csv",
|
103 |
-
)
|
104 |
-
st.download_button(
|
105 |
-
label="Download Members Data",
|
106 |
-
data=members_df.to_csv(index=False),
|
107 |
-
file_name="Synthetic_Members.csv",
|
108 |
-
mime="text/csv",
|
109 |
-
)
|
110 |
-
st.download_button(
|
111 |
-
label="Download Providers Data",
|
112 |
-
data=providers_df.to_csv(index=False),
|
113 |
-
file_name="Synthetic_Providers.csv",
|
114 |
-
mime="text/csv",
|
115 |
-
)
|
116 |
-
st.download_button(
|
117 |
-
label="Download Services Data",
|
118 |
-
data=services_df.to_csv(index=False),
|
119 |
-
file_name="Synthetic_Services.csv",
|
120 |
-
mime="text/csv",
|
121 |
-
)
|
122 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
import pandas as pd
|
3 |
import numpy as np
|
4 |
|
5 |
+
# Seed for reproducibility
|
6 |
+
# NOTE(review): this reseeds NumPy's global RNG every time the module's top
# level runs, so each run starts from the same random sequence.
np.random.seed(42)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
|
8 |
+
# Function to generate synthetic breast cancer data
|
9 |
+
def generate_breast_cancer_data(num_patients=100):
    """Build a table of synthetic breast cancer patient records.

    One row is generated per patient. Every random draw comes from NumPy's
    global random state, so seeding it before the call makes the output
    repeatable.

    Args:
        num_patients: Number of patient rows to generate.

    Returns:
        A pandas DataFrame with one row per patient and the columns
        "Patient ID", "Age", "Menopausal Status", "Tumor Size (cm)",
        "Lymph Node Involvement", "Tumor Grade", "Tumor Stage", "ER Status",
        "PR Status", "HER2 Status", "Ki-67 Level", "TNBC Status",
        "BRCA Mutation", "Overall Health", "Genomic Recurrence Score" and
        "Treatment".
    """
    column_names = (
        "Patient ID",
        "Age",
        "Menopausal Status",
        "Tumor Size (cm)",
        "Lymph Node Involvement",
        "Tumor Grade",
        "Tumor Stage",
        "ER Status",
        "PR Status",
        "HER2 Status",
        "Ki-67 Level",
        "TNBC Status",
        "BRCA Mutation",
        "Overall Health",
        "Genomic Recurrence Score",
        "Treatment",
    )
    table = {name: [] for name in column_names}
    table["Patient ID"] = [f"BC_{n:05d}" for n in range(1, num_patients + 1)]

    # Early-stage plans for ER-positive patients, keyed by recurrence score.
    low_risk_plan = "Surgery, Hormone Therapy, and Radiation Therapy"
    hormone_plans = {
        "High": "Surgery, Chemotherapy, Hormone Therapy, and Radiation Therapy",
        "Intermediate": "Surgery, Consider Chemotherapy, Hormone Therapy, and Radiation Therapy",
    }
    chemo_plan = "Surgery, Chemotherapy, and Radiation Therapy"

    for _ in range(num_patients):
        # NOTE: the order of the random draws below determines the output for
        # a given seed; do not reorder them.

        # Age is clamped to the 30-80 range.
        age = min(max(int(np.random.normal(60, 10)), 30), 80)
        menopausal = "Pre-menopausal" if age < 50 else "Post-menopausal"

        tumor_size = round(np.random.lognormal(mean=0.7, sigma=0.5), 2)
        large_tumor = tumor_size > 2.0

        # Larger tumors are more likely to be node-positive and high grade.
        node_positive_prob = 0.6 if large_tumor else 0.3
        if np.random.rand() < node_positive_prob:
            lymph_node = "Positive"
        else:
            lymph_node = "Negative"

        grade_probs = [0.1, 0.4, 0.5] if large_tumor else [0.3, 0.5, 0.2]
        grade = np.random.choice([1, 2, 3], p=grade_probs)

        # Stage from size and nodal status, then a 5% chance of overriding
        # the result with stage IV.
        if lymph_node == "Positive" or tumor_size > 5.0:
            stage = "III"
        elif large_tumor:
            stage = "II"
        else:
            stage = "I"
        if np.random.rand() < 0.05:
            stage = "IV"

        # PR can only be positive when ER is positive.
        er = np.random.choice(["Positive", "Negative"], p=[0.75, 0.25])
        pr = "Negative"
        if er == "Positive" and np.random.rand() > 0.1:
            pr = "Positive"

        her2_probs = [0.3, 0.7] if grade == 3 else [0.15, 0.85]
        her2 = np.random.choice(["Positive", "Negative"], p=her2_probs)

        ki67 = "Low"
        if grade == 3 and np.random.rand() < 0.8:
            ki67 = "High"

        triple_negative = (
            er == "Negative" and pr == "Negative" and her2 == "Negative"
        )
        tnbc = "Positive" if triple_negative else "Negative"

        brca = "Negative"
        if (triple_negative or age < 40) and np.random.rand() < 0.2:
            brca = "Positive"

        health = "Poor"
        if age < 65 and np.random.rand() < 0.9:
            health = "Good"

        # A recurrence score is only drawn for ER-positive, HER2-negative cases.
        if er == "Positive" and her2 == "Negative":
            recurrence_score = np.random.choice(
                ["Low", "Intermediate", "High"], p=[0.6, 0.3, 0.1]
            )
        else:
            recurrence_score = "N/A"

        # Treatment recommendation derived from stage and receptor status.
        if stage == "IV":
            treat = "Systemic Therapy (Palliative Care)"
        elif stage == "III":
            parts = ["Neoadjuvant Chemotherapy, Surgery, Radiation Therapy"]
            if her2 == "Positive":
                parts.append("HER2-Targeted Therapy")
            if er == "Positive":
                parts.append("Hormone Therapy")
            treat = ", ".join(parts)
        elif triple_negative:
            treat = chemo_plan
        elif er == "Positive" and recurrence_score != "N/A":
            treat = hormone_plans.get(str(recurrence_score), low_risk_plan)
        elif her2 == "Positive":
            treat = "Surgery, HER2-Targeted Therapy, Chemotherapy, and Radiation Therapy"
        else:
            treat = chemo_plan

        record = {
            "Age": age,
            "Menopausal Status": menopausal,
            "Tumor Size (cm)": tumor_size,
            "Lymph Node Involvement": lymph_node,
            "Tumor Grade": grade,
            "Tumor Stage": stage,
            "ER Status": er,
            "PR Status": pr,
            "HER2 Status": her2,
            "Ki-67 Level": ki67,
            "TNBC Status": tnbc,
            "BRCA Mutation": brca,
            "Overall Health": health,
            "Genomic Recurrence Score": recurrence_score,
            "Treatment": treat,
        }
        for name, value in record.items():
            table[name].append(value)

    return pd.DataFrame(table)
|
132 |
+
|
133 |
+
|
134 |
+
# Main Streamlit App
st.title("Synthetic Data Generator")
num_patients = st.slider("Number of Patients to Generate", 10, 1000, 100)

# Streamlit re-executes this script on every widget interaction, and
# st.button is True only during the run triggered by its own click. Storing
# the generated frame in session state keeps the table and the download
# button on screen after a later interaction (such as clicking "Download"),
# instead of having them vanish on the next rerun.
if st.button("Generate Data"):
    st.session_state["breast_cancer_data"] = generate_breast_cancer_data(num_patients)

breast_cancer_data = st.session_state.get("breast_cancer_data")
if breast_cancer_data is not None:
    st.write("Breast Cancer Data")
    st.dataframe(breast_cancer_data)
    st.download_button(
        "Download Breast Cancer Data",
        data=breast_cancer_data.to_csv(index=False),
        file_name="breast_cancer_data.csv",
        mime="text/csv",
    )
|