eaglelandsonce committed on
Commit
6e8dad4
·
verified ·
1 Parent(s): c069834

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +141 -117
app.py CHANGED
@@ -2,121 +2,145 @@ import streamlit as st
2
  import pandas as pd
3
  import numpy as np
4
 
5
- # Function to generate synthetic data
6
def generate_synthetic_data(num_members):
    """Generate four linked synthetic medical-billing tables.

    All tables have ``num_members`` rows. Enrollments, members and services
    share the same ``MEMBER_ID`` / ``PRIMARY_PERSON_KEY`` values row by row;
    services additionally reuse the enrollment ``RELATION`` and ``YEARMO``
    columns and reference provider keys that exist in the providers table.

    Args:
        num_members: Number of rows to generate for every table.

    Returns:
        A tuple ``(enrollments_df, members_df, providers_df, services_df)``
        of pandas DataFrames.
    """
    unique_ids = [f"MEM_{i:05d}" for i in range(1, num_members + 1)]
    primary_keys = [f"PPK_{i:05d}" for i in range(1, num_members + 1)]

    # Draw YEARMO from real calendar months only. A plain integer range over
    # 202201..202412 would also produce impossible values such as 202250.
    valid_yearmo = [
        year * 100 + month
        for year in (2022, 2023, 2024)
        for month in range(1, 13)
    ]

    # Each metropolitan area belongs to exactly one state, so the state is
    # derived from the MSA instead of being sampled independently.
    msa_to_state = {"DETROIT": "MI", "HONOLULU": "HI", "LOS ANGELES": "CA"}
    msa_names = np.random.choice(list(msa_to_state), num_members)

    # Synthetic Enrollments
    enrollments_data = {
        "MEM_AGE": np.random.randint(18, 80, num_members),
        "MEM_MSA_NAME": msa_names,
        "MEM_STAT": np.random.choice(["ACTIVE", "INACTIVE"], num_members),
        "MEMBER_ID": unique_ids,
        "PRIMARY_PERSON_KEY": primary_keys,
        "PAYER_LOB": np.random.choice(["MEDICAID", "COMMERCIAL", "MEDICARE"], num_members),
        "PAYER_TYPE": np.random.choice(["PPO", "HMO"], num_members),
        "PRIMARY_CHRONIC_CONDITION_ROLLUP_DESC": np.random.choice(
            ["Cancer", "Diabetes", "Hypertension"], num_members
        ),
        "Count of PRIMARY_CHRONIC_CONDITION_ROLLUP_ID": np.random.randint(1, 5, num_members),
        "PROD_TYPE": np.random.choice(["DENTAL", "VISION", "MEDICAL"], num_members),
        "RELATION": np.random.choice(["SUBSCRIBER", "DEPENDENT"], num_members),
        "Sum of QTY_MM_DN": np.random.randint(0, 10, num_members),
        "Sum of QTY_MM_MD": np.random.randint(0, 10, num_members),
        "Sum of QTY_MM_RX": np.random.randint(0, 10, num_members),
        "Sum of QTY_MM_VS": np.random.randint(0, 10, num_members),
        "YEARMO": np.random.choice(valid_yearmo, num_members),
    }
    enrollments_df = pd.DataFrame(enrollments_data)

    # Synthetic Members
    members_data = {
        "MEM_ETHNICITY": np.random.choice(["Hispanic", "Non-Hispanic", None], num_members),
        "MEM_GENDER": ["F"] * num_members,  # Ensuring all members are female
        "MEM_MSA_NAME": msa_names,
        "MEM_RACE": np.random.choice(["White", "Black", "Asian", None], num_members),
        "MEM_STATE": [msa_to_state[msa] for msa in msa_names],
        "MEM_ZIP3": np.random.randint(100, 999, num_members),
        "MEMBER_ID": unique_ids,
        "PRIMARY_PERSON_KEY": primary_keys,
    }
    members_df = pd.DataFrame(members_data)

    # Synthetic Providers
    provider_keys = [f"PK_{i:05d}" for i in range(1, num_members + 1)]
    providers_data = {
        "PROV_CLINIC_STATE": np.random.choice(["MI", "HI", "CA"], num_members),
        "PROV_CLINIC_ZIP": np.random.randint(10000, 99999, num_members),
        "PROV_KEY": provider_keys,
        "PROV_NPI_ORG": np.random.randint(1, 50, num_members),
        "PROV_TAXONOMY": np.random.choice(["208100000X", "207RE0101X"], num_members),
        "PROV_TYPE": np.random.choice(["Type1", "Type2"], num_members),
    }
    providers_df = pd.DataFrame(providers_data)

    # Synthetic Services
    services_data = {
        "MEMBER_ID": unique_ids,
        "PRIMARY_PERSON_KEY": primary_keys,
        "Sum of AMT_ALLOWED": np.random.uniform(1000, 10000, num_members),
        "Sum of AMT_BILLED": np.random.uniform(1000, 15000, num_members),
        "Count of AMT_PAID": np.random.randint(1, 5, num_members),
        # Attending/billing providers are sampled from the generated keys so
        # every reference resolves to a row in the providers table.
        "ATT_PROV_KEY": np.random.choice(provider_keys, num_members),
        "BILL_PROV_KEY": np.random.choice(provider_keys, num_members),
        "CLAIM_IN_NETWORK": np.random.choice(["Y", "N", None], num_members),
        "RELATION": enrollments_data["RELATION"],
        "SERVICE_SETTING": np.random.choice(["OUTPATIENT", "INPATIENT"], num_members),
        "Sum of SERVICE_LINE": np.random.randint(1, 10, num_members),
        "Sum of SV_UNITS": np.random.randint(1, 100, num_members),
        "YEARMO": enrollments_data["YEARMO"],
    }
    services_df = pd.DataFrame(services_data)

    return enrollments_df, members_df, providers_df, services_df
74
-
75
-
76
# Streamlit App
st.title("Synthetic Medical Billing Data Generator")

# Slider for number of members
num_members = st.slider("Select number of unique members:", min_value=10, max_value=1000, step=10, value=100)

# Streamlit re-executes this script on every widget interaction, including a
# click on a download button. Regenerating unseeded random data on each rerun
# would make the four downloaded CSVs come from different draws, so the tables
# are cached in session state and rebuilt only when the requested size changes
# or the user explicitly asks for a fresh sample.
regenerate = st.button("Regenerate Data")
if (
    regenerate
    or "synthetic_tables" not in st.session_state
    or st.session_state["synthetic_num_members"] != num_members
):
    st.session_state["synthetic_tables"] = generate_synthetic_data(num_members)
    st.session_state["synthetic_num_members"] = num_members

enrollments_df, members_df, providers_df, services_df = st.session_state["synthetic_tables"]

# (display name, dataframe) pairs drive both the preview and download sections
tables = [
    ("Enrollments", enrollments_df),
    ("Members", members_df),
    ("Providers", providers_df),
    ("Services", services_df),
]

# Display dataframes
st.subheader("Preview of Generated Data")
for table_name, table_df in tables:
    st.write(f"{table_name} Data")
    st.dataframe(table_df.head())

# Allow downloading the generated files
st.subheader("Download Synthetic Data")
for table_name, table_df in tables:
    st.download_button(
        label=f"Download {table_name} Data",
        data=table_df.to_csv(index=False),
        file_name=f"Synthetic_{table_name}.csv",
        mime="text/csv",
    )
122
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import pandas as pd
3
  import numpy as np
4
 
5
+ # Seed NumPy's global RNG with a fixed value so every execution of this script repeats the same draws
6
+ np.random.seed(42)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
+ # Function to generate synthetic breast cancer data
9
def _base_stage(tumor_size, lymph_node):
    """Return stage "I", "II" or "III" from tumor size (cm) and nodal status."""
    if lymph_node == "Positive" or tumor_size > 5.0:
        return "III"
    return "I" if tumor_size <= 2.0 else "II"


def _recommend_treatment(stage, er, her2, tnbc, recurrence_score):
    """Return the treatment plan string for one patient's stage and markers."""
    if stage in ("I", "II"):
        if tnbc == "Positive":
            return "Surgery, Chemotherapy, and Radiation Therapy"
        if er == "Positive" and recurrence_score != "N/A":
            if recurrence_score == "High":
                return "Surgery, Chemotherapy, Hormone Therapy, and Radiation Therapy"
            if recurrence_score == "Intermediate":
                return "Surgery, Consider Chemotherapy, Hormone Therapy, and Radiation Therapy"
            return "Surgery, Hormone Therapy, and Radiation Therapy"
        # Reaching this point implies HER2-positive: an ER-negative,
        # HER2-negative patient is always PR-negative and therefore TNBC, and
        # an ER-positive, HER2-negative patient always has a recurrence score.
        # NOTE(review): ER-positive/HER2-positive patients get no Hormone
        # Therapy here although the stage III plan adds it - confirm intent.
        return "Surgery, HER2-Targeted Therapy, Chemotherapy, and Radiation Therapy"
    if stage == "III":
        return (
            "Neoadjuvant Chemotherapy, Surgery, Radiation Therapy"
            + (", HER2-Targeted Therapy" if her2 == "Positive" else "")
            + (", Hormone Therapy" if er == "Positive" else "")
        )
    return "Systemic Therapy (Palliative Care)"


def generate_breast_cancer_data(num_patients=100, seed=None):
    """Generate a synthetic breast cancer patient table.

    Each patient's attributes are sampled sequentially so that later fields
    depend on earlier ones (e.g. nodal status on tumor size, HER2 status on
    grade, treatment on stage and receptor status).

    Args:
        num_patients: Number of patient rows to generate. Zero or a negative
            value yields an empty frame that still has all columns.
        seed: Optional integer seed. When given, a private
            ``np.random.RandomState(seed)`` is used, so the result is
            reproducible and NumPy's global random state is left untouched.
            When ``None`` (default) the global ``np.random`` state is used,
            exactly as before.

    Returns:
        A pandas DataFrame with one row per patient and 16 columns, from
        "Patient ID" through "Treatment".
    """
    # RandomState exposes the same normal/lognormal/rand/choice functions as
    # the np.random module, so both can be used interchangeably below.
    rng = np.random if seed is None else np.random.RandomState(seed)

    columns = [
        "Patient ID",
        "Age",
        "Menopausal Status",
        "Tumor Size (cm)",
        "Lymph Node Involvement",
        "Tumor Grade",
        "Tumor Stage",
        "ER Status",
        "PR Status",
        "HER2 Status",
        "Ki-67 Level",
        "TNBC Status",
        "BRCA Mutation",
        "Overall Health",
        "Genomic Recurrence Score",
        "Treatment",
    ]

    records = []
    for patient_number in range(1, num_patients + 1):
        # Age ~ N(60, 10), truncated to an integer and clamped to 30..80.
        age = int(rng.normal(60, 10))
        age = max(30, min(age, 80))
        menopausal = "Post-menopausal" if age >= 50 else "Pre-menopausal"

        tumor_size = round(rng.lognormal(mean=0.7, sigma=0.5), 2)

        # Larger tumors are more likely to have nodal involvement.
        node_positive_rate = 0.6 if tumor_size > 2.0 else 0.3
        lymph_node = "Positive" if rng.rand() < node_positive_rate else "Negative"

        grade_weights = [0.1, 0.4, 0.5] if tumor_size > 2.0 else [0.3, 0.5, 0.2]
        grade = rng.choice([1, 2, 3], p=grade_weights)

        stage = _base_stage(tumor_size, lymph_node)
        # 5% of patients are overridden to stage IV regardless of size/nodes.
        if rng.rand() < 0.05:
            stage = "IV"

        er = rng.choice(["Positive", "Negative"], p=[0.75, 0.25])
        # PR can only be positive when ER is positive (90% of those cases).
        pr = "Positive" if er == "Positive" and rng.rand() > 0.1 else "Negative"

        her2_weights = [0.3, 0.7] if grade == 3 else [0.15, 0.85]
        her2 = rng.choice(["Positive", "Negative"], p=her2_weights)

        ki67 = "High" if grade == 3 and rng.rand() < 0.8 else "Low"

        triple_negative = er == "Negative" and pr == "Negative" and her2 == "Negative"
        tnbc = "Positive" if triple_negative else "Negative"

        brca = "Positive" if (tnbc == "Positive" or age < 40) and rng.rand() < 0.2 else "Negative"

        health = "Good" if age < 65 and rng.rand() < 0.9 else "Poor"

        # A recurrence score is only assigned to ER-positive, HER2-negative
        # patients; everyone else gets "N/A".
        if er == "Positive" and her2 == "Negative":
            recurrence_score = rng.choice(["Low", "Intermediate", "High"], p=[0.6, 0.3, 0.1])
        else:
            recurrence_score = "N/A"

        records.append(
            {
                "Patient ID": f"BC_{patient_number:05d}",
                "Age": age,
                "Menopausal Status": menopausal,
                "Tumor Size (cm)": tumor_size,
                "Lymph Node Involvement": lymph_node,
                "Tumor Grade": grade,
                "Tumor Stage": stage,
                "ER Status": er,
                "PR Status": pr,
                "HER2 Status": her2,
                "Ki-67 Level": ki67,
                "TNBC Status": tnbc,
                "BRCA Mutation": brca,
                "Overall Health": health,
                "Genomic Recurrence Score": recurrence_score,
                "Treatment": _recommend_treatment(stage, er, her2, tnbc, recurrence_score),
            }
        )

    # Passing columns explicitly keeps the schema stable even with no rows.
    return pd.DataFrame(records, columns=columns)
132
+
133
+
134
# Main Streamlit App
st.title("Synthetic Data Generator")
num_patients = st.slider("Number of Patients to Generate", 10, 1000, 100)

# st.button is True only for the single script run triggered by its click.
# Clicking the download button starts another run in which it is False, so
# the generated frame is kept in session state to stop the table and the
# download button from disappearing after the first download.
if st.button("Generate Data"):
    st.session_state["breast_cancer_data"] = generate_breast_cancer_data(num_patients)

if "breast_cancer_data" in st.session_state:
    breast_cancer_data = st.session_state["breast_cancer_data"]
    st.write("Breast Cancer Data")
    st.dataframe(breast_cancer_data)
    st.download_button(
        "Download Breast Cancer Data",
        data=breast_cancer_data.to_csv(index=False),
        file_name="breast_cancer_data.csv",
        mime="text/csv",
    )