eaglelandsonce committed on
Commit
2d31353
1 Parent(s): ad9022b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +140 -95
app.py CHANGED
@@ -5,52 +5,145 @@ import numpy as np
5
  # Seed for reproducibility
6
  np.random.seed(42)
7
 
8
- # Function to generate synthetic Enrollments
9
- def generate_enrollments(num_members):
10
- primary_keys = [f"PPK_{i+1:05d}" for i in range(num_members)]
11
- enrollments_data = {
12
- "MEM_AGE": np.random.randint(18, 80, num_members),
13
- "MEM_MSA_NAME": np.random.choice(["DETROIT", "HONOLULU", "LOS ANGELES"], num_members),
14
- "MEM_STAT": np.random.choice(["ACTIVE", "INACTIVE"], num_members),
15
- "MEMBER_ID": primary_keys,
16
- "PRIMARY_PERSON_KEY": primary_keys,
17
- "PAYER_LOB": np.random.choice(["MEDICAID", "COMMERCIAL", "MEDICARE"], num_members),
18
- "PAYER_TYPE": np.random.choice(["PPO", "HMO"], num_members),
19
- "PRIMARY_CHRONIC_CONDITION_ROLLUP_DESC": np.random.choice(["Cancer", "Diabetes", "Hypertension"], num_members),
20
- "Count of PRIMARY_CHRONIC_CONDITION_ROLLUP_ID": np.random.randint(1, 5, num_members),
21
- "PROD_TYPE": np.random.choice(["DENTAL", "VISION", "MEDICAL"], num_members),
22
- "RELATION": np.random.choice(["SUBSCRIBER", "DEPENDENT"], num_members),
23
- "YEARMO": np.random.randint(202201, 202412, num_members),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  }
25
- return pd.DataFrame(enrollments_data)
 
26
 
27
  # Function to generate synthetic Members
28
- def generate_members(num_members):
29
- primary_keys = [f"PPK_{i+1:05d}" for i in range(num_members)]
30
  members_data = {
31
- "MEM_ETHNICITY": np.random.choice(["Hispanic", "Non-Hispanic", None], num_members),
32
- "MEM_GENDER": ["F"] * num_members, # All members are female
33
- "MEM_MSA_NAME": np.random.choice(["DETROIT", "HONOLULU", "LOS ANGELES"], num_members),
34
- "MEM_RACE": np.random.choice(["White", "Black", "Asian", None], num_members),
35
- "MEM_STATE": np.random.choice(["MI", "HI", "CA"], num_members),
36
- "MEM_ZIP3": np.random.randint(100, 999, num_members),
37
- "MEMBER_ID": primary_keys,
38
- "PRIMARY_PERSON_KEY": primary_keys,
39
  }
40
  return pd.DataFrame(members_data)
41
 
42
- # Function to generate synthetic Providers
43
- def generate_providers(num_providers):
44
- providers_data = {
45
- "PROV_CLINIC_STATE": np.random.choice(["MI", "HI", "CA"], num_providers),
46
- "PROV_CLINIC_ZIP": np.random.randint(10000, 99999, num_providers),
47
- "PROV_KEY": [f"PK_{i+1:05d}" for i in range(num_providers)],
48
- "Sum of PROV_NPI_ORG": np.random.randint(1, 50, num_providers),
49
- "PROV_TAXONOMY": np.random.choice(["208100000X", "207RE0101X"], num_providers),
50
- "PROV_TYPE": np.random.choice(["Type1", "Type2"], num_providers),
51
- }
52
- return pd.DataFrame(providers_data)
53
-
54
  # Function to generate synthetic Services
55
  def generate_services(num_services, primary_keys):
56
  services_data = {
@@ -58,79 +151,31 @@ def generate_services(num_services, primary_keys):
58
  "Sum of AMT_ALLOWED": np.random.uniform(1000, 10000, num_services),
59
  "Sum of AMT_BILLED": np.random.uniform(1000, 15000, num_services),
60
  "Count of AMT_PAID": np.random.randint(1, 5, num_services),
61
- "ATT_PROV_KEY": [f"PK_{i+1:05d}" for i in np.random.randint(1, len(primary_keys), num_services)],
62
- "BILL_PROV_KEY": [f"PK_{i+1:05d}" for i in np.random.randint(1, len(primary_keys), num_services)],
63
- "CLAIM_IN_NETWORK": np.random.choice(["Y", "N", None], num_services),
64
- "RELATION": np.random.choice(["SUBSCRIBER", "DEPENDENT"], num_services),
65
  "SERVICE_SETTING": np.random.choice(["OUTPATIENT", "INPATIENT"], num_services),
66
- "Sum of SERVICE_LINE": np.random.randint(1, 10, num_services),
67
- "Sum of SV_UNITS": np.random.randint(1, 100, num_services),
68
- "YEARMO": np.random.randint(202201, 202412, num_services),
69
  }
70
  return pd.DataFrame(services_data)
71
 
72
- # Function to generate synthetic BreastCancer data
73
- def generate_breast_cancer_data(members_df, num_patients):
74
- # Randomly sample from PRIMARY_PERSON_KEY in Members
75
- patient_ids = np.random.choice(members_df["PRIMARY_PERSON_KEY"], num_patients, replace=False)
76
-
77
- breast_cancer_data = {
78
- "Patient ID": patient_ids,
79
- "Age": np.random.randint(30, 80, num_patients),
80
- "Menopausal Status": np.random.choice(["Post-menopausal", "Pre-menopausal"], num_patients),
81
- "Tumor Size (cm)": np.round(np.random.lognormal(mean=0.7, sigma=0.5, size=num_patients), 2),
82
- "Lymph Node Involvement": np.random.choice(["Positive", "Negative"], num_patients),
83
- "Tumor Grade": np.random.choice([1, 2, 3], num_patients),
84
- "Tumor Stage": np.random.choice(["I", "II", "III", "IV"], num_patients),
85
- "ER Status": np.random.choice(["Positive", "Negative"], num_patients),
86
- "PR Status": np.random.choice(["Positive", "Negative"], num_patients),
87
- "HER2 Status": np.random.choice(["Positive", "Negative"], num_patients),
88
- "Ki-67 Level": np.random.choice(["High", "Low"], num_patients),
89
- "TNBC Status": np.random.choice(["Positive", "Negative"], num_patients),
90
- "BRCA Mutation": np.random.choice(["Positive", "Negative"], num_patients),
91
- "Overall Health": np.random.choice(["Good", "Poor"], num_patients),
92
- "Genomic Recurrence Score": np.random.choice(["Low", "Intermediate", "High", "N/A"], num_patients),
93
- "Treatment": np.random.choice(["Surgery", "Chemotherapy", "Radiation Therapy"], num_patients),
94
- }
95
- return pd.DataFrame(breast_cancer_data)
96
-
97
  # Main Streamlit App
98
  st.title("Synthetic Medical Data Generator")
99
 
100
- # Input parameters
101
- num_members = st.slider("Number of Members to Generate", 10, 1000, 100)
102
- num_providers = st.slider("Number of Providers to Generate", 10, 500, 100)
103
  num_services = st.slider("Number of Services to Generate", 10, 2000, 500)
104
- num_patients = st.slider("Number of Breast Cancer Patients to Generate", 10, 500, 100)
105
 
106
  if st.button("Generate Data"):
107
- # Generate data
108
- enrollments_df = generate_enrollments(num_members)
109
- members_df = generate_members(num_members)
110
- providers_df = generate_providers(num_providers)
111
- services_df = generate_services(num_services, enrollments_df["PRIMARY_PERSON_KEY"].tolist())
112
-
113
- # Adjust Breast Cancer Patients to not exceed Members
114
- max_breast_cancer_patients = min(num_patients, num_members)
115
- breast_cancer_df = generate_breast_cancer_data(members_df, max_breast_cancer_patients)
116
 
117
  # Display and download data
118
- st.subheader("Enrollments Data")
119
- st.dataframe(enrollments_df.head())
120
- st.download_button("Download Enrollments", enrollments_df.to_csv(index=False), "enrollments.csv")
121
 
122
  st.subheader("Members Data")
123
  st.dataframe(members_df.head())
124
  st.download_button("Download Members", members_df.to_csv(index=False), "members.csv")
125
 
126
- st.subheader("Providers Data")
127
- st.dataframe(providers_df.head())
128
- st.download_button("Download Providers", providers_df.to_csv(index=False), "providers.csv")
129
-
130
  st.subheader("Services Data")
131
  st.dataframe(services_df.head())
132
  st.download_button("Download Services", services_df.to_csv(index=False), "services.csv")
133
-
134
- st.subheader("Breast Cancer Data")
135
- st.dataframe(breast_cancer_df.head())
136
- st.download_button("Download Breast Cancer Data", breast_cancer_df.to_csv(index=False), "breast_cancer.csv")
 
5
  # Seed for reproducibility
6
  np.random.seed(42)
7
 
8
+ # Function to generate synthetic BreastCancer data
9
def generate_breast_cancer_data(num_patients):
    """Generate a synthetic breast-cancer cohort with internally consistent fields.

    Each patient is simulated independently using the global NumPy random
    state (``np.random``), so seeding it beforehand makes the output
    reproducible. Later fields are derived from earlier ones (for example,
    stage from tumor size and nodal status, TNBC from the three receptor
    statuses) so that rows are self-consistent rather than independently
    random.

    Args:
        num_patients: Number of patient rows to generate. Zero yields an
            empty DataFrame that still carries all columns.

    Returns:
        A ``pandas.DataFrame`` with one row per patient and the columns
        listed in ``columns`` below. Patient IDs are ``PPK_00001``,
        ``PPK_00002``, ... in row order.
    """
    columns = [
        "Patient ID",
        "Age",
        "Menopausal Status",
        "Tumor Size (cm)",
        "Lymph Node Involvement",
        "Tumor Grade",
        "Tumor Stage",
        "ER Status",
        "PR Status",
        "HER2 Status",
        "Ki-67 Level",
        "TNBC Status",
        "BRCA Mutation",
        "Overall Health",
        "Genomic Recurrence Score",
        "Treatment",
    ]

    records = []
    # NOTE: the order of the np.random calls below is significant; changing
    # it changes the data produced for a given seed.
    for index in range(num_patients):
        # Age ~ N(60, 10), truncated to an integer and clamped to [30, 80].
        age = max(30, min(int(np.random.normal(60, 10)), 80))
        menopausal = "Post-menopausal" if age >= 50 else "Pre-menopausal"

        tumor_size = round(np.random.lognormal(mean=0.7, sigma=0.5), 2)
        large_tumor = tumor_size > 2.0

        # Larger tumors are more likely to have nodal involvement.
        node_positive_rate = 0.6 if large_tumor else 0.3
        lymph_node = "Positive" if np.random.rand() < node_positive_rate else "Negative"

        # Larger tumors skew toward higher grades.
        grade_weights = [0.1, 0.4, 0.5] if large_tumor else [0.3, 0.5, 0.2]
        grade = np.random.choice([1, 2, 3], p=grade_weights)

        stage = _assign_stage(tumor_size, lymph_node)
        # 5% of patients are overridden to stage IV regardless of size/nodes.
        if np.random.rand() < 0.05:
            stage = "IV"

        er = np.random.choice(["Positive", "Negative"], p=[0.75, 0.25])
        # PR can only be positive when ER is positive (90% of those cases).
        pr = "Positive" if er == "Positive" and np.random.rand() > 0.1 else "Negative"

        her2_weights = [0.3, 0.7] if grade == 3 else [0.15, 0.85]
        her2 = np.random.choice(["Positive", "Negative"], p=her2_weights)

        ki67 = "High" if grade == 3 and np.random.rand() < 0.8 else "Low"

        # Triple-negative: all three receptors negative.
        triple_negative = er == "Negative" and pr == "Negative" and her2 == "Negative"
        tnbc = "Positive" if triple_negative else "Negative"

        brca_risk_group = tnbc == "Positive" or age < 40
        brca = "Positive" if brca_risk_group and np.random.rand() < 0.2 else "Negative"

        health = "Good" if age < 65 and np.random.rand() < 0.9 else "Poor"

        # A recurrence score is only assigned to ER-positive, HER2-negative patients.
        if er == "Positive" and her2 == "Negative":
            recurrence_score = np.random.choice(
                ["Low", "Intermediate", "High"], p=[0.6, 0.3, 0.1]
            )
        else:
            recurrence_score = "N/A"

        records.append(
            {
                "Patient ID": f"PPK_{index + 1:05d}",
                "Age": age,
                "Menopausal Status": menopausal,
                "Tumor Size (cm)": tumor_size,
                "Lymph Node Involvement": lymph_node,
                "Tumor Grade": grade,
                "Tumor Stage": stage,
                "ER Status": er,
                "PR Status": pr,
                "HER2 Status": her2,
                "Ki-67 Level": ki67,
                "TNBC Status": tnbc,
                "BRCA Mutation": brca,
                "Overall Health": health,
                "Genomic Recurrence Score": recurrence_score,
                "Treatment": _choose_treatment(stage, er, her2, tnbc, recurrence_score),
            }
        )

    # Passing `columns` keeps the schema (and column order) even with no rows.
    return pd.DataFrame(records, columns=columns)


def _assign_stage(tumor_size, lymph_node):
    """Return the base stage ("I", "II" or "III") from tumor size and nodal status."""
    if lymph_node == "Positive" or tumor_size > 5.0:
        return "III"
    return "I" if tumor_size <= 2.0 else "II"


def _choose_treatment(stage, er, her2, tnbc, recurrence_score):
    """Return the treatment-plan string for one patient's stage and biomarkers."""
    if stage in ("I", "II"):
        if tnbc == "Positive":
            return "Surgery, Chemotherapy, and Radiation Therapy"
        if er == "Positive" and recurrence_score != "N/A":
            if recurrence_score == "High":
                return "Surgery, Chemotherapy, Hormone Therapy, and Radiation Therapy"
            if recurrence_score == "Intermediate":
                return "Surgery, Consider Chemotherapy, Hormone Therapy, and Radiation Therapy"
            return "Surgery, Hormone Therapy, and Radiation Therapy"
        if her2 == "Positive":
            return "Surgery, HER2-Targeted Therapy, Chemotherapy, and Radiation Therapy"
        # Fallback for any biomarker combination not matched above.
        return "Surgery, Chemotherapy, and Radiation Therapy"
    if stage == "III":
        plan = "Neoadjuvant Chemotherapy, Surgery, Radiation Therapy"
        if her2 == "Positive":
            plan += ", HER2-Targeted Therapy"
        if er == "Positive":
            plan += ", Hormone Therapy"
        return plan
    # Any other stage (i.e. "IV").
    return "Systemic Therapy (Palliative Care)"
133
 
134
  # Function to generate synthetic Members
135
def generate_members_from_breast_cancer(breast_cancer_df):
    """Build a synthetic Members table with one row per breast-cancer patient.

    The patient's ID is reused as both ``MEMBER_ID`` and
    ``PRIMARY_PERSON_KEY`` so the two tables can be joined. Demographic
    fields are drawn independently from the global NumPy random state.

    Args:
        breast_cancer_df: DataFrame containing a ``"Patient ID"`` column.

    Returns:
        A ``pandas.DataFrame`` with columns ``MEMBER_ID``,
        ``PRIMARY_PERSON_KEY``, ``MEM_GENDER``, ``MEM_ETHNICITY``,
        ``MEM_RACE``, ``MEM_STATE`` and ``MEM_ZIP3``.
    """
    num_members = len(breast_cancer_df)
    patient_ids = breast_cancer_df["Patient ID"]

    members_data = {
        "MEMBER_ID": patient_ids,
        "PRIMARY_PERSON_KEY": patient_ids,
        # Every member in this cohort is recorded as female.
        "MEM_GENDER": ["F"] * num_members,
        # None entries model missing ethnicity / race values.
        "MEM_ETHNICITY": np.random.choice(["Hispanic", "Non-Hispanic", None], num_members),
        "MEM_RACE": np.random.choice(["White", "Black", "Asian", None], num_members),
        "MEM_STATE": np.random.choice(["MI", "HI", "CA"], num_members),
        # randint's upper bound is exclusive: 1000 is needed so that 999,
        # the largest three-digit prefix, can actually be generated.
        "MEM_ZIP3": np.random.randint(100, 1000, num_members),
    }
    return pd.DataFrame(members_data)
146
 
 
 
 
 
 
 
 
 
 
 
 
 
147
  # Function to generate synthetic Services
148
  def generate_services(num_services, primary_keys):
149
  services_data = {
 
151
  "Sum of AMT_ALLOWED": np.random.uniform(1000, 10000, num_services),
152
  "Sum of AMT_BILLED": np.random.uniform(1000, 15000, num_services),
153
  "Count of AMT_PAID": np.random.randint(1, 5, num_services),
 
 
 
 
154
  "SERVICE_SETTING": np.random.choice(["OUTPATIENT", "INPATIENT"], num_services),
 
 
 
155
  }
156
  return pd.DataFrame(services_data)
157
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
  # Main Streamlit App
159
  st.title("Synthetic Medical Data Generator")
160
 
161
# Sliders controlling how many synthetic rows each table gets
num_patients = st.slider("Number of Breast Cancer Patients to Generate", 10, 1000, 100)
num_services = st.slider("Number of Services to Generate", 10, 2000, 500)

if st.button("Generate Data"):
    breast_cancer_df = generate_breast_cancer_data(num_patients)
    members_df = generate_members_from_breast_cancer(breast_cancer_df)
    services_df = generate_services(num_services, breast_cancer_df["Patient ID"].tolist())

    # Streamlit reruns the whole script on every widget interaction, and
    # st.button is True only for the run triggered by its own click. Keeping
    # the tables in session state lets them (and their download buttons)
    # survive later reruns, e.g. the one caused by clicking a download button.
    # Each entry: (subheader, download label, file name, DataFrame).
    st.session_state["generated_tables"] = [
        ("Breast Cancer Data", "Download Breast Cancer Data", "breast_cancer.csv", breast_cancer_df),
        ("Members Data", "Download Members", "members.csv", members_df),
        ("Services Data", "Download Services", "services.csv", services_df),
    ]

# Display and download data (nothing is shown until data has been generated)
if "generated_tables" in st.session_state:
    for subheader, button_label, file_name, table in st.session_state["generated_tables"]:
        st.subheader(subheader)
        st.dataframe(table.head())
        st.download_button(button_label, table.to_csv(index=False), file_name)