eaglelandsonce committed on
Commit
d202149
·
verified ·
1 Parent(s): 4f0bf15

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +80 -113
app.py CHANGED
@@ -1,6 +1,10 @@
1
  import streamlit as st
2
  import pandas as pd
3
  import numpy as np
 
 
 
 
4
 
5
  # Seed for reproducibility
6
  np.random.seed(42)
@@ -8,128 +12,40 @@ np.random.seed(42)
8
  # Function to generate synthetic BreastCancer data
9
  def generate_breast_cancer_data(num_patients):
10
  primary_keys = [f"PPK_{i+1:05d}" for i in range(num_patients)]
11
-
12
- ages = []
13
- menopausal_status = []
14
- tumor_sizes = []
15
- lymph_nodes = []
16
- grades = []
17
- stages = []
18
- er_status = []
19
- pr_status = []
20
- her2_status = []
21
- ki67_level = []
22
- tnbc_status = []
23
- brca_mutation = []
24
- overall_health = []
25
- genomic_score = []
26
- treatment = []
27
-
28
- for i in range(num_patients):
29
- age = int(np.random.normal(60, 10))
30
- age = max(30, min(age, 80))
31
- ages.append(age)
32
-
33
- menopausal = "Post-menopausal" if age >= 50 else "Pre-menopausal"
34
- menopausal_status.append(menopausal)
35
-
36
- tumor_size = round(np.random.lognormal(mean=0.7, sigma=0.5), 2)
37
- tumor_sizes.append(tumor_size)
38
-
39
- lymph_node = (
40
- "Positive"
41
- if (tumor_size > 2.0 and np.random.rand() < 0.6)
42
- or (tumor_size <= 2.0 and np.random.rand() < 0.3)
43
- else "Negative"
44
- )
45
- lymph_nodes.append(lymph_node)
46
-
47
- grade = np.random.choice([1, 2, 3], p=[0.1, 0.4, 0.5] if tumor_size > 2.0 else [0.3, 0.5, 0.2])
48
- grades.append(grade)
49
-
50
- if tumor_size <= 2.0 and lymph_node == "Negative":
51
- stage = "I"
52
- elif (tumor_size > 2.0 and tumor_size <= 5.0) and lymph_node == "Negative":
53
- stage = "II"
54
- elif lymph_node == "Positive" or tumor_size > 5.0:
55
- stage = "III"
56
- else:
57
- stage = "II"
58
- if np.random.rand() < 0.05:
59
- stage = "IV"
60
- stages.append(stage)
61
-
62
- er = np.random.choice(["Positive", "Negative"], p=[0.75, 0.25])
63
- pr = "Positive" if er == "Positive" and np.random.rand() > 0.1 else "Negative"
64
- er_status.append(er)
65
- pr_status.append(pr)
66
-
67
- her2 = np.random.choice(["Positive", "Negative"], p=[0.3, 0.7] if grade == 3 else [0.15, 0.85])
68
- her2_status.append(her2)
69
-
70
- ki67 = "High" if grade == 3 and np.random.rand() < 0.8 else "Low"
71
- ki67_level.append(ki67)
72
-
73
- tnbc = "Positive" if er == "Negative" and pr == "Negative" and her2 == "Negative" else "Negative"
74
- tnbc_status.append(tnbc)
75
-
76
- brca = "Positive" if (tnbc == "Positive" or age < 40) and np.random.rand() < 0.2 else "Negative"
77
- brca_mutation.append(brca)
78
-
79
- health = "Good" if age < 65 and np.random.rand() < 0.9 else "Poor"
80
- overall_health.append(health)
81
-
82
- recurrence_score = (
83
- np.random.choice(["Low", "Intermediate", "High"], p=[0.6, 0.3, 0.1])
84
- if er == "Positive" and her2 == "Negative"
85
- else "N/A"
86
- )
87
- genomic_score.append(recurrence_score)
88
-
89
- if stage in ["I", "II"]:
90
- if tnbc == "Positive":
91
- treat = "Surgery, Chemotherapy, and Radiation Therapy"
92
- elif er == "Positive" and recurrence_score != "N/A":
93
- if recurrence_score == "High":
94
- treat = "Surgery, Chemotherapy, Hormone Therapy, and Radiation Therapy"
95
- elif recurrence_score == "Intermediate":
96
- treat = "Surgery, Consider Chemotherapy, Hormone Therapy, and Radiation Therapy"
97
- else:
98
- treat = "Surgery, Hormone Therapy, and Radiation Therapy"
99
- elif her2 == "Positive":
100
- treat = "Surgery, HER2-Targeted Therapy, Chemotherapy, and Radiation Therapy"
101
- else:
102
- treat = "Surgery, Chemotherapy, and Radiation Therapy"
103
- elif stage == "III":
104
- treat = (
105
- "Neoadjuvant Chemotherapy, Surgery, Radiation Therapy"
106
- + (", HER2-Targeted Therapy" if her2 == "Positive" else "")
107
- + (", Hormone Therapy" if er == "Positive" else "")
108
- )
109
- else:
110
- treat = "Systemic Therapy (Palliative Care)"
111
- treatment.append(treat)
112
-
113
- breast_cancer_data = {
114
  "PRIMARY_PERSON_KEY": primary_keys,
115
  "Age": ages,
116
  "Menopausal Status": menopausal_status,
117
  "Tumor Size (cm)": tumor_sizes,
118
  "Lymph Node Involvement": lymph_nodes,
119
- "Tumor Grade": grades,
120
- "Tumor Stage": stages,
121
  "ER Status": er_status,
122
  "PR Status": pr_status,
123
  "HER2 Status": her2_status,
124
- "Ki-67 Level": ki67_level,
125
  "TNBC Status": tnbc_status,
126
  "BRCA Mutation": brca_mutation,
127
  "Overall Health": overall_health,
128
  "Genomic Recurrence Score": genomic_score,
129
- "Treatment": treatment,
130
- }
131
-
132
- return pd.DataFrame(breast_cancer_data)
133
 
134
  # Function to generate Members from BreastCancer
135
  def generate_members_from_breast_cancer(breast_cancer_df):
@@ -177,22 +93,69 @@ def generate_providers(num_providers):
177
  "PROV_TAXONOMY": np.random.choice(["208100000X", "207RE0101X"], num_providers),
178
  })
179
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
  # Main Streamlit App
181
- st.title("Synthetic Medical Data Generator")
182
 
183
  # Sliders
184
  num_patients = st.slider("Number of Breast Cancer Patients to Generate", 10, 1000, 100)
 
185
  num_services = st.slider("Number of Services to Generate", 10, 2000, 500)
186
  num_providers = st.slider("Number of Providers to Generate", 10, 500, 100)
187
 
 
 
 
 
 
188
  if st.button("Generate Data"):
 
 
189
  breast_cancer_df = generate_breast_cancer_data(num_patients)
190
  members_df = generate_members_from_breast_cancer(breast_cancer_df)
191
  enrollments_df = generate_enrollments_from_breast_cancer(breast_cancer_df)
192
- services_df = generate_services(num_services, breast_cancer_df["PRIMARY_PERSON_KEY"].tolist())
193
  providers_df = generate_providers(num_providers)
 
 
 
194
 
195
- # Display and download data
196
  st.subheader("Breast Cancer Data")
197
  st.dataframe(breast_cancer_df.head())
198
  st.download_button("Download Breast Cancer Data", breast_cancer_df.to_csv(index=False), "breast_cancer.csv")
@@ -212,3 +175,7 @@ if st.button("Generate Data"):
212
  st.subheader("Providers Data")
213
  st.dataframe(providers_df.head())
214
  st.download_button("Download Providers Data", providers_df.to_csv(index=False), "providers.csv")
 
 
 
 
 
1
  import streamlit as st
2
  import pandas as pd
3
  import numpy as np
4
+ import csv
5
+ import io
6
+ import random
7
+ from datetime import datetime, timedelta
8
 
9
  # Seed for reproducibility
10
  np.random.seed(42)
 
# Function to generate synthetic BreastCancer data
def generate_breast_cancer_data(num_patients):
    """Return a DataFrame holding ``num_patients`` synthetic patient records.

    Each clinical attribute is sampled independently from NumPy's global
    random state. Two columns are derived rather than sampled:
    "Menopausal Status" follows from age (50 and over is post-menopausal) and
    "TNBC Status" is positive only when ER, PR and HER2 are all negative.
    """
    def sample(options, weights=None):
        # One categorical draw per patient from the global NumPy RNG.
        return np.random.choice(options, size=num_patients, p=weights)

    pos_neg = ["Positive", "Negative"]

    person_keys = [f"PPK_{number:05d}" for number in range(1, num_patients + 1)]

    # The sampling statements below are deliberately kept in a fixed order:
    # with a seeded global RNG the order of draws determines the output.
    age_values = np.random.randint(30, 80, size=num_patients)
    size_values = np.random.lognormal(mean=0.7, sigma=0.5, size=num_patients).round(2)
    node_values = sample(pos_neg, [0.4, 0.6])
    grade_values = sample([1, 2, 3], [0.3, 0.5, 0.2])
    stage_values = sample(["I", "II", "III", "IV"], [0.4, 0.3, 0.2, 0.1])
    er_values = sample(pos_neg, [0.75, 0.25])
    pr_values = sample(pos_neg, [0.7, 0.3])
    her2_values = sample(pos_neg, [0.3, 0.7])
    ki67_values = sample(["High", "Low"], [0.6, 0.4])
    brca_values = sample(pos_neg, [0.1, 0.9])
    health_values = sample(["Good", "Poor"], [0.7, 0.3])
    recurrence_values = sample(["Low", "Intermediate", "High", "N/A"], [0.3, 0.2, 0.1, 0.4])
    treatment_values = sample(["Surgery", "Chemotherapy", "Radiation Therapy"])

    # Derived columns (no randomness involved).
    menopause_values = np.where(
        age_values >= 50, "Post-menopausal", "Pre-menopausal"
    ).tolist()
    triple_negative = (
        (er_values == "Negative")
        & (pr_values == "Negative")
        & (her2_values == "Negative")
    )
    tnbc_values = np.where(triple_negative, "Positive", "Negative").tolist()

    records = {
        "PRIMARY_PERSON_KEY": person_keys,
        "Age": age_values,
        "Menopausal Status": menopause_values,
        "Tumor Size (cm)": size_values,
        "Lymph Node Involvement": node_values,
        "Tumor Grade": grade_values,
        "Tumor Stage": stage_values,
        "ER Status": er_values,
        "PR Status": pr_values,
        "HER2 Status": her2_values,
        "Ki-67 Level": ki67_values,
        "TNBC Status": tnbc_values,
        "BRCA Mutation": brca_values,
        "Overall Health": health_values,
        "Genomic Recurrence Score": recurrence_values,
        "Treatment": treatment_values,
    }
    return pd.DataFrame(records)
 
 
49
 
50
  # Function to generate Members from BreastCancer
51
  def generate_members_from_breast_cancer(breast_cancer_df):
 
93
  "PROV_TAXONOMY": np.random.choice(["208100000X", "207RE0101X"], num_providers),
94
  })
95
 
# Function to generate wearable data
def generate_wearable_data(num_patients, num_measurements, start_datetime, time_interval, cancer_rate, chemo_brain_effect, primary_keys):
    """Simulate wearable-device readings for every patient key.

    Args:
        num_patients: Nominal cohort size; together with ``cancer_rate`` it
            determines how many keys are flagged as cancer patients.
        num_measurements: Number of readings generated per patient.
        start_datetime: ``datetime`` of the first reading.
        time_interval: ``timedelta`` between consecutive readings.
        cancer_rate: Percentage (0-100) of patients flagged as cancer patients.
        chemo_brain_effect: Percentage reduction applied to the activity level
            of flagged patients.
        primary_keys: Patient identifiers; one series of readings is produced
            per key, in the given order.

    Returns:
        A DataFrame with columns PRIMARY_PERSON_KEY, Measurement_Timestamp,
        Activity_Level, Heart_Rate and O2_Saturation, containing
        ``len(primary_keys) * num_measurements`` rows.

    Randomness comes from the standard-library ``random`` module, so seeding
    NumPy alone does not make this output reproducible.
    """
    primary_keys = list(primary_keys)

    # The requested cohort size is derived from num_patients, which need not
    # equal the number of keys supplied. Clamp it to the real population so
    # random.sample cannot raise ValueError for an oversized or negative k.
    requested = int((cancer_rate / 100) * num_patients)
    num_cancer_patients = max(0, min(requested, len(primary_keys)))
    cancer_patients = set(random.sample(primary_keys, num_cancer_patients))

    baseline_activity = 2000
    baseline_heart_rate = 80
    baseline_o2 = 98.2
    activity_reduction_factor = (100 - chemo_brain_effect) / 100.0
    chemo_heart_rate_increase = 5

    # Timestamps are identical for every patient, so format them only once.
    timestamps = [
        (start_datetime + i * time_interval).strftime("%Y-%m-%d %H:%M:%S")
        for i in range(num_measurements)
    ]

    data_rows = []
    for pkey in primary_keys:
        is_cancer = pkey in cancer_patients
        for stamp in timestamps:
            # Draw order (activity, heart rate, O2) is fixed so that a seeded
            # run stays reproducible.
            activity_var = random.randint(-300, 300)
            hr_var = random.randint(-3, 3)
            o2_var = random.uniform(-0.3, 0.3)

            if is_cancer:
                activity = int((baseline_activity + activity_var) * activity_reduction_factor)
                heart_rate = baseline_heart_rate + hr_var + chemo_heart_rate_increase
            else:
                activity = baseline_activity + activity_var
                heart_rate = baseline_heart_rate + hr_var

            o2_sat = baseline_o2 + o2_var

            # Floor each reading at its minimum allowed value.
            activity = max(activity, 0)
            heart_rate = max(heart_rate, 50)
            o2_sat = max(o2_sat, 90.0)

            data_rows.append([pkey, stamp, activity, heart_rate, round(o2_sat, 1)])

    return pd.DataFrame(
        data_rows,
        columns=["PRIMARY_PERSON_KEY", "Measurement_Timestamp", "Activity_Level", "Heart_Rate", "O2_Saturation"],
    )
132
+
# Main Streamlit App
st.title("Synthetic Medical Data Generator with Wearable Data")

# Sliders
# Widgets are declared in display order; each slider is given as
# (label, lower bound, upper bound, initial value).
num_patients = st.slider(
    "Number of Breast Cancer Patients to Generate",
    min_value=10,
    max_value=1000,
    value=100,
)
num_measurements = st.slider(
    "Measurements per Patient (Wearable Data)",
    min_value=1,
    max_value=100,
    value=10,
)
num_services = st.slider(
    "Number of Services to Generate",
    min_value=10,
    max_value=2000,
    value=500,
)
num_providers = st.slider(
    "Number of Providers to Generate",
    min_value=10,
    max_value=500,
    value=100,
)

# Wearable-data settings: start of the measurement series, share of patients
# flagged as cancer patients, and the activity reduction applied to them.
start_date = st.date_input(
    "Wearable Data Start Date",
    value=datetime(2024, 12, 1),
)
start_time = st.time_input(
    "Wearable Data Start Time",
    value=datetime(2024, 12, 1, 8, 0).time(),
)
cancer_rate = st.slider(
    "Percentage of Patients with Cancer (Wearable Data)",
    min_value=0,
    max_value=100,
    value=30,
)
chemo_brain_effect = st.slider(
    "Chemo Brain Impact on Activity Level (in % reduction)",
    min_value=0,
    max_value=50,
    value=20,
)
146
+
147
  if st.button("Generate Data"):
148
+ primary_keys = [f"PPK_{i+1:05d}" for i in range(num_patients)]
149
+ wearable_start_datetime = datetime.combine(start_date, start_time)
150
  breast_cancer_df = generate_breast_cancer_data(num_patients)
151
  members_df = generate_members_from_breast_cancer(breast_cancer_df)
152
  enrollments_df = generate_enrollments_from_breast_cancer(breast_cancer_df)
153
+ services_df = generate_services(num_services, primary_keys)
154
  providers_df = generate_providers(num_providers)
155
+ wearable_data = generate_wearable_data(
156
+ num_patients, num_measurements, wearable_start_datetime, timedelta(hours=1), cancer_rate, chemo_brain_effect, primary_keys
157
+ )
158
 
 
159
  st.subheader("Breast Cancer Data")
160
  st.dataframe(breast_cancer_df.head())
161
  st.download_button("Download Breast Cancer Data", breast_cancer_df.to_csv(index=False), "breast_cancer.csv")
 
175
  st.subheader("Providers Data")
176
  st.dataframe(providers_df.head())
177
  st.download_button("Download Providers Data", providers_df.to_csv(index=False), "providers.csv")
178
+
179
+ st.subheader("Wearable Data")
180
+ st.dataframe(wearable_data.head())
181
+ st.download_button("Download Wearable Data", wearable_data.to_csv(index=False), "wearable_data.csv")