Spaces:
Sleeping
Sleeping
eaglelandsonce
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -1,6 +1,10 @@
|
|
1 |
import streamlit as st
|
2 |
import pandas as pd
|
3 |
import numpy as np
|
|
|
|
|
|
|
|
|
4 |
|
5 |
# Seed for reproducibility
|
6 |
np.random.seed(42)
|
@@ -8,128 +12,40 @@ np.random.seed(42)
|
|
8 |
# Function to generate synthetic BreastCancer data
|
9 |
def generate_breast_cancer_data(num_patients):
|
10 |
primary_keys = [f"PPK_{i+1:05d}" for i in range(num_patients)]
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
for i in range(num_patients):
|
29 |
-
age = int(np.random.normal(60, 10))
|
30 |
-
age = max(30, min(age, 80))
|
31 |
-
ages.append(age)
|
32 |
-
|
33 |
-
menopausal = "Post-menopausal" if age >= 50 else "Pre-menopausal"
|
34 |
-
menopausal_status.append(menopausal)
|
35 |
-
|
36 |
-
tumor_size = round(np.random.lognormal(mean=0.7, sigma=0.5), 2)
|
37 |
-
tumor_sizes.append(tumor_size)
|
38 |
-
|
39 |
-
lymph_node = (
|
40 |
-
"Positive"
|
41 |
-
if (tumor_size > 2.0 and np.random.rand() < 0.6)
|
42 |
-
or (tumor_size <= 2.0 and np.random.rand() < 0.3)
|
43 |
-
else "Negative"
|
44 |
-
)
|
45 |
-
lymph_nodes.append(lymph_node)
|
46 |
-
|
47 |
-
grade = np.random.choice([1, 2, 3], p=[0.1, 0.4, 0.5] if tumor_size > 2.0 else [0.3, 0.5, 0.2])
|
48 |
-
grades.append(grade)
|
49 |
-
|
50 |
-
if tumor_size <= 2.0 and lymph_node == "Negative":
|
51 |
-
stage = "I"
|
52 |
-
elif (tumor_size > 2.0 and tumor_size <= 5.0) and lymph_node == "Negative":
|
53 |
-
stage = "II"
|
54 |
-
elif lymph_node == "Positive" or tumor_size > 5.0:
|
55 |
-
stage = "III"
|
56 |
-
else:
|
57 |
-
stage = "II"
|
58 |
-
if np.random.rand() < 0.05:
|
59 |
-
stage = "IV"
|
60 |
-
stages.append(stage)
|
61 |
-
|
62 |
-
er = np.random.choice(["Positive", "Negative"], p=[0.75, 0.25])
|
63 |
-
pr = "Positive" if er == "Positive" and np.random.rand() > 0.1 else "Negative"
|
64 |
-
er_status.append(er)
|
65 |
-
pr_status.append(pr)
|
66 |
-
|
67 |
-
her2 = np.random.choice(["Positive", "Negative"], p=[0.3, 0.7] if grade == 3 else [0.15, 0.85])
|
68 |
-
her2_status.append(her2)
|
69 |
-
|
70 |
-
ki67 = "High" if grade == 3 and np.random.rand() < 0.8 else "Low"
|
71 |
-
ki67_level.append(ki67)
|
72 |
-
|
73 |
-
tnbc = "Positive" if er == "Negative" and pr == "Negative" and her2 == "Negative" else "Negative"
|
74 |
-
tnbc_status.append(tnbc)
|
75 |
-
|
76 |
-
brca = "Positive" if (tnbc == "Positive" or age < 40) and np.random.rand() < 0.2 else "Negative"
|
77 |
-
brca_mutation.append(brca)
|
78 |
-
|
79 |
-
health = "Good" if age < 65 and np.random.rand() < 0.9 else "Poor"
|
80 |
-
overall_health.append(health)
|
81 |
-
|
82 |
-
recurrence_score = (
|
83 |
-
np.random.choice(["Low", "Intermediate", "High"], p=[0.6, 0.3, 0.1])
|
84 |
-
if er == "Positive" and her2 == "Negative"
|
85 |
-
else "N/A"
|
86 |
-
)
|
87 |
-
genomic_score.append(recurrence_score)
|
88 |
-
|
89 |
-
if stage in ["I", "II"]:
|
90 |
-
if tnbc == "Positive":
|
91 |
-
treat = "Surgery, Chemotherapy, and Radiation Therapy"
|
92 |
-
elif er == "Positive" and recurrence_score != "N/A":
|
93 |
-
if recurrence_score == "High":
|
94 |
-
treat = "Surgery, Chemotherapy, Hormone Therapy, and Radiation Therapy"
|
95 |
-
elif recurrence_score == "Intermediate":
|
96 |
-
treat = "Surgery, Consider Chemotherapy, Hormone Therapy, and Radiation Therapy"
|
97 |
-
else:
|
98 |
-
treat = "Surgery, Hormone Therapy, and Radiation Therapy"
|
99 |
-
elif her2 == "Positive":
|
100 |
-
treat = "Surgery, HER2-Targeted Therapy, Chemotherapy, and Radiation Therapy"
|
101 |
-
else:
|
102 |
-
treat = "Surgery, Chemotherapy, and Radiation Therapy"
|
103 |
-
elif stage == "III":
|
104 |
-
treat = (
|
105 |
-
"Neoadjuvant Chemotherapy, Surgery, Radiation Therapy"
|
106 |
-
+ (", HER2-Targeted Therapy" if her2 == "Positive" else "")
|
107 |
-
+ (", Hormone Therapy" if er == "Positive" else "")
|
108 |
-
)
|
109 |
-
else:
|
110 |
-
treat = "Systemic Therapy (Palliative Care)"
|
111 |
-
treatment.append(treat)
|
112 |
-
|
113 |
-
breast_cancer_data = {
|
114 |
"PRIMARY_PERSON_KEY": primary_keys,
|
115 |
"Age": ages,
|
116 |
"Menopausal Status": menopausal_status,
|
117 |
"Tumor Size (cm)": tumor_sizes,
|
118 |
"Lymph Node Involvement": lymph_nodes,
|
119 |
-
"Tumor Grade":
|
120 |
-
"Tumor Stage":
|
121 |
"ER Status": er_status,
|
122 |
"PR Status": pr_status,
|
123 |
"HER2 Status": her2_status,
|
124 |
-
"Ki-67 Level":
|
125 |
"TNBC Status": tnbc_status,
|
126 |
"BRCA Mutation": brca_mutation,
|
127 |
"Overall Health": overall_health,
|
128 |
"Genomic Recurrence Score": genomic_score,
|
129 |
-
"Treatment":
|
130 |
-
}
|
131 |
-
|
132 |
-
return pd.DataFrame(breast_cancer_data)
|
133 |
|
134 |
# Function to generate Members from BreastCancer
|
135 |
def generate_members_from_breast_cancer(breast_cancer_df):
|
@@ -177,22 +93,69 @@ def generate_providers(num_providers):
|
|
177 |
"PROV_TAXONOMY": np.random.choice(["208100000X", "207RE0101X"], num_providers),
|
178 |
})
|
179 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
180 |
# Main Streamlit App
|
181 |
-
st.title("Synthetic Medical Data Generator")
|
182 |
|
183 |
# Sliders
|
184 |
num_patients = st.slider("Number of Breast Cancer Patients to Generate", 10, 1000, 100)
|
|
|
185 |
num_services = st.slider("Number of Services to Generate", 10, 2000, 500)
|
186 |
num_providers = st.slider("Number of Providers to Generate", 10, 500, 100)
|
187 |
|
|
|
|
|
|
|
|
|
|
|
188 |
if st.button("Generate Data"):
|
|
|
|
|
189 |
breast_cancer_df = generate_breast_cancer_data(num_patients)
|
190 |
members_df = generate_members_from_breast_cancer(breast_cancer_df)
|
191 |
enrollments_df = generate_enrollments_from_breast_cancer(breast_cancer_df)
|
192 |
-
services_df = generate_services(num_services,
|
193 |
providers_df = generate_providers(num_providers)
|
|
|
|
|
|
|
194 |
|
195 |
-
# Display and download data
|
196 |
st.subheader("Breast Cancer Data")
|
197 |
st.dataframe(breast_cancer_df.head())
|
198 |
st.download_button("Download Breast Cancer Data", breast_cancer_df.to_csv(index=False), "breast_cancer.csv")
|
@@ -212,3 +175,7 @@ if st.button("Generate Data"):
|
|
212 |
st.subheader("Providers Data")
|
213 |
st.dataframe(providers_df.head())
|
214 |
st.download_button("Download Providers Data", providers_df.to_csv(index=False), "providers.csv")
|
|
|
|
|
|
|
|
|
|
1 |
import streamlit as st
|
2 |
import pandas as pd
|
3 |
import numpy as np
|
4 |
+
import csv
|
5 |
+
import io
|
6 |
+
import random
|
7 |
+
from datetime import datetime, timedelta
|
8 |
|
9 |
# Seed for reproducibility
|
10 |
np.random.seed(42)
|
|
|
12 |
# Function to generate synthetic BreastCancer data
|
13 |
def generate_breast_cancer_data(num_patients):
    """Build a synthetic breast-cancer cohort as a pandas DataFrame.

    Every clinical attribute is drawn independently from a fixed
    distribution using the global ``np.random`` state, except for two
    derived columns: "Menopausal Status" (from age, threshold 50) and
    "TNBC Status" (positive only when ER, PR and HER2 are all negative).

    Args:
        num_patients: Number of rows (patients) to generate.

    Returns:
        A DataFrame with one row per patient, keyed by
        "PRIMARY_PERSON_KEY" values of the form ``PPK_00001``.
    """
    n = num_patients
    yes_no = ["Positive", "Negative"]

    def draw(options, weights=None):
        # One categorical draw per patient from the global NumPy RNG.
        return np.random.choice(options, size=n, p=weights)

    # Columns are filled in the same order the random draws must happen,
    # so seeded output stays reproducible.
    columns = {}
    columns["PRIMARY_PERSON_KEY"] = [f"PPK_{idx:05d}" for idx in range(1, n + 1)]

    ages = np.random.randint(30, 80, size=n)  # upper bound is exclusive
    columns["Age"] = ages
    columns["Menopausal Status"] = np.where(
        ages >= 50, "Post-menopausal", "Pre-menopausal"
    ).tolist()

    columns["Tumor Size (cm)"] = np.round(
        np.random.lognormal(mean=0.7, sigma=0.5, size=n), 2
    )
    columns["Lymph Node Involvement"] = draw(yes_no, [0.4, 0.6])
    columns["Tumor Grade"] = draw([1, 2, 3], [0.3, 0.5, 0.2])
    columns["Tumor Stage"] = draw(["I", "II", "III", "IV"], [0.4, 0.3, 0.2, 0.1])

    er = draw(yes_no, [0.75, 0.25])
    pr = draw(yes_no, [0.7, 0.3])
    her2 = draw(yes_no, [0.3, 0.7])
    columns["ER Status"] = er
    columns["PR Status"] = pr
    columns["HER2 Status"] = her2

    columns["Ki-67 Level"] = draw(["High", "Low"], [0.6, 0.4])

    # Triple-negative: all three receptor statuses are "Negative".
    triple_negative = (er == "Negative") & (pr == "Negative") & (her2 == "Negative")
    columns["TNBC Status"] = np.where(triple_negative, "Positive", "Negative").tolist()

    columns["BRCA Mutation"] = draw(yes_no, [0.1, 0.9])
    columns["Overall Health"] = draw(["Good", "Poor"], [0.7, 0.3])
    columns["Genomic Recurrence Score"] = draw(
        ["Low", "Intermediate", "High", "N/A"], [0.3, 0.2, 0.1, 0.4]
    )
    columns["Treatment"] = draw(["Surgery", "Chemotherapy", "Radiation Therapy"])

    return pd.DataFrame(columns)
|
|
|
|
|
49 |
|
50 |
# Function to generate Members from BreastCancer
|
51 |
def generate_members_from_breast_cancer(breast_cancer_df):
|
|
|
93 |
"PROV_TAXONOMY": np.random.choice(["208100000X", "207RE0101X"], num_providers),
|
94 |
})
|
95 |
|
96 |
+
# Function to generate wearable data
|
97 |
+
def generate_wearable_data(num_patients, num_measurements, start_datetime, time_interval, cancer_rate, chemo_brain_effect, primary_keys):
    """Simulate wearable-device readings for every patient key.

    A subset of ``primary_keys`` is randomly flagged as cancer patients.
    Flagged patients get their activity scaled down by
    ``chemo_brain_effect`` percent and their heart rate raised by a fixed
    offset; everyone else gets baseline values plus random noise.

    Args:
        num_patients: Patient count used to size the cancer cohort
            (``cancer_rate`` percent of this number).
        num_measurements: Number of readings generated per patient.
        start_datetime: ``datetime`` of the first reading.
        time_interval: ``timedelta`` between consecutive readings.
        cancer_rate: Percentage of ``num_patients`` flagged as cancer
            patients.
        chemo_brain_effect: Percentage reduction applied to the activity
            level of flagged patients.
        primary_keys: Iterable of patient identifiers; one series of
            readings is produced per key.

    Returns:
        A DataFrame with columns "PRIMARY_PERSON_KEY",
        "Measurement_Timestamp", "Activity_Level", "Heart_Rate" and
        "O2_Saturation", holding ``len(primary_keys) * num_measurements``
        rows.
    """
    baseline_activity = 2000
    baseline_heart_rate = 80
    baseline_o2 = 98.2
    chemo_heart_rate_increase = 5
    activity_reduction_factor = (100 - chemo_brain_effect) / 100.0

    # random.sample needs a sequence and raises ValueError when asked for
    # more items than exist (or a negative count), so clamp the cohort
    # size to the keys actually supplied.
    primary_keys = list(primary_keys)
    requested_cancer_patients = int((cancer_rate / 100) * num_patients)
    num_cancer_patients = max(0, min(requested_cancer_patients, len(primary_keys)))
    cancer_patients = set(random.sample(primary_keys, num_cancer_patients))

    # The timestamp labels are identical for every patient; format them once.
    timestamp_labels = [
        (start_datetime + i * time_interval).strftime("%Y-%m-%d %H:%M:%S")
        for i in range(num_measurements)
    ]

    data_rows = []
    for pkey in primary_keys:
        is_cancer = pkey in cancer_patients
        for label in timestamp_labels:
            # Per-reading noise; draw order is kept stable so seeded runs
            # of the `random` module reproduce the same data.
            activity_var = random.randint(-300, 300)
            hr_var = random.randint(-3, 3)
            o2_var = random.uniform(-0.3, 0.3)

            if is_cancer:
                activity = int((baseline_activity + activity_var) * activity_reduction_factor)
                heart_rate = baseline_heart_rate + hr_var + chemo_heart_rate_increase
            else:
                activity = baseline_activity + activity_var
                heart_rate = baseline_heart_rate + hr_var
            o2_sat = baseline_o2 + o2_var

            # Enforce lower bounds on each reading.
            activity = max(activity, 0)
            heart_rate = max(heart_rate, 50)
            o2_sat = max(o2_sat, 90.0)

            data_rows.append([pkey, label, activity, heart_rate, round(o2_sat, 1)])

    return pd.DataFrame(
        data_rows,
        columns=["PRIMARY_PERSON_KEY", "Measurement_Timestamp", "Activity_Level", "Heart_Rate", "O2_Saturation"],
    )
|
132 |
+
|
133 |
# Main Streamlit App
|
134 |
+
st.title("Synthetic Medical Data Generator with Wearable Data")

# Dataset size controls (label, lower bound, upper bound, default).
num_patients = st.slider(
    "Number of Breast Cancer Patients to Generate",
    min_value=10,
    max_value=1000,
    value=100,
)
num_measurements = st.slider(
    "Measurements per Patient (Wearable Data)",
    min_value=1,
    max_value=100,
    value=10,
)
num_services = st.slider(
    "Number of Services to Generate",
    min_value=10,
    max_value=2000,
    value=500,
)
num_providers = st.slider(
    "Number of Providers to Generate",
    min_value=10,
    max_value=500,
    value=100,
)

# Wearable-data simulation settings: start date and time of the first
# reading, share of patients flagged as cancer patients, and the
# percentage activity reduction applied to them.
start_date = st.date_input(
    "Wearable Data Start Date",
    value=datetime(2024, 12, 1),
)
start_time = st.time_input(
    "Wearable Data Start Time",
    value=datetime(2024, 12, 1, 8, 0).time(),
)
cancer_rate = st.slider(
    "Percentage of Patients with Cancer (Wearable Data)",
    min_value=0,
    max_value=100,
    value=30,
)
chemo_brain_effect = st.slider(
    "Chemo Brain Impact on Activity Level (in % reduction)",
    min_value=0,
    max_value=50,
    value=20,
)
|
146 |
+
|
147 |
if st.button("Generate Data"):
|
148 |
+
primary_keys = [f"PPK_{i+1:05d}" for i in range(num_patients)]
|
149 |
+
wearable_start_datetime = datetime.combine(start_date, start_time)
|
150 |
breast_cancer_df = generate_breast_cancer_data(num_patients)
|
151 |
members_df = generate_members_from_breast_cancer(breast_cancer_df)
|
152 |
enrollments_df = generate_enrollments_from_breast_cancer(breast_cancer_df)
|
153 |
+
services_df = generate_services(num_services, primary_keys)
|
154 |
providers_df = generate_providers(num_providers)
|
155 |
+
wearable_data = generate_wearable_data(
|
156 |
+
num_patients, num_measurements, wearable_start_datetime, timedelta(hours=1), cancer_rate, chemo_brain_effect, primary_keys
|
157 |
+
)
|
158 |
|
|
|
159 |
st.subheader("Breast Cancer Data")
|
160 |
st.dataframe(breast_cancer_df.head())
|
161 |
st.download_button("Download Breast Cancer Data", breast_cancer_df.to_csv(index=False), "breast_cancer.csv")
|
|
|
175 |
st.subheader("Providers Data")
|
176 |
st.dataframe(providers_df.head())
|
177 |
st.download_button("Download Providers Data", providers_df.to_csv(index=False), "providers.csv")
|
178 |
+
|
179 |
+
st.subheader("Wearable Data")
|
180 |
+
st.dataframe(wearable_data.head())
|
181 |
+
st.download_button("Download Wearable Data", wearable_data.to_csv(index=False), "wearable_data.csv")
|