Update app.py
app.py
CHANGED
@@ -218,212 +218,3 @@ if __name__ == "__main__":
     main()
 
 
-"""
-import streamlit as st
-import pandas as pd
-import torch
-import torch.nn as nn
-import torch.optim as optim
-import matplotlib.pyplot as plt
-from sklearn.preprocessing import StandardScaler, LabelEncoder
-from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
-import numpy as np
-
-# Global scaler and label encoder for consistent preprocessing
-scaler = StandardScaler()
-label_encoder = LabelEncoder()
-feature_columns = None  # To store feature columns from the training data
-
-# Preload default files
-DEFAULT_TRAIN_FILE = "patientdata.csv"
-DEFAULT_PREDICT_FILE = "synthetic_breast_cancer_notreatmentcolumn.csv"
-DEFAULT_LABEL_FILE = "synthetic_breast_cancer_data_withColumn.csv"
-
-def main():
-    global feature_columns
-
-    st.title("Patient Treatment Prediction App")
-    st.write("Upload patient data to train a model and predict treatments based on input data.")
-
-    # Upload training data
-    uploaded_file = st.file_uploader("Upload a CSV file for training", type="csv")
-    if uploaded_file is None:
-        st.write("Using default training data.")
-        data = pd.read_csv(DEFAULT_TRAIN_FILE)
-    else:
-        data = pd.read_csv(uploaded_file)
-    st.write("Training Dataset Preview:", data.head())
-
-    # Check for Treatment column in training data
-    if 'Treatment' not in data.columns:
-        st.error("The training data must contain a 'Treatment' column.")
-        return
-
-    # Prepare Data
-    X, y, input_dim, num_classes, feature_columns = preprocess_training_data(data)
-
-    # Model Parameters
-    hidden_dim = st.slider("Hidden Layer Dimension", 10, 100, 50)
-    learning_rate = st.number_input("Learning Rate", 0.0001, 0.1, 0.01)
-    epochs = st.number_input("Epochs", 1, 100, 20)
-
-    # Model training
-    if st.button("Train Model"):
-        model, loss_curve = train_model(X, y, input_dim, hidden_dim, num_classes, learning_rate, epochs)
-        plot_loss_curve(loss_curve)
-
-    # Upload data for prediction
-    st.write("Upload new data without the 'Treatment' column for prediction.")
-    new_data_file = st.file_uploader("Upload new CSV file for prediction", type="csv")
-    if new_data_file is None:
-        st.write("Using default prediction data.")
-        new_data = pd.read_csv(DEFAULT_PREDICT_FILE)
-    else:
-        new_data = pd.read_csv(new_data_file)
-    st.write("Prediction Dataset Preview:", new_data.head())
-
-    if 'model' in locals() and feature_columns is not None:
-        # Align columns to match training data
-        new_data_aligned = align_columns(new_data, feature_columns)
-
-        if new_data_aligned is not None:
-            predictions = predict_treatment(new_data_aligned, model)
-
-            # Display Predictions in an Output Box
-            st.subheader("Predicted Treatment Outcomes")
-            prediction_output = "\n".join([f"Patient {i+1}: {pred}" for i, pred in enumerate(predictions)])
-            st.text_area("Prediction Results", prediction_output, height=200)
-
-            # Compare predictions with actual labels
-            actual_data = pd.read_csv(DEFAULT_LABEL_FILE)
-            if 'Treatment' in actual_data.columns:
-                actual_labels = label_encoder.transform(actual_data['Treatment'])
-                evaluate_model_performance(predictions, actual_labels)
-            else:
-                st.error("Actual labels file must contain a 'Treatment' column.")
-        else:
-            st.error("Unable to align prediction data to the training feature columns.")
-    else:
-        st.warning("Please train the model first before predicting on new data.")
-
-def preprocess_training_data(data):
-    global scaler, label_encoder
-
-    # Label encode the 'Treatment' target column
-    data['Treatment'] = label_encoder.fit_transform(data['Treatment'])
-    y = data['Treatment'].values
-
-    # Encode and standardize feature columns
-    X = data.drop('Treatment', axis=1)
-    feature_columns = X.columns  # Store feature columns for later alignment
-    for col in X.select_dtypes(include=['object']).columns:
-        X[col] = LabelEncoder().fit_transform(X[col])
-
-    # Standardize features
-    X = scaler.fit_transform(X)
-
-    return torch.tensor(X, dtype=torch.float32), torch.tensor(y, dtype=torch.long), X.shape[1], len(np.unique(y)), feature_columns
-
-def align_columns(new_data, feature_columns):
-    # Ensure the new data has the same columns as the training data
-    missing_cols = set(feature_columns) - set(new_data.columns)
-    extra_cols = set(new_data.columns) - set(feature_columns)
-
-    # Remove any extra columns
-    new_data = new_data.drop(columns=extra_cols)
-
-    # Add missing columns with default value 0
-    for col in missing_cols:
-        new_data[col] = 0
-
-    # Reorder columns to match the training data
-    new_data = new_data[feature_columns]
-
-    # Encode and standardize feature columns
-    for col in new_data.select_dtypes(include=['object']).columns:
-        new_data[col] = LabelEncoder().fit_transform(new_data[col])
-
-    # Scale features
-    new_data = scaler.transform(new_data)
-
-    return torch.tensor(new_data, dtype=torch.float32)
-
-def train_model(X, y, input_dim, hidden_dim, num_classes, learning_rate, epochs):
-    # Model Definition
-    class SimpleNN(nn.Module):
-        def __init__(self, input_dim, hidden_dim, num_classes):
-            super(SimpleNN, self).__init__()
-            self.fc1 = nn.Linear(input_dim, hidden_dim)
-            self.relu = nn.ReLU()
-            self.fc2 = nn.Linear(hidden_dim, num_classes)
-
-        def forward(self, x):
-            x = self.fc1(x)
-            x = self.relu(x)
-            x = self.fc2(x)
-            return x
-
-    # Model, loss, optimizer
-    model = SimpleNN(input_dim, hidden_dim, num_classes)
-    criterion = nn.CrossEntropyLoss()
-    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
-
-    # Training
-    loss_curve = []
-    for epoch in range(epochs):
-        optimizer.zero_grad()
-        outputs = model(X)
-        loss = criterion(outputs, y)
-        loss.backward()
-        optimizer.step()
-        loss_curve.append(loss.item())
-
-    return model, loss_curve
-
-def plot_loss_curve(loss_curve):
-    plt.figure()
-    plt.plot(loss_curve, label="Training Loss")
-    plt.xlabel("Epochs")
-    plt.ylabel("Loss")
-    plt.title("Loss Curve")
-    plt.legend()
-    st.pyplot(plt)
-
-def predict_treatment(new_data, model, batch_size=32):
-    model.eval()
-    predictions = []
-
-    # Run predictions in batches for large datasets
-    with torch.no_grad():
-        for i in range(0, new_data.size(0), batch_size):
-            batch_data = new_data[i:i + batch_size]
-            outputs = model(batch_data)
-            _, batch_predictions = torch.max(outputs, 1)
-            predictions.extend(batch_predictions.numpy())
-
-    # Convert numeric predictions back to original label names
-    return label_encoder.inverse_transform(predictions)
-
-def evaluate_model_performance(predictions, actual_labels):
-    # Ensure both predictions and actual_labels are consistently numeric
-    if isinstance(predictions[0], str):
-        actual_labels = label_encoder.inverse_transform(actual_labels)
-    elif isinstance(predictions[0], int):
-        actual_labels = label_encoder.transform(actual_labels)
-
-    # Calculate evaluation metrics
-    accuracy = accuracy_score(actual_labels, predictions)
-    precision = precision_score(actual_labels, predictions, average='weighted')
-    recall = recall_score(actual_labels, predictions, average='weighted')
-    f1 = f1_score(actual_labels, predictions, average='weighted')
-
-    # Display metrics
-    st.subheader("Model Evaluation Metrics")
-    st.write(f"**Accuracy:** {accuracy:.2f}")
-    st.write(f"**Precision:** {precision:.2f}")
-    st.write(f"**Recall:** {recall:.2f}")
-    st.write(f"**F1-Score:** {f1:.2f}")
-
-if __name__ == "__main__":
-    main()
-"""
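The `align_columns` helper in the removed block captures a reusable pattern: coerce prediction-time data into the training feature schema (drop extra columns, zero-fill missing ones, reorder) before building tensors for the model. A minimal standalone sketch of that pattern; the `align_to_training` name and the sample column names are illustrative, not taken from this Space:

```python
import pandas as pd

def align_to_training(new_df: pd.DataFrame, feature_columns) -> pd.DataFrame:
    # Drop columns the model never saw during training.
    extra = [c for c in new_df.columns if c not in set(feature_columns)]
    new_df = new_df.drop(columns=extra)
    # Zero-fill any training feature missing from the incoming file.
    for col in feature_columns:
        if col not in new_df.columns:
            new_df[col] = 0
    # Reorder to the exact training layout.
    return new_df[list(feature_columns)]

# Hypothetical schema and row, purely for demonstration.
train_cols = ["Age", "TumorSize", "NodeStatus"]
incoming = pd.DataFrame({"Age": [54], "Comment": ["n/a"]})
print(align_to_training(incoming, train_cols))
#    Age  TumorSize  NodeStatus
# 0   54          0           0
```

Zero-filling keeps the input width consistent with the trained network, at the cost of feeding placeholder values for any feature the new file omits.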