# HLMCC's picture
# Update app.py
# 24b2c65 verified
import subprocess
import sys
# Install necessary libraries if not already installed.
# Each import group is attempted first; only on ImportError is pip invoked
# (through the running interpreter, so the install lands in the same
# environment) and the import retried.
try:
    from lifelines.utils import concordance_index
except ImportError:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "lifelines"])
    from lifelines.utils import concordance_index

try:
    from datasets import load_dataset, Dataset
    from huggingface_hub import login, upload_file, hf_hub_download
except ImportError:
    # Every package name must be its own argv element: no shell is involved,
    # so a single "datasets huggingface_hub" string would reach pip as one
    # (invalid) requirement instead of two packages.
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", "datasets", "huggingface_hub"]
    )
    from datasets import load_dataset, Dataset
    from huggingface_hub import login, upload_file, hf_hub_download
from pathlib import Path
import pandas as pd
import streamlit as st
import os
# Hugging Face authentication.
# The token is read from the `hf_token` environment variable.
hf_token = os.getenv('hf_token')
if not hf_token:
    # Without a token, `login` would presumably fall back to prompting for one,
    # which a headless Streamlit process cannot answer. Stop with a clear
    # message instead of failing later in a less obvious place.
    st.error("The `hf_token` environment variable is not set; cannot authenticate with Hugging Face.")
    st.stop()
login(hf_token)  # Log in with the token

# Load the ground truth dataset (CSV hosted in the HLMCC dataset repo) and
# convert its 'train' split to a pandas DataFrame for merging with submissions.
dataset = load_dataset("csv", data_files="hf://datasets/HLMCC/tcga-paad-ground-truth/train.csv")
ground_truth = pd.DataFrame(dataset['train'])
# Set up a directory for storing submissions
submission_dir = Path("submissions")
submission_dir.mkdir(exist_ok=True)

# Download leaderboard from Hugging Face if it exists; otherwise start with an
# empty leaderboard that has the expected columns.
leaderboard_file_path = Path("leaderboard.csv")
try:
    hf_hub_download(
        repo_id="HLMCC/tcga-paad-ground-truth",
        filename="leaderboard.csv",
        local_dir=".",  # places the file at ./leaderboard.csv, i.e. leaderboard_file_path
        repo_type="dataset",
        # `token` is the keyword this file already uses for `upload_file`;
        # the legacy `use_auth_token` spelling may be rejected by newer
        # huggingface_hub releases, and that failure would be swallowed by
        # the handler below and silently reset the leaderboard.
        token=hf_token,
    )
    leaderboard_df = pd.read_csv(leaderboard_file_path)
    print("Existing leaderboard loaded successfully.")
except Exception as exc:
    # Deliberately best-effort (a missing remote file is expected on first
    # run), but no longer a bare `except:` and the cause is printed so that
    # auth/network problems are distinguishable from "file not there yet".
    # NOTE(review): after any failure here, the next successful submission
    # uploads a leaderboard containing only the new entry.
    print("No existing leaderboard found. A new leaderboard will be created.")
    print(f"Leaderboard download/read failed with: {exc!r}")
    leaderboard_df = pd.DataFrame(columns=["Username", "C-Index", "Submission Date"])
# Page heading
st.title("PAAD Survival Prediction Submission Portal")

# Notice shown at the top of the page describing the required CSV layout
csv_format_notice = (
    "Your CSV file must only contain the following columns:\n\n"
    "- `patient_id`: Unique identifier for each patient matching the patient_id in the test set\n"
    "- `predicted_scores`: Predicted survival scores from your model\n\n"
    "Please ensure your file follows this format before uploading."
)
st.warning(csv_format_notice)

# Submission form: a username field, a CSV uploader and the submit button.
# The three resulting values are read by the submission-processing code below.
with st.form(key="submission_form"):
    username = st.text_input(label="Username (required)", max_chars=20)
    uploaded_file = st.file_uploader(label="Upload your prediction CSV file", type="csv")
    submit_button = st.form_submit_button(label="Submit")
# Process submission
def _safe_name_part(raw):
    """Return *raw* reduced to characters that are safe inside a file name.

    Every character that is not alphanumeric, '-', '_' or '.' (path separators
    included) is replaced by '_', and leading dots are removed, so the result
    is always a single path component. Falls back to "unnamed" when nothing
    is left.
    """
    cleaned = "".join(ch if ch.isalnum() or ch in "-_." else "_" for ch in str(raw))
    return cleaned.lstrip(".") or "unnamed"


def _process_submission(raw_username, upload, leaderboard):
    """Validate, score and record one submission.

    Args:
        raw_username: Text entered in the username field.
        upload: Uploaded file object from the form, or None if no file was chosen.
        leaderboard: Current leaderboard DataFrame.

    Returns:
        The leaderboard with the new entry appended when the submission was
        scored and stored; otherwise the leaderboard unchanged (an error
        message has already been shown to the user).
    """
    user = raw_username.strip()
    if not user:
        st.error("Username is required. Please enter your username.")
        return leaderboard
    if upload is None:
        st.error("Please upload your prediction CSV file.")
        return leaderboard

    try:
        predictions = pd.read_csv(upload)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
        st.error(f"Could not read the uploaded file as CSV: {exc}")
        return leaderboard

    # Check if file format is correct
    if not {"patient_id", "predicted_scores"}.issubset(predictions.columns):
        st.error("Incorrect file format. Ensure columns include 'patient_id' and 'predicted_scores'.")
        return leaderboard

    # Merge with ground truth to calculate C-Index
    merged = pd.merge(ground_truth, predictions, on="patient_id", how="inner")
    if merged.empty:
        # Stop here: scoring an empty frame cannot produce a meaningful result.
        st.error("No matching patient IDs found between the ground truth and your submission. Please check your patient IDs.")
        return leaderboard

    try:
        c_index = concordance_index(
            event_times=merged["survival_time"],
            predicted_scores=merged["predicted_scores"],
            event_observed=merged["vital_status"],
        )
    except (ZeroDivisionError, ValueError, TypeError) as exc:
        # Unscorable input (e.g. no comparable pairs, missing or non-numeric
        # scores) is reported to the user instead of crashing the page.
        st.error(f"Could not compute the C-Index from your submission: {exc}")
        return leaderboard

    # Save submission to file. Both name parts come from the user, so they are
    # sanitised to keep the file inside `submission_dir`.
    submission_file = submission_dir / f"{_safe_name_part(user)}_{_safe_name_part(upload.name)}"
    predictions.to_csv(submission_file, index=False)

    # Update leaderboard
    new_entry = pd.DataFrame({
        "Username": [user],
        "C-Index": [c_index],
        "Submission Date": [pd.Timestamp.now()],
    })
    updated = pd.concat([leaderboard, new_entry], ignore_index=True)

    # Save updated leaderboard locally
    updated.to_csv(leaderboard_file_path, index=False)

    # Upload updated leaderboard to Hugging Face
    upload_file(
        path_or_fileobj=leaderboard_file_path,
        path_in_repo="leaderboard.csv",
        repo_type="dataset",
        repo_id="HLMCC/tcga-paad-ground-truth",
        token=hf_token,
    )

    # Display the calculated C-Index to the user
    st.success(f"Submission received! Your C-Index score: {c_index:.4f}")
    return updated


# Run on every press of the submit button, so that a missing username or a
# missing file each produce their own error message.
if submit_button:
    leaderboard_df = _process_submission(username, uploaded_file, leaderboard_df)
# Render the leaderboard, best C-Index first, with a fresh 0-based index
st.subheader("Leaderboard")
ranked_leaderboard = leaderboard_df.sort_values(by="C-Index", ascending=False)
ranked_leaderboard = ranked_leaderboard.reset_index(drop=True)
st.dataframe(ranked_leaderboard, height=400)