"""Streamlit submission portal for PAAD survival predictions.

The script loads the ground-truth table and the current leaderboard from the
Hugging Face dataset repo, lets a user upload a CSV of predictions, scores it
with the concordance index, records the result on the leaderboard, and pushes
the updated leaderboard back to the repo.
"""

import os
import re
import subprocess
import sys
from pathlib import Path

import pandas as pd
import streamlit as st


def _pip_install(*packages):
    """Install *packages* with pip, using the running interpreter."""
    # Each package must be its own argv element; a single "a b" string is
    # parsed by pip as one (invalid) requirement.
    subprocess.check_call([sys.executable, "-m", "pip", "install", *packages])


# Install necessary libraries if not already installed.
try:
    from lifelines.utils import concordance_index
except ImportError:
    _pip_install("lifelines")
    from lifelines.utils import concordance_index

try:
    from datasets import load_dataset, Dataset
    from huggingface_hub import login, upload_file, hf_hub_download
except ImportError:
    _pip_install("datasets", "huggingface_hub")
    from datasets import load_dataset, Dataset
    from huggingface_hub import login, upload_file, hf_hub_download


HF_REPO_ID = "HLMCC/tcga-paad-ground-truth"
LEADERBOARD_COLUMNS = ["Username", "C-Index", "Submission Date"]
REQUIRED_COLUMNS = ("patient_id", "predicted_scores")


class SubmissionError(ValueError):
    """Raised when an uploaded submission cannot be scored.

    The exception message is written to be shown directly to the user.
    """


def _safe_filename_part(text):
    """Return *text* reduced to characters that are safe inside a file name."""
    # Path(...).name drops any directory components; the substitution removes
    # separators and other characters that could escape the submissions dir.
    base_name = Path(text).name
    return re.sub(r"[^A-Za-z0-9._-]", "_", base_name) or "_"


def compute_c_index(truth, submitted):
    """Score a submission against the ground truth with the concordance index.

    Args:
        truth: Ground-truth frame; read columns are ``patient_id``,
            ``survival_time`` and ``vital_status``.
        submitted: Uploaded predictions; must contain ``patient_id`` and
            ``predicted_scores``.

    Returns:
        The concordance index computed on the rows whose ``patient_id``
        appears in both frames.

    Raises:
        SubmissionError: If required columns are missing, no patient IDs
            match, or the matched rows contain no comparable pairs.
    """
    if any(column not in submitted.columns for column in REQUIRED_COLUMNS):
        raise SubmissionError(
            "Incorrect file format. Ensure columns include 'patient_id' and 'predicted_scores'."
        )

    merged = pd.merge(truth, submitted, on="patient_id", how="inner")
    if merged.empty:
        raise SubmissionError(
            "No matching patient IDs found between the ground truth and your submission. "
            "Please check your patient IDs."
        )

    try:
        return concordance_index(
            event_times=merged["survival_time"],
            predicted_scores=merged["predicted_scores"],
            event_observed=merged["vital_status"],
        )
    except ZeroDivisionError as exc:
        # lifelines signals "no admissible pairs" with ZeroDivisionError.
        raise SubmissionError(
            "The C-Index could not be computed: the matched patients contain no comparable pairs."
        ) from exc


# Hugging Face authentication.
hf_token = os.getenv('hf_token')
if hf_token:
    login(hf_token)  # Log in with the token
else:
    # Without a token, login() would fall back to an interactive prompt.
    print("Environment variable 'hf_token' is not set; skipping Hugging Face login.")

# Load the ground truth dataset.
dataset = load_dataset("csv", data_files=f"hf://datasets/{HF_REPO_ID}/train.csv")
ground_truth = pd.DataFrame(dataset['train'])

# Set up a directory for storing submissions.
submission_dir = Path("submissions")
submission_dir.mkdir(exist_ok=True)

# Download leaderboard from Hugging Face if it exists.
leaderboard_file_path = Path("leaderboard.csv")
try:
    downloaded_path = hf_hub_download(
        repo_id=HF_REPO_ID,
        filename="leaderboard.csv",
        local_dir=".",
        repo_type="dataset",
        # `token` matches the upload_file call below; the older
        # `use_auth_token` spelling is deprecated.
        token=hf_token,
    )
    leaderboard_df = pd.read_csv(downloaded_path)
    print("Existing leaderboard loaded successfully.")
except Exception as exc:  # best-effort: start fresh, but report the cause
    print("No existing leaderboard found. A new leaderboard will be created.")
    print(f"Reason: {exc!r}")
    leaderboard_df = pd.DataFrame(columns=LEADERBOARD_COLUMNS)

# Streamlit app title and description.
st.title("PAAD Survival Prediction Submission Portal")

# Streamlit warning message at the top of the app.
st.warning(
    "Your CSV file must only contain the following columns:\n\n"
    "- `patient_id`: Unique identifier for each patient matching the patient_id in the test set\n"
    "- `predicted_scores`: Predicted survival scores from your model\n\n"
    "Please ensure your file follows this format before uploading."
)

# Form to upload CSV file.
with st.form("submission_form"):
    username = st.text_input("Username (required)", max_chars=20)
    uploaded_file = st.file_uploader("Upload your prediction CSV file", type="csv")
    submit_button = st.form_submit_button("Submit")

# Process submission. The checks run whenever the form is submitted so that a
# missing file produces a message instead of being silently ignored.
if submit_button:
    clean_username = username.strip()
    if not clean_username:
        st.error("Username is required. Please enter your username.")
    elif uploaded_file is None:
        st.error("Please upload your prediction CSV file.")
    else:
        try:
            predictions = pd.read_csv(uploaded_file)
            c_index = compute_c_index(ground_truth, predictions)
        except SubmissionError as err:
            st.error(str(err))
        except (pd.errors.ParserError, pd.errors.EmptyDataError, UnicodeDecodeError):
            st.error("The uploaded file could not be read as a CSV file.")
        else:
            # Save submission to file. Both name parts come from the user, so
            # they are sanitised before being used in a path.
            submission_file = submission_dir / (
                f"{_safe_filename_part(clean_username)}_{_safe_filename_part(uploaded_file.name)}"
            )
            predictions.to_csv(submission_file, index=False)

            # Update leaderboard.
            new_entry = pd.DataFrame({
                "Username": [clean_username],
                "C-Index": [c_index],
                "Submission Date": [pd.Timestamp.now()],
            })
            leaderboard_df = pd.concat([leaderboard_df, new_entry], ignore_index=True)

            # Save updated leaderboard locally.
            leaderboard_df.to_csv(leaderboard_file_path, index=False)

            # Upload updated leaderboard to Hugging Face.
            upload_file(
                path_or_fileobj=leaderboard_file_path,
                path_in_repo="leaderboard.csv",
                repo_type="dataset",
                repo_id=HF_REPO_ID,
                token=hf_token,
            )

            # Display the calculated C-Index to the user.
            st.success(f"Submission received! Your C-Index score: {c_index:.4f}")

# Display the leaderboard, best score first.
st.subheader("Leaderboard")
st.dataframe(
    leaderboard_df.sort_values(by="C-Index", ascending=False).reset_index(drop=True),
    height=400,
)