File size: 4,979 Bytes
7fd9522
 
 
d05dc56
7fd9522
 
 
 
13dd8c6
ef94797
13dd8c6
d05dc56
220ec8b
13dd8c6
9f54d7d
d05dc56
220ec8b
d05dc56
f736933
ef94797
 
6dc5ea2
13dd8c6
d05dc56
13dd8c6
9f54d7d
13dd8c6
d05dc56
 
 
ef94797
d05dc56
 
ef94797
 
220ec8b
d05dc56
220ec8b
4d71980
 
 
 
 
 
 
d05dc56
9edbb4c
220ec8b
9edbb4c
ef94797
 
 
 
 
a9821d1
 
 
 
 
 
ef94797
 
24b2c65
ef94797
 
 
 
 
24b2c65
 
 
 
ef94797
24b2c65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ef94797
 
 
96ea551
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import subprocess
import sys

# Install necessary libraries if not already installed.
# This is a best-effort fallback for environments where the packages were not
# pre-installed; each import is retried once after pip finishes.
try:
    from lifelines.utils import concordance_index
except ImportError:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "lifelines"])
    from lifelines.utils import concordance_index

try:
    from datasets import load_dataset, Dataset
    from huggingface_hub import login, upload_file, hf_hub_download
except ImportError:
    # Every requirement must be its own argv entry: with no shell involved,
    # "datasets huggingface_hub" would reach pip as ONE (invalid) requirement.
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", "datasets", "huggingface_hub"]
    )
    from datasets import load_dataset, Dataset
    from huggingface_hub import login, upload_file, hf_hub_download

from pathlib import Path
import pandas as pd
import streamlit as st
import os

# Authenticate against the Hugging Face Hub using the `hf_token` environment
# variable.
# NOTE(review): when the variable is unset, `login` receives None — confirm
# that this is acceptable in the deployment environment.
hf_token = os.environ.get('hf_token')
login(hf_token)

# Fetch the ground-truth CSV from the Hub and keep it as a pandas DataFrame
# so that submissions can be merged against it.
dataset = load_dataset(
    "csv",
    data_files="hf://datasets/HLMCC/tcga-paad-ground-truth/train.csv",
)
ground_truth = pd.DataFrame(dataset['train'])

# Set up a directory for storing submissions
submission_dir = Path("submissions")
submission_dir.mkdir(exist_ok=True)

# Download leaderboard from Hugging Face if it exists.
# The download is best-effort: on any failure the app starts with an empty
# leaderboard, but the cause is printed so that a real outage (bad token,
# network error) is not mistaken for "the file does not exist yet".
leaderboard_file_path = Path("leaderboard.csv")
try:
    hf_hub_download(
        repo_id="HLMCC/tcga-paad-ground-truth",
        filename="leaderboard.csv",
        local_dir=".",
        repo_type="dataset",
        # `token` replaces the deprecated `use_auth_token` and matches the
        # keyword already used for `upload_file` elsewhere in this script.
        token=hf_token,
    )
    leaderboard_df = pd.read_csv(leaderboard_file_path)
    print("Existing leaderboard loaded successfully.")
except Exception as exc:
    # `Exception` (not a bare `except`) lets KeyboardInterrupt / SystemExit
    # propagate instead of being silently turned into an empty leaderboard.
    print(f"Could not load leaderboard ({type(exc).__name__}: {exc}).")
    print("No existing leaderboard found. A new leaderboard will be created.")
    leaderboard_df = pd.DataFrame(columns=["Username", "C-Index", "Submission Date"])

# Page heading
st.title("PAAD Survival Prediction Submission Portal")

# Notice describing the CSV layout a submission is expected to have
_format_notice = "\n".join([
    "Your CSV file must only contain the following columns:",
    "",
    "- `patient_id`: Unique identifier for each patient matching the patient_id in the test set",
    "- `predicted_scores`: Predicted survival scores from your model",
    "",
    "Please ensure your file follows this format before uploading.",
])
st.warning(_format_notice)

# Submission form: username, prediction CSV, and the submit button.
# Widgets are attached through the form object instead of a `with` block.
_form = st.form("submission_form")
username = _form.text_input("Username (required)", max_chars=20)
uploaded_file = _form.file_uploader("Upload your prediction CSV file", type="csv")
submit_button = _form.form_submit_button("Submit")

# Process submission
def _safe_filename_part(text):
    """Return ``text`` with unsafe file-name characters replaced by ``_``.

    Only letters, digits, ``-``, ``_`` and ``.`` are kept. The username and
    the uploaded file name both come from the browser, so they must not be
    able to introduce path separators into the path a submission is saved to.
    """
    return "".join(ch if ch.isalnum() or ch in "-_." else "_" for ch in text)


def _score_submission(predictions, truth):
    """Return the concordance index of ``predictions`` against ``truth``.

    Args:
        predictions: DataFrame parsed from the uploaded CSV; must provide the
            ``patient_id`` and ``predicted_scores`` columns.
        truth: Ground-truth DataFrame; its ``patient_id``, ``survival_time``
            and ``vital_status`` columns are read.

    Returns:
        The value returned by ``concordance_index`` for the rows whose
        ``patient_id`` appears in both frames.

    Raises:
        ValueError: With a user-facing message when the required columns are
            missing, no patient IDs match, or the score cannot be computed.
    """
    if not {"patient_id", "predicted_scores"}.issubset(predictions.columns):
        raise ValueError(
            "Incorrect file format. Ensure columns include 'patient_id' and 'predicted_scores'."
        )

    # Inner merge: only patients present in both frames are scored.
    merged = pd.merge(truth, predictions, on="patient_id", how="inner")
    if merged.empty:
        # Stop here: scoring an empty frame cannot produce a meaningful value.
        raise ValueError(
            "No matching patient IDs found between the ground truth and your submission. "
            "Please check your patient IDs."
        )

    try:
        return concordance_index(
            event_times=merged["survival_time"],
            predicted_scores=merged["predicted_scores"],
            event_observed=merged["vital_status"],
        )
    except (ValueError, ZeroDivisionError, TypeError) as err:
        # Unusable inputs (e.g. missing or non-numeric scores) are reported to
        # the submitter instead of crashing the app.
        raise ValueError(f"Could not compute the C-Index from your predictions: {err}") from err


# Validate on every press of the submit button, so that a missing file is
# reported instead of being silently ignored.
if submit_button:
    clean_username = username.strip()
    if not clean_username:
        st.error("Username is required. Please enter your username.")
    elif not uploaded_file:
        st.error("Please upload your prediction CSV file.")
    else:
        try:
            # pandas parser errors (empty or malformed CSV) derive from
            # ValueError, so they are reported through the same path as
            # scoring problems.
            predictions = pd.read_csv(uploaded_file)
            c_index = _score_submission(predictions, ground_truth)
        except ValueError as err:
            st.error(str(err))
        else:
            # Save submission to file, using sanitised name components so the
            # file always lands inside `submission_dir`.
            upload_name = Path(uploaded_file.name).name
            submission_file = submission_dir / (
                f"{_safe_filename_part(clean_username)}_{_safe_filename_part(upload_name)}"
            )
            predictions.to_csv(submission_file, index=False)

            # Update leaderboard
            new_entry = pd.DataFrame({
                "Username": [clean_username],
                "C-Index": [c_index],
                "Submission Date": [pd.Timestamp.now()],
            })
            leaderboard_df = pd.concat([leaderboard_df, new_entry], ignore_index=True)

            # Save updated leaderboard locally
            leaderboard_df.to_csv(leaderboard_file_path, index=False)

            # Upload updated leaderboard to Hugging Face
            upload_file(
                path_or_fileobj=leaderboard_file_path,
                path_in_repo="leaderboard.csv",
                repo_type="dataset",
                repo_id="HLMCC/tcga-paad-ground-truth",
                token=hf_token,
            )

            # Display the calculated C-Index to the user
            st.success(f"Submission received! Your C-Index score: {c_index:.4f}")

# Render the leaderboard, highest C-Index first, with a fresh 0-based index
st.subheader("Leaderboard")
_ranked = leaderboard_df.sort_values(by="C-Index", ascending=False)
_ranked = _ranked.reset_index(drop=True)
st.dataframe(_ranked, height=400)