Spaces:

HLMCC
/

PAAD-submission-portal

Sleeping

App Files Files Community

PAAD-submission-portal / app.py

HLMCC

Update app.py

24b2c65 verified 4 months ago

raw

history blame contribute delete

4.98 kB

	import subprocess
	import sys

	# Install necessary libraries if not already installed.
	# Each package name is passed to pip as its own argv element: a single
	# "datasets huggingface_hub" string is handed to pip as ONE requirement,
	# which is not a valid specifier, so the fallback install would fail.
	try:
	    from lifelines.utils import concordance_index
	except ImportError:
	    subprocess.check_call([sys.executable, "-m", "pip", "install", "lifelines"])
	    from lifelines.utils import concordance_index

	try:
	    from datasets import load_dataset, Dataset
	    from huggingface_hub import login, upload_file, hf_hub_download
	except ImportError:
	    subprocess.check_call(
	        [sys.executable, "-m", "pip", "install", "datasets", "huggingface_hub"]
	    )
	    from datasets import load_dataset, Dataset
	    from huggingface_hub import login, upload_file, hf_hub_download

	from pathlib import Path
	import pandas as pd
	import streamlit as st
	import os

	# Hugging Face authentication.
	# The token is read from the `hf_token` environment variable. Without it,
	# login() would receive None; per the huggingface_hub docs it then tries to
	# prompt for a token, which cannot work in a hosted app. Fail with a
	# readable message instead of continuing.
	hf_token = os.getenv('hf_token')
	if not hf_token:
	    st.error("Server configuration error: the `hf_token` environment variable is not set.")
	    st.stop()
	login(hf_token)  # Log in with the token

	# Fetch the ground-truth CSV through the `datasets` CSV loader and turn its
	# "train" split into a pandas DataFrame for merging with submissions.
	dataset = load_dataset(
	    "csv",
	    data_files="hf://datasets/HLMCC/tcga-paad-ground-truth/train.csv",
	)
	ground_truth = pd.DataFrame(dataset['train'])

	# Local folder that receives a copy of every uploaded prediction file;
	# created on first run, reused afterwards.
	submission_dir = Path("submissions")
	submission_dir.mkdir(exist_ok=True)

	# Download leaderboard from Hugging Face if it exists.
	#
	# Only "the file is not in the repo yet" (or "the file is empty") starts a
	# fresh leaderboard. Any other failure (network, authentication, bad
	# argument) stops the run: after a successful submission the in-memory
	# leaderboard is uploaded over the remote copy, so silently falling back to
	# an empty table here would wipe the existing results.
	from huggingface_hub.utils import EntryNotFoundError

	leaderboard_file_path = Path("leaderboard.csv")
	try:
	    hf_hub_download(
	        repo_id="HLMCC/tcga-paad-ground-truth",
	        filename="leaderboard.csv",
	        local_dir=".",
	        repo_type="dataset",
	        # `token` replaces the deprecated `use_auth_token` alias and matches
	        # the upload_file() call used when saving the leaderboard.
	        token=hf_token,
	    )
	    leaderboard_df = pd.read_csv(leaderboard_file_path)
	    print("Existing leaderboard loaded successfully.")
	except (EntryNotFoundError, pd.errors.EmptyDataError):
	    print("No existing leaderboard found. A new leaderboard will be created.")
	    leaderboard_df = pd.DataFrame(columns=["Username", "C-Index", "Submission Date"])
	except Exception as exc:
	    # Top-level boundary: log the cause, tell the user, and halt this run.
	    print(f"Failed to load the existing leaderboard: {exc!r}")
	    st.error("The leaderboard could not be loaded right now. Please try again later.")
	    st.stop()

	# Page heading.
	st.title("PAAD Survival Prediction Submission Portal")

	# Format requirements, rendered as a warning banner directly under the
	# title so they are seen before the upload form.
	submission_format_notice = (
	    "Your CSV file must only contain the following columns:\n\n"
	    "- `patient_id`: Unique identifier for each patient matching the patient_id in the test set\n"
	    "- `predicted_scores`: Predicted survival scores from your model\n\n"
	    "Please ensure your file follows this format before uploading."
	)
	st.warning(submission_format_notice)

	# Upload form: a username field, a CSV-only file picker and the submit
	# button, grouped in one Streamlit form.
	submission_form = st.form("submission_form")
	with submission_form:
	    username = st.text_input(label="Username (required)", max_chars=20)
	    uploaded_file = st.file_uploader(
	        label="Upload your prediction CSV file",
	        type="csv",
	    )
	    submit_button = st.form_submit_button(label="Submit")

	# Process submission


	def _filename_safe(text):
	    """Return *text* with every character other than alphanumerics, '-', '_' and '.' replaced by '_'.

	    The username and the uploaded file name are user-controlled; removing
	    path separators keeps the saved copy inside the submissions directory.
	    """
	    return "".join(ch if ch.isalnum() or ch in "-_." else "_" for ch in text)


	def _score_predictions(predictions):
	    """Score a submission against the module-level ``ground_truth`` frame.

	    Args:
	        predictions: DataFrame parsed from the uploaded CSV.

	    Returns:
	        A ``(c_index, error_message)`` pair. On success ``error_message`` is
	        ``None``; on failure ``c_index`` is ``None`` and ``error_message`` is
	        the text to show to the user.
	    """
	    if not {"patient_id", "predicted_scores"}.issubset(predictions.columns):
	        return None, "Incorrect file format. Ensure columns include 'patient_id' and 'predicted_scores'."

	    # Merge only the two expected columns so that extra columns in the upload
	    # cannot collide with ground-truth column names (pandas would rename
	    # colliding columns with _x/_y suffixes and break the lookups below).
	    merged = pd.merge(
	        ground_truth,
	        predictions[["patient_id", "predicted_scores"]],
	        on="patient_id",
	        how="inner",
	    )
	    if merged.empty:
	        return None, "No matching patient IDs found between the ground truth and your submission. Please check your patient IDs."

	    try:
	        score = concordance_index(
	            event_times=merged["survival_time"],
	            predicted_scores=merged["predicted_scores"],
	            event_observed=merged["vital_status"],
	        )
	    except (ZeroDivisionError, ValueError, TypeError):
	        # Raised for unscorable input, e.g. no comparable pairs or
	        # missing/non-numeric scores.
	        return None, "Could not compute the C-Index. Ensure 'predicted_scores' contains numeric values with no missing entries."
	    return score, None


	# React to the button alone: testing `submit_button and uploaded_file`
	# would make the missing-file message below unreachable.
	if submit_button:
	    submitter = username.strip()
	    if not submitter:
	        st.error("Username is required. Please enter your username.")
	    elif not uploaded_file:
	        st.error("Please upload your prediction CSV file.")
	    else:
	        try:
	            predictions = pd.read_csv(uploaded_file)
	        except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError):
	            c_index, problem = None, "Could not read the uploaded file. Please upload a valid CSV file."
	        else:
	            c_index, problem = _score_predictions(predictions)

	        if problem is not None:
	            # Nothing is saved or published for a rejected submission.
	            st.error(problem)
	        else:
	            # Save submission to file
	            submission_file = submission_dir / (
	                f"{_filename_safe(submitter)}_{_filename_safe(uploaded_file.name)}"
	            )
	            predictions.to_csv(submission_file, index=False)

	            # Update leaderboard (the whitespace-stripped username is recorded)
	            new_entry = pd.DataFrame({
	                "Username": [submitter],
	                "C-Index": [c_index],
	                "Submission Date": [pd.Timestamp.now()],
	            })
	            leaderboard_df = pd.concat([leaderboard_df, new_entry], ignore_index=True)

	            # Save updated leaderboard locally
	            leaderboard_df.to_csv(leaderboard_file_path, index=False)

	            # Upload updated leaderboard to Hugging Face
	            upload_file(
	                path_or_fileobj=leaderboard_file_path,
	                path_in_repo="leaderboard.csv",
	                repo_type="dataset",
	                repo_id="HLMCC/tcga-paad-ground-truth",
	                token=hf_token,
	            )

	            # Display the calculated C-Index to the user
	            st.success(f"Submission received! Your C-Index score: {c_index:.4f}")

	# Leaderboard section: entries ranked by C-Index, best first, with a fresh
	# 0-based index, shown in a fixed-height table.
	st.subheader("Leaderboard")
	ranked_leaderboard = leaderboard_df.sort_values(by="C-Index", ascending=False)
	ranked_leaderboard = ranked_leaderboard.reset_index(drop=True)
	st.dataframe(ranked_leaderboard, height=400)