#!/bin/bash

# Directory the dataset is downloaded into (relative to the current directory)
DATASET_DIR="data/diabetic-retinopathy-dataset"

# Abort early, with a message on stderr, if the target directory cannot be
# created instead of silently carrying on.
if ! mkdir -p -- "$DATASET_DIR"; then
    echo "Error: could not create directory $DATASET_DIR" >&2
    exit 1
fi

# Start time of the script in seconds since the epoch; the elapsed time is
# computed from it at the end of the script.
start_time=$(date +%s)

# Names of the competition files to fetch. The train and test archives are
# split into numbered parts (.001, .002, ...), and all parts are needed to
# get the complete file.
# Reference: the Data Explorer on Kaggle shows the full list of files:
# https://www.kaggle.com/c/diabetic-retinopathy-detection/data
files=(
    "test.zip.00"{1..7}     # test.zip.001 ... test.zip.007
    "sampleSubmission.csv.zip"
    "sample.zip"
    "train.zip.00"{1..5}    # train.zip.001 ... train.zip.005
    "trainLabels.csv.zip"
)

#######################################
# Download a single competition file with the kaggle CLI, unpack it into the
# dataset directory, and delete the downloaded archive once it has been
# extracted successfully.
# Globals:
#   DATASET_DIR (read) - directory the archive is downloaded to and unpacked in
# Arguments:
#   $1 - name of the file to download (e.g. "train.zip.001")
# Outputs:
#   Output of kaggle/unzip on stdout; error messages on stderr
# Returns:
#   0 on success; 1 if an argument/global is missing, the download fails,
#   the archive is not found afterwards, or the extraction fails
#######################################
download_file() {
    local name="${1:-}"

    # Refuse to build paths from empty pieces: with an empty DATASET_DIR the
    # unzip/rm below would operate relative to the filesystem root.
    if [[ -z "$name" || -z "${DATASET_DIR:-}" ]]; then
        echo "Error: download_file needs a file name and a non-empty DATASET_DIR." >&2
        return 1
    fi

    if ! kaggle competitions download -c diabetic-retinopathy-detection \
            -f "$name" -p "$DATASET_DIR"; then
        echo "Error: kaggle could not download $name." >&2
        return 1
    fi

    local zip_file="$DATASET_DIR/$name"

    # For names that do not already end in .zip, the archive is looked up
    # under "<name>.zip".
    if [[ "$name" != *.zip ]]; then
        zip_file="$zip_file.zip"
    fi

    # Check if zip file exists
    if [[ ! -f "$zip_file" ]]; then
        echo "Error: $zip_file does not exist." >&2
        return 1
    fi

    # -o overwrites existing files. On failure keep the archive so the
    # download is not lost, and report the failure to the caller.
    if ! unzip -o "$zip_file" -d "$DATASET_DIR"; then
        echo "Error: could not extract $zip_file; the archive was kept." >&2
        return 1
    fi

    # The archive is a regular file, so a plain (non-recursive) rm is enough.
    rm -f -- "$zip_file"
}

#######################################
# Start download_file for every given name as a background job, then wait
# for each job individually so that failures are noticed.
# Arguments:
#   $@ - file names, each passed to download_file
# Outputs:
#   One line on stderr for every file whose job returned non-zero
# Returns:
#   0 if every job succeeded (or no names were given), 1 otherwise
#######################################
run_downloads() {
    local -a pids=() names=()
    local name i failed=0

    for name in "$@"; do
        download_file "$name" &
        pids+=("$!")
        names+=("$name")
    done

    # A bare `wait` always returns 0; waiting on each PID yields that job's
    # own exit status.
    for i in "${!pids[@]}"; do
        if ! wait "${pids[$i]}"; then
            echo "Error: processing of ${names[$i]} failed." >&2
            failed=1
        fi
    done

    return "$failed"
}

# Download all files in parallel and remember whether any of them failed
download_status=0
run_downloads "${files[@]}" || download_status=$?

# End time of the script
end_time=$(date +%s)

# Calculate total time taken in minutes (integer division, rounded down)
total_time=$(( (end_time - start_time)/60 ))

# Print total time taken
echo "Total time taken: ${total_time} minutes"

# Report failure through the exit status so callers can detect it
if (( download_status != 0 )); then
    echo "Error: one or more files could not be downloaded." >&2
    exit 1
fi