benjamin-paine's picture
initial commit
5a0ba44
#!/usr/bin/env bash
# Absolute path of the directory containing this script; every other path
# used by the supervisor is derived from it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
# PID file of the supervisor itself (guards against a second instance).
PIDFILE="$SCRIPT_DIR/.run.pid"
# Directory that receives each service's stdout/stderr logs.
LOG_DIR="$SCRIPT_DIR/logs"
# Number of restarts allowed per service before everything is shut down.
MAX_RESTARTS=3
mkdir -p "$LOG_DIR"
# Put the script directory first on PYTHONPATH. The ":" separator is added
# only when PYTHONPATH already has a value: an unconditional "dir:" would end
# in an empty entry, which Python resolves to the current working directory.
export PYTHONPATH="${SCRIPT_DIR}${PYTHONPATH:+:${PYTHONPATH}}"
######################
# Service Definitions
######################
# All tables below share the same three keys: nginx, taproot_dispatcher and
# taproot_overseer.

# Command line used to launch each service.
declare -A SERVICE_CMDS=()
SERVICE_CMDS[nginx]="nginx -p $SCRIPT_DIR -c $SCRIPT_DIR/nginx.conf"
SERVICE_CMDS[taproot_dispatcher]="taproot dispatcher --config $SCRIPT_DIR/dispatcher.yaml --add-import anachrovox --debug"
SERVICE_CMDS[taproot_overseer]="taproot overseer --config $SCRIPT_DIR/overseer.yaml --debug"

# File that receives each service's standard output.
declare -A SERVICE_LOGS_STDOUT=()
SERVICE_LOGS_STDOUT[nginx]="$LOG_DIR/nginx.log"
SERVICE_LOGS_STDOUT[taproot_dispatcher]="$LOG_DIR/taproot_dispatcher.log"
SERVICE_LOGS_STDOUT[taproot_overseer]="$LOG_DIR/taproot_overseer.log"

# File that receives each service's standard error.
declare -A SERVICE_LOGS_STDERR=()
SERVICE_LOGS_STDERR[nginx]="$LOG_DIR/nginx_err.log"
SERVICE_LOGS_STDERR[taproot_dispatcher]="$LOG_DIR/taproot_dispatcher_err.log"
SERVICE_LOGS_STDERR[taproot_overseer]="$LOG_DIR/taproot_overseer_err.log"

# File in which each service's PID is recorded once it has been started.
declare -A SERVICE_PIDFILES=()
SERVICE_PIDFILES[nginx]="$SCRIPT_DIR/.nginx.pid"
SERVICE_PIDFILES[taproot_dispatcher]="$SCRIPT_DIR/.dispatcher.pid"
SERVICE_PIDFILES[taproot_overseer]="$SCRIPT_DIR/.overseer.pid"

# Runtime state, filled in as services are started:
# service name -> PID currently being tracked
declare -A SERVICE_PIDS
# service name -> number of restarts performed so far
declare -A SERVICE_RESTART_COUNT
# Record the script's start time (seconds since the epoch, with fraction).
START_TIME=$(date +%s.%N)

#######################################
# Print a message prefixed with the time elapsed since START_TIME.
# Globals:   START_TIME (read)
# Arguments: the message; all arguments are joined with spaces
# Outputs:   "[+HH:MM:SS.ffff] message" on stdout, where ffff is the
#            fractional second in ten-thousandths
#######################################
timestamp_echo() {
  local now stamp
  now=$(date +%s.%N)
  # Do all the arithmetic in a single awk process and format with printf.
  # Printing the intermediate elapsed value with awk's default number format
  # keeps only 6 significant digits, which drops the fraction (and later the
  # seconds) once the script has been up for more than about a day.
  stamp=$(awk -v now="$now" -v start="$START_TIME" 'BEGIN {
    elapsed = now - start
    whole = int(elapsed)
    printf "%02d:%02d:%02d.%04d", int(whole / 3600), int((whole % 3600) / 60), int(whole % 60), int((elapsed - whole) * 10000)
  }')
  printf '[+%s] %s\n' "$stamp" "$*"
}
# Shutdown-in-progress flag: empty until cleanup begins, then set to 1.
# It is only ever tested and assigned as a plain string, so it is a scalar
# rather than an associative array.
SHUTTING_DOWN=""
######################
# PIDFile check
######################

#######################################
# Succeed only when the argument is a positive integer naming a process that
# can be signalled. Anything else (empty, text, "0", negative numbers) is
# rejected before it reaches kill, because 0 and negative values address
# whole process groups rather than a single process.
# Arguments: $1 - candidate PID read from a PID file
# Returns:   0 if the PID is valid and alive, non-zero otherwise
#######################################
is_live_pid() {
  [[ "$1" =~ ^[1-9][0-9]*$ ]] && kill -0 "$1" 2>/dev/null
}

#######################################
# Refuse to start when another instance is alive; otherwise clear a stale
# PID file and stop services left behind by the previous instance.
# Globals:   PIDFILE (read), SERVICE_PIDFILES (read)
# Outputs:   status messages on stdout
# Returns:   0 when startup may continue; exits the script with status 1
#            when a live instance is found
#######################################
check_pidfile() {
  local supervisor_pid="" svc svc_pidfile svc_pid

  # No PID file: nothing to check.
  [[ -f "$PIDFILE" ]] || return 0

  # read fails on an empty file; that simply leaves the PID empty (= stale).
  read -r supervisor_pid < "$PIDFILE" || true
  if is_live_pid "$supervisor_pid"; then
    echo "Script is already running with PID $supervisor_pid. Exiting."
    exit 1
  fi

  echo "Stale PID file detected. Removing and continuing."
  rm -f "$PIDFILE"

  # Make sure all the services that were running in the previous instance
  # are stopped, using the PIDs recorded in their PID files.
  for svc in "${!SERVICE_PIDFILES[@]}"; do
    svc_pidfile="${SERVICE_PIDFILES[$svc]}"
    [[ -f "$svc_pidfile" ]] || continue
    svc_pid=""
    read -r svc_pid < "$svc_pidfile" || true
    if is_live_pid "$svc_pid"; then
      echo "Stopping $svc (PID $svc_pid) from zombie process."
      kill "$svc_pid"
    fi
  done
  return 0
}

check_pidfile
# Record this script's own PID in the PID file so that a later launch can
# tell whether this instance is still running.
printf '%s\n' "$$" > "$PIDFILE"
##########################
# Cleanup on SIGINT/TERM
##########################
#######################################
# Stop every tracked service, remove the supervisor PID file and exit.
# Tracked services get SIGTERM first, then up to about one second to exit,
# and SIGKILL if they are still alive after that.
# Globals:   SHUTTING_DOWN (read/written), SERVICE_PIDS (read), PIDFILE (read)
# Outputs:   progress messages via timestamp_echo
# Returns:   0 immediately if a shutdown is already in progress; otherwise
#            does not return (exits the script with status 0)
#######################################
cleanup() {
  local svc pid attempt still_running

  # Prevent thrashing: a second signal during shutdown must not restart it.
  if [[ -n "${SHUTTING_DOWN:-}" ]]; then
    return 0
  fi
  SHUTTING_DOWN=1
  timestamp_echo "Stopping all processes..."

  # Polite request first.
  for svc in "${!SERVICE_PIDS[@]}"; do
    pid="${SERVICE_PIDS[$svc]}"
    if kill -0 "$pid" 2>/dev/null; then
      kill "$pid"
    fi
  done

  # Grace period: poll every 0.1 s for at most 10 rounds and stop waiting as
  # soon as no tracked process is alive, so shutdown is not delayed when
  # there is nothing left to wait for.
  for (( attempt = 0; attempt < 10; attempt++ )); do
    still_running=0
    for svc in "${!SERVICE_PIDS[@]}"; do
      if kill -0 "${SERVICE_PIDS[$svc]}" 2>/dev/null; then
        still_running=1
        break
      fi
    done
    (( still_running )) || break
    sleep 0.1
  done

  # Force kill whatever is still alive.
  for svc in "${!SERVICE_PIDS[@]}"; do
    pid="${SERVICE_PIDS[$svc]}"
    if kill -0 "$pid" 2>/dev/null; then
      kill -9 "$pid"
    fi
  done

  timestamp_echo "All processes stopped."
  rm -f -- "$PIDFILE"
  exit 0
}
# Signal handlers: each one reports which signal arrived and then hands
# control to cleanup.

# Handler for SIGINT.
interrupt() {
  timestamp_echo "Caught SIGINT, shutting down..."
  cleanup
}

# Handler for SIGTERM.
terminate() {
  timestamp_echo "Caught SIGTERM, shutting down..."
  cleanup
}

trap terminate TERM
trap interrupt INT
#######################
# Start a single svc
#######################
#######################################
# Launch one service in the background and start tracking its PID.
# Globals:   SERVICE_CMDS, SERVICE_LOGS_STDOUT, SERVICE_LOGS_STDERR,
#            SERVICE_PIDFILES, SERVICE_RESTART_COUNT (read);
#            SERVICE_PIDS (written)
# Arguments: $1 - service name (key of the SERVICE_* tables)
# Outputs:   status messages via timestamp_echo; the service's stdout and
#            stderr are appended to its log files; its PID is written to its
#            PID file when it is still alive after 0.2 s
#######################################
start_service() {
  local svc="$1"
  local cmd="${SERVICE_CMDS[$svc]}"
  local out="${SERVICE_LOGS_STDOUT[$svc]}"
  local err="${SERVICE_LOGS_STDERR[$svc]}"
  local -a argv=()
  local pid

  timestamp_echo "Starting $svc (restart count ${SERVICE_RESTART_COUNT[$svc]})"

  # Split the command string into words on whitespace only. Expanding $cmd
  # unquoted would also perform pathname expansion, so a "*" or "?" anywhere
  # in the command (or in the script's path) would be replaced by file names.
  # read returns non-zero at end of input here; the array is still filled.
  IFS=$' \t\n' read -r -d '' -a argv <<< "$cmd"

  # Start in background.
  # Note: if the process daemonizes immediately, $! won't remain alive,
  # but it is tracked anyway.
  "${argv[@]}" >>"$out" 2>>"$err" &
  pid=$!
  SERVICE_PIDS[$svc]=$pid
  sleep 0.2

  # Check if it died instantly.
  if kill -0 "$pid" 2>/dev/null; then
    timestamp_echo "$svc started with PID $pid"
    echo "$pid" > "${SERVICE_PIDFILES[$svc]}"
  else
    timestamp_echo "$svc appears to have daemonized or exited immediately."
  fi
}
################################
# Restart logic
################################
#######################################
# Count a crash of the given service and either restart it or, once it has
# been restarted more than MAX_RESTARTS times, shut everything down.
# Globals:   SERVICE_RESTART_COUNT (read/written), MAX_RESTARTS (read)
# Arguments: $1 - service name
# Returns:   normally after restarting; on too many crashes the script ends
#            with exit status 1
#######################################
attempt_restart() {
  local svc="$1"
  SERVICE_RESTART_COUNT[$svc]=$(( ${SERVICE_RESTART_COUNT[$svc]:-0} + 1 ))
  if (( ${SERVICE_RESTART_COUNT[$svc]} > MAX_RESTARTS )); then
    timestamp_echo "$svc crashed too many times. Shutting everything down."
    # Giving up on a crash-looping service is a failure, but cleanup ends the
    # script with status 0. An "exit" inside an EXIT trap replaces the final
    # status, so the launcher sees a non-zero exit code.
    trap 'exit 1' EXIT
    cleanup
  else
    start_service "$svc"
  fi
}
#############################
# Main loop (polling)
#############################
#######################################
# Poll the tracked services forever, handing any service whose PID no longer
# answers a signal-0 probe to attempt_restart.
# Globals:   SERVICE_PIDS (read)
# Outputs:   a message via timestamp_echo for every dead service found
# Returns:   never returns on its own; the loop has no exit condition
#######################################
monitor_services() {
  local name tracked_pid
  while :; do
    # Poll interval: 2 seconds.
    sleep 2
    for name in "${!SERVICE_PIDS[@]}"; do
      tracked_pid="${SERVICE_PIDS[$name]}"
      # Still alive: nothing to do for this service.
      kill -0 "$tracked_pid" 2>/dev/null && continue
      timestamp_echo "$name (PID $tracked_pid) not alive! Attempting restart..."
      attempt_restart "$name"
    done
  done
}
#######################################
# Entry point: reset every service's restart counter, start every service,
# then hand over to the polling loop.
# Globals:   SERVICE_CMDS (read), SERVICE_RESTART_COUNT (written)
#######################################
main() {
  local name

  # First pass: zero out the restart counters.
  for name in "${!SERVICE_CMDS[@]}"; do
    SERVICE_RESTART_COUNT[$name]=0
  done

  # Second pass: launch each service.
  for name in "${!SERVICE_CMDS[@]}"; do
    start_service "$name"
  done

  # Now just poll them.
  monitor_services
}

main "$@"