#!/bin/bash
#
# Rewrites the `checkpoint` index file inside every iter_ckpt_rank_*/
# directory so that it points at an earlier step: the newest
# all_model_checkpoint_paths / all_model_checkpoint_timestamps entries are
# removed and model_checkpoint_path is set to the target step.
# (The key names look like TensorFlow's CheckpointState text format --
# presumably that is what these files are; confirm before relying on it.)
#
# To be run in the main checkpoints directory.
#
# Environment (both optional):
#   TARGET_STEP  step number written into model_checkpoint_path
#                (default: 91000)
#   DROP_COUNT   number of trailing path entries and trailing timestamp
#                entries removed from each file (default: 1)

# Step number for the new model checkpoint path.
TARGET_STEP="${TARGET_STEP:-91000}"

# NOTE(review): the previous comments claimed "the last two" entries were
# removed, but the condition that was actually executed removed exactly one
# path and one timestamp. That real behaviour is kept as the default; set
# DROP_COUNT=2 if two entries are what is wanted.
DROP_COUNT="${DROP_COUNT:-1}"

#######################################
# Update a single checkpoint index file in place.
#
# Lines starting with all_model_checkpoint_paths / all_model_checkpoint_
# timestamps are counted; the last N of each kind are removed. Every line
# starting with model_checkpoint_path is replaced by
#   model_checkpoint_path: "iter_ckpt_rank_<rank>-<TARGET_STEP>"
# where <rank> is the 4th '_'-separated field of the name of the directory
# containing the file. All other lines are copied unchanged.
#
# Globals:   TARGET_STEP (read), DROP_COUNT (read)
# Arguments: $1 - path to the checkpoint file
#            $2 - optional number of trailing entries to drop
#                 (default: $DROP_COUNT, or 1)
# Returns:   0 on success; 1 if the file is missing, the drop count is not
#            a non-negative integer, or the replacement cannot be written
#            (the original file is left untouched in those cases).
#######################################
update_checkpoint_file() {
  local checkpoint_file="$1"
  local drop_count="${2:-${DROP_COUNT:-1}}"
  local target_step="${TARGET_STEP:-91000}"
  local line rank tmp_file i
  local total=0 kept_total=0
  local path_total=0 timestamp_total=0
  local path_seen=0 timestamp_seen=0
  local -a lines=()
  local -a kept=()

  if [[ ! -f "$checkpoint_file" ]]; then
    printf 'update_checkpoint_file: not a file: %s\n' "$checkpoint_file" >&2
    return 1
  fi
  if [[ ! "$drop_count" =~ ^[0-9]+$ ]]; then
    printf 'update_checkpoint_file: invalid drop count: %s\n' \
      "$drop_count" >&2
    return 1
  fi
  # Force base 10 so a value such as "08" is not parsed as bad octal.
  drop_count=$((10#$drop_count))

  # Take the rank from the containing directory's own name only, so that
  # underscores in parent directories cannot shift the field position.
  rank=$(basename -- "$(dirname -- "$checkpoint_file")" | cut -d'_' -f4)

  # Pass 1: load the file and count path / timestamp entries.
  # `|| [[ -n "$line" ]]` keeps a final line that has no trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    lines+=("$line")
    total=$((total + 1))
    case "$line" in
      all_model_checkpoint_paths*)
        path_total=$((path_total + 1))
        ;;
      all_model_checkpoint_timestamps*)
        timestamp_total=$((timestamp_total + 1))
        ;;
    esac
  done < "$checkpoint_file"

  # Pass 2: drop the trailing entries and rewrite model_checkpoint_path.
  for ((i = 0; i < total; i++)); do
    line="${lines[i]}"
    case "$line" in
      all_model_checkpoint_paths*)
        path_seen=$((path_seen + 1))
        if ((path_seen > path_total - drop_count)); then
          continue
        fi
        ;;
      all_model_checkpoint_timestamps*)
        timestamp_seen=$((timestamp_seen + 1))
        if ((timestamp_seen > timestamp_total - drop_count)); then
          continue
        fi
        ;;
      model_checkpoint_path*)
        line="model_checkpoint_path: \"iter_ckpt_rank_${rank}-${target_step}\""
        ;;
    esac
    kept+=("$line")
    kept_total=$((kept_total + 1))
  done

  # Build the replacement next to the original so the final mv is a rename
  # on the same filesystem, and remove it again on any failure.
  if ! tmp_file=$(mktemp "${checkpoint_file}.XXXXXX"); then
    printf 'update_checkpoint_file: mktemp failed for %s\n' \
      "$checkpoint_file" >&2
    return 1
  fi

  # cp -p makes the temp file inherit the original's mode (mktemp creates
  # it 0600); its content is overwritten immediately afterwards.
  if ! cp -p -- "$checkpoint_file" "$tmp_file"; then
    rm -f -- "$tmp_file"
    return 1
  fi

  if ((kept_total > 0)); then
    printf '%s\n' "${kept[@]}" > "$tmp_file"
  else
    # printf with no arguments would still emit one newline.
    : > "$tmp_file"
  fi || {
    rm -f -- "$tmp_file"
    return 1
  }

  if ! mv -f -- "$tmp_file" "$checkpoint_file"; then
    rm -f -- "$tmp_file"
    return 1
  fi
}

#######################################
# Back up and update every iter_ckpt_rank_*/checkpoint file below the
# current directory.
#
# Outputs: confirmation on stdout; warnings on stderr.
# Returns: 0 if at least one file was updated, 1 otherwise.
#######################################
main() {
  local checkpoint_file
  local updated=0

  for checkpoint_file in iter_ckpt_rank_*/checkpoint; do
    # An unmatched glob is left as the literal pattern; skip non-files.
    [[ -f "$checkpoint_file" ]] || continue

    # Backup the original checkpoint file. Note that an existing .bak from
    # an earlier run is overwritten. Never modify a file that could not be
    # backed up.
    if ! cp -- "$checkpoint_file" "${checkpoint_file}.bak"; then
      printf 'Skipping %s: backup failed.\n' "$checkpoint_file" >&2
      continue
    fi

    if update_checkpoint_file "$checkpoint_file"; then
      updated=$((updated + 1))
    else
      printf 'Failed to update %s.\n' "$checkpoint_file" >&2
    fi
  done

  if ((updated == 0)); then
    printf 'No checkpoint files were updated.\n' >&2
    return 1
  fi

  echo "Checkpoint files have been updated."
}

main "$@"