#!/bin/bash

# ---- Benchmark configuration ----

# Where inputs are searched for and where phonemized outputs are written.
input_dir='txt'
output_dir='sangraha_hi_phonemized'

# Language code. Not exported by this script, so jobs started through
# GNU Parallel do not inherit this value.
lang='hi'

# Workload limits: num_files caps how many inputs are taken (`head -n`);
# num_jobs goes to `parallel -j`, where GNU Parallel documents a negative N
# as "number of CPU threads minus N".
num_jobs='-1'
num_files='50000'

# Phonemize a single text file and report the outcome.
#
# Arguments:
#   $1 - path of the input text file
#   $2 - path of the output file; missing parent directories are created
#   $3 - (optional) language code passed to `phonemize -l`; defaults to "hi"
# Outputs:
#   "Processed: <input> -> <output>" on stdout on success,
#   a diagnostic on stderr on failure
# Returns:
#   0 on success; non-zero when the output directory cannot be created or
#   phonemize fails, so the caller (e.g. GNU Parallel) can see the failure
process_file() {
    local input_file="$1"
    local output_file="$2"
    local lang="${3:-hi}"
    local out_dir

    # Create the output directory and its parent directories if they don't exist
    out_dir=$(dirname -- "$output_file") || return 1
    if ! mkdir -p -- "$out_dir"; then
        echo "Failed to create directory: $out_dir" >&2
        return 1
    fi

    # Only announce success when phonemize actually succeeded
    if ! phonemize --quiet -l "$lang" "$input_file" -o "$output_file" --strip --language-switch remove-flags --preserve-punctuation; then
        echo "Failed: $input_file -> $output_file" >&2
        return 1
    fi
    echo "Processed: $input_file -> $output_file"
}

# Export the function so the shells started by GNU Parallel can call it
export -f process_file

# Start the timer
start_time=$(date +%s)

# Collect the inputs up front (at most $num_files) so the report can state how
# many files were actually found rather than the configured cap
input_files=()
while IFS= read -r input_path; do
    input_files+=("$input_path")
done < <(find "$input_dir" -type f -name "*.txt" | head -n "$num_files")
files_found=${#input_files[@]}

# Use GNU Parallel to process the files in parallel.
# {/} is Parallel's own basename placeholder. A shell-level $(basename {})
# would be expanded by this script before Parallel runs, yielding the literal
# "{}" and therefore the full input path instead of the basename.
# NOTE: inputs in different subdirectories that share a basename map to the
# same output file.
parallel_status=0
if (( files_found > 0 )); then
    printf '%s\n' "${input_files[@]}" |
        parallel -j "$num_jobs" process_file "{}" "${output_dir}/phn_{/}" ||
        parallel_status=$?
fi

# End the timer
end_time=$(date +%s)

# Calculate the elapsed time
elapsed_time=$((end_time - start_time))

# Convert elapsed time to minutes and seconds
minutes=$((elapsed_time / 60))
seconds=$((elapsed_time % 60))

# Print the benchmark results
echo "Benchmark Results:"
echo "Number of files processed: $files_found"
echo "Number of parallel jobs: $num_jobs"
echo "Elapsed time: $minutes minutes $seconds seconds"

# Surface a failing run on stderr without altering the report above
if (( parallel_status != 0 )); then
    echo "Warning: parallel exited with status $parallel_status" >&2
fi