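"""Convert an RTMO (or YOLO-NAS-Pose) ONNX pose model to mixed precision.

Uses onnxconverter_common's auto_convert_mixed_precision_model_path to perform
the conversion, validating the converted model against the original on a test
image: the post-processed keypoints and scores must agree within the given
rtol/atol tolerances.
"""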
import numpy as np
import onnx
from onnxconverter_common import auto_mixed_precision_model_path
import argparse
from rtmo_gpu import RTMO_GPU, draw_skeleton
import cv2

# Execution providers in priority order: TensorRT (with FP16 enabled), then CUDA, then CPU.
PROVIDERS = [('TensorrtExecutionProvider', {'trt_fp16_enable': True}),
             'CUDAExecutionProvider',
             'CPUExecutionProvider']

def detect_model_input_size(model_path):
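    """Return the (height, width) of the model's 'input' tensor.

    For example, an 'input' of shape (1, 3, 640, 640) would yield (640, 640).
    """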
    model = onnx.load(model_path)
    for input_tensor in model.graph.input:
        # Assuming the input node is named 'input'
        if input_tensor.name == 'input':
            tensor_shape = input_tensor.type.tensor_type.shape
            # Extract the dimensions: (batch_size, channels, height, width)
            dims = [dim.dim_value for dim in tensor_shape.dim]
            # Replace dynamic batch size (-1 or 0) with 1
            if dims[0] < 1:
                dims[0] = 1
            return tuple(dims[2:4])  # Return (height, width)
    raise ValueError("Input node 'input' not found in the model")

def load_and_preprocess_image(image_path, preprocess=None):
    image = cv2.imread(image_path)
    if image is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")

    if preprocess is not None:
        image = preprocess(image)

    return image

def compare_result(res1, res2):
    keypoints1, scores1 = res1
    keypoints2, scores2 = res2

    from termcolor import colored

    header = f"({colored('x', 'blue')},{colored('y', 'green')},{colored('score', 'red')})"

    def fmt(x, y, s):
        return (f"({colored(f'{x:4.1f}', 'blue')}, "
                f"{colored(f'{y:4.1f}', 'green')}, "
                f"{colored(f'{s:5.4f}', 'red')})")

    for j, (d1, d2) in enumerate(zip(keypoints1, keypoints2)):
        print(f'Detection {j}:')
        for i, (j1, j2) in enumerate(zip(d1, d2)):
            (x1, y1), (x2, y2) = j1, j2
            s1, s2 = scores1[j][i], scores2[j][i]
            print(f'Joint-{i:2d}:')
            print(f'\tOriginal  {header} = {fmt(x1, y1, s1)}')
            print(f'\tConverted {header} = {fmt(x2, y2, s2)}')

def validate_pose(res1, res2, postprocess=None):
    if postprocess is not None:
        res1 = postprocess(res1)
        res2 = postprocess(res2)

    compare_result(res1, res2)

    # rtol/atol come from the CLI arguments parsed in __main__.
    for r1, r2 in zip(res1, res2):
        if not np.allclose(r1, r2, rtol=args.rtol, atol=args.atol):
            return False
    return True

def infer_on_image(onnx_model, model_input_size, test_image_path):
    body = RTMO_GPU(onnx_model=onnx_model,
                    model_input_size=model_input_size,
                    is_yolo_nas_pose=args.yolo_nas_pose)

    frame = cv2.imread(test_image_path)
    img_show = frame.copy()
    keypoints, scores = body(img_show)

    img_show = draw_skeleton(img_show,
                             keypoints,
                             scores,
                             kpt_thr=0.3,
                             line_width=2)
    img_show = cv2.resize(img_show, (788, 525))
    cv2.imshow(f'{args.target_model_path}', img_show)
    cv2.waitKey(0)
    cv2.destroyAllWindows()

def main(args):
    model_input_size = detect_model_input_size(args.source_model_path)

    body = RTMO_GPU(onnx_model=args.source_model_path,
                    model_input_size=model_input_size,
                    is_yolo_nas_pose=args.yolo_nas_pose)
    
    def preprocess(image, body, is_yolo_nas_pose):
        img, _ = body.preprocess(image)

        # HWC -> CHW, then add a batch dimension: (1, 3, H, W)
        img = img.transpose(2, 0, 1)
        # YOLO-NAS-Pose expects uint8 input; RTMO expects float32
        img = np.ascontiguousarray(img, dtype=np.uint8 if is_yolo_nas_pose else np.float32)
        img = img[None, :, :, :]
        return img
    
    image = load_and_preprocess_image(args.test_image_path, lambda img: preprocess(img, body, args.yolo_nas_pose))

    input_feed = {'input': image}  # key must match the model's input tensor name

    auto_mixed_precision_model_path.auto_convert_mixed_precision_model_path(
        source_model_path=args.source_model_path,
        input_feed=input_feed,
        target_model_path=args.target_model_path,
        customized_validate_func=lambda res1, res2: validate_pose(res1, res2, body.postprocess),
        rtol=args.rtol, atol=args.atol,
        provider=PROVIDERS,
        keep_io_types=True,
        verbose=True)

    infer_on_image(args.target_model_path, model_input_size, args.test_image_path)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Convert an ONNX model to mixed precision format.")
    parser.add_argument("source_model_path", type=str, help="Path to the source ONNX model.")
    parser.add_argument("target_model_path", type=str, help="Path where the mixed precision model will be saved.")
    parser.add_argument("test_image_path", type=str, help="Path to a test image for validating the model conversion.")
    parser.add_argument('--rtol', type=float, default=0.01, help='Relative tolerance used during validation.')
    parser.add_argument('--atol', type=float, default=0.001, help='Absolute tolerance used during validation.')
    parser.add_argument('--yolo_nas_pose', action='store_true', help='Use a YOLO-NAS-Pose model (flat format only) instead of an RTMO model.')

    args = parser.parse_args()
    
    main(args)
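
# Example invocation (script and file names are illustrative):
#   python convert_mixed_precision.py rtmo-m.onnx rtmo-m_fp16.onnx test.jpg --rtol 0.01 --atol 0.001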