{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "69faf98f-4067-4974-a3cf-2b7aa709d65c", "metadata": {}, "outputs": [], "source": [ "pip install coremltools==8.0b1 torch==2.3.0 torchvision torchaudio scikit-learn==1.1.2 " ] }, { "cell_type": "code", "execution_count": 38, "id": "56b386de-6f8c-4814-9159-79aef921c810", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Converting PyTorch Frontend ==> MIL Ops: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋| 440/441 [00:00<00:00, 6548.48 ops/s]\n", "Running MIL frontend_pytorch pipeline: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 139.19 passes/s]\n", "Running MIL default pipeline: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 79/79 [00:01<00:00, 57.60 passes/s]\n", "Running MIL backend_mlprogram pipeline: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 233.95 passes/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "OptimizationConfig LUT\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Running compression pass palettize_weights: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 67/67 [00:00<00:00, 99.79 ops/s]\n", "Running MIL frontend_milinternal pipeline: 0 passes [00:00, ? passes/s]\n", "Running MIL default pipeline: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 77/77 [00:00<00:00, 176.72 passes/s]\n", "Running MIL backend_mlprogram pipeline: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 180.92 passes/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "OptimizationConfig LINEAR\n", "-------- (W4) -------- \n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Running compression pass linear_quantize_weights: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 67/67 [00:00<00:00, 92.51 ops/s]\n", "Running MIL frontend_milinternal pipeline: 0 passes [00:00, ? passes/s]\n", "Running MIL default pipeline: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 77/77 [00:00<00:00, 167.87 passes/s]\n", "Running MIL backend_mlprogram pipeline: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 209.76 passes/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "-------- W8 selected! ---------- \n", "-------- Activation A8 quant! 
---------- \n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Running activation compression pass insert_prefix_quantize_dequantize_pair: 100%|██████████████████████████████████████████████████████████████████████████████████| 522/522 [00:00<00:00, 7993.67 ops/s]\n", "Running compression pass linear_quantize_activations: start calibrating 10 samples\n", "Running compression pass linear_quantize_activations: calibration may take a while ...\n", "Running compression pass linear_quantize_activations: calibrating sample 1/10 succeeds.\n", "Running compression pass linear_quantize_activations: calibrating sample 2/10 succeeds.\n", "Running compression pass linear_quantize_activations: calibrating sample 3/10 succeeds.\n", "Running compression pass linear_quantize_activations: calibrating sample 4/10 succeeds.\n", "Running compression pass linear_quantize_activations: calibrating sample 5/10 succeeds.\n", "Running compression pass linear_quantize_activations: calibrating sample 6/10 succeeds.\n", "Running compression pass linear_quantize_activations: calibrating sample 7/10 succeeds.\n", "Running compression pass linear_quantize_activations: calibrating sample 8/10 succeeds.\n", "Running compression pass linear_quantize_activations: calibrating sample 9/10 succeeds.\n", "Running compression pass linear_quantize_activations: calibrating sample 10/10 succeeds.\n", "Running MIL frontend_milinternal pipeline: 0 passes [00:00, ? passes/s]\n", "Running MIL default pipeline: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 77/77 [00:01<00:00, 56.74 passes/s]\n", "Running MIL backend_mlprogram pipeline: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 76.64 passes/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "OptimizationConfig LUT(LINEAR)\n", "-------- LUT(W8) -------- \n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Running compression pass linear_quantize_weights: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 67/67 [00:00<00:00, 107.97 ops/s]\n", "Running MIL frontend_milinternal pipeline: 0 passes [00:00, ? passes/s]\n", "Running MIL default pipeline: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 77/77 [00:00<00:00, 176.48 passes/s]\n", "Running MIL backend_mlprogram pipeline: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 215.97 passes/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Running compression pass palettize_weights: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 121/121 [00:00<00:00, 116588.74 ops/s]\n", "Running MIL frontend_milinternal pipeline: 0 passes [00:00, ? 
passes/s]\n", "Running MIL default pipeline: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 77/77 [00:00<00:00, 180.58 passes/s]\n", "Running MIL backend_mlprogram pipeline: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 198.24 passes/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "-------- LUT4+W8 selected! ---------- \n", "-------- Activation A8 quant! ---------- \n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Running activation compression pass insert_prefix_quantize_dequantize_pair: 100%|██████████████████████████████████████████████████████████████████████████████████| 522/522 [00:00<00:00, 6895.20 ops/s]\n", "Running compression pass linear_quantize_activations: start calibrating 10 samples\n", "Running compression pass linear_quantize_activations: calibration may take a while ...\n", "Running compression pass linear_quantize_activations: calibrating sample 1/10 succeeds.\n", "Running compression pass linear_quantize_activations: calibrating sample 2/10 succeeds.\n", "Running compression pass linear_quantize_activations: calibrating sample 3/10 succeeds.\n", "Running compression pass linear_quantize_activations: calibrating sample 4/10 succeeds.\n", "Running compression pass linear_quantize_activations: calibrating sample 5/10 succeeds.\n", "Running compression pass linear_quantize_activations: calibrating sample 6/10 succeeds.\n", "Running compression pass linear_quantize_activations: calibrating sample 7/10 succeeds.\n", "Running compression pass linear_quantize_activations: calibrating sample 8/10 succeeds.\n", "Running compression pass linear_quantize_activations: calibrating sample 9/10 succeeds.\n", "Running compression pass linear_quantize_activations: calibrating sample 10/10 succeeds.\n", "Running MIL frontend_milinternal pipeline: 0 passes [00:00, ? 
passes/s]\n", "Running MIL default pipeline: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 77/77 [00:01<00:00, 70.87 passes/s]\n", "Running MIL backend_mlprogram pipeline: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 116.62 passes/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "rnfs-A8W8-LUT4-b1.mlpackage\n", "Done!\n" ] } ], "source": [ "import torch\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", "import torchvision.transforms as transforms\n", "import coremltools as ct\n", "import coremltools.optimize as cto\n", "from PIL import Image\n", "import numpy as np\n", "import requests\n", "import os\n", "\n", "\n", "class BasicBlock(nn.Module):\n", " expansion = 1\n", "\n", " def __init__(self, in_planes, planes, stride=1):\n", " super(BasicBlock, self).__init__()\n", " self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)\n", " self.bn1 = nn.BatchNorm2d(planes)\n", " self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)\n", " self.bn2 = nn.BatchNorm2d(planes)\n", "\n", " self.shortcut = nn.Sequential()\n", " if stride != 1 or in_planes != self.expansion*planes:\n", " self.shortcut = nn.Sequential(\n", " nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False),\n", " nn.BatchNorm2d(self.expansion*planes)\n", " )\n", "\n", " def forward(self, x):\n", " out = F.relu(self.bn1(self.conv1(x)))\n", " out = self.bn2(self.conv2(out))\n", " out += self.shortcut(x)\n", " out = F.relu(out)\n", " return out\n", "\n", "class Bottleneck(nn.Module):\n", " expansion = 4\n", "\n", " def __init__(self, in_planes, planes, stride=1):\n", " super(Bottleneck, self).__init__()\n", " self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)\n", " self.bn1 = nn.BatchNorm2d(planes)\n", " self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)\n", " self.bn2 = nn.BatchNorm2d(planes)\n", " self.conv3 = nn.Conv2d(planes, self.expansion*planes, kernel_size=1, bias=False)\n", " self.bn3 = nn.BatchNorm2d(self.expansion*planes)\n", "\n", " self.shortcut = nn.Sequential()\n", " if stride != 1 or in_planes != self.expansion*planes:\n", " self.shortcut = nn.Sequential(\n", " nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False),\n", " nn.BatchNorm2d(self.expansion*planes)\n", " )\n", "\n", " def forward(self, x):\n", " out = F.relu(self.bn1(self.conv1(x)))\n", " out = F.relu(self.bn2(self.conv2(out)))\n", " out = self.bn3(self.conv3(out))\n", " out += self.shortcut(x)\n", " out = F.relu(out)\n", " return out\n", "\n", "class ResNet(nn.Module):\n", " def __init__(self, block, num_blocks, num_classes=1000):\n", " super(ResNet, self).__init__()\n", " self.in_planes = 64\n", "\n", " self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)\n", " self.bn1 = nn.BatchNorm2d(64)\n", " self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)\n", " self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)\n", " self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)\n", " self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)\n", " self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)\n", " self.avgpool = nn.AdaptiveAvgPool2d((1, 1))\n", " self.fc = 
nn.Linear(512*block.expansion, num_classes)\n", "\n", " def _make_layer(self, block, planes, num_blocks, stride):\n", " strides = [stride] + [1]*(num_blocks-1)\n", " layers = []\n", " for stride in strides:\n", " layers.append(block(self.in_planes, planes, stride))\n", " self.in_planes = planes * block.expansion\n", " return nn.Sequential(*layers)\n", "\n", " def forward(self, x):\n", " x = F.relu(self.bn1(self.conv1(x)))\n", " x = self.maxpool(x)\n", " x = self.layer1(x)\n", " x = self.layer2(x)\n", " x = self.layer3(x)\n", " x = self.layer4(x)\n", " x = self.avgpool(x)\n", " x = torch.flatten(x, 1)\n", " x = self.fc(x)\n", " return x\n", "\n", "def ResNet50():\n", " return ResNet(Bottleneck, [3, 4, 6, 3])\n", "\n", "# Initialize the model\n", "model = ResNet50()\n", "model.eval() # Switch to inference mode\n", "\n", "# Custom batch size and image size\n", "batch_size = 1\n", "image_size = 224 # Input resolution; change as needed (e.g. 1024)\n", "\n", "# Example input tensor with custom batch size and image size\n", "input_tensor = torch.randn(batch_size, 3, image_size, image_size)\n", "\n", "# Trace the model with the example input\n", "traced_model = torch.jit.trace(model, input_tensor)\n", "\n", "# Export for the iOS18 deployment target (needed for the compression modes used below)\n", "coreml_model_iOS18 = ct.convert(\n", " traced_model,\n", " inputs=[ct.TensorType(name=\"input\", shape=input_tensor.shape, dtype=np.float16)],\n", " #classifier_config=ct.ClassifierConfig(class_labels=class_labels),\n", " minimum_deployment_target=ct.target.iOS18\n", ")\n", "a = f\"resnet-from-scratch-b{batch_size}.mlpackage\"\n", "coreml_model_iOS18.save(a)\n", "\n", "# -------------------- quantization LUT only ----------------------------\n", "print(\"OptimizationConfig LUT\")\n", "\n", "config = cto.coreml.OptimizationConfig(\n", " global_config=cto.coreml.OpPalettizerConfig(mode=\"uniform\", nbits=4)\n", ")\n", "compressed_model = cto.coreml.palettize_weights(coreml_model_iOS18, config)\n", "a = f\"rnfs-4bit-b{batch_size}.mlpackage\"\n", "compressed_model.save(a)\n", "\n", "\n", "# -------------------- OptimizationConfig LINEAR ----------------------------\n", "print(\"OptimizationConfig LINEAR\")\n", "\n", "dt = ct.converters.mil.mil.types.int4\n", "print(\"-------- (W4) -------- \")\n", "\n", "weight_config = cto.coreml.OptimizationConfig(\n", " global_config=cto.coreml.OpLinearQuantizerConfig(\n", " mode=\"linear_symmetric\", dtype=dt\n", " )\n", ")\n", "\n", "compressed_model2 = cto.coreml.linear_quantize_weights(coreml_model_iOS18, weight_config)\n", "print(\"-------- W4 selected! ---------- \")\n", "\n", "activation_config = cto.coreml.OptimizationConfig(\n", " global_config=cto.coreml.experimental.OpActivationLinearQuantizerConfig(\n", " mode=\"linear_symmetric\"\n", " )\n", ")\n", "print(\"-------- Activation A8 quant! ---------- \")\n",
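"# --- Illustrative sketch (not part of the original run): building calibration data ---\n", "# linear_quantize_activations calibrates activation scales from sample inputs passed as a\n", "# list of dicts keyed by the Core ML input name (\"input\" here). The random tensors used\n", "# below are placeholders; a helper like this (hypothetical, image paths supplied by you)\n", "# could build a more representative set from real images preprocessed as at inference time:\n", "def make_calibration_samples(image_paths, size=image_size):\n", "    preprocess = transforms.Compose([transforms.Resize((size, size)), transforms.ToTensor()])\n", "    return [{\"input\": preprocess(Image.open(p).convert(\"RGB\")).unsqueeze(0)} for p in image_paths]\n", "\n",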
---------- \")\n", "compressed_model_a8 = cto.coreml.experimental.linear_quantize_activations(\n", " compressed_model2, \n", " activation_config, [{\"input\": torch.randn_like(input_tensor)+i} for i in range(10)]\n", ")\n", "a = f\"rnfs-A4W8-b{batch_size}.mlpackage\"\n", "compressed_model_a8.save(a)\n", "\n", "\n", "# -------------------- OptimizationConfig LUT(LINEAR)\" ----------------------------\n", "print(\"OptimizationConfig LUT(LINEAR)\")\n", "\n", "dt = ct.converters.mil.mil.types.int8 # lut is 4 bit already\n", "print(\"-------- LUT(W8) -------- \")\n", "weight_config = cto.coreml.OptimizationConfig(\n", " global_config=cto.coreml.OpLinearQuantizerConfig(\n", " mode=\"linear_symmetric\", dtype=dt\n", " )\n", ")\n", "\n", "compressed_model1 = cto.coreml.linear_quantize_weights(coreml_model_iOS18, weight_config) \n", "compressed_model2 = cto.coreml.palettize_weights(compressed_model1, config, joint_compression=True)\n", "print(\"-------- LUT4+W8 selected! ---------- \")\n", "\n", "activation_config = cto.coreml.OptimizationConfig(\n", " global_config=cto.coreml.experimental.OpActivationLinearQuantizerConfig(\n", " mode=\"linear_symmetric\"\n", " )\n", ")\n", "print(\"-------- Activation A8 quant! ---------- \")\n", "compressed_model_a8 = cto.coreml.experimental.linear_quantize_activations(\n", " compressed_model2, \n", " activation_config, [{\"input\": torch.randn_like(input_tensor)+i} for i in range(10)]\n", ")\n", "\n", "a = f\"rnfs-A8W8-LUT4-b{batch_size}.mlpackage\"\n", "compressed_model.save(a)\n", "\n", "print(a)\n", "print(\"Done!\")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "6e7808a0-7228-4964-9fa7-6a703a34d6dc", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.14" } }, "nbformat": 4, "nbformat_minor": 5 }