aws-neuron
/

optimum-neuron-cache

Model card Files Files and versions Community

299

philschmid HF staff committed on Mar 5, 2024

Commit

1960ccb

verified ·

1 Parent(s): c813817

Create inference-cache-config/llama.json

Browse files

Files changed (1) hide show

inference-cache-config/llama.json +110 -0

inference-cache-config/llama.json ADDED Viewed

	@@ -0,0 +1,110 @@

+{
+   "meta-llama/Llama-2-7b-chat-hf": [
+    {
+      "batch_size": 1,
+      "sequence_length": 4096,
+      "num_cores": 2,
+      "auto_cast_type": "fp16"
+    },
+    {
+      "batch_size": 1,
+      "sequence_length": 4096,
+      "num_cores": 8,
+      "auto_cast_type": "fp16"
+    },
+    {
+      "batch_size": 1,
+      "sequence_length": 4096,
+      "num_cores": 24,
+      "auto_cast_type": "fp16"
+    },
+    {
+      "batch_size": 4,
+      "sequence_length": 4096,
+      "num_cores": 2,
+      "auto_cast_type": "fp16"
+    },
+    {
+      "batch_size": 4,
+      "sequence_length": 4096,
+      "num_cores": 8,
+      "auto_cast_type": "fp16"
+    },
+    {
+      "batch_size": 4,
+      "sequence_length": 4096,
+      "num_cores": 24,
+      "auto_cast_type": "fp16"
+    },
+    {
+      "batch_size": 8,
+      "sequence_length": 4096,
+      "num_cores": 8,
+      "auto_cast_type": "fp16"
+    },
+    {
+      "batch_size": 8,
+      "sequence_length": 4096,
+      "num_cores": 24,
+      "auto_cast_type": "fp16"
+    },
+    {
+      "batch_size": 16,
+      "sequence_length": 4096,
+      "num_cores": 8,
+      "auto_cast_type": "fp16"
+    },
+    {
+      "batch_size": 16,
+      "sequence_length": 4096,
+      "num_cores": 24,
+      "auto_cast_type": "fp16"
+    }
+  ],
+  "meta-llama/Llama-2-13b-chat-hf": [
+    {
+      "batch_size": 1,
+      "sequence_length": 4096,
+      "num_cores": 8,
+      "auto_cast_type": "fp16"
+    },
+    {
+      "batch_size": 1,
+      "sequence_length": 4096,
+      "num_cores": 24,
+      "auto_cast_type": "fp16"
+    },
+    {
+      "batch_size": 4,
+      "sequence_length": 4096,
+      "num_cores": 8,
+      "auto_cast_type": "fp16"
+    },
+    {
+      "batch_size": 4,
+      "sequence_length": 4096,
+      "num_cores": 24,
+      "auto_cast_type": "fp16"
+    },
+    {
+      "batch_size": 8,
+      "sequence_length": 4096,
+      "num_cores": 8,
+      "auto_cast_type": "fp16"
+    },
+    {
+      "batch_size": 8,
+      "sequence_length": 4096,
+      "num_cores": 24,
+      "auto_cast_type": "fp16"
+    }
+  ],
+  "meta-llama/Llama-2-70b-chat-hf": [
+    {
+      "batch_size": 1,
+      "sequence_length": 4096,
+      "num_cores": 24,
+      "auto_cast_type": "fp16"
+    }
+  ]
+}