metadata
license: llama2
To try this out in a production-like environment, please use the pre-built Docker image:
docker pull docker-eu-public.artifactory.swg-devops.com/res-zrl-snap-docker-local/tgis-os:spec.7
docker run -d --rm --gpus all \
--name my-tgis-server \
-p 8033:8033 \
-v /path/to/all/models:/models \
-e MODEL_NAME=/models/model_weights/llama/13B-F \
-e SPECULATOR_PATH=/models/speculator_weights/llama/13B-F \
-e FLASH_ATTENTION=true \
-e PAGED_ATTENTION=true \
-e DTYPE_STR=float16 \
docker-eu-public.artifactory.swg-devops.com/res-zrl-snap-docker-local/tgis-os:spec.7
# check logs and wait for "gRPC server started on port 8033" and "HTTP server started on port 3000"
docker logs my-tgis-server -f
# get the sample client and run it (note: the first prompt takes longer because of warmup)
conda create -n tgis-env python=3.11
conda activate tgis-env
git clone --branch speculative-decoding --single-branch https://github.com/tdoublep/text-generation-inference.git
cd text-generation-inference/integration_tests
make gen-client
pip install . --no-cache-dir
python sample_client.py
To try this out with the fms-native compiled model, please execute the following:
batch_size=1 (compile + cudagraphs)
git clone https://github.com/foundation-model-stack/fms-extras
(cd fms-extras && pip install -e .)
pip install transformers==4.35.0 sentencepiece numpy
python fms-extras/scripts/paged_speculative_inference.py \
--variant=13b \
--model_path=/path/to/model_weights/llama/13B-F \
--model_source=hf \
--tokenizer=/path/to/llama/13B-F \
--speculator_path=/path/to/speculator_weights/llama/13B-F \
--speculator_source=hf \
--compile \
--compile_mode=reduce-overhead
batch_size=1 (compile)
git clone https://github.com/foundation-model-stack/fms-extras
(cd fms-extras && pip install -e .)
pip install transformers==4.35.0 sentencepiece numpy
python fms-extras/scripts/paged_speculative_inference.py \
--variant=13b \
--model_path=/path/to/model_weights/llama/13B-F \
--model_source=hf \
--tokenizer=/path/to/llama/13B-F \
--speculator_path=/path/to/speculator_weights/llama/13B-F \
--speculator_source=hf \
--compile
batch_size=4 (compile)
git clone https://github.com/foundation-model-stack/fms-extras
(cd fms-extras && pip install -e .)
pip install transformers==4.35.0 sentencepiece numpy
python fms-extras/scripts/paged_speculative_inference.py \
--variant=13b \
--model_path=/path/to/model_weights/llama/13B-F \
--model_source=hf \
--tokenizer=/path/to/llama/13B-F \
--speculator_path=/path/to/speculator_weights/llama/13B-F \
--speculator_source=hf \
--batch_input \
--compile \