fix(tensor_parallel_size): set to 1
main.py CHANGED
@@ -43,7 +43,7 @@ engine_llama_3_2: LLM = LLM(
     max_num_batched_tokens=512, # Reduced for T4
     max_num_seqs=16, # Reduced for T4
     gpu_memory_utilization=0.85, # Slightly increased, adjust if needed
-    tensor_parallel_size=
+    tensor_parallel_size=1,
     # Llama-3.2-3B-Instruct max context length is 131072, but we reduce it to 32k.
     # 32k tokens, 3/4 of 32k is 24k words, each page average is 500 or 0.5k words,
     # so that's basically 24k / .5k = 24 x 2 =~48 pages.
@@ -64,7 +64,7 @@ engine_sailor_chat: LLM = LLM(
     max_num_batched_tokens=512, # Reduced for T4
     max_num_seqs=16, # Reduced for T4
     gpu_memory_utilization=0.85, # Slightly increased, adjust if needed
-    tensor_parallel_size=
+    tensor_parallel_size=1,
     # max_model_len=32768,
     enforce_eager=True, # Disable CUDA graph
     dtype='auto', # Use 'half' if you want half precision