yusufs committed
Commit 84c6c4a · 1 parent: 2457cd7

fix(tensor_parallel_size): set to 1

Files changed (1): main.py (+2 −2)
main.py CHANGED

@@ -43,7 +43,7 @@ engine_llama_3_2: LLM = LLM(
     max_num_batched_tokens=512, # Reduced for T4
     max_num_seqs=16, # Reduced for T4
     gpu_memory_utilization=0.85, # Slightly increased, adjust if needed
-    tensor_parallel_size=cuda_num_device,
+    tensor_parallel_size=1,
     # Llama-3.2-3B-Instruct max context length is 131072, but we reduce it to 32k.
     # 32k tokens, 3/4 of 32k is 24k words, each page average is 500 or 0.5k words,
     # so that's basically 24k / .5k = 24 x 2 =~ 48 pages.
@@ -64,7 +64,7 @@ engine_sailor_chat: LLM = LLM(
     max_num_batched_tokens=512, # Reduced for T4
     max_num_seqs=16, # Reduced for T4
     gpu_memory_utilization=0.85, # Slightly increased, adjust if needed
-    tensor_parallel_size=cuda_num_device,
+    tensor_parallel_size=1,
     # max_model_len=32768,
     enforce_eager=True, # Disable CUDA graph
     dtype='auto', # Use 'half' if you want half precision
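
For context, this is roughly what one of the affected constructor calls looks like after the change. It is a sketch, not the actual file contents: the model ID is an assumption inferred from the variable name, and only parameters visible in the diff (plus the max_model_len value the comments describe) are shown.

from vllm import LLM

# Sketch of the post-commit configuration for a single NVIDIA T4.
# The model ID below is an assumption based on the variable name;
# the real main.py may pass a different path and extra arguments.
engine_llama_3_2: LLM = LLM(
    model="meta-llama/Llama-3.2-3B-Instruct",  # assumed model ID
    max_num_batched_tokens=512,   # reduced for T4
    max_num_seqs=16,              # reduced for T4
    gpu_memory_utilization=0.85,
    tensor_parallel_size=1,       # this commit: pin to a single GPU
    max_model_len=32768,          # native context is 131072, capped to 32k per the comment
    enforce_eager=True,           # disable CUDA graph capture
    dtype="auto",                 # use 'half' for explicit fp16
)

Hardcoding 1 is the safe choice on a single-GPU machine: if the old cuda_num_device variable held a CUDA device index (typically 0) rather than a device count, vLLM would have been asked for a zero-way tensor-parallel group, which is invalid. If multi-GPU support were wanted later, a count-based alternative would be something like tensor_parallel_size=max(torch.cuda.device_count(), 1).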