fix(tensor_parallel_size): set to 1
main.py CHANGED
@@ -43,7 +43,7 @@ engine_llama_3_2: LLM = LLM(
     max_num_batched_tokens=512, # Reduced for T4
     max_num_seqs=16, # Reduced for T4
     gpu_memory_utilization=0.85, # Slightly increased, adjust if needed
-    tensor_parallel_size=
+    tensor_parallel_size=1,
     # Llama-3.2-3B-Instruct max context length is 131072, but we reduce it to 32k.
     # 32k tokens, 3/4 of 32k is 24k words, each page average is 500 or 0.5k words,
     # so that's basically 24k / .5k = 24 x 2 =~48 pages.
@@ -64,7 +64,7 @@ engine_sailor_chat: LLM = LLM(
     max_num_batched_tokens=512, # Reduced for T4
     max_num_seqs=16, # Reduced for T4
     gpu_memory_utilization=0.85, # Slightly increased, adjust if needed
-    tensor_parallel_size=
+    tensor_parallel_size=1,
     # max_model_len=32768,
     enforce_eager=True, # Disable CUDA graph
     dtype='auto', # Use 'half' if you want half precision