import json
import random
import torch
from typing import Any
from typing import Optional
from fastapi import FastAPI
from pydantic import BaseModel
from vllm import LLM, SamplingParams, RequestOutput


# Don't forget to set HF_TOKEN in the environment when running this app.
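# Example (assumption: a read-access token created in your Hugging Face account settings):
#   export HF_TOKEN=hf_xxxxxxxxxxxxxxxx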

cuda_num_device: int = 0
if torch.cuda.is_available():
    random_seed = 42
    random.seed(random_seed)

    device = torch.device('cuda')
    torch.cuda.manual_seed(random_seed)

    print(f"Using device: {device}")
    print(f"CUDA available and enabled. {torch.cuda}")
    print(f"CUDA is available: {torch.cuda.is_available()}")
    print(f"CUDA device count: {torch.cuda.device_count()}")
    print(f"CUDA current device: {torch.cuda.current_device()}")

    for i in range(torch.cuda.device_count()):
        print('=================================================================')
        print(torch.cuda.get_device_name(i))
        print('Memory Usage:')
        print('Allocated:', round(torch.cuda.memory_allocated(i) / 1024 ** 3, 1), 'GB')
        print('Cached:   ', round(torch.cuda.memory_reserved(i) / 1024 ** 3, 1), 'GB')
app = FastAPI()

# Initialize the LLM engine
# Replace 'your-model-path' with the actual path or name of your model
# example:
# https://huggingface.co/spaces/damienbenveniste/deploy_vLLM/blob/b210a934d4ff7b68254d42fa28736d74649e610d/app.py#L17-L20

engine_llama_3_2: LLM = LLM(
    model='meta-llama/Llama-3.2-3B-Instruct',
    revision="0cb88a4f764b7a12671c53f0838cd831a0843b95",
    # https://github.com/vllm-project/vllm/blob/v0.6.4/vllm/config.py#L1062-L1065
    max_num_batched_tokens=32768,  # Reduced for T4; must match max_model_len
    max_num_seqs=16,               # Reduced for T4
    gpu_memory_utilization=0.85,   # Slightly increased, adjust if needed
    tensor_parallel_size=1,

    # Llama-3.2-3B-Instruct supports a max context length of 131072 tokens, but we reduce it to 32k.
    # 32k tokens is roughly 24k words (words are about 3/4 of the token count); at an average of
    # ~500 (0.5k) words per page, that is 24k / 0.5k = ~48 pages.
    # Using the full context length would be slower, and it does not fit in T4 memory anyway.
    # https://github.com/vllm-project/vllm/blob/v0.6.4/vllm/config.py#L85-L86
    # https://github.com/vllm-project/vllm/blob/v0.6.4/vllm/config.py#L98-L102
    # [rank0]:     raise ValueError(
    # [rank0]: ValueError: The model's max seq len (131072)
    #   is larger than the maximum number of tokens that can be stored in KV cache (57056).
    #   Try increasing `gpu_memory_utilization` or decreasing `max_model_len` when initializing the engine.
    max_model_len=32768,           # Reduced for T4
    enforce_eager=True,            # Disable CUDA graph

    # File "/home/user/.local/lib/python3.12/site-packages/vllm/worker/worker.py",
    # line 479, in _check_if_gpu_supports_dtype
    # Bfloat16 is only supported on GPUs with compute capability of at least 8.0.
    # Your Tesla T4 GPU has compute capability 7.5.
    # You can use float16 instead by explicitly setting the`dtype` flag in CLI, for example: --dtype=half.
    dtype='half',                  # Use 'half' for T4
)
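
# A sketch of choosing the dtype dynamically instead of hard-coding 'half'
# (assumption: device 0 is the GPU vLLM runs on; bfloat16 requires compute capability >= 8.0):
#   major, minor = torch.cuda.get_device_capability(0)
#   dtype = 'bfloat16' if (major, minor) >= (8, 0) else 'half'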

# # ValueError: max_num_batched_tokens (512) is smaller than max_model_len (32768).
# # This effectively limits the maximum sequence length to max_num_batched_tokens and makes vLLM reject longer sequences.
# # Please increase max_num_batched_tokens or decrease max_model_len.
# engine_sailor_chat: LLM = LLM(
#     model='sail/Sailor-4B-Chat',
#     revision="89a866a7041e6ec023dd462adeca8e28dd53c83e",
#     max_num_batched_tokens=32768,    # Reduced for T4
#     max_num_seqs=16,                 # Reduced for T4
#     gpu_memory_utilization=0.85,     # Slightly increased, adjust if needed
#     tensor_parallel_size=1,
#     max_model_len=32768,
#     enforce_eager=True,              # Disable CUDA graph
#     dtype='half',                    # Use 'half' for T4
# )


@app.get("/")
def greet_json():
    cuda_info: dict[str, Any] = {}
    if torch.cuda.is_available():
        cuda_current_device: int = torch.cuda.current_device()
        cuda_info = {
            "device_count": torch.cuda.device_count(),
            "cuda_device": torch.cuda.get_device_name(cuda_current_device),
            "cuda_capability": torch.cuda.get_device_capability(cuda_current_device),
            "allocated":  f"{round(torch.cuda.memory_allocated(cuda_current_device) / 1024 ** 3, 1)} GB",
            "cached": f"{round(torch.cuda.memory_reserved(cuda_current_device) / 1024 ** 3, 1)} GB",
        }

    return {
        "message": f"CUDA availability is {torch.cuda.is_available()}",
        "cuda_info": cuda_info,
        "model": [
            {
                "name": "meta-llama/Llama-3.2-3B-Instruct",
                "revision": "0cb88a4f764b7a12671c53f0838cd831a0843b95",
                "max_model_len": engine_llama_3_2.llm_engine.model_config.max_model_len,
            },
        ]
    }


class GenerationRequest(BaseModel):
    prompt: str
    max_tokens: int = 100
    temperature: float = 0.7
    logit_bias: Optional[dict[int, float]] = None


class GenerationResponse(BaseModel):
    text: Optional[str]
    error: Optional[str]


@app.post("/generate-llama3-2")
def generate_text(request: GenerationRequest) -> dict[str, Any]:
    try:
        sampling_params: SamplingParams = SamplingParams(
            temperature=request.temperature,
            max_tokens=request.max_tokens,
            logit_bias=request.logit_bias,
        )

        # Generate text
        response: list[RequestOutput] = engine_llama_3_2.generate(
            prompts=request.prompt,
            sampling_params=sampling_params
        )

        output: dict[str, Any] = {}
        for item in response:
            outputs: list[dict[str, Any]] = []
            for out in item.outputs:
                outputs.append({
                    "text": out.text,
                })
            output["output"] = outputs

        return {
            "output": output,
        }

    except Exception as e:
        return {
            "error": str(e)
        }
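

# Example request to the endpoint above (a sketch; assumes the app is reachable on
# localhost:8000; adjust host and port to your deployment):
#   curl -X POST http://localhost:8000/generate-llama3-2 \
#     -H "Content-Type: application/json" \
#     -d '{"prompt": "Explain what a KV cache is.", "max_tokens": 100, "temperature": 0.7}'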


# @app.post("/generate-sailor-chat")
# def generate_text(request: GenerationRequest) -> list[RequestOutput] | dict[str, str]:
#     try:
#         sampling_params: SamplingParams = SamplingParams(
#             temperature=request.temperature,
#             max_tokens=request.max_tokens,
#             logit_bias=request.logit_bias,
#         )
#
#         # Generate text
#         return engine_sailor_chat.generate(
#             prompts=request.prompt,
#             sampling_params=sampling_params
#         )
#
#     except Exception as e:
#         return {
#             "error": str(e)
#         }
#
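

# To serve this app (a sketch; assumes the file is saved as app.py; adjust the module
# name, host, and port to your deployment):
#   HF_TOKEN=<your token> uvicorn app:app --host 0.0.0.0 --port 8000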