Spaces:
Running
Running
File size: 4,034 Bytes
7db0ae4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 |
#### What this tests ####
# This adds perf testing to the router, to ensure it's never > 50ms slower than the azure-openai sdk.
import sys, os, time, inspect, asyncio, traceback
from datetime import datetime
import pytest
sys.path.insert(0, os.path.abspath("../.."))
import openai, litellm, uuid
from openai import AsyncAzureOpenAI
# Baseline client: raw Azure OpenAI SDK, configured from the same env vars the
# router deployment below uses, so both paths hit the identical backend.
client = AsyncAzureOpenAI(
api_key=os.getenv("AZURE_API_KEY"),
azure_endpoint=os.getenv("AZURE_API_BASE"), # type: ignore
api_version=os.getenv("AZURE_API_VERSION"),
)
# Single-deployment router config pointing at the same Azure deployment, so any
# timing difference measured below is router overhead, not backend variance.
model_list = [
{
"model_name": "azure-test",
"litellm_params": {
"model": "azure/chatgpt-v-2",
"api_key": os.getenv("AZURE_API_KEY"),
"api_base": os.getenv("AZURE_API_BASE"),
"api_version": os.getenv("AZURE_API_VERSION"),
},
}
]
# Router under test; shared by all concurrent calls in the benchmark.
router = litellm.Router(model_list=model_list)
async def _openai_completion():
    """Stream one chat completion via the raw Azure OpenAI SDK.

    Returns the time-to-first-token in seconds (first chunk whose delta
    carries non-null content), or None when the call raises.
    """
    try:
        started = time.time()
        stream = await client.chat.completions.create(
            model="chatgpt-v-2",
            messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
            stream=True,
        )
        ttft = None
        first_ts = None
        first_content_chunk = None
        async for part in stream:
            # Only the first chunk with actual content counts; later chunks
            # are consumed but ignored for timing.
            if ttft is None and part.choices and part.choices[0].delta.content is not None:
                first_ts = time.time()
                ttft = first_ts - started
                first_content_chunk = part
        finished = time.time()
        print(
            "OpenAI Call: ",
            first_content_chunk,
            started,
            first_ts,
            ttft,
            finished,
        )
        return ttft
    except Exception as e:
        # Best-effort benchmark sample: a failed call just yields None.
        print(e)
        return None
async def _router_completion():
    """Stream one chat completion through the litellm Router.

    Returns the time-to-first-token in seconds (first chunk whose delta
    carries non-null content), or None when the call raises. Mirrors
    _openai_completion so the two averages are directly comparable.
    """
    try:
        start_time = time.time()
        response = await router.acompletion(
            model="azure-test",
            messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
            stream=True,
        )
        time_to_first_token = None
        first_token_ts = None
        init_chunk = None
        async for chunk in response:
            if (
                time_to_first_token is None
                and len(chunk.choices) > 0
                and chunk.choices[0].delta.content is not None
            ):
                first_token_ts = time.time()
                time_to_first_token = first_token_ts - start_time
                init_chunk = chunk
        end_time = time.time()
        # Bug fix: if the stream yielded no content token, first_token_ts is
        # still None and `end_time - first_token_ts` raised a TypeError.
        tail_time = end_time - first_token_ts if first_token_ts is not None else None
        print(
            "Router Call: ",
            init_chunk,
            start_time,
            first_token_ts,
            time_to_first_token,
            tail_time,
        )
        return time_to_first_token
    except Exception as e:
        # Best-effort benchmark sample: a failed call just yields None.
        print(e)
        return None
def _avg_time_to_first_token(samples):
    """Average the non-None time-to-first-token samples; None if all failed."""
    usable = [s for s in samples if s is not None]
    return sum(usable) / len(usable) if usable else None
async def test_azure_completion_streaming():
    """
    Test azure streaming call - measure on time to first (non-null) token.

    Fires n concurrent streaming calls via the raw SDK and via the router,
    then asserts the router's average time-to-first-token is within 0.5s of
    the SDK's.
    """
    # NOTE(review): async test with no asyncio marker — presumably relies on
    # pytest asyncio auto mode or an external runner; confirm.
    n = 3  # Number of concurrent tasks
    ## OPENAI AVG. TIME
    tasks = [_openai_completion() for _ in range(n)]
    chat_completions = await asyncio.gather(*tasks)
    # Bug fix: average over the calls that actually succeeded instead of a
    # hard-coded 3, which understated the average whenever a call failed
    # (and produced a vacuous 0.0 average when all of them failed).
    avg_openai_time = _avg_time_to_first_token(chat_completions)
    assert avg_openai_time is not None, "all OpenAI SDK calls failed"
    ## ROUTER AVG. TIME
    tasks = [_router_completion() for _ in range(n)]
    chat_completions = await asyncio.gather(*tasks)
    avg_router_time = _avg_time_to_first_token(chat_completions)
    assert avg_router_time is not None, "all router calls failed"
    ## COMPARE
    print(f"avg_router_time: {avg_router_time}; avg_openai_time: {avg_openai_time}")
    assert avg_router_time < avg_openai_time + 0.5
# asyncio.run(test_azure_completion_streaming())
|