Spaces:
Running
Running
File size: 4,034 Bytes
7db0ae4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 |
#### What this tests ####
# This adds perf testing to the router, to ensure it's never > 50ms slower than the azure-openai sdk.
import sys, os, time, inspect, asyncio, traceback
from datetime import datetime
import pytest
sys.path.insert(0, os.path.abspath("../.."))
import openai, litellm, uuid
from openai import AsyncAzureOpenAI
# Baseline client: raw Azure OpenAI SDK, configured from the same env vars the
# router deployment below uses, so both paths hit the identical backend.
client = AsyncAzureOpenAI(
api_key=os.getenv("AZURE_API_KEY"),
azure_endpoint=os.getenv("AZURE_API_BASE"), # type: ignore
api_version=os.getenv("AZURE_API_VERSION"),
)
# Single-deployment router config pointing at the same Azure deployment, so any
# timing difference measured below is router overhead, not backend variance.
model_list = [
{
"model_name": "azure-test",
"litellm_params": {
"model": "azure/chatgpt-v-2",
"api_key": os.getenv("AZURE_API_KEY"),
"api_base": os.getenv("AZURE_API_BASE"),
"api_version": os.getenv("AZURE_API_VERSION"),
},
}
]
# Router under test; shared by all concurrent calls in the benchmark.
router = litellm.Router(model_list=model_list)
async def _openai_completion():
    """Stream one chat completion via the raw Azure OpenAI SDK.

    Returns the time-to-first-token in seconds (first chunk whose delta
    carries non-null content), or None when the call raises.
    """
    try:
        started = time.time()
        stream = await client.chat.completions.create(
            model="chatgpt-v-2",
            messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
            stream=True,
        )
        ttft = None
        first_ts = None
        first_content_chunk = None
        async for part in stream:
            # Only the first chunk with actual content counts; later chunks
            # are consumed but ignored for timing.
            if ttft is None and part.choices and part.choices[0].delta.content is not None:
                first_ts = time.time()
                ttft = first_ts - started
                first_content_chunk = part
        finished = time.time()
        print(
            "OpenAI Call: ",
            first_content_chunk,
            started,
            first_ts,
            ttft,
            finished,
        )
        return ttft
    except Exception as e:
        # Best-effort benchmark sample: a failed call just yields None.
        print(e)
        return None
async def _router_completion():
    """Stream one chat completion through the litellm Router.

    Returns the time-to-first-token in seconds (first chunk whose delta
    carries non-null content), or None when the call raises. Mirrors
    _openai_completion so the two averages are directly comparable.
    """
    try:
        start_time = time.time()
        response = await router.acompletion(
            model="azure-test",
            messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
            stream=True,
        )
        time_to_first_token = None
        first_token_ts = None
        init_chunk = None
        async for chunk in response:
            if (
                time_to_first_token is None
                and len(chunk.choices) > 0
                and chunk.choices[0].delta.content is not None
            ):
                first_token_ts = time.time()
                time_to_first_token = first_token_ts - start_time
                init_chunk = chunk
        end_time = time.time()
        # Bug fix: if the stream yielded no content token, first_token_ts is
        # still None and `end_time - first_token_ts` raised a TypeError.
        tail_time = end_time - first_token_ts if first_token_ts is not None else None
        print(
            "Router Call: ",
            init_chunk,
            start_time,
            first_token_ts,
            time_to_first_token,
            tail_time,
        )
        return time_to_first_token
    except Exception as e:
        # Best-effort benchmark sample: a failed call just yields None.
        print(e)
        return None
def _avg_time_to_first_token(samples):
    """Average the non-None time-to-first-token samples; None if all failed."""
    usable = [s for s in samples if s is not None]
    return sum(usable) / len(usable) if usable else None
async def test_azure_completion_streaming():
    """
    Test azure streaming call - measure on time to first (non-null) token.

    Fires n concurrent streaming calls via the raw SDK and via the router,
    then asserts the router's average time-to-first-token is within 0.5s of
    the SDK's.
    """
    # NOTE(review): async test with no asyncio marker — presumably relies on
    # pytest asyncio auto mode or an external runner; confirm.
    n = 3  # Number of concurrent tasks
    ## OPENAI AVG. TIME
    tasks = [_openai_completion() for _ in range(n)]
    chat_completions = await asyncio.gather(*tasks)
    # Bug fix: average over the calls that actually succeeded instead of a
    # hard-coded 3, which understated the average whenever a call failed
    # (and produced a vacuous 0.0 average when all of them failed).
    avg_openai_time = _avg_time_to_first_token(chat_completions)
    assert avg_openai_time is not None, "all OpenAI SDK calls failed"
    ## ROUTER AVG. TIME
    tasks = [_router_completion() for _ in range(n)]
    chat_completions = await asyncio.gather(*tasks)
    avg_router_time = _avg_time_to_first_token(chat_completions)
    assert avg_router_time is not None, "all router calls failed"
    ## COMPARE
    print(f"avg_router_time: {avg_router_time}; avg_openai_time: {avg_openai_time}")
    assert avg_router_time < avg_openai_time + 0.5
# asyncio.run(test_azure_completion_streaming())
|