Spaces:
Runtime error
Runtime error
ncoop57
committed on
Commit
•
b97f6e6
1
Parent(s):
c457faa
Initial Commit with starter container
Browse files- Dockerfile +13 -0
- README.md +2 -2
- __init__.py +0 -0
- app.py +65 -0
- cgtok/added_tokens.json +40 -0
- cgtok/merges.txt +0 -0
- cgtok/openai_format/tokenizer.json +0 -0
- cgtok/openai_format/vocab.bpe +0 -0
- cgtok/special_tokens_map.json +5 -0
- cgtok/tokenizer.json +0 -0
- cgtok/tokenizer_config.json +10 -0
- cgtok/vocab.json +0 -0
- config/__init__.py +0 -0
- config/log_config.py +27 -0
- models.py +23 -0
- requirements.txt +6 -0
- utils/__init__.py +0 -0
- utils/codegen.py +258 -0
- utils/errors.py +19 -0
Dockerfile
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FROM python:3.10-slim-buster

# utils/codegen.py loads the tokenizer from /python-docker/cgtok/tokenizer.json,
# so the application tree has to land directly under this directory.
WORKDIR /python-docker

# This commit places requirements.txt, app.py and cgtok/ at the repository
# root (there is no copilot_proxy/ directory in the file list), so copy from
# the build-context root. Requirements are copied first so the dependency
# layer stays cached when only application code changes.
COPY requirements.txt requirements.txt

RUN pip3 install --no-cache-dir -r requirements.txt

COPY . .

EXPOSE 5000

CMD ["uvicorn", "--host", "0.0.0.0", "--port", "5000", "app:app"]
|
README.md
CHANGED
@@ -1,10 +1,10 @@
|
|
1 |
---
|
2 |
title: Santacoder Openai
|
3 |
-
emoji:
|
4 |
colorFrom: gray
|
5 |
colorTo: blue
|
6 |
sdk: docker
|
7 |
pinned: false
|
8 |
---
|
9 |
|
10 |
-
|
|
|
1 |
---
|
2 |
title: Santacoder Openai
|
3 |
+
emoji: 🤖
|
4 |
colorFrom: gray
|
5 |
colorTo: blue
|
6 |
sdk: docker
|
7 |
pinned: false
|
8 |
---
|
9 |
|
10 |
+
This space uses the awesome FauxPilot Copilot server from this [repo](https://github.com/fauxpilot/fauxpilot/tree/main/copilot_proxy).
|
__init__.py
ADDED
File without changes
|
app.py
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
import logging.config
import os

import uvicorn
from fastapi import FastAPI, Request, Response
from fastapi.responses import JSONResponse
from sse_starlette.sse import EventSourceResponse

from config.log_config import uvicorn_logger
from models import OpenAIinput
from utils.codegen import CodeGenProxy
from utils.errors import FauxPilotException
|
13 |
+
|
14 |
+
# Install the access-log configuration (adds timestamps to uvicorn's access log).
logging.config.dictConfig(uvicorn_logger)

# Environment variables are always strings, so "false"/"0" would be truthy if
# passed through unchanged. Only an unset/empty value or an explicit negative
# spelling disables verbose mode; any other value keeps it enabled.
_triton_verbosity = os.environ.get("TRITON_VERBOSITY", "")

codegen = CodeGenProxy(
    host=os.environ.get("TRITON_HOST", "triton"),
    # Normalise to int so the value matches CodeGenProxy's annotated type
    # whether it comes from the environment (str) or the default (int).
    port=int(os.environ.get("TRITON_PORT", 8001)),
    verbose=_triton_verbosity.strip().lower() not in {"", "0", "false", "no", "off"},
)

app = FastAPI(
    title="FauxPilot",
    # The trailing space keeps the two concatenated literals from running
    # together as "CodeGenmodels".
    description="This is an attempt to build a locally hosted version of GitHub Copilot. It uses the SalesForce CodeGen "
                "models inside of NVIDIA's Triton Inference Server with the FasterTransformer backend.",
    docs_url="/",
    swagger_ui_parameters={"defaultModelsExpandDepth": -1}
)
|
29 |
+
|
30 |
+
@app.exception_handler(FauxPilotException)
async def fauxpilot_handler(request: Request, exc: FauxPilotException):
    """Turn a ``FauxPilotException`` into an HTTP 400 JSON response.

    The response body is whatever ``exc.json()`` returns; ``request`` is
    accepted because the exception-handler signature requires it but is unused.
    """
    error_payload = exc.json()
    return JSONResponse(content=error_payload, status_code=400)
|
36 |
+
|
37 |
+
@app.post("/v1/engines/codegen/completions")
@app.post("/v1/completions")
async def completions(data: OpenAIinput):
    """Serve a completion request on both supported endpoint paths.

    The validated request is converted to a plain dict and handed to the
    module-level ``codegen`` proxy. A ``TokensExceedsMaximum`` error from the
    proxy is re-raised as a ``FauxPilotException`` of type
    ``invalid_request_error`` (rendered as HTTP 400 by the registered handler).

    Returns an ``EventSourceResponse`` when streaming was requested, otherwise
    a plain JSON ``Response``.
    """
    data = data.dict()
    try:
        content = codegen(data=data)
    except codegen.TokensExceedsMaximum as E:
        # Chain the original error so the traceback keeps the root cause.
        raise FauxPilotException(
            message=str(E),
            type="invalid_request_error",
            param=None,
            code=None,
        ) from E

    # Use truthiness, matching CodeGenProxy.__call__: an explicit
    # ``"stream": false`` makes the proxy return a single JSON string, which
    # must not be wrapped in an event stream.
    if data.get("stream"):
        return EventSourceResponse(
            content=content,
            status_code=200,
            media_type="text/event-stream"
        )

    return Response(
        status_code=200,
        content=content,
        media_type="application/json"
    )
|
63 |
+
|
64 |
+
if __name__ == "__main__":
    # Running this module directly starts uvicorn on all interfaces, port 5000.
    server_options = {"host": "0.0.0.0", "port": 5000}
    uvicorn.run("app:app", **server_options)
|
cgtok/added_tokens.json
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"\t\t": 50294,
|
3 |
+
"\t\t\t": 50293,
|
4 |
+
"\t\t\t\t": 50292,
|
5 |
+
"\t\t\t\t\t": 50291,
|
6 |
+
"\t\t\t\t\t\t": 50290,
|
7 |
+
"\t\t\t\t\t\t\t": 50289,
|
8 |
+
"\t\t\t\t\t\t\t\t": 50288,
|
9 |
+
"\t\t\t\t\t\t\t\t\t": 50287,
|
10 |
+
" ": 50286,
|
11 |
+
" ": 50285,
|
12 |
+
" ": 50284,
|
13 |
+
" ": 50283,
|
14 |
+
" ": 50282,
|
15 |
+
" ": 50281,
|
16 |
+
" ": 50280,
|
17 |
+
" ": 50279,
|
18 |
+
" ": 50278,
|
19 |
+
" ": 50277,
|
20 |
+
" ": 50276,
|
21 |
+
" ": 50275,
|
22 |
+
" ": 50274,
|
23 |
+
" ": 50273,
|
24 |
+
" ": 50272,
|
25 |
+
" ": 50271,
|
26 |
+
" ": 50270,
|
27 |
+
" ": 50269,
|
28 |
+
" ": 50268,
|
29 |
+
" ": 50267,
|
30 |
+
" ": 50266,
|
31 |
+
" ": 50265,
|
32 |
+
" ": 50264,
|
33 |
+
" ": 50263,
|
34 |
+
" ": 50262,
|
35 |
+
" ": 50261,
|
36 |
+
" ": 50260,
|
37 |
+
" ": 50259,
|
38 |
+
" ": 50258,
|
39 |
+
" ": 50257
|
40 |
+
}
|
cgtok/merges.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
cgtok/openai_format/tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
cgtok/openai_format/vocab.bpe
ADDED
The diff for this file is too large to render.
See raw diff
|
|
cgtok/special_tokens_map.json
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"bos_token": "<|endoftext|>",
|
3 |
+
"eos_token": "<|endoftext|>",
|
4 |
+
"unk_token": "<|endoftext|>"
|
5 |
+
}
|
cgtok/tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
cgtok/tokenizer_config.json
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"add_prefix_space": false,
|
3 |
+
"bos_token": "<|endoftext|>",
|
4 |
+
"eos_token": "<|endoftext|>",
|
5 |
+
"model_max_length": 1024,
|
6 |
+
"name_or_path": "Salesforce/codegen-350M-mono",
|
7 |
+
"special_tokens_map_file": null,
|
8 |
+
"tokenizer_class": "CodeGenTokenizer",
|
9 |
+
"unk_token": "<|endoftext|>"
|
10 |
+
}
|
cgtok/vocab.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
config/__init__.py
ADDED
File without changes
|
config/log_config.py
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Logging configuration (logging.config.dictConfig schema) whose purpose is to
# add timestamps to uvicorn's access log lines.

# Formatter: uvicorn's AccessFormatter with %(asctime)s added to the format.
_ACCESS_FORMATTER = {
    "()": "uvicorn.logging.AccessFormatter",
    "fmt": '%(levelprefix)s %(asctime)s :: %(client_addr)s - "%(request_line)s" %(status_code)s',
    "use_colors": True,
}

# Handler: stream the formatted records to stdout.
_ACCESS_HANDLER = {
    "formatter": "access",
    "class": "logging.StreamHandler",
    "stream": "ext://sys.stdout",
}

# Logger: route "uvicorn.access" through the handler above without
# propagating to ancestor loggers. No explicit level is set.
_ACCESS_LOGGER = {
    "handlers": ["access"],
    "propagate": False,
}

uvicorn_logger = {
    "version": 1,
    "disable_existing_loggers": False,
    "formatters": {"access": _ACCESS_FORMATTER},
    "handlers": {"access": _ACCESS_HANDLER},
    "loggers": {"uvicorn.access": _ACCESS_LOGGER},
}
|
models.py
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Optional, Union
|
2 |
+
|
3 |
+
from pydantic import BaseModel
|
4 |
+
|
5 |
+
|
6 |
+
class OpenAIinput(BaseModel):
    """Request body accepted by the completion endpoints.

    Every field except ``model`` may be omitted or sent as ``null``. Fields
    that had no default are given an explicit ``None`` so that their
    optionality does not depend on the validation library's implicit
    handling of bare ``Optional[...]`` annotations.
    """

    model: str = "fastertransformer"
    prompt: Optional[str] = None
    suffix: Optional[str] = None
    max_tokens: Optional[int] = 16
    temperature: Optional[float] = 0.6
    top_p: Optional[float] = 1.0
    n: Optional[int] = 1
    stream: Optional[bool] = None
    logprobs: Optional[int] = None
    echo: Optional[bool] = None
    # A single stop string or a list of stop strings.
    stop: Optional[Union[str, list]] = None
    presence_penalty: Optional[float] = 0
    frequency_penalty: Optional[float] = 1
    best_of: Optional[int] = 1
    logit_bias: Optional[dict] = None
    user: Optional[str] = None
|
23 |
+
|
requirements.txt
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
fastapi==0.82.0
|
2 |
+
numpy==1.23.2
|
3 |
+
sse-starlette==1.1.6
|
4 |
+
tokenizers==0.12.1
|
5 |
+
tritonclient[all]==2.25.0
|
6 |
+
uvicorn==0.18.3
|
utils/__init__.py
ADDED
File without changes
|
utils/codegen.py
ADDED
@@ -0,0 +1,258 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import random
|
3 |
+
import string
|
4 |
+
import time
|
5 |
+
|
6 |
+
import numpy as np
|
7 |
+
import tritonclient.grpc as client_util
|
8 |
+
from tokenizers import Tokenizer
|
9 |
+
from tritonclient.utils import np_to_triton_dtype, InferenceServerException
|
10 |
+
|
11 |
+
# Query float32/float64 limits once at import time and discard the results.
# NOTE(review): presumably a warm-up/workaround for a numpy finfo issue —
# confirm the reason before removing.
for _float_name in ("float32", "float64"):
    np.finfo(np.dtype(_float_name))
|
13 |
+
|
14 |
+
|
15 |
+
class CodeGenProxy:
    """Bridge between OpenAI-style completion requests and a Triton inference server.

    Calling an instance with a request dict tokenizes the prompt, runs one
    inference call, decodes the result and returns either a JSON string or a
    generator of server-sent-event lines, depending on ``data['stream']``.
    """

    def __init__(self, host: str = 'triton', port: int = 8001, verbose: bool = False):
        """Load the tokenizer from disk and create a gRPC client for ``host:port``."""
        self.tokenizer = Tokenizer.from_file('/python-docker/cgtok/tokenizer.json')
        self.client = client_util.InferenceServerClient(url=f'{host}:{port}', verbose=verbose)
        # Token id sent as both ``start_id`` and ``end_id`` in generate().
        self.PAD_CHAR = 50256

        # Max number of tokens the model can handle
        self.MAX_MODEL_LEN = 2048

    class TokensExceedsMaximum(Exception):
        """Prompt tokens plus requested completion tokens exceed ``MAX_MODEL_LEN``."""

    @staticmethod
    def _option(data: dict, key: str, default):
        """Return ``data[key]``, or ``default`` when the key is absent or explicitly ``None``."""
        value = data.get(key)
        return default if value is None else value

    @staticmethod
    def _empty_word_list(batch: int):
        """Return the placeholder word list: int32, shape ``(batch, 2, 1)``, rows ``[0]`` and ``[-1]``."""
        ids = np.zeros([batch, 1, 1], dtype=np.int32)
        offsets = np.full([batch, 1, 1], -1, dtype=np.int32)
        return np.concatenate([ids, offsets], axis=1)

    @staticmethod
    def prepare_tensor(name: str, tensor_input):
        """Wrap a numpy array in a Triton ``InferInput`` named ``name``."""
        t = client_util.InferInput(
            name, tensor_input.shape, np_to_triton_dtype(tensor_input.dtype))
        t.set_data_from_numpy(tensor_input)
        return t

    @staticmethod
    def trim_with_stopwords(output: str, stopwords: list) -> str:
        """Strip one trailing stop word from ``output``.

        The longest matching stop word wins. A bare string is treated as a
        single stop word rather than as a sequence of characters, and empty
        stop words are ignored (``output[:-0]`` would erase the whole text).
        """
        if isinstance(stopwords, str):
            stopwords = [stopwords]
        for w in sorted(stopwords or (), key=len, reverse=True):
            if w and output.endswith(w):
                return output[:-len(w)]
        return output

    @staticmethod
    def to_word_list_format(word_dict, tokenizer):
        """Encode lists of words into a padded ``(len(word_dict), 2, pad_to)`` int32 array.

        For every item of ``word_dict`` (an iterable of words) row 0 holds the
        concatenated token ids of its words, padded with 0, and row 1 holds
        the cumulative end offsets of the words, padded with -1.
        """
        flat_ids = []
        offsets = []
        for word_dict_item in word_dict:
            item_flat_ids = []
            item_offsets = []

            for word in word_dict_item:
                ids = tokenizer.encode(word).ids

                # Words that produce no tokens contribute nothing.
                if len(ids) == 0:
                    continue

                item_flat_ids += ids
                item_offsets.append(len(ids))

                # Hack, can we do this better?
                # A double newline is additionally registered as the token
                # pair [198, 198].
                if word == '\n\n':
                    item_flat_ids += [198, 198]
                    item_offsets.append(2)

            flat_ids.append(np.array(item_flat_ids))
            offsets.append(np.cumsum(np.array(item_offsets)))

        # Pad every row to the longest id sequence (at least one column).
        pad_to = max(1, max(len(ids) for ids in flat_ids))

        for i, (ids, offs) in enumerate(zip(flat_ids, offsets)):
            flat_ids[i] = np.pad(ids, (0, pad_to - len(ids)), constant_values=0)
            offsets[i] = np.pad(offs, (0, pad_to - len(offs)), constant_values=-1)

        return np.array([flat_ids, offsets], dtype="int32").transpose((1, 0, 2))

    def generate(self, data):
        """Run one inference call for the request ``data`` and build the response parts.

        Reads ``prompt`` and ``model`` (required) plus the optional keys ``n``,
        ``max_tokens``, ``logprobs``, ``temperature``, ``top_k``, ``top_p``,
        ``frequency_penalty`` and ``stop``. Optional keys that are present but
        ``None`` fall back to their defaults (``logprobs`` keeps its own rule:
        absent disables log-probs, ``None`` requests one).

        Returns:
            ``(completion, choices)``: the response skeleton, whose ``id`` and
            ``choices`` entries are left as ``None`` for the caller to fill
            in, and the list of per-sample choice dicts.

        Raises:
            TokensExceedsMaximum: prompt plus requested tokens exceed ``MAX_MODEL_LEN``.
            RuntimeError: the server returned no ``output_ids``.
        """
        prompt = data['prompt']
        n = self._option(data, 'n', 1)
        model_name = data["model"]
        # ugly hack to set the data type correctly. Huggingface models want int32, but fastertransformer needs uint32
        # i could've done the conversion from uint32 to int32 in the model but that'd be inefficient.
        np_type = np.int32 if model_name.startswith("py-") else np.uint32

        # One row of prompt token ids, repeated once per requested sample.
        input_start_ids = np.expand_dims(self.tokenizer.encode(prompt).ids, 0)
        input_start_ids = np.repeat(input_start_ids, n, axis=0).astype(np_type)
        batch, prompt_len = input_start_ids.shape

        def column(value, dtype):
            # (batch, 1) tensor holding the same value for every sample.
            return np.full((batch, 1), value, dtype=dtype)

        input_len = column(prompt_len, np_type)
        max_tokens = self._option(data, 'max_tokens', 16)
        requested_tokens = max_tokens + prompt_len
        if requested_tokens > self.MAX_MODEL_LEN:
            raise self.TokensExceedsMaximum(
                f"This model's maximum context length is {self.MAX_MODEL_LEN}, however you requested "
                f"{requested_tokens} tokens ({prompt_len} in your prompt; {max_tokens} for the completion). "
                f"Please reduce your prompt; or completion length."
            )
        output_len = column(max_tokens, np_type)

        # Absent key -> -1 (no log-probs); explicit None -> 1.
        num_logprobs = data.get('logprobs', -1)
        if num_logprobs is None:
            num_logprobs = 1
        want_logprobs = num_logprobs > 0

        temperature = self._option(data, 'temperature', 0.2)
        if temperature == 0.0:
            # A zero temperature is sent as temperature 1.0 with top_k = 1.
            temperature = 1.0
            top_k = 1
        else:
            top_k = self._option(data, 'top_k', 0)

        top_p = self._option(data, 'top_p', 1.0)
        frequency_penalty = self._option(data, 'frequency_penalty', 1.0)

        runtime_top_k = column(top_k, np_type)
        runtime_top_p = column(top_p, np.float32)
        beam_search_diversity_rate = column(0.0, np.float32)
        random_seed = np.random.randint(0, 2 ** 31 - 1, (batch, 1), dtype=np.int32)
        temperature = column(temperature, np.float32)
        len_penalty = column(1.0, np.float32)
        repetition_penalty = column(frequency_penalty, np.float32)
        is_return_log_probs = column(want_logprobs, np.bool_)
        beam_width = column(1, np_type)
        start_ids = column(self.PAD_CHAR, np_type)
        end_ids = column(self.PAD_CHAR, np_type)

        # ``stop`` may be a single string or a list; normalise to a list so a
        # string is not iterated character by character further down.
        stop_words = data.get('stop') or []
        if isinstance(stop_words, str):
            stop_words = [stop_words]
        if stop_words:
            stop_word_list = np.repeat(self.to_word_list_format([stop_words], self.tokenizer), batch, axis=0)
        else:
            stop_word_list = self._empty_word_list(batch)

        # Not used
        bad_words_list = self._empty_word_list(batch)

        inputs = [
            self.prepare_tensor("input_ids", input_start_ids),
            self.prepare_tensor("input_lengths", input_len),
            self.prepare_tensor("request_output_len", output_len),
            self.prepare_tensor("runtime_top_k", runtime_top_k),
            self.prepare_tensor("runtime_top_p", runtime_top_p),
            self.prepare_tensor("beam_search_diversity_rate", beam_search_diversity_rate),
            self.prepare_tensor("random_seed", random_seed),
            self.prepare_tensor("temperature", temperature),
            self.prepare_tensor("len_penalty", len_penalty),
            self.prepare_tensor("repetition_penalty", repetition_penalty),
            self.prepare_tensor("is_return_log_probs", is_return_log_probs),
            self.prepare_tensor("beam_width", beam_width),
            self.prepare_tensor("start_id", start_ids),
            self.prepare_tensor("end_id", end_ids),
            self.prepare_tensor("bad_words_list", bad_words_list),
            self.prepare_tensor("stop_words_list", stop_word_list),
        ]

        result = self.client.infer(model_name, inputs)

        output_data = result.as_numpy("output_ids")
        if output_data is None:
            raise RuntimeError("No output data")

        # All of these squeeze(1)s are to remove the beam width dimension.
        output_data = output_data.squeeze(1)
        if want_logprobs:
            lp_data = result.as_numpy("output_log_probs").squeeze(1)
            # clp_data = result.as_numpy("cum_log_probs").squeeze(1)
        else:
            lp_data = [None] * output_data.shape[0]
        sequence_lengths = result.as_numpy("sequence_length").squeeze(1)
        # Number of generated tokens per sample.
        gen_len = sequence_lengths - input_len.squeeze(1)

        # Decode only the generated part of each sequence.
        decoded = self.tokenizer.decode_batch([out[prompt_len:prompt_len + g] for g, out in zip(gen_len, output_data)])
        trimmed = [self.trim_with_stopwords(d, stop_words) for d in decoded]

        choices = []
        for i, (text, tokens, lps, g) in enumerate(zip(trimmed, output_data, lp_data, gen_len)):
            reason = "length" if max_tokens == g else "stop"
            if lps is not None:
                tokens_str = [self.tokenizer.decode([t]) for t in tokens[prompt_len:prompt_len + g]]
                # Character offset of each token, counted from the start of the prompt.
                offsets = [len(prompt)] + (np.cumsum([len(t) for t in tokens_str]) + len(prompt)).tolist()[:-1]

                # Fake some log probs for top_logprobs
                top_logprobs = []
                for ii, t in enumerate(tokens_str):
                    fakedict = {}
                    top_token_lp = float(lps[ii])
                    fakedict[t] = top_token_lp
                    # Fill up with random tokens scored below the real one.
                    while len(fakedict) < num_logprobs:
                        random_token = random.randint(0, self.tokenizer.get_vocab_size() - 1)
                        random_token_str = self.tokenizer.decode([random_token])
                        if random_token_str in fakedict:
                            continue
                        random_token_lp = top_token_lp - random.random()
                        fakedict[random_token_str] = random_token_lp
                    top_logprobs.append(fakedict)

                lpdict = {
                    'token_logprobs': lps.tolist(),
                    'top_logprobs': top_logprobs,
                    'tokens': tokens_str,
                    'text_offset': offsets,
                }
            else:
                lpdict = None

            choice = {
                'text': text,
                'index': i,
                'finish_reason': reason,
                'logprobs': lpdict,
            }
            choices.append(choice)

        completion = {
            'id': None,  # fill in
            'model': 'codegen',
            'object': 'text_completion',
            'created': int(time.time()),
            'choices': None,  # fill in
            'usage': {
                'completion_tokens': int(gen_len.sum()),
                'prompt_tokens': int(prompt_len),
                'total_tokens': int(gen_len.sum() + prompt_len),
            }
        }
        return completion, choices

    @staticmethod
    def random_completion_id():
        """Return ``'cmpl-'`` followed by 29 random ASCII letters/digits."""
        return 'cmpl-' + ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(29))

    def streamed_response(self, completion, choices):
        """Yield one ``data:`` line per choice, then the ``[DONE]`` terminator.

        ``completion`` is mutated in place: each chunk receives a fresh id and
        a single-element ``choices`` list.
        """
        for c in choices:
            completion['id'] = self.random_completion_id()
            completion['choices'] = [c]
            yield f'data: {json.dumps(completion)}\n\n'
        yield 'data: [DONE]\n\n'

    def non_streamed_response(self, completion, choices) -> str:
        """Return ``completion`` as a JSON string after filling in its id and all choices."""
        completion['id'] = self.random_completion_id()
        completion['choices'] = choices
        return json.dumps(completion)

    def __call__(self, data: dict):
        """Generate a completion for ``data`` and format it for the caller.

        An ``InferenceServerException`` is printed and answered with an empty
        completion (best effort); ``TokensExceedsMaximum`` propagates to the
        caller. Returns a generator of event lines when ``data['stream']`` is
        truthy, otherwise a JSON string.
        """
        st = time.time()
        try:
            completion, choices = self.generate(data)
        except InferenceServerException as E:
            print(E)
            completion = {}
            choices = []
        ed = time.time()
        print(f"Returned completion in {(ed - st) * 1000} ms")
        if data.get('stream', False):
            return self.streamed_response(completion, choices)
        else:
            return self.non_streamed_response(completion, choices)
|
utils/errors.py
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import *
|
2 |
+
|
3 |
+
class FauxPilotException(Exception):
    """Application error carrying an OpenAI-style error description.

    ``message`` is also passed to ``Exception`` so ``str(exc)`` returns it;
    ``type``, ``param`` and ``code`` are stored unchanged.
    """

    def __init__(self, message: str, type: Optional[str] = None, param: Optional[str] = None, code: Optional[int] = None):
        super().__init__(message)
        self.message, self.type, self.param, self.code = message, type, param, code

    def json(self):
        """Return the error as ``{'error': {message, type, param, code}}``."""
        field_names = ('message', 'type', 'param', 'code')
        return {'error': {name: getattr(self, name) for name in field_names}}
|