File size: 4,171 Bytes
0dc25c1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 |
""" A model worker executes the model. """ import argparse import json import uuid from fastapi import FastAPI, Request from fastapi.responses import StreamingResponse from transformers import AutoModel, AutoTokenizer import torch import uvicorn import bitsandbytes as bnb from transformers import BitsAndBytesConfig from transformers.generation.streamers import BaseStreamer from threading import Thread from queue import Queue class TokenStreamer(BaseStreamer): def __init__(self, skip_prompt: bool = False, timeout=None): self.skip_prompt = skip_prompt # variables used in the streaming process self.token_queue = Queue() self.stop_signal = None self.next_tokens_are_prompt = True self.timeout = timeout def put(self, value): if len(value.shape) > 1 and value.shape[0] > 1: raise ValueError("TextStreamer only supports batch size 1") elif len(value.shape) > 1: value = value[0] if self.skip_prompt and self.next_tokens_are_prompt: self.next_tokens_are_prompt = False return for token in value.tolist(): self.token_queue.put(token) def end(self): self.token_queue.put(self.stop_signal) def __iter__(self): return self def __next__(self): value = self.token_queue.get(timeout=self.timeout) if value == self.stop_signal: raise StopIteration() else: return value class ModelWorker: def __init__(self, model_path, device='cuda'): self.device = device # Configure 4-bit quantization quantization_config = BitsAndBytesConfig( load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16, bnb_4bit_quant_type="nf4", bnb_4bit_use_double_quant=True ) self.glm_model = AutoModel.from_pretrained( model_path, trust_remote_code=True, device_map=device, # Use device_map instead of device quantization_config=quantization_config ).eval() # Remove .to(device) call self.glm_tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) @torch.inference_mode() def generate_stream(self, params): tokenizer, model = self.glm_tokenizer, self.glm_model prompt = params["prompt"] temperature = float(params.get("temperature", 1.0)) top_p = float(params.get("top_p", 1.0)) max_new_tokens = int(params.get("max_new_tokens", 256)) inputs = tokenizer([prompt], return_tensors="pt") inputs = inputs.to(self.device) streamer = TokenStreamer(skip_prompt=True) thread = Thread(target=model.generate, kwargs=dict(**inputs, max_new_tokens=int(max_new_tokens), temperature=float(temperature), top_p=float(top_p), streamer=streamer)) thread.start() for token_id in streamer: yield (json.dumps({"token_id": token_id, "error_code": 0}) + "\n").encode() def generate_stream_gate(self, params): try: for x in self.generate_stream(params): yield x except Exception as e: print("Caught Unknown Error", e) ret = { "text": "Server Error", "error_code": 1, } yield (json.dumps(ret)+ "\n").encode() app = FastAPI() @app.post("/generate_stream") async def generate_stream(request: Request): params = await request.json() generator = worker.generate_stream_gate(params) return StreamingResponse(generator) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--host", type=str, default="localhost") parser.add_argument("--port", type=int, default=10000) parser.add_argument("--model-path", type=str, default="THUDM/glm-4-voice-9b") args = parser.parse_args() worker = ModelWorker(args.model_path) uvicorn.run(app, host=args.host, port=args.port, log_level="info") |