from typing import Union

from fastapi import FastAPI, HTTPException
from llama_cpp import Llama
from pydantic import BaseModel


class InferenceRequest(BaseModel):
    input: Union[str, None] = None
    max_tokens: Union[int, None] = None


app = FastAPI()

# Load the model once at startup; n_ctx sets the context window size.
llm = Llama(
    model_path="./models/mistral-7b-openorca.Q4_K_S.gguf",
    verbose=False,
    n_ctx=8192,
)


@app.get("/")
async def root():
    return {"message": "Hello World"}


@app.post("/inference")
async def inference(request: InferenceRequest):
    # Reject requests with no prompt instead of passing None to the model.
    if not request.input:
        raise HTTPException(status_code=400, detail="Field 'input' is required.")

    # Fall back to 256 tokens when the client omits max_tokens or sends 0.
    max_tokens = request.max_tokens or 256

    # Run inference; surface failures as a 500 rather than silently
    # swallowing them and returning an empty response.
    try:
        result = llm(
            request.input,
            temperature=0.2,
            top_k=5,
            max_tokens=max_tokens,
            stop=["<|im_end|>"],
        )
    except Exception as exc:
        raise HTTPException(status_code=500, detail=f"Inference failed: {exc}")

    return result
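
# A quick way to exercise the endpoint, sketched here as an example only.
# It assumes this file is saved as main.py and served with
# `uvicorn main:app` on the default port 8000; the module name and port
# are assumptions, not part of the code above:
#
#   curl -X POST http://localhost:8000/inference \
#        -H "Content-Type: application/json" \
#        -d '{"input": "What is the capital of France?", "max_tokens": 64}'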