from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from llama_cpp import Llama
from typing import Optional
import uvicorn

app = FastAPI(
    title="OmniVLM API",
    description="API for text generation using the OmniVLM model",
    version="1.0.0"
)

# Download and initialize the model from the Hugging Face Hub
llm = Llama.from_pretrained(
    repo_id="NexaAIDev/OmniVLM-968M",
    filename="omnivision-text-optimized-llm-Q8_0.gguf",
)

# Request schema: the prompt is required; sampling parameters fall back to defaults
class GenerationRequest(BaseModel):
    prompt: str
    max_tokens: Optional[int] = 100
    temperature: Optional[float] = 0.7
    top_p: Optional[float] = 0.9

# Response schema: the model's completion text
class GenerationResponse(BaseModel):
    generated_text: str

@app.post("/generate", response_model=GenerationResponse)
async def generate_text(request: GenerationRequest):
    try:
        # Run the model with the requested sampling parameters
        output = llm(
            request.prompt,
            max_tokens=request.max_tokens,
            temperature=request.temperature,
            top_p=request.top_p
        )
        # llama-cpp-python returns an OpenAI-style completion dict
        return GenerationResponse(generated_text=output["choices"][0]["text"])
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

# Simple liveness probe
@app.get("/health")
async def health_check():
    return {"status": "healthy"}

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
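
# A minimal client sketch (an illustrative assumption, not part of the server
# above): it assumes the server is running locally on port 8000 as configured
# in the __main__ block, and that the `requests` package is installed.
#
#   import requests
#
#   resp = requests.post(
#       "http://localhost:8000/generate",
#       json={"prompt": "Describe a sunset.", "max_tokens": 50},
#   )
#   resp.raise_for_status()
#   print(resp.json()["generated_text"])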