from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from llama_cpp import Llama
from typing import Optional
import uvicorn
import huggingface_hub

app = FastAPI(
    title="OmniVLM API",
    description="API for text generation using OmniVLM model",
    version="1.0.0"
)

# Download the model from Hugging Face Hub
model_path = huggingface_hub.hf_hub_download(
    repo_id="NexaAIDev/OmniVLM-968M",
    filename="omnivision-text-optimized-llm-Q8_0.gguf"
)

# Initialize the model with the downloaded file
llm = Llama(
    model_path=model_path,
    n_ctx=2048,   # Context window
    n_threads=4   # Number of CPU threads to use
)

class GenerationRequest(BaseModel):
    prompt: str
    max_tokens: Optional[int] = 100
    temperature: Optional[float] = 0.7
    top_p: Optional[float] = 0.9

class GenerationResponse(BaseModel):
    generated_text: str

@app.post("/generate", response_model=GenerationResponse)
async def generate_text(request: GenerationRequest):
    try:
        output = llm(
            request.prompt,
            max_tokens=request.max_tokens,
            temperature=request.temperature,
            top_p=request.top_p
        )
        return GenerationResponse(generated_text=output["choices"][0]["text"])
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

@app.get("/health")
async def health_check():
    return {"status": "healthy"}

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
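
# --- Example usage (sketch) ---
# Assuming the server is running locally on the port configured above, the
# /generate endpoint can be exercised with curl (the prompt text here is an
# arbitrary placeholder):
#
#   curl -X POST http://localhost:8000/generate \
#     -H "Content-Type: application/json" \
#     -d '{"prompt": "Describe llamas in one sentence.", "max_tokens": 50}'
#
# or from Python with the `requests` library:
#
#   import requests
#   resp = requests.post(
#       "http://localhost:8000/generate",
#       json={"prompt": "Describe llamas in one sentence.", "max_tokens": 50},
#   )
#   print(resp.json()["generated_text"])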