from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from llama_cpp import Llama
from typing import Optional
import uvicorn

app = FastAPI(
    title="OmniVLM API",
    description="API for text generation using the OmniVLM model",
    version="1.0.0"
)

# Download and initialize the model from the Hugging Face Hub
llm = Llama.from_pretrained(
    repo_id="NexaAIDev/OmniVLM-968M",
    filename="omnivision-text-optimized-llm-Q8_0.gguf",
)

# Request schema: the prompt is required; sampling parameters fall back to defaults
class GenerationRequest(BaseModel):
    prompt: str
    max_tokens: Optional[int] = 100
    temperature: Optional[float] = 0.7
    top_p: Optional[float] = 0.9

# Response schema: the model's completion text
class GenerationResponse(BaseModel):
    generated_text: str

@app.post("/generate", response_model=GenerationResponse)
async def generate_text(request: GenerationRequest):
    try:
        # Run the model with the requested sampling parameters
        output = llm(
            request.prompt,
            max_tokens=request.max_tokens,
            temperature=request.temperature,
            top_p=request.top_p
        )
        # llama-cpp-python returns an OpenAI-style completion dict
        return GenerationResponse(generated_text=output["choices"][0]["text"])
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

# Simple liveness probe
@app.get("/health")
async def health_check():
    return {"status": "healthy"}

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
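
# A minimal client sketch (an illustrative assumption, not part of the server
# above): it assumes the server is running locally on port 8000 as configured
# in the __main__ block, and that the `requests` package is installed.
#
#   import requests
#
#   resp = requests.post(
#       "http://localhost:8000/generate",
#       json={"prompt": "Describe a sunset.", "max_tokens": 50},
#   )
#   resp.raise_for_status()
#   print(resp.json()["generated_text"])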