from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from llama_cpp import Llama
from typing import Optional
import uvicorn
import huggingface_hub

app = FastAPI(
    title="OmniVLM API",
    description="API for text generation using OmniVLM model",
    version="1.0.0"
)

# Download the model from Hugging Face Hub
model_path = huggingface_hub.hf_hub_download(
    repo_id="NexaAIDev/OmniVLM-968M",
    filename="omnivision-text-optimized-llm-Q8_0.gguf"
)

# Initialize the model with the downloaded file
llm = Llama(
    model_path=model_path,
    n_ctx=2048,   # Context window
    n_threads=4   # Number of CPU threads to use
)

class GenerationRequest(BaseModel):
    prompt: str
    max_tokens: Optional[int] = 100
    temperature: Optional[float] = 0.7
    top_p: Optional[float] = 0.9

class GenerationResponse(BaseModel):
    generated_text: str

@app.post("/generate", response_model=GenerationResponse)
async def generate_text(request: GenerationRequest):
    try:
        output = llm(
            request.prompt,
            max_tokens=request.max_tokens,
            temperature=request.temperature,
            top_p=request.top_p
        )
        return GenerationResponse(generated_text=output["choices"][0]["text"])
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

@app.get("/health")
async def health_check():
    return {"status": "healthy"}

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
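
# --- Example usage (sketch) ---
# Assuming the server is running locally on the port configured above, the
# /generate endpoint can be exercised with curl (the prompt text here is an
# arbitrary placeholder):
#
#   curl -X POST http://localhost:8000/generate \
#     -H "Content-Type: application/json" \
#     -d '{"prompt": "Describe llamas in one sentence.", "max_tokens": 50}'
#
# or from Python with the `requests` library:
#
#   import requests
#   resp = requests.post(
#       "http://localhost:8000/generate",
#       json={"prompt": "Describe llamas in one sentence.", "max_tokens": 50},
#   )
#   print(resp.json()["generated_text"])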