from .localizer import build_localizer
from .llm import LLM
from .utils import image_w_box
import numpy as np

class LangGround:

    def __init__(self, loc_model="owl", llm_model="Qwen/Qwen2.5-7B-Instruct"):

        self.loc = build_localizer(loc_model)
        self.llm = LLM(llm_model)

    def localize(self, frame, question, **kwargs):

        frame = np.array(frame)
        objxbox = self.loc.localize(frame, kwargs.get("threshold", 0.5))
        locobjs = self.llm.answer(question, objxbox.keys())
        locobjxbox = {k: v for k, v in objxbox.items() if k in locobjs}
        all_box_image = image_w_box(frame, objxbox)
        llm_box_image = image_w_box(frame, locobjxbox)
        texts = [(text, str(idx)) for idx, text in enumerate(locobjs)]
        return texts, all_box_image, llm_box_image