import torch
import supervision as sv
import cv2
import numpy as np
import os
from segment_anything import SamPredictor, sam_model_registry
from diffusers import StableDiffusionInpaintPipeline
from torchvision.ops import box_convert
from typing import List


class SelfSupervised:
    def __init__(self):
        from groundingdino.util.inference import load_model

        # ----- Device selection: use CUDA if available
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # ----- SAM parameters
        self.model_type = "vit_h"
        self.predictor = SamPredictor(
            sam_model_registry[self.model_type](
                checkpoint="./weights/sam_vit_h_4b8939.pth"
            ).to(device=self.device)
        )

        # ----- Stable Diffusion inpainting pipeline
        self.pipe = StableDiffusionInpaintPipeline.from_pretrained(
            "stabilityai/stable-diffusion-2-inpainting",
            torch_dtype=torch.float16,
        ).to(self.device)

        # ----- Grounding DINO
        self.groundingdino_model = load_model(
            "fengxai/config/GroundingDINO_SwinT_OGC.py",
            "weights/groundingdino_swint_ogc.pth",
        )

    def checkAnnotate(self, image_source: np.ndarray, boxes: torch.Tensor,
                      logits: torch.Tensor, phrases: List[str]):
        # Draw the predicted boxes on the original image.
        h, w, _ = image_source.shape
        # Boxes are returned normalized in (cx, cy, w, h); scale to pixel coordinates.
        boxes = boxes * torch.Tensor([w, h, w, h])
        # Reference: https://pytorch.org/vision/main/generated/torchvision.ops.box_convert.html
        # xyxy: (x1, y1) is the top-left corner, (x2, y2) the bottom-right corner.
        # cxcywh: (cx, cy) is the box center, (w, h) its width and height.
        xyxy = box_convert(boxes=boxes, in_fmt="cxcywh", out_fmt="xyxy").numpy()
        detections = sv.Detections(xyxy=xyxy)

        # Width/height of the first detected box, shown in every label.
        boxesHeight = int(xyxy[0][3] - xyxy[0][1])
        boxesWidth = int(xyxy[0][2] - xyxy[0][0])
        labels = [
            f"{phrase} {logit:.2f} w:{boxesWidth} h:{boxesHeight}"
            for phrase, logit in zip(phrases, logits)
        ]

        box_annotator = sv.BoxAnnotator()
        # load_image returns RGB; convert to BGR for OpenCV drawing and saving.
        annotated_frame = cv2.cvtColor(image_source, cv2.COLOR_RGB2BGR)
        annotated_frame = box_annotator.annotate(
            scene=annotated_frame, detections=detections, labels=labels
        )
        return annotated_frame, xyxy

    # Run Grounding DINO detection on a single image and save the annotated result.
    def imagePredict(self, imageFile, item="clothing", boxThreshold=0.3, textThreshold=0.25):
        from groundingdino.util.inference import load_image, predict

        src, img = load_image(imageFile)
        h, w, _ = src.shape
        boxes, logits, phrases = predict(
            model=self.groundingdino_model,
            image=img,
            caption=item,
            box_threshold=boxThreshold,
            text_threshold=textThreshold,
        )

        # Inspect the annotation results.
        imgAnnotated, xyxy = self.checkAnnotate(
            image_source=src, boxes=boxes, logits=logits, phrases=phrases
        )
        # checkAnnotate already returns a BGR image, which is what cv2.imwrite expects,
        # so no further channel flip is needed here.
        boxesHeight = int(xyxy[0][3] - xyxy[0][1])
        boxesWidth = int(xyxy[0][2] - xyxy[0][0])

        imageOutputFile = "data/annotated_image.jpg"
        outputDir = imageOutputFile.split("/")[0]
        if not os.path.exists(outputDir):
            print("outputDir=", outputDir)
            os.mkdir(outputDir)
        cv2.imwrite(imageOutputFile, imgAnnotated)

        # Debug output: list the working directory and the output directory contents.
        print("os cwd=", os.getcwd())
        for root, dirs, files in os.walk(os.getcwd()):
            print("root=", root)
            print("files=", files)
        print("data=")
        for root, dirs, files in os.walk("data/"):
            print("root=", root)
            print("files=", files)

        return {
            "imageOutput": imageOutputFile,
            "imageHeight": h,
            "imageWidth": w,
            "objectHeight": boxesHeight,
            "objectWidth": boxesWidth,
        }
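

# ---------------------------------------------------------------------------
# Minimal usage sketch, not part of the original class. It assumes the weight
# files and config path referenced in __init__ exist, and that a test image is
# available at the hypothetical path "data/sample.jpg".
if __name__ == "__main__":
    detector = SelfSupervised()
    result = detector.imagePredict(
        imageFile="data/sample.jpg",   # hypothetical input image
        item="clothing",               # text prompt passed to Grounding DINO
        boxThreshold=0.3,
        textThreshold=0.25,
    )
    # result holds the path of the annotated image plus image/object dimensions.
    print(result)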