import json
from collections import defaultdict

import numpy as np
from shapely.wkt import loads
from tqdm import tqdm

from infer_utils import create_mask


def clean_string(s):
    """Normalize a class string: spaces -> hyphens, drop periods, lowercase."""
    return s.replace(' ', '-').replace('.', '').lower()


def get_class_dict(dataset):
    """Return, per task prompt, the mapping from cleaned class name to mask label."""
    if dataset == "qfabric":
        class_dict = {
            "temporal_region_based_question_answering: What is the development status in this region [bbox] in image N?": {
                "prior-construction": 1,
                "greenland": 2,
                "land-cleared": 3,
                "excavation": 4,
                "materials-dumped": 5,
                "construction-started": 6,
                "construction-midway": 7,
                "construction-done": 8,
                "operational": 9,
            },
            "region_based_question_answering: Identify the type of urban development that has occurred in this area [bbox].": {
                "residential": 10,
                "commercial": 11,
                "industrial": 12,
                "road": 13,
                "demolition": 14,
                "mega-projects": 15,
            },
        }
    elif dataset == "xbd":
        class_dict = {
            "classification: Classify the level of damage experienced by the building at location [bbox] in the second image. Choose from: No damage, Minor Damage, Major Damage, Destroyed.": {
                "no-damage": 1,
                "minor-damage": 2,
                "major-damage": 3,
                "destroyed": 4,
            }
        }
    else:
        raise ValueError(f"Dataset {dataset} should not be evaluated on segmentation classification.")
    return class_dict


def classification_segmentation(answer_path, dataset, per_class_f1=False, height=256, width=256):
    """
    Rasterize each example's original polygon into predicted and ground-truth class masks,
    then score them per task.

    Returns a dict mapping each task prompt to (per-class F1 scores, weighted F1), where the
    weighted F1 averages per-class pixel F1 with inverse-prevalence (pixel-count) weights.
    For qfabric the micro-averaged F1 over the task's classes is printed instead.
    """
    with open(answer_path) as f:
        results = json.load(f)

    classes = get_class_dict(dataset)
    class_stats = defaultdict(lambda: {'tp': 0, 'fp': 0, 'fn': 0, 'count': 0})

    # Accumulate per-pixel TP/FP/FN and ground-truth pixel counts for every class label.
    for result in tqdm(results.values()):
        if result['task'] not in classes:
            continue
        class_dict = classes[result['task']]
        predicted_class = clean_string(result['predicted'])
        try:
            ground_truth_class = clean_string(result["ground_truth"])
        except KeyError:
            ground_truth_class = clean_string(result["original_answer"])

        # Rasterize the original polygon, then paint it with the predicted / ground-truth labels.
        original_polygon = loads(result['original_input_polygon'])
        pred_msk = np.zeros((height, width), dtype='uint8')
        gt_msk = np.zeros((height, width), dtype='uint8')
        _msk = create_mask(original_polygon, im_size=(height, width))

        if predicted_class not in class_dict or ground_truth_class not in class_dict:
            continue
        pred_label = class_dict[predicted_class]
        gt_label = class_dict[ground_truth_class]
        pred_msk[_msk > 0] = pred_label
        gt_msk[_msk > 0] = gt_label

        for label in class_dict.values():
            pred_mask = (pred_msk == label)
            gt_mask = (gt_msk == label)
            class_stats[label]['tp'] += np.sum(pred_mask & gt_mask)
            class_stats[label]['fp'] += np.sum(pred_mask & ~gt_mask)
            class_stats[label]['fn'] += np.sum(~pred_mask & gt_mask)
            class_stats[label]['count'] += np.sum(gt_mask)

    scores_dict = {}
    for task, class_info in classes.items():
        print(f"Task: {task}")
        class_f1_scores = {}
        weighted_f1_score = 0
        total_weight = 0
        tp = 0
        fp = 0
        fn = 0
        # Total ground-truth pixel count over this task's classes (for inverse-prevalence weights).
        total_samples = sum(s['count'] for label, s in class_stats.items() if label in class_info.values())

        for class_name, class_label in class_info.items():
            stats = class_stats[class_label]
            if stats['tp'] + stats['fp'] == 0 or stats['tp'] + stats['fn'] == 0:
                f1 = 0.0
            else:
                precision = stats['tp'] / (stats['tp'] + stats['fp'])
                recall = stats['tp'] / (stats['tp'] + stats['fn'])
                if precision + recall == 0:
                    f1 = 0.0
                else:
                    f1 = 2 * (precision * recall) / (precision + recall)
            class_f1_scores[class_name] = f1

            if stats['count'] > 0:
                prevalence_inv = total_samples / stats['count']
                weighted_f1_score += f1 * prevalence_inv
                total_weight += prevalence_inv
            tp += stats['tp']
            fp += stats['fp']
            fn += stats['fn']

        # Micro F1 over all classes of the task: F1 = TP / (TP + 0.5 * (FP + FN)).
        if tp + fp == 0 or tp + fn == 0:
            micro_f1 = 0.0
        else:
            micro_f1 = tp / (tp + 0.5 * (fp + fn))
        if total_weight > 0:
            weighted_f1_score /= total_weight
        else:
            weighted_f1_score = 0.0

        scores_dict[task] = (class_f1_scores, weighted_f1_score)
        print(f"Per-class F1 scores: {class_f1_scores}")
        if dataset == 'qfabric':
            print(f"Micro average F1 score: {micro_f1}")
        else:
            print(f"Weighted average F1 score: {weighted_f1_score}")
    return scores_dict
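

# Minimal usage sketch (the answer-file path below is hypothetical). Each entry in the
# answer JSON is assumed to carry the keys read above: 'task', 'predicted',
# 'ground_truth' (or 'original_answer'), and 'original_input_polygon' as a WKT string.
if __name__ == "__main__":
    scores = classification_segmentation("answers/xbd_answers.json", dataset="xbd")
    for task, (per_class, weighted) in scores.items():
        print(task, per_class, weighted)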