from collections import OrderedDict
import copy
import json
import re
from dataclasses import dataclass
from typing import Dict, Sequence

import torch
import transformers
from torch.utils.data import Dataset

from llava.constants import IGNORE_INDEX
from llava.util.tokenization import (
    preprocess_llama_2_obj_identifier,
    preprocess_multimodal,
)
from llava import conversation as conversation_lib


class ObjIdentifierDataset(Dataset):
    """Dataset for supervised fine-tuning."""

    def __init__(
        self,
        tokenizer: transformers.PreTrainedTokenizer,
        data_path: str | list,
        scene_to_obj_mapping: str,
        obj_context_feature_type: str = "text",
        mode: str = "train",
        **kwargs,
    ):
        super(ObjIdentifierDataset, self).__init__()

        self.tokenizer = tokenizer
        with open(scene_to_obj_mapping, "r") as f:
            self.scene_to_obj_mapping = json.load(f)
        self.update_data(data_path)
        self.obj_context_feature_type = obj_context_feature_type
        self.mode = mode

    def __len__(self):
        return len(self.list_data_dict)

    def update_data(self, data_path: str | list):
        assert self.scene_to_obj_mapping is not None, "scene_to_obj_mapping needs to be set first."
        if isinstance(data_path, str):
            with open(data_path, "r") as f:
                self.list_data_dict = json.load(f)
        elif isinstance(data_path, list):
            self.list_data_dict = data_path
        else:
            raise TypeError(f"data_path must be a path or a list, got {type(data_path)}")

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        sources = copy.deepcopy(self.list_data_dict[i])
        if isinstance(i, int):
            sources = [sources]
        assert len(sources) == 1, "expected exactly one sample after wrapping the indexed item in a list"  # FIXME

        ############
        scene_id = sources[0]["scene_id"]
        input_obj_dict = copy.deepcopy(self.scene_to_obj_mapping[scene_id])

        # prepare the object-centric features
        # we want the LLM to see:
        # "%%%% Object-centric context: <obj_0>: <obj_0_feat>, <obj_1>: <obj_1_feat>, ..."
        # where <obj_i_feat> will later be replaced by the actual feature in vector form,
        # everything else is pure text string.
        # 1. We need to first change the object_id to a new object_id, e.g., 'obj_0', 'obj_1', ...,
        #   and replace the old object_id with new object_id in the text conversation
        # 2. Tokenize the conversation, and add object context to the tokenized conversation
        # 3. Gather and return the necessary information for each object,
        #   so that it can later be embedded into vectors

        # 1. change the object_id to a new object_id
        # original object_id: 'wardrobe-0', 'three-seat/multi-seat sofa-1', ...
        # convert to obj_id: 'obj_0', 'obj_1', ...
        # and remember the mapping
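        # e.g. {'wardrobe-0': 'obj_0', 'three-seat/multi-seat sofa-1': 'obj_1', ...}
        # (illustrative mapping; actual keys follow the scene graph's insertion order)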
        old_id_to_new_id_mapping = {}
        result_obj_dict = OrderedDict()
        # first pass, map the old object_id to new object_id
        for old_id, obj_info in input_obj_dict.items():
            # make sure old_id doesn't contain < or >
            assert (
                "<" not in old_id and ">" not in old_id
            ), "object_id in scene graph should not contain < or >"
            new_id = f"obj_{len(old_id_to_new_id_mapping)}"
            old_id_to_new_id_mapping[old_id] = new_id
        # second pass, create the new object-centric context, modify the object_id in the text content
        for old_id, obj_info in input_obj_dict.items():
            new_id = old_id_to_new_id_mapping[old_id]
            # TODO: Determine what information to include in the object-centric context
            result_obj_info_dict = {}
            result_obj_info_dict["category"] = obj_info["category"]
            # result_obj_info_dict["category_id"] = obj_info["category_id"]
            # for relations, we need to replace the old object_id with new object_id
            # result_obj_info_dict["relations"] = []
            # for relation in obj_info["relations"]:
            #     for local_old_id, local_new_id in old_id_to_new_id_mapping.items():
            #         if local_old_id in relation:
            #             result_obj_info_dict["relations"].append(
            #                 re.sub(rf"<{local_old_id}>", f"<{local_new_id}>", relation)
            #             )
            if "description" in obj_info:
                result_obj_info_dict["description"] = obj_info["description"]
            else:
                # print(f"WARNING: Object {old_id} does not have a description.")
                pass

            # use two decimal places for the centroid and extent
            result_obj_info_dict["centroid"] = (
                f"[{obj_info['centroid'][0]:.2f}, {obj_info['centroid'][1]:.2f}, {obj_info['centroid'][2]:.2f}]"
            )
            result_obj_info_dict["extent"] = (
                f"[{obj_info['extent'][0]:.2f}, {obj_info['extent'][1]:.2f}, {obj_info['extent'][2]:.2f}]"
            )
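            # e.g. centroid "[0.38, -0.07, -0.02]" and extent "[1.14, 1.54, 0.81]" (illustrative values)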
            result_obj_dict[new_id] = result_obj_info_dict

        # replace the old object_id with new object_id in the text content
        # text conversation example:
        #     {
        #     "id": "55f2b905-d367-443d-8f88-ef71b958c81f@LivingRoom-3973@1",
        #     "scene_id": "55f2b905-d367-443d-8f88-ef71b958c81f@LivingRoom-3973",
        #     "conversations": [
        #         {
        #             "from": "human",
        #             "value": "Can you describe the ambiance of this room?"
        #         },
        #         {
        #             "from": "gpt",
        #             "value": "In this Living Room, the arrangement of furniture caters to both style and function. The <p>warm wooden hue wardrobe</p>[<wardrobe-0>] stands with a retro flair, while the <p>neutral grey, sleek rectangular form multi-seat sofa</p>[<three-seat/multi-seat sofa-1>] and <p>neutral grey, sleek rectangular form three-seat</p>[<three-seat/multi-seat sofa-8>] provide modern and comfortable seating options. The <p>sleek black, dark grey, brown, rectangular coffee table</p>[<coffee table-2>] and <p>sleek black, dark grey, brown coffee table</p>[<coffee table-3>] in minimalist style serve as focal points and functional pieces for gatherings. The <p>light grey and dark grey (two-tone), shell-like backrest, smooth armchair</p>[<armchair-4>] and <p>light grey and dark grey (two-tone), smooth armchair</p>[<armchair-5>] add additional seating, complemented by the <p>rich walnut brown top and contrasting light grey base side table</p>[<corner/side table-6>] for convenience. Suspended above, the <p>gradient of grey to bronze hues, cylindrical with abstract cityscape cutouts pendant lamp</p>[<pendant lamp-7>] offers a decorative element with its unique Chinoiserie design. The room's setup is ideal for hosting guests or enjoying quiet evenings, with thoughtful placement of each piece to enhance the living experience."
        #         }
        #     ]
        # },
        for conv in sources[0]["conversations"]:
            for old_id, new_id in old_id_to_new_id_mapping.items():
                # escape old_id so regex metacharacters in category names are matched literally
                conv["value"] = re.sub(rf"<{re.escape(old_id)}>", f"<{new_id}>", conv["value"])

        # if in generate mode, shave off the last conversation if it is from the assistant
        if self.mode == "generate" and sources[0]["conversations"][-1]["from"] == "gpt":
            sources[0]["conversations"] = sources[0]["conversations"][:-1]

        # 2. Tokenize the conversation, and add object context to the tokenized conversation
        sources = preprocess_multimodal(
            copy.deepcopy([e["conversations"] for e in sources]),
            is_multimodal=True,
            mm_use_im_start_end=False,
        )
        data_dict = preprocess_llama_2_obj_identifier(
            sources=sources,
            tokenizer=self.tokenizer,
            obj_dict=result_obj_dict,
            obj_context_feature_type=self.obj_context_feature_type,
            mode=self.mode,
        )
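        # data_dict is expected to hold at least "input_ids" and "labels";
        # these are the keys the training collator below pads and truncates.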

        # if in generate mode, add the obj context and bbox label to the data_dict
        # so that we can use them later to compute the metrics
        if self.mode == "generate":
            data_dict["obj_context"] = result_obj_dict
            if "bbox" in self.list_data_dict[i]:
                data_dict["bbox_label"] = self.list_data_dict[i]["bbox"]
            # full_info_dict is the full information of this data sample
            # {'id': 'scene0643_00$desk-0@0', 'scene_id': 'scene0643_00', 'conversations': [{...}],
            # 'referred_object_id': '0', 'referred_object_text': 'desk',
            # 'grounded_object_reference': 'a brown wooden office desk on the left to the gray shelf.',
            # 'bbox': [0.3769365990161897, -0.06906220592784873, -0.020513275327205656, 1.1370925301275254, 1.5355764355778696, 0.8130822017173767]
            # }
            data_dict["full_info_dict"] = self.list_data_dict[i]

        return data_dict


@dataclass
class DataCollatorForObjIdentifierDataset(object):
    """Collate examples for supervised fine-tuning."""

    def __init__(self, tokenizer: transformers.PreTrainedTokenizer, **kwargs):
        self.tokenizer = tokenizer

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        input_ids, labels = tuple(
            [instance[key] for instance in instances] for key in ("input_ids", "labels")
        )
        input_ids = torch.nn.utils.rnn.pad_sequence(
            input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id
        )
        labels = torch.nn.utils.rnn.pad_sequence(
            labels, batch_first=True, padding_value=IGNORE_INDEX
        )
        input_ids = input_ids[:, : self.tokenizer.model_max_length]
        labels = labels[:, : self.tokenizer.model_max_length]
        batch = dict(
            input_ids=input_ids,
            labels=labels,
            attention_mask=input_ids.ne(self.tokenizer.pad_token_id),
        )

        return batch


@dataclass
class DataCollatorForBatchDecodingObjIdentifierDataset(object):
    """Collate examples for batch decoding."""

    def __init__(self, tokenizer: transformers.PreTrainedTokenizer, **kwargs):
        self.tokenizer = tokenizer

    def pad_sequence(self, input_ids, batch_first, padding_value):
        # torch's pad_sequence always pads on the right; to honor left padding
        # (the usual setting for batched generation), reverse each sequence,
        # pad, then reverse the padded batch back.
        if self.tokenizer.padding_side == "left":
            input_ids = [torch.flip(_input_ids, [0]) for _input_ids in input_ids]
        input_ids = torch.nn.utils.rnn.pad_sequence(
            input_ids, batch_first=batch_first, padding_value=padding_value
        )
        if self.tokenizer.padding_side == "left":
            input_ids = torch.flip(input_ids, [1])
        return input_ids

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        input_ids = [instance["input_ids"] for instance in instances]
        input_ids = self.pad_sequence(
            input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id
        )

        batch = dict(input_ids=input_ids)
        if "bbox_label" in instances[0].keys():
            batch["bbox_label"] = [instance["bbox_label"] for instance in instances]

        if "obj_context" in instances[0].keys():
            batch["obj_context"] = [instance["obj_context"] for instance in instances]

        if "full_info_dict" in instances[0].keys():
            batch["full_info_dict"] = [instance["full_info_dict"] for instance in instances]

        return batch
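
# Usage note (a sketch, not enforced anywhere in this module): for batched
# decoding, build ObjIdentifierDataset with mode="generate" and set
# tokenizer.padding_side = "left" so prompts are left-padded before generation.
# "obj_context", "bbox_label", and "full_info_dict" are passed through untouched
# so downstream evaluation can compute metrics.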


# test the dataset
if __name__ == "__main__":
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained(
        "/data/jianingy/3d-llama/checkpoints/llava-llama-2-7b-chat-lightning-preview",
        use_fast=False,
    )
    tokenizer.pad_token = tokenizer.unk_token
    conversation_lib.default_conversation = conversation_lib.conv_templates["llava_llama_2"]

    dataset = ObjIdentifierDataset(
        tokenizer,
        data_path="/home/jianingy/research/LLaVA-original/llava/dataset/3dfront/grounded_scene_description_gpt_format.json",
        scene_to_obj_mapping="/home/jianingy/research/LLaVA-original/llava/dataset/3dfront/compressed_organized_data.json",
    )
    print(len(dataset))
    print(dataset[0])
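
    # A minimal batching sketch (assumes the local checkpoint and data paths
    # above exist): wrap the dataset with the training collator and inspect
    # the padded tensor shapes.
    from torch.utils.data import DataLoader

    collator = DataCollatorForObjIdentifierDataset(tokenizer)
    loader = DataLoader(dataset, batch_size=2, shuffle=False, collate_fn=collator)
    batch = next(iter(loader))
    print(batch["input_ids"].shape, batch["labels"].shape, batch["attention_mask"].shape)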