{ "_name_or_path": "dandelin/vilt-b32-mlm", "architectures": [ "ViltForQuestionAnswering" ], "attention_probs_dropout_prob": 0.0, "hidden_act": "gelu", "hidden_dropout_prob": 0.0, "hidden_size": 768, "id2label": { "0": "ice cream", "1": "hair", "2": "2000", "3": "walking", "4": "giraffe", "5": "wine", "6": "laying down", "7": "birthday", "8": "dog", "9": "white and black", "10": "4", "11": "exit", "12": "crown", "13": "lady", "14": "trees", "15": "train", "16": "tabby", "17": "out", "18": "10", "19": "monitor", "20": "window", "21": "boy", "22": "suv", "23": "shade", "24": "snowboard", "25": "plain", "26": "stripes", "27": "bricks", "28": "snow", "29": "talking on phone", "30": "bikes", "31": "7", "32": "protection", "33": "jeep", "34": "tv", "35": "queen", "36": "brick", "37": "style", "38": "donut", "39": "nothing", "40": "lying down", "41": "air", "42": "security", "43": "crossing", "44": "girl", "45": "red", "46": "canopy", "47": "human", "48": "cross", "49": "station", "50": "skateboard", "51": "windows", "52": "hawaii", "53": "full", "54": "sidewalk", "55": "zoo", "56": "big ben", "57": "lanyard", "58": "yellow", "59": "6", "60": "no", "61": "on road", "62": "tired", "63": "little girl", "64": "clear", "65": "sleeping", "66": "forest", "67": "sun", "68": "street", "69": "shadow", "70": "woman", "71": "bus", "72": "at table", "73": "hat", "74": "roof", "75": "red and blue", "76": "watching", "77": "fashion", "78": "blue", "79": "lg", "80": "purple", "81": "bicycles", "82": "africa", "83": "1", "84": "gray and black", "85": "wall", "86": "bike rack", "87": "outside", "88": "0", "89": "7:35", "90": "picnic table", "91": "chopsticks", "92": "not there", "93": "red and yellow", "94": "don't know", "95": "wedding", "96": "not sure", "97": "cat", "98": "sky", "99": "calico", "100": "door", "101": "woods", "102": "skiing", "103": "cloudy", "104": "shelter", "105": "desert", "106": "unknown", "107": "smile", "108": "ground", "109": "clock tower", "110": "2013", "111": "natural", "112": "in car", "113": "car", "114": "talking", "115": "gray", "116": "plastic", "117": "church", "118": "curtains", "119": "tower", "120": "cup", "121": "7:45", "122": "beige", "123": "9:35", "124": "soccer ball", "125": "net", "126": "table", "127": "man", "128": "on street", "129": "screen", "130": "clock", "131": "smiling", "132": "shrimp", "133": "backpack", "134": "french", "135": "double", "136": "soccer", "137": "chair", "138": "arrow", "139": "small", "140": "photographer", "141": "many", "142": "large", "143": "stand", "144": "yes", "145": "snowboarding", "146": "fence", "147": "ball", "148": "blue and white", "149": "curtain", "150": "pink", "151": "rack", "152": "park", "153": "down", "154": "they aren't", "155": "beagle", "156": "green", "157": "black", "158": "3", "159": "solid", "160": "2", "161": "blonde", "162": "platform", "163": "shadows", "164": "black and white", "165": "8", "166": "camera", "167": "orange", "168": "8:35", "169": "low", "170": "tan", "171": "person", "172": "resting", "173": "women", "174": "skateboarding", "175": "cage", "176": "dirt", "177": "leather", "178": "right", "179": "can't tell", "180": "plate", "181": "5", "182": "necklace", "183": "bicycle", "184": "wine tasting", "185": "name tag", "186": "giraffes", "187": "snowboarder", "188": "2010", "189": "skier", "190": "doughnut", "191": "bedroom", "192": "happy", "193": "tent", "194": "white and blue", "195": "neon", "196": "king", "197": "brown", "198": "white" }, "image_size": 384, "initializer_range": 0.02, "intermediate_size": 3072, "label2id": { "0": 88, "1": 83, "10": 18, "2": 160, "2000": 2, "2010": 188, "2013": 110, "3": 158, "4": 10, "5": 181, "6": 59, "7": 31, "7:35": 89, "7:45": 121, "8": 165, "8:35": 168, "9:35": 123, "africa": 82, "air": 41, "arrow": 138, "at table": 72, "backpack": 133, "ball": 147, "beagle": 155, "bedroom": 191, "beige": 122, "bicycle": 183, "bicycles": 81, "big ben": 56, "bike rack": 86, "bikes": 30, "birthday": 7, "black": 157, "black and white": 164, "blonde": 161, "blue": 78, "blue and white": 148, "boy": 21, "brick": 36, "bricks": 27, "brown": 197, "bus": 71, "cage": 175, "calico": 99, "camera": 166, "can't tell": 179, "canopy": 46, "car": 113, "cat": 97, "chair": 137, "chopsticks": 91, "church": 117, "clear": 64, "clock": 130, "clock tower": 109, "cloudy": 103, "cross": 48, "crossing": 43, "crown": 12, "cup": 120, "curtain": 149, "curtains": 118, "desert": 105, "dirt": 176, "dog": 8, "don't know": 94, "donut": 38, "door": 100, "double": 135, "doughnut": 190, "down": 153, "exit": 11, "fashion": 77, "fence": 146, "forest": 66, "french": 134, "full": 53, "giraffe": 4, "giraffes": 186, "girl": 44, "gray": 115, "gray and black": 84, "green": 156, "ground": 108, "hair": 1, "happy": 192, "hat": 73, "hawaii": 52, "human": 47, "ice cream": 0, "in car": 112, "jeep": 33, "king": 196, "lady": 13, "lanyard": 57, "large": 142, "laying down": 6, "leather": 177, "lg": 79, "little girl": 63, "low": 169, "lying down": 40, "man": 127, "many": 141, "monitor": 19, "name tag": 185, "natural": 111, "necklace": 182, "neon": 195, "net": 125, "no": 60, "not sure": 96, "not there": 92, "nothing": 39, "on road": 61, "on street": 128, "orange": 167, "out": 17, "outside": 87, "park": 152, "person": 171, "photographer": 140, "picnic table": 90, "pink": 150, "plain": 25, "plastic": 116, "plate": 180, "platform": 162, "protection": 32, "purple": 80, "queen": 35, "rack": 151, "red": 45, "red and blue": 75, "red and yellow": 93, "resting": 172, "right": 178, "roof": 74, "screen": 129, "security": 42, "shade": 23, "shadow": 69, "shadows": 163, "shelter": 104, "shrimp": 132, "sidewalk": 54, "skateboard": 50, "skateboarding": 174, "skier": 189, "skiing": 102, "sky": 98, "sleeping": 65, "small": 139, "smile": 107, "smiling": 131, "snow": 28, "snowboard": 24, "snowboarder": 187, "snowboarding": 145, "soccer": 136, "soccer ball": 124, "solid": 159, "stand": 143, "station": 49, "street": 68, "stripes": 26, "style": 37, "sun": 67, "suv": 22, "tabby": 16, "table": 126, "talking": 114, "talking on phone": 29, "tan": 170, "tent": 193, "they aren't": 154, "tired": 62, "tower": 119, "train": 15, "trees": 14, "tv": 34, "unknown": 106, "walking": 3, "wall": 85, "watching": 76, "wedding": 95, "white": 198, "white and black": 9, "white and blue": 194, "window": 20, "windows": 51, "wine": 5, "wine tasting": 184, "woman": 70, "women": 173, "woods": 101, "yellow": 58, "yes": 144, "zoo": 55 }, "layer_norm_eps": 1e-12, "max_image_length": -1, "max_position_embeddings": 40, "modality_type_vocab_size": 2, "model_type": "vilt", "num_attention_heads": 12, "num_channels": 3, "num_hidden_layers": 12, "num_images": -1, "patch_size": 32, "qkv_bias": true, "tie_word_embeddings": false, "torch_dtype": "float32", "transformers_version": "4.41.2", "type_vocab_size": 2, "vocab_size": 30522 }