{ "_name_or_path": "dandelin/vilt-b32-mlm", "architectures": [ "ViltForQuestionAnswering" ], "attention_probs_dropout_prob": 0.0, "hidden_act": "gelu", "hidden_dropout_prob": 0.0, "hidden_size": 768, "id2label": { "0": "9:35", "1": "queen", "2": "can't tell", "3": "white", "4": "ground", "5": "gray", "6": "plain", "7": "zoo", "8": "doughnut", "9": "shelter", "10": "door", "11": "white and black", "12": "7", "13": "women", "14": "dog", "15": "cloudy", "16": "train", "17": "suv", "18": "human", "19": "black", "20": "ball", "21": "curtains", "22": "2000", "23": "station", "24": "trees", "25": "bike rack", "26": "windows", "27": "snowboard", "28": "neon", "29": "not there", "30": "car", "31": "bicycle", "32": "hawaii", "33": "low", "34": "6", "35": "resting", "36": "skateboarding", "37": "red", "38": "bedroom", "39": "bricks", "40": "king", "41": "at table", "42": "red and yellow", "43": "fashion", "44": "camera", "45": "soccer", "46": "stripes", "47": "large", "48": "wedding", "49": "clear", "50": "7:35", "51": "table", "52": "lanyard", "53": "tower", "54": "bikes", "55": "no", "56": "exit", "57": "backpack", "58": "sleeping", "59": "orange", "60": "pink", "61": "double", "62": "unknown", "63": "don't know", "64": "shrimp", "65": "tabby", "66": "10", "67": "girl", "68": "wall", "69": "many", "70": "screen", "71": "bicycles", "72": "chopsticks", "73": "smile", "74": "2", "75": "down", "76": "small", "77": "stand", "78": "leather", "79": "2013", "80": "green", "81": "blue", "82": "tent", "83": "dirt", "84": "brown", "85": "tired", "86": "boy", "87": "clock", "88": "big ben", "89": "church", "90": "sky", "91": "solid", "92": "crown", "93": "snowboarder", "94": "monitor", "95": "smiling", "96": "shade", "97": "plastic", "98": "skier", "99": "roof", "100": "curtain", "101": "crossing", "102": "picnic table", "103": "lying down", "104": "happy", "105": "yes", "106": "walking", "107": "africa", "108": "5", "109": "blonde", "110": "out", "111": "name tag", "112": "right", "113": "8:35", "114": "man", "115": "red and blue", "116": "cat", "117": "air", "118": "style", "119": "4", "120": "calico", "121": "7:45", "122": "woman", "123": "hat", "124": "gray and black", "125": "giraffes", "126": "birthday", "127": "not sure", "128": "in car", "129": "they aren't", "130": "protection", "131": "8", "132": "brick", "133": "on street", "134": "wine tasting", "135": "fence", "136": "beagle", "137": "tan", "138": "watching", "139": "arrow", "140": "cage", "141": "ice cream", "142": "soccer ball", "143": "rack", "144": "park", "145": "hair", "146": "jeep", "147": "desert", "148": "chair", "149": "bus", "150": "donut", "151": "platform", "152": "wine", "153": "photographer", "154": "natural", "155": "white and blue", "156": "talking", "157": "lady", "158": "street", "159": "net", "160": "sidewalk", "161": "person", "162": "shadows", "163": "laying down", "164": "talking on phone", "165": "3", "166": "purple", "167": "security", "168": "sun", "169": "tv", "170": "0", "171": "cup", "172": "window", "173": "1", "174": "lg", "175": "yellow", "176": "clock tower", "177": "nothing", "178": "little girl", "179": "snow", "180": "woods", "181": "canopy", "182": "on road", "183": "blue and white", "184": "2010", "185": "plate", "186": "black and white", "187": "beige", "188": "outside", "189": "forest", "190": "shadow", "191": "cross", "192": "french", "193": "necklace", "194": "full", "195": "snowboarding", "196": "skateboard", "197": "giraffe", "198": "skiing" }, "image_size": 384, "initializer_range": 0.02, "intermediate_size": 3072, "label2id": { "0": 170, "1": 173, "10": 66, "2": 74, "2000": 22, "2010": 184, "2013": 79, "3": 165, "4": 119, "5": 108, "6": 34, "7": 12, "7:35": 50, "7:45": 121, "8": 131, "8:35": 113, "9:35": 0, "africa": 107, "air": 117, "arrow": 139, "at table": 41, "backpack": 57, "ball": 20, "beagle": 136, "bedroom": 38, "beige": 187, "bicycle": 31, "bicycles": 71, "big ben": 88, "bike rack": 25, "bikes": 54, "birthday": 126, "black": 19, "black and white": 186, "blonde": 109, "blue": 81, "blue and white": 183, "boy": 86, "brick": 132, "bricks": 39, "brown": 84, "bus": 149, "cage": 140, "calico": 120, "camera": 44, "can't tell": 2, "canopy": 181, "car": 30, "cat": 116, "chair": 148, "chopsticks": 72, "church": 89, "clear": 49, "clock": 87, "clock tower": 176, "cloudy": 15, "cross": 191, "crossing": 101, "crown": 92, "cup": 171, "curtain": 100, "curtains": 21, "desert": 147, "dirt": 83, "dog": 14, "don't know": 63, "donut": 150, "door": 10, "double": 61, "doughnut": 8, "down": 75, "exit": 56, "fashion": 43, "fence": 135, "forest": 189, "french": 192, "full": 194, "giraffe": 197, "giraffes": 125, "girl": 67, "gray": 5, "gray and black": 124, "green": 80, "ground": 4, "hair": 145, "happy": 104, "hat": 123, "hawaii": 32, "human": 18, "ice cream": 141, "in car": 128, "jeep": 146, "king": 40, "lady": 157, "lanyard": 52, "large": 47, "laying down": 163, "leather": 78, "lg": 174, "little girl": 178, "low": 33, "lying down": 103, "man": 114, "many": 69, "monitor": 94, "name tag": 111, "natural": 154, "necklace": 193, "neon": 28, "net": 159, "no": 55, "not sure": 127, "not there": 29, "nothing": 177, "on road": 182, "on street": 133, "orange": 59, "out": 110, "outside": 188, "park": 144, "person": 161, "photographer": 153, "picnic table": 102, "pink": 60, "plain": 6, "plastic": 97, "plate": 185, "platform": 151, "protection": 130, "purple": 166, "queen": 1, "rack": 143, "red": 37, "red and blue": 115, "red and yellow": 42, "resting": 35, "right": 112, "roof": 99, "screen": 70, "security": 167, "shade": 96, "shadow": 190, "shadows": 162, "shelter": 9, "shrimp": 64, "sidewalk": 160, "skateboard": 196, "skateboarding": 36, "skier": 98, "skiing": 198, "sky": 90, "sleeping": 58, "small": 76, "smile": 73, "smiling": 95, "snow": 179, "snowboard": 27, "snowboarder": 93, "snowboarding": 195, "soccer": 45, "soccer ball": 142, "solid": 91, "stand": 77, "station": 23, "street": 158, "stripes": 46, "style": 118, "sun": 168, "suv": 17, "tabby": 65, "table": 51, "talking": 156, "talking on phone": 164, "tan": 137, "tent": 82, "they aren't": 129, "tired": 85, "tower": 53, "train": 16, "trees": 24, "tv": 169, "unknown": 62, "walking": 106, "wall": 68, "watching": 138, "wedding": 48, "white": 3, "white and black": 11, "white and blue": 155, "window": 172, "windows": 26, "wine": 152, "wine tasting": 134, "woman": 122, "women": 13, "woods": 180, "yellow": 175, "yes": 105, "zoo": 7 }, "layer_norm_eps": 1e-12, "max_image_length": -1, "max_position_embeddings": 40, "modality_type_vocab_size": 2, "model_type": "vilt", "num_attention_heads": 12, "num_channels": 3, "num_hidden_layers": 12, "num_images": -1, "patch_size": 32, "qkv_bias": true, "tie_word_embeddings": false, "torch_dtype": "float32", "transformers_version": "4.31.0", "type_vocab_size": 2, "vocab_size": 30522 }