{ "_name_or_path": "dandelin/vilt-b32-mlm", "architectures": [ "ViltForQuestionAnswering" ], "attention_probs_dropout_prob": 0.0, "hidden_act": "gelu", "hidden_dropout_prob": 0.0, "hidden_size": 768, "id2label": { "0": "cat", "1": "2010", "2": "sky", "3": "net", "4": "happy", "5": "soccer ball", "6": "name tag", "7": "lg", "8": "plastic", "9": "7:35", "10": "white", "11": "chopsticks", "12": "shade", "13": "backpack", "14": "bicycle", "15": "suv", "16": "woods", "17": "black", "18": "crown", "19": "king", "20": "shadows", "21": "white and blue", "22": "camera", "23": "snowboard", "24": "right", "25": "stand", "26": "nothing", "27": "man", "28": "yes", "29": "birthday", "30": "gray", "31": "soccer", "32": "cross", "33": "giraffes", "34": "shelter", "35": "purple", "36": "bicycles", "37": "crossing", "38": "9:35", "39": "neon", "40": "natural", "41": "skateboard", "42": "yellow", "43": "tan", "44": "green", "45": "photographer", "46": "gray and black", "47": "wine", "48": "little girl", "49": "8", "50": "boy", "51": "plain", "52": "can't tell", "53": "calico", "54": "clock", "55": "dirt", "56": "solid", "57": "many", "58": "7", "59": "2000", "60": "canopy", "61": "down", "62": "door", "63": "rack", "64": "fashion", "65": "bricks", "66": "wedding", "67": "7:45", "68": "1", "69": "pink", "70": "small", "71": "unknown", "72": "skiing", "73": "french", "74": "wine tasting", "75": "clock tower", "76": "snowboarder", "77": "2", "78": "ball", "79": "on road", "80": "blue", "81": "plate", "82": "4", "83": "station", "84": "in car", "85": "table", "86": "cup", "87": "trees", "88": "resting", "89": "snowboarding", "90": "arrow", "91": "curtain", "92": "hat", "93": "donut", "94": "street", "95": "windows", "96": "2013", "97": "girl", "98": "tent", "99": "they aren't", "100": "no", "101": "person", "102": "air", "103": "woman", "104": "ice cream", "105": "tired", "106": "0", "107": "monitor", "108": "leather", "109": "jeep", "110": "human", "111": "ground", "112": "hawaii", "113": "blue and white", "114": "orange", "115": "window", "116": "watching", "117": "6", "118": "brown", "119": "screen", "120": "not sure", "121": "africa", "122": "women", "123": "doughnut", "124": "forest", "125": "out", "126": "big ben", "127": "sleeping", "128": "giraffe", "129": "car", "130": "shrimp", "131": "protection", "132": "wall", "133": "cage", "134": "skateboarding", "135": "roof", "136": "don't know", "137": "beagle", "138": "beige", "139": "bedroom", "140": "bus", "141": "full", "142": "walking", "143": "large", "144": "low", "145": "skier", "146": "sidewalk", "147": "tv", "148": "at table", "149": "security", "150": "laying down", "151": "sun", "152": "cloudy", "153": "dog", "154": "smile", "155": "white and black", "156": "desert", "157": "double", "158": "not there", "159": "red", "160": "talking", "161": "bike rack", "162": "queen", "163": "platform", "164": "talking on phone", "165": "lying down", "166": "park", "167": "black and white", "168": "exit", "169": "lady", "170": "clear", "171": "on street", "172": "8:35", "173": "5", "174": "shadow", "175": "outside", "176": "red and yellow", "177": "smiling", "178": "zoo", "179": "fence", "180": "lanyard", "181": "necklace", "182": "brick", "183": "church", "184": "10", "185": "stripes", "186": "chair", "187": "tabby", "188": "hair", "189": "blonde", "190": "picnic table", "191": "train", "192": "tower", "193": "style", "194": "curtains", "195": "snow", "196": "bikes", "197": "red and blue", "198": "3" }, "image_size": 384, "initializer_range": 0.02, "intermediate_size": 3072, "label2id": { "0": 106, "1": 68, "10": 184, "2": 77, "2000": 59, "2010": 1, "2013": 96, "3": 198, "4": 82, "5": 173, "6": 117, "7": 58, "7:35": 9, "7:45": 67, "8": 49, "8:35": 172, "9:35": 38, "africa": 121, "air": 102, "arrow": 90, "at table": 148, "backpack": 13, "ball": 78, "beagle": 137, "bedroom": 139, "beige": 138, "bicycle": 14, "bicycles": 36, "big ben": 126, "bike rack": 161, "bikes": 196, "birthday": 29, "black": 17, "black and white": 167, "blonde": 189, "blue": 80, "blue and white": 113, "boy": 50, "brick": 182, "bricks": 65, "brown": 118, "bus": 140, "cage": 133, "calico": 53, "camera": 22, "can't tell": 52, "canopy": 60, "car": 129, "cat": 0, "chair": 186, "chopsticks": 11, "church": 183, "clear": 170, "clock": 54, "clock tower": 75, "cloudy": 152, "cross": 32, "crossing": 37, "crown": 18, "cup": 86, "curtain": 91, "curtains": 194, "desert": 156, "dirt": 55, "dog": 153, "don't know": 136, "donut": 93, "door": 62, "double": 157, "doughnut": 123, "down": 61, "exit": 168, "fashion": 64, "fence": 179, "forest": 124, "french": 73, "full": 141, "giraffe": 128, "giraffes": 33, "girl": 97, "gray": 30, "gray and black": 46, "green": 44, "ground": 111, "hair": 188, "happy": 4, "hat": 92, "hawaii": 112, "human": 110, "ice cream": 104, "in car": 84, "jeep": 109, "king": 19, "lady": 169, "lanyard": 180, "large": 143, "laying down": 150, "leather": 108, "lg": 7, "little girl": 48, "low": 144, "lying down": 165, "man": 27, "many": 57, "monitor": 107, "name tag": 6, "natural": 40, "necklace": 181, "neon": 39, "net": 3, "no": 100, "not sure": 120, "not there": 158, "nothing": 26, "on road": 79, "on street": 171, "orange": 114, "out": 125, "outside": 175, "park": 166, "person": 101, "photographer": 45, "picnic table": 190, "pink": 69, "plain": 51, "plastic": 8, "plate": 81, "platform": 163, "protection": 131, "purple": 35, "queen": 162, "rack": 63, "red": 159, "red and blue": 197, "red and yellow": 176, "resting": 88, "right": 24, "roof": 135, "screen": 119, "security": 149, "shade": 12, "shadow": 174, "shadows": 20, "shelter": 34, "shrimp": 130, "sidewalk": 146, "skateboard": 41, "skateboarding": 134, "skier": 145, "skiing": 72, "sky": 2, "sleeping": 127, "small": 70, "smile": 154, "smiling": 177, "snow": 195, "snowboard": 23, "snowboarder": 76, "snowboarding": 89, "soccer": 31, "soccer ball": 5, "solid": 56, "stand": 25, "station": 83, "street": 94, "stripes": 185, "style": 193, "sun": 151, "suv": 15, "tabby": 187, "table": 85, "talking": 160, "talking on phone": 164, "tan": 43, "tent": 98, "they aren't": 99, "tired": 105, "tower": 192, "train": 191, "trees": 87, "tv": 147, "unknown": 71, "walking": 142, "wall": 132, "watching": 116, "wedding": 66, "white": 10, "white and black": 155, "white and blue": 21, "window": 115, "windows": 95, "wine": 47, "wine tasting": 74, "woman": 103, "women": 122, "woods": 16, "yellow": 42, "yes": 28, "zoo": 178 }, "layer_norm_eps": 1e-12, "max_image_length": -1, "max_position_embeddings": 40, "modality_type_vocab_size": 2, "model_type": "vilt", "num_attention_heads": 12, "num_channels": 3, "num_hidden_layers": 12, "num_images": -1, "patch_size": 32, "qkv_bias": true, "tie_word_embeddings": false, "torch_dtype": "float32", "transformers_version": "4.33.2", "type_vocab_size": 2, "vocab_size": 30522 }