{ "_name_or_path": "dandelin/vilt-b32-mlm", "architectures": [ "ViltForQuestionAnswering" ], "attention_probs_dropout_prob": 0.0, "hidden_act": "gelu", "hidden_dropout_prob": 0.0, "hidden_size": 768, "id2label": { "0": "7", "1": "suv", "2": "doughnut", "3": "out", "4": "cup", "5": "man", "6": "1", "7": "lg", "8": "can't tell", "9": "many", "10": "unknown", "11": "women", "12": "windows", "13": "snow", "14": "giraffe", "15": "7:35", "16": "curtain", "17": "street", "18": "birthday", "19": "small", "20": "ground", "21": "station", "22": "window", "23": "smile", "24": "bicycles", "25": "monitor", "26": "sidewalk", "27": "orange", "28": "church", "29": "bedroom", "30": "clear", "31": "canopy", "32": "snowboarder", "33": "giraffes", "34": "arrow", "35": "plate", "36": "happy", "37": "french", "38": "8", "39": "cross", "40": "lanyard", "41": "bikes", "42": "snowboarding", "43": "gray", "44": "wine", "45": "on street", "46": "gray and black", "47": "chair", "48": "curtains", "49": "white", "50": "blue", "51": "sleeping", "52": "2013", "53": "3", "54": "girl", "55": "2000", "56": "red and yellow", "57": "table", "58": "shadow", "59": "fence", "60": "picnic table", "61": "soccer", "62": "blue and white", "63": "screen", "64": "at table", "65": "name tag", "66": "park", "67": "roof", "68": "right", "69": "blonde", "70": "desert", "71": "stripes", "72": "4", "73": "person", "74": "necklace", "75": "woman", "76": "little girl", "77": "0", "78": "white and black", "79": "2010", "80": "nothing", "81": "crown", "82": "ball", "83": "cloudy", "84": "tan", "85": "clock", "86": "8:35", "87": "shade", "88": "door", "89": "shrimp", "90": "large", "91": "calico", "92": "resting", "93": "soccer ball", "94": "zoo", "95": "shadows", "96": "laying down", "97": "shelter", "98": "sky", "99": "double", "100": "red and blue", "101": "green", "102": "skateboarding", "103": "5", "104": "chopsticks", "105": "black", "106": "human", "107": "train", "108": "not sure", "109": "big ben", "110": "queen", "111": "snowboard", "112": "talking", "113": "hawaii", "114": "white and blue", "115": "bike rack", "116": "car", "117": "rack", "118": "down", "119": "donut", "120": "natural", "121": "air", "122": "beige", "123": "wine tasting", "124": "full", "125": "outside", "126": "hat", "127": "tv", "128": "neon", "129": "hair", "130": "they aren't", "131": "exit", "132": "skateboard", "133": "beagle", "134": "dog", "135": "wall", "136": "dirt", "137": "camera", "138": "cat", "139": "in car", "140": "yellow", "141": "crossing", "142": "style", "143": "9:35", "144": "skiing", "145": "stand", "146": "7:45", "147": "clock tower", "148": "woods", "149": "skier", "150": "protection", "151": "lying down", "152": "platform", "153": "boy", "154": "jeep", "155": "low", "156": "no", "157": "purple", "158": "tabby", "159": "net", "160": "brown", "161": "6", "162": "forest", "163": "lady", "164": "pink", "165": "wedding", "166": "tower", "167": "backpack", "168": "photographer", "169": "not there", "170": "sun", "171": "on road", "172": "don't know", "173": "security", "174": "cage", "175": "plain", "176": "trees", "177": "smiling", "178": "talking on phone", "179": "leather", "180": "plastic", "181": "bicycle", "182": "tent", "183": "black and white", "184": "walking", "185": "2", "186": "ice cream", "187": "brick", "188": "10", "189": "tired", "190": "watching", "191": "king", "192": "yes", "193": "bricks", "194": "red", "195": "fashion", "196": "africa", "197": "bus", "198": "solid" }, "image_size": 384, "initializer_range": 0.02, "intermediate_size": 3072, "label2id": { "0": 77, "1": 6, "10": 188, "2": 185, "2000": 55, "2010": 79, "2013": 52, "3": 53, "4": 72, "5": 103, "6": 161, "7": 0, "7:35": 15, "7:45": 146, "8": 38, "8:35": 86, "9:35": 143, "africa": 196, "air": 121, "arrow": 34, "at table": 64, "backpack": 167, "ball": 82, "beagle": 133, "bedroom": 29, "beige": 122, "bicycle": 181, "bicycles": 24, "big ben": 109, "bike rack": 115, "bikes": 41, "birthday": 18, "black": 105, "black and white": 183, "blonde": 69, "blue": 50, "blue and white": 62, "boy": 153, "brick": 187, "bricks": 193, "brown": 160, "bus": 197, "cage": 174, "calico": 91, "camera": 137, "can't tell": 8, "canopy": 31, "car": 116, "cat": 138, "chair": 47, "chopsticks": 104, "church": 28, "clear": 30, "clock": 85, "clock tower": 147, "cloudy": 83, "cross": 39, "crossing": 141, "crown": 81, "cup": 4, "curtain": 16, "curtains": 48, "desert": 70, "dirt": 136, "dog": 134, "don't know": 172, "donut": 119, "door": 88, "double": 99, "doughnut": 2, "down": 118, "exit": 131, "fashion": 195, "fence": 59, "forest": 162, "french": 37, "full": 124, "giraffe": 14, "giraffes": 33, "girl": 54, "gray": 43, "gray and black": 46, "green": 101, "ground": 20, "hair": 129, "happy": 36, "hat": 126, "hawaii": 113, "human": 106, "ice cream": 186, "in car": 139, "jeep": 154, "king": 191, "lady": 163, "lanyard": 40, "large": 90, "laying down": 96, "leather": 179, "lg": 7, "little girl": 76, "low": 155, "lying down": 151, "man": 5, "many": 9, "monitor": 25, "name tag": 65, "natural": 120, "necklace": 74, "neon": 128, "net": 159, "no": 156, "not sure": 108, "not there": 169, "nothing": 80, "on road": 171, "on street": 45, "orange": 27, "out": 3, "outside": 125, "park": 66, "person": 73, "photographer": 168, "picnic table": 60, "pink": 164, "plain": 175, "plastic": 180, "plate": 35, "platform": 152, "protection": 150, "purple": 157, "queen": 110, "rack": 117, "red": 194, "red and blue": 100, "red and yellow": 56, "resting": 92, "right": 68, "roof": 67, "screen": 63, "security": 173, "shade": 87, "shadow": 58, "shadows": 95, "shelter": 97, "shrimp": 89, "sidewalk": 26, "skateboard": 132, "skateboarding": 102, "skier": 149, "skiing": 144, "sky": 98, "sleeping": 51, "small": 19, "smile": 23, "smiling": 177, "snow": 13, "snowboard": 111, "snowboarder": 32, "snowboarding": 42, "soccer": 61, "soccer ball": 93, "solid": 198, "stand": 145, "station": 21, "street": 17, "stripes": 71, "style": 142, "sun": 170, "suv": 1, "tabby": 158, "table": 57, "talking": 112, "talking on phone": 178, "tan": 84, "tent": 182, "they aren't": 130, "tired": 189, "tower": 166, "train": 107, "trees": 176, "tv": 127, "unknown": 10, "walking": 184, "wall": 135, "watching": 190, "wedding": 165, "white": 49, "white and black": 78, "white and blue": 114, "window": 22, "windows": 12, "wine": 44, "wine tasting": 123, "woman": 75, "women": 11, "woods": 148, "yellow": 140, "yes": 192, "zoo": 94 }, "layer_norm_eps": 1e-12, "max_image_length": -1, "max_position_embeddings": 40, "modality_type_vocab_size": 2, "model_type": "vilt", "num_attention_heads": 12, "num_channels": 3, "num_hidden_layers": 12, "num_images": -1, "patch_size": 32, "qkv_bias": true, "tie_word_embeddings": false, "torch_dtype": "float32", "transformers_version": "4.37.2", "type_vocab_size": 2, "vocab_size": 30522 }