{ "_name_or_path": "dandelin/vilt-b32-mlm", "architectures": [ "ViltForQuestionAnswering" ], "attention_probs_dropout_prob": 0.0, "hidden_act": "gelu", "hidden_dropout_prob": 0.0, "hidden_size": 768, "id2label": { "0": "net", "1": "cup", "2": "rack", "3": "window", "4": "fashion", "5": "tired", "6": "white and black", "7": "don't know", "8": "3", "9": "skateboard", "10": "exit", "11": "0", "12": "ball", "13": "lg", "14": "blue and white", "15": "white and blue", "16": "wedding", "17": "big ben", "18": "brick", "19": "6", "20": "skateboarding", "21": "shrimp", "22": "camera", "23": "birthday", "24": "bus", "25": "church", "26": "no", "27": "resting", "28": "shadows", "29": "down", "30": "solid", "31": "french", "32": "curtains", "33": "air", "34": "trees", "35": "chair", "36": "shadow", "37": "shade", "38": "snow", "39": "hawaii", "40": "person", "41": "5", "42": "style", "43": "donut", "44": "4", "45": "cat", "46": "zoo", "47": "dirt", "48": "low", "49": "necklace", "50": "africa", "51": "gray and black", "52": "white", "53": "neon", "54": "2", "55": "bricks", "56": "arrow", "57": "lying down", "58": "giraffe", "59": "backpack", "60": "on street", "61": "platform", "62": "canopy", "63": "cross", "64": "tan", "65": "ground", "66": "soccer", "67": "plate", "68": "skier", "69": "park", "70": "blue", "71": "smiling", "72": "beige", "73": "red", "74": "2013", "75": "little girl", "76": "happy", "77": "name tag", "78": "bikes", "79": "talking on phone", "80": "shelter", "81": "snowboarder", "82": "chopsticks", "83": "small", "84": "lanyard", "85": "wine", "86": "wine tasting", "87": "man", "88": "forest", "89": "green", "90": "tv", "91": "curtain", "92": "snowboarding", "93": "door", "94": "soccer ball", "95": "outside", "96": "walking", "97": "at table", "98": "pink", "99": "tower", "100": "orange", "101": "yes", "102": "black", "103": "sun", "104": "fence", "105": "calico", "106": "clear", "107": "red and yellow", "108": "gray", "109": "8:35", "110": "windows", "111": "human", "112": "stripes", "113": "picnic table", "114": "not sure", "115": "sky", "116": "they aren't", "117": "out", "118": "car", "119": "many", "120": "black and white", "121": "street", "122": "giraffes", "123": "7:35", "124": "security", "125": "tabby", "126": "watching", "127": "king", "128": "desert", "129": "train", "130": "nothing", "131": "clock", "132": "9:35", "133": "crown", "134": "girl", "135": "7", "136": "doughnut", "137": "sidewalk", "138": "in car", "139": "double", "140": "tent", "141": "skiing", "142": "hair", "143": "yellow", "144": "10", "145": "unknown", "146": "7:45", "147": "8", "148": "right", "149": "lady", "150": "plastic", "151": "talking", "152": "jeep", "153": "boy", "154": "monitor", "155": "ice cream", "156": "1", "157": "purple", "158": "hat", "159": "large", "160": "bicycles", "161": "crossing", "162": "brown", "163": "red and blue", "164": "not there", "165": "dog", "166": "can't tell", "167": "smile", "168": "sleeping", "169": "table", "170": "bike rack", "171": "screen", "172": "wall", "173": "clock tower", "174": "station", "175": "beagle", "176": "bedroom", "177": "cage", "178": "2010", "179": "women", "180": "woods", "181": "cloudy", "182": "natural", "183": "full", "184": "queen", "185": "stand", "186": "roof", "187": "blonde", "188": "on road", "189": "bicycle", "190": "laying down", "191": "photographer", "192": "protection", "193": "2000", "194": "plain", "195": "suv", "196": "leather", "197": "snowboard", "198": "woman" }, "image_size": 384, "initializer_range": 0.02, "intermediate_size": 3072, "label2id": { "0": 11, "1": 156, "10": 144, "2": 54, "2000": 193, "2010": 178, "2013": 74, "3": 8, "4": 44, "5": 41, "6": 19, "7": 135, "7:35": 123, "7:45": 146, "8": 147, "8:35": 109, "9:35": 132, "africa": 50, "air": 33, "arrow": 56, "at table": 97, "backpack": 59, "ball": 12, "beagle": 175, "bedroom": 176, "beige": 72, "bicycle": 189, "bicycles": 160, "big ben": 17, "bike rack": 170, "bikes": 78, "birthday": 23, "black": 102, "black and white": 120, "blonde": 187, "blue": 70, "blue and white": 14, "boy": 153, "brick": 18, "bricks": 55, "brown": 162, "bus": 24, "cage": 177, "calico": 105, "camera": 22, "can't tell": 166, "canopy": 62, "car": 118, "cat": 45, "chair": 35, "chopsticks": 82, "church": 25, "clear": 106, "clock": 131, "clock tower": 173, "cloudy": 181, "cross": 63, "crossing": 161, "crown": 133, "cup": 1, "curtain": 91, "curtains": 32, "desert": 128, "dirt": 47, "dog": 165, "don't know": 7, "donut": 43, "door": 93, "double": 139, "doughnut": 136, "down": 29, "exit": 10, "fashion": 4, "fence": 104, "forest": 88, "french": 31, "full": 183, "giraffe": 58, "giraffes": 122, "girl": 134, "gray": 108, "gray and black": 51, "green": 89, "ground": 65, "hair": 142, "happy": 76, "hat": 158, "hawaii": 39, "human": 111, "ice cream": 155, "in car": 138, "jeep": 152, "king": 127, "lady": 149, "lanyard": 84, "large": 159, "laying down": 190, "leather": 196, "lg": 13, "little girl": 75, "low": 48, "lying down": 57, "man": 87, "many": 119, "monitor": 154, "name tag": 77, "natural": 182, "necklace": 49, "neon": 53, "net": 0, "no": 26, "not sure": 114, "not there": 164, "nothing": 130, "on road": 188, "on street": 60, "orange": 100, "out": 117, "outside": 95, "park": 69, "person": 40, "photographer": 191, "picnic table": 113, "pink": 98, "plain": 194, "plastic": 150, "plate": 67, "platform": 61, "protection": 192, "purple": 157, "queen": 184, "rack": 2, "red": 73, "red and blue": 163, "red and yellow": 107, "resting": 27, "right": 148, "roof": 186, "screen": 171, "security": 124, "shade": 37, "shadow": 36, "shadows": 28, "shelter": 80, "shrimp": 21, "sidewalk": 137, "skateboard": 9, "skateboarding": 20, "skier": 68, "skiing": 141, "sky": 115, "sleeping": 168, "small": 83, "smile": 167, "smiling": 71, "snow": 38, "snowboard": 197, "snowboarder": 81, "snowboarding": 92, "soccer": 66, "soccer ball": 94, "solid": 30, "stand": 185, "station": 174, "street": 121, "stripes": 112, "style": 42, "sun": 103, "suv": 195, "tabby": 125, "table": 169, "talking": 151, "talking on phone": 79, "tan": 64, "tent": 140, "they aren't": 116, "tired": 5, "tower": 99, "train": 129, "trees": 34, "tv": 90, "unknown": 145, "walking": 96, "wall": 172, "watching": 126, "wedding": 16, "white": 52, "white and black": 6, "white and blue": 15, "window": 3, "windows": 110, "wine": 85, "wine tasting": 86, "woman": 198, "women": 179, "woods": 180, "yellow": 143, "yes": 101, "zoo": 46 }, "layer_norm_eps": 1e-12, "max_image_length": -1, "max_position_embeddings": 40, "modality_type_vocab_size": 2, "model_type": "vilt", "num_attention_heads": 12, "num_channels": 3, "num_hidden_layers": 12, "num_images": -1, "patch_size": 32, "qkv_bias": true, "tie_word_embeddings": false, "torch_dtype": "float32", "transformers_version": "4.40.1", "type_vocab_size": 2, "vocab_size": 30522 }