{ "_name_or_path": "dandelin/vilt-b32-mlm", "architectures": [ "ViltForQuestionAnswering" ], "attention_probs_dropout_prob": 0.0, "hidden_act": "gelu", "hidden_dropout_prob": 0.0, "hidden_size": 768, "id2label": { "0": "necklace", "1": "street", "2": "backpack", "3": "church", "4": "tv", "5": "tent", "6": "snowboard", "7": "clock", "8": "6", "9": "on street", "10": "not there", "11": "screen", "12": "women", "13": "lady", "14": "nothing", "15": "walking", "16": "picnic table", "17": "black and white", "18": "name tag", "19": "chair", "20": "1", "21": "sky", "22": "gray and black", "23": "smiling", "24": "skateboard", "25": "8", "26": "not sure", "27": "clock tower", "28": "air", "29": "windows", "30": "lying down", "31": "talking on phone", "32": "king", "33": "bus", "34": "wine tasting", "35": "africa", "36": "unknown", "37": "red", "38": "little girl", "39": "blue", "40": "small", "41": "rack", "42": "soccer", "43": "natural", "44": "right", "45": "on road", "46": "red and blue", "47": "can't tell", "48": "curtains", "49": "park", "50": "tabby", "51": "exit", "52": "full", "53": "at table", "54": "7", "55": "ground", "56": "woman", "57": "human", "58": "fence", "59": "orange", "60": "wedding", "61": "station", "62": "cat", "63": "talking", "64": "jeep", "65": "many", "66": "snowboarding", "67": "big ben", "68": "soccer ball", "69": "bikes", "70": "tired", "71": "window", "72": "monitor", "73": "roof", "74": "door", "75": "laying down", "76": "doughnut", "77": "plain", "78": "2000", "79": "donut", "80": "brown", "81": "tower", "82": "happy", "83": "sun", "84": "snow", "85": "7:35", "86": "giraffe", "87": "giraffes", "88": "style", "89": "5", "90": "birthday", "91": "wall", "92": "cup", "93": "shadow", "94": "cage", "95": "dog", "96": "queen", "97": "crown", "98": "leather", "99": "watching", "100": "skier", "101": "bike rack", "102": "double", "103": "resting", "104": "wine", "105": "green", "106": "french", "107": "gray", "108": "purple", "109": "hat", "110": "tan", "111": "sidewalk", "112": "ice cream", "113": "plastic", "114": "desert", "115": "shrimp", "116": "cross", "117": "hawaii", "118": "low", "119": "10", "120": "beige", "121": "bricks", "122": "white and blue", "123": "shelter", "124": "train", "125": "boy", "126": "8:35", "127": "neon", "128": "black", "129": "crossing", "130": "net", "131": "woods", "132": "shadows", "133": "blonde", "134": "girl", "135": "trees", "136": "7:45", "137": "they aren't", "138": "canopy", "139": "photographer", "140": "security", "141": "2", "142": "man", "143": "skateboarding", "144": "sleeping", "145": "in car", "146": "3", "147": "beagle", "148": "calico", "149": "yes", "150": "red and yellow", "151": "brick", "152": "out", "153": "bicycle", "154": "white and black", "155": "4", "156": "protection", "157": "curtain", "158": "hair", "159": "shade", "160": "table", "161": "pink", "162": "skiing", "163": "don't know", "164": "forest", "165": "zoo", "166": "person", "167": "dirt", "168": "yellow", "169": "arrow", "170": "down", "171": "2010", "172": "camera", "173": "solid", "174": "car", "175": "clear", "176": "white", "177": "smile", "178": "2013", "179": "lg", "180": "fashion", "181": "bicycles", "182": "large", "183": "chopsticks", "184": "platform", "185": "lanyard", "186": "ball", "187": "suv", "188": "cloudy", "189": "bedroom", "190": "stand", "191": "blue and white", "192": "plate", "193": "0", "194": "outside", "195": "snowboarder", "196": "stripes", "197": "9:35", "198": "no" }, "image_size": 384, "initializer_range": 0.02, "intermediate_size": 3072, "label2id": { "0": 193, "1": 20, "10": 119, "2": 141, "2000": 78, "2010": 171, "2013": 178, "3": 146, "4": 155, "5": 89, "6": 8, "7": 54, "7:35": 85, "7:45": 136, "8": 25, "8:35": 126, "9:35": 197, "africa": 35, "air": 28, "arrow": 169, "at table": 53, "backpack": 2, "ball": 186, "beagle": 147, "bedroom": 189, "beige": 120, "bicycle": 153, "bicycles": 181, "big ben": 67, "bike rack": 101, "bikes": 69, "birthday": 90, "black": 128, "black and white": 17, "blonde": 133, "blue": 39, "blue and white": 191, "boy": 125, "brick": 151, "bricks": 121, "brown": 80, "bus": 33, "cage": 94, "calico": 148, "camera": 172, "can't tell": 47, "canopy": 138, "car": 174, "cat": 62, "chair": 19, "chopsticks": 183, "church": 3, "clear": 175, "clock": 7, "clock tower": 27, "cloudy": 188, "cross": 116, "crossing": 129, "crown": 97, "cup": 92, "curtain": 157, "curtains": 48, "desert": 114, "dirt": 167, "dog": 95, "don't know": 163, "donut": 79, "door": 74, "double": 102, "doughnut": 76, "down": 170, "exit": 51, "fashion": 180, "fence": 58, "forest": 164, "french": 106, "full": 52, "giraffe": 86, "giraffes": 87, "girl": 134, "gray": 107, "gray and black": 22, "green": 105, "ground": 55, "hair": 158, "happy": 82, "hat": 109, "hawaii": 117, "human": 57, "ice cream": 112, "in car": 145, "jeep": 64, "king": 32, "lady": 13, "lanyard": 185, "large": 182, "laying down": 75, "leather": 98, "lg": 179, "little girl": 38, "low": 118, "lying down": 30, "man": 142, "many": 65, "monitor": 72, "name tag": 18, "natural": 43, "necklace": 0, "neon": 127, "net": 130, "no": 198, "not sure": 26, "not there": 10, "nothing": 14, "on road": 45, "on street": 9, "orange": 59, "out": 152, "outside": 194, "park": 49, "person": 166, "photographer": 139, "picnic table": 16, "pink": 161, "plain": 77, "plastic": 113, "plate": 192, "platform": 184, "protection": 156, "purple": 108, "queen": 96, "rack": 41, "red": 37, "red and blue": 46, "red and yellow": 150, "resting": 103, "right": 44, "roof": 73, "screen": 11, "security": 140, "shade": 159, "shadow": 93, "shadows": 132, "shelter": 123, "shrimp": 115, "sidewalk": 111, "skateboard": 24, "skateboarding": 143, "skier": 100, "skiing": 162, "sky": 21, "sleeping": 144, "small": 40, "smile": 177, "smiling": 23, "snow": 84, "snowboard": 6, "snowboarder": 195, "snowboarding": 66, "soccer": 42, "soccer ball": 68, "solid": 173, "stand": 190, "station": 61, "street": 1, "stripes": 196, "style": 88, "sun": 83, "suv": 187, "tabby": 50, "table": 160, "talking": 63, "talking on phone": 31, "tan": 110, "tent": 5, "they aren't": 137, "tired": 70, "tower": 81, "train": 124, "trees": 135, "tv": 4, "unknown": 36, "walking": 15, "wall": 91, "watching": 99, "wedding": 60, "white": 176, "white and black": 154, "white and blue": 122, "window": 71, "windows": 29, "wine": 104, "wine tasting": 34, "woman": 56, "women": 12, "woods": 131, "yellow": 168, "yes": 149, "zoo": 165 }, "layer_norm_eps": 1e-12, "max_image_length": -1, "max_position_embeddings": 40, "modality_type_vocab_size": 2, "model_type": "vilt", "num_attention_heads": 12, "num_channels": 3, "num_hidden_layers": 12, "num_images": -1, "patch_size": 32, "qkv_bias": true, "tie_word_embeddings": false, "torch_dtype": "float32", "transformers_version": "4.42.4", "type_vocab_size": 2, "vocab_size": 30522 }