nielsr's picture
nielsr HF staff
Upload AudioSpectrogramTransformerForSequenceClassification
3fd2720
raw
history blame
26.8 kB
{
"architectures": [
"AudioSpectrogramTransformerForSequenceClassification"
],
"attention_probs_dropout_prob": 0.0,
"frequency_dimension": 128,
"frequency_stride": 14,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.0,
"hidden_size": 768,
"id2label": {
"0": "Speech",
"1": "Male speech, man speaking",
"2": "Female speech, woman speaking",
"3": "Child speech, kid speaking",
"4": "Conversation",
"5": "Narration, monologue",
"6": "Babbling",
"7": "Speech synthesizer",
"8": "Shout",
"9": "Bellow",
"10": "Whoop",
"11": "Yell",
"12": "Battle cry",
"13": "Children shouting",
"14": "Screaming",
"15": "Whispering",
"16": "Laughter",
"17": "Baby laughter",
"18": "Giggle",
"19": "Snicker",
"20": "Belly laugh",
"21": "Chuckle, chortle",
"22": "Crying, sobbing",
"23": "Baby cry, infant cry",
"24": "Whimper",
"25": "Wail, moan",
"26": "Sigh",
"27": "Singing",
"28": "Choir",
"29": "Yodeling",
"30": "Chant",
"31": "Mantra",
"32": "Male singing",
"33": "Female singing",
"34": "Child singing",
"35": "Synthetic singing",
"36": "Rapping",
"37": "Humming",
"38": "Groan",
"39": "Grunt",
"40": "Whistling",
"41": "Breathing",
"42": "Wheeze",
"43": "Snoring",
"44": "Gasp",
"45": "Pant",
"46": "Snort",
"47": "Cough",
"48": "Throat clearing",
"49": "Sneeze",
"50": "Sniff",
"51": "Run",
"52": "Shuffle",
"53": "Walk, footsteps",
"54": "Chewing, mastication",
"55": "Biting",
"56": "Gargling",
"57": "Stomach rumble",
"58": "Burping, eructation",
"59": "Hiccup",
"60": "Fart",
"61": "Hands",
"62": "Finger snapping",
"63": "Clapping",
"64": "Heart sounds, heartbeat",
"65": "Heart murmur",
"66": "Cheering",
"67": "Applause",
"68": "Chatter",
"69": "Crowd",
"70": "Hubbub, speech noise, speech babble",
"71": "Children playing",
"72": "Animal",
"73": "Domestic animals, pets",
"74": "Dog",
"75": "Bark",
"76": "Yip",
"77": "Howl",
"78": "Bow-wow",
"79": "Growling",
"80": "Whimper (dog)",
"81": "Cat",
"82": "Purr",
"83": "Meow",
"84": "Hiss",
"85": "Caterwaul",
"86": "Livestock, farm animals, working animals",
"87": "Horse",
"88": "Clip-clop",
"89": "Neigh, whinny",
"90": "Cattle, bovinae",
"91": "Moo",
"92": "Cowbell",
"93": "Pig",
"94": "Oink",
"95": "Goat",
"96": "Bleat",
"97": "Sheep",
"98": "Fowl",
"99": "Chicken, rooster",
"100": "Cluck",
"101": "Crowing, cock-a-doodle-doo",
"102": "Turkey",
"103": "Gobble",
"104": "Duck",
"105": "Quack",
"106": "Goose",
"107": "Honk",
"108": "Wild animals",
"109": "Roaring cats (lions, tigers)",
"110": "Roar",
"111": "Bird",
"112": "Bird vocalization, bird call, bird song",
"113": "Chirp, tweet",
"114": "Squawk",
"115": "Pigeon, dove",
"116": "Coo",
"117": "Crow",
"118": "Caw",
"119": "Owl",
"120": "Hoot",
"121": "Bird flight, flapping wings",
"122": "Canidae, dogs, wolves",
"123": "Rodents, rats, mice",
"124": "Mouse",
"125": "Patter",
"126": "Insect",
"127": "Cricket",
"128": "Mosquito",
"129": "Fly, housefly",
"130": "Buzz",
"131": "Bee, wasp, etc.",
"132": "Frog",
"133": "Croak",
"134": "Snake",
"135": "Rattle",
"136": "Whale vocalization",
"137": "Music",
"138": "Musical instrument",
"139": "Plucked string instrument",
"140": "Guitar",
"141": "Electric guitar",
"142": "Bass guitar",
"143": "Acoustic guitar",
"144": "Steel guitar, slide guitar",
"145": "Tapping (guitar technique)",
"146": "Strum",
"147": "Banjo",
"148": "Sitar",
"149": "Mandolin",
"150": "Zither",
"151": "Ukulele",
"152": "Keyboard (musical)",
"153": "Piano",
"154": "Electric piano",
"155": "Organ",
"156": "Electronic organ",
"157": "Hammond organ",
"158": "Synthesizer",
"159": "Sampler",
"160": "Harpsichord",
"161": "Percussion",
"162": "Drum kit",
"163": "Drum machine",
"164": "Drum",
"165": "Snare drum",
"166": "Rimshot",
"167": "Drum roll",
"168": "Bass drum",
"169": "Timpani",
"170": "Tabla",
"171": "Cymbal",
"172": "Hi-hat",
"173": "Wood block",
"174": "Tambourine",
"175": "Rattle (instrument)",
"176": "Maraca",
"177": "Gong",
"178": "Tubular bells",
"179": "Mallet percussion",
"180": "Marimba, xylophone",
"181": "Glockenspiel",
"182": "Vibraphone",
"183": "Steelpan",
"184": "Orchestra",
"185": "Brass instrument",
"186": "French horn",
"187": "Trumpet",
"188": "Trombone",
"189": "Bowed string instrument",
"190": "String section",
"191": "Violin, fiddle",
"192": "Pizzicato",
"193": "Cello",
"194": "Double bass",
"195": "Wind instrument, woodwind instrument",
"196": "Flute",
"197": "Saxophone",
"198": "Clarinet",
"199": "Harp",
"200": "Bell",
"201": "Church bell",
"202": "Jingle bell",
"203": "Bicycle bell",
"204": "Tuning fork",
"205": "Chime",
"206": "Wind chime",
"207": "Change ringing (campanology)",
"208": "Harmonica",
"209": "Accordion",
"210": "Bagpipes",
"211": "Didgeridoo",
"212": "Shofar",
"213": "Theremin",
"214": "Singing bowl",
"215": "Scratching (performance technique)",
"216": "Pop music",
"217": "Hip hop music",
"218": "Beatboxing",
"219": "Rock music",
"220": "Heavy metal",
"221": "Punk rock",
"222": "Grunge",
"223": "Progressive rock",
"224": "Rock and roll",
"225": "Psychedelic rock",
"226": "Rhythm and blues",
"227": "Soul music",
"228": "Reggae",
"229": "Country",
"230": "Swing music",
"231": "Bluegrass",
"232": "Funk",
"233": "Folk music",
"234": "Middle Eastern music",
"235": "Jazz",
"236": "Disco",
"237": "Classical music",
"238": "Opera",
"239": "Electronic music",
"240": "House music",
"241": "Techno",
"242": "Dubstep",
"243": "Drum and bass",
"244": "Electronica",
"245": "Electronic dance music",
"246": "Ambient music",
"247": "Trance music",
"248": "Music of Latin America",
"249": "Salsa music",
"250": "Flamenco",
"251": "Blues",
"252": "Music for children",
"253": "New-age music",
"254": "Vocal music",
"255": "A capella",
"256": "Music of Africa",
"257": "Afrobeat",
"258": "Christian music",
"259": "Gospel music",
"260": "Music of Asia",
"261": "Carnatic music",
"262": "Music of Bollywood",
"263": "Ska",
"264": "Traditional music",
"265": "Independent music",
"266": "Song",
"267": "Background music",
"268": "Theme music",
"269": "Jingle (music)",
"270": "Soundtrack music",
"271": "Lullaby",
"272": "Video game music",
"273": "Christmas music",
"274": "Dance music",
"275": "Wedding music",
"276": "Happy music",
"277": "Funny music",
"278": "Sad music",
"279": "Tender music",
"280": "Exciting music",
"281": "Angry music",
"282": "Scary music",
"283": "Wind",
"284": "Rustling leaves",
"285": "Wind noise (microphone)",
"286": "Thunderstorm",
"287": "Thunder",
"288": "Water",
"289": "Rain",
"290": "Raindrop",
"291": "Rain on surface",
"292": "Stream",
"293": "Waterfall",
"294": "Ocean",
"295": "Waves, surf",
"296": "Steam",
"297": "Gurgling",
"298": "Fire",
"299": "Crackle",
"300": "Vehicle",
"301": "Boat, Water vehicle",
"302": "Sailboat, sailing ship",
"303": "Rowboat, canoe, kayak",
"304": "Motorboat, speedboat",
"305": "Ship",
"306": "Motor vehicle (road)",
"307": "Car",
"308": "Vehicle horn, car horn, honking",
"309": "Toot",
"310": "Car alarm",
"311": "Power windows, electric windows",
"312": "Skidding",
"313": "Tire squeal",
"314": "Car passing by",
"315": "Race car, auto racing",
"316": "Truck",
"317": "Air brake",
"318": "Air horn, truck horn",
"319": "Reversing beeps",
"320": "Ice cream truck, ice cream van",
"321": "Bus",
"322": "Emergency vehicle",
"323": "Police car (siren)",
"324": "Ambulance (siren)",
"325": "Fire engine, fire truck (siren)",
"326": "Motorcycle",
"327": "Traffic noise, roadway noise",
"328": "Rail transport",
"329": "Train",
"330": "Train whistle",
"331": "Train horn",
"332": "Railroad car, train wagon",
"333": "Train wheels squealing",
"334": "Subway, metro, underground",
"335": "Aircraft",
"336": "Aircraft engine",
"337": "Jet engine",
"338": "Propeller, airscrew",
"339": "Helicopter",
"340": "Fixed-wing aircraft, airplane",
"341": "Bicycle",
"342": "Skateboard",
"343": "Engine",
"344": "Light engine (high frequency)",
"345": "Dental drill, dentist's drill",
"346": "Lawn mower",
"347": "Chainsaw",
"348": "Medium engine (mid frequency)",
"349": "Heavy engine (low frequency)",
"350": "Engine knocking",
"351": "Engine starting",
"352": "Idling",
"353": "Accelerating, revving, vroom",
"354": "Door",
"355": "Doorbell",
"356": "Ding-dong",
"357": "Sliding door",
"358": "Slam",
"359": "Knock",
"360": "Tap",
"361": "Squeak",
"362": "Cupboard open or close",
"363": "Drawer open or close",
"364": "Dishes, pots, and pans",
"365": "Cutlery, silverware",
"366": "Chopping (food)",
"367": "Frying (food)",
"368": "Microwave oven",
"369": "Blender",
"370": "Water tap, faucet",
"371": "Sink (filling or washing)",
"372": "Bathtub (filling or washing)",
"373": "Hair dryer",
"374": "Toilet flush",
"375": "Toothbrush",
"376": "Electric toothbrush",
"377": "Vacuum cleaner",
"378": "Zipper (clothing)",
"379": "Keys jangling",
"380": "Coin (dropping)",
"381": "Scissors",
"382": "Electric shaver, electric razor",
"383": "Shuffling cards",
"384": "Typing",
"385": "Typewriter",
"386": "Computer keyboard",
"387": "Writing",
"388": "Alarm",
"389": "Telephone",
"390": "Telephone bell ringing",
"391": "Ringtone",
"392": "Telephone dialing, DTMF",
"393": "Dial tone",
"394": "Busy signal",
"395": "Alarm clock",
"396": "Siren",
"397": "Civil defense siren",
"398": "Buzzer",
"399": "Smoke detector, smoke alarm",
"400": "Fire alarm",
"401": "Foghorn",
"402": "Whistle",
"403": "Steam whistle",
"404": "Mechanisms",
"405": "Ratchet, pawl",
"406": "Clock",
"407": "Tick",
"408": "Tick-tock",
"409": "Gears",
"410": "Pulleys",
"411": "Sewing machine",
"412": "Mechanical fan",
"413": "Air conditioning",
"414": "Cash register",
"415": "Printer",
"416": "Camera",
"417": "Single-lens reflex camera",
"418": "Tools",
"419": "Hammer",
"420": "Jackhammer",
"421": "Sawing",
"422": "Filing (rasp)",
"423": "Sanding",
"424": "Power tool",
"425": "Drill",
"426": "Explosion",
"427": "Gunshot, gunfire",
"428": "Machine gun",
"429": "Fusillade",
"430": "Artillery fire",
"431": "Cap gun",
"432": "Fireworks",
"433": "Firecracker",
"434": "Burst, pop",
"435": "Eruption",
"436": "Boom",
"437": "Wood",
"438": "Chop",
"439": "Splinter",
"440": "Crack",
"441": "Glass",
"442": "Chink, clink",
"443": "Shatter",
"444": "Liquid",
"445": "Splash, splatter",
"446": "Slosh",
"447": "Squish",
"448": "Drip",
"449": "Pour",
"450": "Trickle, dribble",
"451": "Gush",
"452": "Fill (with liquid)",
"453": "Spray",
"454": "Pump (liquid)",
"455": "Stir",
"456": "Boiling",
"457": "Sonar",
"458": "Arrow",
"459": "Whoosh, swoosh, swish",
"460": "Thump, thud",
"461": "Thunk",
"462": "Electronic tuner",
"463": "Effects unit",
"464": "Chorus effect",
"465": "Basketball bounce",
"466": "Bang",
"467": "Slap, smack",
"468": "Whack, thwack",
"469": "Smash, crash",
"470": "Breaking",
"471": "Bouncing",
"472": "Whip",
"473": "Flap",
"474": "Scratch",
"475": "Scrape",
"476": "Rub",
"477": "Roll",
"478": "Crushing",
"479": "Crumpling, crinkling",
"480": "Tearing",
"481": "Beep, bleep",
"482": "Ping",
"483": "Ding",
"484": "Clang",
"485": "Squeal",
"486": "Creak",
"487": "Rustle",
"488": "Whir",
"489": "Clatter",
"490": "Sizzle",
"491": "Clicking",
"492": "Clickety-clack",
"493": "Rumble",
"494": "Plop",
"495": "Jingle, tinkle",
"496": "Hum",
"497": "Zing",
"498": "Boing",
"499": "Crunch",
"500": "Silence",
"501": "Sine wave",
"502": "Harmonic",
"503": "Chirp tone",
"504": "Sound effect",
"505": "Pulse",
"506": "Inside, small room",
"507": "Inside, large room or hall",
"508": "Inside, public space",
"509": "Outside, urban or manmade",
"510": "Outside, rural or natural",
"511": "Reverberation",
"512": "Echo",
"513": "Noise",
"514": "Environmental noise",
"515": "Static",
"516": "Mains hum",
"517": "Distortion",
"518": "Sidetone",
"519": "Cacophony",
"520": "White noise",
"521": "Pink noise",
"522": "Throbbing",
"523": "Vibration",
"524": "Television",
"525": "Radio",
"526": "Field recording"
},
"initializer_range": 0.02,
"intermediate_size": 3072,
"label2id": {
"A capella": 255,
"Accelerating, revving, vroom": 353,
"Accordion": 209,
"Acoustic guitar": 143,
"Afrobeat": 257,
"Air brake": 317,
"Air conditioning": 413,
"Air horn, truck horn": 318,
"Aircraft": 335,
"Aircraft engine": 336,
"Alarm": 388,
"Alarm clock": 395,
"Ambient music": 246,
"Ambulance (siren)": 324,
"Angry music": 281,
"Animal": 72,
"Applause": 67,
"Arrow": 458,
"Artillery fire": 430,
"Babbling": 6,
"Baby cry, infant cry": 23,
"Baby laughter": 17,
"Background music": 267,
"Bagpipes": 210,
"Bang": 466,
"Banjo": 147,
"Bark": 75,
"Basketball bounce": 465,
"Bass drum": 168,
"Bass guitar": 142,
"Bathtub (filling or washing)": 372,
"Battle cry": 12,
"Beatboxing": 218,
"Bee, wasp, etc.": 131,
"Beep, bleep": 481,
"Bell": 200,
"Bellow": 9,
"Belly laugh": 20,
"Bicycle": 341,
"Bicycle bell": 203,
"Bird": 111,
"Bird flight, flapping wings": 121,
"Bird vocalization, bird call, bird song": 112,
"Biting": 55,
"Bleat": 96,
"Blender": 369,
"Bluegrass": 231,
"Blues": 251,
"Boat, Water vehicle": 301,
"Boiling": 456,
"Boing": 498,
"Boom": 436,
"Bouncing": 471,
"Bow-wow": 78,
"Bowed string instrument": 189,
"Brass instrument": 185,
"Breaking": 470,
"Breathing": 41,
"Burping, eructation": 58,
"Burst, pop": 434,
"Bus": 321,
"Busy signal": 394,
"Buzz": 130,
"Buzzer": 398,
"Cacophony": 519,
"Camera": 416,
"Canidae, dogs, wolves": 122,
"Cap gun": 431,
"Car": 307,
"Car alarm": 310,
"Car passing by": 314,
"Carnatic music": 261,
"Cash register": 414,
"Cat": 81,
"Caterwaul": 85,
"Cattle, bovinae": 90,
"Caw": 118,
"Cello": 193,
"Chainsaw": 347,
"Change ringing (campanology)": 207,
"Chant": 30,
"Chatter": 68,
"Cheering": 66,
"Chewing, mastication": 54,
"Chicken, rooster": 99,
"Child singing": 34,
"Child speech, kid speaking": 3,
"Children playing": 71,
"Children shouting": 13,
"Chime": 205,
"Chink, clink": 442,
"Chirp tone": 503,
"Chirp, tweet": 113,
"Choir": 28,
"Chop": 438,
"Chopping (food)": 366,
"Chorus effect": 464,
"Christian music": 258,
"Christmas music": 273,
"Chuckle, chortle": 21,
"Church bell": 201,
"Civil defense siren": 397,
"Clang": 484,
"Clapping": 63,
"Clarinet": 198,
"Classical music": 237,
"Clatter": 489,
"Clickety-clack": 492,
"Clicking": 491,
"Clip-clop": 88,
"Clock": 406,
"Cluck": 100,
"Coin (dropping)": 380,
"Computer keyboard": 386,
"Conversation": 4,
"Coo": 116,
"Cough": 47,
"Country": 229,
"Cowbell": 92,
"Crack": 440,
"Crackle": 299,
"Creak": 486,
"Cricket": 127,
"Croak": 133,
"Crow": 117,
"Crowd": 69,
"Crowing, cock-a-doodle-doo": 101,
"Crumpling, crinkling": 479,
"Crunch": 499,
"Crushing": 478,
"Crying, sobbing": 22,
"Cupboard open or close": 362,
"Cutlery, silverware": 365,
"Cymbal": 171,
"Dance music": 274,
"Dental drill, dentist's drill": 345,
"Dial tone": 393,
"Didgeridoo": 211,
"Ding": 483,
"Ding-dong": 356,
"Disco": 236,
"Dishes, pots, and pans": 364,
"Distortion": 517,
"Dog": 74,
"Domestic animals, pets": 73,
"Door": 354,
"Doorbell": 355,
"Double bass": 194,
"Drawer open or close": 363,
"Drill": 425,
"Drip": 448,
"Drum": 164,
"Drum and bass": 243,
"Drum kit": 162,
"Drum machine": 163,
"Drum roll": 167,
"Dubstep": 242,
"Duck": 104,
"Echo": 512,
"Effects unit": 463,
"Electric guitar": 141,
"Electric piano": 154,
"Electric shaver, electric razor": 382,
"Electric toothbrush": 376,
"Electronic dance music": 245,
"Electronic music": 239,
"Electronic organ": 156,
"Electronic tuner": 462,
"Electronica": 244,
"Emergency vehicle": 322,
"Engine": 343,
"Engine knocking": 350,
"Engine starting": 351,
"Environmental noise": 514,
"Eruption": 435,
"Exciting music": 280,
"Explosion": 426,
"Fart": 60,
"Female singing": 33,
"Female speech, woman speaking": 2,
"Field recording": 526,
"Filing (rasp)": 422,
"Fill (with liquid)": 452,
"Finger snapping": 62,
"Fire": 298,
"Fire alarm": 400,
"Fire engine, fire truck (siren)": 325,
"Firecracker": 433,
"Fireworks": 432,
"Fixed-wing aircraft, airplane": 340,
"Flamenco": 250,
"Flap": 473,
"Flute": 196,
"Fly, housefly": 129,
"Foghorn": 401,
"Folk music": 233,
"Fowl": 98,
"French horn": 186,
"Frog": 132,
"Frying (food)": 367,
"Funk": 232,
"Funny music": 277,
"Fusillade": 429,
"Gargling": 56,
"Gasp": 44,
"Gears": 409,
"Giggle": 18,
"Glass": 441,
"Glockenspiel": 181,
"Goat": 95,
"Gobble": 103,
"Gong": 177,
"Goose": 106,
"Gospel music": 259,
"Groan": 38,
"Growling": 79,
"Grunge": 222,
"Grunt": 39,
"Guitar": 140,
"Gunshot, gunfire": 427,
"Gurgling": 297,
"Gush": 451,
"Hair dryer": 373,
"Hammer": 419,
"Hammond organ": 157,
"Hands": 61,
"Happy music": 276,
"Harmonic": 502,
"Harmonica": 208,
"Harp": 199,
"Harpsichord": 160,
"Heart murmur": 65,
"Heart sounds, heartbeat": 64,
"Heavy engine (low frequency)": 349,
"Heavy metal": 220,
"Helicopter": 339,
"Hi-hat": 172,
"Hiccup": 59,
"Hip hop music": 217,
"Hiss": 84,
"Honk": 107,
"Hoot": 120,
"Horse": 87,
"House music": 240,
"Howl": 77,
"Hubbub, speech noise, speech babble": 70,
"Hum": 496,
"Humming": 37,
"Ice cream truck, ice cream van": 320,
"Idling": 352,
"Independent music": 265,
"Insect": 126,
"Inside, large room or hall": 507,
"Inside, public space": 508,
"Inside, small room": 506,
"Jackhammer": 420,
"Jazz": 235,
"Jet engine": 337,
"Jingle (music)": 269,
"Jingle bell": 202,
"Jingle, tinkle": 495,
"Keyboard (musical)": 152,
"Keys jangling": 379,
"Knock": 359,
"Laughter": 16,
"Lawn mower": 346,
"Light engine (high frequency)": 344,
"Liquid": 444,
"Livestock, farm animals, working animals": 86,
"Lullaby": 271,
"Machine gun": 428,
"Mains hum": 516,
"Male singing": 32,
"Male speech, man speaking": 1,
"Mallet percussion": 179,
"Mandolin": 149,
"Mantra": 31,
"Maraca": 176,
"Marimba, xylophone": 180,
"Mechanical fan": 412,
"Mechanisms": 404,
"Medium engine (mid frequency)": 348,
"Meow": 83,
"Microwave oven": 368,
"Middle Eastern music": 234,
"Moo": 91,
"Mosquito": 128,
"Motor vehicle (road)": 306,
"Motorboat, speedboat": 304,
"Motorcycle": 326,
"Mouse": 124,
"Music": 137,
"Music for children": 252,
"Music of Africa": 256,
"Music of Asia": 260,
"Music of Bollywood": 262,
"Music of Latin America": 248,
"Musical instrument": 138,
"Narration, monologue": 5,
"Neigh, whinny": 89,
"New-age music": 253,
"Noise": 513,
"Ocean": 294,
"Oink": 94,
"Opera": 238,
"Orchestra": 184,
"Organ": 155,
"Outside, rural or natural": 510,
"Outside, urban or manmade": 509,
"Owl": 119,
"Pant": 45,
"Patter": 125,
"Percussion": 161,
"Piano": 153,
"Pig": 93,
"Pigeon, dove": 115,
"Ping": 482,
"Pink noise": 521,
"Pizzicato": 192,
"Plop": 494,
"Plucked string instrument": 139,
"Police car (siren)": 323,
"Pop music": 216,
"Pour": 449,
"Power tool": 424,
"Power windows, electric windows": 311,
"Printer": 415,
"Progressive rock": 223,
"Propeller, airscrew": 338,
"Psychedelic rock": 225,
"Pulleys": 410,
"Pulse": 505,
"Pump (liquid)": 454,
"Punk rock": 221,
"Purr": 82,
"Quack": 105,
"Race car, auto racing": 315,
"Radio": 525,
"Rail transport": 328,
"Railroad car, train wagon": 332,
"Rain": 289,
"Rain on surface": 291,
"Raindrop": 290,
"Rapping": 36,
"Ratchet, pawl": 405,
"Rattle": 135,
"Rattle (instrument)": 175,
"Reggae": 228,
"Reverberation": 511,
"Reversing beeps": 319,
"Rhythm and blues": 226,
"Rimshot": 166,
"Ringtone": 391,
"Roar": 110,
"Roaring cats (lions, tigers)": 109,
"Rock and roll": 224,
"Rock music": 219,
"Rodents, rats, mice": 123,
"Roll": 477,
"Rowboat, canoe, kayak": 303,
"Rub": 476,
"Rumble": 493,
"Run": 51,
"Rustle": 487,
"Rustling leaves": 284,
"Sad music": 278,
"Sailboat, sailing ship": 302,
"Salsa music": 249,
"Sampler": 159,
"Sanding": 423,
"Sawing": 421,
"Saxophone": 197,
"Scary music": 282,
"Scissors": 381,
"Scrape": 475,
"Scratch": 474,
"Scratching (performance technique)": 215,
"Screaming": 14,
"Sewing machine": 411,
"Shatter": 443,
"Sheep": 97,
"Ship": 305,
"Shofar": 212,
"Shout": 8,
"Shuffle": 52,
"Shuffling cards": 383,
"Sidetone": 518,
"Sigh": 26,
"Silence": 500,
"Sine wave": 501,
"Singing": 27,
"Singing bowl": 214,
"Single-lens reflex camera": 417,
"Sink (filling or washing)": 371,
"Siren": 396,
"Sitar": 148,
"Sizzle": 490,
"Ska": 263,
"Skateboard": 342,
"Skidding": 312,
"Slam": 358,
"Slap, smack": 467,
"Sliding door": 357,
"Slosh": 446,
"Smash, crash": 469,
"Smoke detector, smoke alarm": 399,
"Snake": 134,
"Snare drum": 165,
"Sneeze": 49,
"Snicker": 19,
"Sniff": 50,
"Snoring": 43,
"Snort": 46,
"Sonar": 457,
"Song": 266,
"Soul music": 227,
"Sound effect": 504,
"Soundtrack music": 270,
"Speech": 0,
"Speech synthesizer": 7,
"Splash, splatter": 445,
"Splinter": 439,
"Spray": 453,
"Squawk": 114,
"Squeak": 361,
"Squeal": 485,
"Squish": 447,
"Static": 515,
"Steam": 296,
"Steam whistle": 403,
"Steel guitar, slide guitar": 144,
"Steelpan": 183,
"Stir": 455,
"Stomach rumble": 57,
"Stream": 292,
"String section": 190,
"Strum": 146,
"Subway, metro, underground": 334,
"Swing music": 230,
"Synthesizer": 158,
"Synthetic singing": 35,
"Tabla": 170,
"Tambourine": 174,
"Tap": 360,
"Tapping (guitar technique)": 145,
"Tearing": 480,
"Techno": 241,
"Telephone": 389,
"Telephone bell ringing": 390,
"Telephone dialing, DTMF": 392,
"Television": 524,
"Tender music": 279,
"Theme music": 268,
"Theremin": 213,
"Throat clearing": 48,
"Throbbing": 522,
"Thump, thud": 460,
"Thunder": 287,
"Thunderstorm": 286,
"Thunk": 461,
"Tick": 407,
"Tick-tock": 408,
"Timpani": 169,
"Tire squeal": 313,
"Toilet flush": 374,
"Tools": 418,
"Toot": 309,
"Toothbrush": 375,
"Traditional music": 264,
"Traffic noise, roadway noise": 327,
"Train": 329,
"Train horn": 331,
"Train wheels squealing": 333,
"Train whistle": 330,
"Trance music": 247,
"Trickle, dribble": 450,
"Trombone": 188,
"Truck": 316,
"Trumpet": 187,
"Tubular bells": 178,
"Tuning fork": 204,
"Turkey": 102,
"Typewriter": 385,
"Typing": 384,
"Ukulele": 151,
"Vacuum cleaner": 377,
"Vehicle": 300,
"Vehicle horn, car horn, honking": 308,
"Vibraphone": 182,
"Vibration": 523,
"Video game music": 272,
"Violin, fiddle": 191,
"Vocal music": 254,
"Wail, moan": 25,
"Walk, footsteps": 53,
"Water": 288,
"Water tap, faucet": 370,
"Waterfall": 293,
"Waves, surf": 295,
"Wedding music": 275,
"Whack, thwack": 468,
"Whale vocalization": 136,
"Wheeze": 42,
"Whimper": 24,
"Whimper (dog)": 80,
"Whip": 472,
"Whir": 488,
"Whispering": 15,
"Whistle": 402,
"Whistling": 40,
"White noise": 520,
"Whoop": 10,
"Whoosh, swoosh, swish": 459,
"Wild animals": 108,
"Wind": 283,
"Wind chime": 206,
"Wind instrument, woodwind instrument": 195,
"Wind noise (microphone)": 285,
"Wood": 437,
"Wood block": 173,
"Writing": 387,
"Yell": 11,
"Yip": 76,
"Yodeling": 29,
"Zing": 497,
"Zipper (clothing)": 378,
"Zither": 150
},
"layer_norm_eps": 1e-12,
"model_type": "audio-spectrogram-transformer",
"num_attention_heads": 12,
"num_hidden_layers": 12,
"patch_size": 16,
"qkv_bias": true,
"time_dimension": 1024,
"time_stride": 14,
"torch_dtype": "float32",
"transformers_version": "4.25.0.dev0"
}