//
// Generated by LLVM NVPTX Back-End
//
// Layout note: in PTX a `//` comment runs to the end of the physical line.
// Keep one statement (or one group of sibling lanes) per line; re-flowing
// this file onto long lines turns the text after any `//` into a comment.
//

.version 8.2
.target sm_89
.address_size 64

	// .globl	triton__0d1d2d3d4d5de6de
.extern .func __assertfail
(
	.param .b64 __assertfail_param_0,
	.param .b64 __assertfail_param_1,
	.param .b32 __assertfail_param_2,
	.param .b64 __assertfail_param_3,
	.param .b64 __assertfail_param_4
)
;

// Byte strings handed to __assertfail (ASCII, not NUL-terminated here):
//   assertFunc_*    "_call_with_frames_removed"
//   assertFile_*    "<frozen importlib._bootstrap_external>"
//   assertMessage_1 "index out of bounds: 0 <= tmp13 < 50257"
//   assertMessage_0 "index out of bounds: 0 <= tmp3 < 50257"
.global .align 1 .b8 assertFunc_1[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
.global .align 1 .b8 assertFile_1[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
.global .align 1 .b8 assertMessage_1[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 51, 32, 60, 32, 53, 48, 50, 53, 55};
.global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
.global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
.global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55};
.extern .shared .align 1 .b8 global_smem[];
// "__CUDA_FTZ" followed by NUL.
.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};

// ---------------------------------------------------------------------------
// triton__0d1d2d3d4d5de6de
//
// Launch shape: .maxntid 256 (8 warps of 32 lanes).
//   row0 = ctaid.x*16 + warp, row1 = row0 | 8   (16 rows per CTA)
//   col  = (tid.x*8) & 248                      (8 consecutive columns per
//                                                lane, 256 columns per row)
//
// Parameters as they are used below:
//   param_0  base of 64-bit indices, one per row
//   param_1  base of 32-bit float rows (256 per row) selected by that index;
//            a negative index is wrapped by adding 50257
//   param_2  base of 32-bit float rows (256 per row) addressed by row % 512
//   param_3  base of 256 32-bit scale values, staged through shared memory
//   param_4  base of the bf16 output, 256 elements per row
//   param_5, param_6  declared but never read in this body
//
// Per row: x = param_2[...] + param_1[...]. A running mean and sum of squared
// deviations of x are built per lane, then merged across the warp with
// butterfly shuffles. The result is
//   out = bf16((x - mean) * rsqrt(m2/256 + 0f3727C5AC) * param_3[col])
// where 0f3727C5AC is approximately 1e-5.
// NOTE(review): this looks like embedding lookup + positional add + layer
// norm (vocabulary 50257, 512 positions) -- confirm against the Python
// source named in `.file 1`.
// ---------------------------------------------------------------------------
.visible .entry triton__0d1d2d3d4d5de6de(
	.param .u64 triton__0d1d2d3d4d5de6de_param_0,
	.param .u64 triton__0d1d2d3d4d5de6de_param_1,
	.param .u64 triton__0d1d2d3d4d5de6de_param_2,
	.param .u64 triton__0d1d2d3d4d5de6de_param_3,
	.param .u64 triton__0d1d2d3d4d5de6de_param_4,
	.param .u32 triton__0d1d2d3d4d5de6de_param_5,
	.param .u32 triton__0d1d2d3d4d5de6de_param_6
)
.maxntid 256, 1, 1
{
	.reg .pred %p<117>;
	.reg .b16 %rs<17>;
	.reg .b32 %r<375>;
	.reg .f32 %f<423>;
	.reg .b64 %rd<113>;
	.loc 1 18 0
$L__func_begin0:
	.loc 1 18 0
	ld.param.u64 %rd13, [triton__0d1d2d3d4d5de6de_param_3];
	ld.param.u64 %rd12, [triton__0d1d2d3d4d5de6de_param_1];
	ld.param.u64 %rd53, [triton__0d1d2d3d4d5de6de_param_0];
$L__tmp0:
	// ---- thread / row / column indices --------------------------------
	.loc 1 22 44
	mov.u32 %r59, %tid.x;
	ld.param.u64 %rd54, [triton__0d1d2d3d4d5de6de_param_2];
	bfe.u32 %r60, %r59, 5, 3;               // warp index 0..7
	and.b32 %r61, %r59, 15;
	.loc 1 24 33
	shl.b32 %r62, %r59, 3;
	and.b32 %r1, %r62, 248;                 // col = first of this lane's 8 columns
	and.b32 %r2, %r59, 255;
	.loc 1 21 28
	mov.u32 %r26, %ctaid.x;
	.loc 1 21 33
	shl.b32 %r63, %r26, 4;
	.loc 1 22 23
	or.b32 %r3, %r63, %r60;                 // row0
	or.b32 %r4, %r3, 8;                     // row1
	or.b32 %r64, %r63, %r61;                // row whose index this thread bounds-checks
	.loc 1 26 30
	mul.wide.s32 %rd55, %r3, 8;
	add.s64 %rd16, %rd53, %rd55;
	add.s64 %rd32, %rd16, 64;
	mul.wide.s32 %rd56, %r64, 8;
	add.s64 %rd48, %rd53, %rd56;
	mov.pred %p93, -1;                      // always-true mask used by every guarded access
	// ---- load the 64-bit row indices ----------------------------------
	// The same address is loaded into eight registers per row; only %rd15,
	// %rd31 and %rd47 are read later in this body.
	.loc 1 26 35
	mov.u64 %rd15, 0x0; @%p93 ld.global.L1::evict_last.b64 { %rd15 }, [ %rd16 + 0 ];
	mov.u64 %rd17, 0x0; @%p93 ld.global.L1::evict_last.b64 { %rd17 }, [ %rd16 + 0 ];
	mov.u64 %rd19, 0x0; @%p93 ld.global.L1::evict_last.b64 { %rd19 }, [ %rd16 + 0 ];
	mov.u64 %rd21, 0x0; @%p93 ld.global.L1::evict_last.b64 { %rd21 }, [ %rd16 + 0 ];
	mov.u64 %rd23, 0x0; @%p93 ld.global.L1::evict_last.b64 { %rd23 }, [ %rd16 + 0 ];
	mov.u64 %rd25, 0x0; @%p93 ld.global.L1::evict_last.b64 { %rd25 }, [ %rd16 + 0 ];
	mov.u64 %rd27, 0x0; @%p93 ld.global.L1::evict_last.b64 { %rd27 }, [ %rd16 + 0 ];
	mov.u64 %rd29, 0x0; @%p93 ld.global.L1::evict_last.b64 { %rd29 }, [ %rd16 + 0 ];
	mov.u64 %rd31, 0x0; @%p93 ld.global.L1::evict_last.b64 { %rd31 }, [ %rd32 + 0 ];
	mov.u64 %rd33, 0x0; @%p93 ld.global.L1::evict_last.b64 { %rd33 }, [ %rd32 + 0 ];
	mov.u64 %rd35, 0x0; @%p93 ld.global.L1::evict_last.b64 { %rd35 }, [ %rd32 + 0 ];
	mov.u64 %rd37, 0x0; @%p93 ld.global.L1::evict_last.b64 { %rd37 }, [ %rd32 + 0 ];
	mov.u64 %rd39, 0x0; @%p93 ld.global.L1::evict_last.b64 { %rd39 }, [ %rd32 + 0 ];
	mov.u64 %rd41, 0x0; @%p93 ld.global.L1::evict_last.b64 { %rd41 }, [ %rd32 + 0 ];
	mov.u64 %rd43, 0x0; @%p93 ld.global.L1::evict_last.b64 { %rd43 }, [ %rd32 + 0 ];
	mov.u64 %rd45, 0x0; @%p93 ld.global.L1::evict_last.b64 { %rd45 }, [ %rd32 + 0 ];
	mov.u64 %rd47, 0x0; @%p93 ld.global.L1::evict_last.b64 { %rd47 }, [ %rd48 + 0 ];
	// ---- row % 512 (signed remainder) for the param_2 rows -------------
	.loc 1 27 18
	bfe.s32 %r65, %r26, 27, 1;
	shr.u32 %r66, %r65, 23;
	add.s32 %r67, %r3, %r66;
	and.b32 %r68, %r67, 16776704;
	sub.s32 %r69, %r3, %r68;
	add.s32 %r70, %r4, %r66;
	and.b32 %r71, %r70, 16776704;
	sub.s32 %r72, %r4, %r71;
	.loc 1 35 44
	shl.b32 %r73, %r69, 8;
	shl.b32 %r74, %r72, 8;
	.loc 1 35 40
	or.b32 %r75, %r73, %r1;
	or.b32 %r76, %r74, %r1;
	.loc 1 35 34
	mul.wide.s32 %rd57, %r75, 4;
	add.s64 %rd80, %rd54, %rd57;            // row0, columns col..col+3
	cvt.s64.s32 %rd58, %r73;
	cvt.u64.u32 %rd59, %r1;
	or.b64 %rd60, %rd58, %rd59;
	shl.b64 %rd61, %rd60, 2;
	add.s64 %rd62, %rd54, %rd61;
	add.s64 %rd81, %rd62, 16;               // row0, columns col+4..col+7
	mul.wide.s32 %rd63, %r76, 4;
	add.s64 %rd82, %rd54, %rd63;            // row1, columns col..col+3
	cvt.s64.s32 %rd64, %r74;
	or.b64 %rd65, %rd64, %rd59;
	shl.b64 %rd66, %rd65, 2;
	add.s64 %rd67, %rd54, %rd66;
	add.s64 %rd83, %rd67, 16;               // row1, columns col+4..col+7
	mov.b32 %r257, 0;
	// ---- first pass: load param_2 values --------------------------------
	.loc 1 35 50
	mov.u32 %r27, 0x0; mov.u32 %r28, 0x0; mov.u32 %r29, 0x0; mov.u32 %r30, 0x0;
	@%p93 ld.global.L1::evict_last.v4.b32 { %r27, %r28, %r29, %r30 }, [ %rd80 + 0 ];
	@!%p93 mov.u32 %r27, %r257; @!%p93 mov.u32 %r28, %r257; @!%p93 mov.u32 %r29, %r257; @!%p93 mov.u32 %r30, %r257;
	mov.b32 %f1, %r27; mov.b32 %f2, %r28; mov.b32 %f3, %r29; mov.b32 %f4, %r30;
	mov.u32 %r35, 0x0; mov.u32 %r36, 0x0; mov.u32 %r37, 0x0; mov.u32 %r38, 0x0;
	@%p93 ld.global.L1::evict_last.v4.b32 { %r35, %r36, %r37, %r38 }, [ %rd81 + 0 ];
	@!%p93 mov.u32 %r35, %r257; @!%p93 mov.u32 %r36, %r257; @!%p93 mov.u32 %r37, %r257; @!%p93 mov.u32 %r38, %r257;
	mov.b32 %f5, %r35; mov.b32 %f6, %r36; mov.b32 %f7, %r37; mov.b32 %f8, %r38;
	mov.u32 %r43, 0x0; mov.u32 %r44, 0x0; mov.u32 %r45, 0x0; mov.u32 %r46, 0x0;
	@%p93 ld.global.L1::evict_last.v4.b32 { %r43, %r44, %r45, %r46 }, [ %rd82 + 0 ];
	@!%p93 mov.u32 %r43, %r257; @!%p93 mov.u32 %r44, %r257; @!%p93 mov.u32 %r45, %r257; @!%p93 mov.u32 %r46, %r257;
	mov.b32 %f9, %r43; mov.b32 %f10, %r44; mov.b32 %f11, %r45; mov.b32 %f12, %r46;
	mov.u32 %r51, 0x0; mov.u32 %r52, 0x0; mov.u32 %r53, 0x0; mov.u32 %r54, 0x0;
	@%p93 ld.global.L1::evict_last.v4.b32 { %r51, %r52, %r53, %r54 }, [ %rd83 + 0 ];
	@!%p93 mov.u32 %r51, %r257; @!%p93 mov.u32 %r52, %r257; @!%p93 mov.u32 %r53, %r257; @!%p93 mov.u32 %r54, %r257;
	mov.b32 %f13, %r51; mov.b32 %f14, %r52; mov.b32 %f15, %r53; mov.b32 %f16, %r54;
	// ---- bounds check: wrap a negative index, then require idx < 50257 --
	.loc 1 36 22
	add.s64 %rd68, %rd47, 50257;
	.loc 1 37 22
	setp.lt.s64 %p38, %rd47, 0;
	.loc 1 38 36
	selp.b64 %rd7, %rd68, %rd47, %p38;
	.loc 1 39 40
	setp.lt.u64 %p39, %rd7, 50257;
	mov.b32 %r374, 883;                     // third __assertfail argument
	mov.u64 %rd112, 1;                      // fifth __assertfail argument
	.loc 1 39 55
	@%p39 bra $L__BB0_2;
	mov.u64 %rd69, assertMessage_0;
	cvta.global.u64 %rd70, %rd69;
	mov.u64 %rd71, assertFile_0;
	cvta.global.u64 %rd72, %rd71;
	mov.u64 %rd73, assertFunc_0;
	cvta.global.u64 %rd74, %rd73;
	{ // callseq 8, 0
	.reg .b32 temp_param_reg;
	.param .b64 param0;
	st.param.b64 [param0+0], %rd70;
	.param .b64 param1;
	st.param.b64 [param1+0], %rd72;
	.param .b32 param2;
	st.param.b32 [param2+0], %r374;
	.param .b64 param3;
	st.param.b64 [param3+0], %rd74;
	.param .b64 param4;
	st.param.b64 [param4+0], %rd112;
	call.uni
	__assertfail,
	(
	param0,
	param1,
	param2,
	param3,
	param4
	);
	} // callseq 8
$L__BB0_2:
	.loc 1 0 55
	ld.param.u64 %rd14, [triton__0d1d2d3d4d5de6de_param_4];
	// ---- gather addresses into param_1 (12865792 = 50257 * 256) ---------
	.loc 1 37 22
	setp.lt.s64 %p83, %rd31, 0;
	setp.lt.s64 %p84, %rd15, 0;
	.loc 1 40 44
	shl.b64 %rd85, %rd15, 8;
	add.s64 %rd86, %rd85, 12865792;
	selp.b64 %rd87, %rd86, %rd85, %p84;
	shl.b64 %rd88, %rd31, 8;
	add.s64 %rd89, %rd88, 12865792;
	selp.b64 %rd90, %rd89, %rd88, %p83;
	.loc 1 40 40
	or.b64 %rd92, %rd87, %rd59;
	or.b64 %rd93, %rd90, %rd59;
	.loc 1 40 34
	shl.b64 %rd94, %rd92, 2;
	add.s64 %rd104, %rd12, %rd94;
	add.s64 %rd105, %rd104, 16;
	shl.b64 %rd95, %rd93, 2;
	add.s64 %rd106, %rd12, %rd95;
	add.s64 %rd107, %rd106, 16;
	// ---- first pass: load gathered param_1 values -----------------------
	.loc 1 40 52
	mov.u32 %r78, 0x0; mov.u32 %r79, 0x0; mov.u32 %r80, 0x0; mov.u32 %r81, 0x0;
	@%p93 ld.global.L1::evict_last.v4.b32 { %r78, %r79, %r80, %r81 }, [ %rd104 + 0 ];
	@!%p93 mov.u32 %r78, %r257; @!%p93 mov.u32 %r79, %r257; @!%p93 mov.u32 %r80, %r257; @!%p93 mov.u32 %r81, %r257;
	mov.b32 %f27, %r78; mov.b32 %f28, %r79; mov.b32 %f29, %r80; mov.b32 %f30, %r81;
	mov.u32 %r86, 0x0; mov.u32 %r87, 0x0; mov.u32 %r88, 0x0; mov.u32 %r89, 0x0;
	@%p93 ld.global.L1::evict_last.v4.b32 { %r86, %r87, %r88, %r89 }, [ %rd105 + 0 ];
	@!%p93 mov.u32 %r86, %r257; @!%p93 mov.u32 %r87, %r257; @!%p93 mov.u32 %r88, %r257; @!%p93 mov.u32 %r89, %r257;
	mov.b32 %f31, %r86; mov.b32 %f32, %r87; mov.b32 %f33, %r88; mov.b32 %f34, %r89;
	mov.u32 %r94, 0x0; mov.u32 %r95, 0x0; mov.u32 %r96, 0x0; mov.u32 %r97, 0x0;
	@%p93 ld.global.L1::evict_last.v4.b32 { %r94, %r95, %r96, %r97 }, [ %rd106 + 0 ];
	@!%p93 mov.u32 %r94, %r257; @!%p93 mov.u32 %r95, %r257; @!%p93 mov.u32 %r96, %r257; @!%p93 mov.u32 %r97, %r257;
	mov.b32 %f35, %r94; mov.b32 %f36, %r95; mov.b32 %f37, %r96; mov.b32 %f38, %r97;
	mov.u32 %r102, 0x0; mov.u32 %r103, 0x0; mov.u32 %r104, 0x0; mov.u32 %r105, 0x0;
	@%p93 ld.global.L1::evict_last.v4.b32 { %r102, %r103, %r104, %r105 }, [ %rd107 + 0 ];
	@!%p93 mov.u32 %r102, %r257; @!%p93 mov.u32 %r103, %r257; @!%p93 mov.u32 %r104, %r257; @!%p93 mov.u32 %r105, %r257;
	mov.b32 %f39, %r102; mov.b32 %f40, %r103; mov.b32 %f41, %r104; mov.b32 %f42, %r105;
	// ---- x = param_2 value + gathered param_1 value ---------------------
	.loc 1 41 22
	add.f32 %f43, %f1, %f27; add.f32 %f44, %f2, %f28; add.f32 %f45, %f3, %f29; add.f32 %f46, %f4, %f30;
	add.f32 %f47, %f5, %f31; add.f32 %f48, %f6, %f32; add.f32 %f49, %f7, %f33; add.f32 %f50, %f8, %f34;
	add.f32 %f51, %f9, %f35; add.f32 %f52, %f10, %f36; add.f32 %f53, %f11, %f37; add.f32 %f54, %f12, %f38;
	add.f32 %f55, %f13, %f39; add.f32 %f56, %f14, %f40; add.f32 %f57, %f15, %f41; add.f32 %f58, %f16, %f42;
$L__tmp1:
	// ---- per-element seed of the running statistics ---------------------
	.loc 2 98 22
	add.f32 %f59, %f43, 0f00000000; add.f32 %f60, %f44, 0f00000000;
	add.f32 %f61, %f45, 0f00000000; add.f32 %f62, %f46, 0f00000000;
	add.f32 %f63, %f47, 0f00000000; add.f32 %f64, %f48, 0f00000000;
	add.f32 %f65, %f49, 0f00000000; add.f32 %f66, %f50, 0f00000000;
	add.f32 %f67, %f51, 0f00000000; add.f32 %f68, %f52, 0f00000000;
	add.f32 %f69, %f53, 0f00000000; add.f32 %f70, %f54, 0f00000000;
	add.f32 %f71, %f55, 0f00000000; add.f32 %f72, %f56, 0f00000000;
	add.f32 %f73, %f57, 0f00000000; add.f32 %f74, %f58, 0f00000000;
	.loc 2 101 30
	sub.f32 %f75, %f43, %f59; sub.f32 %f76, %f44, %f60; sub.f32 %f77, %f45, %f61; sub.f32 %f78, %f46, %f62;
	sub.f32 %f79, %f47, %f63; sub.f32 %f80, %f48, %f64; sub.f32 %f81, %f49, %f65; sub.f32 %f82, %f50, %f66;
	sub.f32 %f83, %f51, %f67; sub.f32 %f84, %f52, %f68; sub.f32 %f85, %f53, %f69; sub.f32 %f86, %f54, %f70;
	sub.f32 %f87, %f55, %f71; sub.f32 %f88, %f56, %f72; sub.f32 %f89, %f57, %f73; sub.f32 %f90, %f58, %f74;
	.loc 2 101 13
	fma.rn.f32 %f91, %f43, %f75, 0f00000000; fma.rn.f32 %f92, %f44, %f76, 0f00000000;
	fma.rn.f32 %f93, %f45, %f77, 0f00000000; fma.rn.f32 %f94, %f46, %f78, 0f00000000;
	fma.rn.f32 %f95, %f47, %f79, 0f00000000; fma.rn.f32 %f96, %f48, %f80, 0f00000000;
	fma.rn.f32 %f97, %f49, %f81, 0f00000000; fma.rn.f32 %f98, %f50, %f82, 0f00000000;
	fma.rn.f32 %f99, %f51, %f83, 0f00000000; fma.rn.f32 %f100, %f52, %f84, 0f00000000;
	fma.rn.f32 %f101, %f53, %f85, 0f00000000; fma.rn.f32 %f102, %f54, %f86, 0f00000000;
	fma.rn.f32 %f103, %f55, %f87, 0f00000000; fma.rn.f32 %f104, %f56, %f88, 0f00000000;
	fma.rn.f32 %f105, %f57, %f89, 0f00000000; fma.rn.f32 %f106, %f58, %f90, 0f00000000;
$L__tmp2:
	// ---- row0: fold this lane's 8 elements (mean in %f150, m2 in %f154) --
	// %r111 = 1.0f; %r112..%r130 hold 2.0f..8.0f as raw bits.
	.loc 2 108 21
	sub.f32 %f107, %f60, %f59;
	mov.b32 %r111, 1065353216;
	mov.b32 %r112, 1073741824;
	.loc 2 110 60
	div.full.f32 %r110, %r111, %r112;
	mov.b32 %f108, %r110;
	.loc 2 112 17
	fma.rn.f32 %f109, %f108, %f107, %f59;
	.loc 2 113 15
	add.f32 %f110, %f91, %f92;
	.loc 2 113 30
	mul.f32 %f111, %f107, %f107;
	.loc 2 113 22
	fma.rn.f32 %f112, %f108, %f111, %f110;
	.loc 2 108 21
	sub.f32 %f113, %f61, %f109;
	mov.b32 %r115, 1077936128;
	.loc 2 110 60
	div.full.f32 %r113, %r111, %r115;
	mov.b32 %f114, %r113;
	.loc 2 112 17
	fma.rn.f32 %f115, %f114, %f113, %f109;
	.loc 2 113 15
	add.f32 %f116, %f93, %f112;
	.loc 2 113 30
	mul.f32 %f117, %f113, %f113;
	.loc 2 113 38
	fma.rn.f32 %f118, %f113, %f113, %f117;
	.loc 2 113 22
	fma.rn.f32 %f119, %f114, %f118, %f116;
	.loc 2 108 21
	sub.f32 %f120, %f62, %f115;
	mov.b32 %r118, 1082130432;
	.loc 2 110 60
	div.full.f32 %r116, %r111, %r118;
	mov.b32 %f121, %r116;
	.loc 2 112 17
	fma.rn.f32 %f122, %f121, %f120, %f115;
	.loc 2 113 15
	add.f32 %f123, %f94, %f119;
	.loc 2 113 30
	mul.f32 %f124, %f120, %f120;
	.loc 2 113 38
	mul.f32 %f125, %f124, 0f40400000;
	.loc 2 113 22
	fma.rn.f32 %f126, %f121, %f125, %f123;
	.loc 2 108 21
	sub.f32 %f127, %f63, %f122;
	mov.b32 %r121, 1084227584;
	.loc 2 110 60
	div.full.f32 %r119, %r111, %r121;
	mov.b32 %f128, %r119;
	.loc 2 112 17
	fma.rn.f32 %f129, %f128, %f127, %f122;
	.loc 2 113 15
	add.f32 %f130, %f95, %f126;
	.loc 2 113 30
	mul.f32 %f131, %f127, %f127;
	.loc 2 113 38
	mul.f32 %f132, %f131, 0f40800000;
	.loc 2 113 22
	fma.rn.f32 %f133, %f128, %f132, %f130;
	.loc 2 108 21
	sub.f32 %f134, %f64, %f129;
	mov.b32 %r124, 1086324736;
	.loc 2 110 60
	div.full.f32 %r122, %r111, %r124;
	mov.b32 %f135, %r122;
	.loc 2 112 17
	fma.rn.f32 %f136, %f135, %f134, %f129;
	.loc 2 113 15
	add.f32 %f137, %f96, %f133;
	.loc 2 113 30
	mul.f32 %f138, %f134, %f134;
	.loc 2 113 38
	mul.f32 %f139, %f138, 0f40A00000;
	.loc 2 113 22
	fma.rn.f32 %f140, %f135, %f139, %f137;
	.loc 2 108 21
	sub.f32 %f141, %f65, %f136;
	mov.b32 %r127, 1088421888;
	.loc 2 110 60
	div.full.f32 %r125, %r111, %r127;
	mov.b32 %f142, %r125;
	.loc 2 112 17
	fma.rn.f32 %f143, %f142, %f141, %f136;
	.loc 2 113 15
	add.f32 %f144, %f97, %f140;
	.loc 2 113 30
	mul.f32 %f145, %f141, %f141;
	.loc 2 113 38
	mul.f32 %f146, %f145, 0f40C00000;
	.loc 2 113 22
	fma.rn.f32 %f147, %f142, %f146, %f144;
	.loc 2 108 21
	sub.f32 %f148, %f66, %f143;
	mov.b32 %r130, 1090519040;
	.loc 2 110 60
	div.full.f32 %r128, %r111, %r130;
	mov.b32 %f149, %r128;
	.loc 2 112 17
	fma.rn.f32 %f150, %f149, %f148, %f143;
	.loc 2 113 15
	add.f32 %f151, %f98, %f147;
	.loc 2 113 30
	mul.f32 %f152, %f148, %f148;
	.loc 2 113 38
	mul.f32 %f153, %f152, 0f40E00000;
	.loc 2 113 22
	fma.rn.f32 %f154, %f149, %f153, %f151;
	// ---- row1: fold this lane's 8 elements (mean in %f198, m2 in %f202) --
	.loc 2 108 21
	sub.f32 %f155, %f68, %f67;
	.loc 2 110 60
	div.full.f32 %r131, %r111, %r112;
	mov.b32 %f156, %r131;
	.loc 2 112 17
	fma.rn.f32 %f157, %f155, %f156, %f67;
	.loc 2 113 15
	add.f32 %f158, %f99, %f100;
	.loc 2 113 30
	mul.f32 %f159, %f155, %f155;
	.loc 2 113 22
	fma.rn.f32 %f160, %f159, %f156, %f158;
	.loc 2 108 21
	sub.f32 %f161, %f69, %f157;
	.loc 2 110 60
	div.full.f32 %r134, %r111, %r115;
	mov.b32 %f162, %r134;
	.loc 2 112 17
	fma.rn.f32 %f163, %f162, %f161, %f157;
	.loc 2 113 15
	add.f32 %f164, %f101, %f160;
	.loc 2 113 30
	mul.f32 %f165, %f161, %f161;
	.loc 2 113 38
	fma.rn.f32 %f166, %f161, %f161, %f165;
	.loc 2 113 22
	fma.rn.f32 %f167, %f162, %f166, %f164;
	.loc 2 108 21
	sub.f32 %f168, %f70, %f163;
	.loc 2 110 60
	div.full.f32 %r137, %r111, %r118;
	mov.b32 %f169, %r137;
	.loc 2 112 17
	fma.rn.f32 %f170, %f169, %f168, %f163;
	.loc 2 113 15
	add.f32 %f171, %f102, %f167;
	.loc 2 113 30
	mul.f32 %f172, %f168, %f168;
	.loc 2 113 38
	mul.f32 %f173, %f172, 0f40400000;
	.loc 2 113 22
	fma.rn.f32 %f174, %f169, %f173, %f171;
	.loc 2 108 21
	sub.f32 %f175, %f71, %f170;
	.loc 2 110 60
	div.full.f32 %r140, %r111, %r121;
	mov.b32 %f176, %r140;
	.loc 2 112 17
	fma.rn.f32 %f177, %f176, %f175, %f170;
	.loc 2 113 15
	add.f32 %f178, %f103, %f174;
	.loc 2 113 30
	mul.f32 %f179, %f175, %f175;
	.loc 2 113 38
	mul.f32 %f180, %f179, 0f40800000;
	.loc 2 113 22
	fma.rn.f32 %f181, %f176, %f180, %f178;
	.loc 2 108 21
	sub.f32 %f182, %f72, %f177;
	.loc 2 110 60
	div.full.f32 %r143, %r111, %r124;
	mov.b32 %f183, %r143;
	.loc 2 112 17
	fma.rn.f32 %f184, %f183, %f182, %f177;
	.loc 2 113 15
	add.f32 %f185, %f104, %f181;
	.loc 2 113 30
	mul.f32 %f186, %f182, %f182;
	.loc 2 113 38
	mul.f32 %f187, %f186, 0f40A00000;
	.loc 2 113 22
	fma.rn.f32 %f188, %f183, %f187, %f185;
	.loc 2 108 21
	sub.f32 %f189, %f73, %f184;
	.loc 2 110 60
	div.full.f32 %r146, %r111, %r127;
	mov.b32 %f190, %r146;
	.loc 2 112 17
	fma.rn.f32 %f191, %f190, %f189, %f184;
	.loc 2 113 15
	add.f32 %f192, %f105, %f188;
	.loc 2 113 30
	mul.f32 %f193, %f189, %f189;
	.loc 2 113 38
	mul.f32 %f194, %f193, 0f40C00000;
	.loc 2 113 22
	fma.rn.f32 %f195, %f190, %f194, %f192;
	.loc 2 108 21
	sub.f32 %f196, %f74, %f191;
	.loc 2 110 60
	div.full.f32 %r149, %r111, %r130;
	mov.b32 %f197, %r149;
	.loc 2 112 17
	fma.rn.f32 %f198, %f197, %f196, %f191;
	.loc 2 113 15
	add.f32 %f199, %f106, %f195;
	.loc 2 113 30
	mul.f32 %f200, %f196, %f196;
	.loc 2 113 38
	mul.f32 %f201, %f200, 0f40E00000;
	.loc 2 113 22
	fma.rn.f32 %f202, %f197, %f201, %f199;
$L__tmp3:
	// ---- row0: warp merge, butterfly distance 16 ------------------------
	// Each step exchanges (mean, m2, weight) with the partner lane.
	.loc 2 120 46
	mov.b32 %r216, %f150;
	shfl.sync.bfly.b32 %r217, %r216, 16, 31, -1;
	mov.b32 %f203, %r217;
	mov.b32 %r218, %f154;
	shfl.sync.bfly.b32 %r219, %r218, 16, 31, -1;
	mov.b32 %f204, %r219;
	shfl.sync.bfly.b32 %r153, %r130, 16, 31, -1;
	mov.b32 %f205, %r153;
$L__tmp4:
	.loc 2 108 21
	sub.f32 %f206, %f203, %f150;
	.loc 2 109 28
	add.f32 %f207, %f205, 0f41000000;
	.loc 2 110 39
	setp.eq.f32 %p85, %f207, 0f00000000;
	.loc 2 110 60
	mov.b32 %r154, %f207;
	div.full.f32 %r152, %r153, %r154;
	mov.b32 %f208, %r152;
	.loc 2 110 49
	selp.f32 %f209, 0f00000000, %f208, %p85;
	.loc 2 112 17
	fma.rn.f32 %f210, %f209, %f206, %f150;
	.loc 2 113 15
	add.f32 %f211, %f154, %f204;
	.loc 2 113 30
	mul.f32 %f212, %f206, %f206;
	.loc 2 113 38
	mul.f32 %f213, %f212, 0f41000000;
	.loc 2 113 22
	fma.rn.f32 %f214, %f209, %f213, %f211;
$L__tmp5:
	// ---- row0: butterfly distance 8 -------------------------------------
	.loc 2 120 46
	mov.b32 %r220, %f210;
	shfl.sync.bfly.b32 %r221, %r220, 8, 31, -1;
	mov.b32 %f215, %r221;
	mov.b32 %r222, %f214;
	shfl.sync.bfly.b32 %r223, %r222, 8, 31, -1;
	mov.b32 %f216, %r223;
	shfl.sync.bfly.b32 %r156, %r154, 8, 31, -1;
	mov.b32 %f217, %r156;
$L__tmp6:
	.loc 2 108 21
	sub.f32 %f218, %f215, %f210;
	.loc 2 109 28
	add.f32 %f219, %f207, %f217;
	.loc 2 110 39
	setp.eq.f32 %p86, %f219, 0f00000000;
	.loc 2 110 60
	mov.b32 %r157, %f219;
	div.full.f32 %r155, %r156, %r157;
	mov.b32 %f220, %r155;
	.loc 2 110 49
	selp.f32 %f221, 0f00000000, %f220, %p86;
	.loc 2 112 17
	fma.rn.f32 %f222, %f221, %f218, %f210;
	.loc 2 113 15
	add.f32 %f223, %f214, %f216;
	.loc 2 113 30
	mul.f32 %f224, %f218, %f218;
	.loc 2 113 38
	mul.f32 %f225, %f207, %f224;
	.loc 2 113 22
	fma.rn.f32 %f226, %f221, %f225, %f223;
$L__tmp7:
	// ---- row0: butterfly distance 4 -------------------------------------
	.loc 2 120 46
	mov.b32 %r224, %f222;
	shfl.sync.bfly.b32 %r225, %r224, 4, 31, -1;
	mov.b32 %f227, %r225;
	mov.b32 %r226, %f226;
	shfl.sync.bfly.b32 %r227, %r226, 4, 31, -1;
	mov.b32 %f228, %r227;
	shfl.sync.bfly.b32 %r159, %r157, 4, 31, -1;
	mov.b32 %f229, %r159;
$L__tmp8:
	.loc 2 108 21
	sub.f32 %f230, %f227, %f222;
	.loc 2 109 28
	add.f32 %f231, %f219, %f229;
	.loc 2 110 39
	setp.eq.f32 %p87, %f231, 0f00000000;
	.loc 2 110 60
	mov.b32 %r160, %f231;
	div.full.f32 %r158, %r159, %r160;
	mov.b32 %f232, %r158;
	.loc 2 110 49
	selp.f32 %f233, 0f00000000, %f232, %p87;
	.loc 2 112 17
	fma.rn.f32 %f234, %f230, %f233, %f222;
	.loc 2 113 15
	add.f32 %f235, %f226, %f228;
	.loc 2 113 30
	mul.f32 %f236, %f230, %f230;
	.loc 2 113 38
	mul.f32 %f237, %f219, %f236;
	.loc 2 113 22
	fma.rn.f32 %f238, %f233, %f237, %f235;
$L__tmp9:
	// ---- row0: butterfly distance 2 -------------------------------------
	.loc 2 120 46
	mov.b32 %r228, %f234;
	shfl.sync.bfly.b32 %r229, %r228, 2, 31, -1;
	mov.b32 %f239, %r229;
	mov.b32 %r230, %f238;
	shfl.sync.bfly.b32 %r231, %r230, 2, 31, -1;
	mov.b32 %f240, %r231;
	shfl.sync.bfly.b32 %r162, %r160, 2, 31, -1;
	mov.b32 %f241, %r162;
$L__tmp10:
	.loc 2 108 21
	sub.f32 %f242, %f239, %f234;
	.loc 2 109 28
	add.f32 %f17, %f231, %f241;
	.loc 2 110 39
	setp.eq.f32 %p88, %f17, 0f00000000;
	.loc 2 110 60
	mov.b32 %r163, %f17;
	div.full.f32 %r161, %r162, %r163;
	mov.b32 %f243, %r161;
	.loc 2 110 49
	selp.f32 %f244, 0f00000000, %f243, %p88;
	.loc 2 112 17
	fma.rn.f32 %f18, %f242, %f244, %f234;
	.loc 2 113 15
	add.f32 %f245, %f238, %f240;
	.loc 2 113 30
	mul.f32 %f246, %f242, %f242;
	.loc 2 113 38
	mul.f32 %f247, %f231, %f246;
	.loc 2 113 22
	fma.rn.f32 %f19, %f244, %f247, %f245;
$L__tmp11:
	// ---- row0: butterfly distance 1 (merge finished after $L__BB0_4) ----
	.loc 2 120 46
	mov.b32 %r232, %f18;
	shfl.sync.bfly.b32 %r5, %r232, 1, 31, -1;
	mov.b32 %r233, %f19;
	shfl.sync.bfly.b32 %r6, %r233, 1, 31, -1;
	shfl.sync.bfly.b32 %r165, %r163, 1, 31, -1;
	mov.b32 %f248, %r165;
$L__tmp12:
	.loc 2 109 28
	add.f32 %f20, %f17, %f248;
	.loc 2 110 60
	mov.b32 %r166, %f20;
	div.full.f32 %r164, %r165, %r166;
	mov.b32 %f21, %r164;
$L__tmp13:
	// ---- row1: warp merge, butterfly distance 16 ------------------------
	.loc 2 120 46
	mov.b32 %r234, %f198;
	shfl.sync.bfly.b32 %r235, %r234, 16, 31, -1;
	mov.b32 %f249, %r235;
	mov.b32 %r236, %f202;
	shfl.sync.bfly.b32 %r237, %r236, 16, 31, -1;
	mov.b32 %f250, %r237;
	shfl.sync.bfly.b32 %r168, %r130, 16, 31, -1;
	mov.b32 %f251, %r168;
$L__tmp14:
	.loc 2 108 21
	sub.f32 %f252, %f249, %f198;
	.loc 2 109 28
	add.f32 %f253, %f251, 0f41000000;
	.loc 2 110 39
	setp.eq.f32 %p89, %f253, 0f00000000;
	.loc 2 110 60
	mov.b32 %r169, %f253;
	div.full.f32 %r167, %r168, %r169;
	mov.b32 %f254, %r167;
	.loc 2 110 49
	selp.f32 %f255, 0f00000000, %f254, %p89;
	.loc 2 112 17
	fma.rn.f32 %f256, %f252, %f255, %f198;
	.loc 2 113 15
	add.f32 %f257, %f202, %f250;
	.loc 2 113 30
	mul.f32 %f258, %f252, %f252;
	.loc 2 113 38
	mul.f32 %f259, %f258, 0f41000000;
	.loc 2 113 22
	fma.rn.f32 %f260, %f259, %f255, %f257;
$L__tmp15:
	// ---- row1: butterfly distance 8 -------------------------------------
	.loc 2 120 46
	mov.b32 %r238, %f256;
	shfl.sync.bfly.b32 %r239, %r238, 8, 31, -1;
	mov.b32 %f261, %r239;
	mov.b32 %r240, %f260;
	shfl.sync.bfly.b32 %r241, %r240, 8, 31, -1;
	mov.b32 %f262, %r241;
	shfl.sync.bfly.b32 %r171, %r169, 8, 31, -1;
	mov.b32 %f263, %r171;
$L__tmp16:
	.loc 2 108 21
	sub.f32 %f264, %f261, %f256;
	.loc 2 109 28
	add.f32 %f265, %f253, %f263;
	.loc 2 110 39
	setp.eq.f32 %p90, %f265, 0f00000000;
	.loc 2 110 60
	mov.b32 %r172, %f265;
	div.full.f32 %r170, %r171, %r172;
	mov.b32 %f266, %r170;
	.loc 2 110 49
	selp.f32 %f267, 0f00000000, %f266, %p90;
	.loc 2 112 17
	fma.rn.f32 %f268, %f264, %f267, %f256;
	.loc 2 113 15
	add.f32 %f269, %f260, %f262;
	.loc 2 113 30
	mul.f32 %f270, %f264, %f264;
	.loc 2 113 38
	mul.f32 %f271, %f253, %f270;
	.loc 2 113 22
	fma.rn.f32 %f272, %f267, %f271, %f269;
$L__tmp17:
	// ---- row1: butterfly distance 4 -------------------------------------
	.loc 2 120 46
	mov.b32 %r242, %f268;
	shfl.sync.bfly.b32 %r243, %r242, 4, 31, -1;
	mov.b32 %f273, %r243;
	mov.b32 %r244, %f272;
	shfl.sync.bfly.b32 %r245, %r244, 4, 31, -1;
	mov.b32 %f274, %r245;
	shfl.sync.bfly.b32 %r174, %r172, 4, 31, -1;
	mov.b32 %f275, %r174;
$L__tmp18:
	.loc 2 108 21
	sub.f32 %f276, %f273, %f268;
	.loc 2 109 28
	add.f32 %f277, %f265, %f275;
	.loc 2 110 39
	setp.eq.f32 %p91, %f277, 0f00000000;
	.loc 2 110 60
	mov.b32 %r175, %f277;
	div.full.f32 %r173, %r174, %r175;
	mov.b32 %f278, %r173;
	.loc 2 110 49
	selp.f32 %f279, 0f00000000, %f278, %p91;
	.loc 2 112 17
	fma.rn.f32 %f280, %f276, %f279, %f268;
	.loc 2 113 15
	add.f32 %f281, %f272, %f274;
	.loc 2 113 30
	mul.f32 %f282, %f276, %f276;
	.loc 2 113 38
	mul.f32 %f283, %f265, %f282;
	.loc 2 113 22
	fma.rn.f32 %f284, %f279, %f283, %f281;
$L__tmp19:
	// ---- row1: butterfly distance 2 -------------------------------------
	.loc 2 120 46
	mov.b32 %r246, %f280;
	shfl.sync.bfly.b32 %r247, %r246, 2, 31, -1;
	mov.b32 %f285, %r247;
	mov.b32 %r248, %f284;
	shfl.sync.bfly.b32 %r249, %r248, 2, 31, -1;
	mov.b32 %f286, %r249;
	shfl.sync.bfly.b32 %r177, %r175, 2, 31, -1;
	mov.b32 %f287, %r177;
$L__tmp20:
	.loc 2 108 21
	sub.f32 %f288, %f285, %f280;
	.loc 2 109 28
	add.f32 %f22, %f277, %f287;
	.loc 2 110 39
	setp.eq.f32 %p92, %f22, 0f00000000;
	.loc 2 110 60
	mov.b32 %r178, %f22;
	div.full.f32 %r176, %r177, %r178;
	mov.b32 %f289, %r176;
	.loc 2 110 49
	selp.f32 %f290, 0f00000000, %f289, %p92;
	.loc 2 112 17
	fma.rn.f32 %f23, %f288, %f290, %f280;
	.loc 2 113 15
	add.f32 %f291, %f284, %f286;
	.loc 2 113 30
	mul.f32 %f292, %f288, %f288;
	.loc 2 113 38
	mul.f32 %f293, %f277, %f292;
	.loc 2 113 22
	fma.rn.f32 %f24, %f290, %f293, %f291;
$L__tmp21:
	// ---- row1: butterfly distance 1 (merge finished after $L__BB0_4) ----
	.loc 2 120 46
	mov.b32 %r250, %f23;
	shfl.sync.bfly.b32 %r7, %r250, 1, 31, -1;
	mov.b32 %r251, %f24;
	shfl.sync.bfly.b32 %r8, %r251, 1, 31, -1;
	shfl.sync.bfly.b32 %r180, %r178, 1, 31, -1;
	mov.b32 %f294, %r180;
$L__tmp22:
	.loc 2 109 28
	add.f32 %f25, %f22, %f294;
	.loc 2 110 60
	mov.b32 %r181, %f25;
	div.full.f32 %r179, %r180, %r181;
	mov.b32 %f26, %r179;
$L__tmp23:
	// ---- second pass: reload the param_2 values -------------------------
	.loc 1 59 51
	mov.u32 %r182, 0x0; mov.u32 %r183, 0x0; mov.u32 %r184, 0x0; mov.u32 %r185, 0x0;
	@%p93 ld.global.L1::evict_last.v4.b32 { %r182, %r183, %r184, %r185 }, [ %rd80 + 0 ];
	@!%p93 mov.u32 %r182, %r257; @!%p93 mov.u32 %r183, %r257; @!%p93 mov.u32 %r184, %r257; @!%p93 mov.u32 %r185, %r257;
	mov.u32 %r190, 0x0; mov.u32 %r191, 0x0; mov.u32 %r192, 0x0; mov.u32 %r193, 0x0;
	@%p93 ld.global.L1::evict_last.v4.b32 { %r190, %r191, %r192, %r193 }, [ %rd81 + 0 ];
	@!%p93 mov.u32 %r190, %r257; @!%p93 mov.u32 %r191, %r257; @!%p93 mov.u32 %r192, %r257; @!%p93 mov.u32 %r193, %r257;
	mov.u32 %r198, 0x0; mov.u32 %r199, 0x0; mov.u32 %r200, 0x0; mov.u32 %r201, 0x0;
	@%p93 ld.global.L1::evict_last.v4.b32 { %r198, %r199, %r200, %r201 }, [ %rd82 + 0 ];
	@!%p93 mov.u32 %r198, %r257; @!%p93 mov.u32 %r199, %r257; @!%p93 mov.u32 %r200, %r257; @!%p93 mov.u32 %r201, %r257;
	mov.u32 %r206, 0x0; mov.u32 %r207, 0x0; mov.u32 %r208, 0x0; mov.u32 %r209, 0x0;
	@%p93 ld.global.L1::evict_last.v4.b32 { %r206, %r207, %r208, %r209 }, [ %rd83 + 0 ];
	@!%p93 mov.u32 %r206, %r257; @!%p93 mov.u32 %r207, %r257; @!%p93 mov.u32 %r208, %r257; @!%p93 mov.u32 %r209, %r257;
	// ---- each thread fetches one param_3 element (index tid.x & 255) ----
	.loc 1 60 35
	mul.wide.u32 %rd96, %r2, 4;
	add.s64 %rd84, %rd13, %rd96;
	.loc 1 60 40
	mov.u32 %r214, 0x0;
	@%p93 ld.global.L1::evict_last.b32 { %r214 }, [ %rd84 + 0 ];
	@!%p93 mov.u32 %r214, %r257;
	// ---- second bounds assertion; reuses the predicate computed above ---
	.loc 1 64 57
	@%p39 bra $L__BB0_4;
	mov.u64 %rd97, assertMessage_1;
	cvta.global.u64 %rd98, %rd97;
	mov.u64 %rd99, assertFile_1;
	cvta.global.u64 %rd100, %rd99;
	mov.u64 %rd101, assertFunc_1;
	cvta.global.u64 %rd102, %rd101;
	{ // callseq 9, 0
	.reg .b32 temp_param_reg;
	.param .b64 param0;
	st.param.b64 [param0+0], %rd98;
	.param .b64 param1;
	st.param.b64 [param1+0], %rd100;
	.param .b32 param2;
	st.param.b32 [param2+0], %r374;
	.param .b64 param3;
	st.param.b64 [param3+0], %rd102;
	.param .b64 param4;
	st.param.b64 [param4+0], %rd112;
	call.uni
	__assertfail,
	(
	param0,
	param1,
	param2,
	param3,
	param4
	);
	} // callseq 9
$L__BB0_4:
$L__tmp24:
	// ---- finish the distance-1 merge: row1 m2 -> %f302 -------------------
	.loc 2 120 46
	mov.b32 %f295, %r8;
$L__tmp25:
	.loc 2 113 15
	add.f32 %f296, %f24, %f295;
$L__tmp26:
	.loc 2 120 46
	mov.b32 %f297, %r7;
$L__tmp27:
	.loc 2 108 21
	sub.f32 %f298, %f297, %f23;
	.loc 2 113 30
	mul.f32 %f299, %f298, %f298;
	.loc 2 113 38
	mul.f32 %f300, %f22, %f299;
	.loc 2 110 39
	setp.eq.f32 %p115, %f25, 0f00000000;
	.loc 2 110 49
	selp.f32 %f301, 0f00000000, %f26, %p115;
	.loc 2 113 22
	fma.rn.f32 %f302, %f301, %f300, %f296;
$L__tmp28:
	// ---- finish the distance-1 merge: row0 m2 -> %f310 -------------------
	.loc 2 120 46
	mov.b32 %f303, %r6;
$L__tmp29:
	.loc 2 113 15
	add.f32 %f304, %f19, %f303;
$L__tmp30:
	.loc 2 120 46
	mov.b32 %f305, %r5;
$L__tmp31:
	.loc 2 108 21
	sub.f32 %f306, %f305, %f18;
	.loc 2 113 30
	mul.f32 %f307, %f306, %f306;
	.loc 2 113 38
	mul.f32 %f308, %f17, %f307;
	.loc 2 110 39
	setp.eq.f32 %p116, %f20, 0f00000000;
	.loc 2 110 49
	selp.f32 %f309, 0f00000000, %f21, %p116;
	.loc 2 113 22
	fma.rn.f32 %f310, %f309, %f308, %f304;
$L__tmp32:
	// ---- second pass: reload gathered param_1 values (evict_first) ------
	.loc 1 65 54
	mov.u32 %r253, 0x0; mov.u32 %r254, 0x0; mov.u32 %r255, 0x0; mov.u32 %r256, 0x0;
	@%p93 ld.global.L1::evict_first.v4.b32 { %r253, %r254, %r255, %r256 }, [ %rd104 + 0 ];
	@!%p93 mov.u32 %r253, %r257; @!%p93 mov.u32 %r254, %r257; @!%p93 mov.u32 %r255, %r257; @!%p93 mov.u32 %r256, %r257;
	mov.u32 %r261, 0x0; mov.u32 %r262, 0x0; mov.u32 %r263, 0x0; mov.u32 %r264, 0x0;
	@%p93 ld.global.L1::evict_first.v4.b32 { %r261, %r262, %r263, %r264 }, [ %rd105 + 0 ];
	@!%p93 mov.u32 %r261, %r257; @!%p93 mov.u32 %r262, %r257; @!%p93 mov.u32 %r263, %r257; @!%p93 mov.u32 %r264, %r257;
	mov.u32 %r269, 0x0; mov.u32 %r270, 0x0; mov.u32 %r271, 0x0; mov.u32 %r272, 0x0;
	@%p93 ld.global.L1::evict_first.v4.b32 { %r269, %r270, %r271, %r272 }, [ %rd106 + 0 ];
	@!%p93 mov.u32 %r269, %r257; @!%p93 mov.u32 %r270, %r257; @!%p93 mov.u32 %r271, %r257; @!%p93 mov.u32 %r272, %r257;
	mov.u32 %r277, 0x0; mov.u32 %r278, 0x0; mov.u32 %r279, 0x0; mov.u32 %r280, 0x0;
	@%p93 ld.global.L1::evict_first.v4.b32 { %r277, %r278, %r279, %r280 }, [ %rd107 + 0 ];
	@!%p93 mov.u32 %r277, %r257; @!%p93 mov.u32 %r278, %r257; @!%p93 mov.u32 %r279, %r257; @!%p93 mov.u32 %r280, %r257;
	// ---- m2 / 256.0, add 0f3727C5AC, reciprocal square root -------------
	.loc 1 69 23
	mov.b32 %r286, %f310;
	mov.b32 %r287, 1132462080;              // 256.0f as raw bits
	div.full.f32 %r285, %r286, %r287;
	mov.b32 %f311, %r285;
	mov.b32 %r310, %f302;
	div.full.f32 %r309, %r310, %r287;
	mov.b32 %f312, %r309;
	.loc 1 71 24
	add.f32 %f313, %f311, 0f3727C5AC;
	add.f32 %f314, %f312, 0f3727C5AC;
	.loc 1 72 30
	rsqrt.approx.ftz.f32 %f315, %f313;      // row0 scale
	rsqrt.approx.ftz.f32 %f316, %f314;      // row1 scale
	// ---- row1: x - mean (mean = %f320) ----------------------------------
	.loc 1 65 54
	mov.b32 %f317, %r280;
	.loc 1 59 51
	mov.b32 %f318, %r209;
	.loc 1 66 24
	add.f32 %f319, %f318, %f317;
$L__tmp33:
	.loc 2 112 17
	fma.rn.f32 %f320, %f298, %f301, %f23;
$L__tmp34:
	.loc 1 67 24
	sub.f32 %f321, %f319, %f320;
	.loc 1 65 54
	mov.b32 %f322, %r279;
	.loc 1 59 51
	mov.b32 %f323, %r208;
	.loc 1 66 24
	add.f32 %f324, %f323, %f322;
	.loc 1 67 24
	sub.f32 %f325, %f324, %f320;
	.loc 1 65 54
	mov.b32 %f326, %r278;
	.loc 1 59 51
	mov.b32 %f327, %r207;
	.loc 1 66 24
	add.f32 %f328, %f327, %f326;
	.loc 1 67 24
	sub.f32 %f329, %f328, %f320;
	.loc 1 65 54
	mov.b32 %f330, %r277;
	.loc 1 59 51
	mov.b32 %f331, %r206;
	.loc 1 66 24
	add.f32 %f332, %f331, %f330;
	.loc 1 67 24
	sub.f32 %f333, %f332, %f320;
	.loc 1 65 54
	mov.b32 %f334, %r272;
	.loc 1 59 51
	mov.b32 %f335, %r201;
	.loc 1 66 24
	add.f32 %f336, %f335, %f334;
	.loc 1 67 24
	sub.f32 %f337, %f336, %f320;
	.loc 1 65 54
	mov.b32 %f338, %r271;
	.loc 1 59 51
	mov.b32 %f339, %r200;
	.loc 1 66 24
	add.f32 %f340, %f339, %f338;
	.loc 1 67 24
	sub.f32 %f341, %f340, %f320;
	.loc 1 65 54
	mov.b32 %f342, %r270;
	.loc 1 59 51
	mov.b32 %f343, %r199;
	.loc 1 66 24
	add.f32 %f344, %f343, %f342;
	.loc 1 67 24
	sub.f32 %f345, %f344, %f320;
	.loc 1 65 54
	mov.b32 %f346, %r269;
	.loc 1 59 51
	mov.b32 %f347, %r198;
	.loc 1 66 24
	add.f32 %f348, %f347, %f346;
	.loc 1 67 24
	sub.f32 %f349, %f348, %f320;
	// ---- row0: x - mean (mean = %f353) ----------------------------------
	.loc 1 65 54
	mov.b32 %f350, %r264;
	.loc 1 59 51
	mov.b32 %f351, %r193;
	.loc 1 66 24
	add.f32 %f352, %f351, %f350;
$L__tmp35:
	.loc 2 112 17
	fma.rn.f32 %f353, %f306, %f309, %f18;
$L__tmp36:
	.loc 1 67 24
	sub.f32 %f354, %f352, %f353;
	.loc 1 65 54
	mov.b32 %f355, %r263;
	.loc 1 59 51
	mov.b32 %f356, %r192;
	.loc 1 66 24
	add.f32 %f357, %f356, %f355;
	.loc 1 67 24
	sub.f32 %f358, %f357, %f353;
	.loc 1 65 54
	mov.b32 %f359, %r262;
	.loc 1 59 51
	mov.b32 %f360, %r191;
	.loc 1 66 24
	add.f32 %f361, %f360, %f359;
	.loc 1 67 24
	sub.f32 %f362, %f361, %f353;
	.loc 1 65 54
	mov.b32 %f363, %r261;
	.loc 1 59 51
	mov.b32 %f364, %r190;
	.loc 1 66 24
	add.f32 %f365, %f364, %f363;
	.loc 1 67 24
	sub.f32 %f366, %f365, %f353;
	.loc 1 65 54
	mov.b32 %f367, %r256;
	.loc 1 59 51
	mov.b32 %f368, %r185;
	.loc 1 66 24
	add.f32 %f369, %f368, %f367;
	.loc 1 67 24
	sub.f32 %f370, %f369, %f353;
	.loc 1 65 54
	mov.b32 %f371, %r255;
	.loc 1 59 51
	mov.b32 %f372, %r184;
	.loc 1 66 24
	add.f32 %f373, %f372, %f371;
	.loc 1 67 24
	sub.f32 %f374, %f373, %f353;
	.loc 1 65 54
	mov.b32 %f375, %r254;
	.loc 1 59 51
	mov.b32 %f376, %r183;
	.loc 1 66 24
	add.f32 %f377, %f376, %f375;
	.loc 1 67 24
	sub.f32 %f378, %f377, %f353;
	.loc 1 65 54
	mov.b32 %f379, %r253;
	.loc 1 59 51
	mov.b32 %f380, %r182;
	.loc 1 66 24
	add.f32 %f381, %f380, %f379;
	.loc 1 67 24
	sub.f32 %f382, %f381, %f353;
	// ---- (x - mean) * rsqrt(...) ----------------------------------------
	.loc 1 73 24
	mul.f32 %f383, %f382, %f315; mul.f32 %f384, %f378, %f315; mul.f32 %f385, %f374, %f315; mul.f32 %f386, %f370, %f315;
	mul.f32 %f387, %f366, %f315; mul.f32 %f388, %f362, %f315; mul.f32 %f389, %f358, %f315; mul.f32 %f390, %f354, %f315;
	mul.f32 %f391, %f349, %f316; mul.f32 %f392, %f345, %f316; mul.f32 %f393, %f341, %f316; mul.f32 %f394, %f337, %f316;
	mul.f32 %f395, %f333, %f316; mul.f32 %f396, %f329, %f316; mul.f32 %f397, %f325, %f316; mul.f32 %f398, %f321, %f316;
	// ---- stage param_3 through shared memory, then scale ----------------
	// Every thread publishes its one element; after the barrier each lane
	// reads back the 8 elements matching its columns.
	.loc 1 74 24
	shl.b32 %r357, %r2, 2;
	mov.u32 %r358, global_smem;
	add.s32 %r359, %r358, %r357;
	st.shared.u32 [%r359], %r214;
	bar.sync 0;
	shl.b32 %r360, %r1, 2;
	add.s32 %r361, %r358, %r360;
	ld.shared.v4.f32 {%f399, %f400, %f401, %f402}, [%r361];
	ld.shared.v4.f32 {%f403, %f404, %f405, %f406}, [%r361+16];
	mul.f32 %f407, %f383, %f399; mul.f32 %f408, %f384, %f400; mul.f32 %f409, %f385, %f401; mul.f32 %f410, %f386, %f402;
	mul.f32 %f411, %f387, %f403; mul.f32 %f412, %f388, %f404; mul.f32 %f413, %f389, %f405; mul.f32 %f414, %f390, %f406;
	mul.f32 %f415, %f391, %f399; mul.f32 %f416, %f392, %f400; mul.f32 %f417, %f393, %f401; mul.f32 %f418, %f394, %f402;
	mul.f32 %f419, %f395, %f403; mul.f32 %f420, %f396, %f404; mul.f32 %f421, %f397, %f405; mul.f32 %f422, %f398, %f406;
	// ---- output addresses: (row*256 + col) * 2 bytes --------------------
	.loc 1 76 39
	shl.b32 %r362, %r3, 8;
	shl.b32 %r363, %r4, 8;
	.loc 1 76 35
	or.b32 %r364, %r362, %r1;
	or.b32 %r365, %r363, %r1;
	.loc 1 76 29
	mul.wide.s32 %rd110, %r364, 2;
	add.s64 %rd108, %rd14, %rd110;
	mul.wide.s32 %rd111, %r365, 2;
	add.s64 %rd109, %rd14, %rd111;
	// ---- convert to bf16, pack pairs, store 8 elements per row ----------
	.loc 1 76 52
	mov.b32 %r333, %f407; cvt.rn.bf16.f32 %rs1, %r333;
	mov.b32 %r334, %f408; cvt.rn.bf16.f32 %rs2, %r334;
	mov.b32 %r335, %f409; cvt.rn.bf16.f32 %rs3, %r335;
	mov.b32 %r336, %f410; cvt.rn.bf16.f32 %rs4, %r336;
	mov.b32 %r337, %f411; cvt.rn.bf16.f32 %rs5, %r337;
	mov.b32 %r338, %f412; cvt.rn.bf16.f32 %rs6, %r338;
	mov.b32 %r339, %f413; cvt.rn.bf16.f32 %rs7, %r339;
	mov.b32 %r340, %f414; cvt.rn.bf16.f32 %rs8, %r340;
	mov.b32 %r341, %f415; cvt.rn.bf16.f32 %rs9, %r341;
	mov.b32 %r342, %f416; cvt.rn.bf16.f32 %rs10, %r342;
	mov.b32 %r343, %f417; cvt.rn.bf16.f32 %rs11, %r343;
	mov.b32 %r344, %f418; cvt.rn.bf16.f32 %rs12, %r344;
	mov.b32 %r345, %f419; cvt.rn.bf16.f32 %rs13, %r345;
	mov.b32 %r346, %f420; cvt.rn.bf16.f32 %rs14, %r346;
	mov.b32 %r347, %f421; cvt.rn.bf16.f32 %rs15, %r347;
	mov.b32 %r348, %f422; cvt.rn.bf16.f32 %rs16, %r348;
	mov.b32 %r366, {%rs1, %rs2};
	mov.b32 %r367, {%rs3, %rs4};
	mov.b32 %r368, {%rs5, %rs6};
	mov.b32 %r369, {%rs7, %rs8};
	@%p93 st.global.v4.b32 [ %rd108 + 0 ], { %r366, %r367, %r368, %r369 };
	mov.b32 %r370, {%rs9, %rs10};
	mov.b32 %r371, {%rs11, %rs12};
	mov.b32 %r372, {%rs13, %rs14};
	mov.b32 %r373, {%rs15, %rs16};
	@%p93 st.global.v4.b32 [ %rd109 + 0 ], { %r370, %r371, %r372, %r373 };
	.loc 1 55 4
	ret;
$L__tmp37:
$L__func_end0:

}

// ---------------------------------------------------------------------------
// __nv_rsqrtf: returns rsqrt.approx.ftz.f32 of its single f32 argument.
// The kernel above issues the instruction directly and does not call this.
// ---------------------------------------------------------------------------
	// .globl	__nv_rsqrtf
.visible .func (.param .b32 func_retval0) __nv_rsqrtf(
	.param .b32 __nv_rsqrtf_param_0
)
{
	.reg .f32 %f<3>;
$L__func_begin1:
	ld.param.f32 %f1, [__nv_rsqrtf_param_0];
	rsqrt.approx.ftz.f32 %f2, %f1;
	st.param.f32 [func_retval0+0], %f2;
	ret;
$L__func_end1:

}
	.file 1 "/tmp/torchinductor_root/gx/cgx5lxpuexpindj4dsmjz5x42uhyy7iskevq7ovzpwagb3t5powj.py"
	.file 2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"

// Debug sections below are raw bytes emitted by the back-end; the values and
// their order are unchanged, only the line breaks are restored.
	.section .debug_abbrev
	{
.b8 1 .b8 17 .b8 1
.b8 37 .b8 8
.b8 19 .b8 5
.b8 3 .b8 8
.b8 16 .b8 6
.b8 27 .b8 8
.b8 180 .b8 66 .b8 12
.b8 17 .b8 1
.b8 18 .b8 1
.b8 0 .b8 0
.b8 2 .b8 46 .b8 0
.b8 135 .b8 64 .b8 8
.b8 3 .b8 8
.b8 58 .b8 11
.b8 59 .b8 11
.b8 63 .b8 12
.b8 32 .b8 11
.b8 0 .b8 0
.b8 3 .b8 46 .b8 1
.b8 17 .b8 1
.b8 18 .b8 1
.b8 64 .b8 10
.b8 49 .b8 19
.b8 0 .b8 0
.b8 4 .b8 29 .b8 0
.b8 49 .b8 19
.b8 17 .b8 1
.b8 18 .b8 1
.b8 88 .b8 11
.b8 89 .b8 11
.b8 87 .b8 11
.b8 0 .b8 0
.b8 5 .b8 29 .b8 1
.b8 49 .b8 19
.b8 17 .b8 1
.b8 18 .b8 1
.b8 88 .b8 11
.b8 89 .b8 11
.b8 87 .b8 11
.b8 0 .b8 0
.b8 0
	}
	.section .debug_info
	{
.b32 298
.b8 2 .b8 0
.b32 .debug_abbrev
.b8 8
.b8 1
.b8 116 .b8 114 .b8 105 .b8 116 .b8 111 .b8 110 .b8 0
.b8 2 .b8 0
.b8 99 .b8 103 .b8 120 .b8 53 .b8 108 .b8 120 .b8 112 .b8 117 .b8 101 .b8 120
.b8 112 .b8 105 .b8 110 .b8 100 .b8 106 .b8 52 .b8 100 .b8 115 .b8 109 .b8 106
.b8 122 .b8 53 .b8 120 .b8 52 .b8 50 .b8 117 .b8 104 .b8 121 .b8 121 .b8 55
.b8 105 .b8 115 .b8 107 .b8 101 .b8 118 .b8 113 .b8 55 .b8 111 .b8 118 .b8 122
.b8 112 .b8 119 .b8 97 .b8 103 .b8 98 .b8 51 .b8 116 .b8 53 .b8 112 .b8 111
.b8 119 .b8 106 .b8 46 .b8 112 .b8 121 .b8 0
.b32 .debug_line
.b8 47 .b8 116 .b8 109 .b8 112 .b8 47 .b8 116 .b8 111 .b8 114 .b8 99 .b8 104
.b8 105 .b8 110 .b8 100 .b8 117 .b8 99 .b8 116 .b8 111 .b8 114 .b8 95 .b8 114
.b8 111 .b8 111 .b8 116 .b8 47 .b8 103 .b8 120 .b8 0
.b8 1
.b64 $L__func_begin0
.b64 $L__func_end0
.b8 2
.b8 116 .b8 114 .b8 105 .b8 116 .b8 111 .b8 110 .b8 95 .b8 95 .b8 48 .b8 100
.b8 49 .b8 100 .b8 50 .b8 100 .b8 51 .b8 100 .b8 52 .b8 100 .b8 53 .b8 100
.b8 101 .b8 54 .b8 100 .b8 101 .b8 0
.b8 116 .b8 114 .b8 105 .b8 116 .b8 111 .b8 110 .b8 95 .b8 95 .b8 48 .b8 100
.b8 49 .b8 100 .b8 50 .b8 100 .b8 51 .b8 100 .b8 52 .b8 100 .b8 53 .b8 100
.b8 101 .b8 54 .b8 100 .b8 101 .b8 0
.b8 1 .b8 18 .b8 1 .b8 1
.b8 3
.b64 $L__func_begin0
.b64 $L__func_end0
.b8 1 .b8 156
.b32 125
.b8 4
.b32 125
.b64 $L__tmp1
.b64 $L__tmp2
.b8 2 .b8 44 .b8 38
.b8 5
.b32 125
.b64 $L__tmp2
.b64 $L__tmp36
.b8 2 .b8 50 .b8 41
.b8 4
.b32 125
.b64 $L__tmp2
.b64 $L__tmp36
.b8 2 .b8 120 .b8 46
.b8 0
.b8 4
.b32 125
.b64 $L__tmp3
.b64 $L__tmp31
.b8 2 .b8 50 .b8 41
.b8 0
.b8 0
	}
	.section .debug_pubnames
	{
.b32 $L__pubNames_end0-$L__pubNames_start0
$L__pubNames_start0:
.b8 2 .b8 0
.b32 .debug_info
.b32 302
.b32 125
.b8 116 .b8 114 .b8 105 .b8 116 .b8 111 .b8 110 .b8 95 .b8 95 .b8 48 .b8 100
.b8 49 .b8 100 .b8 50 .b8 100 .b8 51 .b8 100 .b8 52 .b8 100 .b8 53 .b8 100
.b8 101 .b8 54 .b8 100 .b8 101 .b8 0
.b32 0
$L__pubNames_end0:
	}
	.section .debug_pubtypes
	{
.b32 $L__pubTypes_end0-$L__pubTypes_start0
$L__pubTypes_start0:
.b8 2 .b8 0
.b32 .debug_info
.b32 302
.b32 0
$L__pubTypes_end0:
	}
	.section .debug_loc { }