0-hero
/

gpt2-pos-encoding-experiment-100B

Model card Files Files and versions Community

0-hero committed on Sep 27, 2024

Commit

0def249

verified ·

1 Parent(s): d572127

Add files using upload-large-folder tool

Browse files

Files changed (18) hide show

.triton/dump/1e922bbbab749da355e4bad9c6b245e6/triton_.ptx +446 -0
.triton/dump/1e922bbbab749da355e4bad9c6b245e6/triton_.ttgir +26 -0
.triton/dump/1ed98b0d136db679153ca6a42fff755c/triton_.llir +503 -0
.triton/dump/1ed98b0d136db679153ca6a42fff755c/triton_.ptx +988 -0
.triton/dump/1ed98b0d136db679153ca6a42fff755c/triton_.ttir +104 -0
.triton/dump/305a9479aab997a3a16bfe46bb303a50/triton_.llir +524 -0
.triton/dump/305a9479aab997a3a16bfe46bb303a50/triton_.ttgir +110 -0
.triton/dump/305a9479aab997a3a16bfe46bb303a50/triton_.ttir +101 -0
.triton/dump/473cf6e25c3e63117cd59fc0ed04b89f/triton_.llir +550 -0
.triton/dump/473cf6e25c3e63117cd59fc0ed04b89f/triton_.ttgir +134 -0
.triton/dump/473cf6e25c3e63117cd59fc0ed04b89f/triton_.ttir +113 -0
.triton/dump/89f8cc1079aa03024e56dc2aee42813a/triton_.cubin +0 -0
.triton/dump/99f0a4c15ca0aab38ccdae6c765f7333/triton_.ptx +758 -0
.triton/dump/d7a12c0ba96f8920b8147157303ee99f/triton_.cubin +0 -0
.triton/dump/d7a12c0ba96f8920b8147157303ee99f/triton_.llir +310 -0
.triton/dump/ea4af10b775ebbfe0bdcca18fd6288c8/triton_.ttir +153 -0
.triton/dump/fac03406d1136fc802dac111a1efea36/triton_.ptx +971 -0
.triton/dump/fac03406d1136fc802dac111a1efea36/triton_.ttir +79 -0

.triton/dump/1e922bbbab749da355e4bad9c6b245e6/triton_.ptx ADDED Viewed

	@@ -0,0 +1,446 @@

+//
+// Generated by LLVM NVPTX Back-End
+//
+.version 8.2
+.target sm_89
+.address_size 64
+	// .globl	triton__0d1de
+.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
+.visible .entry triton__0d1de(	// kernel: per element, out = 0.5*x*(1 + erf(x*0.70710677)), computed in f32 on bf16 data and written back in place
+	.param .u64 triton__0d1de_param_0,	// base address of the bf16 buffer (loaded from and stored to)
+	.param .u32 triton__0d1de_param_1	// never read by any instruction in this body
+)
+.maxntid 256, 1, 1
+{
+	.reg .pred 	%p<9>;
+	.reg .b16 	%rs<5>;
+	.reg .b32 	%r<22>;
+	.reg .f32 	%f<113>;
+	.reg .b64 	%rd<6>;
+	.loc	1 18 0
+$L__func_begin0:
+	.loc	1 18 0
+	ld.param.u64 	%rd3, [triton__0d1de_param_0];	// %rd3 = buffer base pointer
+$L__tmp0:
+	.loc	1 21 36
+	mov.u32 	%r5, %tid.x;
+	shl.b32 	%r6, %r5, 1;	// two consecutive elements per thread
+	and.b32  	%r7, %r6, 510;	// even element offset inside this CTA's 512-element tile
+	.loc	1 20 28
+	mov.u32 %r1, %ctaid.x;
+	.loc	1 20 33
+	shl.b32 	%r8, %r1, 9;	// CTA index * 512 = first element of the tile
+	.loc	1 21 23
+	or.b32  	%r9, %r8, %r7;	// element index; the two operands occupy disjoint bits, so OR acts as an add
+	.loc	1 24 34
+	mul.wide.s32 	%rd4, %r9, 2;	// byte offset: 2 bytes per element
+	add.s64 	%rd5, %rd3, %rd4;	// %rd5 = address of this thread's element pair
+	mov.pred 	%p1, -1;	// always-true predicate: the load and store below are effectively unmasked
+	.loc	1 24 39
+	mov.u32 %r2, 0x0;
+	@%p1 ld.global.b32 { %r2 }, [ %rd5 + 0 ];	// a single 32-bit load fetches both 16-bit elements
+	cvt.u16.u32 	%rs1, %r2;	// %rs1 = low half (first element)
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; }	// %rs2 = high half (second element)
+	.loc	1 24 48
+	cvt.f32.bf16 %r3, %rs1;	// widen first element bf16 -> f32
+	mov.b32 	%f1, %r3;
+	cvt.f32.bf16 %r4, %rs2;	// widen second element bf16 -> f32
+	mov.b32 	%f2, %r4;
+	.loc	1 29 18
+	mul.f32 	%f3, %f1, 0f3F3504F3;	// %f3 = x0 * 0.70710677, the erf argument for element 0
+	.loc	1 30 23
+	abs.ftz.f32 	%f5, %f3;	// the inline sequence from here to $L__BB0_4 mirrors the body of __nv_erff defined after this kernel
+	setp.ge.f32 	%p2, %f5, 0f3F8060FE;	// %p2: |arg| >= ~1.003 selects the large-argument path
+	mov.f32 	%f101, 0f3789CA3C;	// coefficient set for the large-argument path (replaced below on the other path)
+	mov.f32 	%f100, 0fB9F560B9;
+	mov.f32 	%f99, 0f3BAC840B;
+	mov.f32 	%f98, 0fBD0C8162;
+	mov.f32 	%f97, 0f3E1CF906;
+	mov.f32 	%f96, 0f3F6A937E;
+	mov.f32 	%f95, 0f3F20D842;
+	mov.f32 	%f102, %f5;	// polynomial variable = |arg| on the large-argument path
+	@%p2 bra 	$L__BB0_2;
+	.loc	1 0 23
+	mov.f32 	%f101, 0f38B1E96A;	// coefficient set for the small-argument path
+	mov.f32 	%f100, 0fBA574D20;
+	mov.f32 	%f99, 0f3BAAD5EA;
+	mov.f32 	%f98, 0fBCDC1BE7;
+	mov.f32 	%f97, 0f3DE718AF;
+	mov.f32 	%f96, 0fBEC093AC;
+	mov.f32 	%f95, 0f3E0375D3;
+	.loc	1 30 23
+	mul.f32 	%f102, %f3, %f3;	// polynomial variable = arg^2 on the small-argument path
+$L__BB0_2:
+	.loc	1 0 0
+	mul.f32 	%f4, %f2, 0f3F3504F3;	// %f4 = x1 * 0.70710677, the erf argument for element 1
+	.loc	1 30 23
+	setp.ltu.f32 	%p3, %f5, 0f3F8060FE;	// %p3: |arg| below the threshold, or unordered (NaN)
+	fma.rn.ftz.f32 	%f45, %f101, %f102, %f100;	// Horner evaluation: six fused multiply-adds over seven coefficients
+	fma.rn.ftz.f32 	%f46, %f45, %f102, %f99;
+	fma.rn.ftz.f32 	%f47, %f46, %f102, %f98;
+	fma.rn.ftz.f32 	%f48, %f47, %f102, %f97;
+	fma.rn.ftz.f32 	%f49, %f48, %f102, %f96;
+	fma.rn.ftz.f32 	%f50, %f49, %f102, %f95;
+	neg.f32 	%f51, %f102;
+	selp.f32 	%f52, %f51, %f3, %p2;	// %f52 = -|arg| on the large path, arg itself otherwise
+	fma.rn.ftz.f32 	%f103, %f50, %f52, %f52;	// %f103 = poly * %f52 + %f52
+	mov.f32 	%f94, 0f3F800000;	// 1.0f, reused for both elements
+	@%p3 bra 	$L__BB0_4;	// small path: %f103 is already the final value
+	ex2.approx.ftz.f32 	%f53, %f103;	// large path: 2^%f103
+	sub.f32 	%f55, %f94, %f53;	// 1 - 2^%f103
+	mov.b32 	%r10, %f55;
+	mov.b32 	%r11, %f3;
+	and.b32  	%r12, %r11, -2147483648;	// isolate the sign bit of the argument
+	or.b32  	%r13, %r12, %r10;	// OR the argument's sign bit into the result
+	mov.b32 	%f103, %r13;
+$L__BB0_4:
+	abs.ftz.f32 	%f18, %f4;	// same sequence again for element 1 (%f4 in, %f112 out)
+	setp.ge.f32 	%p5, %f18, 0f3F8060FE;
+	mov.f32 	%f110, 0f3789CA3C;
+	mov.f32 	%f109, 0fB9F560B9;
+	mov.f32 	%f108, 0f3BAC840B;
+	mov.f32 	%f107, 0fBD0C8162;
+	mov.f32 	%f106, 0f3E1CF906;
+	mov.f32 	%f105, 0f3F6A937E;
+	mov.f32 	%f104, 0f3F20D842;
+	mov.f32 	%f111, %f18;
+	@%p5 bra 	$L__BB0_6;
+	mul.f32 	%f111, %f4, %f4;
+	mov.f32 	%f110, 0f38B1E96A;
+	mov.f32 	%f109, 0fBA574D20;
+	mov.f32 	%f108, 0f3BAAD5EA;
+	mov.f32 	%f107, 0fBCDC1BE7;
+	mov.f32 	%f106, 0f3DE718AF;
+	mov.f32 	%f105, 0fBEC093AC;
+	mov.f32 	%f104, 0f3E0375D3;
+$L__BB0_6:
+	setp.ltu.f32 	%p6, %f18, 0f3F8060FE;
+	fma.rn.ftz.f32 	%f70, %f110, %f111, %f109;
+	fma.rn.ftz.f32 	%f71, %f70, %f111, %f108;
+	fma.rn.ftz.f32 	%f72, %f71, %f111, %f107;
+	fma.rn.ftz.f32 	%f73, %f72, %f111, %f106;
+	fma.rn.ftz.f32 	%f74, %f73, %f111, %f105;
+	fma.rn.ftz.f32 	%f75, %f74, %f111, %f104;
+	neg.f32 	%f76, %f111;
+	selp.f32 	%f77, %f76, %f4, %p5;
+	fma.rn.ftz.f32 	%f112, %f75, %f77, %f77;
+	@%p6 bra 	$L__BB0_8;
+	ex2.approx.ftz.f32 	%f78, %f112;
+	sub.f32 	%f80, %f94, %f78;
+	mov.b32 	%r14, %f80;
+	mov.b32 	%r15, %f4;
+	and.b32  	%r16, %r15, -2147483648;
+	or.b32  	%r17, %r16, %r14;
+	mov.b32 	%f112, %r17;
+$L__BB0_8:
+	.loc	1 27 18
+	mul.f32 	%f81, %f2, 0f3F000000;	// 0.5 * x1
+	mul.f32 	%f82, %f1, 0f3F000000;	// 0.5 * x0
+	.loc	1 32 18
+	add.f32 	%f83, %f103, 0f3F800000;	// 1 + erf result for element 0
+	add.f32 	%f84, %f112, 0f3F800000;	// 1 + erf result for element 1
+	.loc	1 33 18
+	mul.f32 	%f85, %f82, %f83;	// final f32 value for element 0
+	mul.f32 	%f86, %f81, %f84;	// final f32 value for element 1
+	.loc	1 35 40
+	mov.b32 	%r18, %f85;
+	cvt.rn.bf16.f32 %rs3, %r18;	// round-to-nearest narrowing back to bf16
+	mov.b32 	%r19, %f86;
+	cvt.rn.bf16.f32 %rs4, %r19;
+	mov.b32 	%r21, {%rs3, %rs4};	// repack both 16-bit results into one 32-bit word
+	@%p1 st.global.b32 [ %rd5 + 0 ], { %r21 };	// overwrite the input pair: same address as the load
+	.loc	1 35 4
+	ret;
+$L__tmp1:
+$L__func_end0:
+}
+	// .globl	__nv_erff
+.visible .func  (.param .b32 func_retval0) __nv_erff(	// device function: one f32 in, one f32 out; two-range polynomial evaluation (presumably libdevice's single-precision erf, going by the symbol name)
+	.param .b32 __nv_erff_param_0	// the f32 argument
+)
+{
+	.reg .pred 	%p<4>;
+	.reg .b32 	%r<5>;
+	.reg .f32 	%f<49>;
+$L__func_begin1:
+	ld.param.f32 	%f14, [__nv_erff_param_0];	// %f14 = argument
+	abs.ftz.f32 	%f1, %f14;	// %f1 = |argument|
+	setp.ge.f32 	%p1, %f1, 0f3F8060FE;	// %p1: |arg| >= ~1.003 selects the large-argument path
+	mov.f32 	%f46, 0f3789CA3C;	// coefficient set for the large-argument path (replaced below on the other path)
+	mov.f32 	%f45, 0fB9F560B9;
+	mov.f32 	%f44, 0f3BAC840B;
+	mov.f32 	%f43, 0fBD0C8162;
+	mov.f32 	%f42, 0f3E1CF906;
+	mov.f32 	%f41, 0f3F6A937E;
+	mov.f32 	%f40, 0f3F20D842;
+	mov.f32 	%f47, %f1;	// polynomial variable = |arg| on the large-argument path
+	@%p1 bra 	$L__BB1_2;
+	mul.f32 	%f47, %f14, %f14;	// polynomial variable = arg^2 on the small-argument path
+	mov.f32 	%f46, 0f38B1E96A;	// coefficient set for the small-argument path
+	mov.f32 	%f45, 0fBA574D20;
+	mov.f32 	%f44, 0f3BAAD5EA;
+	mov.f32 	%f43, 0fBCDC1BE7;
+	mov.f32 	%f42, 0f3DE718AF;
+	mov.f32 	%f41, 0fBEC093AC;
+	mov.f32 	%f40, 0f3E0375D3;
+$L__BB1_2:
+	setp.ltu.f32 	%p2, %f1, 0f3F8060FE;	// %p2: |arg| below the threshold, or unordered (NaN)
+	fma.rn.ftz.f32 	%f29, %f46, %f47, %f45;	// Horner evaluation: six fused multiply-adds over seven coefficients
+	fma.rn.ftz.f32 	%f30, %f29, %f47, %f44;
+	fma.rn.ftz.f32 	%f31, %f30, %f47, %f43;
+	fma.rn.ftz.f32 	%f32, %f31, %f47, %f42;
+	fma.rn.ftz.f32 	%f33, %f32, %f47, %f41;
+	fma.rn.ftz.f32 	%f34, %f33, %f47, %f40;
+	neg.f32 	%f35, %f47;
+	selp.f32 	%f36, %f35, %f14, %p1;	// %f36 = -|arg| on the large path, arg itself otherwise
+	fma.rn.ftz.f32 	%f48, %f34, %f36, %f36;	// %f48 = poly * %f36 + %f36
+	@%p2 bra 	$L__BB1_4;	// small path: %f48 is already the return value
+	ex2.approx.ftz.f32 	%f37, %f48;	// large path: 2^%f48
+	mov.f32 	%f38, 0f3F800000;	// 1.0f
+	sub.f32 	%f39, %f38, %f37;	// 1 - 2^%f48
+	mov.b32 	%r1, %f39;
+	mov.b32 	%r2, %f14;
+	and.b32  	%r3, %r2, -2147483648;	// isolate the sign bit of the argument
+	or.b32  	%r4, %r3, %r1;	// OR the argument's sign bit into the result
+	mov.b32 	%f48, %r4;
+$L__BB1_4:
+	st.param.f32 	[func_retval0+0], %f48;	// return %f48
+	ret;
+$L__func_end1:
+}
+	.file	1 "/tmp/torchinductor_root/kp/ckphrtdpgsxl7sfarkkzylhv4st3uhmzvg3u6z5excfp6ydybq74.py"
+	.section	.debug_abbrev
+	{
+.b8 1
+.b8 17
+.b8 1
+.b8 37
+.b8 8
+.b8 19
+.b8 5
+.b8 3
+.b8 8
+.b8 16
+.b8 6
+.b8 27
+.b8 8
+.b8 180
+.b8 66
+.b8 12
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+.b8 0
+.b8 2
+.b8 46
+.b8 0
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 64
+.b8 10
+.b8 135
+.b8 64
+.b8 8
+.b8 3
+.b8 8
+.b8 58
+.b8 11
+.b8 59
+.b8 11
+.b8 63
+.b8 12
+.b8 0
+.b8 0
+.b8 0
+	}
+	.section	.debug_info
+	{
+.b32 172
+.b8 2
+.b8 0
+.b32 .debug_abbrev
+.b8 8
+.b8 1
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2
+.b8 0
+.b8 99
+.b8 107
+.b8 112
+.b8 104
+.b8 114
+.b8 116
+.b8 100
+.b8 112
+.b8 103
+.b8 115
+.b8 120
+.b8 108
+.b8 55
+.b8 115
+.b8 102
+.b8 97
+.b8 114
+.b8 107
+.b8 107
+.b8 122
+.b8 121
+.b8 108
+.b8 104
+.b8 118
+.b8 52
+.b8 115
+.b8 116
+.b8 51
+.b8 117
+.b8 104
+.b8 109
+.b8 122
+.b8 118
+.b8 103
+.b8 51
+.b8 117
+.b8 54
+.b8 122
+.b8 53
+.b8 101
+.b8 120
+.b8 99
+.b8 102
+.b8 112
+.b8 54
+.b8 121
+.b8 100
+.b8 121
+.b8 98
+.b8 113
+.b8 55
+.b8 52
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line
+.b8 47
+.b8 116
+.b8 109
+.b8 112
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 95
+.b8 114
+.b8 111
+.b8 111
+.b8 116
+.b8 47
+.b8 107
+.b8 112
+.b8 0
+.b8 1
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 2
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 1
+.b8 156
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 101
+.b8 0
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 101
+.b8 0
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+	}
+	.section	.debug_pubnames
+	{
+.b32 $L__pubNames_end0-$L__pubNames_start0
+$L__pubNames_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 176
+.b32 125
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 101
+.b8 0
+.b32 0
+$L__pubNames_end0:
+	}
+	.section	.debug_pubtypes
+	{
+.b32 $L__pubTypes_end0-$L__pubTypes_start0
+$L__pubTypes_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 176
+.b32 0
+$L__pubTypes_end0:
+	}
+	.section	.debug_loc	{	}

.triton/dump/1e922bbbab749da355e4bad9c6b245e6/triton_.ttgir ADDED Viewed

	@@ -0,0 +1,26 @@

+#blocked = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> // 1-D layout: 2 elements/thread * 32 threads/warp * 8 warps = 512 elements per CTA
+module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
+  tt.func public @triton__0d1de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { // in-place elementwise kernel on a bf16 buffer; %arg1 is not referenced in the body
+    %cst = arith.constant dense<1.000000e+00> : tensor<512xf32, #blocked>
+    %cst_0 = arith.constant dense<0.707106769> : tensor<512xf32, #blocked> // scale applied to x before erf
+    %cst_1 = arith.constant dense<5.000000e-01> : tensor<512xf32, #blocked>
+    %c512_i32 = arith.constant 512 : i32 // elements handled per program instance
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c512_i32 : i32 // first element index of this program's tile
+    %2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked>
+    %3 = tt.splat %1 : (i32) -> tensor<512xi32, #blocked>
+    %4 = arith.addi %3, %2 : tensor<512xi32, #blocked> // per-lane element indices
+    %5 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<512x!tt.ptr<bf16, 1>, #blocked>
+    %6 = tt.addptr %5, %4 : tensor<512x!tt.ptr<bf16, 1>, #blocked>, tensor<512xi32, #blocked> // per-lane element pointers, reused by the store
+    %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xbf16, #blocked> // load has no mask operand
+    %8 = arith.extf %7 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> // x, widened to f32
+    %9 = arith.mulf %8, %cst_1 : tensor<512xf32, #blocked> // 0.5 * x
+    %10 = arith.mulf %8, %cst_0 : tensor<512xf32, #blocked> // x * 0.707106769
+    %11 = tt.extern_elementwise %10 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_erff"} : (tensor<512xf32, #blocked>) -> tensor<512xf32, #blocked> // elementwise call to the external symbol __nv_erff
+    %12 = arith.addf %11, %cst : tensor<512xf32, #blocked> // __nv_erff(...) + 1
+    %13 = arith.mulf %9, %12 : tensor<512xf32, #blocked> // 0.5 * x * (1 + __nv_erff(x * 0.707106769))
+    %14 = arith.truncf %13 : tensor<512xf32, #blocked> to tensor<512xbf16, #blocked> // narrow back to bf16
+    tt.store %6, %14 {cache = 1 : i32, evict = 1 : i32} : tensor<512xbf16, #blocked> // write through the same pointers the load used
+    tt.return
+  }
+}

.triton/dump/1ed98b0d136db679153ca6a42fff755c/triton_.llir ADDED Viewed

	@@ -0,0 +1,503 @@

+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+@assertFunc_1 = internal constant [25 x i8] c"_call_with_frames_removed" ; function-name string passed to @__assertfail for the tmp13 bounds check
+@assertFile_1 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>" ; file-name string passed to @__assertfail for the tmp13 bounds check
+@assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp13 < 50257" ; message string passed to @__assertfail for the tmp13 bounds check
+@assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed" ; function-name string passed to @__assertfail for the tmp3 bounds check
+@assertFile_0 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>" ; file-name string passed to @__assertfail for the tmp3 bounds check
+@assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257" ; message string passed to @__assertfail for the tmp3 bounds check
+@global_smem = external addrspace(3) global [0 x i8] ; externally defined addrspace(3) byte array; the kernel below indexes it for its st.shared / ld.shared accesses
+@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 ; key string passed to @__nvvm_reflect in the kernel below
+declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr ; external assertion handler, called in the kernel below with (message, file, i32 883, function, i64 1)
+define void @triton__0d1d2d3d4d5de6de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i32 %5, i32 %6) local_unnamed_addr !dbg !7 {
+  %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
+  %9 = and i32 %8, 31, !dbg !10
+  %10 = lshr i32 %8, 5, !dbg !10
+  %11 = lshr i32 %8, 6, !dbg !10
+  %12 = and i32 %11, 1, !dbg !10
+  %13 = and i32 %8, 1, !dbg !10
+  %14 = and i32 %10, 1, !dbg !11
+  %urem = shl i32 %8, 2, !dbg !11
+  %15 = and i32 %urem, 252, !dbg !11
+  %16 = shl i32 %8, 1, !dbg !11
+  %17 = and i32 %16, 254, !dbg !11
+  %18 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !12
+  %19 = shl i32 %18, 1, !dbg !13
+  %20 = or i32 %19, %12, !dbg !14
+  %21 = or i32 %19, %13, !dbg !14
+  %22 = sext i32 %20 to i64, !dbg !15
+  %23 = getelementptr i64, ptr addrspace(1) %0, i64 %22, !dbg !15
+  %24 = sext i32 %21 to i64, !dbg !15
+  %25 = getelementptr i64, ptr addrspace(1) %0, i64 %24, !dbg !15
+  %26 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !16
+  %27 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !16
+  %28 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !16
+  %29 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !16
+  %30 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #6, !dbg !16
+  %31 = srem i32 %20, 512, !dbg !17
+  %32 = shl nsw i32 %31, 8, !dbg !18
+  %33 = or i32 %32, %15, !dbg !19
+  %34 = sext i32 %33 to i64, !dbg !20
+  %35 = getelementptr float, ptr addrspace(1) %2, i64 %34, !dbg !20
+  %36 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %35, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !21
+  %37 = extractvalue { i32, i32, i32, i32 } %36, 0, !dbg !21
+  %38 = extractvalue { i32, i32, i32, i32 } %36, 1, !dbg !21
+  %39 = extractvalue { i32, i32, i32, i32 } %36, 2, !dbg !21
+  %40 = extractvalue { i32, i32, i32, i32 } %36, 3, !dbg !21
+  %41 = bitcast i32 %37 to float, !dbg !21
+  %42 = bitcast i32 %38 to float, !dbg !21
+  %43 = bitcast i32 %39 to float, !dbg !21
+  %44 = bitcast i32 %40 to float, !dbg !21
+  %45 = add i64 %30, 50257, !dbg !22
+  %46 = icmp slt i64 %26, 0, !dbg !23
+  %47 = icmp slt i64 %30, 0, !dbg !23
+  %48 = select i1 %47, i64 %45, i64 %30, !dbg !24
+  %49 = icmp ugt i64 %48, 50256, !dbg !25
+  br i1 %49, label %50, label %51, !dbg !26
+50:                                               ; preds = %7
+  tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !26
+  br label %51, !dbg !26
+51:                                               ; preds = %50, %7
+  %52 = shl i64 %26, 8, !dbg !27
+  %53 = add i64 %52, 12865792, !dbg !27
+  %54 = select i1 %46, i64 %53, i64 %52, !dbg !27
+  %55 = zext nneg i32 %15 to i64
+  %56 = or i64 %54, %55, !dbg !28
+  %57 = getelementptr float, ptr addrspace(1) %1, i64 %56, !dbg !29
+  %58 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %57, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !30
+  %59 = extractvalue { i32, i32, i32, i32 } %58, 0, !dbg !30
+  %60 = extractvalue { i32, i32, i32, i32 } %58, 1, !dbg !30
+  %61 = extractvalue { i32, i32, i32, i32 } %58, 2, !dbg !30
+  %62 = extractvalue { i32, i32, i32, i32 } %58, 3, !dbg !30
+  %63 = bitcast i32 %59 to float, !dbg !30
+  %64 = bitcast i32 %60 to float, !dbg !30
+  %65 = bitcast i32 %61 to float, !dbg !30
+  %66 = bitcast i32 %62 to float, !dbg !30
+  %67 = fadd float %41, %63, !dbg !31
+  %68 = fadd float %42, %64, !dbg !31
+  %69 = fadd float %43, %65, !dbg !31
+  %70 = fadd float %44, %66, !dbg !31
+  %71 = fadd float %67, 0.000000e+00, !dbg !32
+  %72 = fadd float %68, 0.000000e+00, !dbg !32
+  %73 = fadd float %69, 0.000000e+00, !dbg !32
+  %74 = fadd float %70, 0.000000e+00, !dbg !32
+  %75 = fsub float %67, %71, !dbg !36
+  %76 = fsub float %68, %72, !dbg !36
+  %77 = fsub float %69, %73, !dbg !36
+  %78 = fsub float %70, %74, !dbg !36
+  %79 = fmul float %67, %75, !dbg !37
+  %80 = fmul float %68, %76, !dbg !37
+  %81 = fmul float %69, %77, !dbg !37
+  %82 = fmul float %70, %78, !dbg !37
+  %83 = fadd float %79, 0.000000e+00, !dbg !38
+  %84 = fadd float %80, 0.000000e+00, !dbg !38
+  %85 = fadd float %81, 0.000000e+00, !dbg !38
+  %86 = fadd float %82, 0.000000e+00, !dbg !38
+  %87 = fsub float %72, %71, !dbg !39
+  %88 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 2.000000e+00) #6, !dbg !43
+  %89 = fmul float %88, %87, !dbg !44
+  %90 = fadd float %71, %89, !dbg !45
+  %91 = fadd float %83, %84, !dbg !46
+  %92 = fmul float %87, %87, !dbg !47
+  %93 = fmul float %88, %92, !dbg !48
+  %94 = fadd float %93, %91, !dbg !49
+  %95 = fsub float %73, %90, !dbg !39
+  %96 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 3.000000e+00) #6, !dbg !43
+  %97 = fmul float %96, %95, !dbg !44
+  %98 = fadd float %90, %97, !dbg !45
+  %99 = fadd float %85, %94, !dbg !46
+  %100 = fmul float %95, %95, !dbg !47
+  %101 = fmul float %100, 2.000000e+00, !dbg !50
+  %102 = fmul float %96, %101, !dbg !48
+  %103 = fadd float %99, %102, !dbg !49
+  %104 = fsub float %74, %98, !dbg !39
+  %105 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 4.000000e+00) #6, !dbg !43
+  %106 = fmul float %105, %104, !dbg !44
+  %107 = fadd float %98, %106, !dbg !45
+  %108 = fadd float %86, %103, !dbg !46
+  %109 = fmul float %104, %104, !dbg !47
+  %110 = fmul float %109, 3.000000e+00, !dbg !50
+  %111 = fmul float %105, %110, !dbg !48
+  %112 = fadd float %108, %111, !dbg !49
+  %113 = bitcast float %107 to i32, !dbg !51
+  %114 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %113, i32 16, i32 31), !dbg !51
+  %115 = bitcast i32 %114 to float, !dbg !51
+  %116 = bitcast float %112 to i32, !dbg !51
+  %117 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %116, i32 16, i32 31), !dbg !51
+  %118 = bitcast i32 %117 to float, !dbg !51
+  %119 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 1082130432, i32 16, i32 31), !dbg !51
+  %120 = bitcast i32 %119 to float, !dbg !51
+  %121 = fsub float %115, %107, !dbg !39
+  %122 = fadd float %120, 4.000000e+00, !dbg !53
+  %123 = fcmp oeq float %122, 0.000000e+00, !dbg !54
+  %124 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %120, float %122) #6, !dbg !43
+  %125 = select i1 %123, float 0.000000e+00, float %124, !dbg !55
+  %126 = fmul float %125, %121, !dbg !44
+  %127 = fadd float %107, %126, !dbg !45
+  %128 = fadd float %112, %118, !dbg !46
+  %129 = fmul float %121, %121, !dbg !47
+  %130 = fmul float %129, 4.000000e+00, !dbg !50
+  %131 = fmul float %125, %130, !dbg !48
+  %132 = fadd float %128, %131, !dbg !49
+  %133 = bitcast float %127 to i32, !dbg !51
+  %134 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %133, i32 8, i32 31), !dbg !51
+  %135 = bitcast i32 %134 to float, !dbg !51
+  %136 = bitcast float %132 to i32, !dbg !51
+  %137 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %136, i32 8, i32 31), !dbg !51
+  %138 = bitcast i32 %137 to float, !dbg !51
+  %139 = bitcast float %122 to i32, !dbg !51
+  %140 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %139, i32 8, i32 31), !dbg !51
+  %141 = bitcast i32 %140 to float, !dbg !51
+  %142 = fsub float %135, %127, !dbg !39
+  %143 = fadd float %122, %141, !dbg !53
+  %144 = fcmp oeq float %143, 0.000000e+00, !dbg !54
+  %145 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %141, float %143) #6, !dbg !43
+  %146 = select i1 %144, float 0.000000e+00, float %145, !dbg !55
+  %147 = fmul float %146, %142, !dbg !44
+  %148 = fadd float %127, %147, !dbg !45
+  %149 = fadd float %132, %138, !dbg !46
+  %150 = fmul float %142, %142, !dbg !47
+  %151 = fmul float %122, %150, !dbg !50
+  %152 = fmul float %146, %151, !dbg !48
+  %153 = fadd float %149, %152, !dbg !49
+  %154 = bitcast float %148 to i32, !dbg !51
+  %155 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %154, i32 4, i32 31), !dbg !51
+  %156 = bitcast i32 %155 to float, !dbg !51
+  %157 = bitcast float %153 to i32, !dbg !51
+  %158 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %157, i32 4, i32 31), !dbg !51
+  %159 = bitcast i32 %158 to float, !dbg !51
+  %160 = bitcast float %143 to i32, !dbg !51
+  %161 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %160, i32 4, i32 31), !dbg !51
+  %162 = bitcast i32 %161 to float, !dbg !51
+  %163 = fsub float %156, %148, !dbg !39
+  %164 = fadd float %143, %162, !dbg !53
+  %165 = fcmp oeq float %164, 0.000000e+00, !dbg !54
+  %166 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %162, float %164) #6, !dbg !43
+  %167 = select i1 %165, float 0.000000e+00, float %166, !dbg !55
+  %168 = fmul float %167, %163, !dbg !44
+  %169 = fadd float %148, %168, !dbg !45
+  %170 = fadd float %153, %159, !dbg !46
+  %171 = fmul float %163, %163, !dbg !47
+  %172 = fmul float %143, %171, !dbg !50
+  %173 = fmul float %167, %172, !dbg !48
+  %174 = fadd float %170, %173, !dbg !49
+  %175 = bitcast float %169 to i32, !dbg !51
+  %176 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %175, i32 2, i32 31), !dbg !51
+  %177 = bitcast i32 %176 to float, !dbg !51
+  %178 = bitcast float %174 to i32, !dbg !51
+  %179 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %178, i32 2, i32 31), !dbg !51
+  %180 = bitcast i32 %179 to float, !dbg !51
+  %181 = bitcast float %164 to i32, !dbg !51
+  %182 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %181, i32 2, i32 31), !dbg !51
+  %183 = bitcast i32 %182 to float, !dbg !51
+  %184 = fsub float %177, %169, !dbg !39
+  %185 = fadd float %164, %183, !dbg !53
+  %186 = fcmp oeq float %185, 0.000000e+00, !dbg !54
+  %187 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %183, float %185) #6, !dbg !43
+  %188 = select i1 %186, float 0.000000e+00, float %187, !dbg !55
+  %189 = fmul float %188, %184, !dbg !44
+  %190 = fadd float %169, %189, !dbg !45
+  %191 = fadd float %174, %180, !dbg !46
+  %192 = fmul float %184, %184, !dbg !47
+  %193 = fmul float %164, %192, !dbg !50
+  %194 = fmul float %188, %193, !dbg !48
+  %195 = fadd float %191, %194, !dbg !49
+  %196 = bitcast float %190 to i32, !dbg !51
+  %197 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %196, i32 1, i32 31), !dbg !51
+  %198 = bitcast i32 %197 to float, !dbg !51
+  %199 = bitcast float %195 to i32, !dbg !51
+  %200 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %199, i32 1, i32 31), !dbg !51
+  %201 = bitcast i32 %200 to float, !dbg !51
+  %202 = bitcast float %185 to i32, !dbg !51
+  %203 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %202, i32 1, i32 31), !dbg !51
+  %204 = bitcast i32 %203 to float, !dbg !51
+  %205 = fsub float %198, %190, !dbg !39
+  %206 = fadd float %185, %204, !dbg !53
+  %207 = fcmp oeq float %206, 0.000000e+00, !dbg !54
+  %208 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %204, float %206) #6, !dbg !43
+  %209 = select i1 %207, float 0.000000e+00, float %208, !dbg !55
+  %210 = fmul float %205, %209, !dbg !44
+  %211 = fadd float %190, %210, !dbg !45
+  %212 = fadd float %195, %201, !dbg !46
+  %213 = fmul float %205, %205, !dbg !47
+  %214 = fmul float %185, %213, !dbg !50
+  %215 = fmul float %209, %214, !dbg !48
+  %216 = fadd float %212, %215, !dbg !49
+  %217 = icmp eq i32 %9, 0, !dbg !51
+  %218 = shl nuw nsw i32 %12, 1, !dbg !51
+  %219 = or i32 %218, %14, !dbg !51
+  %220 = zext nneg i32 %219 to i64, !dbg !51
+  %221 = getelementptr float, ptr addrspace(3) @global_smem, i64 %220, !dbg !51
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %221, float %211, i1 %217) #6, !dbg !51
+  %222 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), i64 %220, !dbg !51
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %222, float %216, i1 %217) #6, !dbg !51
+  %223 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 32), i64 %220, !dbg !51
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %223, float %206, i1 %217) #6, !dbg !51
+  tail call void @llvm.nvvm.barrier0(), !dbg !51
+  %224 = icmp slt i32 %8, 4, !dbg !51
+  %225 = sext i32 %8 to i64, !dbg !51
+  %226 = getelementptr float, ptr addrspace(3) @global_smem, i64 %225, !dbg !51
+  %227 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %226, i1 %224) #6, !dbg !51
+  %228 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), i64 %225, !dbg !51
+  %229 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %228, i1 %224) #6, !dbg !51
+  %230 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 32), i64 %225, !dbg !51
+  %231 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %230, i1 %224) #6, !dbg !51
+  %232 = bitcast float %227 to i32, !dbg !51
+  %233 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %232, i32 1, i32 31), !dbg !51
+  %234 = bitcast i32 %233 to float, !dbg !51
+  %235 = bitcast float %229 to i32, !dbg !51
+  %236 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %235, i32 1, i32 31), !dbg !51
+  %237 = bitcast i32 %236 to float, !dbg !51
+  %238 = bitcast float %231 to i32, !dbg !51
+  %239 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %238, i32 1, i32 31), !dbg !51
+  %240 = bitcast i32 %239 to float, !dbg !51
+  %241 = fsub float %234, %227, !dbg !39
+  %242 = fadd float %231, %240, !dbg !53
+  %243 = fcmp oeq float %242, 0.000000e+00, !dbg !54
+  %244 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %240, float %242) #6, !dbg !43
+  %245 = select i1 %243, float 0.000000e+00, float %244, !dbg !55
+  %246 = fmul float %241, %245, !dbg !44
+  %247 = fadd float %227, %246, !dbg !45
+  %248 = fadd float %229, %237, !dbg !46
+  %249 = fmul float %241, %241, !dbg !47
+  %250 = fmul float %231, %249, !dbg !50
+  %251 = fmul float %250, %245, !dbg !48
+  %252 = fadd float %248, %251, !dbg !49
+  %253 = icmp eq i32 %13, 0, !dbg !51
+  %254 = and i1 %224, %253, !dbg !51
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %226, float %247, i1 %254) #6, !dbg !51
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %228, float %252, i1 %254) #6, !dbg !51
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %230, float %242, i1 %254) #6, !dbg !51
+  tail call void @llvm.nvvm.barrier0(), !dbg !51
+  %255 = zext nneg i32 %218 to i64, !dbg !51
+  %256 = getelementptr float, ptr addrspace(3) @global_smem, i64 %255, !dbg !51
+  %257 = load float, ptr addrspace(3) %256, align 4, !dbg !51
+  %258 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), i64 %255, !dbg !51
+  %259 = load float, ptr addrspace(3) %258, align 4, !dbg !51
+  %260 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %35, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !56
+  %261 = zext nneg i32 %17 to i64, !dbg !57
+  %262 = getelementptr float, ptr addrspace(1) %3, i64 %261, !dbg !57
+  %263 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_last.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %262, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !58
+  br i1 %49, label %264, label %265, !dbg !59
+264:                                              ; preds = %51
+  tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 883, ptr nonnull @assertFunc_1, i64 1), !dbg !59
+  br label %265, !dbg !59
+265:                                              ; preds = %264, %51
+  %266 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %57, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !60
+  %267 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %259, float 2.560000e+02) #6, !dbg !61
+  %268 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %259, float 2.560000e+02) #6, !dbg !61
+  %269 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %259, float 2.560000e+02) #6, !dbg !61
+  %270 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %259, float 2.560000e+02) #6, !dbg !61
+  %271 = fadd float %267, 0x3EE4F8B580000000, !dbg !62
+  %272 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
+  %.not.i = icmp eq i32 %272, 0, !dbg !63
+  br i1 %.not.i, label %275, label %273, !dbg !63
+273:                                              ; preds = %265
+  %274 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %271), !dbg !63
+  br label %__nv_rsqrtf.exit, !dbg !63
+275:                                              ; preds = %265
+  %276 = tail call float @llvm.nvvm.rsqrt.approx.f(float %271), !dbg !63
+  br label %__nv_rsqrtf.exit, !dbg !63
+__nv_rsqrtf.exit:                                 ; preds = %273, %275
+  %.0.i = phi float [ %274, %273 ], [ %276, %275 ], !dbg !63
+  %277 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
+  %278 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
+  %279 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
+  %280 = extractvalue { i32, i32, i32, i32 } %266, 3, !dbg !60
+  %281 = bitcast i32 %280 to float, !dbg !60
+  %282 = extractvalue { i32, i32, i32, i32 } %260, 3, !dbg !56
+  %283 = bitcast i32 %282 to float, !dbg !56
+  %284 = fadd float %283, %281, !dbg !64
+  %285 = fsub float %284, %257, !dbg !65
+  %286 = extractvalue { i32, i32, i32, i32 } %266, 2, !dbg !60
+  %287 = bitcast i32 %286 to float, !dbg !60
+  %288 = extractvalue { i32, i32, i32, i32 } %260, 2, !dbg !56
+  %289 = bitcast i32 %288 to float, !dbg !56
+  %290 = fadd float %289, %287, !dbg !64
+  %291 = fsub float %290, %257, !dbg !65
+  %292 = extractvalue { i32, i32, i32, i32 } %266, 1, !dbg !60
+  %293 = bitcast i32 %292 to float, !dbg !60
+  %294 = extractvalue { i32, i32, i32, i32 } %260, 1, !dbg !56
+  %295 = bitcast i32 %294 to float, !dbg !56
+  %296 = fadd float %295, %293, !dbg !64
+  %297 = fsub float %296, %257, !dbg !65
+  %298 = extractvalue { i32, i32, i32, i32 } %266, 0, !dbg !60
+  %299 = bitcast i32 %298 to float, !dbg !60
+  %300 = extractvalue { i32, i32, i32, i32 } %260, 0, !dbg !56
+  %301 = bitcast i32 %300 to float, !dbg !56
+  %302 = fadd float %301, %299, !dbg !64
+  %303 = fsub float %302, %257, !dbg !65
+  %304 = extractvalue { i32, i32 } %263, 0, !dbg !58
+  %305 = extractvalue { i32, i32 } %263, 1, !dbg !58
+  %306 = fmul float %303, %.0.i, !dbg !66
+  %307 = fmul float %297, %.0.i, !dbg !66
+  %308 = fmul float %291, %.0.i, !dbg !66
+  %309 = fmul float %285, %.0.i, !dbg !66
+  tail call void @llvm.nvvm.barrier0(), !dbg !67
+  %310 = getelementptr float, ptr addrspace(3) @global_smem, i64 %261, !dbg !67
+  %311 = insertelement <2 x i32> undef, i32 %304, i64 0, !dbg !67
+  %312 = insertelement <2 x i32> %311, i32 %305, i64 1, !dbg !67
+  store <2 x i32> %312, ptr addrspace(3) %310, align 8, !dbg !67
+  tail call void @llvm.nvvm.barrier0(), !dbg !67
+  %313 = getelementptr float, ptr addrspace(3) @global_smem, i64 %55, !dbg !67
+  %314 = load float, ptr addrspace(3) %313, align 16, !dbg !67
+  %315 = getelementptr inbounds <4 x float>, ptr addrspace(3) %313, i64 0, i64 1, !dbg !67
+  %316 = load float, ptr addrspace(3) %315, align 4, !dbg !67
+  %317 = getelementptr inbounds <4 x float>, ptr addrspace(3) %313, i64 0, i64 2, !dbg !67
+  %318 = load float, ptr addrspace(3) %317, align 8, !dbg !67
+  %319 = getelementptr inbounds <4 x float>, ptr addrspace(3) %313, i64 0, i64 3, !dbg !67
+  %320 = load float, ptr addrspace(3) %319, align 4, !dbg !67
+  %321 = fmul float %306, %314, !dbg !67
+  %322 = fmul float %307, %316, !dbg !67
+  %323 = fmul float %308, %318, !dbg !67
+  %324 = fmul float %309, %320, !dbg !67
+  %325 = shl i32 %20, 8, !dbg !68
+  %326 = or i32 %325, %15, !dbg !69
+  %327 = sext i32 %326 to i64, !dbg !70
+  %328 = getelementptr i16, ptr addrspace(1) %4, i64 %327, !dbg !70
+  %329 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %321) #6, !dbg !71
+  %330 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %322) #6, !dbg !71
+  %331 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %323) #6, !dbg !71
+  %332 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %324) #6, !dbg !71
+  %333 = insertelement <2 x i16> undef, i16 %329, i64 0, !dbg !71
+  %334 = insertelement <2 x i16> %333, i16 %330, i64 1, !dbg !71
+  %335 = bitcast <2 x i16> %334 to i32, !dbg !71
+  %336 = insertelement <2 x i16> undef, i16 %331, i64 0, !dbg !71
+  %337 = insertelement <2 x i16> %336, i16 %332, i64 1, !dbg !71
+  %338 = bitcast <2 x i16> %337 to i32, !dbg !71
+  tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %335, i32 %338, ptr addrspace(1) %328, i1 true) #6, !dbg !71
+  ret void, !dbg !72
+}
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier0() #2
+; Function Attrs: alwaysinline nounwind
+define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 { ; returns an approximate reciprocal square root of %x via one of two NVVM intrinsics
+  %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6 ; query the reflect key named by @.str (presumably "__CUDA_FTZ"; the string body is not shown here)
+  %.not = icmp eq i32 %1, 0 ; true when the reflect query returned 0
+  br i1 %.not, label %4, label %2 ; 0 -> plain variant (%4), non-zero -> .ftz variant (%2)
+2:                                                ; preds = %0
+  %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x) ; approximate rsqrt, .ftz form
+  br label %6
+4:                                                ; preds = %0
+  %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x) ; approximate rsqrt, non-.ftz form
+  br label %6
+6:                                                ; preds = %4, %2
+  %.0 = phi float [ %3, %2 ], [ %5, %4 ] ; take the result of whichever branch executed
+  ret float %.0
+}
+declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.f(float) #5
+attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+attributes #2 = { convergent nocallback nounwind }
+attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #6 = { nounwind }
+!llvm.module.flags = !{!0, !1}
+!llvm.dbg.cu = !{!2}
+!nvvm.annotations = !{!4, !5, !5, !4}
+!llvm.ident = !{!6}
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!3 = !DIFile(filename: "cgx5lxpuexpindj4dsmjz5x42uhyy7iskevq7ovzpwagb3t5powj.py", directory: "/tmp/torchinductor_root/gx")
+!4 = !{ptr @triton__0d1d2d3d4d5de6de, !"kernel", i32 1}
+!5 = !{ptr @triton__0d1d2d3d4d5de6de, !"maxntidx", i32 128}
+!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
+!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5de6de", linkageName: "triton__0d1d2d3d4d5de6de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
+!9 = !{}
+!10 = !DILocation(line: 22, column: 44, scope: !7)
+!11 = !DILocation(line: 24, column: 33, scope: !7)
+!12 = !DILocation(line: 21, column: 28, scope: !7)
+!13 = !DILocation(line: 21, column: 33, scope: !7)
+!14 = !DILocation(line: 22, column: 23, scope: !7)
+!15 = !DILocation(line: 26, column: 30, scope: !7)
+!16 = !DILocation(line: 26, column: 35, scope: !7)
+!17 = !DILocation(line: 27, column: 18, scope: !7)
+!18 = !DILocation(line: 35, column: 44, scope: !7)
+!19 = !DILocation(line: 35, column: 40, scope: !7)
+!20 = !DILocation(line: 35, column: 34, scope: !7)
+!21 = !DILocation(line: 35, column: 50, scope: !7)
+!22 = !DILocation(line: 36, column: 22, scope: !7)
+!23 = !DILocation(line: 37, column: 22, scope: !7)
+!24 = !DILocation(line: 38, column: 36, scope: !7)
+!25 = !DILocation(line: 39, column: 40, scope: !7)
+!26 = !DILocation(line: 39, column: 55, scope: !7)
+!27 = !DILocation(line: 40, column: 44, scope: !7)
+!28 = !DILocation(line: 40, column: 40, scope: !7)
+!29 = !DILocation(line: 40, column: 34, scope: !7)
+!30 = !DILocation(line: 40, column: 52, scope: !7)
+!31 = !DILocation(line: 41, column: 22, scope: !7)
+!32 = !DILocation(line: 98, column: 22, scope: !33, inlinedAt: !35)
+!33 = distinct !DILexicalBlockFile(scope: !7, file: !34, discriminator: 0)
+!34 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
+!35 = !DILocation(line: 44, column: 38, scope: !33)
+!36 = !DILocation(line: 101, column: 30, scope: !33, inlinedAt: !35)
+!37 = !DILocation(line: 101, column: 22, scope: !33, inlinedAt: !35)
+!38 = !DILocation(line: 101, column: 13, scope: !33, inlinedAt: !35)
+!39 = !DILocation(line: 108, column: 21, scope: !40, inlinedAt: !41)
+!40 = distinct !DILexicalBlockFile(scope: !33, file: !34, discriminator: 0)
+!41 = !DILocation(line: 120, column: 46, scope: !40, inlinedAt: !42)
+!42 = !DILocation(line: 50, column: 41, scope: !40)
+!43 = !DILocation(line: 110, column: 60, scope: !40, inlinedAt: !41)
+!44 = !DILocation(line: 112, column: 25, scope: !40, inlinedAt: !41)
+!45 = !DILocation(line: 112, column: 17, scope: !40, inlinedAt: !41)
+!46 = !DILocation(line: 113, column: 15, scope: !40, inlinedAt: !41)
+!47 = !DILocation(line: 113, column: 30, scope: !40, inlinedAt: !41)
+!48 = !DILocation(line: 113, column: 49, scope: !40, inlinedAt: !41)
+!49 = !DILocation(line: 113, column: 22, scope: !40, inlinedAt: !41)
+!50 = !DILocation(line: 113, column: 38, scope: !40, inlinedAt: !41)
+!51 = !DILocation(line: 120, column: 46, scope: !33, inlinedAt: !52)
+!52 = !DILocation(line: 50, column: 41, scope: !33)
+!53 = !DILocation(line: 109, column: 28, scope: !40, inlinedAt: !41)
+!54 = !DILocation(line: 110, column: 39, scope: !40, inlinedAt: !41)
+!55 = !DILocation(line: 110, column: 49, scope: !40, inlinedAt: !41)
+!56 = !DILocation(line: 59, column: 51, scope: !7)
+!57 = !DILocation(line: 60, column: 35, scope: !7)
+!58 = !DILocation(line: 60, column: 40, scope: !7)
+!59 = !DILocation(line: 64, column: 57, scope: !7)
+!60 = !DILocation(line: 65, column: 54, scope: !7)
+!61 = !DILocation(line: 69, column: 23, scope: !7)
+!62 = !DILocation(line: 71, column: 24, scope: !7)
+!63 = !DILocation(line: 72, column: 30, scope: !7)
+!64 = !DILocation(line: 66, column: 24, scope: !7)
+!65 = !DILocation(line: 67, column: 24, scope: !7)
+!66 = !DILocation(line: 73, column: 24, scope: !7)
+!67 = !DILocation(line: 74, column: 24, scope: !7)
+!68 = !DILocation(line: 76, column: 39, scope: !7)
+!69 = !DILocation(line: 76, column: 35, scope: !7)
+!70 = !DILocation(line: 76, column: 29, scope: !7)
+!71 = !DILocation(line: 76, column: 52, scope: !7)
+!72 = !DILocation(line: 55, column: 4, scope: !7)

.triton/dump/1ed98b0d136db679153ca6a42fff755c/triton_.ptx ADDED Viewed

	@@ -0,0 +1,988 @@

+//
+// Generated by LLVM NVPTX Back-End
+//
+.version 8.2
+.target sm_89
+.address_size 64
+	// .globl	triton__0d1d2d3d4d5de6de
+.extern .func __assertfail
+(
+	.param .b64 __assertfail_param_0,
+	.param .b64 __assertfail_param_1,
+	.param .b32 __assertfail_param_2,
+	.param .b64 __assertfail_param_3,
+	.param .b64 __assertfail_param_4
+)
+;
+.global .align 1 .b8 assertFunc_1[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
+.global .align 1 .b8 assertFile_1[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
+.global .align 1 .b8 assertMessage_1[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 51, 32, 60, 32, 53, 48, 50, 53, 55};
+.global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
+.global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
+.global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55};
+.extern .shared .align 1 .b8 global_smem[];
+.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
+.visible .entry triton__0d1d2d3d4d5de6de( // fused gather + add + per-row normalization over 256 columns; each CTA handles 2 rows
+	.param .u64 triton__0d1d2d3d4d5de6de_param_0, // base of the 64-bit index array (read with ld.global.b64)
+	.param .u64 triton__0d1d2d3d4d5de6de_param_1, // base of the f32 table gathered from; row = loaded index, 256 floats per row
+	.param .u64 triton__0d1d2d3d4d5de6de_param_2, // base of the f32 array that is added to the gathered row
+	.param .u64 triton__0d1d2d3d4d5de6de_param_3, // base of the f32 per-column multipliers applied after normalization
+	.param .u64 triton__0d1d2d3d4d5de6de_param_4, // base of the bf16 output
+	.param .u32 triton__0d1d2d3d4d5de6de_param_5, // never loaded in this kernel body
+	.param .u32 triton__0d1d2d3d4d5de6de_param_6 // never loaded in this kernel body
+)
+.maxntid 128, 1, 1
+{
+	.reg .pred 	%p<50>;
+	.reg .b16 	%rs<5>;
+	.reg .b32 	%r<169>;
+	.reg .f32 	%f<153>;
+	.reg .b64 	%rd<53>;
+	.loc	1 18 0
+$L__func_begin0:
+	.loc	1 18 0
+	ld.param.u64 	%rd6, [triton__0d1d2d3d4d5de6de_param_3]; // rd6 = multiplier base
+	ld.param.u64 	%rd5, [triton__0d1d2d3d4d5de6de_param_1]; // rd5 = gather table base
+	ld.param.u64 	%rd19, [triton__0d1d2d3d4d5de6de_param_0]; // rd19 = index array base
+$L__tmp0:
+	.loc	1 22 44
+	mov.u32 	%r1, %tid.x; // r1 = thread index within the CTA
+	and.b32  	%r2, %r1, 31; // r2 = tid & 31 (lane within the warp)
+	ld.param.u64 	%rd20, [triton__0d1d2d3d4d5de6de_param_2]; // rd20 = addend array base
+	bfe.u32 	%r3, %r1, 6, 1; // r3 = bit 6 of tid: which of the CTA's two rows this thread serves
+	and.b32  	%r4, %r1, 1; // r4 = tid & 1
+	.loc	1 24 33
+	bfe.u32 	%r5, %r1, 5, 1; // r5 = bit 5 of tid: which warp inside the row's 64 threads
+	shl.b32 	%r24, %r1, 2; // r24 = tid * 4
+	and.b32  	%r6, %r24, 252; // r6 = first of the 4 consecutive columns owned by this thread
+	shl.b32 	%r25, %r1, 1;
+	and.b32  	%r7, %r25, 254; // r7 = first of the 2 multiplier columns this thread loads
+	.loc	1 21 28
+	mov.u32 %r15, %ctaid.x;
+	.loc	1 21 33
+	shl.b32 	%r26, %r15, 1; // r26 = 2 * ctaid
+	.loc	1 22 23
+	or.b32  	%r8, %r26, %r3; // r8 = row index handled by this thread
+	or.b32  	%r27, %r26, %r4; // r27 = 2 * ctaid + (tid & 1)
+	.loc	1 26 30
+	mul.wide.s32 	%rd21, %r8, 8;
+	add.s64 	%rd9, %rd19, %rd21; // rd9 = &index[r8]
+	mul.wide.s32 	%rd22, %r27, 8;
+	add.s64 	%rd17, %rd19, %rd22; // rd17 = &index[r27]
+	mov.pred 	%p44, -1; // p44 is constantly true; it guards every "masked" load/store below
+	.loc	1 26 35
+	mov.u64 %rd8, 0x0;
+	@%p44 ld.global.L1::evict_last.b64 { %rd8 }, [ %rd9 + 0 ]; // rd8 = index[r8], used for the gather address
+	mov.u64 %rd10, 0x0;
+	@%p44 ld.global.L1::evict_last.b64 { %rd10 }, [ %rd9 + 0 ]; // same address again; rd10 is not read afterwards
+	mov.u64 %rd12, 0x0;
+	@%p44 ld.global.L1::evict_last.b64 { %rd12 }, [ %rd9 + 0 ]; // same address again; rd12 is not read afterwards
+	mov.u64 %rd14, 0x0;
+	@%p44 ld.global.L1::evict_last.b64 { %rd14 }, [ %rd9 + 0 ]; // same address again; rd14 is not read afterwards
+	mov.u64 %rd16, 0x0;
+	@%p44 ld.global.L1::evict_last.b64 { %rd16 }, [ %rd17 + 0 ]; // rd16 = index[r27], consumed only by the bounds check
+	.loc	1 27 18
+	bfe.s32 	%r28, %r15, 30, 1; // r28 = 0 or -1 from bit 30 of ctaid
+	shr.u32 	%r29, %r28, 23; // r29 = 0 or 511
+	add.s32 	%r30, %r8, %r29;
+	and.b32  	%r31, %r30, 16776704; // keep bits 9..23 (a multiple of 512)
+	sub.s32 	%r32, %r8, %r31; // r32 = r8 minus that multiple; presumably r8 % 512 - TODO confirm against the Python source
+	.loc	1 35 44
+	shl.b32 	%r33, %r32, 8; // * 256 columns per row
+	.loc	1 35 40
+	or.b32  	%r34, %r33, %r6;
+	.loc	1 35 34
+	mul.wide.s32 	%rd23, %r34, 4;
+	add.s64 	%rd33, %rd20, %rd23; // rd33 = param_2 + 4 * ((r32 << 8) | r6)
+	mov.b32 	%r137, 0; // r137 = 0, the fallback value for masked-off lanes
+	.loc	1 35 50
+	mov.u32 %r16, 0x0;
+	mov.u32 %r17, 0x0;
+	mov.u32 %r18, 0x0;
+	mov.u32 %r19, 0x0;
+	@%p44 ld.global.L1::evict_last.v4.b32 { %r16, %r17, %r18, %r19 }, [ %rd33 + 0 ]; // 4 consecutive f32 from param_2
+	@!%p44 mov.u32 %r16, %r137;
+	@!%p44 mov.u32 %r17, %r137;
+	@!%p44 mov.u32 %r18, %r137;
+	@!%p44 mov.u32 %r19, %r137;
+	mov.b32 	%f1, %r16;
+	mov.b32 	%f2, %r17;
+	mov.b32 	%f3, %r18;
+	mov.b32 	%f4, %r19;
+	.loc	1 36 22
+	add.s64 	%rd24, %rd16, 50257; // idx + 50257
+	.loc	1 37 22
+	setp.lt.s64 	%p11, %rd16, 0; // idx < 0 ?
+	.loc	1 38 36
+	selp.b64 	%rd3, %rd24, %rd16, %p11; // rd3 = idx < 0 ? idx + 50257 : idx
+	.loc	1 39 40
+	setp.lt.u64 	%p12, %rd3, 50257; // p12 = wrapped index lies in [0, 50257)
+	mov.b32 	%r168, 883; // third argument handed to __assertfail
+	mov.u64 	%rd52, 1; // fifth argument handed to __assertfail
+	.loc	1 39 55
+	@%p12 bra 	$L__BB0_2; // in range: skip the assertion call
+	mov.u64 	%rd25, assertMessage_0; // its bytes spell "index out of bounds: 0 <= tmp3 < 50257"
+	cvta.global.u64 	%rd26, %rd25;
+	mov.u64 	%rd27, assertFile_0;
+	cvta.global.u64 	%rd28, %rd27;
+	mov.u64 	%rd29, assertFunc_0;
+	cvta.global.u64 	%rd30, %rd29;
+	{ // callseq 4, 0
+	.reg .b32 temp_param_reg;
+	.param .b64 param0;
+	st.param.b64 	[param0+0], %rd26;
+	.param .b64 param1;
+	st.param.b64 	[param1+0], %rd28;
+	.param .b32 param2;
+	st.param.b32 	[param2+0], %r168;
+	.param .b64 param3;
+	st.param.b64 	[param3+0], %rd30;
+	.param .b64 param4;
+	st.param.b64 	[param4+0], %rd52;
+	call.uni
+	__assertfail,
+	(
+	param0,
+	param1,
+	param2,
+	param3,
+	param4
+	);
+	} // callseq 4
+$L__BB0_2:
+	.loc	1 0 55
+	ld.param.u64 	%rd7, [triton__0d1d2d3d4d5de6de_param_4]; // rd7 = output base
+	.loc	1 37 22
+	setp.lt.s64 	%p36, %rd8, 0; // gather index negative ?
+	.loc	1 40 44
+	shl.b64 	%rd35, %rd8, 8; // idx * 256
+	add.s64 	%rd36, %rd35, 12865792; // 12865792 = 50257 * 256, i.e. (idx + 50257) * 256
+	selp.b64 	%rd37, %rd36, %rd35, %p36; // pick the wrapped form for negative indices
+	cvt.u64.u32 	%rd38, %r6;
+	.loc	1 40 40
+	or.b64  	%rd39, %rd37, %rd38;
+	.loc	1 40 34
+	shl.b64 	%rd40, %rd39, 2;
+	add.s64 	%rd49, %rd5, %rd40; // rd49 = param_1 + 4 * (wrapped_idx * 256 | r6)
+	.loc	1 40 52
+	mov.u32 %r36, 0x0;
+	mov.u32 %r37, 0x0;
+	mov.u32 %r38, 0x0;
+	mov.u32 %r39, 0x0;
+	@%p44 ld.global.L1::evict_last.v4.b32 { %r36, %r37, %r38, %r39 }, [ %rd49 + 0 ]; // 4 consecutive f32 of the gathered row
+	@!%p44 mov.u32 %r36, %r137;
+	@!%p44 mov.u32 %r37, %r137;
+	@!%p44 mov.u32 %r38, %r137;
+	@!%p44 mov.u32 %r39, %r137;
+	mov.b32 	%f7, %r36;
+	mov.b32 	%f8, %r37;
+	mov.b32 	%f9, %r38;
+	mov.b32 	%f10, %r39;
+	.loc	1 41 22
+	add.f32 	%f11, %f1, %f7; // f11..f14 = param_2 value + gathered param_1 value
+	add.f32 	%f12, %f2, %f8;
+	add.f32 	%f13, %f3, %f9;
+	add.f32 	%f14, %f4, %f10;
+$L__tmp1:
+	.loc	2 98 22
+	add.f32 	%f15, %f11, 0f00000000; // per-element starting mean = the element itself
+	add.f32 	%f16, %f12, 0f00000000;
+	add.f32 	%f17, %f13, 0f00000000;
+	add.f32 	%f18, %f14, 0f00000000;
+	.loc	2 101 30
+	sub.f32 	%f19, %f11, %f15;
+	sub.f32 	%f20, %f12, %f16;
+	sub.f32 	%f21, %f13, %f17;
+	sub.f32 	%f22, %f14, %f18;
+	.loc	2 101 13
+	fma.rn.f32 	%f23, %f11, %f19, 0f00000000; // per-element starting second-moment term
+	fma.rn.f32 	%f24, %f12, %f20, 0f00000000;
+	fma.rn.f32 	%f25, %f13, %f21, 0f00000000;
+	fma.rn.f32 	%f26, %f14, %f22, 0f00000000;
+$L__tmp2:
+	.loc	2 108 21
+	sub.f32 	%f27, %f16, %f15; // delta between element 1 and element 0
+	mov.b32 	%r45, 1065353216; // bit pattern of 1.0f
+	mov.b32 	%r46, 1073741824; // bit pattern of 2.0f
+	.loc	2 110 60
+	div.full.f32 %r44, %r45, %r46;
+	mov.b32 	%f28, %r44; // 1/2
+	.loc	2 112 17
+	fma.rn.f32 	%f29, %f28, %f27, %f15; // mean of the first 2 elements
+	.loc	2 113 15
+	add.f32 	%f30, %f23, %f24;
+	.loc	2 113 30
+	mul.f32 	%f31, %f27, %f27;
+	.loc	2 113 22
+	fma.rn.f32 	%f32, %f28, %f31, %f30;
+	.loc	2 108 21
+	sub.f32 	%f33, %f17, %f29;
+	mov.b32 	%r49, 1077936128; // bit pattern of 3.0f
+	.loc	2 110 60
+	div.full.f32 %r47, %r45, %r49;
+	mov.b32 	%f34, %r47; // 1/3
+	.loc	2 112 17
+	fma.rn.f32 	%f35, %f34, %f33, %f29; // mean of the first 3 elements
+	.loc	2 113 15
+	add.f32 	%f36, %f25, %f32;
+	.loc	2 113 30
+	mul.f32 	%f37, %f33, %f33;
+	.loc	2 113 38
+	fma.rn.f32 	%f38, %f33, %f33, %f37;
+	.loc	2 113 22
+	fma.rn.f32 	%f39, %f34, %f38, %f36;
+	.loc	2 108 21
+	sub.f32 	%f40, %f18, %f35;
+	mov.b32 	%r52, 1082130432; // bit pattern of 4.0f; also the element count carried into the reduction
+	.loc	2 110 60
+	div.full.f32 %r50, %r45, %r52;
+	mov.b32 	%f41, %r50; // 1/4
+	.loc	2 112 17
+	fma.rn.f32 	%f42, %f41, %f40, %f35; // f42 = mean of this thread's 4 elements
+	.loc	2 113 15
+	add.f32 	%f43, %f26, %f39;
+	.loc	2 113 30
+	mul.f32 	%f44, %f40, %f40;
+	.loc	2 113 38
+	mul.f32 	%f45, %f44, 0f40400000;
+	.loc	2 113 22
+	fma.rn.f32 	%f46, %f41, %f45, %f43; // f46 = this thread's second-moment partial (later divided by 256)
+$L__tmp3:
+	.loc	2 120 46
+	mov.b32 	%r101, %f42;
+	shfl.sync.bfly.b32	%r102, %r101, 16, 31, -1; // fetch partner mean, butterfly offset 16
+	mov.b32 	%f47, %r102;
+	mov.b32 	%r103, %f46;
+	shfl.sync.bfly.b32	%r104, %r103, 16, 31, -1; // fetch partner second-moment partial
+	mov.b32 	%f48, %r104;
+	shfl.sync.bfly.b32	%r54, %r52, 16, 31, -1; // fetch partner count (4.0)
+	mov.b32 	%f49, %r54;
+$L__tmp4:
+	.loc	2 108 21
+	sub.f32 	%f50, %f47, %f42; // delta = partner mean - own mean
+	.loc	2 109 28
+	add.f32 	%f51, %f49, 0f40800000; // combined count = partner count + 4.0
+	.loc	2 110 39
+	setp.eq.f32 	%p37, %f51, 0f00000000; // guard against a zero combined count
+	.loc	2 110 60
+	mov.b32 	%r55, %f51;
+	div.full.f32 %r53, %r54, %r55; // partner count / combined count
+	mov.b32 	%f52, %r53;
+	.loc	2 110 49
+	selp.f32 	%f53, 0f00000000, %f52, %p37; // weight, forced to 0 when the combined count is 0
+	.loc	2 112 17
+	fma.rn.f32 	%f54, %f53, %f50, %f42; // merged mean
+	.loc	2 113 15
+	add.f32 	%f55, %f46, %f48;
+	.loc	2 113 30
+	mul.f32 	%f56, %f50, %f50;
+	.loc	2 113 38
+	mul.f32 	%f57, %f56, 0f40800000;
+	.loc	2 113 22
+	fma.rn.f32 	%f58, %f53, %f57, %f55; // merged second-moment partial
+$L__tmp5:
+	.loc	2 120 46
+	mov.b32 	%r105, %f54;
+	shfl.sync.bfly.b32	%r106, %r105, 8, 31, -1; // same merge step, butterfly offset 8
+	mov.b32 	%f59, %r106;
+	mov.b32 	%r107, %f58;
+	shfl.sync.bfly.b32	%r108, %r107, 8, 31, -1;
+	mov.b32 	%f60, %r108;
+	shfl.sync.bfly.b32	%r57, %r55, 8, 31, -1;
+	mov.b32 	%f61, %r57;
+$L__tmp6:
+	.loc	2 108 21
+	sub.f32 	%f62, %f59, %f54;
+	.loc	2 109 28
+	add.f32 	%f63, %f51, %f61;
+	.loc	2 110 39
+	setp.eq.f32 	%p38, %f63, 0f00000000;
+	.loc	2 110 60
+	mov.b32 	%r58, %f63;
+	div.full.f32 %r56, %r57, %r58;
+	mov.b32 	%f64, %r56;
+	.loc	2 110 49
+	selp.f32 	%f65, 0f00000000, %f64, %p38;
+	.loc	2 112 17
+	fma.rn.f32 	%f66, %f65, %f62, %f54;
+	.loc	2 113 15
+	add.f32 	%f67, %f58, %f60;
+	.loc	2 113 30
+	mul.f32 	%f68, %f62, %f62;
+	.loc	2 113 38
+	mul.f32 	%f69, %f51, %f68;
+	.loc	2 113 22
+	fma.rn.f32 	%f70, %f65, %f69, %f67;
+$L__tmp7:
+	.loc	2 120 46
+	mov.b32 	%r109, %f66;
+	shfl.sync.bfly.b32	%r110, %r109, 4, 31, -1; // same merge step, butterfly offset 4
+	mov.b32 	%f71, %r110;
+	mov.b32 	%r111, %f70;
+	shfl.sync.bfly.b32	%r112, %r111, 4, 31, -1;
+	mov.b32 	%f72, %r112;
+	shfl.sync.bfly.b32	%r60, %r58, 4, 31, -1;
+	mov.b32 	%f73, %r60;
+$L__tmp8:
+	.loc	2 108 21
+	sub.f32 	%f74, %f71, %f66;
+	.loc	2 109 28
+	add.f32 	%f75, %f63, %f73;
+	.loc	2 110 39
+	setp.eq.f32 	%p39, %f75, 0f00000000;
+	.loc	2 110 60
+	mov.b32 	%r61, %f75;
+	div.full.f32 %r59, %r60, %r61;
+	mov.b32 	%f76, %r59;
+	.loc	2 110 49
+	selp.f32 	%f77, 0f00000000, %f76, %p39;
+	.loc	2 112 17
+	fma.rn.f32 	%f78, %f77, %f74, %f66;
+	.loc	2 113 15
+	add.f32 	%f79, %f70, %f72;
+	.loc	2 113 30
+	mul.f32 	%f80, %f74, %f74;
+	.loc	2 113 38
+	mul.f32 	%f81, %f63, %f80;
+	.loc	2 113 22
+	fma.rn.f32 	%f82, %f77, %f81, %f79;
+$L__tmp9:
+	.loc	2 120 46
+	mov.b32 	%r113, %f78;
+	shfl.sync.bfly.b32	%r114, %r113, 2, 31, -1; // same merge step, butterfly offset 2
+	mov.b32 	%f83, %r114;
+	mov.b32 	%r115, %f82;
+	shfl.sync.bfly.b32	%r116, %r115, 2, 31, -1;
+	mov.b32 	%f84, %r116;
+	shfl.sync.bfly.b32	%r63, %r61, 2, 31, -1;
+	mov.b32 	%f85, %r63;
+$L__tmp10:
+	.loc	2 108 21
+	sub.f32 	%f86, %f83, %f78;
+	.loc	2 109 28
+	add.f32 	%f87, %f75, %f85;
+	.loc	2 110 39
+	setp.eq.f32 	%p40, %f87, 0f00000000;
+	.loc	2 110 60
+	mov.b32 	%r64, %f87;
+	div.full.f32 %r62, %r63, %r64;
+	mov.b32 	%f88, %r62;
+	.loc	2 110 49
+	selp.f32 	%f89, 0f00000000, %f88, %p40;
+	.loc	2 112 17
+	fma.rn.f32 	%f90, %f89, %f86, %f78;
+	.loc	2 113 15
+	add.f32 	%f91, %f82, %f84;
+	.loc	2 113 30
+	mul.f32 	%f92, %f86, %f86;
+	.loc	2 113 38
+	mul.f32 	%f93, %f75, %f92;
+	.loc	2 113 22
+	fma.rn.f32 	%f94, %f89, %f93, %f91;
+$L__tmp11:
+	.loc	2 120 46
+	mov.b32 	%r117, %f90;
+	shfl.sync.bfly.b32	%r118, %r117, 1, 31, -1; // same merge step, butterfly offset 1
+	mov.b32 	%f95, %r118;
+	mov.b32 	%r119, %f94;
+	shfl.sync.bfly.b32	%r120, %r119, 1, 31, -1;
+	mov.b32 	%f96, %r120;
+	shfl.sync.bfly.b32	%r66, %r64, 1, 31, -1;
+	mov.b32 	%f97, %r66;
+$L__tmp12:
+	.loc	2 108 21
+	sub.f32 	%f98, %f95, %f90;
+	.loc	2 109 28
+	add.f32 	%f99, %f87, %f97;
+	.loc	2 110 39
+	setp.eq.f32 	%p41, %f99, 0f00000000;
+	.loc	2 110 60
+	mov.b32 	%r67, %f99;
+	div.full.f32 %r65, %r66, %r67;
+	mov.b32 	%f100, %r65;
+	.loc	2 110 49
+	selp.f32 	%f101, 0f00000000, %f100, %p41;
+	.loc	2 112 17
+	fma.rn.f32 	%f102, %f98, %f101, %f90; // f102 = mean after the last in-warp merge
+	.loc	2 113 15
+	add.f32 	%f103, %f94, %f96;
+	.loc	2 113 30
+	mul.f32 	%f104, %f98, %f98;
+	.loc	2 113 38
+	mul.f32 	%f105, %f87, %f104;
+	.loc	2 113 22
+	fma.rn.f32 	%f106, %f101, %f105, %f103; // f106 = second-moment partial after the last in-warp merge
+$L__tmp13:
+	.loc	2 120 46
+	setp.eq.s32 	%p18, %r2, 0; // p18 = lane 0 of each warp
+	shl.b32 	%r121, %r5, 2;
+	shl.b32 	%r122, %r3, 3;
+	or.b32  	%r123, %r122, %r121; // byte offset 8 * row-in-CTA + 4 * warp-in-row
+	mov.u32 	%r124, global_smem;
+	add.s32 	%r68, %r124, %r123; // slot for this warp's mean
+	mov.b32 	%r69, %f102;
+	@%p18 st.shared.b32 [ %r68 + 0 ], %r69; // lane 0 publishes the warp mean
+	add.s32 	%r125, %r124, 16; // second-moment slots start 16 bytes in
+	add.s32 	%r70, %r125, %r123;
+	mov.b32 	%r71, %f106;
+	@%p18 st.shared.b32 [ %r70 + 0 ], %r71; // lane 0 publishes the warp second-moment partial
+	add.s32 	%r126, %r124, 32; // count slots start 32 bytes in
+	add.s32 	%r72, %r126, %r123;
+	@%p18 st.shared.b32 [ %r72 + 0 ], %r67; // lane 0 publishes the warp count
+	bar.sync 	0; // all warps' partials are visible after this barrier
+	setp.lt.s32 	%p21, %r1, 4; // p21 = tid < 4, one thread per published slot
+	add.s32 	%r75, %r124, %r24; // slot address at byte offset tid * 4
+	@%p21 ld.shared.b32 %r74, [ %r75 + 0 ]; // warp mean
+	mov.b32 	%f107, %r74;
+	add.s32 	%r77, %r125, %r24;
+	@%p21 ld.shared.b32 %r76, [ %r77 + 0 ]; // warp second-moment partial
+	mov.b32 	%f108, %r76;
+	add.s32 	%r79, %r126, %r24;
+	@%p21 ld.shared.b32 %r78, [ %r79 + 0 ]; // warp count
+	mov.b32 	%f109, %r78;
+	shfl.sync.bfly.b32	%r128, %r74, 1, 31, -1; // pair up the two warps of a row, butterfly offset 1
+	mov.b32 	%f110, %r128;
+	shfl.sync.bfly.b32	%r129, %r76, 1, 31, -1;
+	mov.b32 	%f111, %r129;
+	shfl.sync.bfly.b32	%r81, %r78, 1, 31, -1;
+	mov.b32 	%f112, %r81;
+$L__tmp14:
+	.loc	2 108 21
+	sub.f32 	%f113, %f110, %f107;
+	.loc	2 109 28
+	add.f32 	%f114, %f109, %f112;
+	.loc	2 110 39
+	setp.eq.f32 	%p42, %f114, 0f00000000;
+	.loc	2 110 60
+	mov.b32 	%r82, %f114;
+	div.full.f32 %r80, %r81, %r82;
+	mov.b32 	%f115, %r80;
+	.loc	2 110 49
+	selp.f32 	%f116, 0f00000000, %f115, %p42;
+	.loc	2 112 17
+	fma.rn.f32 	%f117, %f113, %f116, %f107; // row mean
+	.loc	2 113 15
+	add.f32 	%f118, %f108, %f111;
+	.loc	2 113 30
+	mul.f32 	%f119, %f113, %f113;
+	.loc	2 113 38
+	mul.f32 	%f120, %f109, %f119;
+	.loc	2 113 22
+	fma.rn.f32 	%f121, %f120, %f116, %f118; // row second-moment total
+$L__tmp15:
+	.loc	2 120 46
+	setp.eq.s32 	%p43, %r4, 0;
+	and.pred  	%p24, %p21, %p43; // p24 = tid < 4 and tid even
+	mov.b32 	%r84, %f117;
+	@%p24 st.shared.b32 [ %r75 + 0 ], %r84; // write the merged row mean back to the slot
+	mov.b32 	%r86, %f121;
+	@%p24 st.shared.b32 [ %r77 + 0 ], %r86; // write the merged second-moment total back
+	@%p24 st.shared.b32 [ %r79 + 0 ], %r82; // write the merged count back
+	bar.sync 	0; // merged row statistics are visible after this barrier
+	add.s32 	%r130, %r124, %r122;
+	ld.shared.f32 	%f5, [%r130]; // f5 = mean of this thread's row
+	add.s32 	%r131, %r125, %r122;
+	ld.shared.f32 	%f6, [%r131]; // f6 = second-moment total of this thread's row
+$L__tmp16:
+	.loc	1 59 51
+	mov.u32 %r89, 0x0;
+	mov.u32 %r90, 0x0;
+	mov.u32 %r91, 0x0;
+	mov.u32 %r92, 0x0;
+	@%p44 ld.global.L1::evict_last.v4.b32 { %r89, %r90, %r91, %r92 }, [ %rd33 + 0 ]; // second read of the same 4 param_2 values
+	@!%p44 mov.u32 %r89, %r137;
+	@!%p44 mov.u32 %r90, %r137;
+	@!%p44 mov.u32 %r91, %r137;
+	@!%p44 mov.u32 %r92, %r137;
+	.loc	1 60 35
+	mul.wide.u32 	%rd41, %r7, 4;
+	add.s64 	%rd34, %rd6, %rd41; // rd34 = param_3 + 4 * r7
+	.loc	1 60 40
+	mov.u32 %r97, 0x0;
+	mov.u32 %r98, 0x0;
+	@%p44 ld.global.L1::evict_last.v2.b32 { %r97, %r98 }, [ %rd34 + 0 ]; // 2 consecutive multipliers
+	@!%p44 mov.u32 %r97, %r137;
+	@!%p44 mov.u32 %r98, %r137;
+	.loc	1 64 57
+	@%p12 bra 	$L__BB0_4; // same in-range predicate as the first check
+	mov.u64 	%rd42, assertMessage_1; // its bytes spell "index out of bounds: 0 <= tmp13 < 50257"
+	cvta.global.u64 	%rd43, %rd42;
+	mov.u64 	%rd44, assertFile_1;
+	cvta.global.u64 	%rd45, %rd44;
+	mov.u64 	%rd46, assertFunc_1;
+	cvta.global.u64 	%rd47, %rd46;
+	{ // callseq 5, 0
+	.reg .b32 temp_param_reg;
+	.param .b64 param0;
+	st.param.b64 	[param0+0], %rd43;
+	.param .b64 param1;
+	st.param.b64 	[param1+0], %rd45;
+	.param .b32 param2;
+	st.param.b32 	[param2+0], %r168;
+	.param .b64 param3;
+	st.param.b64 	[param3+0], %rd47;
+	.param .b64 param4;
+	st.param.b64 	[param4+0], %rd52;
+	call.uni
+	__assertfail,
+	(
+	param0,
+	param1,
+	param2,
+	param3,
+	param4
+	);
+	} // callseq 5
+$L__BB0_4:
+	.loc	1 65 54
+	mov.u32 %r133, 0x0;
+	mov.u32 %r134, 0x0;
+	mov.u32 %r135, 0x0;
+	mov.u32 %r136, 0x0;
+	@%p44 ld.global.L1::evict_first.v4.b32 { %r133, %r134, %r135, %r136 }, [ %rd49 + 0 ]; // second read of the gathered row values, this time evict_first
+	@!%p44 mov.u32 %r133, %r137;
+	@!%p44 mov.u32 %r134, %r137;
+	@!%p44 mov.u32 %r135, %r137;
+	@!%p44 mov.u32 %r136, %r137;
+	.loc	1 69 23
+	mov.b32 	%r142, %f6;
+	mov.b32 	%r143, 1132462080; // bit pattern of 256.0f
+	div.full.f32 %r141, %r142, %r143; // second-moment total / 256
+	mov.b32 	%f122, %r141;
+	.loc	1 71 24
+	add.f32 	%f123, %f122, 0f3727C5AC; // add a small constant, roughly 1e-5
+	.loc	1 72 30
+	rsqrt.approx.ftz.f32 	%f124, %f123; // f124 = approximate 1 / sqrt(f123)
+	.loc	1 65 54
+	mov.b32 	%f125, %r136;
+	.loc	1 59 51
+	mov.b32 	%f126, %r92;
+	.loc	1 66 24
+	add.f32 	%f127, %f126, %f125; // rebuild the summed value for column r6 + 3
+	.loc	1 67 24
+	sub.f32 	%f128, %f127, %f5; // subtract the row mean
+	.loc	1 65 54
+	mov.b32 	%f129, %r135;
+	.loc	1 59 51
+	mov.b32 	%f130, %r91;
+	.loc	1 66 24
+	add.f32 	%f131, %f130, %f129;
+	.loc	1 67 24
+	sub.f32 	%f132, %f131, %f5;
+	.loc	1 65 54
+	mov.b32 	%f133, %r134;
+	.loc	1 59 51
+	mov.b32 	%f134, %r90;
+	.loc	1 66 24
+	add.f32 	%f135, %f134, %f133;
+	.loc	1 67 24
+	sub.f32 	%f136, %f135, %f5;
+	.loc	1 65 54
+	mov.b32 	%f137, %r133;
+	.loc	1 59 51
+	mov.b32 	%f138, %r89;
+	.loc	1 66 24
+	add.f32 	%f139, %f138, %f137;
+	.loc	1 67 24
+	sub.f32 	%f140, %f139, %f5;
+	.loc	1 73 24
+	mul.f32 	%f141, %f140, %f124; // (value - mean) * f124, column r6 + 0
+	mul.f32 	%f142, %f136, %f124;
+	mul.f32 	%f143, %f132, %f124;
+	mul.f32 	%f144, %f128, %f124;
+	.loc	1 74 24
+	bar.sync 	0; // barrier before global_smem is overwritten with the multipliers
+	shl.b32 	%r159, %r7, 2;
+	add.s32 	%r161, %r124, %r159;
+	st.shared.v2.u32 	[%r161], {%r97, %r98}; // each thread stores its 2 multipliers at byte offset 4 * r7
+	bar.sync 	0;
+	shl.b32 	%r162, %r6, 2;
+	add.s32 	%r163, %r124, %r162;
+	ld.shared.v4.f32 	{%f145, %f146, %f147, %f148}, [%r163]; // read the 4 multipliers for columns r6 .. r6 + 3
+	mul.f32 	%f149, %f141, %f145; // apply the per-column multiplier
+	mul.f32 	%f150, %f142, %f146;
+	mul.f32 	%f151, %f143, %f147;
+	mul.f32 	%f152, %f144, %f148;
+	.loc	1 76 39
+	shl.b32 	%r164, %r8, 8; // row * 256
+	.loc	1 76 35
+	or.b32  	%r165, %r164, %r6;
+	.loc	1 76 29
+	mul.wide.s32 	%rd51, %r165, 2; // 2 bytes per output element
+	add.s64 	%rd50, %rd7, %rd51; // rd50 = param_4 + 2 * ((r8 << 8) | r6)
+	.loc	1 76 52
+	mov.b32 	%r153, %f149;
+	cvt.rn.bf16.f32 %rs1, %r153; // f32 -> bf16, round-to-nearest
+	mov.b32 	%r154, %f150;
+	cvt.rn.bf16.f32 %rs2, %r154;
+	mov.b32 	%r155, %f151;
+	cvt.rn.bf16.f32 %rs3, %r155;
+	mov.b32 	%r156, %f152;
+	cvt.rn.bf16.f32 %rs4, %r156;
+	mov.b32 	%r166, {%rs1, %rs2}; // pack two 16-bit results into one 32-bit register
+	mov.b32 	%r167, {%rs3, %rs4};
+	@%p44 st.global.v2.b32 [ %rd50 + 0 ], { %r166, %r167 }; // store 4 bf16 values (8 bytes) to the output
+	.loc	1 55 4
+	ret;
+$L__tmp17:
+$L__func_end0:
+}
+	// .globl	__nv_rsqrtf
+.visible .func  (.param .b32 func_retval0) __nv_rsqrtf( // device function: returns an approximate reciprocal square root of its single f32 argument
+	.param .b32 __nv_rsqrtf_param_0 // the f32 input, passed as a 32-bit parameter
+)
+{
+	.reg .f32 	%f<3>;
+$L__func_begin1:
+	ld.param.f32 	%f1, [__nv_rsqrtf_param_0]; // f1 = input
+	rsqrt.approx.ftz.f32 	%f2, %f1; // f2 = approximate 1/sqrt(f1); only the .ftz form is emitted here
+	st.param.f32 	[func_retval0+0], %f2; // write the result to the return slot
+	ret;
+$L__func_end1:
+}
+	.file	1 "/tmp/torchinductor_root/gx/cgx5lxpuexpindj4dsmjz5x42uhyy7iskevq7ovzpwagb3t5powj.py"
+	.file	2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
+	.section	.debug_abbrev
+	{
+.b8 1
+.b8 17
+.b8 1
+.b8 37
+.b8 8
+.b8 19
+.b8 5
+.b8 3
+.b8 8
+.b8 16
+.b8 6
+.b8 27
+.b8 8
+.b8 180
+.b8 66
+.b8 12
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+.b8 0
+.b8 2
+.b8 46
+.b8 0
+.b8 135
+.b8 64
+.b8 8
+.b8 3
+.b8 8
+.b8 58
+.b8 11
+.b8 59
+.b8 11
+.b8 63
+.b8 12
+.b8 32
+.b8 11
+.b8 0
+.b8 0
+.b8 3
+.b8 46
+.b8 1
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 64
+.b8 10
+.b8 49
+.b8 19
+.b8 0
+.b8 0
+.b8 4
+.b8 29
+.b8 0
+.b8 49
+.b8 19
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 88
+.b8 11
+.b8 89
+.b8 11
+.b8 87
+.b8 11
+.b8 0
+.b8 0
+.b8 5
+.b8 29
+.b8 1
+.b8 49
+.b8 19
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 88
+.b8 11
+.b8 89
+.b8 11
+.b8 87
+.b8 11
+.b8 0
+.b8 0
+.b8 0
+	}
+	.section	.debug_info
+	{
+.b32 298
+.b8 2
+.b8 0
+.b32 .debug_abbrev
+.b8 8
+.b8 1
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2
+.b8 0
+.b8 99
+.b8 103
+.b8 120
+.b8 53
+.b8 108
+.b8 120
+.b8 112
+.b8 117
+.b8 101
+.b8 120
+.b8 112
+.b8 105
+.b8 110
+.b8 100
+.b8 106
+.b8 52
+.b8 100
+.b8 115
+.b8 109
+.b8 106
+.b8 122
+.b8 53
+.b8 120
+.b8 52
+.b8 50
+.b8 117
+.b8 104
+.b8 121
+.b8 121
+.b8 55
+.b8 105
+.b8 115
+.b8 107
+.b8 101
+.b8 118
+.b8 113
+.b8 55
+.b8 111
+.b8 118
+.b8 122
+.b8 112
+.b8 119
+.b8 97
+.b8 103
+.b8 98
+.b8 51
+.b8 116
+.b8 53
+.b8 112
+.b8 111
+.b8 119
+.b8 106
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line
+.b8 47
+.b8 116
+.b8 109
+.b8 112
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 95
+.b8 114
+.b8 111
+.b8 111
+.b8 116
+.b8 47
+.b8 103
+.b8 120
+.b8 0
+.b8 1
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 2
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 51
+.b8 100
+.b8 52
+.b8 100
+.b8 53
+.b8 100
+.b8 101
+.b8 54
+.b8 100
+.b8 101
+.b8 0
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 51
+.b8 100
+.b8 52
+.b8 100
+.b8 53
+.b8 100
+.b8 101
+.b8 54
+.b8 100
+.b8 101
+.b8 0
+.b8 1
+.b8 18
+.b8 1
+.b8 1
+.b8 3
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 1
+.b8 156
+.b32 125
+.b8 4
+.b32 125
+.b64 $L__tmp1
+.b64 $L__tmp2
+.b8 2
+.b8 44
+.b8 38
+.b8 5
+.b32 125
+.b64 $L__tmp2
+.b64 $L__tmp15
+.b8 2
+.b8 50
+.b8 41
+.b8 4
+.b32 125
+.b64 $L__tmp2
+.b64 $L__tmp15
+.b8 2
+.b8 120
+.b8 46
+.b8 0
+.b8 4
+.b32 125
+.b64 $L__tmp3
+.b64 $L__tmp16
+.b8 2
+.b8 50
+.b8 41
+.b8 0
+.b8 0
+	}
+	.section	.debug_pubnames
+	{
+.b32 $L__pubNames_end0-$L__pubNames_start0
+$L__pubNames_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 302
+.b32 125
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 51
+.b8 100
+.b8 52
+.b8 100
+.b8 53
+.b8 100
+.b8 101
+.b8 54
+.b8 100
+.b8 101
+.b8 0
+.b32 0
+$L__pubNames_end0:
+	}
+	.section	.debug_pubtypes  // DWARF public-types table; this one carries a header and no name entries
+	{
+.b32 $L__pubTypes_end0-$L__pubTypes_start0  // unit length: byte count between the two labels below
+$L__pubTypes_start0:
+.b8 2  // 2-byte version field (value 2), low byte first
+.b8 0
+.b32 .debug_info  // reference to the .debug_info unit this table describes
+.b32 302  // presumably the byte length of that .debug_info unit -- TODO confirm
+.b32 0  // zero word directly after the header: no type entries are listed
+$L__pubTypes_end0:
+	}
+	.section	.debug_loc	{	}  // location-list section, emitted with an empty body

.triton/dump/1ed98b0d136db679153ca6a42fff755c/triton_.ttir ADDED Viewed

	@@ -0,0 +1,104 @@

+module {  // Triton IR dump of one kernel: gather a 256-wide f32 row by index, add a second row, normalize the row with mean/variance, scale per column, store as bf16
+  tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {  // arg0: i64 row indices; arg1: f32 table gathered by index; arg2: f32 table addressed by (row % 512); arg3: f32 per-column scale; arg4: bf16 output; arg5/arg6 are not referenced in the body
+    %cst = arith.constant dense<1.000000e+00> : tensor<1x256xf32>
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x256xf32>
+    %cst_1 = arith.constant 0.000000e+00 : f32
+    %cst_2 = arith.constant dense<256> : tensor<2x1xi64>  // row stride (elements) used for the arg1 gather
+    %cst_3 = arith.constant dense<50257> : tensor<2x1xi64>  // exclusive upper bound for indices; also added to wrap negative indices
+    %cst_4 = arith.constant dense<0> : tensor<2x1xi64>
+    %cst_5 = arith.constant dense<9.99999974E-6> : tensor<2x1xf32>  // epsilon added before the reciprocal square root
+    %cst_6 = arith.constant dense<2.560000e+02> : tensor<2x1xf32>  // divisor applied to the second reduce output (row length as f32)
+    %cst_7 = arith.constant dense<0.000000e+00> : tensor<2x256xf32>
+    %cst_8 = arith.constant dense<256> : tensor<2x1xi32>
+    %cst_9 = arith.constant dense<256> : tensor<1x256xi32>
+    %cst_10 = arith.constant dense<512> : tensor<2x1xi32>  // modulus applied to the row number when addressing arg2
+    %c2_i32 = arith.constant 2 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c2_i32 : i32  // each program instance covers 2 consecutive rows
+    %2 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32>
+    %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<2xi32>) -> tensor<2x1xi32>
+    %4 = tt.splat %1 : (i32) -> tensor<2x1xi32>
+    %5 = arith.addi %4, %3 : tensor<2x1xi32>  // %5: global row numbers handled by this program, shape 2x1
+    %6 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
+    %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<256xi32>) -> tensor<1x256xi32>  // %7: column numbers 0..255, shape 1x256
+    %8 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<2x1x!tt.ptr<i64, 1>>
+    %9 = tt.addptr %8, %5 : tensor<2x1x!tt.ptr<i64, 1>>, tensor<2x1xi32>
+    %10 = tt.load %9 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x1xi64>  // %10: one i64 index per row (unmasked load)
+    %11 = arith.remsi %5, %cst_10 : tensor<2x1xi32>  // row % 512
+    %12 = arith.cmpi slt, %7, %cst_9 : tensor<1x256xi32>  // column mask (col < 256); every column of the 0..255 range passes
+    %13 = arith.muli %11, %cst_8 : tensor<2x1xi32>
+    %14 = tt.broadcast %7 : (tensor<1x256xi32>) -> tensor<2x256xi32>
+    %15 = tt.broadcast %13 : (tensor<2x1xi32>) -> tensor<2x256xi32>
+    %16 = arith.addi %14, %15 : tensor<2x256xi32>  // arg2 element offset: col + 256 * (row % 512)
+    %17 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<2x256x!tt.ptr<f32, 1>>
+    %18 = tt.addptr %17, %16 : tensor<2x256x!tt.ptr<f32, 1>>, tensor<2x256xi32>
+    %19 = tt.broadcast %12 : (tensor<1x256xi1>) -> tensor<2x256xi1>
+    %20 = tt.load %18, %19, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xf32>
+    %21 = arith.addi %10, %cst_3 : tensor<2x1xi64>
+    %22 = arith.cmpi slt, %10, %cst_4 : tensor<2x1xi64>
+    %23 = arith.select %22, %21, %10 : tensor<2x1xi1>, tensor<2x1xi64>  // negative indices are shifted up by 50257
+    %24 = arith.cmpi sge, %23, %cst_4 : tensor<2x1xi64>
+    %25 = arith.cmpi slt, %23, %cst_3 : tensor<2x1xi64>
+    %26 = arith.andi %24, %25 : tensor<2x1xi1>  // %26: 0 <= index < 50257
+    tt.assert %26, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<2x1xi1>
+    %27 = arith.muli %23, %cst_2 : tensor<2x1xi64>
+    %28 = tt.broadcast %27 : (tensor<2x1xi64>) -> tensor<2x256xi64>
+    %29 = arith.extsi %7 : tensor<1x256xi32> to tensor<1x256xi64>
+    %30 = tt.broadcast %29 : (tensor<1x256xi64>) -> tensor<2x256xi64>
+    %31 = arith.addi %30, %28 : tensor<2x256xi64>  // arg1 element offset: col + 256 * index (64-bit)
+    %32 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<2x256x!tt.ptr<f32, 1>>
+    %33 = tt.addptr %32, %31 : tensor<2x256x!tt.ptr<f32, 1>>, tensor<2x256xi64>
+    %34 = tt.load %33, %19, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xf32>
+    %35 = arith.addf %34, %20 : tensor<2x256xf32>  // x = gathered arg1 row + arg2 row
+    %36 = arith.addf %35, %cst_7 : tensor<2x256xf32>  // first accumulator seeded with x + 0
+    %37 = arith.subf %35, %36 : tensor<2x256xf32>
+    %38 = arith.mulf %35, %37 : tensor<2x256xf32>
+    %39 = arith.addf %38, %cst_7 : tensor<2x256xf32>  // second accumulator seeded with x * (x - first) + 0
+    %40 = arith.select %19, %36, %cst_7 : tensor<2x256xi1>, tensor<2x256xf32>
+    %41 = arith.select %19, %39, %cst_7 : tensor<2x256xi1>, tensor<2x256xf32>
+    %42 = arith.select %12, %cst, %cst_0 : tensor<1x256xi1>, tensor<1x256xf32>  // per-element weight: 1 where the mask holds, else 0
+    %43 = tt.broadcast %42 : (tensor<1x256xf32>) -> tensor<2x256xf32>
+    %44:3 = "tt.reduce"(%40, %41, %43) <{axis = 1 : i32}> ({  // row-wise reduction over (first, second, weight) triples; the combine below has the shape of a pairwise mean / sum-of-squared-deviations (Welford-style) merge
+    ^bb0(%arg7: f32, %arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32):  // %arg7..%arg9 = left triple, %arg10..%arg12 = right triple
+      %68 = arith.subf %arg10, %arg7 : f32  // delta = right.first - left.first
+      %69 = arith.addf %arg9, %arg12 : f32  // combined weight
+      %70 = arith.cmpf oeq, %69, %cst_1 : f32
+      %71 = arith.divf %arg12, %69 : f32
+      %72 = arith.select %70, %cst_1, %71 : f32  // ratio = right.weight / combined weight, forced to 0 when the combined weight is 0
+      %73 = arith.mulf %68, %72 : f32
+      %74 = arith.addf %arg7, %73 : f32  // merged first = left.first + delta * ratio
+      %75 = arith.addf %arg8, %arg11 : f32
+      %76 = arith.mulf %68, %68 : f32
+      %77 = arith.mulf %76, %arg9 : f32
+      %78 = arith.mulf %77, %72 : f32
+      %79 = arith.addf %75, %78 : f32  // merged second = left.second + right.second + delta^2 * left.weight * ratio
+      tt.reduce.return %74, %79, %69 : f32, f32, f32
+    }) : (tensor<2x256xf32>, tensor<2x256xf32>, tensor<2x256xf32>) -> (tensor<2xf32>, tensor<2xf32>, tensor<2xf32>)
+    %45 = tt.expand_dims %44#0 {axis = 1 : i32} : (tensor<2xf32>) -> tensor<2x1xf32>  // per-row first result (used as the mean below)
+    %46 = tt.expand_dims %44#1 {axis = 1 : i32} : (tensor<2xf32>) -> tensor<2x1xf32>  // per-row second result (divided by 256 below); %44#2 is not used
+    %47 = tt.load %18, %19, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xf32>  // second read of the same arg2 addresses as %20
+    %48 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<1x256x!tt.ptr<f32, 1>>
+    %49 = tt.addptr %48, %7 : tensor<1x256x!tt.ptr<f32, 1>>, tensor<1x256xi32>
+    %50 = tt.load %49, %12, %cst_0 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32>  // per-column scale from arg3
+    tt.assert %26, "index out of bounds: 0 <= tmp13 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<2x1xi1>
+    %51 = tt.load %33, %19, %cst_7 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<2x256xf32>  // second read of the same arg1 addresses as %34, with a different evict attribute
+    %52 = arith.addf %51, %47 : tensor<2x256xf32>
+    %53 = tt.broadcast %45 : (tensor<2x1xf32>) -> tensor<2x256xf32>
+    %54 = arith.subf %52, %53 : tensor<2x256xf32>  // x - mean
+    %55 = arith.divf %46, %cst_6 : tensor<2x1xf32>  // second result / 256
+    %56 = arith.addf %55, %cst_5 : tensor<2x1xf32>  // + epsilon
+    %57 = tt.extern_elementwise %56 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32>) -> tensor<2x1xf32>
+    %58 = tt.broadcast %57 : (tensor<2x1xf32>) -> tensor<2x256xf32>
+    %59 = arith.mulf %54, %58 : tensor<2x256xf32>  // (x - mean) * rsqrt(...)
+    %60 = tt.broadcast %50 : (tensor<1x256xf32>) -> tensor<2x256xf32>
+    %61 = arith.mulf %59, %60 : tensor<2x256xf32>  // multiplied by the per-column scale; no additive term is applied in this kernel
+    %62 = arith.muli %5, %cst_8 : tensor<2x1xi32>
+    %63 = tt.broadcast %62 : (tensor<2x1xi32>) -> tensor<2x256xi32>
+    %64 = arith.addi %14, %63 : tensor<2x256xi32>  // output element offset: col + 256 * row
+    %65 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<2x256x!tt.ptr<bf16, 1>>
+    %66 = tt.addptr %65, %64 : tensor<2x256x!tt.ptr<bf16, 1>>, tensor<2x256xi32>
+    %67 = arith.truncf %61 : tensor<2x256xf32> to tensor<2x256xbf16>
+    tt.store %66, %67, %19 {cache = 1 : i32, evict = 1 : i32} : tensor<2x256xbf16>
+    tt.return
+  }
+}

.triton/dump/305a9479aab997a3a16bfe46bb303a50/triton_.llir ADDED Viewed

	@@ -0,0 +1,524 @@

+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+@assertFunc_1 = internal constant [25 x i8] c"_call_with_frames_removed"  ; function-name string passed to the second __assertfail call
+@assertFile_1 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"  ; file-name string passed to the second __assertfail call
+@assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp16 < 50257"  ; message for the second bounds assertion
+@assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed"  ; function-name string passed to the first __assertfail call
+@assertFile_0 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"  ; file-name string passed to the first __assertfail call
+@assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257"  ; message for the first bounds assertion
+@global_smem = external addrspace(3) global [0 x i8]  ; address-space-3 scratch buffer the kernel uses to exchange per-warp partial results
+@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1  ; key string handed to __nvvm_reflect to pick the rsqrt variant
+declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr  ; called with (message, file, line, function, i64 1) when a bounds check fails
+define void @triton__0d1d2d3d4d5d6de7de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, i32 %6, i32 %7) local_unnamed_addr !dbg !7 {  ; Kernel: one 256-element row per block. Sums three inputs (f32 row of %1 gathered by the index in %0, f32 row of %2 at block % 512, bf16 row of %3), reduces mean / sum of squared deviations across the row, then stores (x - mean) * rsqrt(M2/256 + eps) * %4[col] to %5 as bf16. The i32 arguments %6 and %7 are not referenced in the body.
+  %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
+  %10 = and i32 %9, 31, !dbg !10  ; lane within the warp
+  %11 = lshr i32 %9, 5, !dbg !10
+  %12 = and i32 %11, 1, !dbg !10  ; warp number, masked to one bit (presumably the kernel runs with 2 warps -- TODO confirm launch config)
+  %urem = shl i32 %9, 2, !dbg !10
+  %13 = and i32 %urem, 252, !dbg !10  ; first column handled by this thread: (tid * 4) & 252, i.e. 4 consecutive columns per thread
+  %14 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !11  ; block index, used as the row number
+  %15 = sext i32 %14 to i64, !dbg !12
+  %16 = getelementptr i64, ptr addrspace(1) %0, i64 %15, !dbg !12
+  %17 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %16, i1 true) #6, !dbg !13  ; i64 index for this row; the next four calls load the same address again, and only %17 and %21 are consumed later
+  %18 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %16, i1 true) #6, !dbg !13
+  %19 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %16, i1 true) #6, !dbg !13
+  %20 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %16, i1 true) #6, !dbg !13
+  %21 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %16, i1 true) #6, !dbg !13
+  %22 = srem i32 %14, 512, !dbg !14  ; row % 512
+  %23 = shl nsw i32 %22, 8, !dbg !15  ; * 256
+  %24 = or i32 %23, %13, !dbg !16  ; the low 8 bits of %23 are zero, so OR acts as ADD of the column offset
+  %25 = sext i32 %24 to i64, !dbg !17
+  %26 = getelementptr float, ptr addrspace(1) %2, i64 %25, !dbg !17
+  %27 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %26, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !18  ; 4 x f32 from %2 (all predicates are the constant true)
+  %28 = extractvalue { i32, i32, i32, i32 } %27, 0, !dbg !18
+  %29 = extractvalue { i32, i32, i32, i32 } %27, 1, !dbg !18
+  %30 = extractvalue { i32, i32, i32, i32 } %27, 2, !dbg !18
+  %31 = extractvalue { i32, i32, i32, i32 } %27, 3, !dbg !18
+  %32 = insertelement <2 x i32> poison, i32 %29, i64 0, !dbg !18
+  %33 = insertelement <2 x i32> %32, i32 %28, i64 1, !dbg !18
+  %34 = bitcast <2 x i32> %33 to <2 x float>, !dbg !18  ; elements 1 and 0 packed as a <2 x float> (element 1 in slot 0)
+  %35 = bitcast i32 %30 to float, !dbg !18
+  %36 = bitcast i32 %31 to float, !dbg !18
+  %37 = shl i32 %14, 8, !dbg !19  ; row * 256
+  %38 = or i32 %37, %13, !dbg !20
+  %39 = sext i32 %38 to i64, !dbg !21  ; flat offset row * 256 + column, reused for the %3 load and the %5 store
+  %40 = getelementptr i16, ptr addrspace(1) %3, i64 %39, !dbg !21
+  %41 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_last.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %40, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !22  ; 2 x b32 = four 16-bit values from %3
+  %42 = extractvalue { i32, i32 } %41, 0, !dbg !22
+  %43 = extractvalue { i32, i32 } %41, 1, !dbg !22
+  %44 = trunc i32 %42 to i16, !dbg !22
+  %extelt.offset = lshr i32 %42, 16, !dbg !22
+  %45 = trunc i32 %extelt.offset to i16, !dbg !22
+  %46 = trunc i32 %43 to i16, !dbg !22
+  %extelt.offset1 = lshr i32 %43, 16, !dbg !22
+  %47 = trunc i32 %extelt.offset1 to i16, !dbg !22
+  %48 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %44) #6, !dbg !23  ; widen each bf16 value to f32
+  %49 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %45) #6, !dbg !23
+  %50 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %46) #6, !dbg !23
+  %51 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %47) #6, !dbg !23
+  %52 = add i64 %21, 50257, !dbg !24
+  %53 = icmp slt i64 %17, 0, !dbg !25
+  %54 = icmp slt i64 %21, 0, !dbg !25
+  %55 = select i1 %54, i64 %52, i64 %21, !dbg !26  ; negative indices are shifted up by 50257
+  %56 = icmp ugt i64 %55, 50256, !dbg !27  ; a single unsigned compare rejects both index < 0 and index >= 50257
+  br i1 %56, label %57, label %58, !dbg !28
+57:                                               ; preds = %8
+  tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !28
+  br label %58, !dbg !28
+58:                                               ; preds = %57, %8
+  %59 = shl i64 %17, 8, !dbg !29  ; index * 256
+  %60 = add i64 %59, 12865792, !dbg !29  ; 12865792 = 50257 * 256: the same wrap-around applied after scaling
+  %61 = select i1 %53, i64 %60, i64 %59, !dbg !29
+  %62 = zext nneg i32 %13 to i64
+  %63 = or i64 %61, %62, !dbg !30  ; both candidates for %61 are multiples of 256, so OR adds the column offset
+  %64 = getelementptr float, ptr addrspace(1) %1, i64 %63, !dbg !31
+  %65 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %64, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !32  ; 4 x f32 gathered from %1 at the indexed row
+  %66 = extractvalue { i32, i32, i32, i32 } %65, 0, !dbg !32
+  %67 = extractvalue { i32, i32, i32, i32 } %65, 1, !dbg !32
+  %68 = extractvalue { i32, i32, i32, i32 } %65, 2, !dbg !32
+  %69 = extractvalue { i32, i32, i32, i32 } %65, 3, !dbg !32
+  %70 = bitcast i32 %68 to float, !dbg !32
+  %71 = bitcast i32 %69 to float, !dbg !32
+  %72 = fadd float %35, %70, !dbg !33
+  %73 = fadd float %36, %71, !dbg !33
+  %74 = fadd float %50, %72, !dbg !34  ; x[2] = %3 value + (%2 value + %1 value)
+  %75 = fadd float %51, %73, !dbg !34  ; x[3]
+  %76 = insertelement <2 x i32> poison, i32 %67, i64 0, !dbg !32
+  %77 = insertelement <2 x i32> %76, i32 %66, i64 1, !dbg !32
+  %78 = bitcast <2 x i32> %77 to <2 x float>, !dbg !32
+  %79 = fadd <2 x float> %34, %78, !dbg !33
+  %80 = insertelement <2 x float> poison, float %49, i64 0, !dbg !34
+  %81 = insertelement <2 x float> %80, float %48, i64 1, !dbg !34
+  %82 = fadd <2 x float> %81, %79, !dbg !34  ; x[1] in slot 0 and x[0] in slot 1
+  %83 = fadd <2 x float> %82, zeroinitializer, !dbg !35  ; per-element running-mean seed: x + 0
+  %84 = fadd float %74, 0.000000e+00, !dbg !35
+  %85 = fadd float %75, 0.000000e+00, !dbg !35
+  %86 = extractelement <2 x float> %83, i64 1, !dbg !39
+  %87 = extractelement <2 x float> %82, i64 1, !dbg !43
+  %88 = fsub float %87, %86, !dbg !44
+  %89 = extractelement <2 x float> %83, i64 0, !dbg !39
+  %90 = extractelement <2 x float> %82, i64 0, !dbg !43
+  %91 = fsub float %90, %89, !dbg !44
+  %92 = fsub float %74, %84, !dbg !44
+  %93 = fsub float %75, %85, !dbg !44
+  %94 = fmul float %87, %88, !dbg !43
+  %95 = fmul float %90, %91, !dbg !43
+  %96 = fmul float %74, %92, !dbg !43
+  %97 = fmul float %75, %93, !dbg !43
+  %98 = fadd float %94, 0.000000e+00, !dbg !45  ; per-element second-accumulator seed: x * (x - mean) + 0
+  %99 = fadd float %95, 0.000000e+00, !dbg !45
+  %100 = fadd float %96, 0.000000e+00, !dbg !45
+  %101 = fadd float %97, 0.000000e+00, !dbg !45
+  %102 = fsub float %89, %86, !dbg !39  ; in-thread merge of the 4 elements starts here: delta between elements 1 and 0
+  %103 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 2.000000e+00) #6, !dbg !46  ; merge ratio 1/2 (weights 1 and 1)
+  %104 = fmul float %103, %102, !dbg !47
+  %105 = fadd float %86, %104, !dbg !48
+  %106 = fadd float %98, %99, !dbg !49
+  %107 = fmul float %102, %102, !dbg !50
+  %108 = fmul float %103, %107, !dbg !51
+  %109 = fadd float %108, %106, !dbg !52
+  %110 = fsub float %84, %105, !dbg !39
+  %111 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 3.000000e+00) #6, !dbg !46  ; merge ratio 1/3 (accumulated weight 2, new weight 1)
+  %112 = fmul float %111, %110, !dbg !47
+  %113 = fadd float %105, %112, !dbg !48
+  %114 = fadd float %100, %109, !dbg !49
+  %115 = fmul float %110, %110, !dbg !50
+  %116 = fmul float %115, 2.000000e+00, !dbg !53
+  %117 = fmul float %111, %116, !dbg !51
+  %118 = fadd float %114, %117, !dbg !52
+  %119 = fsub float %85, %113, !dbg !39
+  %120 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 4.000000e+00) #6, !dbg !46  ; merge ratio 1/4 (accumulated weight 3, new weight 1)
+  %121 = fmul float %120, %119, !dbg !47
+  %122 = fadd float %113, %121, !dbg !48  ; %122: mean of this thread's 4 elements
+  %123 = fadd float %101, %118, !dbg !49
+  %124 = fmul float %119, %119, !dbg !50
+  %125 = fmul float %124, 3.000000e+00, !dbg !53
+  %126 = fmul float %120, %125, !dbg !51
+  %127 = fadd float %123, %126, !dbg !52  ; %127: second accumulator (M2) of this thread's 4 elements
+  %128 = bitcast float %122 to i32, !dbg !54
+  %129 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %128, i32 16, i32 31), !dbg !54  ; butterfly exchange with lane ^ 16; the same merge repeats below for offsets 8, 4, 2, 1
+  %130 = bitcast i32 %129 to float, !dbg !54
+  %131 = bitcast float %127 to i32, !dbg !54
+  %132 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %131, i32 16, i32 31), !dbg !54
+  %133 = bitcast i32 %132 to float, !dbg !54
+  %134 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 1082130432, i32 16, i32 31), !dbg !54  ; 1082130432 = 0x40800000, the bit pattern of 4.0f: the per-thread weight
+  %135 = bitcast i32 %134 to float, !dbg !54
+  %136 = fsub float %130, %122, !dbg !39
+  %137 = fadd float %135, 4.000000e+00, !dbg !56
+  %138 = fcmp oeq float %137, 0.000000e+00, !dbg !57
+  %139 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %135, float %137) #6, !dbg !46
+  %140 = select i1 %138, float 0.000000e+00, float %139, !dbg !58  ; ratio = partner weight / combined weight, forced to 0 when the combined weight is 0
+  %141 = fmul float %140, %136, !dbg !47
+  %142 = fadd float %122, %141, !dbg !48
+  %143 = fadd float %127, %133, !dbg !49
+  %144 = fmul float %136, %136, !dbg !50
+  %145 = fmul float %144, 4.000000e+00, !dbg !53
+  %146 = fmul float %140, %145, !dbg !51
+  %147 = fadd float %143, %146, !dbg !52
+  %148 = bitcast float %142 to i32, !dbg !54
+  %149 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %148, i32 8, i32 31), !dbg !54
+  %150 = bitcast i32 %149 to float, !dbg !54
+  %151 = bitcast float %147 to i32, !dbg !54
+  %152 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %151, i32 8, i32 31), !dbg !54
+  %153 = bitcast i32 %152 to float, !dbg !54
+  %154 = bitcast float %137 to i32, !dbg !54
+  %155 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %154, i32 8, i32 31), !dbg !54
+  %156 = bitcast i32 %155 to float, !dbg !54
+  %157 = fsub float %150, %142, !dbg !39
+  %158 = fadd float %137, %156, !dbg !56
+  %159 = fcmp oeq float %158, 0.000000e+00, !dbg !57
+  %160 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %156, float %158) #6, !dbg !46
+  %161 = select i1 %159, float 0.000000e+00, float %160, !dbg !58
+  %162 = fmul float %161, %157, !dbg !47
+  %163 = fadd float %142, %162, !dbg !48
+  %164 = fadd float %147, %153, !dbg !49
+  %165 = fmul float %157, %157, !dbg !50
+  %166 = fmul float %137, %165, !dbg !53
+  %167 = fmul float %161, %166, !dbg !51
+  %168 = fadd float %164, %167, !dbg !52
+  %169 = bitcast float %163 to i32, !dbg !54
+  %170 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %169, i32 4, i32 31), !dbg !54
+  %171 = bitcast i32 %170 to float, !dbg !54
+  %172 = bitcast float %168 to i32, !dbg !54
+  %173 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %172, i32 4, i32 31), !dbg !54
+  %174 = bitcast i32 %173 to float, !dbg !54
+  %175 = bitcast float %158 to i32, !dbg !54
+  %176 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %175, i32 4, i32 31), !dbg !54
+  %177 = bitcast i32 %176 to float, !dbg !54
+  %178 = fsub float %171, %163, !dbg !39
+  %179 = fadd float %158, %177, !dbg !56
+  %180 = fcmp oeq float %179, 0.000000e+00, !dbg !57
+  %181 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %177, float %179) #6, !dbg !46
+  %182 = select i1 %180, float 0.000000e+00, float %181, !dbg !58
+  %183 = fmul float %182, %178, !dbg !47
+  %184 = fadd float %163, %183, !dbg !48
+  %185 = fadd float %168, %174, !dbg !49
+  %186 = fmul float %178, %178, !dbg !50
+  %187 = fmul float %158, %186, !dbg !53
+  %188 = fmul float %182, %187, !dbg !51
+  %189 = fadd float %185, %188, !dbg !52
+  %190 = bitcast float %184 to i32, !dbg !54
+  %191 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %190, i32 2, i32 31), !dbg !54
+  %192 = bitcast i32 %191 to float, !dbg !54
+  %193 = bitcast float %189 to i32, !dbg !54
+  %194 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %193, i32 2, i32 31), !dbg !54
+  %195 = bitcast i32 %194 to float, !dbg !54
+  %196 = bitcast float %179 to i32, !dbg !54
+  %197 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %196, i32 2, i32 31), !dbg !54
+  %198 = bitcast i32 %197 to float, !dbg !54
+  %199 = fsub float %192, %184, !dbg !39
+  %200 = fadd float %179, %198, !dbg !56
+  %201 = fcmp oeq float %200, 0.000000e+00, !dbg !57
+  %202 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %198, float %200) #6, !dbg !46
+  %203 = select i1 %201, float 0.000000e+00, float %202, !dbg !58
+  %204 = fmul float %203, %199, !dbg !47
+  %205 = fadd float %184, %204, !dbg !48
+  %206 = fadd float %189, %195, !dbg !49
+  %207 = fmul float %199, %199, !dbg !50
+  %208 = fmul float %179, %207, !dbg !53
+  %209 = fmul float %203, %208, !dbg !51
+  %210 = fadd float %206, %209, !dbg !52
+  %211 = bitcast float %205 to i32, !dbg !54
+  %212 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %211, i32 1, i32 31), !dbg !54
+  %213 = bitcast i32 %212 to float, !dbg !54
+  %214 = bitcast float %210 to i32, !dbg !54
+  %215 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %214, i32 1, i32 31), !dbg !54
+  %216 = bitcast i32 %215 to float, !dbg !54
+  %217 = bitcast float %200 to i32, !dbg !54
+  %218 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %217, i32 1, i32 31), !dbg !54
+  %219 = bitcast i32 %218 to float, !dbg !54
+  %220 = fsub float %213, %205, !dbg !39
+  %221 = fadd float %200, %219, !dbg !56  ; %221: combined weight after the five shuffle steps
+  %222 = fcmp oeq float %221, 0.000000e+00, !dbg !57
+  %223 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %219, float %221) #6, !dbg !46
+  %224 = select i1 %222, float 0.000000e+00, float %223, !dbg !58
+  %225 = fmul float %224, %220, !dbg !47
+  %226 = fadd float %205, %225, !dbg !48  ; %226: mean after the five shuffle steps
+  %227 = fadd float %210, %216, !dbg !49
+  %228 = fmul float %220, %220, !dbg !50
+  %229 = fmul float %200, %228, !dbg !53
+  %230 = fmul float %224, %229, !dbg !51
+  %231 = fadd float %227, %230, !dbg !52  ; %231: M2 after the five shuffle steps
+  %232 = icmp eq i32 %10, 0, !dbg !54  ; only lane 0 of each warp writes its partial result
+  %233 = zext nneg i32 %12 to i64, !dbg !54
+  %234 = getelementptr float, ptr addrspace(3) @global_smem, i64 %233, !dbg !54  ; mean slots start at byte 0 of @global_smem, one float per warp
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %234, float %226, i1 %232) #6, !dbg !54
+  %235 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 8), i64 %233, !dbg !54  ; M2 slots start at byte 8
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %235, float %231, i1 %232) #6, !dbg !54
+  %236 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), i64 %233, !dbg !54  ; weight slots start at byte 16
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %236, float %221, i1 %232) #6, !dbg !54
+  tail call void @llvm.nvvm.barrier0(), !dbg !54  ; barrier between the per-warp writes above and the reads below
+  %237 = icmp slt i32 %9, 2, !dbg !54  ; threads 0 and 1 each read one warp's partial result
+  %238 = sext i32 %9 to i64, !dbg !54
+  %239 = getelementptr float, ptr addrspace(3) @global_smem, i64 %238, !dbg !54
+  %240 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %239, i1 %237) #6, !dbg !54
+  %241 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 8), i64 %238, !dbg !54
+  %242 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %241, i1 %237) #6, !dbg !54
+  %243 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), i64 %238, !dbg !54
+  %244 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %243, i1 %237) #6, !dbg !54
+  %245 = bitcast float %240 to i32, !dbg !54
+  %246 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %245, i32 1, i32 31), !dbg !54  ; exchange between lanes 0 and 1 to merge the two warps' partial results
+  %247 = bitcast i32 %246 to float, !dbg !54
+  %248 = bitcast float %242 to i32, !dbg !54
+  %249 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %248, i32 1, i32 31), !dbg !54
+  %250 = bitcast i32 %249 to float, !dbg !54
+  %251 = bitcast float %244 to i32, !dbg !54
+  %252 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %251, i32 1, i32 31), !dbg !54
+  %253 = bitcast i32 %252 to float, !dbg !54
+  %254 = fsub float %247, %240, !dbg !39
+  %255 = fadd float %244, %253, !dbg !56
+  %256 = fcmp oeq float %255, 0.000000e+00, !dbg !57
+  %257 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %253, float %255) #6, !dbg !46
+  %258 = select i1 %256, float 0.000000e+00, float %257, !dbg !58
+  %259 = fmul float %254, %258, !dbg !47
+  %260 = fadd float %240, %259, !dbg !48
+  %261 = fadd float %242, %250, !dbg !49
+  %262 = fmul float %254, %254, !dbg !50
+  %263 = fmul float %244, %262, !dbg !53
+  %264 = fmul float %263, %258, !dbg !51
+  %265 = fadd float %261, %264, !dbg !52
+  %266 = and i32 %9, 1, !dbg !54
+  %267 = icmp eq i32 %266, 0, !dbg !54
+  %268 = and i1 %237, %267, !dbg !54  ; tid < 2 and tid even: only thread 0 writes the block-wide result back
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %239, float %260, i1 %268) #6, !dbg !54
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %241, float %265, i1 %268) #6, !dbg !54
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %243, float %255, i1 %268) #6, !dbg !54
+  tail call void @llvm.nvvm.barrier0(), !dbg !54
+  %269 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !54  ; block-wide mean, read by every thread
+  %270 = load float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 8), align 4, !dbg !54  ; block-wide M2, read by every thread
+  %271 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %26, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !59  ; second pass: re-read the same 4 floats from %2
+  %272 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_first.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %40, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !60  ; second pass: re-read the same 4 bf16 values from %3, this time with evict_first
+  %273 = extractvalue { i32, i32 } %272, 0, !dbg !60
+  %274 = extractvalue { i32, i32 } %272, 1, !dbg !60
+  %275 = trunc i32 %273 to i16, !dbg !60
+  %extelt.offset2 = lshr i32 %273, 16, !dbg !60
+  %276 = trunc i32 %extelt.offset2 to i16, !dbg !60
+  %277 = trunc i32 %274 to i16, !dbg !60
+  %extelt.offset3 = lshr i32 %274, 16, !dbg !60
+  %278 = trunc i32 %extelt.offset3 to i16, !dbg !60
+  %279 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %275) #6, !dbg !61
+  %280 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %276) #6, !dbg !61
+  %281 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %277) #6, !dbg !61
+  %282 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %278) #6, !dbg !61
+  %283 = getelementptr float, ptr addrspace(1) %4, i64 %62, !dbg !62  ; %4 is addressed by column only
+  %284 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %283, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !63  ; 4 per-column scale factors from %4
+  br i1 %56, label %285, label %286, !dbg !64  ; same out-of-bounds condition as the first check, reported with the second message
+285:                                              ; preds = %58
+  tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 883, ptr nonnull @assertFunc_1, i64 1), !dbg !64
+  br label %286, !dbg !64
+286:                                              ; preds = %285, %58
+  %287 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %64, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !65  ; second pass: re-read the gathered row from %1 with evict_first
+  %288 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %270, float 2.560000e+02) #6, !dbg !66  ; M2 / 256; the three identical divisions that follow are never used
+  %289 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %270, float 2.560000e+02) #6, !dbg !66
+  %290 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %270, float 2.560000e+02) #6, !dbg !66
+  %291 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %270, float 2.560000e+02) #6, !dbg !66
+  %292 = fadd float %288, 0x3EE4F8B580000000, !dbg !67  ; + epsilon (hex-encoded constant, approximately 1e-5)
+  %293 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !68  ; inlined __nv_rsqrtf: the "__CUDA_FTZ" reflect value selects the flush-to-zero variant
+  %.not.i = icmp eq i32 %293, 0, !dbg !68
+  br i1 %.not.i, label %296, label %294, !dbg !68
+294:                                              ; preds = %286
+  %295 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %292), !dbg !68
+  br label %__nv_rsqrtf.exit, !dbg !68
+296:                                              ; preds = %286
+  %297 = tail call float @llvm.nvvm.rsqrt.approx.f(float %292), !dbg !68
+  br label %__nv_rsqrtf.exit, !dbg !68
+__nv_rsqrtf.exit:                                 ; preds = %294, %296
+  %.0.i = phi float [ %295, %294 ], [ %297, %296 ], !dbg !68  ; reciprocal square root of (M2/256 + epsilon)
+  %298 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !68  ; %298..%300: reflect calls whose results are never used
+  %299 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !68
+  %300 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !68
+  %301 = extractvalue { i32, i32, i32, i32 } %287, 3, !dbg !65
+  %302 = bitcast i32 %301 to float, !dbg !65
+  %303 = extractvalue { i32, i32, i32, i32 } %271, 3, !dbg !59
+  %304 = bitcast i32 %303 to float, !dbg !59
+  %305 = fadd float %304, %302, !dbg !69
+  %306 = fadd float %282, %305, !dbg !70
+  %307 = fsub float %306, %269, !dbg !71  ; element 3: recomputed sum minus mean
+  %308 = extractvalue { i32, i32, i32, i32 } %287, 2, !dbg !65
+  %309 = bitcast i32 %308 to float, !dbg !65
+  %310 = extractvalue { i32, i32, i32, i32 } %271, 2, !dbg !59
+  %311 = bitcast i32 %310 to float, !dbg !59
+  %312 = fadd float %311, %309, !dbg !69
+  %313 = fadd float %281, %312, !dbg !70
+  %314 = fsub float %313, %269, !dbg !71  ; element 2
+  %315 = extractvalue { i32, i32, i32, i32 } %287, 1, !dbg !65
+  %316 = bitcast i32 %315 to float, !dbg !65
+  %317 = extractvalue { i32, i32, i32, i32 } %271, 1, !dbg !59
+  %318 = bitcast i32 %317 to float, !dbg !59
+  %319 = fadd float %318, %316, !dbg !69
+  %320 = fadd float %280, %319, !dbg !70
+  %321 = fsub float %320, %269, !dbg !71  ; element 1
+  %322 = extractvalue { i32, i32, i32, i32 } %287, 0, !dbg !65
+  %323 = bitcast i32 %322 to float, !dbg !65
+  %324 = extractvalue { i32, i32, i32, i32 } %271, 0, !dbg !59
+  %325 = bitcast i32 %324 to float, !dbg !59
+  %326 = fadd float %325, %323, !dbg !69
+  %327 = fadd float %279, %326, !dbg !70
+  %328 = fsub float %327, %269, !dbg !71  ; element 0
+  %329 = extractvalue { i32, i32, i32, i32 } %284, 0, !dbg !63
+  %330 = bitcast i32 %329 to float, !dbg !63
+  %331 = extractvalue { i32, i32, i32, i32 } %284, 1, !dbg !63
+  %332 = bitcast i32 %331 to float, !dbg !63
+  %333 = extractvalue { i32, i32, i32, i32 } %284, 2, !dbg !63
+  %334 = bitcast i32 %333 to float, !dbg !63
+  %335 = extractvalue { i32, i32, i32, i32 } %284, 3, !dbg !63
+  %336 = bitcast i32 %335 to float, !dbg !63
+  %337 = fmul float %328, %.0.i, !dbg !72  ; (x - mean) * rsqrt(...)
+  %338 = fmul float %321, %.0.i, !dbg !72
+  %339 = fmul float %314, %.0.i, !dbg !72
+  %340 = fmul float %307, %.0.i, !dbg !72
+  %341 = fmul float %337, %330, !dbg !73  ; multiplied by the per-column scale from %4
+  %342 = fmul float %338, %332, !dbg !73
+  %343 = fmul float %339, %334, !dbg !73
+  %344 = fmul float %340, %336, !dbg !73
+  %345 = getelementptr i16, ptr addrspace(1) %5, i64 %39, !dbg !74  ; output address: %5 + (row * 256 + column) 16-bit elements
+  %346 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %341) #6, !dbg !75  ; convert each f32 result to bf16 (cvt.rn)
+  %347 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %342) #6, !dbg !75
+  %348 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %343) #6, !dbg !75
+  %349 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %344) #6, !dbg !75
+  %350 = insertelement <2 x i16> undef, i16 %346, i64 0, !dbg !75
+  %351 = insertelement <2 x i16> %350, i16 %347, i64 1, !dbg !75
+  %352 = bitcast <2 x i16> %351 to i32, !dbg !75  ; elements 0 and 1 packed into one 32-bit word
+  %353 = insertelement <2 x i16> undef, i16 %348, i64 0, !dbg !75
+  %354 = insertelement <2 x i16> %353, i16 %349, i64 1, !dbg !75
+  %355 = bitcast <2 x i16> %354 to i32, !dbg !75  ; elements 2 and 3 packed into one 32-bit word
+  tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %352, i32 %355, ptr addrspace(1) %345, i1 true) #6, !dbg !75  ; one 8-byte vector store of the four bf16 results
+  ret void, !dbg !76
+}
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 ; NVVM intrinsic: reads the PTX %tid.x special register (thread index, x dimension)
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1 ; NVVM intrinsic: warp shuffle in butterfly mode on a 32-bit value
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier0() #2 ; NVVM intrinsic: barrier 0 (thread-block-wide synchronization)
+; Function Attrs: alwaysinline nounwind
+define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 { ; returns an approximate reciprocal square root of %x, picking the FTZ or non-FTZ intrinsic from a reflect query
+  %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6 ; query the key stored in @.str (defined outside this view; presumably "__CUDA_FTZ" - confirm)
+  %.not = icmp eq i32 %1, 0 ; true when the reflected value is 0
+  br i1 %.not, label %4, label %2 ; 0 -> block %4 (non-FTZ), non-zero -> block %2 (FTZ)
+2:                                                ; preds = %0
+  %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x) ; flush-to-zero approximate rsqrt
+  br label %6
+4:                                                ; preds = %0
+  %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x) ; approximate rsqrt without the .ftz modifier
+  br label %6
+6:                                                ; preds = %4, %2
+  %.0 = phi float [ %3, %2 ], [ %5, %4 ] ; merge the result of whichever path ran
+  ret float %.0
+}
+declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4 ; compile-time query taking a pointer to a key string; called by @__nv_rsqrtf
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5 ; NVVM intrinsic: approximate f32 reciprocal square root, FTZ variant
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.f(float) #5 ; NVVM intrinsic: approximate f32 reciprocal square root, non-FTZ variant
+attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } ; referenced by the @llvm.nvvm.read.ptx.sreg.tid.x declaration
+attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } ; referenced by the @llvm.nvvm.shfl.sync.bfly.i32 declaration
+attributes #2 = { convergent nocallback nounwind } ; referenced by the @llvm.nvvm.barrier0 declaration
+attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } ; referenced by the @__nv_rsqrtf definition
+attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } ; referenced by the @__nvvm_reflect declaration
+attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } ; referenced by both @llvm.nvvm.rsqrt.approx.* declarations
+attributes #6 = { nounwind } ; call-site attribute group on the inline-asm and @__nvvm_reflect calls
+!llvm.module.flags = !{!0, !1} ; module flags: debug-info version (!0) and the reflect FTZ setting (!1)
+!llvm.dbg.cu = !{!2} ; the single debug compile unit
+!nvvm.annotations = !{!4, !5, !5, !4} ; kernel marker (!4) and thread-count bound (!5); each node is listed twice
+!llvm.ident = !{!6}
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1} ; FTZ reflect setting is 1 for this module
+!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!3 = !DIFile(filename: "cpn3lawg65lpi63gv6c6pn4oikhg6qva2h2qjdpxe6qj4lvttwez.py", directory: "/tmp/torchinductor_root/pn") ; Python source file the debug locations with scope !7 refer to
+!4 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"kernel", i32 1} ; marks the function as a kernel entry point
+!5 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"maxntidx", i32 64} ; maximum thread count in x for this kernel: 64
+!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} ; producer ident string (presumably carried over from the linked libdevice bitcode - confirm)
+!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6de7de", linkageName: "triton__0d1d2d3d4d5d6de7de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2) ; debug entry for the kernel, declared at line 18 of !3
+!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
+!9 = !{} ; empty type list: no parameter types are recorded in debug info
+!10 = !DILocation(line: 24, column: 33, scope: !7)
+!11 = !DILocation(line: 21, column: 28, scope: !7)
+!12 = !DILocation(line: 26, column: 30, scope: !7)
+!13 = !DILocation(line: 26, column: 35, scope: !7)
+!14 = !DILocation(line: 27, column: 18, scope: !7)
+!15 = !DILocation(line: 35, column: 44, scope: !7)
+!16 = !DILocation(line: 35, column: 40, scope: !7)
+!17 = !DILocation(line: 35, column: 34, scope: !7)
+!18 = !DILocation(line: 35, column: 50, scope: !7)
+!19 = !DILocation(line: 36, column: 44, scope: !7)
+!20 = !DILocation(line: 36, column: 40, scope: !7)
+!21 = !DILocation(line: 36, column: 34, scope: !7)
+!22 = !DILocation(line: 36, column: 50, scope: !7)
+!23 = !DILocation(line: 36, column: 101, scope: !7)
+!24 = !DILocation(line: 37, column: 22, scope: !7)
+!25 = !DILocation(line: 38, column: 22, scope: !7)
+!26 = !DILocation(line: 39, column: 36, scope: !7)
+!27 = !DILocation(line: 40, column: 40, scope: !7)
+!28 = !DILocation(line: 40, column: 55, scope: !7)
+!29 = !DILocation(line: 41, column: 44, scope: !7)
+!30 = !DILocation(line: 41, column: 40, scope: !7)
+!31 = !DILocation(line: 41, column: 34, scope: !7)
+!32 = !DILocation(line: 41, column: 52, scope: !7)
+!33 = !DILocation(line: 42, column: 22, scope: !7)
+!34 = !DILocation(line: 44, column: 22, scope: !7)
+!35 = !DILocation(line: 98, column: 22, scope: !36, inlinedAt: !38)
+!36 = distinct !DILexicalBlockFile(scope: !7, file: !37, discriminator: 0)
+!37 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
+!38 = !DILocation(line: 47, column: 41, scope: !36)
+!39 = !DILocation(line: 108, column: 21, scope: !40, inlinedAt: !41)
+!40 = distinct !DILexicalBlockFile(scope: !36, file: !37, discriminator: 0)
+!41 = !DILocation(line: 120, column: 46, scope: !40, inlinedAt: !42)
+!42 = !DILocation(line: 53, column: 44, scope: !40)
+!43 = !DILocation(line: 101, column: 22, scope: !36, inlinedAt: !38)
+!44 = !DILocation(line: 101, column: 30, scope: !36, inlinedAt: !38)
+!45 = !DILocation(line: 101, column: 13, scope: !36, inlinedAt: !38)
+!46 = !DILocation(line: 110, column: 60, scope: !40, inlinedAt: !41)
+!47 = !DILocation(line: 112, column: 25, scope: !40, inlinedAt: !41)
+!48 = !DILocation(line: 112, column: 17, scope: !40, inlinedAt: !41)
+!49 = !DILocation(line: 113, column: 15, scope: !40, inlinedAt: !41)
+!50 = !DILocation(line: 113, column: 30, scope: !40, inlinedAt: !41)
+!51 = !DILocation(line: 113, column: 49, scope: !40, inlinedAt: !41)
+!52 = !DILocation(line: 113, column: 22, scope: !40, inlinedAt: !41)
+!53 = !DILocation(line: 113, column: 38, scope: !40, inlinedAt: !41)
+!54 = !DILocation(line: 120, column: 46, scope: !36, inlinedAt: !55)
+!55 = !DILocation(line: 53, column: 44, scope: !36)
+!56 = !DILocation(line: 109, column: 28, scope: !40, inlinedAt: !41)
+!57 = !DILocation(line: 110, column: 39, scope: !40, inlinedAt: !41)
+!58 = !DILocation(line: 110, column: 49, scope: !40, inlinedAt: !41)
+!59 = !DILocation(line: 62, column: 51, scope: !7)
+!60 = !DILocation(line: 63, column: 51, scope: !7)
+!61 = !DILocation(line: 63, column: 103, scope: !7)
+!62 = !DILocation(line: 64, column: 35, scope: !7)
+!63 = !DILocation(line: 64, column: 40, scope: !7)
+!64 = !DILocation(line: 68, column: 57, scope: !7)
+!65 = !DILocation(line: 69, column: 54, scope: !7)
+!66 = !DILocation(line: 75, column: 24, scope: !7)
+!67 = !DILocation(line: 77, column: 24, scope: !7)
+!68 = !DILocation(line: 78, column: 30, scope: !7)
+!69 = !DILocation(line: 70, column: 24, scope: !7)
+!70 = !DILocation(line: 72, column: 24, scope: !7)
+!71 = !DILocation(line: 73, column: 24, scope: !7)
+!72 = !DILocation(line: 79, column: 24, scope: !7)
+!73 = !DILocation(line: 80, column: 24, scope: !7)
+!74 = !DILocation(line: 82, column: 29, scope: !7)
+!75 = !DILocation(line: 82, column: 52, scope: !7)
+!76 = !DILocation(line: 58, column: 4, scope: !7)

.triton/dump/305a9479aab997a3a16bfe46bb303a50/triton_.ttgir ADDED Viewed

	@@ -0,0 +1,110 @@

+#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> // row layout: 4 columns per thread x 32 threads x 2 warps along dim 1 = 256 columns
+#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> // 1 element per thread; threads and warps are spread along dim 0
+module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { // TritonGPU form of the kernel: per program, sums three 256-wide inputs, normalizes the row, scales by %arg4 and stores bf16 to %arg5
+  tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+    %cst = arith.constant dense<256> : tensor<1x256xi32, #blocked> // column bound used to build the mask
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x256xf32, #blocked> // fill value for masked f32 loads and masked-out lanes
+    %cst_1 = arith.constant dense<1.000000e+00> : tensor<1x256xf32, #blocked> // initial per-element weight for the reduction
+    %cst_2 = arith.constant dense<256> : tensor<1x1xi64, #blocked> // row stride (in elements) for %arg1
+    %cst_3 = arith.constant dense<50257> : tensor<1x1xi64, #blocked> // added to negative indices to wrap them
+    %cst_4 = arith.constant dense<0> : tensor<1x1xi64, #blocked>
+    %cst_5 = arith.constant dense<0> : tensor<1x1xi64, #blocked1> // lower bound for the index check
+    %cst_6 = arith.constant dense<50257> : tensor<1x1xi64, #blocked1> // exclusive upper bound for the index check
+    %cst_7 = arith.constant 0.000000e+00 : f32
+    %c256_i32 = arith.constant 256 : i32 // row width (elements handled per program)
+    %c512_i32 = arith.constant 512 : i32 // modulus applied to the program id when addressing %arg2
+    %cst_8 = arith.constant dense<9.99999974E-6> : tensor<1x1xf32, #blocked> // epsilon (about 1e-5) added before rsqrt
+    %cst_9 = arith.constant dense<2.560000e+02> : tensor<1x1xf32, #blocked> // divisor applied to the reduced M2
+    %cst_10 = arith.constant dense<0.000000e+00> : tensor<1x256xbf16, #blocked> // fill value for masked bf16 loads
+    %0 = tt.get_program_id x : i32 // one program per output row
+    %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> // column offsets 0..255
+    %2 = tt.expand_dims %1 {axis = 0 : i32} : (tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x256xi32, #blocked>
+    %3 = tt.addptr %arg0, %0 : !tt.ptr<i64, 1>, i32 // address of %arg0[program id]
+    %4 = tt.splat %3 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>, #blocked>
+    %5 = tt.splat %3 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>, #blocked1>
+    %6 = tt.load %4 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x1xi64, #blocked> // index value in the #blocked layout (feeds the address math)
+    %7 = tt.load %5 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x1xi64, #blocked1> // same index loaded again in the #blocked1 layout (feeds the bounds check)
+    %8 = arith.remsi %0, %c512_i32 : i32 // program id modulo 512 (signed remainder)
+    %9 = arith.cmpi slt, %2, %cst : tensor<1x256xi32, #blocked> // column mask; always true for offsets 0..255
+    %10 = arith.muli %8, %c256_i32 : i32
+    %11 = tt.splat %10 : (i32) -> tensor<1x256xi32, #blocked>
+    %12 = arith.addi %2, %11 : tensor<1x256xi32, #blocked>
+    %13 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<1x256x!tt.ptr<f32, 1>, #blocked>
+    %14 = tt.addptr %13, %12 : tensor<1x256x!tt.ptr<f32, 1>, #blocked>, tensor<1x256xi32, #blocked> // &%arg2[(pid % 512) * 256 + col]
+    %15 = tt.load %14, %9, %cst_0 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32, #blocked>
+    %16 = arith.muli %0, %c256_i32 : i32
+    %17 = tt.splat %16 : (i32) -> tensor<1x256xi32, #blocked>
+    %18 = arith.addi %2, %17 : tensor<1x256xi32, #blocked> // pid * 256 + col; reused for the %arg3 load and the %arg5 store
+    %19 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<1x256x!tt.ptr<bf16, 1>, #blocked>
+    %20 = tt.addptr %19, %18 : tensor<1x256x!tt.ptr<bf16, 1>, #blocked>, tensor<1x256xi32, #blocked>
+    %21 = tt.load %20, %9, %cst_10 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xbf16, #blocked>
+    %22 = arith.extf %21 : tensor<1x256xbf16, #blocked> to tensor<1x256xf32, #blocked> // widen bf16 input to f32 before accumulating
+    %23 = arith.addi %6, %cst_3 : tensor<1x1xi64, #blocked>
+    %24 = arith.addi %7, %cst_6 : tensor<1x1xi64, #blocked1>
+    %25 = arith.cmpi slt, %6, %cst_4 : tensor<1x1xi64, #blocked>
+    %26 = arith.cmpi slt, %7, %cst_5 : tensor<1x1xi64, #blocked1>
+    %27 = arith.select %25, %23, %6 : tensor<1x1xi1, #blocked>, tensor<1x1xi64, #blocked> // negative index wraps by +50257
+    %28 = arith.select %26, %24, %7 : tensor<1x1xi1, #blocked1>, tensor<1x1xi64, #blocked1> // same wrap on the #blocked1 copy
+    %29 = arith.cmpi sge, %28, %cst_5 : tensor<1x1xi64, #blocked1>
+    %30 = arith.cmpi slt, %28, %cst_6 : tensor<1x1xi64, #blocked1>
+    %31 = arith.andi %29, %30 : tensor<1x1xi1, #blocked1> // 0 <= wrapped index < 50257
+    tt.assert %31, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<1x1xi1, #blocked1>
+    %32 = arith.muli %27, %cst_2 : tensor<1x1xi64, #blocked> // row start in %arg1: index * 256
+    %33 = tt.broadcast %32 : (tensor<1x1xi64, #blocked>) -> tensor<1x256xi64, #blocked>
+    %34 = arith.extsi %2 : tensor<1x256xi32, #blocked> to tensor<1x256xi64, #blocked>
+    %35 = arith.addi %34, %33 : tensor<1x256xi64, #blocked>
+    %36 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<1x256x!tt.ptr<f32, 1>, #blocked>
+    %37 = tt.addptr %36, %35 : tensor<1x256x!tt.ptr<f32, 1>, #blocked>, tensor<1x256xi64, #blocked> // &%arg1[index * 256 + col]
+    %38 = tt.load %37, %9, %cst_0 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32, #blocked>
+    %39 = arith.addf %38, %15 : tensor<1x256xf32, #blocked>
+    %40 = arith.addf %39, %22 : tensor<1x256xf32, #blocked> // x = %arg1 row + %arg2 row + %arg3 row
+    %41 = arith.addf %40, %cst_0 : tensor<1x256xf32, #blocked> // initial per-element mean = x
+    %42 = arith.subf %40, %41 : tensor<1x256xf32, #blocked>
+    %43 = arith.mulf %40, %42 : tensor<1x256xf32, #blocked>
+    %44 = arith.addf %43, %cst_0 : tensor<1x256xf32, #blocked> // initial per-element M2 = x * (x - mean)
+    %45 = arith.select %9, %41, %cst_0 : tensor<1x256xi1, #blocked>, tensor<1x256xf32, #blocked>
+    %46 = arith.select %9, %44, %cst_0 : tensor<1x256xi1, #blocked>, tensor<1x256xf32, #blocked>
+    %47 = arith.select %9, %cst_1, %cst_0 : tensor<1x256xi1, #blocked>, tensor<1x256xf32, #blocked> // weight 1 for valid lanes, 0 otherwise
+    %48:3 = "tt.reduce"(%45, %46, %47) <{axis = 1 : i32}> ({
+    ^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32): // (mean_a, m2_a, w_a, mean_b, m2_b, w_b): Welford-style pairwise combine
+      %71 = arith.subf %arg11, %arg8 : f32 // delta = mean_b - mean_a
+      %72 = arith.addf %arg10, %arg13 : f32 // combined weight
+      %73 = arith.cmpf oeq, %72, %cst_7 : f32
+      %74 = arith.divf %arg13, %72 : f32
+      %75 = arith.select %73, %cst_7, %74 : f32 // ratio = w_b / (w_a + w_b), forced to 0 when the combined weight is 0
+      %76 = arith.mulf %71, %75 : f32
+      %77 = arith.addf %arg8, %76 : f32 // combined mean
+      %78 = arith.addf %arg9, %arg12 : f32
+      %79 = arith.mulf %71, %71 : f32
+      %80 = arith.mulf %79, %arg10 : f32
+      %81 = arith.mulf %80, %75 : f32
+      %82 = arith.addf %78, %81 : f32 // combined M2 = m2_a + m2_b + delta^2 * w_a * ratio
+      tt.reduce.return %77, %82, %72 : f32, f32, f32
+    }) : (tensor<1x256xf32, #blocked>, tensor<1x256xf32, #blocked>, tensor<1x256xf32, #blocked>) -> (tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>)
+    %49 = tt.expand_dims %48#0 {axis = 1 : i32} : (tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<1x1xf32, #blocked> // row mean
+    %50 = tt.expand_dims %48#1 {axis = 1 : i32} : (tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<1x1xf32, #blocked> // row M2; the reduced weight %48#2 is not used afterwards
+    %51 = tt.load %14, %9, %cst_0 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32, #blocked> // second pass: the three inputs are loaded again
+    %52 = tt.load %20, %9, %cst_10 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x256xbf16, #blocked>
+    %53 = arith.extf %52 : tensor<1x256xbf16, #blocked> to tensor<1x256xf32, #blocked>
+    %54 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<1x256x!tt.ptr<f32, 1>, #blocked>
+    %55 = tt.addptr %54, %2 : tensor<1x256x!tt.ptr<f32, 1>, #blocked>, tensor<1x256xi32, #blocked>
+    %56 = tt.load %55, %9, %cst_0 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32, #blocked> // per-column scale %arg4[col]
+    tt.assert %31, "index out of bounds: 0 <= tmp16 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<1x1xi1, #blocked1>
+    %57 = tt.load %37, %9, %cst_0 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x256xf32, #blocked>
+    %58 = arith.addf %57, %51 : tensor<1x256xf32, #blocked>
+    %59 = arith.addf %58, %53 : tensor<1x256xf32, #blocked> // x recomputed from the reloaded inputs
+    %60 = tt.broadcast %49 : (tensor<1x1xf32, #blocked>) -> tensor<1x256xf32, #blocked>
+    %61 = arith.subf %59, %60 : tensor<1x256xf32, #blocked> // x - mean
+    %62 = arith.divf %50, %cst_9 : tensor<1x1xf32, #blocked> // variance = M2 / 256
+    %63 = arith.addf %62, %cst_8 : tensor<1x1xf32, #blocked> // variance + epsilon
+    %64 = tt.extern_elementwise %63 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32, #blocked>) -> tensor<1x1xf32, #blocked>
+    %65 = tt.broadcast %64 : (tensor<1x1xf32, #blocked>) -> tensor<1x256xf32, #blocked>
+    %66 = arith.mulf %61, %65 : tensor<1x256xf32, #blocked> // (x - mean) * rsqrt(variance + epsilon)
+    %67 = arith.mulf %66, %56 : tensor<1x256xf32, #blocked> // apply the per-column scale
+    %68 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<1x256x!tt.ptr<bf16, 1>, #blocked>
+    %69 = tt.addptr %68, %18 : tensor<1x256x!tt.ptr<bf16, 1>, #blocked>, tensor<1x256xi32, #blocked> // &%arg5[pid * 256 + col]
+    %70 = arith.truncf %67 : tensor<1x256xf32, #blocked> to tensor<1x256xbf16, #blocked>
+    tt.store %69, %70, %9 {cache = 1 : i32, evict = 1 : i32} : tensor<1x256xbf16, #blocked>
+    tt.return // %arg6 and %arg7 are never referenced in this body
+  }
+}

.triton/dump/305a9479aab997a3a16bfe46bb303a50/triton_.ttir ADDED Viewed

	@@ -0,0 +1,101 @@

+module { // Triton IR of the kernel: per program, sums three 256-wide inputs, normalizes the row, scales by %arg4 and stores bf16 to %arg5
+  tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+    %c512_i32 = arith.constant 512 : i32 // modulus applied to the program id when addressing %arg2
+    %c256_i32 = arith.constant 256 : i32 // row width (elements handled per program)
+    %cst = arith.constant dense<0.000000e+00> : tensor<1x256xbf16> // fill value for masked bf16 loads
+    %cst_0 = arith.constant dense<1.000000e+00> : tensor<1x256xf32> // initial per-element weight for the reduction
+    %cst_1 = arith.constant 0.000000e+00 : f32
+    %cst_2 = arith.constant dense<256> : tensor<1x1xi64> // row stride (in elements) for %arg1
+    %cst_3 = arith.constant dense<50257> : tensor<1x1xi64> // exclusive upper bound for the index; also added to wrap negative indices
+    %cst_4 = arith.constant dense<0> : tensor<1x1xi64>
+    %cst_5 = arith.constant dense<9.99999974E-6> : tensor<1x1xf32> // epsilon (about 1e-5) added before rsqrt
+    %cst_6 = arith.constant dense<2.560000e+02> : tensor<1x1xf32> // divisor applied to the reduced M2
+    %cst_7 = arith.constant dense<0.000000e+00> : tensor<1x256xf32> // fill value for masked f32 loads and masked-out lanes
+    %cst_8 = arith.constant dense<256> : tensor<1x256xi32> // column bound used to build the mask
+    %0 = tt.get_program_id x : i32 // one program per output row
+    %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> // column offsets 0..255
+    %2 = tt.expand_dims %1 {axis = 0 : i32} : (tensor<256xi32>) -> tensor<1x256xi32>
+    %3 = tt.addptr %arg0, %0 : !tt.ptr<i64, 1>, i32 // address of %arg0[program id]
+    %4 = tt.splat %3 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>>
+    %5 = tt.load %4 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x1xi64> // i64 index selecting the %arg1 row
+    %6 = arith.remsi %0, %c512_i32 : i32 // program id modulo 512 (signed remainder)
+    %7 = arith.cmpi slt, %2, %cst_8 : tensor<1x256xi32> // column mask; always true for offsets 0..255
+    %8 = arith.muli %6, %c256_i32 : i32
+    %9 = tt.splat %8 : (i32) -> tensor<1x256xi32>
+    %10 = arith.addi %2, %9 : tensor<1x256xi32>
+    %11 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<1x256x!tt.ptr<f32, 1>>
+    %12 = tt.addptr %11, %10 : tensor<1x256x!tt.ptr<f32, 1>>, tensor<1x256xi32> // &%arg2[(pid % 512) * 256 + col]
+    %13 = tt.load %12, %7, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32>
+    %14 = arith.muli %0, %c256_i32 : i32
+    %15 = tt.splat %14 : (i32) -> tensor<1x256xi32>
+    %16 = arith.addi %2, %15 : tensor<1x256xi32> // pid * 256 + col; reused for the %arg3 load and the %arg5 store
+    %17 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<1x256x!tt.ptr<bf16, 1>>
+    %18 = tt.addptr %17, %16 : tensor<1x256x!tt.ptr<bf16, 1>>, tensor<1x256xi32>
+    %19 = tt.load %18, %7, %cst {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xbf16>
+    %20 = arith.extf %19 : tensor<1x256xbf16> to tensor<1x256xf32> // widen bf16 input to f32 before accumulating
+    %21 = arith.addi %5, %cst_3 : tensor<1x1xi64>
+    %22 = arith.cmpi slt, %5, %cst_4 : tensor<1x1xi64>
+    %23 = arith.select %22, %21, %5 : tensor<1x1xi1>, tensor<1x1xi64> // negative index wraps by +50257
+    %24 = arith.cmpi sge, %23, %cst_4 : tensor<1x1xi64>
+    %25 = arith.cmpi slt, %23, %cst_3 : tensor<1x1xi64>
+    %26 = arith.andi %24, %25 : tensor<1x1xi1> // 0 <= wrapped index < 50257
+    tt.assert %26, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<1x1xi1>
+    %27 = arith.muli %23, %cst_2 : tensor<1x1xi64> // row start in %arg1: index * 256
+    %28 = tt.broadcast %27 : (tensor<1x1xi64>) -> tensor<1x256xi64>
+    %29 = arith.extsi %2 : tensor<1x256xi32> to tensor<1x256xi64>
+    %30 = arith.addi %29, %28 : tensor<1x256xi64>
+    %31 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<1x256x!tt.ptr<f32, 1>>
+    %32 = tt.addptr %31, %30 : tensor<1x256x!tt.ptr<f32, 1>>, tensor<1x256xi64> // &%arg1[index * 256 + col]
+    %33 = tt.load %32, %7, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32>
+    %34 = arith.addf %33, %13 : tensor<1x256xf32>
+    %35 = arith.addf %34, %20 : tensor<1x256xf32> // x = %arg1 row + %arg2 row + %arg3 row
+    %36 = arith.addf %35, %cst_7 : tensor<1x256xf32> // initial per-element mean = x
+    %37 = arith.subf %35, %36 : tensor<1x256xf32>
+    %38 = arith.mulf %35, %37 : tensor<1x256xf32>
+    %39 = arith.addf %38, %cst_7 : tensor<1x256xf32> // initial per-element M2 = x * (x - mean)
+    %40 = arith.select %7, %36, %cst_7 : tensor<1x256xi1>, tensor<1x256xf32>
+    %41 = arith.select %7, %39, %cst_7 : tensor<1x256xi1>, tensor<1x256xf32>
+    %42 = arith.select %7, %cst_0, %cst_7 : tensor<1x256xi1>, tensor<1x256xf32> // weight 1 for valid lanes, 0 otherwise
+    %43:3 = "tt.reduce"(%40, %41, %42) <{axis = 1 : i32}> ({
+    ^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32): // (mean_a, m2_a, w_a, mean_b, m2_b, w_b): Welford-style pairwise combine
+      %66 = arith.subf %arg11, %arg8 : f32 // delta = mean_b - mean_a
+      %67 = arith.addf %arg10, %arg13 : f32 // combined weight
+      %68 = arith.cmpf oeq, %67, %cst_1 : f32
+      %69 = arith.divf %arg13, %67 : f32
+      %70 = arith.select %68, %cst_1, %69 : f32 // ratio = w_b / (w_a + w_b), forced to 0 when the combined weight is 0
+      %71 = arith.mulf %66, %70 : f32
+      %72 = arith.addf %arg8, %71 : f32 // combined mean
+      %73 = arith.addf %arg9, %arg12 : f32
+      %74 = arith.mulf %66, %66 : f32
+      %75 = arith.mulf %74, %arg10 : f32
+      %76 = arith.mulf %75, %70 : f32
+      %77 = arith.addf %73, %76 : f32 // combined M2 = m2_a + m2_b + delta^2 * w_a * ratio
+      tt.reduce.return %72, %77, %67 : f32, f32, f32
+    }) : (tensor<1x256xf32>, tensor<1x256xf32>, tensor<1x256xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>)
+    %44 = tt.expand_dims %43#0 {axis = 1 : i32} : (tensor<1xf32>) -> tensor<1x1xf32> // row mean
+    %45 = tt.expand_dims %43#1 {axis = 1 : i32} : (tensor<1xf32>) -> tensor<1x1xf32> // row M2; the reduced weight %43#2 is not used afterwards
+    %46 = tt.load %12, %7, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32> // second pass: the three inputs are loaded again
+    %47 = tt.load %18, %7, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x256xbf16>
+    %48 = arith.extf %47 : tensor<1x256xbf16> to tensor<1x256xf32>
+    %49 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<1x256x!tt.ptr<f32, 1>>
+    %50 = tt.addptr %49, %2 : tensor<1x256x!tt.ptr<f32, 1>>, tensor<1x256xi32>
+    %51 = tt.load %50, %7, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32> // per-column scale %arg4[col]
+    tt.assert %26, "index out of bounds: 0 <= tmp16 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<1x1xi1>
+    %52 = tt.load %32, %7, %cst_7 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x256xf32>
+    %53 = arith.addf %52, %46 : tensor<1x256xf32>
+    %54 = arith.addf %53, %48 : tensor<1x256xf32> // x recomputed from the reloaded inputs
+    %55 = tt.broadcast %44 : (tensor<1x1xf32>) -> tensor<1x256xf32>
+    %56 = arith.subf %54, %55 : tensor<1x256xf32> // x - mean
+    %57 = arith.divf %45, %cst_6 : tensor<1x1xf32> // variance = M2 / 256
+    %58 = arith.addf %57, %cst_5 : tensor<1x1xf32> // variance + epsilon
+    %59 = tt.extern_elementwise %58 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32>) -> tensor<1x1xf32>
+    %60 = tt.broadcast %59 : (tensor<1x1xf32>) -> tensor<1x256xf32>
+    %61 = arith.mulf %56, %60 : tensor<1x256xf32> // (x - mean) * rsqrt(variance + epsilon)
+    %62 = arith.mulf %61, %51 : tensor<1x256xf32> // apply the per-column scale
+    %63 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<1x256x!tt.ptr<bf16, 1>>
+    %64 = tt.addptr %63, %16 : tensor<1x256x!tt.ptr<bf16, 1>>, tensor<1x256xi32> // &%arg5[pid * 256 + col]
+    %65 = arith.truncf %62 : tensor<1x256xf32> to tensor<1x256xbf16>
+    tt.store %64, %65, %7 {cache = 1 : i32, evict = 1 : i32} : tensor<1x256xbf16>
+    tt.return // %arg6 and %arg7 are never referenced in this body
+  }
+}

.triton/dump/473cf6e25c3e63117cd59fc0ed04b89f/triton_.llir ADDED Viewed

	@@ -0,0 +1,550 @@

+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+@assertFunc_1 = internal constant [25 x i8] c"_call_with_frames_removed" ; function-name text for the "tmp16" assertion; array is sized to the text with no trailing NUL
+@assertFile_1 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>" ; file-name text for the "tmp16" assertion; no trailing NUL
+@assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp16 < 50257" ; message text for the "tmp16" assertion; no trailing NUL
+@assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed" ; function-name text for the "tmp3" assertion; no trailing NUL
+@assertFile_0 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>" ; file-name text for the "tmp3" assertion; no trailing NUL
+@assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257" ; message text for the "tmp3" assertion; no trailing NUL
+@global_smem = external addrspace(3) global [0 x i8] ; externally defined, zero-length byte array in address space 3 (shared memory on NVPTX)
+@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 ; NUL-terminated key string passed to @__nvvm_reflect
+declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr ; called below as (message, file, line 883, function, i64 1) when the index bounds check fails
+define void @triton__0d1d2d3d4d5d6de7de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, i32 %6, i32 %7) local_unnamed_addr !dbg !7 {
+  %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
+  %10 = and i32 %9, 31, !dbg !10
+  %11 = lshr i32 %9, 5, !dbg !10
+  %12 = lshr i32 %9, 6, !dbg !10
+  %13 = and i32 %12, 1, !dbg !10
+  %14 = and i32 %9, 1, !dbg !10
+  %15 = and i32 %11, 1, !dbg !11
+  %urem = shl i32 %9, 2, !dbg !11
+  %16 = and i32 %urem, 252, !dbg !11
+  %17 = shl i32 %9, 1, !dbg !11
+  %18 = and i32 %17, 254, !dbg !11
+  %19 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !12
+  %20 = shl i32 %19, 1, !dbg !13
+  %21 = or i32 %20, %13, !dbg !14
+  %22 = or i32 %20, %14, !dbg !14
+  %23 = sext i32 %21 to i64, !dbg !15
+  %24 = getelementptr i64, ptr addrspace(1) %0, i64 %23, !dbg !15
+  %25 = sext i32 %22 to i64, !dbg !15
+  %26 = getelementptr i64, ptr addrspace(1) %0, i64 %25, !dbg !15
+  %27 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16
+  %28 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16
+  %29 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16
+  %30 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16
+  %31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %26, i1 true) #6, !dbg !16
+  %32 = srem i32 %21, 512, !dbg !17
+  %33 = shl nsw i32 %32, 8, !dbg !18
+  %34 = or i32 %33, %16, !dbg !19
+  %35 = sext i32 %34 to i64, !dbg !20
+  %36 = getelementptr float, ptr addrspace(1) %2, i64 %35, !dbg !20
+  %37 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %36, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !21
+  %38 = extractvalue { i32, i32, i32, i32 } %37, 0, !dbg !21
+  %39 = extractvalue { i32, i32, i32, i32 } %37, 1, !dbg !21
+  %40 = extractvalue { i32, i32, i32, i32 } %37, 2, !dbg !21
+  %41 = extractvalue { i32, i32, i32, i32 } %37, 3, !dbg !21
+  %42 = insertelement <2 x i32> poison, i32 %39, i64 0, !dbg !21
+  %43 = insertelement <2 x i32> %42, i32 %38, i64 1, !dbg !21
+  %44 = bitcast <2 x i32> %43 to <2 x float>, !dbg !21
+  %45 = bitcast i32 %40 to float, !dbg !21
+  %46 = bitcast i32 %41 to float, !dbg !21
+  %47 = shl i32 %21, 8, !dbg !22
+  %48 = or i32 %47, %16, !dbg !23
+  %49 = sext i32 %48 to i64, !dbg !24
+  %50 = getelementptr i16, ptr addrspace(1) %3, i64 %49, !dbg !24
+  %51 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_last.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %50, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !25
+  %52 = extractvalue { i32, i32 } %51, 0, !dbg !25
+  %53 = extractvalue { i32, i32 } %51, 1, !dbg !25
+  %54 = trunc i32 %52 to i16, !dbg !25
+  %extelt.offset = lshr i32 %52, 16, !dbg !25
+  %55 = trunc i32 %extelt.offset to i16, !dbg !25
+  %56 = trunc i32 %53 to i16, !dbg !25
+  %extelt.offset1 = lshr i32 %53, 16, !dbg !25
+  %57 = trunc i32 %extelt.offset1 to i16, !dbg !25
+  %58 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %54) #6, !dbg !26
+  %59 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %55) #6, !dbg !26
+  %60 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %56) #6, !dbg !26
+  %61 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %57) #6, !dbg !26
+  %62 = add i64 %31, 50257, !dbg !27
+  %63 = icmp slt i64 %27, 0, !dbg !28
+  %64 = icmp slt i64 %31, 0, !dbg !28
+  %65 = select i1 %64, i64 %62, i64 %31, !dbg !29
+  %66 = icmp ugt i64 %65, 50256, !dbg !30
+  br i1 %66, label %67, label %68, !dbg !31
+67:                                               ; preds = %8
+  tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !31
+  br label %68, !dbg !31
+68:                                               ; preds = %67, %8
+  %69 = shl i64 %27, 8, !dbg !32
+  %70 = add i64 %69, 12865792, !dbg !32
+  %71 = select i1 %63, i64 %70, i64 %69, !dbg !32
+  %72 = zext nneg i32 %16 to i64
+  %73 = or i64 %71, %72, !dbg !33
+  %74 = getelementptr float, ptr addrspace(1) %1, i64 %73, !dbg !34
+  %75 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %74, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !35
+  %76 = extractvalue { i32, i32, i32, i32 } %75, 0, !dbg !35
+  %77 = extractvalue { i32, i32, i32, i32 } %75, 1, !dbg !35
+  %78 = extractvalue { i32, i32, i32, i32 } %75, 2, !dbg !35
+  %79 = extractvalue { i32, i32, i32, i32 } %75, 3, !dbg !35
+  %80 = bitcast i32 %78 to float, !dbg !35
+  %81 = bitcast i32 %79 to float, !dbg !35
+  %82 = fadd float %45, %80, !dbg !36
+  %83 = fadd float %46, %81, !dbg !36
+  %84 = fadd float %60, %82, !dbg !37
+  %85 = fadd float %61, %83, !dbg !37
+  %86 = insertelement <2 x i32> poison, i32 %77, i64 0, !dbg !35
+  %87 = insertelement <2 x i32> %86, i32 %76, i64 1, !dbg !35
+  %88 = bitcast <2 x i32> %87 to <2 x float>, !dbg !35
+  %89 = fadd <2 x float> %44, %88, !dbg !36
+  %90 = insertelement <2 x float> poison, float %59, i64 0, !dbg !37
+  %91 = insertelement <2 x float> %90, float %58, i64 1, !dbg !37
+  %92 = fadd <2 x float> %91, %89, !dbg !37
+  %93 = fadd <2 x float> %92, zeroinitializer, !dbg !38
+  %94 = fadd float %84, 0.000000e+00, !dbg !38
+  %95 = fadd float %85, 0.000000e+00, !dbg !38
+  %96 = extractelement <2 x float> %93, i64 1, !dbg !42
+  %97 = extractelement <2 x float> %92, i64 1, !dbg !46
+  %98 = fsub float %97, %96, !dbg !47
+  %99 = extractelement <2 x float> %93, i64 0, !dbg !42
+  %100 = extractelement <2 x float> %92, i64 0, !dbg !46
+  %101 = fsub float %100, %99, !dbg !47
+  %102 = fsub float %84, %94, !dbg !47
+  %103 = fsub float %85, %95, !dbg !47
+  %104 = fmul float %97, %98, !dbg !46
+  %105 = fmul float %100, %101, !dbg !46
+  %106 = fmul float %84, %102, !dbg !46
+  %107 = fmul float %85, %103, !dbg !46
+  %108 = fadd float %104, 0.000000e+00, !dbg !48
+  %109 = fadd float %105, 0.000000e+00, !dbg !48
+  %110 = fadd float %106, 0.000000e+00, !dbg !48
+  %111 = fadd float %107, 0.000000e+00, !dbg !48
+  %112 = fsub float %99, %96, !dbg !42
+  %113 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 2.000000e+00) #6, !dbg !49
+  %114 = fmul float %113, %112, !dbg !50
+  %115 = fadd float %96, %114, !dbg !51
+  %116 = fadd float %108, %109, !dbg !52
+  %117 = fmul float %112, %112, !dbg !53
+  %118 = fmul float %113, %117, !dbg !54
+  %119 = fadd float %118, %116, !dbg !55
+  %120 = fsub float %94, %115, !dbg !42
+  %121 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 3.000000e+00) #6, !dbg !49
+  %122 = fmul float %121, %120, !dbg !50
+  %123 = fadd float %115, %122, !dbg !51
+  %124 = fadd float %110, %119, !dbg !52
+  %125 = fmul float %120, %120, !dbg !53
+  %126 = fmul float %125, 2.000000e+00, !dbg !56
+  %127 = fmul float %121, %126, !dbg !54
+  %128 = fadd float %124, %127, !dbg !55
+  %129 = fsub float %95, %123, !dbg !42
+  %130 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 4.000000e+00) #6, !dbg !49
+  %131 = fmul float %130, %129, !dbg !50
+  %132 = fadd float %123, %131, !dbg !51
+  %133 = fadd float %111, %128, !dbg !52
+  %134 = fmul float %129, %129, !dbg !53
+  %135 = fmul float %134, 3.000000e+00, !dbg !56
+  %136 = fmul float %130, %135, !dbg !54
+  %137 = fadd float %133, %136, !dbg !55
+  %138 = bitcast float %132 to i32, !dbg !57
+  %139 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %138, i32 16, i32 31), !dbg !57
+  %140 = bitcast i32 %139 to float, !dbg !57
+  %141 = bitcast float %137 to i32, !dbg !57
+  %142 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %141, i32 16, i32 31), !dbg !57
+  %143 = bitcast i32 %142 to float, !dbg !57
+  %144 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 1082130432, i32 16, i32 31), !dbg !57
+  %145 = bitcast i32 %144 to float, !dbg !57
+  %146 = fsub float %140, %132, !dbg !42
+  %147 = fadd float %145, 4.000000e+00, !dbg !59
+  %148 = fcmp oeq float %147, 0.000000e+00, !dbg !60
+  %149 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %145, float %147) #6, !dbg !49
+  %150 = select i1 %148, float 0.000000e+00, float %149, !dbg !61
+  %151 = fmul float %150, %146, !dbg !50
+  %152 = fadd float %132, %151, !dbg !51
+  %153 = fadd float %137, %143, !dbg !52
+  %154 = fmul float %146, %146, !dbg !53
+  %155 = fmul float %154, 4.000000e+00, !dbg !56
+  %156 = fmul float %150, %155, !dbg !54
+  %157 = fadd float %153, %156, !dbg !55
+  %158 = bitcast float %152 to i32, !dbg !57
+  %159 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %158, i32 8, i32 31), !dbg !57
+  %160 = bitcast i32 %159 to float, !dbg !57
+  %161 = bitcast float %157 to i32, !dbg !57
+  %162 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %161, i32 8, i32 31), !dbg !57
+  %163 = bitcast i32 %162 to float, !dbg !57
+  %164 = bitcast float %147 to i32, !dbg !57
+  %165 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %164, i32 8, i32 31), !dbg !57
+  %166 = bitcast i32 %165 to float, !dbg !57
+  %167 = fsub float %160, %152, !dbg !42
+  %168 = fadd float %147, %166, !dbg !59
+  %169 = fcmp oeq float %168, 0.000000e+00, !dbg !60
+  %170 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %166, float %168) #6, !dbg !49
+  %171 = select i1 %169, float 0.000000e+00, float %170, !dbg !61
+  %172 = fmul float %171, %167, !dbg !50
+  %173 = fadd float %152, %172, !dbg !51
+  %174 = fadd float %157, %163, !dbg !52
+  %175 = fmul float %167, %167, !dbg !53
+  %176 = fmul float %147, %175, !dbg !56
+  %177 = fmul float %171, %176, !dbg !54
+  %178 = fadd float %174, %177, !dbg !55
+  %179 = bitcast float %173 to i32, !dbg !57
+  %180 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %179, i32 4, i32 31), !dbg !57
+  %181 = bitcast i32 %180 to float, !dbg !57
+  %182 = bitcast float %178 to i32, !dbg !57
+  %183 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %182, i32 4, i32 31), !dbg !57
+  %184 = bitcast i32 %183 to float, !dbg !57
+  %185 = bitcast float %168 to i32, !dbg !57
+  %186 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %185, i32 4, i32 31), !dbg !57
+  %187 = bitcast i32 %186 to float, !dbg !57
+  %188 = fsub float %181, %173, !dbg !42
+  %189 = fadd float %168, %187, !dbg !59
+  %190 = fcmp oeq float %189, 0.000000e+00, !dbg !60
+  %191 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %187, float %189) #6, !dbg !49
+  %192 = select i1 %190, float 0.000000e+00, float %191, !dbg !61
+  %193 = fmul float %192, %188, !dbg !50
+  %194 = fadd float %173, %193, !dbg !51
+  %195 = fadd float %178, %184, !dbg !52
+  %196 = fmul float %188, %188, !dbg !53
+  %197 = fmul float %168, %196, !dbg !56
+  %198 = fmul float %192, %197, !dbg !54
+  %199 = fadd float %195, %198, !dbg !55
+  %200 = bitcast float %194 to i32, !dbg !57
+  %201 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %200, i32 2, i32 31), !dbg !57
+  %202 = bitcast i32 %201 to float, !dbg !57
+  %203 = bitcast float %199 to i32, !dbg !57
+  %204 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %203, i32 2, i32 31), !dbg !57
+  %205 = bitcast i32 %204 to float, !dbg !57
+  %206 = bitcast float %189 to i32, !dbg !57
+  %207 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %206, i32 2, i32 31), !dbg !57
+  %208 = bitcast i32 %207 to float, !dbg !57
+  %209 = fsub float %202, %194, !dbg !42
+  %210 = fadd float %189, %208, !dbg !59
+  %211 = fcmp oeq float %210, 0.000000e+00, !dbg !60
+  %212 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %208, float %210) #6, !dbg !49
+  %213 = select i1 %211, float 0.000000e+00, float %212, !dbg !61
+  %214 = fmul float %213, %209, !dbg !50
+  %215 = fadd float %194, %214, !dbg !51
+  %216 = fadd float %199, %205, !dbg !52
+  %217 = fmul float %209, %209, !dbg !53
+  %218 = fmul float %189, %217, !dbg !56
+  %219 = fmul float %213, %218, !dbg !54
+  %220 = fadd float %216, %219, !dbg !55
+  %221 = bitcast float %215 to i32, !dbg !57
+  %222 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %221, i32 1, i32 31), !dbg !57
+  %223 = bitcast i32 %222 to float, !dbg !57
+  %224 = bitcast float %220 to i32, !dbg !57
+  %225 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %224, i32 1, i32 31), !dbg !57
+  %226 = bitcast i32 %225 to float, !dbg !57
+  %227 = bitcast float %210 to i32, !dbg !57
+  %228 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %227, i32 1, i32 31), !dbg !57
+  %229 = bitcast i32 %228 to float, !dbg !57
+  %230 = fsub float %223, %215, !dbg !42
+  %231 = fadd float %210, %229, !dbg !59
+  %232 = fcmp oeq float %231, 0.000000e+00, !dbg !60
+  %233 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %229, float %231) #6, !dbg !49
+  %234 = select i1 %232, float 0.000000e+00, float %233, !dbg !61
+  %235 = fmul float %234, %230, !dbg !50
+  %236 = fadd float %215, %235, !dbg !51
+  %237 = fadd float %220, %226, !dbg !52
+  %238 = fmul float %230, %230, !dbg !53
+  %239 = fmul float %210, %238, !dbg !56
+  %240 = fmul float %234, %239, !dbg !54
+  %241 = fadd float %237, %240, !dbg !55
+  %242 = icmp eq i32 %10, 0, !dbg !57
+  %243 = shl nuw nsw i32 %13, 1, !dbg !57
+  %244 = or i32 %243, %15, !dbg !57
+  %245 = zext nneg i32 %244 to i64, !dbg !57
+  %246 = getelementptr float, ptr addrspace(3) @global_smem, i64 %245, !dbg !57
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %246, float %236, i1 %242) #6, !dbg !57
+  %247 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), i64 %245, !dbg !57
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %247, float %241, i1 %242) #6, !dbg !57
+  %248 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 32), i64 %245, !dbg !57
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %248, float %231, i1 %242) #6, !dbg !57
+  tail call void @llvm.nvvm.barrier0(), !dbg !57
+  %249 = icmp slt i32 %9, 4, !dbg !57
+  %250 = sext i32 %9 to i64, !dbg !57
+  %251 = getelementptr float, ptr addrspace(3) @global_smem, i64 %250, !dbg !57
+  %252 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %251, i1 %249) #6, !dbg !57
+  %253 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), i64 %250, !dbg !57
+  %254 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %253, i1 %249) #6, !dbg !57
+  %255 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 32), i64 %250, !dbg !57
+  %256 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %255, i1 %249) #6, !dbg !57
+  %257 = bitcast float %252 to i32, !dbg !57
+  %258 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %257, i32 1, i32 31), !dbg !57
+  %259 = bitcast i32 %258 to float, !dbg !57
+  %260 = bitcast float %254 to i32, !dbg !57
+  %261 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %260, i32 1, i32 31), !dbg !57
+  %262 = bitcast i32 %261 to float, !dbg !57
+  %263 = bitcast float %256 to i32, !dbg !57
+  %264 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %263, i32 1, i32 31), !dbg !57
+  %265 = bitcast i32 %264 to float, !dbg !57
+  %266 = fsub float %259, %252, !dbg !42
+  %267 = fadd float %256, %265, !dbg !59
+  %268 = fcmp oeq float %267, 0.000000e+00, !dbg !60
+  %269 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %265, float %267) #6, !dbg !49
+  %270 = select i1 %268, float 0.000000e+00, float %269, !dbg !61
+  %271 = fmul float %266, %270, !dbg !50
+  %272 = fadd float %252, %271, !dbg !51
+  %273 = fadd float %254, %262, !dbg !52
+  %274 = fmul float %266, %266, !dbg !53
+  %275 = fmul float %256, %274, !dbg !56
+  %276 = fmul float %275, %270, !dbg !54
+  %277 = fadd float %273, %276, !dbg !55
+  %278 = icmp eq i32 %14, 0, !dbg !57
+  %279 = and i1 %249, %278, !dbg !57
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %251, float %272, i1 %279) #6, !dbg !57
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %253, float %277, i1 %279) #6, !dbg !57
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %255, float %267, i1 %279) #6, !dbg !57
+  tail call void @llvm.nvvm.barrier0(), !dbg !57
+  %280 = zext nneg i32 %243 to i64, !dbg !57
+  %281 = getelementptr float, ptr addrspace(3) @global_smem, i64 %280, !dbg !57
+  %282 = load float, ptr addrspace(3) %281, align 4, !dbg !57
+  %283 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), i64 %280, !dbg !57
+  %284 = load float, ptr addrspace(3) %283, align 4, !dbg !57
+  %285 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %36, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !62
+  %286 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_first.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %50, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !63
+  %287 = extractvalue { i32, i32 } %286, 0, !dbg !63
+  %288 = extractvalue { i32, i32 } %286, 1, !dbg !63
+  %289 = trunc i32 %287 to i16, !dbg !63
+  %extelt.offset2 = lshr i32 %287, 16, !dbg !63
+  %290 = trunc i32 %extelt.offset2 to i16, !dbg !63
+  %291 = trunc i32 %288 to i16, !dbg !63
+  %extelt.offset3 = lshr i32 %288, 16, !dbg !63
+  %292 = trunc i32 %extelt.offset3 to i16, !dbg !63
+  %293 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %289) #6, !dbg !64
+  %294 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %290) #6, !dbg !64
+  %295 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %291) #6, !dbg !64
+  %296 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %292) #6, !dbg !64
+  %297 = zext nneg i32 %18 to i64, !dbg !65
+  %298 = getelementptr float, ptr addrspace(1) %4, i64 %297, !dbg !65
+  %299 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_last.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %298, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !66
+  br i1 %66, label %300, label %301, !dbg !67
+300:                                              ; preds = %68
+  tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 883, ptr nonnull @assertFunc_1, i64 1), !dbg !67
+  br label %301, !dbg !67
+301:                                              ; preds = %300, %68
+  %302 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %74, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68
+  %303 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %284, float 2.560000e+02) #6, !dbg !69
+  %304 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %284, float 2.560000e+02) #6, !dbg !69
+  %305 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %284, float 2.560000e+02) #6, !dbg !69
+  %306 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %284, float 2.560000e+02) #6, !dbg !69
+  %307 = fadd float %303, 0x3EE4F8B580000000, !dbg !70
+  %308 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
+  %.not.i = icmp eq i32 %308, 0, !dbg !71
+  br i1 %.not.i, label %311, label %309, !dbg !71
+309:                                              ; preds = %301
+  %310 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %307), !dbg !71
+  br label %__nv_rsqrtf.exit, !dbg !71
+311:                                              ; preds = %301
+  %312 = tail call float @llvm.nvvm.rsqrt.approx.f(float %307), !dbg !71
+  br label %__nv_rsqrtf.exit, !dbg !71
+__nv_rsqrtf.exit:                                 ; preds = %309, %311
+  %.0.i = phi float [ %310, %309 ], [ %312, %311 ], !dbg !71
+  %313 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
+  %314 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
+  %315 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
+  %316 = extractvalue { i32, i32, i32, i32 } %302, 3, !dbg !68
+  %317 = bitcast i32 %316 to float, !dbg !68
+  %318 = extractvalue { i32, i32, i32, i32 } %285, 3, !dbg !62
+  %319 = bitcast i32 %318 to float, !dbg !62
+  %320 = fadd float %319, %317, !dbg !72
+  %321 = fadd float %296, %320, !dbg !73
+  %322 = fsub float %321, %282, !dbg !74
+  %323 = extractvalue { i32, i32, i32, i32 } %302, 2, !dbg !68
+  %324 = bitcast i32 %323 to float, !dbg !68
+  %325 = extractvalue { i32, i32, i32, i32 } %285, 2, !dbg !62
+  %326 = bitcast i32 %325 to float, !dbg !62
+  %327 = fadd float %326, %324, !dbg !72
+  %328 = fadd float %295, %327, !dbg !73
+  %329 = fsub float %328, %282, !dbg !74
+  %330 = extractvalue { i32, i32, i32, i32 } %302, 1, !dbg !68
+  %331 = bitcast i32 %330 to float, !dbg !68
+  %332 = extractvalue { i32, i32, i32, i32 } %285, 1, !dbg !62
+  %333 = bitcast i32 %332 to float, !dbg !62
+  %334 = fadd float %333, %331, !dbg !72
+  %335 = fadd float %294, %334, !dbg !73
+  %336 = fsub float %335, %282, !dbg !74
+  %337 = extractvalue { i32, i32, i32, i32 } %302, 0, !dbg !68
+  %338 = bitcast i32 %337 to float, !dbg !68
+  %339 = extractvalue { i32, i32, i32, i32 } %285, 0, !dbg !62
+  %340 = bitcast i32 %339 to float, !dbg !62
+  %341 = fadd float %340, %338, !dbg !72
+  %342 = fadd float %293, %341, !dbg !73
+  %343 = fsub float %342, %282, !dbg !74
+  %344 = extractvalue { i32, i32 } %299, 0, !dbg !66
+  %345 = extractvalue { i32, i32 } %299, 1, !dbg !66
+  %346 = fmul float %343, %.0.i, !dbg !75
+  %347 = fmul float %336, %.0.i, !dbg !75
+  %348 = fmul float %329, %.0.i, !dbg !75
+  %349 = fmul float %322, %.0.i, !dbg !75
+  tail call void @llvm.nvvm.barrier0(), !dbg !76
+  %350 = getelementptr float, ptr addrspace(3) @global_smem, i64 %297, !dbg !76
+  %351 = insertelement <2 x i32> undef, i32 %344, i64 0, !dbg !76
+  %352 = insertelement <2 x i32> %351, i32 %345, i64 1, !dbg !76
+  store <2 x i32> %352, ptr addrspace(3) %350, align 8, !dbg !76
+  tail call void @llvm.nvvm.barrier0(), !dbg !76
+  %353 = getelementptr float, ptr addrspace(3) @global_smem, i64 %72, !dbg !76
+  %354 = load float, ptr addrspace(3) %353, align 16, !dbg !76
+  %355 = getelementptr inbounds <4 x float>, ptr addrspace(3) %353, i64 0, i64 1, !dbg !76
+  %356 = load float, ptr addrspace(3) %355, align 4, !dbg !76
+  %357 = getelementptr inbounds <4 x float>, ptr addrspace(3) %353, i64 0, i64 2, !dbg !76
+  %358 = load float, ptr addrspace(3) %357, align 8, !dbg !76
+  %359 = getelementptr inbounds <4 x float>, ptr addrspace(3) %353, i64 0, i64 3, !dbg !76
+  %360 = load float, ptr addrspace(3) %359, align 4, !dbg !76
+  %361 = fmul float %346, %354, !dbg !76
+  %362 = fmul float %347, %356, !dbg !76
+  %363 = fmul float %348, %358, !dbg !76
+  %364 = fmul float %349, %360, !dbg !76
+  %365 = getelementptr i16, ptr addrspace(1) %5, i64 %49, !dbg !77
+  %366 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %361) #6, !dbg !78
+  %367 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %362) #6, !dbg !78
+  %368 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %363) #6, !dbg !78
+  %369 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %364) #6, !dbg !78
+  %370 = insertelement <2 x i16> undef, i16 %366, i64 0, !dbg !78
+  %371 = insertelement <2 x i16> %370, i16 %367, i64 1, !dbg !78
+  %372 = bitcast <2 x i16> %371 to i32, !dbg !78
+  %373 = insertelement <2 x i16> undef, i16 %368, i64 0, !dbg !78
+  %374 = insertelement <2 x i16> %373, i16 %369, i64 1, !dbg !78
+  %375 = bitcast <2 x i16> %374 to i32, !dbg !78
+  tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %372, i32 %375, ptr addrspace(1) %365, i1 true) #6, !dbg !78
+  ret void, !dbg !79
+}
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier0() #2
+; Function Attrs: alwaysinline nounwind
+; __nv_rsqrtf(%x): returns the result of one of the two NVVM approximate
+; reciprocal-square-root intrinsics applied to %x.
+;   - __nvvm_reflect(@.str) != 0  ->  llvm.nvvm.rsqrt.approx.ftz.f
+;   - __nvvm_reflect(@.str) == 0  ->  llvm.nvvm.rsqrt.approx.f
+; NOTE(review): @.str is defined outside this excerpt; presumably it is the
+; "__CUDA_FTZ" query string (the module sets the "nvvm-reflect-ftz" flag) --
+; confirm against the module's global definitions.
+define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
+entry:
+  %reflect = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
+  %use.ftz = icmp ne i32 %reflect, 0
+  br i1 %use.ftz, label %ftz, label %no.ftz
+ftz:                                              ; preds = %entry
+  ; Non-zero reflect value: take the flush-to-zero variant.
+  %rsqrt.ftz = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
+  br label %done
+no.ftz:                                           ; preds = %entry
+  ; Zero reflect value: take the non-ftz variant.
+  %rsqrt.plain = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
+  br label %done
+done:                                             ; preds = %no.ftz, %ftz
+  ; Merge whichever path executed into the single return value.
+  %result = phi float [ %rsqrt.ftz, %ftz ], [ %rsqrt.plain, %no.ftz ]
+  ret float %result
+}
+declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.f(float) #5
+attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+attributes #2 = { convergent nocallback nounwind }
+attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #6 = { nounwind }
+!llvm.module.flags = !{!0, !1}
+!llvm.dbg.cu = !{!2}
+!nvvm.annotations = !{!4, !5, !5, !4}
+!llvm.ident = !{!6}
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!3 = !DIFile(filename: "cpn3lawg65lpi63gv6c6pn4oikhg6qva2h2qjdpxe6qj4lvttwez.py", directory: "/tmp/torchinductor_root/pn")
+!4 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"kernel", i32 1}
+!5 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"maxntidx", i32 128}
+!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
+!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6de7de", linkageName: "triton__0d1d2d3d4d5d6de7de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
+!9 = !{}
+!10 = !DILocation(line: 22, column: 44, scope: !7)
+!11 = !DILocation(line: 24, column: 33, scope: !7)
+!12 = !DILocation(line: 21, column: 28, scope: !7)
+!13 = !DILocation(line: 21, column: 33, scope: !7)
+!14 = !DILocation(line: 22, column: 23, scope: !7)
+!15 = !DILocation(line: 26, column: 30, scope: !7)
+!16 = !DILocation(line: 26, column: 35, scope: !7)
+!17 = !DILocation(line: 27, column: 18, scope: !7)
+!18 = !DILocation(line: 35, column: 44, scope: !7)
+!19 = !DILocation(line: 35, column: 40, scope: !7)
+!20 = !DILocation(line: 35, column: 34, scope: !7)
+!21 = !DILocation(line: 35, column: 50, scope: !7)
+!22 = !DILocation(line: 36, column: 44, scope: !7)
+!23 = !DILocation(line: 36, column: 40, scope: !7)
+!24 = !DILocation(line: 36, column: 34, scope: !7)
+!25 = !DILocation(line: 36, column: 50, scope: !7)
+!26 = !DILocation(line: 36, column: 101, scope: !7)
+!27 = !DILocation(line: 37, column: 22, scope: !7)
+!28 = !DILocation(line: 38, column: 22, scope: !7)
+!29 = !DILocation(line: 39, column: 36, scope: !7)
+!30 = !DILocation(line: 40, column: 40, scope: !7)
+!31 = !DILocation(line: 40, column: 55, scope: !7)
+!32 = !DILocation(line: 41, column: 44, scope: !7)
+!33 = !DILocation(line: 41, column: 40, scope: !7)
+!34 = !DILocation(line: 41, column: 34, scope: !7)
+!35 = !DILocation(line: 41, column: 52, scope: !7)
+!36 = !DILocation(line: 42, column: 22, scope: !7)
+!37 = !DILocation(line: 44, column: 22, scope: !7)
+!38 = !DILocation(line: 98, column: 22, scope: !39, inlinedAt: !41)
+!39 = distinct !DILexicalBlockFile(scope: !7, file: !40, discriminator: 0)
+!40 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
+!41 = !DILocation(line: 47, column: 41, scope: !39)
+!42 = !DILocation(line: 108, column: 21, scope: !43, inlinedAt: !44)
+!43 = distinct !DILexicalBlockFile(scope: !39, file: !40, discriminator: 0)
+!44 = !DILocation(line: 120, column: 46, scope: !43, inlinedAt: !45)
+!45 = !DILocation(line: 53, column: 44, scope: !43)
+!46 = !DILocation(line: 101, column: 22, scope: !39, inlinedAt: !41)
+!47 = !DILocation(line: 101, column: 30, scope: !39, inlinedAt: !41)
+!48 = !DILocation(line: 101, column: 13, scope: !39, inlinedAt: !41)
+!49 = !DILocation(line: 110, column: 60, scope: !43, inlinedAt: !44)
+!50 = !DILocation(line: 112, column: 25, scope: !43, inlinedAt: !44)
+!51 = !DILocation(line: 112, column: 17, scope: !43, inlinedAt: !44)
+!52 = !DILocation(line: 113, column: 15, scope: !43, inlinedAt: !44)
+!53 = !DILocation(line: 113, column: 30, scope: !43, inlinedAt: !44)
+!54 = !DILocation(line: 113, column: 49, scope: !43, inlinedAt: !44)
+!55 = !DILocation(line: 113, column: 22, scope: !43, inlinedAt: !44)
+!56 = !DILocation(line: 113, column: 38, scope: !43, inlinedAt: !44)
+!57 = !DILocation(line: 120, column: 46, scope: !39, inlinedAt: !58)
+!58 = !DILocation(line: 53, column: 44, scope: !39)
+!59 = !DILocation(line: 109, column: 28, scope: !43, inlinedAt: !44)
+!60 = !DILocation(line: 110, column: 39, scope: !43, inlinedAt: !44)
+!61 = !DILocation(line: 110, column: 49, scope: !43, inlinedAt: !44)
+!62 = !DILocation(line: 62, column: 51, scope: !7)
+!63 = !DILocation(line: 63, column: 51, scope: !7)
+!64 = !DILocation(line: 63, column: 103, scope: !7)
+!65 = !DILocation(line: 64, column: 35, scope: !7)
+!66 = !DILocation(line: 64, column: 40, scope: !7)
+!67 = !DILocation(line: 68, column: 57, scope: !7)
+!68 = !DILocation(line: 69, column: 54, scope: !7)
+!69 = !DILocation(line: 75, column: 24, scope: !7)
+!70 = !DILocation(line: 77, column: 24, scope: !7)
+!71 = !DILocation(line: 78, column: 30, scope: !7)
+!72 = !DILocation(line: 70, column: 24, scope: !7)
+!73 = !DILocation(line: 72, column: 24, scope: !7)
+!74 = !DILocation(line: 73, column: 24, scope: !7)
+!75 = !DILocation(line: 79, column: 24, scope: !7)
+!76 = !DILocation(line: 80, column: 24, scope: !7)
+!77 = !DILocation(line: 82, column: 29, scope: !7)
+!78 = !DILocation(line: 82, column: 52, scope: !7)
+!79 = !DILocation(line: 58, column: 4, scope: !7)

.triton/dump/473cf6e25c3e63117cd59fc0ed04b89f/triton_.ttgir ADDED Viewed

	@@ -0,0 +1,134 @@

+#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [2, 2], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 2], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
+  tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+    %cst = arith.constant dense<512> : tensor<2x1xi32, #blocked>
+    %cst_0 = arith.constant dense<256> : tensor<1x256xi32, #blocked>
+    %cst_1 = arith.constant dense<256> : tensor<1x256xi32, #blocked1>
+    %cst_2 = arith.constant dense<256> : tensor<2x1xi32, #blocked>
+    %cst_3 = arith.constant dense<1.000000e+00> : tensor<1x256xf32, #blocked>
+    %cst_4 = arith.constant dense<0.000000e+00> : tensor<1x256xf32, #blocked>
+    %cst_5 = arith.constant dense<256> : tensor<2x1xi64, #blocked>
+    %cst_6 = arith.constant dense<50257> : tensor<2x1xi64, #blocked>
+    %cst_7 = arith.constant dense<0> : tensor<2x1xi64, #blocked>
+    %cst_8 = arith.constant dense<0> : tensor<2x1xi64, #blocked2>
+    %cst_9 = arith.constant dense<50257> : tensor<2x1xi64, #blocked2>
+    %cst_10 = arith.constant 0.000000e+00 : f32
+    %cst_11 = arith.constant dense<9.99999974E-6> : tensor<2x1xf32, #blocked>
+    %cst_12 = arith.constant dense<2.560000e+02> : tensor<2x1xf32, #blocked>
+    %cst_13 = arith.constant dense<0.000000e+00> : tensor<2x256xf32, #blocked>
+    %cst_14 = arith.constant dense<0.000000e+00> : tensor<2x256xbf16, #blocked>
+    %cst_15 = arith.constant dense<0.000000e+00> : tensor<1x256xf32, #blocked1>
+    %c2_i32 = arith.constant 2 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c2_i32 : i32
+    %2 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
+    %3 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>
+    %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<2xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<2x1xi32, #blocked>
+    %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<2xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>) -> tensor<2x1xi32, #blocked2>
+    %6 = tt.splat %1 : (i32) -> tensor<2x1xi32, #blocked>
+    %7 = tt.splat %1 : (i32) -> tensor<2x1xi32, #blocked2>
+    %8 = arith.addi %6, %4 : tensor<2x1xi32, #blocked>
+    %9 = arith.addi %7, %5 : tensor<2x1xi32, #blocked2>
+    %10 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
+    %11 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>
+    %12 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x256xi32, #blocked>
+    %13 = tt.expand_dims %11 {axis = 0 : i32} : (tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>) -> tensor<1x256xi32, #blocked1>
+    %14 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<2x1x!tt.ptr<i64, 1>, #blocked>
+    %15 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<2x1x!tt.ptr<i64, 1>, #blocked2>
+    %16 = tt.addptr %14, %8 : tensor<2x1x!tt.ptr<i64, 1>, #blocked>, tensor<2x1xi32, #blocked>
+    %17 = tt.addptr %15, %9 : tensor<2x1x!tt.ptr<i64, 1>, #blocked2>, tensor<2x1xi32, #blocked2>
+    %18 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x1xi64, #blocked>
+    %19 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x1xi64, #blocked2>
+    %20 = arith.remsi %8, %cst : tensor<2x1xi32, #blocked>
+    %21 = arith.cmpi slt, %12, %cst_0 : tensor<1x256xi32, #blocked>
+    %22 = arith.cmpi slt, %13, %cst_1 : tensor<1x256xi32, #blocked1>
+    %23 = arith.muli %20, %cst_2 : tensor<2x1xi32, #blocked>
+    %24 = tt.broadcast %12 : (tensor<1x256xi32, #blocked>) -> tensor<2x256xi32, #blocked>
+    %25 = tt.broadcast %23 : (tensor<2x1xi32, #blocked>) -> tensor<2x256xi32, #blocked>
+    %26 = arith.addi %24, %25 : tensor<2x256xi32, #blocked>
+    %27 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<2x256x!tt.ptr<f32, 1>, #blocked>
+    %28 = tt.addptr %27, %26 : tensor<2x256x!tt.ptr<f32, 1>, #blocked>, tensor<2x256xi32, #blocked>
+    %29 = tt.broadcast %21 : (tensor<1x256xi1, #blocked>) -> tensor<2x256xi1, #blocked>
+    %30 = tt.load %28, %29, %cst_13 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xf32, #blocked>
+    %31 = arith.muli %8, %cst_2 : tensor<2x1xi32, #blocked>
+    %32 = tt.broadcast %31 : (tensor<2x1xi32, #blocked>) -> tensor<2x256xi32, #blocked>
+    %33 = arith.addi %24, %32 : tensor<2x256xi32, #blocked>
+    %34 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<2x256x!tt.ptr<bf16, 1>, #blocked>
+    %35 = tt.addptr %34, %33 : tensor<2x256x!tt.ptr<bf16, 1>, #blocked>, tensor<2x256xi32, #blocked>
+    %36 = tt.load %35, %29, %cst_14 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xbf16, #blocked>
+    %37 = arith.extf %36 : tensor<2x256xbf16, #blocked> to tensor<2x256xf32, #blocked>
+    %38 = arith.addi %18, %cst_6 : tensor<2x1xi64, #blocked>
+    %39 = arith.addi %19, %cst_9 : tensor<2x1xi64, #blocked2>
+    %40 = arith.cmpi slt, %18, %cst_7 : tensor<2x1xi64, #blocked>
+    %41 = arith.cmpi slt, %19, %cst_8 : tensor<2x1xi64, #blocked2>
+    %42 = arith.select %40, %38, %18 : tensor<2x1xi1, #blocked>, tensor<2x1xi64, #blocked>
+    %43 = arith.select %41, %39, %19 : tensor<2x1xi1, #blocked2>, tensor<2x1xi64, #blocked2>
+    %44 = arith.cmpi sge, %43, %cst_8 : tensor<2x1xi64, #blocked2>
+    %45 = arith.cmpi slt, %43, %cst_9 : tensor<2x1xi64, #blocked2>
+    %46 = arith.andi %44, %45 : tensor<2x1xi1, #blocked2>
+    tt.assert %46, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<2x1xi1, #blocked2>
+    %47 = arith.muli %42, %cst_5 : tensor<2x1xi64, #blocked>
+    %48 = tt.broadcast %47 : (tensor<2x1xi64, #blocked>) -> tensor<2x256xi64, #blocked>
+    %49 = arith.extsi %12 : tensor<1x256xi32, #blocked> to tensor<1x256xi64, #blocked>
+    %50 = tt.broadcast %49 : (tensor<1x256xi64, #blocked>) -> tensor<2x256xi64, #blocked>
+    %51 = arith.addi %50, %48 : tensor<2x256xi64, #blocked>
+    %52 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<2x256x!tt.ptr<f32, 1>, #blocked>
+    %53 = tt.addptr %52, %51 : tensor<2x256x!tt.ptr<f32, 1>, #blocked>, tensor<2x256xi64, #blocked>
+    %54 = tt.load %53, %29, %cst_13 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xf32, #blocked>
+    %55 = arith.addf %54, %30 : tensor<2x256xf32, #blocked>
+    %56 = arith.addf %55, %37 : tensor<2x256xf32, #blocked>
+    %57 = arith.addf %56, %cst_13 : tensor<2x256xf32, #blocked>
+    %58 = arith.subf %56, %57 : tensor<2x256xf32, #blocked>
+    %59 = arith.mulf %56, %58 : tensor<2x256xf32, #blocked>
+    %60 = arith.addf %59, %cst_13 : tensor<2x256xf32, #blocked>
+    %61 = arith.select %29, %57, %cst_13 : tensor<2x256xi1, #blocked>, tensor<2x256xf32, #blocked>
+    %62 = arith.select %29, %60, %cst_13 : tensor<2x256xi1, #blocked>, tensor<2x256xf32, #blocked>
+    %63 = arith.select %21, %cst_3, %cst_4 : tensor<1x256xi1, #blocked>, tensor<1x256xf32, #blocked>
+    %64 = tt.broadcast %63 : (tensor<1x256xf32, #blocked>) -> tensor<2x256xf32, #blocked>
+    %65:3 = "tt.reduce"(%61, %62, %64) <{axis = 1 : i32}> ({
+    ^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32):
+      %90 = arith.subf %arg11, %arg8 : f32
+      %91 = arith.addf %arg10, %arg13 : f32
+      %92 = arith.cmpf oeq, %91, %cst_10 : f32
+      %93 = arith.divf %arg13, %91 : f32
+      %94 = arith.select %92, %cst_10, %93 : f32
+      %95 = arith.mulf %90, %94 : f32
+      %96 = arith.addf %arg8, %95 : f32
+      %97 = arith.addf %arg9, %arg12 : f32
+      %98 = arith.mulf %90, %90 : f32
+      %99 = arith.mulf %98, %arg10 : f32
+      %100 = arith.mulf %99, %94 : f32
+      %101 = arith.addf %97, %100 : f32
+      tt.reduce.return %96, %101, %91 : f32, f32, f32
+    }) : (tensor<2x256xf32, #blocked>, tensor<2x256xf32, #blocked>, tensor<2x256xf32, #blocked>) -> (tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>)
+    %66 = tt.expand_dims %65#0 {axis = 1 : i32} : (tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<2x1xf32, #blocked>
+    %67 = tt.expand_dims %65#1 {axis = 1 : i32} : (tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<2x1xf32, #blocked>
+    %68 = tt.load %28, %29, %cst_13 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xf32, #blocked>
+    %69 = tt.load %35, %29, %cst_14 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<2x256xbf16, #blocked>
+    %70 = arith.extf %69 : tensor<2x256xbf16, #blocked> to tensor<2x256xf32, #blocked>
+    %71 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<1x256x!tt.ptr<f32, 1>, #blocked1>
+    %72 = tt.addptr %71, %13 : tensor<1x256x!tt.ptr<f32, 1>, #blocked1>, tensor<1x256xi32, #blocked1>
+    %73 = tt.load %72, %22, %cst_15 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32, #blocked1>
+    tt.assert %46, "index out of bounds: 0 <= tmp16 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<2x1xi1, #blocked2>
+    %74 = tt.load %53, %29, %cst_13 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<2x256xf32, #blocked>
+    %75 = arith.addf %74, %68 : tensor<2x256xf32, #blocked>
+    %76 = arith.addf %75, %70 : tensor<2x256xf32, #blocked>
+    %77 = tt.broadcast %66 : (tensor<2x1xf32, #blocked>) -> tensor<2x256xf32, #blocked>
+    %78 = arith.subf %76, %77 : tensor<2x256xf32, #blocked>
+    %79 = arith.divf %67, %cst_12 : tensor<2x1xf32, #blocked>
+    %80 = arith.addf %79, %cst_11 : tensor<2x1xf32, #blocked>
+    %81 = tt.extern_elementwise %80 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32, #blocked>) -> tensor<2x1xf32, #blocked>
+    %82 = tt.broadcast %81 : (tensor<2x1xf32, #blocked>) -> tensor<2x256xf32, #blocked>
+    %83 = arith.mulf %78, %82 : tensor<2x256xf32, #blocked>
+    %84 = triton_gpu.convert_layout %73 : (tensor<1x256xf32, #blocked1>) -> tensor<1x256xf32, #blocked>
+    %85 = tt.broadcast %84 : (tensor<1x256xf32, #blocked>) -> tensor<2x256xf32, #blocked>
+    %86 = arith.mulf %83, %85 : tensor<2x256xf32, #blocked>
+    %87 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<2x256x!tt.ptr<bf16, 1>, #blocked>
+    %88 = tt.addptr %87, %33 : tensor<2x256x!tt.ptr<bf16, 1>, #blocked>, tensor<2x256xi32, #blocked>
+    %89 = arith.truncf %86 : tensor<2x256xf32, #blocked> to tensor<2x256xbf16, #blocked>
+    tt.store %88, %89, %29 {cache = 1 : i32, evict = 1 : i32} : tensor<2x256xbf16, #blocked>
+    tt.return
+  }
+}

.triton/dump/473cf6e25c3e63117cd59fc0ed04b89f/triton_.ttir ADDED Viewed

	@@ -0,0 +1,113 @@

+// Triton IR for one fused kernel. Each program instance handles a 2 x 256 tile:
+// it sums three inputs element-wise, reduces every row to a mean and a sum of
+// squared deviations (M2), and stores (x - mean) * rsqrt(M2 / 256 + eps) * scale
+// as bf16. No bias term is added anywhere in this function.
+// NOTE(review): the i64 index checked against 50257 looks like a row gather from
+// an embedding-style table — confirm against the originating Python kernel.
+module {
+  // Arguments, as they are used in the body below:
+  //   %arg0  i64 indices, read at offset `row` (unmasked load).
+  //   %arg1  f32 data addressed as index * 256 + col.
+  //   %arg2  f32 data addressed as (row % 512) * 256 + col.
+  //   %arg3  bf16 data addressed as row * 256 + col.
+  //   %arg4  f32 per-column scale addressed by col.
+  //   %arg5  bf16 output addressed as row * 256 + col.
+  //   %arg6, %arg7  i32 values that are never referenced in this body.
+  tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+    // Constants: zero fill values for masked loads, the 1.0/0.0 reduction weights,
+    // the index bound 50257, the row width 256 (integer and float forms), the
+    // epsilon 9.99999974E-6 and the modulus 512.
+    %cst = arith.constant dense<0.000000e+00> : tensor<2x256xbf16>
+    %cst_0 = arith.constant dense<1.000000e+00> : tensor<1x256xf32>
+    %cst_1 = arith.constant dense<0.000000e+00> : tensor<1x256xf32>
+    %cst_2 = arith.constant 0.000000e+00 : f32
+    %cst_3 = arith.constant dense<256> : tensor<2x1xi64>
+    %cst_4 = arith.constant dense<50257> : tensor<2x1xi64>
+    %cst_5 = arith.constant dense<0> : tensor<2x1xi64>
+    %cst_6 = arith.constant dense<9.99999974E-6> : tensor<2x1xf32>
+    %cst_7 = arith.constant dense<2.560000e+02> : tensor<2x1xf32>
+    %cst_8 = arith.constant dense<0.000000e+00> : tensor<2x256xf32>
+    %cst_9 = arith.constant dense<256> : tensor<2x1xi32>
+    %cst_10 = arith.constant dense<256> : tensor<1x256xi32>
+    %cst_11 = arith.constant dense<512> : tensor<2x1xi32>
+    %c2_i32 = arith.constant 2 : i32
+    // %5 = row index (2x1): program_id * 2 + {0, 1}.
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c2_i32 : i32
+    %2 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32>
+    %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<2xi32>) -> tensor<2x1xi32>
+    %4 = tt.splat %1 : (i32) -> tensor<2x1xi32>
+    %5 = arith.addi %4, %3 : tensor<2x1xi32>
+    // %7 = column index (1x256): 0..255.
+    %6 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
+    %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<256xi32>) -> tensor<1x256xi32>
+    // %10 = i64 index for each row, loaded from %arg0 + row without a mask.
+    %8 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<2x1x!tt.ptr<i64, 1>>
+    %9 = tt.addptr %8, %5 : tensor<2x1x!tt.ptr<i64, 1>>, tensor<2x1xi32>
+    %10 = tt.load %9 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x1xi64>
+    // %12 = column mask (col < 256); %19 below is the same mask broadcast to 2x256.
+    // %20 = masked f32 load from %arg2 at (row % 512) * 256 + col, 0.0 where masked off.
+    %11 = arith.remsi %5, %cst_11 : tensor<2x1xi32>
+    %12 = arith.cmpi slt, %7, %cst_10 : tensor<1x256xi32>
+    %13 = arith.muli %11, %cst_9 : tensor<2x1xi32>
+    %14 = tt.broadcast %7 : (tensor<1x256xi32>) -> tensor<2x256xi32>
+    %15 = tt.broadcast %13 : (tensor<2x1xi32>) -> tensor<2x256xi32>
+    %16 = arith.addi %14, %15 : tensor<2x256xi32>
+    %17 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<2x256x!tt.ptr<f32, 1>>
+    %18 = tt.addptr %17, %16 : tensor<2x256x!tt.ptr<f32, 1>>, tensor<2x256xi32>
+    %19 = tt.broadcast %12 : (tensor<1x256xi1>) -> tensor<2x256xi1>
+    %20 = tt.load %18, %19, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xf32>
+    // %23 = row * 256 + col (also reused for the output address at the end).
+    // %27 = masked bf16 load from %arg3 at that offset, widened to f32.
+    %21 = arith.muli %5, %cst_9 : tensor<2x1xi32>
+    %22 = tt.broadcast %21 : (tensor<2x1xi32>) -> tensor<2x256xi32>
+    %23 = arith.addi %14, %22 : tensor<2x256xi32>
+    %24 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<2x256x!tt.ptr<bf16, 1>>
+    %25 = tt.addptr %24, %23 : tensor<2x256x!tt.ptr<bf16, 1>>, tensor<2x256xi32>
+    %26 = tt.load %25, %19, %cst {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xbf16>
+    %27 = arith.extf %26 : tensor<2x256xbf16> to tensor<2x256xf32>
+    // Wrap negative indices by adding 50257, then assert 0 <= index < 50257.
+    %28 = arith.addi %10, %cst_4 : tensor<2x1xi64>
+    %29 = arith.cmpi slt, %10, %cst_5 : tensor<2x1xi64>
+    %30 = arith.select %29, %28, %10 : tensor<2x1xi1>, tensor<2x1xi64>
+    %31 = arith.cmpi sge, %30, %cst_5 : tensor<2x1xi64>
+    %32 = arith.cmpi slt, %30, %cst_4 : tensor<2x1xi64>
+    %33 = arith.andi %31, %32 : tensor<2x1xi1>
+    tt.assert %33, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<2x1xi1>
+    // %41 = masked f32 load from %arg1 at index * 256 + col (64-bit offset).
+    %34 = arith.muli %30, %cst_3 : tensor<2x1xi64>
+    %35 = tt.broadcast %34 : (tensor<2x1xi64>) -> tensor<2x256xi64>
+    %36 = arith.extsi %7 : tensor<1x256xi32> to tensor<1x256xi64>
+    %37 = tt.broadcast %36 : (tensor<1x256xi64>) -> tensor<2x256xi64>
+    %38 = arith.addi %37, %35 : tensor<2x256xi64>
+    %39 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<2x256x!tt.ptr<f32, 1>>
+    %40 = tt.addptr %39, %38 : tensor<2x256x!tt.ptr<f32, 1>>, tensor<2x256xi64>
+    %41 = tt.load %40, %19, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xf32>
+    // %43 = x = %arg1 value + %arg2 value + %arg3 value.
+    %42 = arith.addf %41, %20 : tensor<2x256xf32>
+    %43 = arith.addf %42, %27 : tensor<2x256xf32>
+    // Per-element starting state for the reduction: mean = x + 0, M2 = x * (x - mean) + 0,
+    // weight = 1.0 where the column mask holds and 0.0 otherwise. Masked-off lanes get
+    // mean = 0 and M2 = 0.
+    %44 = arith.addf %43, %cst_8 : tensor<2x256xf32>
+    %45 = arith.subf %43, %44 : tensor<2x256xf32>
+    %46 = arith.mulf %43, %45 : tensor<2x256xf32>
+    %47 = arith.addf %46, %cst_8 : tensor<2x256xf32>
+    %48 = arith.select %19, %44, %cst_8 : tensor<2x256xi1>, tensor<2x256xf32>
+    %49 = arith.select %19, %47, %cst_8 : tensor<2x256xi1>, tensor<2x256xf32>
+    %50 = arith.select %12, %cst_0, %cst_1 : tensor<1x256xi1>, tensor<1x256xf32>
+    %51 = tt.broadcast %50 : (tensor<1x256xf32>) -> tensor<2x256xf32>
+    // Row reduction (axis 1) over (mean, M2, weight) triples. The combiner merges two
+    // partial results a = (%arg8, %arg9, %arg10) and b = (%arg11, %arg12, %arg13).
+    %52:3 = "tt.reduce"(%48, %49, %51) <{axis = 1 : i32}> ({
+    ^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32):
+      // delta = mean_b - mean_a; w = w_a + w_b.
+      %76 = arith.subf %arg11, %arg8 : f32
+      %77 = arith.addf %arg10, %arg13 : f32
+      // ratio = w_b / w, forced to 0 when w == 0 so the division result is discarded.
+      %78 = arith.cmpf oeq, %77, %cst_2 : f32
+      %79 = arith.divf %arg13, %77 : f32
+      %80 = arith.select %78, %cst_2, %79 : f32
+      // mean = mean_a + delta * ratio.
+      %81 = arith.mulf %76, %80 : f32
+      %82 = arith.addf %arg8, %81 : f32
+      // M2 = M2_a + M2_b + delta^2 * w_a * ratio.
+      %83 = arith.addf %arg9, %arg12 : f32
+      %84 = arith.mulf %76, %76 : f32
+      %85 = arith.mulf %84, %arg10 : f32
+      %86 = arith.mulf %85, %80 : f32
+      %87 = arith.addf %83, %86 : f32
+      tt.reduce.return %82, %87, %77 : f32, f32, f32
+    }) : (tensor<2x256xf32>, tensor<2x256xf32>, tensor<2x256xf32>) -> (tensor<2xf32>, tensor<2xf32>, tensor<2xf32>)
+    // %53 = per-row mean, %54 = per-row M2; the combined weight (%52#2) is not used.
+    %53 = tt.expand_dims %52#0 {axis = 1 : i32} : (tensor<2xf32>) -> tensor<2x1xf32>
+    %54 = tt.expand_dims %52#1 {axis = 1 : i32} : (tensor<2xf32>) -> tensor<2x1xf32>
+    // Second pass: the three inputs are loaded again from the same addresses
+    // (two of the loads use evict = 2 instead of 3), plus the per-column scale from %arg4.
+    %55 = tt.load %18, %19, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xf32>
+    %56 = tt.load %25, %19, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<2x256xbf16>
+    %57 = arith.extf %56 : tensor<2x256xbf16> to tensor<2x256xf32>
+    %58 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<1x256x!tt.ptr<f32, 1>>
+    %59 = tt.addptr %58, %7 : tensor<1x256x!tt.ptr<f32, 1>>, tensor<1x256xi32>
+    %60 = tt.load %59, %12, %cst_1 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32>
+    // Same bounds predicate (%33) asserted again, with a different message (tmp16).
+    tt.assert %33, "index out of bounds: 0 <= tmp16 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<2x1xi1>
+    %61 = tt.load %40, %19, %cst_8 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<2x256xf32>
+    // %63 = x recomputed from the reloaded values; %65 = x - mean.
+    %62 = arith.addf %61, %55 : tensor<2x256xf32>
+    %63 = arith.addf %62, %57 : tensor<2x256xf32>
+    %64 = tt.broadcast %53 : (tensor<2x1xf32>) -> tensor<2x256xf32>
+    %65 = arith.subf %63, %64 : tensor<2x256xf32>
+    // %68 = rsqrt(M2 / 256.0 + 9.99999974E-6), evaluated via libdevice __nv_rsqrtf.
+    %66 = arith.divf %54, %cst_7 : tensor<2x1xf32>
+    %67 = arith.addf %66, %cst_6 : tensor<2x1xf32>
+    %68 = tt.extern_elementwise %67 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32>) -> tensor<2x1xf32>
+    // %72 = (x - mean) * rsqrt(...) * scale[col].
+    %69 = tt.broadcast %68 : (tensor<2x1xf32>) -> tensor<2x256xf32>
+    %70 = arith.mulf %65, %69 : tensor<2x256xf32>
+    %71 = tt.broadcast %60 : (tensor<1x256xf32>) -> tensor<2x256xf32>
+    %72 = arith.mulf %70, %71 : tensor<2x256xf32>
+    // Truncate to bf16 and store to %arg5 at row * 256 + col under the column mask.
+    %73 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<2x256x!tt.ptr<bf16, 1>>
+    %74 = tt.addptr %73, %23 : tensor<2x256x!tt.ptr<bf16, 1>>, tensor<2x256xi32>
+    %75 = arith.truncf %72 : tensor<2x256xf32> to tensor<2x256xbf16>
+    tt.store %74, %75, %19 {cache = 1 : i32, evict = 1 : i32} : tensor<2x256xbf16>
+    tt.return
+  }
+}

.triton/dump/89f8cc1079aa03024e56dc2aee42813a/triton_.cubin ADDED Viewed

Binary file (58.1 kB). View file

.triton/dump/99f0a4c15ca0aab38ccdae6c765f7333/triton_.ptx ADDED Viewed

	@@ -0,0 +1,758 @@

+//
+// Generated by LLVM NVPTX Back-End
+//
+// Module-level settings: PTX ISA version 8.2, target architecture sm_89, 64-bit addresses.
+.version 8.2
+.target sm_89
+.address_size 64
+	// .globl	triton__0d1d2d3d4d5d6de7de
+// Unsized, externally provided shared-memory byte array; the kernel entry in this
+// listing uses it as scratch space for its cross-warp reductions.
+.extern .shared .align 1 .b8 global_smem[];
+// The 11 bytes are the ASCII string "__CUDA_FTZ" followed by a NUL terminator.
+// It is not referenced by any instruction visible in this listing.
+.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
+// Kernel entry point. Read from the instructions below:
+//   * A CTA has at most 64 threads (.maxntid) and covers the 256 consecutive elements
+//     starting at ctaid.x * 256; every thread handles 4 adjacent elements.
+//   * Per element, x = f32 value from param_0 + bf16 values from param_1, param_2, param_3.
+//   * mean = sum(x) / 256 and var = sum((x - mean)^2) / 256 are reduced across the CTA
+//     with warp shuffles plus a shared-memory exchange.
+//   * Result (x - mean) * rsqrt(var + eps) * scale is stored as bf16 to param_5, with the
+//     f32 scale read from param_4 at the thread's column offset.
+//   * param_6 and param_7 are declared but never loaded in this body.
+.visible .entry triton__0d1d2d3d4d5d6de7de(
+	.param .u64 triton__0d1d2d3d4d5d6de7de_param_0,
+	.param .u64 triton__0d1d2d3d4d5d6de7de_param_1,
+	.param .u64 triton__0d1d2d3d4d5d6de7de_param_2,
+	.param .u64 triton__0d1d2d3d4d5d6de7de_param_3,
+	.param .u64 triton__0d1d2d3d4d5d6de7de_param_4,
+	.param .u64 triton__0d1d2d3d4d5d6de7de_param_5,
+	.param .u32 triton__0d1d2d3d4d5d6de7de_param_6,
+	.param .u32 triton__0d1d2d3d4d5d6de7de_param_7
+)
+.maxntid 64, 1, 1
+{
+	.reg .pred 	%p<29>;
+	.reg .b16 	%rs<17>;
+	.reg .b32 	%r<100>;
+	.reg .f32 	%f<86>;
+	.reg .b64 	%rd<16>;
+	.loc	1 18 0
+$L__func_begin0:
+	.loc	1 18 0
+	ld.param.u64 	%rd7, [triton__0d1d2d3d4d5d6de7de_param_0];
+	ld.param.u64 	%rd8, [triton__0d1d2d3d4d5d6de7de_param_1];
+$L__tmp0:
+	.loc	1 26 26
+	// %r66 = tid.x, %r67 = lane id (tid & 31), %r68 = tid * 4,
+	// %r69 = (tid * 4) & 252 = this thread's column offset inside the 256-element row.
+	mov.u32 	%r66, %tid.x;
+	and.b32  	%r67, %r66, 31;
+	ld.param.u64 	%rd9, [triton__0d1d2d3d4d5d6de7de_param_2];
+	ld.param.u64 	%rd10, [triton__0d1d2d3d4d5d6de7de_param_3];
+	ld.param.u64 	%rd11, [triton__0d1d2d3d4d5d6de7de_param_4];
+	shl.b32 	%r68, %r66, 2;
+	ld.param.u64 	%rd12, [triton__0d1d2d3d4d5d6de7de_param_5];
+	and.b32  	%r69, %r68, 252;
+	.loc	1 23 28
+	mov.u32 %r1, %ctaid.x;
+	.loc	1 30 40
+	// %r71 = ctaid.x * 256 + column offset = element index of the first of 4 elements.
+	shl.b32 	%r70, %r1, 8;
+	.loc	1 30 36
+	or.b32  	%r71, %r70, %r69;
+	.loc	1 30 30
+	mul.wide.s32 	%rd13, %r71, 4;
+	add.s64 	%rd1, %rd7, %rd13;
+	// %r6 = 0 is the fallback value and %p1 is a constant-true predicate, so the
+	// "@!%p1 mov" fallbacks after each load below never execute.
+	mov.b32 	%r6, 0;
+	mov.pred 	%p1, -1;
+	.loc	1 30 46
+	// Load 4 x 32-bit values from param_0 (4 bytes per element).
+	mov.u32 %r2, 0x0;
+	mov.u32 %r3, 0x0;
+	mov.u32 %r4, 0x0;
+	mov.u32 %r5, 0x0;
+	@%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ];
+	@!%p1 mov.u32 %r2, %r6;
+	@!%p1 mov.u32 %r3, %r6;
+	@!%p1 mov.u32 %r4, %r6;
+	@!%p1 mov.u32 %r5, %r6;
+	mov.b32 	%f1, %r4;
+	mov.b32 	%f2, %r5;
+	.loc	1 31 30
+	// %rd14 = element index * 2 bytes: shared byte offset for the bf16 buffers
+	// (param_1, param_2, param_3 and the param_5 output).
+	mul.wide.s32 	%rd14, %r71, 2;
+	add.s64 	%rd2, %rd8, %rd14;
+	.loc	1 31 46
+	// Load 4 bf16 values from param_1 as 2 x 32 bits, split into halves, widen to f32.
+	mov.u32 %r10, 0x0;
+	mov.u32 %r11, 0x0;
+	@%p1 ld.global.v2.b32 { %r10, %r11 }, [ %rd2 + 0 ];
+	@!%p1 mov.u32 %r10, %r6;
+	@!%p1 mov.u32 %r11, %r6;
+	cvt.u16.u32 	%rs1, %r10;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r10; }
+	cvt.u16.u32 	%rs3, %r11;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r11; }
+	.loc	1 31 67
+	cvt.f32.bf16 %r14, %rs1;
+	mov.b32 	%f3, %r14;
+	cvt.f32.bf16 %r15, %rs2;
+	mov.b32 	%f4, %r15;
+	cvt.f32.bf16 %r16, %rs3;
+	mov.b32 	%f5, %r16;
+	cvt.f32.bf16 %r17, %rs4;
+	mov.b32 	%f6, %r17;
+	.loc	1 32 30
+	// Same pattern for param_2.
+	add.s64 	%rd3, %rd9, %rd14;
+	.loc	1 32 46
+	mov.u32 %r18, 0x0;
+	mov.u32 %r19, 0x0;
+	@%p1 ld.global.v2.b32 { %r18, %r19 }, [ %rd3 + 0 ];
+	@!%p1 mov.u32 %r18, %r6;
+	@!%p1 mov.u32 %r19, %r6;
+	cvt.u16.u32 	%rs5, %r18;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r18; }
+	cvt.u16.u32 	%rs7, %r19;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r19; }
+	.loc	1 32 67
+	cvt.f32.bf16 %r22, %rs5;
+	mov.b32 	%f7, %r22;
+	cvt.f32.bf16 %r23, %rs6;
+	mov.b32 	%f8, %r23;
+	cvt.f32.bf16 %r24, %rs7;
+	mov.b32 	%f9, %r24;
+	cvt.f32.bf16 %r25, %rs8;
+	mov.b32 	%f10, %r25;
+	.loc	1 33 30
+	// Same pattern for param_3.
+	add.s64 	%rd4, %rd10, %rd14;
+	.loc	1 33 46
+	mov.u32 %r26, 0x0;
+	mov.u32 %r27, 0x0;
+	@%p1 ld.global.v2.b32 { %r26, %r27 }, [ %rd4 + 0 ];
+	@!%p1 mov.u32 %r26, %r6;
+	@!%p1 mov.u32 %r27, %r6;
+	cvt.u16.u32 	%rs9, %r26;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs10}, %r26; }
+	cvt.u16.u32 	%rs11, %r27;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs12}, %r27; }
+	.loc	1 33 67
+	cvt.f32.bf16 %r30, %rs9;
+	mov.b32 	%f11, %r30;
+	cvt.f32.bf16 %r31, %rs10;
+	mov.b32 	%f12, %r31;
+	cvt.f32.bf16 %r32, %rs11;
+	mov.b32 	%f13, %r32;
+	cvt.f32.bf16 %r33, %rs12;
+	mov.b32 	%f14, %r33;
+	.loc	1 34 31
+	// Scale values: 4 x 32 bits from param_4, indexed by the column offset only
+	// (no ctaid term), loaded with the L1::evict_last hint.
+	mul.wide.u32 	%rd15, %r69, 4;
+	add.s64 	%rd5, %rd11, %rd15;
+	.loc	1 34 36
+	mov.u32 %r34, 0x0;
+	mov.u32 %r35, 0x0;
+	mov.u32 %r36, 0x0;
+	mov.u32 %r37, 0x0;
+	@%p1 ld.global.L1::evict_last.v4.b32 { %r34, %r35, %r36, %r37 }, [ %rd5 + 0 ];
+	@!%p1 mov.u32 %r34, %r6;
+	@!%p1 mov.u32 %r35, %r6;
+	@!%p1 mov.u32 %r36, %r6;
+	@!%p1 mov.u32 %r37, %r6;
+	.loc	1 36 18
+	// Element-wise sum of the four inputs. Final per-element values:
+	// %f26 (element 0), %f25 (element 1), %f27 (element 2), %f28 (element 3).
+	add.f32 	%f15, %f5, %f1;
+	add.f32 	%f16, %f6, %f2;
+	.loc	1 38 18
+	add.f32 	%f17, %f15, %f9;
+	add.f32 	%f18, %f16, %f10;
+	.loc	1 30 46
+	mov.b32 	%f19, %r2;
+	mov.b32 	%f20, %r3;
+	.loc	1 36 18
+	add.f32 	%f21, %f4, %f20;
+	add.f32 	%f22, %f3, %f19;
+	.loc	1 38 18
+	add.f32 	%f23, %f22, %f7;
+	add.f32 	%f24, %f21, %f8;
+	.loc	1 40 18
+	add.f32 	%f25, %f24, %f12;
+	add.f32 	%f26, %f23, %f11;
+	add.f32 	%f27, %f17, %f13;
+	add.f32 	%f28, %f18, %f14;
+$L__tmp1:
+	.loc	2 233 15
+	// First reduction: sum of x. Start with the thread-local sum of its 4 elements.
+	add.f32 	%f29, %f26, %f25;
+	add.f32 	%f30, %f29, %f27;
+	add.f32 	%f31, %f30, %f28;
+$L__tmp2:
+	.loc	2 243 36
+	// Butterfly shuffles with offsets 16, 8, 4, 2, 1 leave the warp-wide sum in %f41.
+	mov.b32 	%r72, %f31;
+	shfl.sync.bfly.b32	%r73, %r72, 16, 31, -1;
+	mov.b32 	%f32, %r73;
+$L__tmp3:
+	.loc	2 233 15
+	add.f32 	%f33, %f31, %f32;
+$L__tmp4:
+	.loc	2 243 36
+	mov.b32 	%r74, %f33;
+	shfl.sync.bfly.b32	%r75, %r74, 8, 31, -1;
+	mov.b32 	%f34, %r75;
+$L__tmp5:
+	.loc	2 233 15
+	add.f32 	%f35, %f33, %f34;
+$L__tmp6:
+	.loc	2 243 36
+	mov.b32 	%r76, %f35;
+	shfl.sync.bfly.b32	%r77, %r76, 4, 31, -1;
+	mov.b32 	%f36, %r77;
+$L__tmp7:
+	.loc	2 233 15
+	add.f32 	%f37, %f35, %f36;
+$L__tmp8:
+	.loc	2 243 36
+	mov.b32 	%r78, %f37;
+	shfl.sync.bfly.b32	%r79, %r78, 2, 31, -1;
+	mov.b32 	%f38, %r79;
+$L__tmp9:
+	.loc	2 233 15
+	add.f32 	%f39, %f37, %f38;
+$L__tmp10:
+	.loc	2 243 36
+	mov.b32 	%r80, %f39;
+	shfl.sync.bfly.b32	%r81, %r80, 1, 31, -1;
+	mov.b32 	%f40, %r81;
+$L__tmp11:
+	.loc	2 233 15
+	add.f32 	%f41, %f39, %f40;
+$L__tmp12:
+	.loc	2 243 36
+	// Cross-warp step: lane 0 of each warp (%p20) stores its warp sum at
+	// global_smem + ((tid >> 3) & 4), i.e. 4 bytes times bit 5 of tid.
+	setp.eq.s32 	%p20, %r67, 0;
+	shr.u32 	%r82, %r66, 3;
+	and.b32  	%r83, %r82, 4;
+	mov.u32 	%r84, global_smem;
+	add.s32 	%r42, %r84, %r83;
+	mov.b32 	%r43, %f41;
+	@%p20 st.shared.b32 [ %r42 + 0 ], %r43;
+	bar.sync 	0;
+	// Threads with tid < 2 (%p21) read one slot each from global_smem + tid * 4 and
+	// combine with their neighbour via a shuffle of offset 1. For other threads %r44 is
+	// not written by the predicated load; their result is not stored (see %p22).
+	setp.lt.s32 	%p21, %r66, 2;
+	add.s32 	%r45, %r84, %r68;
+	@%p21 ld.shared.b32 %r44, [ %r45 + 0 ];
+	mov.b32 	%f42, %r44;
+	shfl.sync.bfly.b32	%r85, %r44, 1, 31, -1;
+	mov.b32 	%f43, %r85;
+$L__tmp13:
+	.loc	2 233 15
+	add.f32 	%f44, %f42, %f43;
+$L__tmp14:
+	.loc	2 243 36
+	// %p22 = (tid < 2) and bit 0 of tid clear: that thread writes the CTA-wide total
+	// back to its slot; after the barrier every thread reads it from global_smem[0].
+	and.b32  	%r86, %r66, 1;
+	setp.eq.b32 	%p27, %r86, 1;
+	not.pred 	%p28, %p27;
+	and.pred  	%p22, %p21, %p28;
+	mov.b32 	%r47, %f44;
+	@%p22 st.shared.b32 [ %r45 + 0 ], %r47;
+	bar.sync 	0;
+	ld.shared.f32 	%f45, [global_smem];
+$L__tmp15:
+	.loc	3 8 15
+	add.f32 	%f46, %f45, 0f00000000;
+$L__tmp16:
+	.loc	1 48 20
+	// mean (%f47) = total / 256.0; 1132462080 is the bit pattern 0x43800000 = 256.0f.
+	mov.b32 	%r49, %f46;
+	mov.b32 	%r50, 1132462080;
+	div.full.f32 %r48, %r49, %r50;
+	mov.b32 	%f47, %r48;
+	.loc	1 49 20
+	// Centered values x - mean for the thread's 4 elements.
+	sub.f32 	%f48, %f26, %f47;
+	sub.f32 	%f49, %f25, %f47;
+	sub.f32 	%f50, %f27, %f47;
+	sub.f32 	%f51, %f28, %f47;
+	.loc	1 50 20
+	mul.f32 	%f52, %f49, %f49;
+$L__tmp17:
+	.loc	2 243 36
+	// Barrier before the shared-memory slots are written again by the second reduction.
+	bar.sync 	0;
+$L__tmp18:
+	.loc	2 233 15
+	// Second reduction: sum of squared centered values, same shuffle + shared-memory scheme.
+	fma.rn.f32 	%f53, %f48, %f48, %f52;
+	fma.rn.f32 	%f54, %f50, %f50, %f53;
+	fma.rn.f32 	%f55, %f51, %f51, %f54;
+$L__tmp19:
+	.loc	2 243 36
+	mov.b32 	%r87, %f55;
+	shfl.sync.bfly.b32	%r88, %r87, 16, 31, -1;
+	mov.b32 	%f56, %r88;
+$L__tmp20:
+	.loc	2 233 15
+	add.f32 	%f57, %f55, %f56;
+$L__tmp21:
+	.loc	2 243 36
+	mov.b32 	%r89, %f57;
+	shfl.sync.bfly.b32	%r90, %r89, 8, 31, -1;
+	mov.b32 	%f58, %r90;
+$L__tmp22:
+	.loc	2 233 15
+	add.f32 	%f59, %f57, %f58;
+$L__tmp23:
+	.loc	2 243 36
+	mov.b32 	%r91, %f59;
+	shfl.sync.bfly.b32	%r92, %r91, 4, 31, -1;
+	mov.b32 	%f60, %r92;
+$L__tmp24:
+	.loc	2 233 15
+	add.f32 	%f61, %f59, %f60;
+$L__tmp25:
+	.loc	2 243 36
+	mov.b32 	%r93, %f61;
+	shfl.sync.bfly.b32	%r94, %r93, 2, 31, -1;
+	mov.b32 	%f62, %r94;
+$L__tmp26:
+	.loc	2 233 15
+	add.f32 	%f63, %f61, %f62;
+$L__tmp27:
+	.loc	2 243 36
+	mov.b32 	%r95, %f63;
+	shfl.sync.bfly.b32	%r96, %r95, 1, 31, -1;
+	mov.b32 	%f64, %r96;
+$L__tmp28:
+	.loc	2 233 15
+	add.f32 	%f65, %f63, %f64;
+$L__tmp29:
+	.loc	2 243 36
+	mov.b32 	%r52, %f65;
+	@%p20 st.shared.b32 [ %r42 + 0 ], %r52;
+	bar.sync 	0;
+	@%p21 ld.shared.b32 %r53, [ %r45 + 0 ];
+	mov.b32 	%f66, %r53;
+	shfl.sync.bfly.b32	%r97, %r53, 1, 31, -1;
+	mov.b32 	%f67, %r97;
+$L__tmp30:
+	.loc	2 233 15
+	add.f32 	%f68, %f66, %f67;
+$L__tmp31:
+	.loc	2 243 36
+	mov.b32 	%r56, %f68;
+	@%p22 st.shared.b32 [ %r45 + 0 ], %r56;
+	bar.sync 	0;
+	ld.shared.f32 	%f69, [global_smem];
+$L__tmp32:
+	.loc	3 8 15
+	add.f32 	%f70, %f69, 0f00000000;
+$L__tmp33:
+	.loc	1 56 20
+	// var (%f71) = sum of squares / 256.0 (reuses %r50).
+	mov.b32 	%r58, %f70;
+	div.full.f32 %r57, %r58, %r50;
+	mov.b32 	%f71, %r57;
+	.loc	1 58 20
+	// Add epsilon: 0f3727C5AC is the f32 closest to 1e-5.
+	add.f32 	%f72, %f71, 0f3727C5AC;
+	.loc	1 59 26
+	// %f73 = approximate reciprocal square root, flush-to-zero variant.
+	rsqrt.approx.ftz.f32 	%f73, %f72;
+	.loc	1 34 36
+	// Reinterpret the 4 scale words loaded from param_4 as f32.
+	mov.b32 	%f74, %r37;
+	mov.b32 	%f75, %r36;
+	mov.b32 	%f76, %r35;
+	mov.b32 	%f77, %r34;
+	.loc	1 60 20
+	// Normalize: (x - mean) * rsqrt(var + eps).
+	mul.f32 	%f78, %f48, %f73;
+	mul.f32 	%f79, %f49, %f73;
+	mul.f32 	%f80, %f50, %f73;
+	mul.f32 	%f81, %f51, %f73;
+	.loc	1 61 20
+	// Apply the per-column scale (element i uses the i-th loaded scale word).
+	mul.f32 	%f82, %f78, %f77;
+	mul.f32 	%f83, %f79, %f76;
+	mul.f32 	%f84, %f80, %f75;
+	mul.f32 	%f85, %f81, %f74;
+	.loc	1 63 25
+	add.s64 	%rd6, %rd12, %rd14;
+	.loc	1 63 48
+	// Round each result to bf16, pack pairs into 32-bit words and store 2 words to param_5.
+	mov.b32 	%r60, %f82;
+	cvt.rn.bf16.f32 %rs13, %r60;
+	mov.b32 	%r61, %f83;
+	cvt.rn.bf16.f32 %rs14, %r61;
+	mov.b32 	%r62, %f84;
+	cvt.rn.bf16.f32 %rs15, %r62;
+	mov.b32 	%r63, %f85;
+	cvt.rn.bf16.f32 %rs16, %r63;
+	mov.b32 	%r98, {%rs13, %rs14};
+	mov.b32 	%r99, {%rs15, %rs16};
+	@%p1 st.global.v2.b32 [ %rd6 + 0 ], { %r98, %r99 };
+	.loc	1 63 4
+	ret;
+$L__tmp34:
+$L__func_end0:
+}
+	// .globl	__nv_rsqrtf
+// Device function: returns the approximate reciprocal square root of its single f32
+// argument, using the flush-to-zero form of the instruction.
+//   In:  __nv_rsqrtf_param_0 (32-bit, read as f32)
+//   Out: func_retval0 (32-bit, written as f32)
+// The kernel entry in this listing contains no call instruction; it issues
+// rsqrt.approx.ftz.f32 directly instead of calling this function.
+.visible .func  (.param .b32 func_retval0) __nv_rsqrtf(
+	.param .b32 __nv_rsqrtf_param_0
+)
+{
+	.reg .f32 	%f<3>;
+$L__func_begin1:
+	ld.param.f32 	%f1, [__nv_rsqrtf_param_0];
+	rsqrt.approx.ftz.f32 	%f2, %f1;
+	st.param.f32 	[func_retval0+0], %f2;
+	ret;
+$L__func_end1:
+}
+	.file	1 "/tmp/torchinductor_root/pw/cpwl4wgyi5spzbgbswrqxfrxlyk2m76a4bakbp6l5ltopjbkjadt.py"
+	.file	2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
+	.file	3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
+	.section	.debug_abbrev
+	{
+.b8 1
+.b8 17
+.b8 1
+.b8 37
+.b8 8
+.b8 19
+.b8 5
+.b8 3
+.b8 8
+.b8 16
+.b8 6
+.b8 27
+.b8 8
+.b8 180
+.b8 66
+.b8 12
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+.b8 0
+.b8 2
+.b8 46
+.b8 0
+.b8 135
+.b8 64
+.b8 8
+.b8 3
+.b8 8
+.b8 58
+.b8 11
+.b8 59
+.b8 11
+.b8 63
+.b8 12
+.b8 32
+.b8 11
+.b8 0
+.b8 0
+.b8 3
+.b8 46
+.b8 1
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 64
+.b8 10
+.b8 49
+.b8 19
+.b8 0
+.b8 0
+.b8 4
+.b8 29
+.b8 1
+.b8 49
+.b8 19
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 88
+.b8 11
+.b8 89
+.b8 11
+.b8 87
+.b8 11
+.b8 0
+.b8 0
+.b8 5
+.b8 29
+.b8 0
+.b8 49
+.b8 19
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 88
+.b8 11
+.b8 89
+.b8 11
+.b8 87
+.b8 11
+.b8 0
+.b8 0
+.b8 0
+	}
+	.section	.debug_info
+	{
+.b32 399
+.b8 2
+.b8 0
+.b32 .debug_abbrev
+.b8 8
+.b8 1
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2
+.b8 0
+.b8 99
+.b8 112
+.b8 119
+.b8 108
+.b8 52
+.b8 119
+.b8 103
+.b8 121
+.b8 105
+.b8 53
+.b8 115
+.b8 112
+.b8 122
+.b8 98
+.b8 103
+.b8 98
+.b8 115
+.b8 119
+.b8 114
+.b8 113
+.b8 120
+.b8 102
+.b8 114
+.b8 120
+.b8 108
+.b8 121
+.b8 107
+.b8 50
+.b8 109
+.b8 55
+.b8 54
+.b8 97
+.b8 52
+.b8 98
+.b8 97
+.b8 107
+.b8 98
+.b8 112
+.b8 54
+.b8 108
+.b8 53
+.b8 108
+.b8 116
+.b8 111
+.b8 112
+.b8 106
+.b8 98
+.b8 107
+.b8 106
+.b8 97
+.b8 100
+.b8 116
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line
+.b8 47
+.b8 116
+.b8 109
+.b8 112
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 95
+.b8 114
+.b8 111
+.b8 111
+.b8 116
+.b8 47
+.b8 112
+.b8 119
+.b8 0
+.b8 1
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 2
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 51
+.b8 100
+.b8 52
+.b8 100
+.b8 53
+.b8 100
+.b8 54
+.b8 100
+.b8 101
+.b8 55
+.b8 100
+.b8 101
+.b8 0
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 51
+.b8 100
+.b8 52
+.b8 100
+.b8 53
+.b8 100
+.b8 54
+.b8 100
+.b8 101
+.b8 55
+.b8 100
+.b8 101
+.b8 0
+.b8 1
+.b8 18
+.b8 1
+.b8 1
+.b8 3
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 1
+.b8 156
+.b32 125
+.b8 4
+.b32 125
+.b64 $L__tmp1
+.b64 $L__tmp14
+.b8 2
+.b8 45
+.b8 59
+.b8 5
+.b32 125
+.b64 $L__tmp1
+.b64 $L__tmp14
+.b8 2
+.b8 243
+.b8 36
+.b8 0
+.b8 5
+.b32 125
+.b64 $L__tmp2
+.b64 $L__tmp15
+.b8 2
+.b8 45
+.b8 59
+.b8 5
+.b32 125
+.b64 $L__tmp15
+.b64 $L__tmp16
+.b8 3
+.b8 45
+.b8 45
+.b8 5
+.b32 125
+.b64 $L__tmp17
+.b64 $L__tmp32
+.b8 2
+.b8 53
+.b8 59
+.b8 4
+.b32 125
+.b64 $L__tmp18
+.b64 $L__tmp31
+.b8 2
+.b8 53
+.b8 59
+.b8 5
+.b32 125
+.b64 $L__tmp18
+.b64 $L__tmp31
+.b8 2
+.b8 243
+.b8 36
+.b8 0
+.b8 5
+.b32 125
+.b64 $L__tmp32
+.b64 $L__tmp33
+.b8 3
+.b8 53
+.b8 45
+.b8 0
+.b8 0
+	}
+	.section	.debug_pubnames
+	{
+.b32 $L__pubNames_end0-$L__pubNames_start0
+$L__pubNames_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 403
+.b32 125
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 51
+.b8 100
+.b8 52
+.b8 100
+.b8 53
+.b8 100
+.b8 54
+.b8 100
+.b8 101
+.b8 55
+.b8 100
+.b8 101
+.b8 0
+.b32 0
+$L__pubNames_end0:
+	}
+	.section	.debug_pubtypes
+	{
+.b32 $L__pubTypes_end0-$L__pubTypes_start0
+$L__pubTypes_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 403
+.b32 0
+$L__pubTypes_end0:
+	}
+	.section	.debug_loc	{	}

.triton/dump/d7a12c0ba96f8920b8147157303ee99f/triton_.cubin ADDED Viewed

Binary file (14.1 kB). View file

.triton/dump/d7a12c0ba96f8920b8147157303ee99f/triton_.llir ADDED Viewed

	@@ -0,0 +1,310 @@

+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+@global_smem = external addrspace(3) global [0 x i8]
+@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
+define void @triton__0d1d2d3d4d5de6de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i32 %5, i32 %6) local_unnamed_addr !dbg !7 {
+  %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
+  %9 = and i32 %8, 31, !dbg !10
+  %10 = lshr i32 %8, 5, !dbg !10
+  %11 = and i32 %10, 1, !dbg !10
+  %urem = shl i32 %8, 2, !dbg !10
+  %12 = and i32 %urem, 252, !dbg !10
+  %13 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !11
+  %14 = shl i32 %13, 8, !dbg !12
+  %15 = or i32 %14, %12, !dbg !13
+  %16 = sext i32 %15 to i64, !dbg !14
+  %17 = getelementptr float, ptr addrspace(1) %0, i64 %16, !dbg !14
+  %18 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %17, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !15
+  %19 = extractvalue { i32, i32, i32, i32 } %18, 0, !dbg !15
+  %20 = extractvalue { i32, i32, i32, i32 } %18, 1, !dbg !15
+  %21 = extractvalue { i32, i32, i32, i32 } %18, 2, !dbg !15
+  %22 = extractvalue { i32, i32, i32, i32 } %18, 3, !dbg !15
+  %23 = bitcast i32 %21 to float, !dbg !15
+  %24 = bitcast i32 %22 to float, !dbg !15
+  %25 = getelementptr i16, ptr addrspace(1) %1, i64 %16, !dbg !16
+  %26 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %25, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !17
+  %27 = extractvalue { i32, i32 } %26, 0, !dbg !17
+  %28 = extractvalue { i32, i32 } %26, 1, !dbg !17
+  %29 = trunc i32 %27 to i16, !dbg !17
+  %extelt.offset = lshr i32 %27, 16, !dbg !17
+  %30 = trunc i32 %extelt.offset to i16, !dbg !17
+  %31 = trunc i32 %28 to i16, !dbg !17
+  %extelt.offset1 = lshr i32 %28, 16, !dbg !17
+  %32 = trunc i32 %extelt.offset1 to i16, !dbg !17
+  %33 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %29) #6, !dbg !18
+  %34 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %30) #6, !dbg !18
+  %35 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %31) #6, !dbg !18
+  %36 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %32) #6, !dbg !18
+  %37 = getelementptr i16, ptr addrspace(1) %2, i64 %16, !dbg !19
+  %38 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %37, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !20
+  %39 = extractvalue { i32, i32 } %38, 0, !dbg !20
+  %40 = extractvalue { i32, i32 } %38, 1, !dbg !20
+  %41 = trunc i32 %39 to i16, !dbg !20
+  %extelt.offset2 = lshr i32 %39, 16, !dbg !20
+  %42 = trunc i32 %extelt.offset2 to i16, !dbg !20
+  %43 = trunc i32 %40 to i16, !dbg !20
+  %extelt.offset3 = lshr i32 %40, 16, !dbg !20
+  %44 = trunc i32 %extelt.offset3 to i16, !dbg !20
+  %45 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %41) #6, !dbg !21
+  %46 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %42) #6, !dbg !21
+  %47 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %43) #6, !dbg !21
+  %48 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %44) #6, !dbg !21
+  %49 = zext nneg i32 %12 to i64, !dbg !22
+  %50 = getelementptr float, ptr addrspace(1) %3, i64 %49, !dbg !22
+  %51 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %50, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !23
+  %52 = fadd float %35, %23, !dbg !24
+  %53 = fadd float %36, %24, !dbg !24
+  %54 = insertelement <2 x i32> poison, i32 %19, i64 0, !dbg !15
+  %55 = insertelement <2 x i32> %54, i32 %20, i64 1, !dbg !15
+  %56 = bitcast <2 x i32> %55 to <2 x float>, !dbg !15
+  %57 = insertelement <2 x float> poison, float %33, i64 0, !dbg !24
+  %58 = insertelement <2 x float> %57, float %34, i64 1, !dbg !24
+  %59 = fadd <2 x float> %58, %56, !dbg !24
+  %60 = insertelement <2 x float> poison, float %45, i64 0, !dbg !25
+  %61 = insertelement <2 x float> %60, float %46, i64 1, !dbg !25
+  %62 = fadd <2 x float> %59, %61, !dbg !25
+  %63 = fadd float %52, %47, !dbg !25
+  %64 = fadd float %53, %48, !dbg !25
+  %65 = extractelement <2 x float> %62, i64 0, !dbg !26
+  %66 = extractelement <2 x float> %62, i64 1, !dbg !26
+  %67 = fadd float %65, %66, !dbg !26
+  %68 = fadd float %67, %63, !dbg !26
+  %69 = fadd float %68, %64, !dbg !26
+  %70 = bitcast float %69 to i32, !dbg !32
+  %71 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %70, i32 16, i32 31), !dbg !32
+  %72 = bitcast i32 %71 to float, !dbg !32
+  %73 = fadd float %69, %72, !dbg !26
+  %74 = bitcast float %73 to i32, !dbg !32
+  %75 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %74, i32 8, i32 31), !dbg !32
+  %76 = bitcast i32 %75 to float, !dbg !32
+  %77 = fadd float %73, %76, !dbg !26
+  %78 = bitcast float %77 to i32, !dbg !32
+  %79 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %78, i32 4, i32 31), !dbg !32
+  %80 = bitcast i32 %79 to float, !dbg !32
+  %81 = fadd float %77, %80, !dbg !26
+  %82 = bitcast float %81 to i32, !dbg !32
+  %83 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %82, i32 2, i32 31), !dbg !32
+  %84 = bitcast i32 %83 to float, !dbg !32
+  %85 = fadd float %81, %84, !dbg !26
+  %86 = bitcast float %85 to i32, !dbg !32
+  %87 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %86, i32 1, i32 31), !dbg !32
+  %88 = bitcast i32 %87 to float, !dbg !32
+  %89 = fadd float %85, %88, !dbg !26
+  %90 = icmp eq i32 %9, 0, !dbg !32
+  %91 = zext nneg i32 %11 to i64, !dbg !32
+  %92 = getelementptr float, ptr addrspace(3) @global_smem, i64 %91, !dbg !32
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %92, float %89, i1 %90) #6, !dbg !32
+  tail call void @llvm.nvvm.barrier0(), !dbg !32
+  %93 = icmp slt i32 %8, 2, !dbg !32
+  %94 = sext i32 %8 to i64, !dbg !32
+  %95 = getelementptr float, ptr addrspace(3) @global_smem, i64 %94, !dbg !32
+  %96 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %95, i1 %93) #6, !dbg !32
+  %97 = bitcast float %96 to i32, !dbg !32
+  %98 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %97, i32 1, i32 31), !dbg !32
+  %99 = bitcast i32 %98 to float, !dbg !32
+  %100 = fadd float %96, %99, !dbg !26
+  %101 = and i32 %8, 1, !dbg !32
+  %102 = icmp eq i32 %101, 0, !dbg !32
+  %103 = and i1 %93, %102, !dbg !32
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %95, float %100, i1 %103) #6, !dbg !32
+  tail call void @llvm.nvvm.barrier0(), !dbg !32
+  %104 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !32
+  %105 = fadd float %104, 0.000000e+00, !dbg !34
+  %106 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %105, float 2.560000e+02) #6, !dbg !38
+  %107 = fsub float %65, %106, !dbg !39
+  %108 = fsub float %66, %106, !dbg !39
+  %109 = fsub float %63, %106, !dbg !39
+  %110 = fsub float %64, %106, !dbg !39
+  %111 = fmul float %107, %107, !dbg !40
+  %112 = fmul float %108, %108, !dbg !40
+  %113 = fmul float %109, %109, !dbg !40
+  %114 = fmul float %110, %110, !dbg !40
+  tail call void @llvm.nvvm.barrier0(), !dbg !41
+  %115 = fadd float %111, %112, !dbg !43
+  %116 = fadd float %113, %115, !dbg !43
+  %117 = fadd float %114, %116, !dbg !43
+  %118 = bitcast float %117 to i32, !dbg !41
+  %119 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %118, i32 16, i32 31), !dbg !41
+  %120 = bitcast i32 %119 to float, !dbg !41
+  %121 = fadd float %117, %120, !dbg !43
+  %122 = bitcast float %121 to i32, !dbg !41
+  %123 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %122, i32 8, i32 31), !dbg !41
+  %124 = bitcast i32 %123 to float, !dbg !41
+  %125 = fadd float %121, %124, !dbg !43
+  %126 = bitcast float %125 to i32, !dbg !41
+  %127 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %126, i32 4, i32 31), !dbg !41
+  %128 = bitcast i32 %127 to float, !dbg !41
+  %129 = fadd float %125, %128, !dbg !43
+  %130 = bitcast float %129 to i32, !dbg !41
+  %131 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %130, i32 2, i32 31), !dbg !41
+  %132 = bitcast i32 %131 to float, !dbg !41
+  %133 = fadd float %129, %132, !dbg !43
+  %134 = bitcast float %133 to i32, !dbg !41
+  %135 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %134, i32 1, i32 31), !dbg !41
+  %136 = bitcast i32 %135 to float, !dbg !41
+  %137 = fadd float %133, %136, !dbg !43
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %92, float %137, i1 %90) #6, !dbg !41
+  tail call void @llvm.nvvm.barrier0(), !dbg !41
+  %138 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %95, i1 %93) #6, !dbg !41
+  %139 = bitcast float %138 to i32, !dbg !41
+  %140 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %139, i32 1, i32 31), !dbg !41
+  %141 = bitcast i32 %140 to float, !dbg !41
+  %142 = fadd float %138, %141, !dbg !43
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %95, float %142, i1 %103) #6, !dbg !41
+  tail call void @llvm.nvvm.barrier0(), !dbg !41
+  %143 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !41
+  %144 = fadd float %143, 0.000000e+00, !dbg !46
+  %145 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %144, float 2.560000e+02) #6, !dbg !48
+  %146 = fadd float %145, 0x3EE4F8B580000000, !dbg !49
+  %147 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !50
+  %.not.i = icmp eq i32 %147, 0, !dbg !50
+  br i1 %.not.i, label %150, label %148, !dbg !50
+148:                                              ; preds = %7
+  %149 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %146), !dbg !50
+  br label %__nv_rsqrtf.exit, !dbg !50
+150:                                              ; preds = %7
+  %151 = tail call float @llvm.nvvm.rsqrt.approx.f(float %146), !dbg !50
+  br label %__nv_rsqrtf.exit, !dbg !50
+__nv_rsqrtf.exit:                                 ; preds = %148, %150
+  %.0.i = phi float [ %149, %148 ], [ %151, %150 ], !dbg !50
+  %152 = extractvalue { i32, i32, i32, i32 } %51, 3, !dbg !23
+  %153 = bitcast i32 %152 to float, !dbg !23
+  %154 = extractvalue { i32, i32, i32, i32 } %51, 2, !dbg !23
+  %155 = bitcast i32 %154 to float, !dbg !23
+  %156 = extractvalue { i32, i32, i32, i32 } %51, 1, !dbg !23
+  %157 = bitcast i32 %156 to float, !dbg !23
+  %158 = extractvalue { i32, i32, i32, i32 } %51, 0, !dbg !23
+  %159 = bitcast i32 %158 to float, !dbg !23
+  %160 = fmul float %107, %.0.i, !dbg !51
+  %161 = fmul float %108, %.0.i, !dbg !51
+  %162 = fmul float %109, %.0.i, !dbg !51
+  %163 = fmul float %110, %.0.i, !dbg !51
+  %164 = fmul float %160, %159, !dbg !52
+  %165 = fmul float %161, %157, !dbg !52
+  %166 = fmul float %162, %155, !dbg !52
+  %167 = fmul float %163, %153, !dbg !52
+  %168 = getelementptr i16, ptr addrspace(1) %4, i64 %16, !dbg !53
+  %169 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %164) #6, !dbg !54
+  %170 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %165) #6, !dbg !54
+  %171 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %166) #6, !dbg !54
+  %172 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %167) #6, !dbg !54
+  %173 = insertelement <2 x i16> undef, i16 %169, i64 0, !dbg !54
+  %174 = insertelement <2 x i16> %173, i16 %170, i64 1, !dbg !54
+  %175 = bitcast <2 x i16> %174 to i32, !dbg !54
+  %176 = insertelement <2 x i16> undef, i16 %171, i64 0, !dbg !54
+  %177 = insertelement <2 x i16> %176, i16 %172, i64 1, !dbg !54
+  %178 = bitcast <2 x i16> %177 to i32, !dbg !54
+  tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %175, i32 %178, ptr addrspace(1) %168, i1 true) #6, !dbg !54
+  ret void, !dbg !55
+}
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 ; NVVM intrinsic; going by its name it reads the PTX %tid.x special register (no call site is visible in this part of the dump)
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1 ; the kernel above calls it as (-1, value bits, 16/8/4/2/1, 31) and adds each result back onto the value it passed in
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier0() #2 ; called by the kernel above between its predicated st.shared stores and the following shared-memory loads
+; Function Attrs: alwaysinline nounwind
+define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 { ; returns an approximate reciprocal square root of %x, choosing the .ftz or the plain NVVM intrinsic from the result of @__nvvm_reflect
+  %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6 ; @.str is defined outside this excerpt; presumably the FTZ query string -- confirm against the full dump
+  %.not = icmp eq i32 %1, 0 ; true when the reflect query returned 0
+  br i1 %.not, label %4, label %2 ; 0 -> block 4 (plain variant), nonzero -> block 2 (.ftz variant)
+2:                                                ; preds = %0
+  %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x) ; .ftz form of the approximate rsqrt intrinsic
+  br label %6
+4:                                                ; preds = %0
+  %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x) ; same intrinsic without the .ftz suffix
+  br label %6
+6:                                                ; preds = %4, %2
+  %.0 = phi float [ %3, %2 ], [ %5, %4 ] ; result of whichever branch was taken
+  ret float %.0
+}
+declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4 ; queried by @__nv_rsqrtf, and by its inlined copy in the kernel above, to choose between the two rsqrt intrinsics declared below
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5 ; used on the branch taken when @__nvvm_reflect returns nonzero
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.f(float) #5 ; used on the branch taken when @__nvvm_reflect returns 0
+attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } ; attached to @llvm.nvvm.read.ptx.sreg.tid.x
+attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } ; attached to @llvm.nvvm.shfl.sync.bfly.i32
+attributes #2 = { convergent nocallback nounwind } ; attached to @llvm.nvvm.barrier0
+attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } ; attached to the @__nv_rsqrtf definition
+attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } ; attached to the @__nvvm_reflect declaration
+attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } ; shared by both rsqrt intrinsic declarations
+attributes #6 = { nounwind } ; call-site attribute group used on the inline-asm and @__nvvm_reflect calls
+!llvm.module.flags = !{!0, !1} ; module flags: debug-info version (!0) and nvvm-reflect-ftz (!1)
+!llvm.dbg.cu = !{!2} ; single debug compile unit
+!nvvm.annotations = !{!4, !5, !5, !4} ; the kernel annotations !4 and !5 are each listed twice
+!llvm.ident = !{!6}
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1} ; flag value is 1; presumably what the @__nvvm_reflect query in @__nv_rsqrtf resolves against -- confirm
+!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!3 = !DIFile(filename: "cdohrmmhfsykzlva6pepxaa7gf7klw7w5jzorpspyaldhfg3acr2.py", directory: "/tmp/torchinductor_root/do") ; Python source file that the !DILocation line numbers below refer to
+!4 = !{ptr @triton__0d1d2d3d4d5de6de, !"kernel", i32 1} ; tags the function with the "kernel" annotation
+!5 = !{ptr @triton__0d1d2d3d4d5de6de, !"maxntidx", i32 64} ; "maxntidx" annotation with value 64
+!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
+!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5de6de", linkageName: "triton__0d1d2d3d4d5de6de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2) ; debug scope of the kernel; every !DILocation with scope !7 points into it
+!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
+!9 = !{} ; empty type list: no parameter types are described in the debug info
+!10 = !DILocation(line: 26, column: 26, scope: !7)
+!11 = !DILocation(line: 23, column: 28, scope: !7)
+!12 = !DILocation(line: 30, column: 40, scope: !7)
+!13 = !DILocation(line: 30, column: 36, scope: !7)
+!14 = !DILocation(line: 30, column: 30, scope: !7)
+!15 = !DILocation(line: 30, column: 46, scope: !7)
+!16 = !DILocation(line: 31, column: 30, scope: !7)
+!17 = !DILocation(line: 31, column: 46, scope: !7)
+!18 = !DILocation(line: 31, column: 67, scope: !7)
+!19 = !DILocation(line: 32, column: 30, scope: !7)
+!20 = !DILocation(line: 32, column: 46, scope: !7)
+!21 = !DILocation(line: 32, column: 67, scope: !7)
+!22 = !DILocation(line: 33, column: 31, scope: !7)
+!23 = !DILocation(line: 33, column: 36, scope: !7)
+!24 = !DILocation(line: 35, column: 18, scope: !7)
+!25 = !DILocation(line: 37, column: 18, scope: !7)
+!26 = !DILocation(line: 233, column: 15, scope: !27, inlinedAt: !30)
+!27 = distinct !DILexicalBlockFile(scope: !29, file: !28, discriminator: 0)
+!28 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
+!29 = distinct !DILexicalBlockFile(scope: !7, file: !28, discriminator: 0)
+!30 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !31)
+!31 = !DILocation(line: 42, column: 59, scope: !27)
+!32 = !DILocation(line: 243, column: 36, scope: !29, inlinedAt: !33)
+!33 = !DILocation(line: 42, column: 59, scope: !29)
+!34 = !DILocation(line: 8, column: 15, scope: !35, inlinedAt: !37)
+!35 = distinct !DILexicalBlockFile(scope: !7, file: !36, discriminator: 0)
+!36 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
+!37 = !DILocation(line: 42, column: 45, scope: !35)
+!38 = !DILocation(line: 45, column: 20, scope: !7)
+!39 = !DILocation(line: 46, column: 19, scope: !7)
+!40 = !DILocation(line: 47, column: 20, scope: !7)
+!41 = !DILocation(line: 243, column: 36, scope: !29, inlinedAt: !42)
+!42 = !DILocation(line: 50, column: 59, scope: !29)
+!43 = !DILocation(line: 233, column: 15, scope: !27, inlinedAt: !44)
+!44 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !45)
+!45 = !DILocation(line: 50, column: 59, scope: !27)
+!46 = !DILocation(line: 8, column: 15, scope: !35, inlinedAt: !47)
+!47 = !DILocation(line: 50, column: 45, scope: !35)
+!48 = !DILocation(line: 53, column: 20, scope: !7)
+!49 = !DILocation(line: 55, column: 20, scope: !7)
+!50 = !DILocation(line: 56, column: 26, scope: !7)
+!51 = !DILocation(line: 57, column: 20, scope: !7)
+!52 = !DILocation(line: 58, column: 20, scope: !7)
+!53 = !DILocation(line: 60, column: 25, scope: !7)
+!54 = !DILocation(line: 60, column: 48, scope: !7)
+!55 = !DILocation(line: 60, column: 4, scope: !7)

.triton/dump/ea4af10b775ebbfe0bdcca18fd6288c8/triton_.ttir ADDED Viewed

	@@ -0,0 +1,153 @@

+module { // Triton IR for one kernel: per row it sums three inputs, derives the row mean and variance over 256 columns, then stores (sum - mean) * rsqrt(variance + eps) * weight as bf16
+  tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { // %arg6 and %arg7 are not referenced anywhere in the body
+    %cst = arith.constant dense<0.000000e+00> : tensor<16x128xbf16>
+    %cst_0 = arith.constant 0.000000e+00 : f32
+    %cst_1 = arith.constant dense<1.000000e+00> : tensor<16x128xf32>
+    %c256_i32 = arith.constant 256 : i32
+    %c128_i32 = arith.constant 128 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %cst_2 = arith.constant dense<256> : tensor<16x1xi64>
+    %cst_3 = arith.constant dense<0> : tensor<16x1xi64>
+    %cst_4 = arith.constant dense<50257> : tensor<16x1xi64> // index bound used for the negative-index wrap and the tt.assert check
+    %cst_5 = arith.constant dense<9.99999974E-6> : tensor<16x1xf32> // epsilon added to the variance before rsqrt
+    %cst_6 = arith.constant dense<2.560000e+02> : tensor<16x1xf32> // divisor that turns the summed squared deviations into a variance
+    %cst_7 = arith.constant dense<0.000000e+00> : tensor<1x128xf32>
+    %cst_8 = arith.constant dense<0.000000e+00> : tensor<16x128xf32>
+    %cst_9 = arith.constant dense<256> : tensor<16x1xi32>
+    %cst_10 = arith.constant dense<256> : tensor<1x128xi32>
+    %cst_11 = arith.constant dense<512> : tensor<16x1xi32>
+    %c16_i32 = arith.constant 16 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c16_i32 : i32 // first row handled by this program (16 rows per program)
+    %2 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32>
+    %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<16xi32>) -> tensor<16x1xi32>
+    %4 = tt.splat %1 : (i32) -> tensor<16x1xi32>
+    %5 = arith.addi %4, %3 : tensor<16x1xi32> // 16x1 row indices
+    %6 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32>
+    %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<128xi32>) -> tensor<1x128xi32> // 1x128 column offsets within one loop step
+    %8 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<16x1x!tt.ptr<i64, 1>>
+    %9 = tt.addptr %8, %5 : tensor<16x1x!tt.ptr<i64, 1>>, tensor<16x1xi32>
+    %10 = tt.load %9 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x1xi64> // one i64 index per row, read from %arg0
+    %11 = arith.remsi %5, %cst_11 : tensor<16x1xi32> // row index modulo 512
+    %12 = arith.muli %11, %cst_9 : tensor<16x1xi32> // row offset into %arg2 (256 elements per row)
+    %13 = tt.broadcast %12 : (tensor<16x1xi32>) -> tensor<16x128xi32>
+    %14 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<16x128x!tt.ptr<f32, 1>>
+    %15 = arith.muli %5, %cst_9 : tensor<16x1xi32> // row offset into %arg3 (256 elements per row)
+    %16 = tt.broadcast %15 : (tensor<16x1xi32>) -> tensor<16x128xi32>
+    %17 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<16x128x!tt.ptr<bf16, 1>>
+    %18 = arith.addi %10, %cst_4 : tensor<16x1xi64>
+    %19 = arith.cmpi slt, %10, %cst_3 : tensor<16x1xi64>
+    %20 = arith.select %19, %18, %10 : tensor<16x1xi1>, tensor<16x1xi64> // negative indices are shifted up by 50257
+    %21 = arith.cmpi sge, %20, %cst_3 : tensor<16x1xi64>
+    %22 = arith.cmpi slt, %20, %cst_4 : tensor<16x1xi64>
+    %23 = arith.andi %21, %22 : tensor<16x1xi1> // 0 <= index < 50257, consumed by the tt.assert in the first loop
+    %24 = arith.muli %20, %cst_2 : tensor<16x1xi64> // row offset into %arg1 (256 elements per row)
+    %25 = tt.broadcast %24 : (tensor<16x1xi64>) -> tensor<16x128xi64>
+    %26 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<16x128x!tt.ptr<f32, 1>>
+    %27:3 = scf.for %arg8 = %c0_i32 to %c256_i32 step %c128_i32 iter_args(%arg9 = %cst_8, %arg10 = %cst_8, %arg11 = %cst_8) -> (tensor<16x128xf32>, tensor<16x128xf32>, tensor<16x128xf32>)  : i32 { // pass 1: two steps (columns 0..127 and 128..255); carries running mean (%arg9), summed squared deviation (%arg10) and count (%arg11)
+      %51 = tt.splat %arg8 : (i32) -> tensor<1x128xi32>
+      %52 = arith.addi %51, %7 : tensor<1x128xi32> // absolute column index
+      %53 = arith.cmpi slt, %52, %cst_10 : tensor<1x128xi32> // column mask (col < 256); true for every column this loop visits
+      %54 = tt.broadcast %52 : (tensor<1x128xi32>) -> tensor<16x128xi32>
+      %55 = arith.addi %54, %13 : tensor<16x128xi32>
+      %56 = tt.addptr %14, %55 : tensor<16x128x!tt.ptr<f32, 1>>, tensor<16x128xi32>
+      %57 = tt.broadcast %53 : (tensor<1x128xi1>) -> tensor<16x128xi1>
+      %58 = tt.load %56, %57, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x128xf32>
+      %59 = arith.addi %54, %16 : tensor<16x128xi32>
+      %60 = tt.addptr %17, %59 : tensor<16x128x!tt.ptr<bf16, 1>>, tensor<16x128xi32>
+      %61 = tt.load %60, %57, %cst {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x128xbf16>
+      %62 = arith.extf %61 : tensor<16x128xbf16> to tensor<16x128xf32>
+      tt.assert %23, "index out of bounds: 0 <= tmp3 < 50257", "/usr/local/lib/python3.10/dist-packages/torch/_inductor/codecache.py", "<module>", 1892 : tensor<16x1xi1>
+      %63 = arith.extsi %52 : tensor<1x128xi32> to tensor<1x128xi64>
+      %64 = tt.broadcast %63 : (tensor<1x128xi64>) -> tensor<16x128xi64>
+      %65 = arith.addi %64, %25 : tensor<16x128xi64>
+      %66 = tt.addptr %26, %65 : tensor<16x128x!tt.ptr<f32, 1>>, tensor<16x128xi64>
+      %67 = tt.load %66, %57, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x128xf32> // row of %arg1 selected by the wrapped index
+      %68 = arith.addf %67, %58 : tensor<16x128xf32>
+      %69 = arith.addf %68, %62 : tensor<16x128xf32> // x = %arg1 value + %arg2 value + widened %arg3 value
+      %70 = arith.subf %69, %arg9 : tensor<16x128xf32> // delta = x - running mean
+      %71 = arith.addf %arg11, %cst_1 : tensor<16x128xf32> // count + 1
+      %72 = arith.divf %70, %71 : tensor<16x128xf32>
+      %73 = arith.addf %arg9, %72 : tensor<16x128xf32> // updated mean
+      %74 = arith.subf %69, %73 : tensor<16x128xf32>
+      %75 = arith.mulf %70, %74 : tensor<16x128xf32>
+      %76 = arith.addf %arg10, %75 : tensor<16x128xf32> // updated sum of squared deviations (Welford-style running update)
+      %77 = arith.select %57, %73, %arg9 : tensor<16x128xi1>, tensor<16x128xf32>
+      %78 = arith.select %57, %76, %arg10 : tensor<16x128xi1>, tensor<16x128xf32>
+      %79 = arith.select %57, %71, %arg11 : tensor<16x128xi1>, tensor<16x128xf32>
+      scf.yield %77, %78, %79 : tensor<16x128xf32>, tensor<16x128xf32>, tensor<16x128xf32>
+    }
+    %28:3 = "tt.reduce"(%27#0, %27#1, %27#2) <{axis = 1 : i32}> ({ // merges the 128 per-column (mean, m2, count) triples of each row into one triple
+    ^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32): // (%arg8, %arg9, %arg10) and (%arg11, %arg12, %arg13) are the two (mean, m2, count) triples being combined
+      %51 = arith.subf %arg11, %arg8 : f32 // delta between the two means
+      %52 = arith.addf %arg10, %arg13 : f32 // combined count
+      %53 = arith.cmpf oeq, %52, %cst_0 : f32
+      %54 = arith.divf %arg13, %52 : f32
+      %55 = arith.select %53, %cst_0, %54 : f32 // second count / combined count, forced to 0 when the combined count is 0
+      %56 = arith.mulf %51, %55 : f32
+      %57 = arith.addf %arg8, %56 : f32 // merged mean
+      %58 = arith.addf %arg9, %arg12 : f32
+      %59 = arith.mulf %51, %51 : f32
+      %60 = arith.mulf %59, %arg10 : f32
+      %61 = arith.mulf %60, %55 : f32
+      %62 = arith.addf %58, %61 : f32 // merged m2 = m2_a + m2_b + delta^2 * count_a * ratio
+      tt.reduce.return %57, %62, %52 : f32, f32, f32
+    }) : (tensor<16x128xf32>, tensor<16x128xf32>, tensor<16x128xf32>) -> (tensor<16xf32>, tensor<16xf32>, tensor<16xf32>)
+    %29 = tt.expand_dims %28#0 {axis = 1 : i32} : (tensor<16xf32>) -> tensor<16x1xf32> // per-row mean
+    %30 = tt.expand_dims %28#1 {axis = 1 : i32} : (tensor<16xf32>) -> tensor<16x1xf32> // per-row summed squared deviation
+    %31 = arith.muli %11, %cst_9 : tensor<16x1xi32> // same expression as %12
+    %32 = tt.broadcast %31 : (tensor<16x1xi32>) -> tensor<16x128xi32>
+    %33 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<16x128x!tt.ptr<f32, 1>>
+    %34 = arith.muli %5, %cst_9 : tensor<16x1xi32> // same expression as %15
+    %35 = tt.broadcast %34 : (tensor<16x1xi32>) -> tensor<16x128xi32>
+    %36 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<16x128x!tt.ptr<bf16, 1>>
+    %37 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<1x128x!tt.ptr<f32, 1>> // %arg4: per-column f32 multiplier applied in the second loop
+    %38 = arith.addi %10, %cst_4 : tensor<16x1xi64> // %38..%46 repeat the index wrap, bounds check and %arg1 offset already computed in %18..%26
+    %39 = arith.cmpi slt, %10, %cst_3 : tensor<16x1xi64>
+    %40 = arith.select %39, %38, %10 : tensor<16x1xi1>, tensor<16x1xi64>
+    %41 = arith.cmpi sge, %40, %cst_3 : tensor<16x1xi64>
+    %42 = arith.cmpi slt, %40, %cst_4 : tensor<16x1xi64>
+    %43 = arith.andi %41, %42 : tensor<16x1xi1>
+    %44 = arith.muli %40, %cst_2 : tensor<16x1xi64>
+    %45 = tt.broadcast %44 : (tensor<16x1xi64>) -> tensor<16x128xi64>
+    %46 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<16x128x!tt.ptr<f32, 1>>
+    %47 = tt.broadcast %29 : (tensor<16x1xf32>) -> tensor<16x128xf32>
+    %48 = arith.divf %30, %cst_6 : tensor<16x1xf32> // variance = merged m2 / 256
+    %49 = arith.addf %48, %cst_5 : tensor<16x1xf32> // variance + epsilon
+    %50 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<16x128x!tt.ptr<bf16, 1>> // %arg5: bf16 output buffer
+    scf.for %arg8 = %c0_i32 to %c256_i32 step %c128_i32  : i32 { // pass 2: reload the inputs, subtract the mean, scale and store
+      %51 = tt.splat %arg8 : (i32) -> tensor<1x128xi32>
+      %52 = arith.addi %51, %7 : tensor<1x128xi32>
+      %53 = arith.cmpi slt, %52, %cst_10 : tensor<1x128xi32>
+      %54 = tt.broadcast %52 : (tensor<1x128xi32>) -> tensor<16x128xi32>
+      %55 = arith.addi %54, %32 : tensor<16x128xi32>
+      %56 = tt.addptr %33, %55 : tensor<16x128x!tt.ptr<f32, 1>>, tensor<16x128xi32>
+      %57 = tt.broadcast %53 : (tensor<1x128xi1>) -> tensor<16x128xi1>
+      %58 = tt.load %56, %57, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x128xf32>
+      %59 = arith.addi %54, %35 : tensor<16x128xi32>
+      %60 = tt.addptr %36, %59 : tensor<16x128x!tt.ptr<bf16, 1>>, tensor<16x128xi32>
+      %61 = tt.load %60, %57, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<16x128xbf16>
+      %62 = arith.extf %61 : tensor<16x128xbf16> to tensor<16x128xf32>
+      %63 = tt.addptr %37, %52 : tensor<1x128x!tt.ptr<f32, 1>>, tensor<1x128xi32>
+      %64 = tt.load %63, %53, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x128xf32> // per-column multiplier from %arg4
+      tt.assert %43, "index out of bounds: 0 <= tmp16 < 50257", "/usr/local/lib/python3.10/dist-packages/torch/_inductor/codecache.py", "<module>", 1892 : tensor<16x1xi1>
+      %65 = arith.extsi %52 : tensor<1x128xi32> to tensor<1x128xi64>
+      %66 = tt.broadcast %65 : (tensor<1x128xi64>) -> tensor<16x128xi64>
+      %67 = arith.addi %66, %45 : tensor<16x128xi64>
+      %68 = tt.addptr %46, %67 : tensor<16x128x!tt.ptr<f32, 1>>, tensor<16x128xi64>
+      %69 = tt.load %68, %57, %cst_8 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<16x128xf32>
+      %70 = arith.addf %69, %58 : tensor<16x128xf32>
+      %71 = arith.addf %70, %62 : tensor<16x128xf32> // same three-input sum as in pass 1
+      %72 = arith.subf %71, %47 : tensor<16x128xf32> // x - row mean
+      %73 = tt.extern_elementwise %49 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<16x1xf32>) -> tensor<16x1xf32> // rsqrt(variance + epsilon) via libdevice; its operand %49 is defined outside the loop
+      %74 = tt.broadcast %73 : (tensor<16x1xf32>) -> tensor<16x128xf32>
+      %75 = arith.mulf %72, %74 : tensor<16x128xf32>
+      %76 = tt.broadcast %64 : (tensor<1x128xf32>) -> tensor<16x128xf32>
+      %77 = arith.mulf %75, %76 : tensor<16x128xf32> // normalized value times the per-column multiplier
+      %78 = tt.addptr %50, %59 : tensor<16x128x!tt.ptr<bf16, 1>>, tensor<16x128xi32> // output reuses the offsets of the %arg3 load
+      %79 = arith.truncf %77 : tensor<16x128xf32> to tensor<16x128xbf16>
+      tt.store %78, %79, %57 {cache = 1 : i32, evict = 1 : i32} : tensor<16x128xbf16>
+    }
+    tt.return
+  }
+}

.triton/dump/fac03406d1136fc802dac111a1efea36/triton_.ptx ADDED Viewed

	@@ -0,0 +1,971 @@

+//
+// Generated by LLVM NVPTX Back-End
+//
+.version 8.2
+.target sm_89
+.address_size 64
+	// .globl	triton__0d1d2d3de4
+.extern .shared .align 1 .b8 global_smem[];
+.visible .entry triton__0d1d2d3de4(
+	.param .u64 triton__0d1d2d3de4_param_0,
+	.param .u64 triton__0d1d2d3de4_param_1,
+	.param .u64 triton__0d1d2d3de4_param_2,
+	.param .u64 triton__0d1d2d3de4_param_3,
+	.param .u64 triton__0d1d2d3de4_param_4
+)
+.maxntid 256, 1, 1
+{
+	.reg .pred 	%p<91>;
+	.reg .b16 	%rs<49>;
+	.reg .b32 	%r<84>;
+	.reg .f32 	%f<194>;
+	.reg .b64 	%rd<75>;
+	.loc	1 18 0
+$L__func_begin0:
+	.loc	1 18 0
+	ld.param.u64 	%rd16, [triton__0d1d2d3de4_param_2];
+	ld.param.u64 	%rd15, [triton__0d1d2d3de4_param_1];
+	ld.param.u64 	%rd18, [triton__0d1d2d3de4_param_0];
+$L__tmp0:
+	.loc	1 24 33
+	mov.u32 	%r1, %tid.x;
+	shr.u32 	%r2, %r1, 5;
+	and.b32  	%r3, %r1, 255;
+	or.b32  	%r8, %r3, 256;
+	or.b32  	%r9, %r3, 512;
+	or.b32  	%r10, %r3, 768;
+	or.b32  	%r11, %r3, 1024;
+	or.b32  	%r12, %r3, 1280;
+	or.b32  	%r13, %r3, 1536;
+	or.b32  	%r14, %r3, 1792;
+	.loc	1 21 28
+	mov.u32 %r7, %ctaid.x;
+	.loc	1 21 34
+	cvt.s64.s32 	%rd1, %r7;
+	cvt.u64.u32 	%rd2, %r3;
+	cvt.u64.u32 	%rd9, %r14;
+	cvt.u64.u32 	%rd8, %r13;
+	cvt.u64.u32 	%rd7, %r12;
+	cvt.u64.u32 	%rd6, %r11;
+	cvt.u64.u32 	%rd5, %r10;
+	cvt.u64.u32 	%rd4, %r9;
+	cvt.u64.u32 	%rd3, %r8;
+	.loc	1 27 36
+	mul.wide.s32 	%rd19, %r7, 100514;
+	add.s64 	%rd10, %rd18, %rd19;
+	mov.f32 	%f178, 0fFF800000;
+	mov.u64 	%rd73, 0;
+	mov.u16 	%rs2, 0;
+	mov.f32 	%f179, %f178;
+	mov.f32 	%f180, %f178;
+	mov.f32 	%f181, %f178;
+	mov.f32 	%f182, %f178;
+	mov.f32 	%f183, %f178;
+	mov.f32 	%f184, %f178;
+	mov.f32 	%f185, %f178;
+$L__BB0_1:
+$L__tmp1:
+	.loc	2 38 21
+	setp.num.f32 	%p18, %f178, %f178;
+	setp.num.f32 	%p19, %f179, %f179;
+	setp.num.f32 	%p20, %f180, %f180;
+	setp.num.f32 	%p21, %f181, %f181;
+	setp.num.f32 	%p22, %f182, %f182;
+	setp.num.f32 	%p23, %f183, %f183;
+	setp.num.f32 	%p24, %f184, %f184;
+	setp.num.f32 	%p25, %f185, %f185;
+$L__tmp2:
+	.loc	1 28 27
+	or.b64  	%rd28, %rd73, %rd2;
+	or.b64  	%rd29, %rd73, %rd3;
+	or.b64  	%rd30, %rd73, %rd4;
+	or.b64  	%rd31, %rd73, %rd5;
+	or.b64  	%rd32, %rd73, %rd6;
+	or.b64  	%rd33, %rd73, %rd7;
+	or.b64  	%rd34, %rd73, %rd8;
+	or.b64  	%rd35, %rd73, %rd9;
+	.loc	1 29 25
+	setp.lt.u64 	%p17, %rd35, 50257;
+	setp.lt.u64 	%p15, %rd34, 50257;
+	setp.lt.u64 	%p13, %rd33, 50257;
+	setp.lt.u64 	%p11, %rd32, 50257;
+	setp.lt.u64 	%p9, %rd31, 50257;
+	setp.lt.u64 	%p7, %rd30, 50257;
+	setp.lt.u64 	%p5, %rd29, 50257;
+	setp.lt.u64 	%p3, %rd28, 50257;
+	.loc	1 31 34
+	shl.b64 	%rd36, %rd28, 1;
+	add.s64 	%rd20, %rd10, %rd36;
+	shl.b64 	%rd37, %rd29, 1;
+	add.s64 	%rd21, %rd10, %rd37;
+	shl.b64 	%rd38, %rd30, 1;
+	add.s64 	%rd22, %rd10, %rd38;
+	shl.b64 	%rd39, %rd31, 1;
+	add.s64 	%rd23, %rd10, %rd39;
+	shl.b64 	%rd40, %rd32, 1;
+	add.s64 	%rd24, %rd10, %rd40;
+	shl.b64 	%rd41, %rd33, 1;
+	add.s64 	%rd25, %rd10, %rd41;
+	shl.b64 	%rd42, %rd34, 1;
+	add.s64 	%rd26, %rd10, %rd42;
+	shl.b64 	%rd43, %rd35, 1;
+	add.s64 	%rd27, %rd10, %rd43;
+	.loc	1 31 52
+	mov.u16 %rs1, 0x0;
+	@%p3 ld.global.L1::evict_last.b16 { %rs1 }, [ %rd20 + 0 ];
+	@!%p3 mov.u16 %rs1, %rs2;
+	mov.u16 %rs3, 0x0;
+	@%p5 ld.global.L1::evict_last.b16 { %rs3 }, [ %rd21 + 0 ];
+	@!%p5 mov.u16 %rs3, %rs2;
+	mov.u16 %rs5, 0x0;
+	@%p7 ld.global.L1::evict_last.b16 { %rs5 }, [ %rd22 + 0 ];
+	@!%p7 mov.u16 %rs5, %rs2;
+	mov.u16 %rs7, 0x0;
+	@%p9 ld.global.L1::evict_last.b16 { %rs7 }, [ %rd23 + 0 ];
+	@!%p9 mov.u16 %rs7, %rs2;
+	mov.u16 %rs9, 0x0;
+	@%p11 ld.global.L1::evict_last.b16 { %rs9 }, [ %rd24 + 0 ];
+	@!%p11 mov.u16 %rs9, %rs2;
+	mov.u16 %rs11, 0x0;
+	@%p13 ld.global.L1::evict_last.b16 { %rs11 }, [ %rd25 + 0 ];
+	@!%p13 mov.u16 %rs11, %rs2;
+	mov.u16 %rs13, 0x0;
+	@%p15 ld.global.L1::evict_last.b16 { %rs13 }, [ %rd26 + 0 ];
+	@!%p15 mov.u16 %rs13, %rs2;
+	mov.u16 %rs15, 0x0;
+	@%p17 ld.global.L1::evict_last.b16 { %rs15 }, [ %rd27 + 0 ];
+	@!%p17 mov.u16 %rs15, %rs2;
+	.loc	1 31 103
+	cvt.f32.bf16 %r15, %rs1;
+	mov.b32 	%f42, %r15;
+	cvt.f32.bf16 %r16, %rs3;
+	mov.b32 	%f43, %r16;
+	cvt.f32.bf16 %r17, %rs5;
+	mov.b32 	%f44, %r17;
+	cvt.f32.bf16 %r18, %rs7;
+	mov.b32 	%f45, %r18;
+	cvt.f32.bf16 %r19, %rs9;
+	mov.b32 	%f46, %r19;
+	cvt.f32.bf16 %r20, %rs11;
+	mov.b32 	%f47, %r20;
+	cvt.f32.bf16 %r21, %rs13;
+	mov.b32 	%f48, %r21;
+	cvt.f32.bf16 %r22, %rs15;
+	mov.b32 	%f49, %r22;
+$L__tmp3:
+	.loc	2 36 15
+	setp.leu.f32 	%p26, %f178, %f42;
+	setp.leu.f32 	%p27, %f179, %f43;
+	setp.leu.f32 	%p28, %f180, %f44;
+	setp.leu.f32 	%p29, %f181, %f45;
+	setp.leu.f32 	%p30, %f182, %f46;
+	setp.leu.f32 	%p31, %f183, %f47;
+	setp.leu.f32 	%p32, %f184, %f48;
+	setp.leu.f32 	%p33, %f185, %f49;
+$L__tmp4:
+	.loc	1 0 0
+	selp.f32 	%f50, %f49, %f185, %p33;
+	selp.f32 	%f51, %f50, %f185, %p25;
+	selp.f32 	%f185, %f51, %f185, %p17;
+	selp.f32 	%f52, %f48, %f184, %p32;
+	selp.f32 	%f53, %f52, %f184, %p24;
+	selp.f32 	%f184, %f53, %f184, %p15;
+	selp.f32 	%f54, %f47, %f183, %p31;
+	selp.f32 	%f55, %f54, %f183, %p23;
+	selp.f32 	%f183, %f55, %f183, %p13;
+	selp.f32 	%f56, %f46, %f182, %p30;
+	selp.f32 	%f57, %f56, %f182, %p22;
+	selp.f32 	%f182, %f57, %f182, %p11;
+	selp.f32 	%f58, %f45, %f181, %p29;
+	selp.f32 	%f59, %f58, %f181, %p21;
+	selp.f32 	%f181, %f59, %f181, %p9;
+	selp.f32 	%f60, %f44, %f180, %p28;
+	selp.f32 	%f61, %f60, %f180, %p20;
+	selp.f32 	%f180, %f61, %f180, %p7;
+	selp.f32 	%f62, %f43, %f179, %p27;
+	selp.f32 	%f63, %f62, %f179, %p19;
+	selp.f32 	%f179, %f63, %f179, %p5;
+	selp.f32 	%f64, %f42, %f178, %p26;
+	selp.f32 	%f65, %f64, %f178, %p18;
+	selp.f32 	%f178, %f65, %f178, %p3;
+	.loc	1 27 36
+	add.s64 	%rd73, %rd73, 2048;
+	cvt.u32.u64 	%r23, %rd73;
+	add.s32 	%r24, %r23, -2048;
+	setp.lt.u32 	%p34, %r24, 48209;
+	@%p34 bra 	$L__BB0_1;
+	.loc	1 24 33
+	and.b32  	%r4, %r1, 31;
+	and.b32  	%r32, %r2, 7;
+$L__tmp5:
+	.loc	2 36 15
+	setp.gt.f32 	%p39, %f178, %f179;
+	.loc	2 38 21
+	setp.nan.f32 	%p40, %f178, %f178;
+	.loc	2 39 29
+	selp.f32 	%f74, %f178, %f179, %p40;
+	selp.f32 	%f75, %f178, %f74, %p39;
+	.loc	2 36 15
+	setp.gt.f32 	%p41, %f75, %f180;
+	.loc	2 38 21
+	setp.nan.f32 	%p42, %f75, %f75;
+	.loc	2 39 29
+	selp.f32 	%f76, %f75, %f180, %p42;
+	selp.f32 	%f77, %f75, %f76, %p41;
+	.loc	2 36 15
+	setp.gt.f32 	%p43, %f77, %f181;
+	.loc	2 38 21
+	setp.nan.f32 	%p44, %f77, %f77;
+	.loc	2 39 29
+	selp.f32 	%f78, %f77, %f181, %p44;
+	selp.f32 	%f79, %f77, %f78, %p43;
+	.loc	2 36 15
+	setp.gt.f32 	%p45, %f79, %f182;
+	.loc	2 38 21
+	setp.nan.f32 	%p46, %f79, %f79;
+	.loc	2 39 29
+	selp.f32 	%f80, %f79, %f182, %p46;
+	selp.f32 	%f81, %f79, %f80, %p45;
+	.loc	2 36 15
+	setp.gt.f32 	%p47, %f81, %f183;
+	.loc	2 38 21
+	setp.nan.f32 	%p48, %f81, %f81;
+	.loc	2 39 29
+	selp.f32 	%f82, %f81, %f183, %p48;
+	selp.f32 	%f83, %f81, %f82, %p47;
+	.loc	2 36 15
+	setp.gt.f32 	%p49, %f83, %f184;
+	.loc	2 38 21
+	setp.nan.f32 	%p50, %f83, %f83;
+	.loc	2 39 29
+	selp.f32 	%f84, %f83, %f184, %p50;
+	selp.f32 	%f85, %f83, %f84, %p49;
+	.loc	2 36 15
+	setp.gt.f32 	%p51, %f85, %f185;
+	.loc	2 38 21
+	setp.nan.f32 	%p52, %f85, %f85;
+	.loc	2 39 29
+	selp.f32 	%f86, %f85, %f185, %p52;
+	selp.f32 	%f87, %f85, %f86, %p51;
+$L__tmp6:
+	.loc	2 49 29
+	mov.b32 	%r33, %f87;
+	shfl.sync.bfly.b32	%r34, %r33, 16, 31, -1;
+	mov.b32 	%f88, %r34;
+$L__tmp7:
+	.loc	2 36 15
+	setp.gt.f32 	%p53, %f87, %f88;
+	.loc	2 38 21
+	setp.nan.f32 	%p54, %f87, %f87;
+	.loc	2 39 29
+	selp.f32 	%f89, %f87, %f88, %p53;
+	selp.f32 	%f90, %f87, %f89, %p54;
+$L__tmp8:
+	.loc	2 49 29
+	mov.b32 	%r35, %f90;
+	shfl.sync.bfly.b32	%r36, %r35, 8, 31, -1;
+	mov.b32 	%f91, %r36;
+$L__tmp9:
+	.loc	2 36 15
+	setp.gt.f32 	%p55, %f90, %f91;
+	.loc	2 38 21
+	setp.nan.f32 	%p56, %f90, %f90;
+	.loc	2 39 29
+	selp.f32 	%f92, %f90, %f91, %p56;
+	selp.f32 	%f93, %f90, %f92, %p55;
+$L__tmp10:
+	.loc	2 49 29
+	mov.b32 	%r37, %f93;
+	shfl.sync.bfly.b32	%r38, %r37, 4, 31, -1;
+	mov.b32 	%f94, %r38;
+$L__tmp11:
+	.loc	2 36 15
+	setp.gt.f32 	%p57, %f93, %f94;
+	.loc	2 38 21
+	setp.nan.f32 	%p58, %f93, %f93;
+	.loc	2 39 29
+	selp.f32 	%f95, %f93, %f94, %p58;
+	selp.f32 	%f96, %f93, %f95, %p57;
+$L__tmp12:
+	.loc	2 49 29
+	mov.b32 	%r39, %f96;
+	shfl.sync.bfly.b32	%r40, %r39, 2, 31, -1;
+	mov.b32 	%f97, %r40;
+$L__tmp13:
+	.loc	2 36 15
+	setp.gt.f32 	%p59, %f96, %f97;
+	.loc	2 38 21
+	setp.nan.f32 	%p60, %f96, %f96;
+	.loc	2 39 29
+	selp.f32 	%f98, %f96, %f97, %p60;
+	selp.f32 	%f99, %f96, %f98, %p59;
+$L__tmp14:
+	.loc	2 49 29
+	mov.b32 	%r41, %f99;
+	shfl.sync.bfly.b32	%r42, %r41, 1, 31, -1;
+	mov.b32 	%f100, %r42;
+$L__tmp15:
+	.loc	2 36 15
+	setp.gt.f32 	%p61, %f99, %f100;
+	.loc	2 38 21
+	setp.nan.f32 	%p62, %f99, %f99;
+	.loc	2 39 29
+	selp.f32 	%f101, %f99, %f100, %p62;
+	selp.f32 	%f102, %f99, %f101, %p61;
+$L__tmp16:
+	.loc	2 49 29
+	setp.eq.s32 	%p35, %r4, 0;
+	shl.b32 	%r43, %r32, 2;
+	mov.u32 	%r44, global_smem;
+	add.s32 	%r62, %r44, %r43;
+	mov.b32 	%r26, %f102;
+	@%p35 st.shared.b32 [ %r62 + 0 ], %r26;
+	bar.sync 	0;
+	setp.lt.s32 	%p36, %r1, 8;
+	shl.b32 	%r45, %r1, 2;
+	add.s32 	%r65, %r44, %r45;
+	@%p36 ld.shared.b32 %r27, [ %r65 + 0 ];
+	mov.b32 	%f103, %r27;
+	shfl.sync.bfly.b32	%r46, %r27, 4, 31, -1;
+	mov.b32 	%f104, %r46;
+$L__tmp17:
+	.loc	2 36 15
+	setp.gt.f32 	%p63, %f103, %f104;
+	.loc	2 38 21
+	setp.nan.f32 	%p64, %f103, %f103;
+	.loc	2 39 29
+	selp.f32 	%f105, %f103, %f104, %p63;
+	selp.f32 	%f106, %f103, %f105, %p64;
+$L__tmp18:
+	.loc	2 49 29
+	mov.b32 	%r47, %f106;
+	shfl.sync.bfly.b32	%r48, %r47, 2, 31, -1;
+	mov.b32 	%f107, %r48;
+$L__tmp19:
+	.loc	2 36 15
+	setp.gt.f32 	%p65, %f106, %f107;
+	.loc	2 38 21
+	setp.nan.f32 	%p66, %f106, %f106;
+	.loc	2 39 29
+	selp.f32 	%f108, %f106, %f107, %p66;
+	selp.f32 	%f109, %f106, %f108, %p65;
+$L__tmp20:
+	.loc	2 49 29
+	mov.b32 	%r49, %f109;
+	shfl.sync.bfly.b32	%r50, %r49, 1, 31, -1;
+	mov.b32 	%f110, %r50;
+$L__tmp21:
+	.loc	2 36 15
+	setp.gt.f32 	%p67, %f109, %f110;
+	.loc	2 38 21
+	setp.nan.f32 	%p68, %f109, %f109;
+	.loc	2 39 29
+	selp.f32 	%f111, %f109, %f110, %p68;
+	selp.f32 	%f112, %f109, %f111, %p67;
+$L__tmp22:
+	.loc	2 49 29
+	and.b32  	%r51, %r1, 7;
+	setp.eq.s32 	%p69, %r51, 0;
+	and.pred  	%p89, %p36, %p69;
+	mov.b32 	%r30, %f112;
+	@%p89 st.shared.b32 [ %r65 + 0 ], %r30;
+	bar.sync 	0;
+	ld.shared.f32 	%f17, [global_smem];
+$L__tmp23:
+	.loc	1 36 41
+	bar.sync 	0;
+	st.shared.f32 	[global_smem], %f17;
+	bar.sync 	0;
+	ld.shared.u32 	%r31, [global_smem];
+	.loc	1 37 25
+	shl.b64 	%rd46, %rd1, 2;
+	add.s64 	%rd44, %rd15, %rd46;
+	.loc	1 37 36
+	setp.eq.s32 	%p38, %r3, 0;
+	@%p38 st.global.b32 [ %rd44 + 0 ], { %r31 };
+	mov.f32 	%f186, 0f00000000;
+	mov.u64 	%rd74, 0;
+	mov.f32 	%f187, %f186;
+	mov.f32 	%f188, %f186;
+	mov.f32 	%f189, %f186;
+	mov.f32 	%f190, %f186;
+	mov.f32 	%f191, %f186;
+	mov.f32 	%f192, %f186;
+	mov.f32 	%f193, %f186;
+$L__BB0_3:
+	.loc	1 40 27
+	or.b64  	%rd55, %rd74, %rd2;
+	or.b64  	%rd56, %rd74, %rd3;
+	or.b64  	%rd57, %rd74, %rd4;
+	or.b64  	%rd58, %rd74, %rd5;
+	or.b64  	%rd59, %rd74, %rd6;
+	or.b64  	%rd60, %rd74, %rd7;
+	or.b64  	%rd61, %rd74, %rd8;
+	or.b64  	%rd62, %rd74, %rd9;
+	.loc	1 41 25
+	setp.lt.u64 	%p85, %rd62, 50257;
+	setp.lt.u64 	%p83, %rd61, 50257;
+	setp.lt.u64 	%p81, %rd60, 50257;
+	setp.lt.u64 	%p79, %rd59, 50257;
+	setp.lt.u64 	%p77, %rd58, 50257;
+	setp.lt.u64 	%p75, %rd57, 50257;
+	setp.lt.u64 	%p73, %rd56, 50257;
+	setp.lt.u64 	%p71, %rd55, 50257;
+	.loc	1 43 34
+	shl.b64 	%rd63, %rd55, 1;
+	add.s64 	%rd47, %rd10, %rd63;
+	shl.b64 	%rd64, %rd56, 1;
+	add.s64 	%rd48, %rd10, %rd64;
+	shl.b64 	%rd65, %rd57, 1;
+	add.s64 	%rd49, %rd10, %rd65;
+	shl.b64 	%rd66, %rd58, 1;
+	add.s64 	%rd50, %rd10, %rd66;
+	shl.b64 	%rd67, %rd59, 1;
+	add.s64 	%rd51, %rd10, %rd67;
+	shl.b64 	%rd68, %rd60, 1;
+	add.s64 	%rd52, %rd10, %rd68;
+	shl.b64 	%rd69, %rd61, 1;
+	add.s64 	%rd53, %rd10, %rd69;
+	shl.b64 	%rd70, %rd62, 1;
+	add.s64 	%rd54, %rd10, %rd70;
+	.loc	1 43 52
+	mov.u16 %rs25, 0x0;
+	@%p71 ld.global.L1::evict_first.b16 { %rs25 }, [ %rd47 + 0 ];
+	@!%p71 mov.u16 %rs25, %rs2;
+	mov.u16 %rs27, 0x0;
+	@%p73 ld.global.L1::evict_first.b16 { %rs27 }, [ %rd48 + 0 ];
+	@!%p73 mov.u16 %rs27, %rs2;
+	mov.u16 %rs29, 0x0;
+	@%p75 ld.global.L1::evict_first.b16 { %rs29 }, [ %rd49 + 0 ];
+	@!%p75 mov.u16 %rs29, %rs2;
+	mov.u16 %rs31, 0x0;
+	@%p77 ld.global.L1::evict_first.b16 { %rs31 }, [ %rd50 + 0 ];
+	@!%p77 mov.u16 %rs31, %rs2;
+	mov.u16 %rs33, 0x0;
+	@%p79 ld.global.L1::evict_first.b16 { %rs33 }, [ %rd51 + 0 ];
+	@!%p79 mov.u16 %rs33, %rs2;
+	mov.u16 %rs35, 0x0;
+	@%p81 ld.global.L1::evict_first.b16 { %rs35 }, [ %rd52 + 0 ];
+	@!%p81 mov.u16 %rs35, %rs2;
+	mov.u16 %rs37, 0x0;
+	@%p83 ld.global.L1::evict_first.b16 { %rs37 }, [ %rd53 + 0 ];
+	@!%p83 mov.u16 %rs37, %rs2;
+	mov.u16 %rs39, 0x0;
+	@%p85 ld.global.L1::evict_first.b16 { %rs39 }, [ %rd54 + 0 ];
+	@!%p85 mov.u16 %rs39, %rs2;
+	.loc	1 43 104
+	cvt.f32.bf16 %r52, %rs25;
+	mov.b32 	%f129, %r52;
+	cvt.f32.bf16 %r53, %rs27;
+	mov.b32 	%f130, %r53;
+	cvt.f32.bf16 %r54, %rs29;
+	mov.b32 	%f131, %r54;
+	cvt.f32.bf16 %r55, %rs31;
+	mov.b32 	%f132, %r55;
+	cvt.f32.bf16 %r56, %rs33;
+	mov.b32 	%f133, %r56;
+	cvt.f32.bf16 %r57, %rs35;
+	mov.b32 	%f134, %r57;
+	cvt.f32.bf16 %r58, %rs37;
+	mov.b32 	%f135, %r58;
+	cvt.f32.bf16 %r59, %rs39;
+	mov.b32 	%f136, %r59;
+	.loc	1 45 22
+	sub.f32 	%f137, %f129, %f17;
+	sub.f32 	%f138, %f130, %f17;
+	sub.f32 	%f139, %f131, %f17;
+	sub.f32 	%f140, %f132, %f17;
+	sub.f32 	%f141, %f133, %f17;
+	sub.f32 	%f142, %f134, %f17;
+	sub.f32 	%f143, %f135, %f17;
+	sub.f32 	%f144, %f136, %f17;
+	.loc	1 46 22
+	mul.f32 	%f114, %f137, 0f3FB8AA3B;
+	ex2.approx.f32 %f113, %f114;
+	mul.f32 	%f116, %f138, 0f3FB8AA3B;
+	ex2.approx.f32 %f115, %f116;
+	mul.f32 	%f118, %f139, 0f3FB8AA3B;
+	ex2.approx.f32 %f117, %f118;
+	mul.f32 	%f120, %f140, 0f3FB8AA3B;
+	ex2.approx.f32 %f119, %f120;
+	mul.f32 	%f122, %f141, 0f3FB8AA3B;
+	ex2.approx.f32 %f121, %f122;
+	mul.f32 	%f124, %f142, 0f3FB8AA3B;
+	ex2.approx.f32 %f123, %f124;
+	mul.f32 	%f126, %f143, 0f3FB8AA3B;
+	ex2.approx.f32 %f125, %f126;
+	mul.f32 	%f128, %f144, 0f3FB8AA3B;
+	ex2.approx.f32 %f127, %f128;
+	.loc	1 49 40
+	selp.f32 	%f145, %f113, 0f80000000, %p71;
+	selp.f32 	%f146, %f115, 0f80000000, %p73;
+	selp.f32 	%f147, %f117, 0f80000000, %p75;
+	selp.f32 	%f148, %f119, 0f80000000, %p77;
+	selp.f32 	%f149, %f121, 0f80000000, %p79;
+	selp.f32 	%f150, %f123, 0f80000000, %p81;
+	selp.f32 	%f151, %f125, 0f80000000, %p83;
+	selp.f32 	%f152, %f127, 0f80000000, %p85;
+	add.f32 	%f193, %f193, %f152;
+	add.f32 	%f192, %f192, %f151;
+	add.f32 	%f191, %f191, %f150;
+	add.f32 	%f190, %f190, %f149;
+	add.f32 	%f189, %f189, %f148;
+	add.f32 	%f188, %f188, %f147;
+	add.f32 	%f187, %f187, %f146;
+	add.f32 	%f186, %f186, %f145;
+	.loc	1 39 36
+	add.s64 	%rd74, %rd74, 2048;
+	cvt.u32.u64 	%r60, %rd74;
+	add.s32 	%r61, %r60, -2048;
+	setp.lt.u32 	%p86, %r61, 48209;
+	@%p86 bra 	$L__BB0_3;
+$L__tmp24:
+	.loc	3 243 36
+	bar.sync 	0;
+$L__tmp25:
+	.loc	3 233 15
+	add.f32 	%f153, %f186, %f187;
+	add.f32 	%f154, %f188, %f153;
+	add.f32 	%f155, %f189, %f154;
+	add.f32 	%f156, %f190, %f155;
+	add.f32 	%f157, %f191, %f156;
+	add.f32 	%f158, %f192, %f157;
+	add.f32 	%f159, %f193, %f158;
+$L__tmp26:
+	.loc	3 243 36
+	mov.b32 	%r69, %f159;
+	shfl.sync.bfly.b32	%r70, %r69, 16, 31, -1;
+	mov.b32 	%f160, %r70;
+$L__tmp27:
+	.loc	3 233 15
+	add.f32 	%f161, %f159, %f160;
+$L__tmp28:
+	.loc	3 243 36
+	mov.b32 	%r71, %f161;
+	shfl.sync.bfly.b32	%r72, %r71, 8, 31, -1;
+	mov.b32 	%f162, %r72;
+$L__tmp29:
+	.loc	3 233 15
+	add.f32 	%f163, %f161, %f162;
+$L__tmp30:
+	.loc	3 243 36
+	mov.b32 	%r73, %f163;
+	shfl.sync.bfly.b32	%r74, %r73, 4, 31, -1;
+	mov.b32 	%f164, %r74;
+$L__tmp31:
+	.loc	3 233 15
+	add.f32 	%f165, %f163, %f164;
+$L__tmp32:
+	.loc	3 243 36
+	mov.b32 	%r75, %f165;
+	shfl.sync.bfly.b32	%r76, %r75, 2, 31, -1;
+	mov.b32 	%f166, %r76;
+$L__tmp33:
+	.loc	3 233 15
+	add.f32 	%f167, %f165, %f166;
+$L__tmp34:
+	.loc	3 243 36
+	mov.b32 	%r77, %f167;
+	shfl.sync.bfly.b32	%r78, %r77, 1, 31, -1;
+	mov.b32 	%f168, %r78;
+$L__tmp35:
+	.loc	3 233 15
+	add.f32 	%f169, %f167, %f168;
+$L__tmp36:
+	.loc	3 243 36
+	mov.b32 	%r63, %f169;
+	@%p35 st.shared.b32 [ %r62 + 0 ], %r63;
+	bar.sync 	0;
+	@%p36 ld.shared.b32 %r64, [ %r65 + 0 ];
+	mov.b32 	%f170, %r64;
+	shfl.sync.bfly.b32	%r79, %r64, 4, 31, -1;
+	mov.b32 	%f171, %r79;
+$L__tmp37:
+	.loc	3 233 15
+	add.f32 	%f172, %f170, %f171;
+$L__tmp38:
+	.loc	3 243 36
+	mov.b32 	%r80, %f172;
+	shfl.sync.bfly.b32	%r81, %r80, 2, 31, -1;
+	mov.b32 	%f173, %r81;
+$L__tmp39:
+	.loc	3 233 15
+	add.f32 	%f174, %f172, %f173;
+$L__tmp40:
+	.loc	3 243 36
+	mov.b32 	%r82, %f174;
+	shfl.sync.bfly.b32	%r83, %r82, 1, 31, -1;
+	mov.b32 	%f175, %r83;
+$L__tmp41:
+	.loc	3 233 15
+	add.f32 	%f176, %f174, %f175;
+$L__tmp42:
+	.loc	3 243 36
+	mov.b32 	%r67, %f176;
+	@%p89 st.shared.b32 [ %r65 + 0 ], %r67;
+	bar.sync 	0;
+	ld.shared.f32 	%f177, [global_smem];
+$L__tmp43:
+	.loc	1 50 30
+	bar.sync 	0;
+	st.shared.f32 	[global_smem], %f177;
+	bar.sync 	0;
+	ld.shared.u32 	%r68, [global_smem];
+	.loc	1 51 25
+	add.s64 	%rd71, %rd16, %rd46;
+	.loc	1 51 37
+	@%p38 st.global.b32 [ %rd71 + 0 ], { %r68 };
+	.loc	1 51 4
+	ret;
+$L__tmp44:
+$L__func_end0:
+}
+	.file	1 "/tmp/torchinductor_root/cy/ccyhhqogjmaiuaq7b54att75rswph7r3hvxgfmkjyupj74n77r6i.py"
+	.file	2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
+	.file	3 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
+	.section	.debug_abbrev
+	{
+.b8 1
+.b8 17
+.b8 1
+.b8 37
+.b8 8
+.b8 19
+.b8 5
+.b8 3
+.b8 8
+.b8 16
+.b8 6
+.b8 27
+.b8 8
+.b8 180
+.b8 66
+.b8 12
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+.b8 0
+.b8 2
+.b8 46
+.b8 0
+.b8 135
+.b8 64
+.b8 8
+.b8 3
+.b8 8
+.b8 58
+.b8 11
+.b8 59
+.b8 11
+.b8 63
+.b8 12
+.b8 32
+.b8 11
+.b8 0
+.b8 0
+.b8 3
+.b8 46
+.b8 1
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 64
+.b8 10
+.b8 49
+.b8 19
+.b8 0
+.b8 0
+.b8 4
+.b8 29
+.b8 0
+.b8 49
+.b8 19
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 88
+.b8 11
+.b8 89
+.b8 11
+.b8 87
+.b8 11
+.b8 0
+.b8 0
+.b8 5
+.b8 29
+.b8 1
+.b8 49
+.b8 19
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 88
+.b8 11
+.b8 89
+.b8 11
+.b8 87
+.b8 11
+.b8 0
+.b8 0
+.b8 0
+	}
+	.section	.debug_info
+	{
+.b32 359
+.b8 2
+.b8 0
+.b32 .debug_abbrev
+.b8 8
+.b8 1
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2
+.b8 0
+.b8 99
+.b8 99
+.b8 121
+.b8 104
+.b8 104
+.b8 113
+.b8 111
+.b8 103
+.b8 106
+.b8 109
+.b8 97
+.b8 105
+.b8 117
+.b8 97
+.b8 113
+.b8 55
+.b8 98
+.b8 53
+.b8 52
+.b8 97
+.b8 116
+.b8 116
+.b8 55
+.b8 53
+.b8 114
+.b8 115
+.b8 119
+.b8 112
+.b8 104
+.b8 55
+.b8 114
+.b8 51
+.b8 104
+.b8 118
+.b8 120
+.b8 103
+.b8 102
+.b8 109
+.b8 107
+.b8 106
+.b8 121
+.b8 117
+.b8 112
+.b8 106
+.b8 55
+.b8 52
+.b8 110
+.b8 55
+.b8 55
+.b8 114
+.b8 54
+.b8 105
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line
+.b8 47
+.b8 116
+.b8 109
+.b8 112
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 95
+.b8 114
+.b8 111
+.b8 111
+.b8 116
+.b8 47
+.b8 99
+.b8 121
+.b8 0
+.b8 1
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 2
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 51
+.b8 100
+.b8 101
+.b8 52
+.b8 0
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 51
+.b8 100
+.b8 101
+.b8 52
+.b8 0
+.b8 1
+.b8 18
+.b8 1
+.b8 1
+.b8 3
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 1
+.b8 156
+.b32 125
+.b8 4
+.b32 125
+.b64 $L__tmp1
+.b64 $L__tmp4
+.b8 2
+.b8 34
+.b8 45
+.b8 5
+.b32 125
+.b64 $L__tmp5
+.b64 $L__tmp22
+.b8 2
+.b8 36
+.b8 38
+.b8 4
+.b32 125
+.b64 $L__tmp5
+.b64 $L__tmp22
+.b8 2
+.b8 49
+.b8 29
+.b8 0
+.b8 4
+.b32 125
+.b64 $L__tmp6
+.b64 $L__tmp23
+.b8 2
+.b8 36
+.b8 38
+.b8 4
+.b32 125
+.b64 $L__tmp24
+.b64 $L__tmp43
+.b8 3
+.b8 50
+.b8 27
+.b8 5
+.b32 125
+.b64 $L__tmp25
+.b64 $L__tmp42
+.b8 3
+.b8 50
+.b8 27
+.b8 4
+.b32 125
+.b64 $L__tmp25
+.b64 $L__tmp42
+.b8 3
+.b8 243
+.b8 36
+.b8 0
+.b8 0
+.b8 0
+	}
+	.section	.debug_pubnames  // DWARF public-names table for this kernel; field meanings below follow the usual .debug_pubnames layout — TODO confirm against the DWARF spec
+	{
+.b32 $L__pubNames_end0-$L__pubNames_start0  // byte length of everything between the start and end labels
+$L__pubNames_start0:
+.b8 2  // with the next byte: presumably a 16-bit little-endian version field (= 2) — verify
+.b8 0
+.b32 .debug_info  // reference to the .debug_info section this table describes
+.b32 363  // presumably the size of that .debug_info unit (its leading length word is 359, plus 4 for the word itself) — verify
+.b32 125  // presumably the offset of the named entry inside .debug_info; .debug_info refers to the same value 125 — verify
+.b8 116  // from here: NUL-terminated ASCII string "triton__0d1d2d3de4"
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 51
+.b8 100
+.b8 101
+.b8 52
+.b8 0  // string terminator
+.b32 0  // zero word closing the entry list (presumably the end-of-table marker)
+$L__pubNames_end0:
+	}
+	.section	.debug_pubtypes
+	{
+.b32 $L__pubTypes_end0-$L__pubTypes_start0
+$L__pubTypes_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 363
+.b32 0
+$L__pubTypes_end0:
+	}
+	.section	.debug_loc	{	}

.triton/dump/fac03406d1136fc802dac111a1efea36/triton_.ttir ADDED Viewed

	@@ -0,0 +1,79 @@

+module {  // Triton IR: each program reduces one row of 50257 bf16 values to (a) its maximum and (b) the sum of exp(x - max); looks like the statistics of a numerically stable softmax/log-softmax — confirm against the generating Inductor source
+  tt.func public @triton__0d1d2d3de4(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i64) attributes {noinline = false} {  // %arg0: bf16 input; %arg1: f32 output for the row max; %arg2: f32 output for the exp-sum; %arg3/%arg4 are never referenced in this body
+    %c50257_i64 = arith.constant 50257 : i64  // row length, also used as the row stride in elements
+    %cst = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16>  // fill value returned by masked-off load lanes
+    %cst_0 = arith.constant dense<true> : tensor<1x2048xi1>  // all-true mask, used below to negate via xor
+    %c50257_i32 = arith.constant 50257 : i32  // loop upper bound
+    %c2048_i32 = arith.constant 2048 : i32  // loop step = tile width
+    %c0_i32 = arith.constant 0 : i32  // loop lower bound
+    %cst_1 = arith.constant dense<50257> : tensor<1x2048xi64>  // bound for the in-range column mask
+    %cst_2 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32>  // initial accumulator for the sum loop
+    %cst_3 = arith.constant dense<0xFF800000> : tensor<1x2048xf32>  // 0xFF800000 is f32 -infinity: initial accumulator for the max loop
+    %0 = tt.get_program_id x : i32  // program index along x; selects the row
+    %1 = arith.extsi %0 : i32 to i64
+    %2 = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32>  // lane indices 0..2047
+    %3 = tt.expand_dims %2 {axis = 0 : i32} : (tensor<2048xi32>) -> tensor<1x2048xi32>
+    %4 = arith.extsi %3 : tensor<1x2048xi32> to tensor<1x2048xi64>
+    %5 = arith.muli %1, %c50257_i64 : i64  // element offset of this program's row
+    %6 = tt.splat %5 : (i64) -> tensor<1x2048xi64>
+    %7 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1x2048x!tt.ptr<bf16, 1>>
+    %8 = scf.for %arg5 = %c0_i32 to %c50257_i32 step %c2048_i32 iter_args(%arg6 = %cst_3) -> (tensor<1x2048xf32>)  : i32 {  // pass 1: per-lane running maximum over 2048-wide tiles (the last tile is partial since 50257 is not a multiple of 2048)
+      %22 = arith.extsi %arg5 : i32 to i64
+      %23 = tt.splat %22 : (i64) -> tensor<1x2048xi64>
+      %24 = arith.addi %23, %4 : tensor<1x2048xi64>  // column index = tile offset + lane index
+      %25 = arith.cmpi slt, %24, %cst_1 : tensor<1x2048xi64>  // mask: column is inside the row
+      %26 = arith.addi %24, %6 : tensor<1x2048xi64>  // flat element index = column + row offset
+      %27 = tt.addptr %7, %26 : tensor<1x2048x!tt.ptr<bf16, 1>>, tensor<1x2048xi64>
+      %28 = tt.load %27, %25, %cst {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x2048xbf16>  // masked load; out-of-range lanes read 0.0; eviction hint differs from pass 2 (evict = 2)
+      %29 = arith.extf %28 : tensor<1x2048xbf16> to tensor<1x2048xf32>
+      %30 = arith.cmpf ogt, %arg6, %29 : tensor<1x2048xf32>  // accumulator already greater than the new value
+      %31 = arith.cmpf une, %arg6, %arg6 : tensor<1x2048xf32>  // true only where the accumulator is NaN
+      %32 = arith.ori %30, %31 : tensor<1x2048xi1>  // "keep accumulator" condition
+      %33 = arith.xori %32, %cst_0 : tensor<1x2048xi1>  // logical NOT of the keep condition
+      %34 = arith.andi %25, %33 : tensor<1x2048xi1>  // replace only on in-range lanes
+      %35 = arith.select %34, %29, %arg6 : tensor<1x2048xi1>, tensor<1x2048xf32>  // a NaN accumulator sticks; a NaN input replaces a non-NaN accumulator, so NaN propagates
+      scf.yield %35 : tensor<1x2048xf32>
+    }
+    %9 = "tt.reduce"(%8) <{axis = 1 : i32}> ({  // collapse the 2048 per-lane maxima to a single value
+    ^bb0(%arg5: f32, %arg6: f32):
+      %22 = arith.cmpf ogt, %arg5, %arg6 : f32
+      %23 = arith.cmpf une, %arg5, %arg5 : f32  // left operand is NaN
+      %24 = arith.ori %22, %23 : i1
+      %25 = arith.select %24, %arg5, %arg6 : f32  // left operand if it is greater or NaN, otherwise right operand
+      tt.reduce.return %25 : f32
+    }) : (tensor<1x2048xf32>) -> tensor<1xf32>
+    %10 = tt.expand_dims %9 {axis = 1 : i32} : (tensor<1xf32>) -> tensor<1x1xf32>
+    %11 = tt.addptr %arg1, %1 : !tt.ptr<f32, 1>, i64  // output slot: %arg1[program id]
+    %12 = tt.splat %11 : (!tt.ptr<f32, 1>) -> tensor<1x1x!tt.ptr<f32, 1>>
+    tt.store %12, %10 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xf32>  // write the row maximum
+    %13 = arith.muli %1, %c50257_i64 : i64  // same row offset as %5, recomputed for the second pass
+    %14 = tt.splat %13 : (i64) -> tensor<1x2048xi64>
+    %15 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1x2048x!tt.ptr<bf16, 1>>
+    %16 = tt.broadcast %10 : (tensor<1x1xf32>) -> tensor<1x2048xf32>  // row maximum replicated across the tile
+    %17 = scf.for %arg5 = %c0_i32 to %c50257_i32 step %c2048_i32 iter_args(%arg6 = %cst_2) -> (tensor<1x2048xf32>)  : i32 {  // pass 2: per-lane running sum of exp(x - max), same tiling as pass 1
+      %22 = arith.extsi %arg5 : i32 to i64
+      %23 = tt.splat %22 : (i64) -> tensor<1x2048xi64>
+      %24 = arith.addi %23, %4 : tensor<1x2048xi64>  // column index
+      %25 = arith.cmpi slt, %24, %cst_1 : tensor<1x2048xi64>  // in-range mask
+      %26 = arith.addi %24, %14 : tensor<1x2048xi64>
+      %27 = tt.addptr %15, %26 : tensor<1x2048x!tt.ptr<bf16, 1>>, tensor<1x2048xi64>
+      %28 = tt.load %27, %25, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x2048xbf16>  // second read of the same row, with a different eviction hint than pass 1
+      %29 = arith.extf %28 : tensor<1x2048xbf16> to tensor<1x2048xf32>
+      %30 = arith.subf %29, %16 : tensor<1x2048xf32>  // x - max
+      %31 = math.exp %30 : tensor<1x2048xf32>
+      %32 = arith.addf %arg6, %31 : tensor<1x2048xf32>
+      %33 = arith.select %25, %32, %arg6 : tensor<1x2048xi1>, tensor<1x2048xf32>  // out-of-range lanes keep the old sum, so the 0.0 load filler never contributes
+      scf.yield %33 : tensor<1x2048xf32>
+    }
+    %18 = "tt.reduce"(%17) <{axis = 1 : i32}> ({  // add up the 2048 per-lane partial sums
+    ^bb0(%arg5: f32, %arg6: f32):
+      %22 = arith.addf %arg5, %arg6 : f32
+      tt.reduce.return %22 : f32
+    }) : (tensor<1x2048xf32>) -> tensor<1xf32>
+    %19 = tt.expand_dims %18 {axis = 1 : i32} : (tensor<1xf32>) -> tensor<1x1xf32>
+    %20 = tt.addptr %arg2, %1 : !tt.ptr<f32, 1>, i64  // output slot: %arg2[program id]
+    %21 = tt.splat %20 : (!tt.ptr<f32, 1>) -> tensor<1x1x!tt.ptr<f32, 1>>
+    tt.store %21, %19 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xf32>  // write the sum of exponentials
+    tt.return
+  }
+}