0-hero committed on
Commit
2000724
·
verified ·
1 Parent(s): 0def249

Add files using upload-large-folder tool

Browse files
.gitattributes CHANGED
@@ -49,3 +49,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
49
  .local/share/jupyter/nbextensions/nbTranslate/demo2.gif filter=lfs diff=lfs merge=lfs -text
50
  .local/share/jupyter/nbextensions/scratchpad/demo.gif filter=lfs diff=lfs merge=lfs -text
51
  .local/share/jupyter/nbextensions/toc2/demo.gif filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
49
  .local/share/jupyter/nbextensions/nbTranslate/demo2.gif filter=lfs diff=lfs merge=lfs -text
50
  .local/share/jupyter/nbextensions/scratchpad/demo.gif filter=lfs diff=lfs merge=lfs -text
51
  .local/share/jupyter/nbextensions/toc2/demo.gif filter=lfs diff=lfs merge=lfs -text
52
+ wandb/run-20240926_055222-14kj2390/run-14kj2390.wandb filter=lfs diff=lfs merge=lfs -text
53
+ wandb/run-20240926_124123-zc6s8e8w/run-zc6s8e8w.wandb filter=lfs diff=lfs merge=lfs -text
54
+ wandb/run-20240926_192831-378lr5yg/run-378lr5yg.wandb filter=lfs diff=lfs merge=lfs -text
55
+ wandb/run-20240927_021423-clesd0p8/run-clesd0p8.wandb filter=lfs diff=lfs merge=lfs -text
.triton/dump/0db70b0f0846c3c6c38c4ccb3ef979e3/triton_.llir ADDED
@@ -0,0 +1,1121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @assertFunc_1 = internal constant [25 x i8] c"_call_with_frames_removed"
5
+ @assertFile_1 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
6
+ @assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp16 < 50257"
7
+ @assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed"
8
+ @assertFile_0 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
9
+ @assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257"
10
+ @global_smem = external local_unnamed_addr addrspace(3) global [0 x i8]
11
+ @.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
12
+
13
+ declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr
14
+
15
+ define void @triton__0d1d2d3d4d5d6de7de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, i32 %6, i32 %7) local_unnamed_addr !dbg !7 {
16
+ %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
17
+ %10 = lshr i32 %9, 5, !dbg !10
18
+ %11 = and i32 %10, 7, !dbg !10
19
+ %12 = and i32 %9, 15, !dbg !10
20
+ %13 = shl i32 %9, 3, !dbg !11
21
+ %14 = and i32 %13, 248, !dbg !11
22
+ %15 = or i32 %14, 4, !dbg !11
23
+ %urem = and i32 %9, 255, !dbg !11
24
+ %16 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !12
25
+ %17 = shl i32 %16, 4, !dbg !13
26
+ %18 = or i32 %17, %11, !dbg !14
27
+ %19 = or i32 %18, 8, !dbg !14
28
+ %20 = or i32 %17, %12, !dbg !14
29
+ %21 = sext i32 %18 to i64, !dbg !15
30
+ %22 = getelementptr i64, ptr addrspace(1) %0, i64 %21, !dbg !15
31
+ %23 = sext i32 %19 to i64, !dbg !15
32
+ %24 = getelementptr i64, ptr addrspace(1) %0, i64 %23, !dbg !15
33
+ %25 = sext i32 %20 to i64, !dbg !15
34
+ %26 = getelementptr i64, ptr addrspace(1) %0, i64 %25, !dbg !15
35
+ %27 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !16
36
+ %28 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !16
37
+ %29 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !16
38
+ %30 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !16
39
+ %31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !16
40
+ %32 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !16
41
+ %33 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !16
42
+ %34 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !16
43
+ %35 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16
44
+ %36 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16
45
+ %37 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16
46
+ %38 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16
47
+ %39 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16
48
+ %40 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16
49
+ %41 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16
50
+ %42 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16
51
+ %43 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %26, i1 true) #6, !dbg !16
52
+ %44 = srem i32 %18, 512, !dbg !17
53
+ %45 = srem i32 %19, 512, !dbg !17
54
+ %46 = shl nsw i32 %44, 8, !dbg !18
55
+ %47 = shl nsw i32 %45, 8, !dbg !18
56
+ %48 = or i32 %46, %14, !dbg !19
57
+ %49 = or i32 %46, %15, !dbg !19
58
+ %50 = or i32 %47, %14, !dbg !19
59
+ %51 = or i32 %47, %15, !dbg !19
60
+ %52 = sext i32 %48 to i64, !dbg !20
61
+ %53 = getelementptr float, ptr addrspace(1) %2, i64 %52, !dbg !20
62
+ %54 = sext i32 %49 to i64, !dbg !20
63
+ %55 = getelementptr float, ptr addrspace(1) %2, i64 %54, !dbg !20
64
+ %56 = sext i32 %50 to i64, !dbg !20
65
+ %57 = getelementptr float, ptr addrspace(1) %2, i64 %56, !dbg !20
66
+ %58 = sext i32 %51 to i64, !dbg !20
67
+ %59 = getelementptr float, ptr addrspace(1) %2, i64 %58, !dbg !20
68
+ %60 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %53, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !21
69
+ %61 = extractvalue { i32, i32, i32, i32 } %60, 0, !dbg !21
70
+ %62 = extractvalue { i32, i32, i32, i32 } %60, 1, !dbg !21
71
+ %63 = extractvalue { i32, i32, i32, i32 } %60, 2, !dbg !21
72
+ %64 = extractvalue { i32, i32, i32, i32 } %60, 3, !dbg !21
73
+ %65 = bitcast i32 %61 to float, !dbg !21
74
+ %66 = bitcast i32 %62 to float, !dbg !21
75
+ %67 = bitcast i32 %63 to float, !dbg !21
76
+ %68 = bitcast i32 %64 to float, !dbg !21
77
+ %69 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %55, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !21
78
+ %70 = extractvalue { i32, i32, i32, i32 } %69, 0, !dbg !21
79
+ %71 = extractvalue { i32, i32, i32, i32 } %69, 1, !dbg !21
80
+ %72 = extractvalue { i32, i32, i32, i32 } %69, 2, !dbg !21
81
+ %73 = extractvalue { i32, i32, i32, i32 } %69, 3, !dbg !21
82
+ %74 = bitcast i32 %70 to float, !dbg !21
83
+ %75 = bitcast i32 %71 to float, !dbg !21
84
+ %76 = bitcast i32 %72 to float, !dbg !21
85
+ %77 = bitcast i32 %73 to float, !dbg !21
86
+ %78 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %57, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !21
87
+ %79 = extractvalue { i32, i32, i32, i32 } %78, 0, !dbg !21
88
+ %80 = extractvalue { i32, i32, i32, i32 } %78, 1, !dbg !21
89
+ %81 = extractvalue { i32, i32, i32, i32 } %78, 2, !dbg !21
90
+ %82 = extractvalue { i32, i32, i32, i32 } %78, 3, !dbg !21
91
+ %83 = bitcast i32 %79 to float, !dbg !21
92
+ %84 = bitcast i32 %80 to float, !dbg !21
93
+ %85 = bitcast i32 %81 to float, !dbg !21
94
+ %86 = bitcast i32 %82 to float, !dbg !21
95
+ %87 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %59, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !21
96
+ %88 = extractvalue { i32, i32, i32, i32 } %87, 0, !dbg !21
97
+ %89 = extractvalue { i32, i32, i32, i32 } %87, 1, !dbg !21
98
+ %90 = extractvalue { i32, i32, i32, i32 } %87, 2, !dbg !21
99
+ %91 = extractvalue { i32, i32, i32, i32 } %87, 3, !dbg !21
100
+ %92 = bitcast i32 %88 to float, !dbg !21
101
+ %93 = bitcast i32 %89 to float, !dbg !21
102
+ %94 = bitcast i32 %90 to float, !dbg !21
103
+ %95 = bitcast i32 %91 to float, !dbg !21
104
+ %96 = shl i32 %18, 8, !dbg !22
105
+ %97 = shl i32 %19, 8, !dbg !22
106
+ %98 = or i32 %96, %14, !dbg !23
107
+ %99 = or i32 %97, %14, !dbg !23
108
+ %100 = sext i32 %98 to i64, !dbg !24
109
+ %101 = getelementptr i16, ptr addrspace(1) %3, i64 %100, !dbg !24
110
+ %102 = sext i32 %99 to i64, !dbg !24
111
+ %103 = getelementptr i16, ptr addrspace(1) %3, i64 %102, !dbg !24
112
+ %104 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %101, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !25
113
+ %105 = extractvalue { i32, i32, i32, i32 } %104, 0, !dbg !25
114
+ %106 = extractvalue { i32, i32, i32, i32 } %104, 1, !dbg !25
115
+ %107 = extractvalue { i32, i32, i32, i32 } %104, 2, !dbg !25
116
+ %108 = extractvalue { i32, i32, i32, i32 } %104, 3, !dbg !25
117
+ %109 = trunc i32 %105 to i16, !dbg !25
118
+ %extelt.offset = lshr i32 %105, 16, !dbg !25
119
+ %110 = trunc i32 %extelt.offset to i16, !dbg !25
120
+ %111 = trunc i32 %106 to i16, !dbg !25
121
+ %extelt.offset1 = lshr i32 %106, 16, !dbg !25
122
+ %112 = trunc i32 %extelt.offset1 to i16, !dbg !25
123
+ %113 = trunc i32 %107 to i16, !dbg !25
124
+ %extelt.offset2 = lshr i32 %107, 16, !dbg !25
125
+ %114 = trunc i32 %extelt.offset2 to i16, !dbg !25
126
+ %115 = trunc i32 %108 to i16, !dbg !25
127
+ %extelt.offset3 = lshr i32 %108, 16, !dbg !25
128
+ %116 = trunc i32 %extelt.offset3 to i16, !dbg !25
129
+ %117 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %103, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !25
130
+ %118 = extractvalue { i32, i32, i32, i32 } %117, 0, !dbg !25
131
+ %119 = extractvalue { i32, i32, i32, i32 } %117, 1, !dbg !25
132
+ %120 = extractvalue { i32, i32, i32, i32 } %117, 2, !dbg !25
133
+ %121 = extractvalue { i32, i32, i32, i32 } %117, 3, !dbg !25
134
+ %122 = trunc i32 %118 to i16, !dbg !25
135
+ %extelt.offset4 = lshr i32 %118, 16, !dbg !25
136
+ %123 = trunc i32 %extelt.offset4 to i16, !dbg !25
137
+ %124 = trunc i32 %119 to i16, !dbg !25
138
+ %extelt.offset5 = lshr i32 %119, 16, !dbg !25
139
+ %125 = trunc i32 %extelt.offset5 to i16, !dbg !25
140
+ %126 = trunc i32 %120 to i16, !dbg !25
141
+ %extelt.offset6 = lshr i32 %120, 16, !dbg !25
142
+ %127 = trunc i32 %extelt.offset6 to i16, !dbg !25
143
+ %128 = trunc i32 %121 to i16, !dbg !25
144
+ %extelt.offset7 = lshr i32 %121, 16, !dbg !25
145
+ %129 = trunc i32 %extelt.offset7 to i16, !dbg !25
146
+ %130 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %109) #6, !dbg !26
147
+ %131 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %110) #6, !dbg !26
148
+ %132 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %111) #6, !dbg !26
149
+ %133 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %112) #6, !dbg !26
150
+ %134 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %113) #6, !dbg !26
151
+ %135 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %114) #6, !dbg !26
152
+ %136 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %115) #6, !dbg !26
153
+ %137 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %116) #6, !dbg !26
154
+ %138 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %122) #6, !dbg !26
155
+ %139 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %123) #6, !dbg !26
156
+ %140 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %124) #6, !dbg !26
157
+ %141 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %125) #6, !dbg !26
158
+ %142 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %126) #6, !dbg !26
159
+ %143 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %127) #6, !dbg !26
160
+ %144 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %128) #6, !dbg !26
161
+ %145 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %129) #6, !dbg !26
162
+ %146 = add i64 %43, 50257, !dbg !27
163
+ %147 = icmp slt i64 %27, 0, !dbg !28
164
+ %148 = icmp slt i64 %35, 0, !dbg !28
165
+ %149 = icmp slt i64 %43, 0, !dbg !28
166
+ %150 = select i1 %149, i64 %146, i64 %43, !dbg !29
167
+ %151 = icmp ugt i64 %150, 50256, !dbg !30
168
+ br i1 %151, label %152, label %153, !dbg !31
169
+
170
+ 152: ; preds = %8
171
+ tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !31
172
+ br label %153, !dbg !31
173
+
174
+ 153: ; preds = %152, %8
175
+ %154 = shl i64 %27, 8, !dbg !32
176
+ %155 = add i64 %154, 12865792, !dbg !32
177
+ %156 = select i1 %147, i64 %155, i64 %154, !dbg !32
178
+ %157 = shl i64 %35, 8, !dbg !32
179
+ %158 = add i64 %157, 12865792, !dbg !32
180
+ %159 = select i1 %148, i64 %158, i64 %157, !dbg !32
181
+ %160 = zext nneg i32 %14 to i64
182
+ %161 = zext nneg i32 %15 to i64
183
+ %162 = or i64 %156, %160, !dbg !33
184
+ %163 = or i64 %156, %161, !dbg !33
185
+ %164 = or i64 %159, %160, !dbg !33
186
+ %165 = or i64 %159, %161, !dbg !33
187
+ %166 = getelementptr float, ptr addrspace(1) %1, i64 %162, !dbg !34
188
+ %167 = getelementptr float, ptr addrspace(1) %1, i64 %163, !dbg !34
189
+ %168 = getelementptr float, ptr addrspace(1) %1, i64 %164, !dbg !34
190
+ %169 = getelementptr float, ptr addrspace(1) %1, i64 %165, !dbg !34
191
+ %170 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %166, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !35
192
+ %171 = extractvalue { i32, i32, i32, i32 } %170, 0, !dbg !35
193
+ %172 = extractvalue { i32, i32, i32, i32 } %170, 1, !dbg !35
194
+ %173 = extractvalue { i32, i32, i32, i32 } %170, 2, !dbg !35
195
+ %174 = extractvalue { i32, i32, i32, i32 } %170, 3, !dbg !35
196
+ %175 = bitcast i32 %171 to float, !dbg !35
197
+ %176 = bitcast i32 %172 to float, !dbg !35
198
+ %177 = bitcast i32 %173 to float, !dbg !35
199
+ %178 = bitcast i32 %174 to float, !dbg !35
200
+ %179 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %167, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !35
201
+ %180 = extractvalue { i32, i32, i32, i32 } %179, 0, !dbg !35
202
+ %181 = extractvalue { i32, i32, i32, i32 } %179, 1, !dbg !35
203
+ %182 = extractvalue { i32, i32, i32, i32 } %179, 2, !dbg !35
204
+ %183 = extractvalue { i32, i32, i32, i32 } %179, 3, !dbg !35
205
+ %184 = bitcast i32 %180 to float, !dbg !35
206
+ %185 = bitcast i32 %181 to float, !dbg !35
207
+ %186 = bitcast i32 %182 to float, !dbg !35
208
+ %187 = bitcast i32 %183 to float, !dbg !35
209
+ %188 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %168, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !35
210
+ %189 = extractvalue { i32, i32, i32, i32 } %188, 0, !dbg !35
211
+ %190 = extractvalue { i32, i32, i32, i32 } %188, 1, !dbg !35
212
+ %191 = extractvalue { i32, i32, i32, i32 } %188, 2, !dbg !35
213
+ %192 = extractvalue { i32, i32, i32, i32 } %188, 3, !dbg !35
214
+ %193 = bitcast i32 %189 to float, !dbg !35
215
+ %194 = bitcast i32 %190 to float, !dbg !35
216
+ %195 = bitcast i32 %191 to float, !dbg !35
217
+ %196 = bitcast i32 %192 to float, !dbg !35
218
+ %197 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %169, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !35
219
+ %198 = extractvalue { i32, i32, i32, i32 } %197, 0, !dbg !35
220
+ %199 = extractvalue { i32, i32, i32, i32 } %197, 1, !dbg !35
221
+ %200 = extractvalue { i32, i32, i32, i32 } %197, 2, !dbg !35
222
+ %201 = extractvalue { i32, i32, i32, i32 } %197, 3, !dbg !35
223
+ %202 = bitcast i32 %198 to float, !dbg !35
224
+ %203 = bitcast i32 %199 to float, !dbg !35
225
+ %204 = bitcast i32 %200 to float, !dbg !35
226
+ %205 = bitcast i32 %201 to float, !dbg !35
227
+ %206 = fadd float %65, %175, !dbg !36
228
+ %207 = fadd float %66, %176, !dbg !36
229
+ %208 = fadd float %67, %177, !dbg !36
230
+ %209 = fadd float %68, %178, !dbg !36
231
+ %210 = fadd float %74, %184, !dbg !36
232
+ %211 = fadd float %75, %185, !dbg !36
233
+ %212 = fadd float %76, %186, !dbg !36
234
+ %213 = fadd float %77, %187, !dbg !36
235
+ %214 = fadd float %83, %193, !dbg !36
236
+ %215 = fadd float %84, %194, !dbg !36
237
+ %216 = fadd float %85, %195, !dbg !36
238
+ %217 = fadd float %86, %196, !dbg !36
239
+ %218 = fadd float %92, %202, !dbg !36
240
+ %219 = fadd float %93, %203, !dbg !36
241
+ %220 = fadd float %94, %204, !dbg !36
242
+ %221 = fadd float %95, %205, !dbg !36
243
+ %222 = fadd float %130, %206, !dbg !37
244
+ %223 = fadd float %131, %207, !dbg !37
245
+ %224 = fadd float %132, %208, !dbg !37
246
+ %225 = fadd float %133, %209, !dbg !37
247
+ %226 = fadd float %134, %210, !dbg !37
248
+ %227 = fadd float %135, %211, !dbg !37
249
+ %228 = fadd float %136, %212, !dbg !37
250
+ %229 = fadd float %137, %213, !dbg !37
251
+ %230 = fadd float %138, %214, !dbg !37
252
+ %231 = fadd float %139, %215, !dbg !37
253
+ %232 = fadd float %140, %216, !dbg !37
254
+ %233 = fadd float %141, %217, !dbg !37
255
+ %234 = fadd float %142, %218, !dbg !37
256
+ %235 = fadd float %143, %219, !dbg !37
257
+ %236 = fadd float %144, %220, !dbg !37
258
+ %237 = fadd float %145, %221, !dbg !37
259
+ %238 = fadd float %222, 0.000000e+00, !dbg !38
260
+ %239 = fadd float %223, 0.000000e+00, !dbg !38
261
+ %240 = fadd float %224, 0.000000e+00, !dbg !38
262
+ %241 = fadd float %225, 0.000000e+00, !dbg !38
263
+ %242 = fadd float %226, 0.000000e+00, !dbg !38
264
+ %243 = fadd float %227, 0.000000e+00, !dbg !38
265
+ %244 = fadd float %228, 0.000000e+00, !dbg !38
266
+ %245 = fadd float %229, 0.000000e+00, !dbg !38
267
+ %246 = fadd float %230, 0.000000e+00, !dbg !38
268
+ %247 = fadd float %231, 0.000000e+00, !dbg !38
269
+ %248 = fadd float %232, 0.000000e+00, !dbg !38
270
+ %249 = fadd float %233, 0.000000e+00, !dbg !38
271
+ %250 = fadd float %234, 0.000000e+00, !dbg !38
272
+ %251 = fadd float %235, 0.000000e+00, !dbg !38
273
+ %252 = fadd float %236, 0.000000e+00, !dbg !38
274
+ %253 = fadd float %237, 0.000000e+00, !dbg !38
275
+ %254 = fsub float %222, %238, !dbg !42
276
+ %255 = fsub float %223, %239, !dbg !42
277
+ %256 = fsub float %224, %240, !dbg !42
278
+ %257 = fsub float %225, %241, !dbg !42
279
+ %258 = fsub float %226, %242, !dbg !42
280
+ %259 = fsub float %227, %243, !dbg !42
281
+ %260 = fsub float %228, %244, !dbg !42
282
+ %261 = fsub float %229, %245, !dbg !42
283
+ %262 = fsub float %230, %246, !dbg !42
284
+ %263 = fsub float %231, %247, !dbg !42
285
+ %264 = fsub float %232, %248, !dbg !42
286
+ %265 = fsub float %233, %249, !dbg !42
287
+ %266 = fsub float %234, %250, !dbg !42
288
+ %267 = fsub float %235, %251, !dbg !42
289
+ %268 = fsub float %236, %252, !dbg !42
290
+ %269 = fsub float %237, %253, !dbg !42
291
+ %270 = fmul float %222, %254, !dbg !43
292
+ %271 = fmul float %223, %255, !dbg !43
293
+ %272 = fmul float %224, %256, !dbg !43
294
+ %273 = fmul float %225, %257, !dbg !43
295
+ %274 = fmul float %226, %258, !dbg !43
296
+ %275 = fmul float %227, %259, !dbg !43
297
+ %276 = fmul float %228, %260, !dbg !43
298
+ %277 = fmul float %229, %261, !dbg !43
299
+ %278 = fmul float %230, %262, !dbg !43
300
+ %279 = fmul float %231, %263, !dbg !43
301
+ %280 = fmul float %232, %264, !dbg !43
302
+ %281 = fmul float %233, %265, !dbg !43
303
+ %282 = fmul float %234, %266, !dbg !43
304
+ %283 = fmul float %235, %267, !dbg !43
305
+ %284 = fmul float %236, %268, !dbg !43
306
+ %285 = fmul float %237, %269, !dbg !43
307
+ %286 = fadd float %270, 0.000000e+00, !dbg !44
308
+ %287 = fadd float %271, 0.000000e+00, !dbg !44
309
+ %288 = fadd float %272, 0.000000e+00, !dbg !44
310
+ %289 = fadd float %273, 0.000000e+00, !dbg !44
311
+ %290 = fadd float %274, 0.000000e+00, !dbg !44
312
+ %291 = fadd float %275, 0.000000e+00, !dbg !44
313
+ %292 = fadd float %276, 0.000000e+00, !dbg !44
314
+ %293 = fadd float %277, 0.000000e+00, !dbg !44
315
+ %294 = fadd float %278, 0.000000e+00, !dbg !44
316
+ %295 = fadd float %279, 0.000000e+00, !dbg !44
317
+ %296 = fadd float %280, 0.000000e+00, !dbg !44
318
+ %297 = fadd float %281, 0.000000e+00, !dbg !44
319
+ %298 = fadd float %282, 0.000000e+00, !dbg !44
320
+ %299 = fadd float %283, 0.000000e+00, !dbg !44
321
+ %300 = fadd float %284, 0.000000e+00, !dbg !44
322
+ %301 = fadd float %285, 0.000000e+00, !dbg !44
323
+ %302 = fsub float %239, %238, !dbg !45
324
+ %303 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 2.000000e+00) #6, !dbg !49
325
+ %304 = fmul float %303, %302, !dbg !50
326
+ %305 = fadd float %238, %304, !dbg !51
327
+ %306 = fadd float %286, %287, !dbg !52
328
+ %307 = fmul float %302, %302, !dbg !53
329
+ %308 = fmul float %303, %307, !dbg !54
330
+ %309 = fadd float %308, %306, !dbg !55
331
+ %310 = fsub float %240, %305, !dbg !45
332
+ %311 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 3.000000e+00) #6, !dbg !49
333
+ %312 = fmul float %311, %310, !dbg !50
334
+ %313 = fadd float %305, %312, !dbg !51
335
+ %314 = fadd float %288, %309, !dbg !52
336
+ %315 = fmul float %310, %310, !dbg !53
337
+ %316 = fmul float %315, 2.000000e+00, !dbg !56
338
+ %317 = fmul float %311, %316, !dbg !54
339
+ %318 = fadd float %314, %317, !dbg !55
340
+ %319 = fsub float %241, %313, !dbg !45
341
+ %320 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 4.000000e+00) #6, !dbg !49
342
+ %321 = fmul float %320, %319, !dbg !50
343
+ %322 = fadd float %313, %321, !dbg !51
344
+ %323 = fadd float %289, %318, !dbg !52
345
+ %324 = fmul float %319, %319, !dbg !53
346
+ %325 = fmul float %324, 3.000000e+00, !dbg !56
347
+ %326 = fmul float %320, %325, !dbg !54
348
+ %327 = fadd float %323, %326, !dbg !55
349
+ %328 = fsub float %242, %322, !dbg !45
350
+ %329 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 5.000000e+00) #6, !dbg !49
351
+ %330 = fmul float %329, %328, !dbg !50
352
+ %331 = fadd float %322, %330, !dbg !51
353
+ %332 = fadd float %290, %327, !dbg !52
354
+ %333 = fmul float %328, %328, !dbg !53
355
+ %334 = fmul float %333, 4.000000e+00, !dbg !56
356
+ %335 = fmul float %329, %334, !dbg !54
357
+ %336 = fadd float %332, %335, !dbg !55
358
+ %337 = fsub float %243, %331, !dbg !45
359
+ %338 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 6.000000e+00) #6, !dbg !49
360
+ %339 = fmul float %338, %337, !dbg !50
361
+ %340 = fadd float %331, %339, !dbg !51
362
+ %341 = fadd float %291, %336, !dbg !52
363
+ %342 = fmul float %337, %337, !dbg !53
364
+ %343 = fmul float %342, 5.000000e+00, !dbg !56
365
+ %344 = fmul float %338, %343, !dbg !54
366
+ %345 = fadd float %341, %344, !dbg !55
367
+ %346 = fsub float %244, %340, !dbg !45
368
+ %347 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 7.000000e+00) #6, !dbg !49
369
+ %348 = fmul float %347, %346, !dbg !50
370
+ %349 = fadd float %340, %348, !dbg !51
371
+ %350 = fadd float %292, %345, !dbg !52
372
+ %351 = fmul float %346, %346, !dbg !53
373
+ %352 = fmul float %351, 6.000000e+00, !dbg !56
374
+ %353 = fmul float %347, %352, !dbg !54
375
+ %354 = fadd float %350, %353, !dbg !55
376
+ %355 = fsub float %245, %349, !dbg !45
377
+ %356 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 8.000000e+00) #6, !dbg !49
378
+ %357 = fmul float %356, %355, !dbg !50
379
+ %358 = fadd float %349, %357, !dbg !51
380
+ %359 = fadd float %293, %354, !dbg !52
381
+ %360 = fmul float %355, %355, !dbg !53
382
+ %361 = fmul float %360, 7.000000e+00, !dbg !56
383
+ %362 = fmul float %356, %361, !dbg !54
384
+ %363 = fadd float %359, %362, !dbg !55
385
+ %364 = fsub float %247, %246, !dbg !45
386
+ %365 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 2.000000e+00) #6, !dbg !49
387
+ %366 = fmul float %364, %365, !dbg !50
388
+ %367 = fadd float %246, %366, !dbg !51
389
+ %368 = fadd float %294, %295, !dbg !52
390
+ %369 = fmul float %364, %364, !dbg !53
391
+ %370 = fmul float %369, %365, !dbg !54
392
+ %371 = fadd float %368, %370, !dbg !55
393
+ %372 = fsub float %248, %367, !dbg !45
394
+ %373 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 3.000000e+00) #6, !dbg !49
395
+ %374 = fmul float %373, %372, !dbg !50
396
+ %375 = fadd float %367, %374, !dbg !51
397
+ %376 = fadd float %296, %371, !dbg !52
398
+ %377 = fmul float %372, %372, !dbg !53
399
+ %378 = fmul float %377, 2.000000e+00, !dbg !56
400
+ %379 = fmul float %373, %378, !dbg !54
401
+ %380 = fadd float %376, %379, !dbg !55
402
+ %381 = fsub float %249, %375, !dbg !45
403
+ %382 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 4.000000e+00) #6, !dbg !49
404
+ %383 = fmul float %382, %381, !dbg !50
405
+ %384 = fadd float %375, %383, !dbg !51
406
+ %385 = fadd float %297, %380, !dbg !52
407
+ %386 = fmul float %381, %381, !dbg !53
408
+ %387 = fmul float %386, 3.000000e+00, !dbg !56
409
+ %388 = fmul float %382, %387, !dbg !54
410
+ %389 = fadd float %385, %388, !dbg !55
411
+ %390 = fsub float %250, %384, !dbg !45
412
+ %391 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 5.000000e+00) #6, !dbg !49
413
+ %392 = fmul float %391, %390, !dbg !50
414
+ %393 = fadd float %384, %392, !dbg !51
415
+ %394 = fadd float %298, %389, !dbg !52
416
+ %395 = fmul float %390, %390, !dbg !53
417
+ %396 = fmul float %395, 4.000000e+00, !dbg !56
418
+ %397 = fmul float %391, %396, !dbg !54
419
+ %398 = fadd float %394, %397, !dbg !55
420
+ %399 = fsub float %251, %393, !dbg !45
421
+ %400 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 6.000000e+00) #6, !dbg !49
422
+ %401 = fmul float %400, %399, !dbg !50
423
+ %402 = fadd float %393, %401, !dbg !51
424
+ %403 = fadd float %299, %398, !dbg !52
425
+ %404 = fmul float %399, %399, !dbg !53
426
+ %405 = fmul float %404, 5.000000e+00, !dbg !56
427
+ %406 = fmul float %400, %405, !dbg !54
428
+ %407 = fadd float %403, %406, !dbg !55
429
+ %408 = fsub float %252, %402, !dbg !45
430
+ %409 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 7.000000e+00) #6, !dbg !49
431
+ %410 = fmul float %409, %408, !dbg !50
432
+ %411 = fadd float %402, %410, !dbg !51
433
+ %412 = fadd float %300, %407, !dbg !52
434
+ %413 = fmul float %408, %408, !dbg !53
435
+ %414 = fmul float %413, 6.000000e+00, !dbg !56
436
+ %415 = fmul float %409, %414, !dbg !54
437
+ %416 = fadd float %412, %415, !dbg !55
438
+ %417 = fsub float %253, %411, !dbg !45
439
+ %418 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 8.000000e+00) #6, !dbg !49
440
+ %419 = fmul float %418, %417, !dbg !50
441
+ %420 = fadd float %411, %419, !dbg !51
442
+ %421 = fadd float %301, %416, !dbg !52
443
+ %422 = fmul float %417, %417, !dbg !53
444
+ %423 = fmul float %422, 7.000000e+00, !dbg !56
445
+ %424 = fmul float %418, %423, !dbg !54
446
+ %425 = fadd float %421, %424, !dbg !55
447
+ %426 = bitcast float %358 to i32, !dbg !57
448
+ %427 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %426, i32 16, i32 31), !dbg !57
449
+ %428 = bitcast i32 %427 to float, !dbg !57
450
+ %429 = bitcast float %363 to i32, !dbg !57
451
+ %430 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %429, i32 16, i32 31), !dbg !57
452
+ %431 = bitcast i32 %430 to float, !dbg !57
453
+ %432 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 1090519040, i32 16, i32 31), !dbg !57
454
+ %433 = bitcast i32 %432 to float, !dbg !57
455
+ %434 = fsub float %428, %358, !dbg !45
456
+ %435 = fadd float %433, 8.000000e+00, !dbg !59
457
+ %436 = fcmp oeq float %435, 0.000000e+00, !dbg !60
458
+ %437 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %433, float %435) #6, !dbg !49
459
+ %438 = select i1 %436, float 0.000000e+00, float %437, !dbg !61
460
+ %439 = fmul float %438, %434, !dbg !50
461
+ %440 = fadd float %358, %439, !dbg !51
462
+ %441 = fadd float %363, %431, !dbg !52
463
+ %442 = fmul float %434, %434, !dbg !53
464
+ %443 = fmul float %442, 8.000000e+00, !dbg !56
465
+ %444 = fmul float %438, %443, !dbg !54
466
+ %445 = fadd float %441, %444, !dbg !55
467
+ %446 = bitcast float %440 to i32, !dbg !57
468
+ %447 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %446, i32 8, i32 31), !dbg !57
469
+ %448 = bitcast i32 %447 to float, !dbg !57
470
+ %449 = bitcast float %445 to i32, !dbg !57
471
+ %450 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %449, i32 8, i32 31), !dbg !57
472
+ %451 = bitcast i32 %450 to float, !dbg !57
473
+ %452 = bitcast float %435 to i32, !dbg !57
474
+ %453 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %452, i32 8, i32 31), !dbg !57
475
+ %454 = bitcast i32 %453 to float, !dbg !57
476
+ %455 = fsub float %448, %440, !dbg !45
477
+ %456 = fadd float %435, %454, !dbg !59
478
+ %457 = fcmp oeq float %456, 0.000000e+00, !dbg !60
479
+ %458 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %454, float %456) #6, !dbg !49
480
+ %459 = select i1 %457, float 0.000000e+00, float %458, !dbg !61
481
+ %460 = fmul float %459, %455, !dbg !50
482
+ %461 = fadd float %440, %460, !dbg !51
483
+ %462 = fadd float %445, %451, !dbg !52
484
+ %463 = fmul float %455, %455, !dbg !53
485
+ %464 = fmul float %435, %463, !dbg !56
486
+ %465 = fmul float %459, %464, !dbg !54
487
+ %466 = fadd float %462, %465, !dbg !55
488
+ %467 = bitcast float %461 to i32, !dbg !57
489
+ %468 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %467, i32 4, i32 31), !dbg !57
490
+ %469 = bitcast i32 %468 to float, !dbg !57
491
+ %470 = bitcast float %466 to i32, !dbg !57
492
+ %471 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %470, i32 4, i32 31), !dbg !57
493
+ %472 = bitcast i32 %471 to float, !dbg !57
494
+ %473 = bitcast float %456 to i32, !dbg !57
495
+ %474 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %473, i32 4, i32 31), !dbg !57
496
+ %475 = bitcast i32 %474 to float, !dbg !57
497
+ %476 = fsub float %469, %461, !dbg !45
498
+ %477 = fadd float %456, %475, !dbg !59
499
+ %478 = fcmp oeq float %477, 0.000000e+00, !dbg !60
500
+ %479 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %475, float %477) #6, !dbg !49
501
+ %480 = select i1 %478, float 0.000000e+00, float %479, !dbg !61
502
+ %481 = fmul float %480, %476, !dbg !50
503
+ %482 = fadd float %461, %481, !dbg !51
504
+ %483 = fadd float %466, %472, !dbg !52
505
+ %484 = fmul float %476, %476, !dbg !53
506
+ %485 = fmul float %456, %484, !dbg !56
507
+ %486 = fmul float %480, %485, !dbg !54
508
+ %487 = fadd float %483, %486, !dbg !55
509
+ %488 = bitcast float %482 to i32, !dbg !57
510
+ %489 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %488, i32 2, i32 31), !dbg !57
511
+ %490 = bitcast i32 %489 to float, !dbg !57
512
+ %491 = bitcast float %487 to i32, !dbg !57
513
+ %492 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %491, i32 2, i32 31), !dbg !57
514
+ %493 = bitcast i32 %492 to float, !dbg !57
515
+ %494 = bitcast float %477 to i32, !dbg !57
516
+ %495 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %494, i32 2, i32 31), !dbg !57
517
+ %496 = bitcast i32 %495 to float, !dbg !57
518
+ %497 = fsub float %490, %482, !dbg !45
519
+ %498 = fadd float %477, %496, !dbg !59
520
+ %499 = fcmp oeq float %498, 0.000000e+00, !dbg !60
521
+ %500 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %496, float %498) #6, !dbg !49
522
+ %501 = select i1 %499, float 0.000000e+00, float %500, !dbg !61
523
+ %502 = fmul float %497, %501, !dbg !50
524
+ %503 = fadd float %482, %502, !dbg !51
525
+ %504 = fadd float %487, %493, !dbg !52
526
+ %505 = fmul float %497, %497, !dbg !53
527
+ %506 = fmul float %477, %505, !dbg !56
528
+ %507 = fmul float %501, %506, !dbg !54
529
+ %508 = fadd float %504, %507, !dbg !55
530
+ %509 = bitcast float %503 to i32, !dbg !57
531
+ %510 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %509, i32 1, i32 31), !dbg !57
532
+ %511 = bitcast float %508 to i32, !dbg !57
533
+ %512 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %511, i32 1, i32 31), !dbg !57
534
+ %513 = bitcast float %498 to i32, !dbg !57
535
+ %514 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %513, i32 1, i32 31), !dbg !57
536
+ %515 = bitcast i32 %514 to float, !dbg !57
537
+ %516 = fadd float %498, %515, !dbg !59
538
+ %517 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %515, float %516) #6, !dbg !49
539
+ %518 = bitcast float %420 to i32, !dbg !57
540
+ %519 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %518, i32 16, i32 31), !dbg !57
541
+ %520 = bitcast i32 %519 to float, !dbg !57
542
+ %521 = bitcast float %425 to i32, !dbg !57
543
+ %522 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %521, i32 16, i32 31), !dbg !57
544
+ %523 = bitcast i32 %522 to float, !dbg !57
545
+ %524 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 1090519040, i32 16, i32 31), !dbg !57
546
+ %525 = bitcast i32 %524 to float, !dbg !57
547
+ %526 = fsub float %520, %420, !dbg !45
548
+ %527 = fadd float %525, 8.000000e+00, !dbg !59
549
+ %528 = fcmp oeq float %527, 0.000000e+00, !dbg !60
550
+ %529 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %525, float %527) #6, !dbg !49
551
+ %530 = select i1 %528, float 0.000000e+00, float %529, !dbg !61
552
+ %531 = fmul float %526, %530, !dbg !50
553
+ %532 = fadd float %420, %531, !dbg !51
554
+ %533 = fadd float %425, %523, !dbg !52
555
+ %534 = fmul float %526, %526, !dbg !53
556
+ %535 = fmul float %534, 8.000000e+00, !dbg !56
557
+ %536 = fmul float %535, %530, !dbg !54
558
+ %537 = fadd float %533, %536, !dbg !55
559
+ %538 = bitcast float %532 to i32, !dbg !57
560
+ %539 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %538, i32 8, i32 31), !dbg !57
561
+ %540 = bitcast i32 %539 to float, !dbg !57
562
+ %541 = bitcast float %537 to i32, !dbg !57
563
+ %542 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %541, i32 8, i32 31), !dbg !57
564
+ %543 = bitcast i32 %542 to float, !dbg !57
565
+ %544 = bitcast float %527 to i32, !dbg !57
566
+ %545 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %544, i32 8, i32 31), !dbg !57
567
+ %546 = bitcast i32 %545 to float, !dbg !57
568
+ %547 = fsub float %540, %532, !dbg !45
569
+ %548 = fadd float %527, %546, !dbg !59
570
+ %549 = fcmp oeq float %548, 0.000000e+00, !dbg !60
571
+ %550 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %546, float %548) #6, !dbg !49
572
+ %551 = select i1 %549, float 0.000000e+00, float %550, !dbg !61
573
+ %552 = fmul float %547, %551, !dbg !50
574
+ %553 = fadd float %532, %552, !dbg !51
575
+ %554 = fadd float %537, %543, !dbg !52
576
+ %555 = fmul float %547, %547, !dbg !53
577
+ %556 = fmul float %527, %555, !dbg !56
578
+ %557 = fmul float %551, %556, !dbg !54
579
+ %558 = fadd float %554, %557, !dbg !55
580
+ %559 = bitcast float %553 to i32, !dbg !57
581
+ %560 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %559, i32 4, i32 31), !dbg !57
582
+ %561 = bitcast i32 %560 to float, !dbg !57
583
+ %562 = bitcast float %558 to i32, !dbg !57
584
+ %563 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %562, i32 4, i32 31), !dbg !57
585
+ %564 = bitcast i32 %563 to float, !dbg !57
586
+ %565 = bitcast float %548 to i32, !dbg !57
587
+ %566 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %565, i32 4, i32 31), !dbg !57
588
+ %567 = bitcast i32 %566 to float, !dbg !57
589
+ %568 = fsub float %561, %553, !dbg !45
590
+ %569 = fadd float %548, %567, !dbg !59
591
+ %570 = fcmp oeq float %569, 0.000000e+00, !dbg !60
592
+ %571 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %567, float %569) #6, !dbg !49
593
+ %572 = select i1 %570, float 0.000000e+00, float %571, !dbg !61
594
+ %573 = fmul float %568, %572, !dbg !50
595
+ %574 = fadd float %553, %573, !dbg !51
596
+ %575 = fadd float %558, %564, !dbg !52
597
+ %576 = fmul float %568, %568, !dbg !53
598
+ %577 = fmul float %548, %576, !dbg !56
599
+ %578 = fmul float %572, %577, !dbg !54
600
+ %579 = fadd float %575, %578, !dbg !55
601
+ %580 = bitcast float %574 to i32, !dbg !57
602
+ %581 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %580, i32 2, i32 31), !dbg !57
603
+ %582 = bitcast i32 %581 to float, !dbg !57
604
+ %583 = bitcast float %579 to i32, !dbg !57
605
+ %584 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %583, i32 2, i32 31), !dbg !57
606
+ %585 = bitcast i32 %584 to float, !dbg !57
607
+ %586 = bitcast float %569 to i32, !dbg !57
608
+ %587 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %586, i32 2, i32 31), !dbg !57
609
+ %588 = bitcast i32 %587 to float, !dbg !57
610
+ %589 = fsub float %582, %574, !dbg !45
611
+ %590 = fadd float %569, %588, !dbg !59
612
+ %591 = fcmp oeq float %590, 0.000000e+00, !dbg !60
613
+ %592 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %588, float %590) #6, !dbg !49
614
+ %593 = select i1 %591, float 0.000000e+00, float %592, !dbg !61
615
+ %594 = fmul float %589, %593, !dbg !50
616
+ %595 = fadd float %574, %594, !dbg !51
617
+ %596 = fadd float %579, %585, !dbg !52
618
+ %597 = fmul float %589, %589, !dbg !53
619
+ %598 = fmul float %569, %597, !dbg !56
620
+ %599 = fmul float %593, %598, !dbg !54
621
+ %600 = fadd float %596, %599, !dbg !55
622
+ %601 = bitcast float %595 to i32, !dbg !57
623
+ %602 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %601, i32 1, i32 31), !dbg !57
624
+ %603 = bitcast float %600 to i32, !dbg !57
625
+ %604 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %603, i32 1, i32 31), !dbg !57
626
+ %605 = bitcast float %590 to i32, !dbg !57
627
+ %606 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %605, i32 1, i32 31), !dbg !57
628
+ %607 = bitcast i32 %606 to float, !dbg !57
629
+ %608 = fadd float %590, %607, !dbg !59
630
+ %609 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %607, float %608) #6, !dbg !49
631
+ %610 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %53, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !62
632
+ %611 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %55, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !62
633
+ %612 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %57, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !62
634
+ %613 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %59, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !62
635
+ %614 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %101, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !63
636
+ %615 = extractvalue { i32, i32, i32, i32 } %614, 0, !dbg !63
637
+ %616 = extractvalue { i32, i32, i32, i32 } %614, 1, !dbg !63
638
+ %617 = extractvalue { i32, i32, i32, i32 } %614, 2, !dbg !63
639
+ %618 = extractvalue { i32, i32, i32, i32 } %614, 3, !dbg !63
640
+ %619 = trunc i32 %615 to i16, !dbg !63
641
+ %extelt.offset8 = lshr i32 %615, 16, !dbg !63
642
+ %620 = trunc i32 %extelt.offset8 to i16, !dbg !63
643
+ %621 = trunc i32 %616 to i16, !dbg !63
644
+ %extelt.offset9 = lshr i32 %616, 16, !dbg !63
645
+ %622 = trunc i32 %extelt.offset9 to i16, !dbg !63
646
+ %623 = trunc i32 %617 to i16, !dbg !63
647
+ %extelt.offset10 = lshr i32 %617, 16, !dbg !63
648
+ %624 = trunc i32 %extelt.offset10 to i16, !dbg !63
649
+ %625 = trunc i32 %618 to i16, !dbg !63
650
+ %extelt.offset11 = lshr i32 %618, 16, !dbg !63
651
+ %626 = trunc i32 %extelt.offset11 to i16, !dbg !63
652
+ %627 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %103, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !63
653
+ %628 = extractvalue { i32, i32, i32, i32 } %627, 0, !dbg !63
654
+ %629 = extractvalue { i32, i32, i32, i32 } %627, 1, !dbg !63
655
+ %630 = extractvalue { i32, i32, i32, i32 } %627, 2, !dbg !63
656
+ %631 = extractvalue { i32, i32, i32, i32 } %627, 3, !dbg !63
657
+ %632 = trunc i32 %628 to i16, !dbg !63
658
+ %extelt.offset12 = lshr i32 %628, 16, !dbg !63
659
+ %633 = trunc i32 %extelt.offset12 to i16, !dbg !63
660
+ %634 = trunc i32 %629 to i16, !dbg !63
661
+ %extelt.offset13 = lshr i32 %629, 16, !dbg !63
662
+ %635 = trunc i32 %extelt.offset13 to i16, !dbg !63
663
+ %636 = trunc i32 %630 to i16, !dbg !63
664
+ %extelt.offset14 = lshr i32 %630, 16, !dbg !63
665
+ %637 = trunc i32 %extelt.offset14 to i16, !dbg !63
666
+ %638 = trunc i32 %631 to i16, !dbg !63
667
+ %extelt.offset15 = lshr i32 %631, 16, !dbg !63
668
+ %639 = trunc i32 %extelt.offset15 to i16, !dbg !63
669
+ %640 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %619) #6, !dbg !64
670
+ %641 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %620) #6, !dbg !64
671
+ %642 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %621) #6, !dbg !64
672
+ %643 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %622) #6, !dbg !64
673
+ %644 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %623) #6, !dbg !64
674
+ %645 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %624) #6, !dbg !64
675
+ %646 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %625) #6, !dbg !64
676
+ %647 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %626) #6, !dbg !64
677
+ %648 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %632) #6, !dbg !64
678
+ %649 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %633) #6, !dbg !64
679
+ %650 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %634) #6, !dbg !64
680
+ %651 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %635) #6, !dbg !64
681
+ %652 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %636) #6, !dbg !64
682
+ %653 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %637) #6, !dbg !64
683
+ %654 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %638) #6, !dbg !64
684
+ %655 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %639) #6, !dbg !64
685
+ %656 = zext nneg i32 %urem to i64, !dbg !65
686
+ %657 = getelementptr float, ptr addrspace(1) %4, i64 %656, !dbg !65
687
+ %658 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %657, i1 true, i32 0, i1 true) #6, !dbg !66
688
+ br i1 %151, label %659, label %660, !dbg !67
689
+
690
+ 659: ; preds = %153
691
+ tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 883, ptr nonnull @assertFunc_1, i64 1), !dbg !67
692
+ br label %660, !dbg !67
693
+
694
+ 660: ; preds = %659, %153
695
+ %661 = bitcast i32 %604 to float, !dbg !57
696
+ %662 = fadd float %600, %661, !dbg !52
697
+ %663 = bitcast i32 %602 to float, !dbg !57
698
+ %664 = fsub float %663, %595, !dbg !45
699
+ %665 = fmul float %664, %664, !dbg !53
700
+ %666 = fmul float %590, %665, !dbg !56
701
+ %667 = fcmp oeq float %608, 0.000000e+00, !dbg !60
702
+ %668 = select i1 %667, float 0.000000e+00, float %609, !dbg !61
703
+ %669 = fmul float %668, %666, !dbg !54
704
+ %670 = fadd float %662, %669, !dbg !55
705
+ %671 = bitcast i32 %512 to float, !dbg !57
706
+ %672 = fadd float %508, %671, !dbg !52
707
+ %673 = bitcast i32 %510 to float, !dbg !57
708
+ %674 = fsub float %673, %503, !dbg !45
709
+ %675 = fmul float %674, %674, !dbg !53
710
+ %676 = fmul float %498, %675, !dbg !56
711
+ %677 = fcmp oeq float %516, 0.000000e+00, !dbg !60
712
+ %678 = select i1 %677, float 0.000000e+00, float %517, !dbg !61
713
+ %679 = fmul float %678, %676, !dbg !54
714
+ %680 = fadd float %672, %679, !dbg !55
715
+ %681 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %166, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68
716
+ %682 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %167, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68
717
+ %683 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %168, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68
718
+ %684 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %169, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68
719
+ %685 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %680, float 2.560000e+02) #6, !dbg !69
720
+ %686 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %680, float 2.560000e+02) #6, !dbg !69
721
+ %687 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %680, float 2.560000e+02) #6, !dbg !69
722
+ %688 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %680, float 2.560000e+02) #6, !dbg !69
723
+ %689 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %680, float 2.560000e+02) #6, !dbg !69
724
+ %690 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %680, float 2.560000e+02) #6, !dbg !69
725
+ %691 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %680, float 2.560000e+02) #6, !dbg !69
726
+ %692 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %680, float 2.560000e+02) #6, !dbg !69
727
+ %693 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %670, float 2.560000e+02) #6, !dbg !69
728
+ %694 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %670, float 2.560000e+02) #6, !dbg !69
729
+ %695 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %670, float 2.560000e+02) #6, !dbg !69
730
+ %696 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %670, float 2.560000e+02) #6, !dbg !69
731
+ %697 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %670, float 2.560000e+02) #6, !dbg !69
732
+ %698 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %670, float 2.560000e+02) #6, !dbg !69
733
+ %699 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %670, float 2.560000e+02) #6, !dbg !69
734
+ %700 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %670, float 2.560000e+02) #6, !dbg !69
735
+ %701 = fadd float %685, 0x3EE4F8B580000000, !dbg !70
736
+ %702 = fadd float %693, 0x3EE4F8B580000000, !dbg !70
737
+ %703 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
738
+ %.not.i = icmp eq i32 %703, 0, !dbg !71
739
+ br i1 %.not.i, label %706, label %704, !dbg !71
740
+
741
+ 704: ; preds = %660
742
+ %705 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %701), !dbg !71
743
+ br label %__nv_rsqrtf.exit, !dbg !71
744
+
745
+ 706: ; preds = %660
746
+ %707 = tail call float @llvm.nvvm.rsqrt.approx.f(float %701), !dbg !71
747
+ br label %__nv_rsqrtf.exit, !dbg !71
748
+
749
+ __nv_rsqrtf.exit: ; preds = %704, %706
750
+ %.0.i = phi float [ %705, %704 ], [ %707, %706 ], !dbg !71
751
+ %708 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
752
+ %709 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
753
+ %710 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
754
+ %711 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
755
+ %712 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
756
+ %713 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
757
+ %714 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
758
+ %715 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
759
+ %.not.i37 = icmp eq i32 %715, 0, !dbg !71
760
+ br i1 %.not.i37, label %718, label %716, !dbg !71
761
+
762
+ 716: ; preds = %__nv_rsqrtf.exit
763
+ %717 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %702), !dbg !71
764
+ br label %__nv_rsqrtf.exit39, !dbg !71
765
+
766
+ 718: ; preds = %__nv_rsqrtf.exit
767
+ %719 = tail call float @llvm.nvvm.rsqrt.approx.f(float %702), !dbg !71
768
+ br label %__nv_rsqrtf.exit39, !dbg !71
769
+
770
+ __nv_rsqrtf.exit39: ; preds = %716, %718
771
+ %.0.i38 = phi float [ %717, %716 ], [ %719, %718 ], !dbg !71
772
+ %720 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
773
+ %721 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
774
+ %722 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
775
+ %723 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
776
+ %724 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
777
+ %725 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
778
+ %726 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
779
+ %727 = extractvalue { i32, i32, i32, i32 } %684, 3, !dbg !68
780
+ %728 = bitcast i32 %727 to float, !dbg !68
781
+ %729 = extractvalue { i32, i32, i32, i32 } %613, 3, !dbg !62
782
+ %730 = bitcast i32 %729 to float, !dbg !62
783
+ %731 = fadd float %730, %728, !dbg !72
784
+ %732 = fadd float %655, %731, !dbg !73
785
+ %733 = fmul float %664, %668, !dbg !50
786
+ %734 = fadd float %595, %733, !dbg !51
787
+ %735 = fsub float %732, %734, !dbg !74
788
+ %736 = extractvalue { i32, i32, i32, i32 } %684, 2, !dbg !68
789
+ %737 = bitcast i32 %736 to float, !dbg !68
790
+ %738 = extractvalue { i32, i32, i32, i32 } %613, 2, !dbg !62
791
+ %739 = bitcast i32 %738 to float, !dbg !62
792
+ %740 = fadd float %739, %737, !dbg !72
793
+ %741 = fadd float %654, %740, !dbg !73
794
+ %742 = fsub float %741, %734, !dbg !74
795
+ %743 = extractvalue { i32, i32, i32, i32 } %684, 1, !dbg !68
796
+ %744 = bitcast i32 %743 to float, !dbg !68
797
+ %745 = extractvalue { i32, i32, i32, i32 } %613, 1, !dbg !62
798
+ %746 = bitcast i32 %745 to float, !dbg !62
799
+ %747 = fadd float %746, %744, !dbg !72
800
+ %748 = fadd float %653, %747, !dbg !73
801
+ %749 = fsub float %748, %734, !dbg !74
802
+ %750 = extractvalue { i32, i32, i32, i32 } %684, 0, !dbg !68
803
+ %751 = bitcast i32 %750 to float, !dbg !68
804
+ %752 = extractvalue { i32, i32, i32, i32 } %613, 0, !dbg !62
805
+ %753 = bitcast i32 %752 to float, !dbg !62
806
+ %754 = fadd float %753, %751, !dbg !72
807
+ %755 = fadd float %652, %754, !dbg !73
808
+ %756 = fsub float %755, %734, !dbg !74
809
+ %757 = extractvalue { i32, i32, i32, i32 } %683, 3, !dbg !68
810
+ %758 = bitcast i32 %757 to float, !dbg !68
811
+ %759 = extractvalue { i32, i32, i32, i32 } %612, 3, !dbg !62
812
+ %760 = bitcast i32 %759 to float, !dbg !62
813
+ %761 = fadd float %760, %758, !dbg !72
814
+ %762 = fadd float %651, %761, !dbg !73
815
+ %763 = fsub float %762, %734, !dbg !74
816
+ %764 = extractvalue { i32, i32, i32, i32 } %683, 2, !dbg !68
817
+ %765 = bitcast i32 %764 to float, !dbg !68
818
+ %766 = extractvalue { i32, i32, i32, i32 } %612, 2, !dbg !62
819
+ %767 = bitcast i32 %766 to float, !dbg !62
820
+ %768 = fadd float %767, %765, !dbg !72
821
+ %769 = fadd float %650, %768, !dbg !73
822
+ %770 = fsub float %769, %734, !dbg !74
823
+ %771 = extractvalue { i32, i32, i32, i32 } %683, 1, !dbg !68
824
+ %772 = bitcast i32 %771 to float, !dbg !68
825
+ %773 = extractvalue { i32, i32, i32, i32 } %612, 1, !dbg !62
826
+ %774 = bitcast i32 %773 to float, !dbg !62
827
+ %775 = fadd float %774, %772, !dbg !72
828
+ %776 = fadd float %649, %775, !dbg !73
829
+ %777 = fsub float %776, %734, !dbg !74
830
+ %778 = extractvalue { i32, i32, i32, i32 } %683, 0, !dbg !68
831
+ %779 = bitcast i32 %778 to float, !dbg !68
832
+ %780 = extractvalue { i32, i32, i32, i32 } %612, 0, !dbg !62
833
+ %781 = bitcast i32 %780 to float, !dbg !62
834
+ %782 = fadd float %781, %779, !dbg !72
835
+ %783 = fadd float %648, %782, !dbg !73
836
+ %784 = fsub float %783, %734, !dbg !74
837
+ %785 = extractvalue { i32, i32, i32, i32 } %682, 3, !dbg !68
838
+ %786 = bitcast i32 %785 to float, !dbg !68
839
+ %787 = extractvalue { i32, i32, i32, i32 } %611, 3, !dbg !62
840
+ %788 = bitcast i32 %787 to float, !dbg !62
841
+ %789 = fadd float %788, %786, !dbg !72
842
+ %790 = fadd float %647, %789, !dbg !73
843
+ %791 = fmul float %674, %678, !dbg !50
844
+ %792 = fadd float %503, %791, !dbg !51
845
+ %793 = fsub float %790, %792, !dbg !74
846
+ %794 = extractvalue { i32, i32, i32, i32 } %682, 2, !dbg !68
847
+ %795 = bitcast i32 %794 to float, !dbg !68
848
+ %796 = extractvalue { i32, i32, i32, i32 } %611, 2, !dbg !62
849
+ %797 = bitcast i32 %796 to float, !dbg !62
850
+ %798 = fadd float %797, %795, !dbg !72
851
+ %799 = fadd float %646, %798, !dbg !73
852
+ %800 = fsub float %799, %792, !dbg !74
853
+ %801 = extractvalue { i32, i32, i32, i32 } %682, 1, !dbg !68
854
+ %802 = bitcast i32 %801 to float, !dbg !68
855
+ %803 = extractvalue { i32, i32, i32, i32 } %611, 1, !dbg !62
856
+ %804 = bitcast i32 %803 to float, !dbg !62
857
+ %805 = fadd float %804, %802, !dbg !72
858
+ %806 = fadd float %645, %805, !dbg !73
859
+ %807 = fsub float %806, %792, !dbg !74
860
+ %808 = extractvalue { i32, i32, i32, i32 } %682, 0, !dbg !68
861
+ %809 = bitcast i32 %808 to float, !dbg !68
862
+ %810 = extractvalue { i32, i32, i32, i32 } %611, 0, !dbg !62
863
+ %811 = bitcast i32 %810 to float, !dbg !62
864
+ %812 = fadd float %811, %809, !dbg !72
865
+ %813 = fadd float %644, %812, !dbg !73
866
+ %814 = fsub float %813, %792, !dbg !74
867
+ %815 = extractvalue { i32, i32, i32, i32 } %681, 3, !dbg !68
868
+ %816 = bitcast i32 %815 to float, !dbg !68
869
+ %817 = extractvalue { i32, i32, i32, i32 } %610, 3, !dbg !62
870
+ %818 = bitcast i32 %817 to float, !dbg !62
871
+ %819 = fadd float %818, %816, !dbg !72
872
+ %820 = fadd float %643, %819, !dbg !73
873
+ %821 = fsub float %820, %792, !dbg !74
874
+ %822 = extractvalue { i32, i32, i32, i32 } %681, 2, !dbg !68
875
+ %823 = bitcast i32 %822 to float, !dbg !68
876
+ %824 = extractvalue { i32, i32, i32, i32 } %610, 2, !dbg !62
877
+ %825 = bitcast i32 %824 to float, !dbg !62
878
+ %826 = fadd float %825, %823, !dbg !72
879
+ %827 = fadd float %642, %826, !dbg !73
880
+ %828 = fsub float %827, %792, !dbg !74
881
+ %829 = extractvalue { i32, i32, i32, i32 } %681, 1, !dbg !68
882
+ %830 = bitcast i32 %829 to float, !dbg !68
883
+ %831 = extractvalue { i32, i32, i32, i32 } %610, 1, !dbg !62
884
+ %832 = bitcast i32 %831 to float, !dbg !62
885
+ %833 = fadd float %832, %830, !dbg !72
886
+ %834 = fadd float %641, %833, !dbg !73
887
+ %835 = fsub float %834, %792, !dbg !74
888
+ %836 = extractvalue { i32, i32, i32, i32 } %681, 0, !dbg !68
889
+ %837 = bitcast i32 %836 to float, !dbg !68
890
+ %838 = extractvalue { i32, i32, i32, i32 } %610, 0, !dbg !62
891
+ %839 = bitcast i32 %838 to float, !dbg !62
892
+ %840 = fadd float %839, %837, !dbg !72
893
+ %841 = fadd float %640, %840, !dbg !73
894
+ %842 = fsub float %841, %792, !dbg !74
895
+ %843 = fmul float %842, %.0.i, !dbg !75
896
+ %844 = fmul float %835, %.0.i, !dbg !75
897
+ %845 = fmul float %828, %.0.i, !dbg !75
898
+ %846 = fmul float %821, %.0.i, !dbg !75
899
+ %847 = fmul float %814, %.0.i, !dbg !75
900
+ %848 = fmul float %807, %.0.i, !dbg !75
901
+ %849 = fmul float %800, %.0.i, !dbg !75
902
+ %850 = fmul float %793, %.0.i, !dbg !75
903
+ %851 = fmul float %784, %.0.i38, !dbg !75
904
+ %852 = fmul float %777, %.0.i38, !dbg !75
905
+ %853 = fmul float %770, %.0.i38, !dbg !75
906
+ %854 = fmul float %763, %.0.i38, !dbg !75
907
+ %855 = fmul float %756, %.0.i38, !dbg !75
908
+ %856 = fmul float %749, %.0.i38, !dbg !75
909
+ %857 = fmul float %742, %.0.i38, !dbg !75
910
+ %858 = fmul float %735, %.0.i38, !dbg !75
911
+ %859 = getelementptr float, ptr addrspace(3) @global_smem, i64 %656, !dbg !76
912
+ store i32 %658, ptr addrspace(3) %859, align 4, !dbg !76
913
+ tail call void @llvm.nvvm.barrier0(), !dbg !76
914
+ %860 = getelementptr float, ptr addrspace(3) @global_smem, i64 %160, !dbg !76
915
+ %861 = load float, ptr addrspace(3) %860, align 32, !dbg !76
916
+ %862 = getelementptr inbounds <8 x float>, ptr addrspace(3) %860, i64 0, i64 1, !dbg !76
917
+ %863 = load float, ptr addrspace(3) %862, align 4, !dbg !76
918
+ %864 = getelementptr inbounds <8 x float>, ptr addrspace(3) %860, i64 0, i64 2, !dbg !76
919
+ %865 = load float, ptr addrspace(3) %864, align 8, !dbg !76
920
+ %866 = getelementptr inbounds <8 x float>, ptr addrspace(3) %860, i64 0, i64 3, !dbg !76
921
+ %867 = load float, ptr addrspace(3) %866, align 4, !dbg !76
922
+ %868 = getelementptr inbounds <8 x float>, ptr addrspace(3) %860, i64 0, i64 4, !dbg !76
923
+ %869 = load float, ptr addrspace(3) %868, align 16, !dbg !76
924
+ %870 = getelementptr inbounds <8 x float>, ptr addrspace(3) %860, i64 0, i64 5, !dbg !76
925
+ %871 = load float, ptr addrspace(3) %870, align 4, !dbg !76
926
+ %872 = getelementptr inbounds <8 x float>, ptr addrspace(3) %860, i64 0, i64 6, !dbg !76
927
+ %873 = load float, ptr addrspace(3) %872, align 8, !dbg !76
928
+ %874 = getelementptr inbounds <8 x float>, ptr addrspace(3) %860, i64 0, i64 7, !dbg !76
929
+ %875 = load float, ptr addrspace(3) %874, align 4, !dbg !76
930
+ %876 = fmul float %843, %861, !dbg !76
931
+ %877 = fmul float %844, %863, !dbg !76
932
+ %878 = fmul float %845, %865, !dbg !76
933
+ %879 = fmul float %846, %867, !dbg !76
934
+ %880 = fmul float %847, %869, !dbg !76
935
+ %881 = fmul float %848, %871, !dbg !76
936
+ %882 = fmul float %849, %873, !dbg !76
937
+ %883 = fmul float %850, %875, !dbg !76
938
+ %884 = fmul float %851, %861, !dbg !76
939
+ %885 = fmul float %852, %863, !dbg !76
940
+ %886 = fmul float %853, %865, !dbg !76
941
+ %887 = fmul float %854, %867, !dbg !76
942
+ %888 = fmul float %855, %869, !dbg !76
943
+ %889 = fmul float %856, %871, !dbg !76
944
+ %890 = fmul float %857, %873, !dbg !76
945
+ %891 = fmul float %858, %875, !dbg !76
946
+ %892 = getelementptr i16, ptr addrspace(1) %5, i64 %100, !dbg !77
947
+ %893 = getelementptr i16, ptr addrspace(1) %5, i64 %102, !dbg !77
948
+ %894 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %876) #6, !dbg !78
949
+ %895 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %877) #6, !dbg !78
950
+ %896 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %878) #6, !dbg !78
951
+ %897 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %879) #6, !dbg !78
952
+ %898 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %880) #6, !dbg !78
953
+ %899 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %881) #6, !dbg !78
954
+ %900 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %882) #6, !dbg !78
955
+ %901 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %883) #6, !dbg !78
956
+ %902 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %884) #6, !dbg !78
957
+ %903 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %885) #6, !dbg !78
958
+ %904 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %886) #6, !dbg !78
959
+ %905 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %887) #6, !dbg !78
960
+ %906 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %888) #6, !dbg !78
961
+ %907 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %889) #6, !dbg !78
962
+ %908 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %890) #6, !dbg !78
963
+ %909 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %891) #6, !dbg !78
964
+ %910 = insertelement <2 x i16> undef, i16 %894, i64 0, !dbg !78
965
+ %911 = insertelement <2 x i16> %910, i16 %895, i64 1, !dbg !78
966
+ %912 = bitcast <2 x i16> %911 to i32, !dbg !78
967
+ %913 = insertelement <2 x i16> undef, i16 %896, i64 0, !dbg !78
968
+ %914 = insertelement <2 x i16> %913, i16 %897, i64 1, !dbg !78
969
+ %915 = bitcast <2 x i16> %914 to i32, !dbg !78
970
+ %916 = insertelement <2 x i16> undef, i16 %898, i64 0, !dbg !78
971
+ %917 = insertelement <2 x i16> %916, i16 %899, i64 1, !dbg !78
972
+ %918 = bitcast <2 x i16> %917 to i32, !dbg !78
973
+ %919 = insertelement <2 x i16> undef, i16 %900, i64 0, !dbg !78
974
+ %920 = insertelement <2 x i16> %919, i16 %901, i64 1, !dbg !78
975
+ %921 = bitcast <2 x i16> %920 to i32, !dbg !78
976
+ tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %912, i32 %915, i32 %918, i32 %921, ptr addrspace(1) %892, i1 true) #6, !dbg !78
977
+ %922 = insertelement <2 x i16> undef, i16 %902, i64 0, !dbg !78
978
+ %923 = insertelement <2 x i16> %922, i16 %903, i64 1, !dbg !78
979
+ %924 = bitcast <2 x i16> %923 to i32, !dbg !78
980
+ %925 = insertelement <2 x i16> undef, i16 %904, i64 0, !dbg !78
981
+ %926 = insertelement <2 x i16> %925, i16 %905, i64 1, !dbg !78
982
+ %927 = bitcast <2 x i16> %926 to i32, !dbg !78
983
+ %928 = insertelement <2 x i16> undef, i16 %906, i64 0, !dbg !78
984
+ %929 = insertelement <2 x i16> %928, i16 %907, i64 1, !dbg !78
985
+ %930 = bitcast <2 x i16> %929 to i32, !dbg !78
986
+ %931 = insertelement <2 x i16> undef, i16 %908, i64 0, !dbg !78
987
+ %932 = insertelement <2 x i16> %931, i16 %909, i64 1, !dbg !78
988
+ %933 = bitcast <2 x i16> %932 to i32, !dbg !78
989
+ tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %924, i32 %927, i32 %930, i32 %933, ptr addrspace(1) %893, i1 true) #6, !dbg !78
990
+ ret void, !dbg !79
991
+ }
992
+
993
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
994
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
995
+
996
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
997
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
998
+
999
+ ; Function Attrs: convergent nocallback nounwind
1000
+ declare void @llvm.nvvm.barrier0() #2
1001
+
1002
+ ; Function Attrs: alwaysinline nounwind
1003
+ define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
1004
+ %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
1005
+ %.not = icmp eq i32 %1, 0
1006
+ br i1 %.not, label %4, label %2
1007
+
1008
+ 2: ; preds = %0
1009
+ %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
1010
+ br label %6
1011
+
1012
+ 4: ; preds = %0
1013
+ %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
1014
+ br label %6
1015
+
1016
+ 6: ; preds = %4, %2
1017
+ %.0 = phi float [ %3, %2 ], [ %5, %4 ]
1018
+ ret float %.0
1019
+ }
1020
+
1021
+ declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
1022
+
1023
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
1024
+ declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
1025
+
1026
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
1027
+ declare float @llvm.nvvm.rsqrt.approx.f(float) #5
1028
+
1029
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
1030
+ attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
1031
+ attributes #2 = { convergent nocallback nounwind }
1032
+ attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
1033
+ attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
1034
+ attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
1035
+ attributes #6 = { nounwind }
1036
+
1037
+ !llvm.module.flags = !{!0, !1}
1038
+ !llvm.dbg.cu = !{!2}
1039
+ !nvvm.annotations = !{!4, !5, !5, !4}
1040
+ !llvm.ident = !{!6}
1041
+
1042
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
1043
+ !1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
1044
+ !2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
1045
+ !3 = !DIFile(filename: "cpn3lawg65lpi63gv6c6pn4oikhg6qva2h2qjdpxe6qj4lvttwez.py", directory: "/tmp/torchinductor_root/pn")
1046
+ !4 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"kernel", i32 1}
1047
+ !5 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"maxntidx", i32 256}
1048
+ !6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
1049
+ !7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6de7de", linkageName: "triton__0d1d2d3d4d5d6de7de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
1050
+ !8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
1051
+ !9 = !{}
1052
+ !10 = !DILocation(line: 22, column: 44, scope: !7)
1053
+ !11 = !DILocation(line: 24, column: 33, scope: !7)
1054
+ !12 = !DILocation(line: 21, column: 28, scope: !7)
1055
+ !13 = !DILocation(line: 21, column: 33, scope: !7)
1056
+ !14 = !DILocation(line: 22, column: 23, scope: !7)
1057
+ !15 = !DILocation(line: 26, column: 30, scope: !7)
1058
+ !16 = !DILocation(line: 26, column: 35, scope: !7)
1059
+ !17 = !DILocation(line: 27, column: 18, scope: !7)
1060
+ !18 = !DILocation(line: 35, column: 44, scope: !7)
1061
+ !19 = !DILocation(line: 35, column: 40, scope: !7)
1062
+ !20 = !DILocation(line: 35, column: 34, scope: !7)
1063
+ !21 = !DILocation(line: 35, column: 50, scope: !7)
1064
+ !22 = !DILocation(line: 36, column: 44, scope: !7)
1065
+ !23 = !DILocation(line: 36, column: 40, scope: !7)
1066
+ !24 = !DILocation(line: 36, column: 34, scope: !7)
1067
+ !25 = !DILocation(line: 36, column: 50, scope: !7)
1068
+ !26 = !DILocation(line: 36, column: 101, scope: !7)
1069
+ !27 = !DILocation(line: 37, column: 22, scope: !7)
1070
+ !28 = !DILocation(line: 38, column: 22, scope: !7)
1071
+ !29 = !DILocation(line: 39, column: 36, scope: !7)
1072
+ !30 = !DILocation(line: 40, column: 40, scope: !7)
1073
+ !31 = !DILocation(line: 40, column: 55, scope: !7)
1074
+ !32 = !DILocation(line: 41, column: 44, scope: !7)
1075
+ !33 = !DILocation(line: 41, column: 40, scope: !7)
1076
+ !34 = !DILocation(line: 41, column: 34, scope: !7)
1077
+ !35 = !DILocation(line: 41, column: 52, scope: !7)
1078
+ !36 = !DILocation(line: 42, column: 22, scope: !7)
1079
+ !37 = !DILocation(line: 44, column: 22, scope: !7)
1080
+ !38 = !DILocation(line: 98, column: 22, scope: !39, inlinedAt: !41)
1081
+ !39 = distinct !DILexicalBlockFile(scope: !7, file: !40, discriminator: 0)
1082
+ !40 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
1083
+ !41 = !DILocation(line: 47, column: 41, scope: !39)
1084
+ !42 = !DILocation(line: 101, column: 30, scope: !39, inlinedAt: !41)
1085
+ !43 = !DILocation(line: 101, column: 22, scope: !39, inlinedAt: !41)
1086
+ !44 = !DILocation(line: 101, column: 13, scope: !39, inlinedAt: !41)
1087
+ !45 = !DILocation(line: 108, column: 21, scope: !46, inlinedAt: !47)
1088
+ !46 = distinct !DILexicalBlockFile(scope: !39, file: !40, discriminator: 0)
1089
+ !47 = !DILocation(line: 120, column: 46, scope: !46, inlinedAt: !48)
1090
+ !48 = !DILocation(line: 53, column: 44, scope: !46)
1091
+ !49 = !DILocation(line: 110, column: 60, scope: !46, inlinedAt: !47)
1092
+ !50 = !DILocation(line: 112, column: 25, scope: !46, inlinedAt: !47)
1093
+ !51 = !DILocation(line: 112, column: 17, scope: !46, inlinedAt: !47)
1094
+ !52 = !DILocation(line: 113, column: 15, scope: !46, inlinedAt: !47)
1095
+ !53 = !DILocation(line: 113, column: 30, scope: !46, inlinedAt: !47)
1096
+ !54 = !DILocation(line: 113, column: 49, scope: !46, inlinedAt: !47)
1097
+ !55 = !DILocation(line: 113, column: 22, scope: !46, inlinedAt: !47)
1098
+ !56 = !DILocation(line: 113, column: 38, scope: !46, inlinedAt: !47)
1099
+ !57 = !DILocation(line: 120, column: 46, scope: !39, inlinedAt: !58)
1100
+ !58 = !DILocation(line: 53, column: 44, scope: !39)
1101
+ !59 = !DILocation(line: 109, column: 28, scope: !46, inlinedAt: !47)
1102
+ !60 = !DILocation(line: 110, column: 39, scope: !46, inlinedAt: !47)
1103
+ !61 = !DILocation(line: 110, column: 49, scope: !46, inlinedAt: !47)
1104
+ !62 = !DILocation(line: 62, column: 51, scope: !7)
1105
+ !63 = !DILocation(line: 63, column: 51, scope: !7)
1106
+ !64 = !DILocation(line: 63, column: 103, scope: !7)
1107
+ !65 = !DILocation(line: 64, column: 35, scope: !7)
1108
+ !66 = !DILocation(line: 64, column: 40, scope: !7)
1109
+ !67 = !DILocation(line: 68, column: 57, scope: !7)
1110
+ !68 = !DILocation(line: 69, column: 54, scope: !7)
1111
+ !69 = !DILocation(line: 75, column: 24, scope: !7)
1112
+ !70 = !DILocation(line: 77, column: 24, scope: !7)
1113
+ !71 = !DILocation(line: 78, column: 30, scope: !7)
1114
+ !72 = !DILocation(line: 70, column: 24, scope: !7)
1115
+ !73 = !DILocation(line: 72, column: 24, scope: !7)
1116
+ !74 = !DILocation(line: 73, column: 24, scope: !7)
1117
+ !75 = !DILocation(line: 79, column: 24, scope: !7)
1118
+ !76 = !DILocation(line: 80, column: 24, scope: !7)
1119
+ !77 = !DILocation(line: 82, column: 29, scope: !7)
1120
+ !78 = !DILocation(line: 82, column: 52, scope: !7)
1121
+ !79 = !DILocation(line: 58, column: 4, scope: !7)
.triton/dump/174400122b6dbc99e086544aa1856b9f/triton_.llir ADDED
@@ -0,0 +1,330 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @assertFunc_1 = internal constant [25 x i8] c"_call_with_frames_removed"
5
+ @assertFile_1 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
6
+ @assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp13 < 50257"
7
+ @assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed"
8
+ @assertFile_0 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
9
+ @assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257"
10
+ @.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
11
+
12
+ declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr
13
+
14
+ define void @triton__0d1d2d3d4d5de6de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i32 %5, i32 %6) local_unnamed_addr !dbg !7 {
15
+ %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
16
+ %9 = lshr i32 %8, 2, !dbg !10
17
+ %10 = and i32 %9, 63, !dbg !10
18
+ %11 = and i32 %8, 63, !dbg !10
19
+ %12 = and i32 %8, 3, !dbg !11
20
+ %13 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #5, !dbg !12
21
+ %14 = shl i32 %13, 6, !dbg !13
22
+ %15 = or i32 %14, %10, !dbg !14
23
+ %16 = or i32 %14, %11, !dbg !14
24
+ %17 = sext i32 %15 to i64, !dbg !15
25
+ %18 = getelementptr i64, ptr addrspace(1) %0, i64 %17, !dbg !15
26
+ %19 = sext i32 %16 to i64, !dbg !15
27
+ %20 = getelementptr i64, ptr addrspace(1) %0, i64 %19, !dbg !15
28
+ %21 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %18, i1 true) #5, !dbg !16
29
+ %22 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %20, i1 true) #5, !dbg !16
30
+ %23 = srem i32 %15, 512, !dbg !17
31
+ %24 = shl nsw i32 %23, 8, !dbg !18
32
+ %25 = add i64 %22, 50257, !dbg !19
33
+ %26 = icmp slt i64 %21, 0, !dbg !20
34
+ %27 = icmp slt i64 %22, 0, !dbg !20
35
+ %28 = select i1 %27, i64 %25, i64 %22, !dbg !21
36
+ %.fr8 = freeze i64 %28, !dbg !22
37
+ %29 = icmp ugt i64 %.fr8, 50256, !dbg !22
38
+ %30 = shl i64 %21, 8, !dbg !23
39
+ %31 = add i64 %30, 12865792, !dbg !23
40
+ %32 = select i1 %26, i64 %31, i64 %30, !dbg !23
41
+ %33 = getelementptr float, ptr addrspace(1) %1, i64 %32
42
+ br i1 %29, label %.split.us, label %.split, !dbg !24
43
+
44
+ .split.us: ; preds = %7, %.split.us
45
+ %34 = phi float [ %50, %.split.us ], [ 0.000000e+00, %7 ]
46
+ %35 = phi float [ %55, %.split.us ], [ 0.000000e+00, %7 ]
47
+ %36 = phi float [ %52, %.split.us ], [ 0.000000e+00, %7 ]
48
+ %37 = phi i32 [ %56, %.split.us ], [ 0, %7 ]
49
+ %38 = or i32 %37, %12, !dbg !25
50
+ %39 = add i32 %38, %24, !dbg !26
51
+ %40 = sext i32 %39 to i64, !dbg !27
52
+ %41 = getelementptr float, ptr addrspace(1) %2, i64 %40, !dbg !27
53
+ %42 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %41, i1 true, i32 0, i1 true) #5, !dbg !28
54
+ %43 = bitcast i32 %42 to float, !dbg !28
55
+ tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !24
56
+ %44 = zext nneg i32 %38 to i64, !dbg !29
57
+ %45 = getelementptr float, ptr addrspace(1) %33, i64 %44, !dbg !30
58
+ %46 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %45, i1 true, i32 0, i1 true) #5, !dbg !31
59
+ %47 = bitcast i32 %46 to float, !dbg !31
60
+ %48 = fadd float %43, %47, !dbg !32
61
+ %49 = fsub float %48, %36, !dbg !33
62
+ %50 = fadd float %34, 1.000000e+00, !dbg !37
63
+ %51 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %49, float %50) #5, !dbg !38
64
+ %52 = fadd float %36, %51, !dbg !39
65
+ %53 = fsub float %48, %52, !dbg !40
66
+ %54 = fmul float %49, %53, !dbg !41
67
+ %55 = fadd float %35, %54, !dbg !42
68
+ %56 = add nuw nsw i32 %37, 4, !dbg !43
69
+ %57 = icmp ult i32 %37, 252, !dbg !43
70
+ br i1 %57, label %.split.us, label %.split5.us, !dbg !43
71
+
72
+ .split: ; preds = %7, %.split
73
+ %58 = phi float [ %74, %.split ], [ 0.000000e+00, %7 ]
74
+ %59 = phi float [ %79, %.split ], [ 0.000000e+00, %7 ]
75
+ %60 = phi float [ %76, %.split ], [ 0.000000e+00, %7 ]
76
+ %61 = phi i32 [ %80, %.split ], [ 0, %7 ]
77
+ %62 = or i32 %61, %12, !dbg !25
78
+ %63 = add i32 %62, %24, !dbg !26
79
+ %64 = sext i32 %63 to i64, !dbg !27
80
+ %65 = getelementptr float, ptr addrspace(1) %2, i64 %64, !dbg !27
81
+ %66 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %65, i1 true, i32 0, i1 true) #5, !dbg !28
82
+ %67 = bitcast i32 %66 to float, !dbg !28
83
+ %68 = zext nneg i32 %62 to i64, !dbg !29
84
+ %69 = getelementptr float, ptr addrspace(1) %33, i64 %68, !dbg !30
85
+ %70 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %69, i1 true, i32 0, i1 true) #5, !dbg !31
86
+ %71 = bitcast i32 %70 to float, !dbg !31
87
+ %72 = fadd float %67, %71, !dbg !32
88
+ %73 = fsub float %72, %60, !dbg !33
89
+ %74 = fadd float %58, 1.000000e+00, !dbg !37
90
+ %75 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %73, float %74) #5, !dbg !38
91
+ %76 = fadd float %60, %75, !dbg !39
92
+ %77 = fsub float %72, %76, !dbg !40
93
+ %78 = fmul float %73, %77, !dbg !41
94
+ %79 = fadd float %59, %78, !dbg !42
95
+ %80 = add nuw nsw i32 %61, 4, !dbg !43
96
+ %81 = icmp ult i32 %61, 252, !dbg !43
97
+ br i1 %81, label %.split, label %.split5.us, !dbg !43
98
+
99
+ .split5.us: ; preds = %.split, %.split.us
100
+ %.us-phi = phi float [ %52, %.split.us ], [ %76, %.split ]
101
+ %.us-phi6 = phi float [ %55, %.split.us ], [ %79, %.split ]
102
+ %.us-phi7 = phi float [ %50, %.split.us ], [ %74, %.split ]
103
+ %82 = bitcast float %.us-phi to i32, !dbg !44
104
+ %83 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %82, i32 2, i32 31), !dbg !44
105
+ %84 = bitcast i32 %83 to float, !dbg !44
106
+ %85 = bitcast float %.us-phi6 to i32, !dbg !44
107
+ %86 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %85, i32 2, i32 31), !dbg !44
108
+ %87 = bitcast i32 %86 to float, !dbg !44
109
+ %88 = bitcast float %.us-phi7 to i32, !dbg !44
110
+ %89 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %88, i32 2, i32 31), !dbg !44
111
+ %90 = bitcast i32 %89 to float, !dbg !44
112
+ %91 = fsub float %84, %.us-phi, !dbg !46
113
+ %92 = fadd float %.us-phi7, %90, !dbg !50
114
+ %93 = fcmp oeq float %92, 0.000000e+00, !dbg !51
115
+ %94 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %90, float %92) #5, !dbg !52
116
+ %95 = select i1 %93, float 0.000000e+00, float %94, !dbg !53
117
+ %96 = fmul float %91, %95, !dbg !54
118
+ %97 = fadd float %.us-phi, %96, !dbg !55
119
+ %98 = fadd float %.us-phi6, %87, !dbg !56
120
+ %99 = fmul float %91, %91, !dbg !57
121
+ %100 = fmul float %.us-phi7, %99, !dbg !58
122
+ %101 = fmul float %100, %95, !dbg !59
123
+ %102 = fadd float %98, %101, !dbg !60
124
+ %103 = bitcast float %97 to i32, !dbg !44
125
+ %104 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %103, i32 1, i32 31), !dbg !44
126
+ %105 = bitcast i32 %104 to float, !dbg !44
127
+ %106 = bitcast float %102 to i32, !dbg !44
128
+ %107 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %106, i32 1, i32 31), !dbg !44
129
+ %108 = bitcast i32 %107 to float, !dbg !44
130
+ %109 = bitcast float %92 to i32, !dbg !44
131
+ %110 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %109, i32 1, i32 31), !dbg !44
132
+ %111 = bitcast i32 %110 to float, !dbg !44
133
+ %112 = fsub float %105, %97, !dbg !46
134
+ %113 = fadd float %92, %111, !dbg !50
135
+ %114 = fcmp oeq float %113, 0.000000e+00, !dbg !51
136
+ %115 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %111, float %113) #5, !dbg !52
137
+ %116 = select i1 %114, float 0.000000e+00, float %115, !dbg !53
138
+ %117 = fmul float %112, %116, !dbg !54
139
+ %118 = fadd float %97, %117, !dbg !55
140
+ %119 = fadd float %102, %108, !dbg !56
141
+ %120 = fmul float %112, %112, !dbg !57
142
+ %121 = fmul float %92, %120, !dbg !58
143
+ %122 = fmul float %116, %121, !dbg !59
144
+ %123 = fadd float %119, %122, !dbg !60
145
+ %124 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %123, float 2.560000e+02) #5, !dbg !61
146
+ %125 = fadd float %124, 0x3EE4F8B580000000, !dbg !62
147
+ %126 = shl i32 %15, 8, !dbg !63
148
+ br label %127, !dbg !64
149
+
150
+ 127: ; preds = %.split5.us, %__nv_rsqrtf.exit
151
+ %128 = phi i32 [ 0, %.split5.us ], [ %157, %__nv_rsqrtf.exit ]
152
+ %129 = or i32 %128, %12, !dbg !65
153
+ %130 = add i32 %129, %24, !dbg !66
154
+ %131 = sext i32 %130 to i64, !dbg !67
155
+ %132 = getelementptr float, ptr addrspace(1) %2, i64 %131, !dbg !67
156
+ %133 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %132, i1 true, i32 0, i1 true) #5, !dbg !68
157
+ %134 = bitcast i32 %133 to float, !dbg !68
158
+ %135 = zext nneg i32 %129 to i64, !dbg !69
159
+ %136 = getelementptr float, ptr addrspace(1) %3, i64 %135, !dbg !69
160
+ %137 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %136, i1 true, i32 0, i1 true) #5, !dbg !70
161
+ %138 = bitcast i32 %137 to float, !dbg !70
162
+ br i1 %29, label %139, label %140, !dbg !71
163
+
164
+ 139: ; preds = %127
165
+ tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 883, ptr nonnull @assertFunc_1, i64 1), !dbg !71
166
+ br label %140, !dbg !71
167
+
168
+ 140: ; preds = %139, %127
169
+ %141 = getelementptr float, ptr addrspace(1) %33, i64 %135, !dbg !72
170
+ %142 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %141, i1 true, i32 0, i1 true) #5, !dbg !73
171
+ %143 = bitcast i32 %142 to float, !dbg !73
172
+ %144 = fadd float %134, %143, !dbg !74
173
+ %145 = fsub float %144, %118, !dbg !75
174
+ %146 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !76
175
+ %.not.i = icmp eq i32 %146, 0, !dbg !76
176
+ br i1 %.not.i, label %149, label %147, !dbg !76
177
+
178
+ 147: ; preds = %140
179
+ %148 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %125), !dbg !76
180
+ br label %__nv_rsqrtf.exit, !dbg !76
181
+
182
+ 149: ; preds = %140
183
+ %150 = tail call float @llvm.nvvm.rsqrt.approx.f(float %125), !dbg !76
184
+ br label %__nv_rsqrtf.exit, !dbg !76
185
+
186
+ __nv_rsqrtf.exit: ; preds = %147, %149
187
+ %.0.i = phi float [ %148, %147 ], [ %150, %149 ], !dbg !76
188
+ %151 = fmul float %145, %.0.i, !dbg !77
189
+ %152 = fmul float %151, %138, !dbg !78
190
+ %153 = add i32 %129, %126, !dbg !79
191
+ %154 = sext i32 %153 to i64, !dbg !80
192
+ %155 = getelementptr i16, ptr addrspace(1) %4, i64 %154, !dbg !80
193
+ %156 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %152) #5, !dbg !81
194
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %156, ptr addrspace(1) %155, i1 true) #5, !dbg !81
195
+ %157 = add nuw nsw i32 %128, 4, !dbg !64
196
+ %158 = icmp ult i32 %128, 252, !dbg !64
197
+ br i1 %158, label %127, label %159, !dbg !64
198
+
199
+ 159: ; preds = %__nv_rsqrtf.exit
200
+ ret void, !dbg !82
201
+ }
202
+
203
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
204
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
205
+
206
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
207
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
208
+
209
+ ; Function Attrs: alwaysinline nounwind
210
+ define float @__nv_rsqrtf(float %x) local_unnamed_addr #2 {
211
+ %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5
212
+ %.not = icmp eq i32 %1, 0
213
+ br i1 %.not, label %4, label %2
214
+
215
+ 2: ; preds = %0
216
+ %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
217
+ br label %6
218
+
219
+ 4: ; preds = %0
220
+ %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
221
+ br label %6
222
+
223
+ 6: ; preds = %4, %2
224
+ %.0 = phi float [ %3, %2 ], [ %5, %4 ]
225
+ ret float %.0
226
+ }
227
+
228
+ declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #3
229
+
230
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
231
+ declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #4
232
+
233
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
234
+ declare float @llvm.nvvm.rsqrt.approx.f(float) #4
235
+
236
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
237
+ attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
238
+ attributes #2 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
239
+ attributes #3 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
240
+ attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
241
+ attributes #5 = { nounwind }
242
+
243
+ !llvm.module.flags = !{!0, !1}
244
+ !llvm.dbg.cu = !{!2}
245
+ !nvvm.annotations = !{!4, !5, !5, !4}
246
+ !llvm.ident = !{!6}
247
+
248
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
249
+ !1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
250
+ !2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
251
+ !3 = !DIFile(filename: "cgx5lxpuexpindj4dsmjz5x42uhyy7iskevq7ovzpwagb3t5powj.py", directory: "/tmp/torchinductor_root/gx")
252
+ !4 = !{ptr @triton__0d1d2d3d4d5de6de, !"kernel", i32 1}
253
+ !5 = !{ptr @triton__0d1d2d3d4d5de6de, !"maxntidx", i32 256}
254
+ !6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
255
+ !7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5de6de", linkageName: "triton__0d1d2d3d4d5de6de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
256
+ !8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
257
+ !9 = !{}
258
+ !10 = !DILocation(line: 22, column: 44, scope: !7)
259
+ !11 = !DILocation(line: 24, column: 33, scope: !7)
260
+ !12 = !DILocation(line: 21, column: 28, scope: !7)
261
+ !13 = !DILocation(line: 21, column: 33, scope: !7)
262
+ !14 = !DILocation(line: 22, column: 23, scope: !7)
263
+ !15 = !DILocation(line: 26, column: 30, scope: !7)
264
+ !16 = !DILocation(line: 26, column: 35, scope: !7)
265
+ !17 = !DILocation(line: 27, column: 18, scope: !7)
266
+ !18 = !DILocation(line: 35, column: 44, scope: !7)
267
+ !19 = !DILocation(line: 36, column: 22, scope: !7)
268
+ !20 = !DILocation(line: 37, column: 22, scope: !7)
269
+ !21 = !DILocation(line: 38, column: 36, scope: !7)
270
+ !22 = !DILocation(line: 39, column: 40, scope: !7)
271
+ !23 = !DILocation(line: 40, column: 44, scope: !7)
272
+ !24 = !DILocation(line: 39, column: 55, scope: !7)
273
+ !25 = !DILocation(line: 32, column: 27, scope: !7)
274
+ !26 = !DILocation(line: 35, column: 40, scope: !7)
275
+ !27 = !DILocation(line: 35, column: 34, scope: !7)
276
+ !28 = !DILocation(line: 35, column: 50, scope: !7)
277
+ !29 = !DILocation(line: 40, column: 40, scope: !7)
278
+ !30 = !DILocation(line: 40, column: 34, scope: !7)
279
+ !31 = !DILocation(line: 40, column: 52, scope: !7)
280
+ !32 = !DILocation(line: 41, column: 22, scope: !7)
281
+ !33 = !DILocation(line: 96, column: 20, scope: !34, inlinedAt: !36)
282
+ !34 = distinct !DILexicalBlockFile(scope: !7, file: !35, discriminator: 0)
283
+ !35 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
284
+ !36 = !DILocation(line: 44, column: 38, scope: !34)
285
+ !37 = !DILocation(line: 97, column: 26, scope: !34, inlinedAt: !36)
286
+ !38 = !DILocation(line: 98, column: 30, scope: !34, inlinedAt: !36)
287
+ !39 = !DILocation(line: 98, column: 22, scope: !34, inlinedAt: !36)
288
+ !40 = !DILocation(line: 101, column: 30, scope: !34, inlinedAt: !36)
289
+ !41 = !DILocation(line: 101, column: 22, scope: !34, inlinedAt: !36)
290
+ !42 = !DILocation(line: 47, column: 48, scope: !7)
291
+ !43 = !DILocation(line: 31, column: 36, scope: !7)
292
+ !44 = !DILocation(line: 120, column: 46, scope: !34, inlinedAt: !45)
293
+ !45 = !DILocation(line: 50, column: 41, scope: !34)
294
+ !46 = !DILocation(line: 108, column: 21, scope: !47, inlinedAt: !48)
295
+ !47 = distinct !DILexicalBlockFile(scope: !34, file: !35, discriminator: 0)
296
+ !48 = !DILocation(line: 120, column: 46, scope: !47, inlinedAt: !49)
297
+ !49 = !DILocation(line: 50, column: 41, scope: !47)
298
+ !50 = !DILocation(line: 109, column: 28, scope: !47, inlinedAt: !48)
299
+ !51 = !DILocation(line: 110, column: 39, scope: !47, inlinedAt: !48)
300
+ !52 = !DILocation(line: 110, column: 60, scope: !47, inlinedAt: !48)
301
+ !53 = !DILocation(line: 110, column: 49, scope: !47, inlinedAt: !48)
302
+ !54 = !DILocation(line: 112, column: 25, scope: !47, inlinedAt: !48)
303
+ !55 = !DILocation(line: 112, column: 17, scope: !47, inlinedAt: !48)
304
+ !56 = !DILocation(line: 113, column: 15, scope: !47, inlinedAt: !48)
305
+ !57 = !DILocation(line: 113, column: 30, scope: !47, inlinedAt: !48)
306
+ !58 = !DILocation(line: 113, column: 38, scope: !47, inlinedAt: !48)
307
+ !59 = !DILocation(line: 113, column: 49, scope: !47, inlinedAt: !48)
308
+ !60 = !DILocation(line: 113, column: 22, scope: !47, inlinedAt: !48)
309
+ !61 = !DILocation(line: 69, column: 23, scope: !7)
310
+ !62 = !DILocation(line: 71, column: 24, scope: !7)
311
+ !63 = !DILocation(line: 76, column: 39, scope: !7)
312
+ !64 = !DILocation(line: 55, column: 36, scope: !7)
313
+ !65 = !DILocation(line: 56, column: 27, scope: !7)
314
+ !66 = !DILocation(line: 59, column: 41, scope: !7)
315
+ !67 = !DILocation(line: 59, column: 35, scope: !7)
316
+ !68 = !DILocation(line: 59, column: 51, scope: !7)
317
+ !69 = !DILocation(line: 60, column: 35, scope: !7)
318
+ !70 = !DILocation(line: 60, column: 40, scope: !7)
319
+ !71 = !DILocation(line: 64, column: 57, scope: !7)
320
+ !72 = !DILocation(line: 65, column: 35, scope: !7)
321
+ !73 = !DILocation(line: 65, column: 54, scope: !7)
322
+ !74 = !DILocation(line: 66, column: 24, scope: !7)
323
+ !75 = !DILocation(line: 67, column: 24, scope: !7)
324
+ !76 = !DILocation(line: 72, column: 30, scope: !7)
325
+ !77 = !DILocation(line: 73, column: 24, scope: !7)
326
+ !78 = !DILocation(line: 74, column: 24, scope: !7)
327
+ !79 = !DILocation(line: 76, column: 35, scope: !7)
328
+ !80 = !DILocation(line: 76, column: 29, scope: !7)
329
+ !81 = !DILocation(line: 76, column: 52, scope: !7)
330
+ !82 = !DILocation(line: 55, column: 4, scope: !7)
.triton/dump/21d0195c63fb062bfc567b79c9bb2771/triton_.llir ADDED
@@ -0,0 +1,321 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @global_smem = external addrspace(3) global [0 x i8]
5
+
6
+ define void @triton__0d1d2d3d4d5d6d7d8de9de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, ptr addrspace(1) %7, i32 %8, i32 %9) local_unnamed_addr !dbg !5 {
7
+ %11 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
8
+ %12 = and i32 %11, 31, !dbg !8
9
+ %13 = lshr i32 %11, 5, !dbg !8
10
+ %14 = and i32 %13, 1, !dbg !8
11
+ %urem = shl i32 %11, 2, !dbg !8
12
+ %15 = and i32 %urem, 252, !dbg !8
13
+ %16 = or i32 %15, 1, !dbg !8
14
+ %17 = or i32 %15, 2, !dbg !8
15
+ %18 = or i32 %15, 3, !dbg !8
16
+ %19 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !9
17
+ %20 = shl i32 %19, 8, !dbg !10
18
+ %21 = or i32 %20, %15, !dbg !11
19
+ %22 = sext i32 %21 to i64, !dbg !12
20
+ %23 = getelementptr i16, ptr addrspace(1) %1, i64 %22, !dbg !12
21
+ %24 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %23, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !13
22
+ %25 = extractvalue { i32, i32 } %24, 0, !dbg !13
23
+ %26 = extractvalue { i32, i32 } %24, 1, !dbg !13
24
+ %27 = trunc i32 %25 to i16, !dbg !13
25
+ %extelt.offset = lshr i32 %25, 16, !dbg !13
26
+ %28 = trunc i32 %extelt.offset to i16, !dbg !13
27
+ %29 = trunc i32 %26 to i16, !dbg !13
28
+ %extelt.offset1 = lshr i32 %26, 16, !dbg !13
29
+ %30 = trunc i32 %extelt.offset1 to i16, !dbg !13
30
+ %31 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %27) #3, !dbg !14
31
+ %32 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %28) #3, !dbg !14
32
+ %33 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %29) #3, !dbg !14
33
+ %34 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %30) #3, !dbg !14
34
+ %35 = zext nneg i32 %15 to i64, !dbg !15
35
+ %36 = getelementptr float, ptr addrspace(1) %2, i64 %35, !dbg !15
36
+ %37 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %36, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !16
37
+ %38 = extractvalue { i32, i32, i32, i32 } %37, 0, !dbg !16
38
+ %39 = extractvalue { i32, i32, i32, i32 } %37, 1, !dbg !16
39
+ %40 = extractvalue { i32, i32, i32, i32 } %37, 2, !dbg !16
40
+ %41 = extractvalue { i32, i32, i32, i32 } %37, 3, !dbg !16
41
+ %42 = bitcast i32 %38 to float, !dbg !16
42
+ %43 = bitcast i32 %39 to float, !dbg !16
43
+ %44 = bitcast i32 %40 to float, !dbg !16
44
+ %45 = bitcast i32 %41 to float, !dbg !16
45
+ %46 = getelementptr float, ptr addrspace(1) %3, i64 %22, !dbg !17
46
+ %47 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %46, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !18
47
+ %48 = extractvalue { i32, i32, i32, i32 } %47, 0, !dbg !18
48
+ %49 = extractvalue { i32, i32, i32, i32 } %47, 1, !dbg !18
49
+ %50 = extractvalue { i32, i32, i32, i32 } %47, 2, !dbg !18
50
+ %51 = extractvalue { i32, i32, i32, i32 } %47, 3, !dbg !18
51
+ %52 = bitcast i32 %48 to float, !dbg !18
52
+ %53 = bitcast i32 %49 to float, !dbg !18
53
+ %54 = bitcast i32 %50 to float, !dbg !18
54
+ %55 = bitcast i32 %51 to float, !dbg !18
55
+ %56 = sext i32 %19 to i64, !dbg !19
56
+ %57 = getelementptr float, ptr addrspace(1) %4, i64 %56, !dbg !19
57
+ %58 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %57, i1 true) #3, !dbg !20
58
+ %59 = bitcast i32 %58 to float, !dbg !20
59
+ %60 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %57, i1 true) #3, !dbg !20
60
+ %61 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %57, i1 true) #3, !dbg !20
61
+ %62 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %57, i1 true) #3, !dbg !20
62
+ %63 = getelementptr float, ptr addrspace(1) %5, i64 %56, !dbg !21
63
+ %64 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %63, i1 true) #3, !dbg !22
64
+ %65 = bitcast i32 %64 to float, !dbg !22
65
+ %66 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %63, i1 true) #3, !dbg !22
66
+ %67 = bitcast i32 %66 to float, !dbg !22
67
+ %68 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %63, i1 true) #3, !dbg !22
68
+ %69 = bitcast i32 %68 to float, !dbg !22
69
+ %70 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %63, i1 true) #3, !dbg !22
70
+ %71 = bitcast i32 %70 to float, !dbg !22
71
+ %72 = getelementptr i64, ptr addrspace(1) %6, i64 %56, !dbg !23
72
+ %73 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %72, i1 true) #3, !dbg !24
73
+ %74 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %72, i1 true) #3, !dbg !24
74
+ %75 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %72, i1 true) #3, !dbg !24
75
+ %76 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %72, i1 true) #3, !dbg !24
76
+ %77 = getelementptr float, ptr addrspace(1) %0, i64 %22, !dbg !25
77
+ %78 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %77, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !26
78
+ %79 = extractvalue { i32, i32, i32, i32 } %78, 0, !dbg !26
79
+ %80 = extractvalue { i32, i32, i32, i32 } %78, 1, !dbg !26
80
+ %81 = extractvalue { i32, i32, i32, i32 } %78, 2, !dbg !26
81
+ %82 = extractvalue { i32, i32, i32, i32 } %78, 3, !dbg !26
82
+ %83 = bitcast i32 %79 to float, !dbg !26
83
+ %84 = bitcast i32 %80 to float, !dbg !26
84
+ %85 = bitcast i32 %81 to float, !dbg !26
85
+ %86 = bitcast i32 %82 to float, !dbg !26
86
+ %87 = fmul float %31, %42, !dbg !27
87
+ %88 = fmul float %32, %43, !dbg !27
88
+ %89 = fmul float %33, %44, !dbg !27
89
+ %90 = fmul float %34, %45, !dbg !27
90
+ %91 = fadd float %87, %88, !dbg !28
91
+ %92 = fadd float %89, %91, !dbg !28
92
+ %93 = fadd float %90, %92, !dbg !28
93
+ %94 = bitcast float %93 to i32, !dbg !34
94
+ %95 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %94, i32 16, i32 31), !dbg !34
95
+ %96 = bitcast i32 %95 to float, !dbg !34
96
+ %97 = fadd float %93, %96, !dbg !28
97
+ %98 = bitcast float %97 to i32, !dbg !34
98
+ %99 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %98, i32 8, i32 31), !dbg !34
99
+ %100 = bitcast i32 %99 to float, !dbg !34
100
+ %101 = fadd float %97, %100, !dbg !28
101
+ %102 = bitcast float %101 to i32, !dbg !34
102
+ %103 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %102, i32 4, i32 31), !dbg !34
103
+ %104 = bitcast i32 %103 to float, !dbg !34
104
+ %105 = fadd float %101, %104, !dbg !28
105
+ %106 = bitcast float %105 to i32, !dbg !34
106
+ %107 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %106, i32 2, i32 31), !dbg !34
107
+ %108 = bitcast i32 %107 to float, !dbg !34
108
+ %109 = fadd float %105, %108, !dbg !28
109
+ %110 = bitcast float %109 to i32, !dbg !34
110
+ %111 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %110, i32 1, i32 31), !dbg !34
111
+ %112 = bitcast i32 %111 to float, !dbg !34
112
+ %113 = fadd float %109, %112, !dbg !28
113
+ %114 = icmp eq i32 %12, 0, !dbg !34
114
+ %115 = zext nneg i32 %14 to i64, !dbg !34
115
+ %116 = getelementptr float, ptr addrspace(3) @global_smem, i64 %115, !dbg !34
116
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %116, float %113, i1 %114) #3, !dbg !34
117
+ tail call void @llvm.nvvm.barrier0(), !dbg !34
118
+ %117 = icmp slt i32 %11, 2, !dbg !34
119
+ %118 = sext i32 %11 to i64, !dbg !34
120
+ %119 = getelementptr float, ptr addrspace(3) @global_smem, i64 %118, !dbg !34
121
+ %120 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %119, i1 %117) #3, !dbg !34
122
+ %121 = bitcast float %120 to i32, !dbg !34
123
+ %122 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %121, i32 1, i32 31), !dbg !34
124
+ %123 = bitcast i32 %122 to float, !dbg !34
125
+ %124 = fadd float %120, %123, !dbg !28
126
+ %125 = and i32 %11, 1, !dbg !34
127
+ %126 = icmp eq i32 %125, 0, !dbg !34
128
+ %127 = and i1 %117, %126, !dbg !34
129
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %119, float %124, i1 %127) #3, !dbg !34
130
+ tail call void @llvm.nvvm.barrier0(), !dbg !34
131
+ %128 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !34
132
+ %129 = fadd float %128, 0.000000e+00, !dbg !36
133
+ %130 = fsub float %52, %59, !dbg !40
134
+ %131 = fsub float %53, %59, !dbg !40
135
+ %132 = fsub float %54, %59, !dbg !40
136
+ %133 = fsub float %55, %59, !dbg !40
137
+ %134 = fmul float %130, %65, !dbg !41
138
+ %135 = fmul float %131, %65, !dbg !41
139
+ %136 = fmul float %132, %65, !dbg !41
140
+ %137 = fmul float %133, %65, !dbg !41
141
+ %138 = fmul float %87, %134, !dbg !42
142
+ %139 = fmul float %88, %135, !dbg !42
143
+ %140 = fmul float %89, %136, !dbg !42
144
+ %141 = fmul float %90, %137, !dbg !42
145
+ tail call void @llvm.nvvm.barrier0(), !dbg !43
146
+ %142 = fadd float %138, %139, !dbg !45
147
+ %143 = fadd float %140, %142, !dbg !45
148
+ %144 = fadd float %141, %143, !dbg !45
149
+ %145 = bitcast float %144 to i32, !dbg !43
150
+ %146 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %145, i32 16, i32 31), !dbg !43
151
+ %147 = bitcast i32 %146 to float, !dbg !43
152
+ %148 = fadd float %144, %147, !dbg !45
153
+ %149 = bitcast float %148 to i32, !dbg !43
154
+ %150 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %149, i32 8, i32 31), !dbg !43
155
+ %151 = bitcast i32 %150 to float, !dbg !43
156
+ %152 = fadd float %148, %151, !dbg !45
157
+ %153 = bitcast float %152 to i32, !dbg !43
158
+ %154 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %153, i32 4, i32 31), !dbg !43
159
+ %155 = bitcast i32 %154 to float, !dbg !43
160
+ %156 = fadd float %152, %155, !dbg !45
161
+ %157 = bitcast float %156 to i32, !dbg !43
162
+ %158 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %157, i32 2, i32 31), !dbg !43
163
+ %159 = bitcast i32 %158 to float, !dbg !43
164
+ %160 = fadd float %156, %159, !dbg !45
165
+ %161 = bitcast float %160 to i32, !dbg !43
166
+ %162 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %161, i32 1, i32 31), !dbg !43
167
+ %163 = bitcast i32 %162 to float, !dbg !43
168
+ %164 = fadd float %160, %163, !dbg !45
169
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %116, float %164, i1 %114) #3, !dbg !43
170
+ tail call void @llvm.nvvm.barrier0(), !dbg !43
171
+ %165 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %119, i1 %117) #3, !dbg !43
172
+ %166 = bitcast float %165 to i32, !dbg !43
173
+ %167 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %166, i32 1, i32 31), !dbg !43
174
+ %168 = bitcast i32 %167 to float, !dbg !43
175
+ %169 = fadd float %165, %168, !dbg !45
176
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %119, float %169, i1 %127) #3, !dbg !43
177
+ tail call void @llvm.nvvm.barrier0(), !dbg !43
178
+ %170 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !43
179
+ %171 = fadd float %170, 0.000000e+00, !dbg !48
180
+ %172 = icmp eq i64 %73, -1, !dbg !50
181
+ %173 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %65, float 2.560000e+02) #3, !dbg !51
182
+ %174 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %67, float 2.560000e+02) #3, !dbg !51
183
+ %175 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %69, float 2.560000e+02) #3, !dbg !51
184
+ %176 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %71, float 2.560000e+02) #3, !dbg !51
185
+ %177 = fmul float %87, 2.560000e+02, !dbg !52
186
+ %178 = fmul float %88, 2.560000e+02, !dbg !52
187
+ %179 = fmul float %89, 2.560000e+02, !dbg !52
188
+ %180 = fmul float %90, 2.560000e+02, !dbg !52
189
+ %181 = fsub float %177, %129, !dbg !53
190
+ %182 = fsub float %178, %129, !dbg !53
191
+ %183 = fsub float %179, %129, !dbg !53
192
+ %184 = fsub float %180, %129, !dbg !53
193
+ %185 = fmul float %134, %171, !dbg !54
194
+ %186 = fmul float %135, %171, !dbg !54
195
+ %187 = fmul float %136, %171, !dbg !54
196
+ %188 = fmul float %137, %171, !dbg !54
197
+ %189 = fsub float %181, %185, !dbg !55
198
+ %190 = fsub float %182, %186, !dbg !55
199
+ %191 = fsub float %183, %187, !dbg !55
200
+ %192 = fsub float %184, %188, !dbg !55
201
+ %193 = fmul float %173, %189, !dbg !56
202
+ %194 = fmul float %173, %190, !dbg !56
203
+ %195 = fmul float %173, %191, !dbg !56
204
+ %196 = fmul float %173, %192, !dbg !56
205
+ %197 = fadd float %193, %83, !dbg !57
206
+ %198 = fadd float %194, %84, !dbg !57
207
+ %199 = fadd float %195, %85, !dbg !57
208
+ %200 = fadd float %196, %86, !dbg !57
209
+ %201 = select i1 %172, float 0.000000e+00, float %197, !dbg !58
210
+ %202 = select i1 %172, float 0.000000e+00, float %198, !dbg !58
211
+ %203 = select i1 %172, float 0.000000e+00, float %199, !dbg !58
212
+ %204 = select i1 %172, float 0.000000e+00, float %200, !dbg !58
213
+ %205 = icmp slt i64 %73, 0, !dbg !59
214
+ %206 = shl i64 %73, 8, !dbg !60
215
+ %207 = add i64 %206, 12865792, !dbg !60
216
+ %208 = select i1 %205, i64 %207, i64 %206, !dbg !60
217
+ %209 = zext nneg i32 %16 to i64
218
+ %210 = zext nneg i32 %17 to i64
219
+ %211 = zext nneg i32 %18 to i64
220
+ %212 = or i64 %208, %35, !dbg !61
221
+ %213 = or i64 %208, %209, !dbg !61
222
+ %214 = or i64 %208, %210, !dbg !61
223
+ %215 = or i64 %208, %211, !dbg !61
224
+ %216 = getelementptr float, ptr addrspace(1) %7, i64 %212, !dbg !62
225
+ %217 = getelementptr float, ptr addrspace(1) %7, i64 %213, !dbg !62
226
+ %218 = getelementptr float, ptr addrspace(1) %7, i64 %214, !dbg !62
227
+ %219 = getelementptr float, ptr addrspace(1) %7, i64 %215, !dbg !62
228
+ %220 = insertelement <1 x float> undef, float %201, i64 0, !dbg !63
229
+ %221 = tail call float asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 atom.global.gpu.acq_rel.add.f32 $0, [ $1 + 0 ], $2;", "=r,l,r,b"(ptr addrspace(1) %216, <1 x float> %220, i1 true) #3, !dbg !63
230
+ %222 = insertelement <1 x float> undef, float %202, i64 0, !dbg !63
231
+ %223 = tail call float asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 atom.global.gpu.acq_rel.add.f32 $0, [ $1 + 0 ], $2;", "=r,l,r,b"(ptr addrspace(1) %217, <1 x float> %222, i1 true) #3, !dbg !63
232
+ %224 = insertelement <1 x float> undef, float %203, i64 0, !dbg !63
233
+ %225 = tail call float asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 atom.global.gpu.acq_rel.add.f32 $0, [ $1 + 0 ], $2;", "=r,l,r,b"(ptr addrspace(1) %218, <1 x float> %224, i1 true) #3, !dbg !63
234
+ %226 = insertelement <1 x float> undef, float %204, i64 0, !dbg !63
235
+ %227 = tail call float asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 atom.global.gpu.acq_rel.add.f32 $0, [ $1 + 0 ], $2;", "=r,l,r,b"(ptr addrspace(1) %219, <1 x float> %226, i1 true) #3, !dbg !63
236
+ ret void, !dbg !64
237
+ }
238
+
239
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
240
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
241
+
242
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
243
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
244
+
245
+ ; Function Attrs: convergent nocallback nounwind
246
+ declare void @llvm.nvvm.barrier0() #2
247
+
248
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
249
+ attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
250
+ attributes #2 = { convergent nocallback nounwind }
251
+ attributes #3 = { nounwind }
252
+
253
+ !llvm.module.flags = !{!0}
254
+ !llvm.dbg.cu = !{!1}
255
+ !nvvm.annotations = !{!3, !4, !4, !3}
256
+
257
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
258
+ !1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
259
+ !2 = !DIFile(filename: "cqryxm46jcxyr3qdktqirn53eap7h3pjjqiqavyqqyvflabjpvmd.py", directory: "/tmp/torchinductor_root/qr")
260
+ !3 = !{ptr @triton__0d1d2d3d4d5d6d7d8de9de, !"kernel", i32 1}
261
+ !4 = !{ptr @triton__0d1d2d3d4d5d6d7d8de9de, !"maxntidx", i32 64}
262
+ !5 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6d7d8de9de", linkageName: "triton__0d1d2d3d4d5d6d7d8de9de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
263
+ !6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
264
+ !7 = !{}
265
+ !8 = !DILocation(line: 26, column: 26, scope: !5)
266
+ !9 = !DILocation(line: 23, column: 28, scope: !5)
267
+ !10 = !DILocation(line: 30, column: 40, scope: !5)
268
+ !11 = !DILocation(line: 30, column: 36, scope: !5)
269
+ !12 = !DILocation(line: 30, column: 30, scope: !5)
270
+ !13 = !DILocation(line: 30, column: 46, scope: !5)
271
+ !14 = !DILocation(line: 30, column: 67, scope: !5)
272
+ !15 = !DILocation(line: 31, column: 30, scope: !5)
273
+ !16 = !DILocation(line: 31, column: 35, scope: !5)
274
+ !17 = !DILocation(line: 32, column: 30, scope: !5)
275
+ !18 = !DILocation(line: 32, column: 46, scope: !5)
276
+ !19 = !DILocation(line: 33, column: 30, scope: !5)
277
+ !20 = !DILocation(line: 33, column: 35, scope: !5)
278
+ !21 = !DILocation(line: 34, column: 31, scope: !5)
279
+ !22 = !DILocation(line: 34, column: 36, scope: !5)
280
+ !23 = !DILocation(line: 35, column: 31, scope: !5)
281
+ !24 = !DILocation(line: 35, column: 36, scope: !5)
282
+ !25 = !DILocation(line: 36, column: 35, scope: !5)
283
+ !26 = !DILocation(line: 36, column: 51, scope: !5)
284
+ !27 = !DILocation(line: 38, column: 18, scope: !5)
285
+ !28 = !DILocation(line: 233, column: 15, scope: !29, inlinedAt: !32)
286
+ !29 = distinct !DILexicalBlockFile(scope: !31, file: !30, discriminator: 0)
287
+ !30 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
288
+ !31 = distinct !DILexicalBlockFile(scope: !5, file: !30, discriminator: 0)
289
+ !32 = !DILocation(line: 243, column: 36, scope: !29, inlinedAt: !33)
290
+ !33 = !DILocation(line: 41, column: 57, scope: !29)
291
+ !34 = !DILocation(line: 243, column: 36, scope: !31, inlinedAt: !35)
292
+ !35 = !DILocation(line: 41, column: 57, scope: !31)
293
+ !36 = !DILocation(line: 8, column: 15, scope: !37, inlinedAt: !39)
294
+ !37 = distinct !DILexicalBlockFile(scope: !5, file: !38, discriminator: 0)
295
+ !38 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
296
+ !39 = !DILocation(line: 41, column: 44, scope: !37)
297
+ !40 = !DILocation(line: 42, column: 19, scope: !5)
298
+ !41 = !DILocation(line: 43, column: 20, scope: !5)
299
+ !42 = !DILocation(line: 44, column: 19, scope: !5)
300
+ !43 = !DILocation(line: 243, column: 36, scope: !31, inlinedAt: !44)
301
+ !44 = !DILocation(line: 47, column: 59, scope: !31)
302
+ !45 = !DILocation(line: 233, column: 15, scope: !29, inlinedAt: !46)
303
+ !46 = !DILocation(line: 243, column: 36, scope: !29, inlinedAt: !47)
304
+ !47 = !DILocation(line: 47, column: 59, scope: !29)
305
+ !48 = !DILocation(line: 8, column: 15, scope: !37, inlinedAt: !49)
306
+ !49 = !DILocation(line: 47, column: 45, scope: !37)
307
+ !50 = !DILocation(line: 49, column: 21, scope: !5)
308
+ !51 = !DILocation(line: 51, column: 20, scope: !5)
309
+ !52 = !DILocation(line: 52, column: 19, scope: !5)
310
+ !53 = !DILocation(line: 53, column: 20, scope: !5)
311
+ !54 = !DILocation(line: 54, column: 20, scope: !5)
312
+ !55 = !DILocation(line: 55, column: 20, scope: !5)
313
+ !56 = !DILocation(line: 56, column: 20, scope: !5)
314
+ !57 = !DILocation(line: 57, column: 20, scope: !5)
315
+ !58 = !DILocation(line: 59, column: 35, scope: !5)
316
+ !59 = !DILocation(line: 61, column: 20, scope: !5)
317
+ !60 = !DILocation(line: 63, column: 56, scope: !5)
318
+ !61 = !DILocation(line: 63, column: 52, scope: !5)
319
+ !62 = !DILocation(line: 63, column: 30, scope: !5)
320
+ !63 = !DILocation(line: 63, column: 83, scope: !5)
321
+ !64 = !DILocation(line: 63, column: 4, scope: !5)
.triton/dump/21d0195c63fb062bfc567b79c9bb2771/triton_.ttir ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2d3d4d5d6d7d8de9de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg7: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg9: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %c256_i32 = arith.constant 256 : i32
4
+ %cst = arith.constant dense<0.000000e+00> : tensor<256xbf16>
5
+ %cst_0 = arith.constant 0.000000e+00 : f32
6
+ %cst_1 = arith.constant dense<256> : tensor<1xi64>
7
+ %cst_2 = arith.constant dense<0> : tensor<1xi64>
8
+ %cst_3 = arith.constant dense<50257> : tensor<1xi64>
9
+ %cst_4 = arith.constant dense<0.000000e+00> : tensor<256xf32>
10
+ %cst_5 = arith.constant dense<2.560000e+02> : tensor<256xf32>
11
+ %cst_6 = arith.constant dense<2.560000e+02> : tensor<1xf32>
12
+ %cst_7 = arith.constant dense<-1> : tensor<1xi64>
13
+ %cst_8 = arith.constant dense<256> : tensor<256xi32>
14
+ %0 = tt.get_program_id x : i32
15
+ %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
16
+ %2 = arith.cmpi slt, %1, %cst_8 : tensor<256xi32>
17
+ %3 = arith.muli %0, %c256_i32 : i32
18
+ %4 = tt.splat %3 : (i32) -> tensor<256xi32>
19
+ %5 = arith.addi %1, %4 : tensor<256xi32>
20
+ %6 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
21
+ %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
22
+ %8 = tt.load %7, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
23
+ %9 = arith.extf %8 : tensor<256xbf16> to tensor<256xf32>
24
+ %10 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
25
+ %11 = tt.addptr %10, %1 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
26
+ %12 = tt.load %11, %2, %cst_4 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32>
27
+ %13 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
28
+ %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
29
+ %15 = tt.load %14, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32>
30
+ %16 = tt.addptr %arg4, %0 : !tt.ptr<f32, 1>, i32
31
+ %17 = tt.splat %16 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>>
32
+ %18 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32>
33
+ %19 = tt.addptr %arg5, %0 : !tt.ptr<f32, 1>, i32
34
+ %20 = tt.splat %19 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>>
35
+ %21 = tt.load %20 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32>
36
+ %22 = tt.addptr %arg6, %0 : !tt.ptr<i64, 1>, i32
37
+ %23 = tt.splat %22 : (!tt.ptr<i64, 1>) -> tensor<1x!tt.ptr<i64, 1>>
38
+ %24 = tt.load %23 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xi64>
39
+ %25 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
40
+ %26 = tt.addptr %25, %5 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
41
+ %27 = tt.load %26, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32>
42
+ %28 = arith.mulf %9, %12 : tensor<256xf32>
43
+ %29 = arith.select %2, %28, %cst_4 : tensor<256xi1>, tensor<256xf32>
44
+ %30 = "tt.reduce"(%29) <{axis = 0 : i32}> ({
45
+ ^bb0(%arg10: f32, %arg11: f32):
46
+ %63 = arith.addf %arg10, %arg11 : f32
47
+ tt.reduce.return %63 : f32
48
+ }) : (tensor<256xf32>) -> f32
49
+ %31 = arith.addf %30, %cst_0 : f32
50
+ %32 = tt.broadcast %18 : (tensor<1xf32>) -> tensor<256xf32>
51
+ %33 = arith.subf %15, %32 : tensor<256xf32>
52
+ %34 = tt.broadcast %21 : (tensor<1xf32>) -> tensor<256xf32>
53
+ %35 = arith.mulf %33, %34 : tensor<256xf32>
54
+ %36 = arith.mulf %28, %35 : tensor<256xf32>
55
+ %37 = arith.select %2, %36, %cst_4 : tensor<256xi1>, tensor<256xf32>
56
+ %38 = "tt.reduce"(%37) <{axis = 0 : i32}> ({
57
+ ^bb0(%arg10: f32, %arg11: f32):
58
+ %63 = arith.addf %arg10, %arg11 : f32
59
+ tt.reduce.return %63 : f32
60
+ }) : (tensor<256xf32>) -> f32
61
+ %39 = arith.addf %38, %cst_0 : f32
62
+ %40 = arith.cmpi eq, %24, %cst_7 : tensor<1xi64>
63
+ %41 = arith.divf %21, %cst_6 : tensor<1xf32>
64
+ %42 = arith.mulf %28, %cst_5 : tensor<256xf32>
65
+ %43 = tt.splat %31 : (f32) -> tensor<256xf32>
66
+ %44 = arith.subf %42, %43 : tensor<256xf32>
67
+ %45 = tt.splat %39 : (f32) -> tensor<256xf32>
68
+ %46 = arith.mulf %35, %45 : tensor<256xf32>
69
+ %47 = arith.subf %44, %46 : tensor<256xf32>
70
+ %48 = tt.broadcast %41 : (tensor<1xf32>) -> tensor<256xf32>
71
+ %49 = arith.mulf %48, %47 : tensor<256xf32>
72
+ %50 = arith.addf %27, %49 : tensor<256xf32>
73
+ %51 = tt.broadcast %40 : (tensor<1xi1>) -> tensor<256xi1>
74
+ %52 = arith.select %51, %cst_4, %50 : tensor<256xi1>, tensor<256xf32>
75
+ %53 = arith.addi %24, %cst_3 : tensor<1xi64>
76
+ %54 = arith.cmpi slt, %24, %cst_2 : tensor<1xi64>
77
+ %55 = arith.select %54, %53, %24 : tensor<1xi1>, tensor<1xi64>
78
+ %56 = arith.muli %55, %cst_1 : tensor<1xi64>
79
+ %57 = tt.broadcast %56 : (tensor<1xi64>) -> tensor<256xi64>
80
+ %58 = arith.extsi %1 : tensor<256xi32> to tensor<256xi64>
81
+ %59 = arith.addi %58, %57 : tensor<256xi64>
82
+ %60 = tt.splat %arg7 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
83
+ %61 = tt.addptr %60, %59 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi64>
84
+ %62 = "tt.atomic_rmw"(%61, %52, %2) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<256x!tt.ptr<f32, 1>>, tensor<256xf32>, tensor<256xi1>) -> tensor<256xf32>
85
+ tt.return
86
+ }
87
+ }
.triton/dump/345a87a492fd703c73ab83265a21fcb6/triton_.cubin ADDED
Binary file (52.2 kB). View file
 
.triton/dump/345a87a492fd703c73ab83265a21fcb6/triton_.llir ADDED
@@ -0,0 +1,980 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @assertFunc_1 = internal constant [25 x i8] c"_call_with_frames_removed"
5
+ @assertFile_1 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
6
+ @assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp13 < 50257"
7
+ @assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed"
8
+ @assertFile_0 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
9
+ @assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257"
10
+ @global_smem = external local_unnamed_addr addrspace(3) global [0 x i8]
11
+ @.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
12
+
13
+ declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr
14
+
15
+ define void @triton__0d1d2d3d4d5de6de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i32 %5, i32 %6) local_unnamed_addr !dbg !7 {
16
+ %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
17
+ %9 = lshr i32 %8, 5, !dbg !10
18
+ %10 = and i32 %9, 7, !dbg !10
19
+ %11 = and i32 %8, 15, !dbg !10
20
+ %12 = shl i32 %8, 3, !dbg !11
21
+ %13 = and i32 %12, 248, !dbg !11
22
+ %14 = or i32 %13, 4, !dbg !11
23
+ %urem = and i32 %8, 255, !dbg !11
24
+ %15 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !12
25
+ %16 = shl i32 %15, 4, !dbg !13
26
+ %17 = or i32 %16, %10, !dbg !14
27
+ %18 = or i32 %17, 8, !dbg !14
28
+ %19 = or i32 %16, %11, !dbg !14
29
+ %20 = sext i32 %17 to i64, !dbg !15
30
+ %21 = getelementptr i64, ptr addrspace(1) %0, i64 %20, !dbg !15
31
+ %22 = sext i32 %18 to i64, !dbg !15
32
+ %23 = getelementptr i64, ptr addrspace(1) %0, i64 %22, !dbg !15
33
+ %24 = sext i32 %19 to i64, !dbg !15
34
+ %25 = getelementptr i64, ptr addrspace(1) %0, i64 %24, !dbg !15
35
+ %26 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %21, i1 true) #6, !dbg !16
36
+ %27 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %21, i1 true) #6, !dbg !16
37
+ %28 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %21, i1 true) #6, !dbg !16
38
+ %29 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %21, i1 true) #6, !dbg !16
39
+ %30 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %21, i1 true) #6, !dbg !16
40
+ %31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %21, i1 true) #6, !dbg !16
41
+ %32 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %21, i1 true) #6, !dbg !16
42
+ %33 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %21, i1 true) #6, !dbg !16
43
+ %34 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !16
44
+ %35 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !16
45
+ %36 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !16
46
+ %37 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !16
47
+ %38 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !16
48
+ %39 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !16
49
+ %40 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !16
50
+ %41 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !16
51
+ %42 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #6, !dbg !16
52
+ %43 = srem i32 %17, 512, !dbg !17
53
+ %44 = srem i32 %18, 512, !dbg !17
54
+ %45 = shl nsw i32 %43, 8, !dbg !18
55
+ %46 = shl nsw i32 %44, 8, !dbg !18
56
+ %47 = or i32 %45, %13, !dbg !19
57
+ %48 = or i32 %45, %14, !dbg !19
58
+ %49 = or i32 %46, %13, !dbg !19
59
+ %50 = or i32 %46, %14, !dbg !19
60
+ %51 = sext i32 %47 to i64, !dbg !20
61
+ %52 = getelementptr float, ptr addrspace(1) %2, i64 %51, !dbg !20
62
+ %53 = sext i32 %48 to i64, !dbg !20
63
+ %54 = getelementptr float, ptr addrspace(1) %2, i64 %53, !dbg !20
64
+ %55 = sext i32 %49 to i64, !dbg !20
65
+ %56 = getelementptr float, ptr addrspace(1) %2, i64 %55, !dbg !20
66
+ %57 = sext i32 %50 to i64, !dbg !20
67
+ %58 = getelementptr float, ptr addrspace(1) %2, i64 %57, !dbg !20
68
+ %59 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %52, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !21
69
+ %60 = extractvalue { i32, i32, i32, i32 } %59, 0, !dbg !21
70
+ %61 = extractvalue { i32, i32, i32, i32 } %59, 1, !dbg !21
71
+ %62 = extractvalue { i32, i32, i32, i32 } %59, 2, !dbg !21
72
+ %63 = extractvalue { i32, i32, i32, i32 } %59, 3, !dbg !21
73
+ %64 = bitcast i32 %60 to float, !dbg !21
74
+ %65 = bitcast i32 %61 to float, !dbg !21
75
+ %66 = bitcast i32 %62 to float, !dbg !21
76
+ %67 = bitcast i32 %63 to float, !dbg !21
77
+ %68 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %54, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !21
78
+ %69 = extractvalue { i32, i32, i32, i32 } %68, 0, !dbg !21
79
+ %70 = extractvalue { i32, i32, i32, i32 } %68, 1, !dbg !21
80
+ %71 = extractvalue { i32, i32, i32, i32 } %68, 2, !dbg !21
81
+ %72 = extractvalue { i32, i32, i32, i32 } %68, 3, !dbg !21
82
+ %73 = bitcast i32 %69 to float, !dbg !21
83
+ %74 = bitcast i32 %70 to float, !dbg !21
84
+ %75 = bitcast i32 %71 to float, !dbg !21
85
+ %76 = bitcast i32 %72 to float, !dbg !21
86
+ %77 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %56, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !21
87
+ %78 = extractvalue { i32, i32, i32, i32 } %77, 0, !dbg !21
88
+ %79 = extractvalue { i32, i32, i32, i32 } %77, 1, !dbg !21
89
+ %80 = extractvalue { i32, i32, i32, i32 } %77, 2, !dbg !21
90
+ %81 = extractvalue { i32, i32, i32, i32 } %77, 3, !dbg !21
91
+ %82 = bitcast i32 %78 to float, !dbg !21
92
+ %83 = bitcast i32 %79 to float, !dbg !21
93
+ %84 = bitcast i32 %80 to float, !dbg !21
94
+ %85 = bitcast i32 %81 to float, !dbg !21
95
+ %86 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %58, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !21
96
+ %87 = extractvalue { i32, i32, i32, i32 } %86, 0, !dbg !21
97
+ %88 = extractvalue { i32, i32, i32, i32 } %86, 1, !dbg !21
98
+ %89 = extractvalue { i32, i32, i32, i32 } %86, 2, !dbg !21
99
+ %90 = extractvalue { i32, i32, i32, i32 } %86, 3, !dbg !21
100
+ %91 = bitcast i32 %87 to float, !dbg !21
101
+ %92 = bitcast i32 %88 to float, !dbg !21
102
+ %93 = bitcast i32 %89 to float, !dbg !21
103
+ %94 = bitcast i32 %90 to float, !dbg !21
104
+ %95 = add i64 %42, 50257, !dbg !22
105
+ %96 = icmp slt i64 %26, 0, !dbg !23
106
+ %97 = icmp slt i64 %34, 0, !dbg !23
107
+ %98 = icmp slt i64 %42, 0, !dbg !23
108
+ %99 = select i1 %98, i64 %95, i64 %42, !dbg !24
109
+ %100 = icmp ugt i64 %99, 50256, !dbg !25
110
+ br i1 %100, label %101, label %102, !dbg !26
111
+
112
+ 101: ; preds = %7
113
+ tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !26
114
+ br label %102, !dbg !26
115
+
116
+ 102: ; preds = %101, %7
117
+ %103 = shl i64 %26, 8, !dbg !27
118
+ %104 = add i64 %103, 12865792, !dbg !27
119
+ %105 = select i1 %96, i64 %104, i64 %103, !dbg !27
120
+ %106 = shl i64 %34, 8, !dbg !27
121
+ %107 = add i64 %106, 12865792, !dbg !27
122
+ %108 = select i1 %97, i64 %107, i64 %106, !dbg !27
123
+ %109 = zext nneg i32 %13 to i64
124
+ %110 = zext nneg i32 %14 to i64
125
+ %111 = or i64 %105, %109, !dbg !28
126
+ %112 = or i64 %105, %110, !dbg !28
127
+ %113 = or i64 %108, %109, !dbg !28
128
+ %114 = or i64 %108, %110, !dbg !28
129
+ %115 = getelementptr float, ptr addrspace(1) %1, i64 %111, !dbg !29
130
+ %116 = getelementptr float, ptr addrspace(1) %1, i64 %112, !dbg !29
131
+ %117 = getelementptr float, ptr addrspace(1) %1, i64 %113, !dbg !29
132
+ %118 = getelementptr float, ptr addrspace(1) %1, i64 %114, !dbg !29
133
+ %119 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %115, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !30
134
+ %120 = extractvalue { i32, i32, i32, i32 } %119, 0, !dbg !30
135
+ %121 = extractvalue { i32, i32, i32, i32 } %119, 1, !dbg !30
136
+ %122 = extractvalue { i32, i32, i32, i32 } %119, 2, !dbg !30
137
+ %123 = extractvalue { i32, i32, i32, i32 } %119, 3, !dbg !30
138
+ %124 = bitcast i32 %120 to float, !dbg !30
139
+ %125 = bitcast i32 %121 to float, !dbg !30
140
+ %126 = bitcast i32 %122 to float, !dbg !30
141
+ %127 = bitcast i32 %123 to float, !dbg !30
142
+ %128 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %116, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !30
143
+ %129 = extractvalue { i32, i32, i32, i32 } %128, 0, !dbg !30
144
+ %130 = extractvalue { i32, i32, i32, i32 } %128, 1, !dbg !30
145
+ %131 = extractvalue { i32, i32, i32, i32 } %128, 2, !dbg !30
146
+ %132 = extractvalue { i32, i32, i32, i32 } %128, 3, !dbg !30
147
+ %133 = bitcast i32 %129 to float, !dbg !30
148
+ %134 = bitcast i32 %130 to float, !dbg !30
149
+ %135 = bitcast i32 %131 to float, !dbg !30
150
+ %136 = bitcast i32 %132 to float, !dbg !30
151
+ %137 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %117, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !30
152
+ %138 = extractvalue { i32, i32, i32, i32 } %137, 0, !dbg !30
153
+ %139 = extractvalue { i32, i32, i32, i32 } %137, 1, !dbg !30
154
+ %140 = extractvalue { i32, i32, i32, i32 } %137, 2, !dbg !30
155
+ %141 = extractvalue { i32, i32, i32, i32 } %137, 3, !dbg !30
156
+ %142 = bitcast i32 %138 to float, !dbg !30
157
+ %143 = bitcast i32 %139 to float, !dbg !30
158
+ %144 = bitcast i32 %140 to float, !dbg !30
159
+ %145 = bitcast i32 %141 to float, !dbg !30
160
+ %146 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %118, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !30
161
+ %147 = extractvalue { i32, i32, i32, i32 } %146, 0, !dbg !30
162
+ %148 = extractvalue { i32, i32, i32, i32 } %146, 1, !dbg !30
163
+ %149 = extractvalue { i32, i32, i32, i32 } %146, 2, !dbg !30
164
+ %150 = extractvalue { i32, i32, i32, i32 } %146, 3, !dbg !30
165
+ %151 = bitcast i32 %147 to float, !dbg !30
166
+ %152 = bitcast i32 %148 to float, !dbg !30
167
+ %153 = bitcast i32 %149 to float, !dbg !30
168
+ %154 = bitcast i32 %150 to float, !dbg !30
169
+ %155 = fadd float %64, %124, !dbg !31
170
+ %156 = fadd float %65, %125, !dbg !31
171
+ %157 = fadd float %66, %126, !dbg !31
172
+ %158 = fadd float %67, %127, !dbg !31
173
+ %159 = fadd float %73, %133, !dbg !31
174
+ %160 = fadd float %74, %134, !dbg !31
175
+ %161 = fadd float %75, %135, !dbg !31
176
+ %162 = fadd float %76, %136, !dbg !31
177
+ %163 = fadd float %82, %142, !dbg !31
178
+ %164 = fadd float %83, %143, !dbg !31
179
+ %165 = fadd float %84, %144, !dbg !31
180
+ %166 = fadd float %85, %145, !dbg !31
181
+ %167 = fadd float %91, %151, !dbg !31
182
+ %168 = fadd float %92, %152, !dbg !31
183
+ %169 = fadd float %93, %153, !dbg !31
184
+ %170 = fadd float %94, %154, !dbg !31
185
+ %171 = fadd float %155, 0.000000e+00, !dbg !32
186
+ %172 = fadd float %156, 0.000000e+00, !dbg !32
187
+ %173 = fadd float %157, 0.000000e+00, !dbg !32
188
+ %174 = fadd float %158, 0.000000e+00, !dbg !32
189
+ %175 = fadd float %159, 0.000000e+00, !dbg !32
190
+ %176 = fadd float %160, 0.000000e+00, !dbg !32
191
+ %177 = fadd float %161, 0.000000e+00, !dbg !32
192
+ %178 = fadd float %162, 0.000000e+00, !dbg !32
193
+ %179 = fadd float %163, 0.000000e+00, !dbg !32
194
+ %180 = fadd float %164, 0.000000e+00, !dbg !32
195
+ %181 = fadd float %165, 0.000000e+00, !dbg !32
196
+ %182 = fadd float %166, 0.000000e+00, !dbg !32
197
+ %183 = fadd float %167, 0.000000e+00, !dbg !32
198
+ %184 = fadd float %168, 0.000000e+00, !dbg !32
199
+ %185 = fadd float %169, 0.000000e+00, !dbg !32
200
+ %186 = fadd float %170, 0.000000e+00, !dbg !32
201
+ %187 = fsub float %155, %171, !dbg !36
202
+ %188 = fsub float %156, %172, !dbg !36
203
+ %189 = fsub float %157, %173, !dbg !36
204
+ %190 = fsub float %158, %174, !dbg !36
205
+ %191 = fsub float %159, %175, !dbg !36
206
+ %192 = fsub float %160, %176, !dbg !36
207
+ %193 = fsub float %161, %177, !dbg !36
208
+ %194 = fsub float %162, %178, !dbg !36
209
+ %195 = fsub float %163, %179, !dbg !36
210
+ %196 = fsub float %164, %180, !dbg !36
211
+ %197 = fsub float %165, %181, !dbg !36
212
+ %198 = fsub float %166, %182, !dbg !36
213
+ %199 = fsub float %167, %183, !dbg !36
214
+ %200 = fsub float %168, %184, !dbg !36
215
+ %201 = fsub float %169, %185, !dbg !36
216
+ %202 = fsub float %170, %186, !dbg !36
217
+ %203 = fmul float %155, %187, !dbg !37
218
+ %204 = fmul float %156, %188, !dbg !37
219
+ %205 = fmul float %157, %189, !dbg !37
220
+ %206 = fmul float %158, %190, !dbg !37
221
+ %207 = fmul float %159, %191, !dbg !37
222
+ %208 = fmul float %160, %192, !dbg !37
223
+ %209 = fmul float %161, %193, !dbg !37
224
+ %210 = fmul float %162, %194, !dbg !37
225
+ %211 = fmul float %163, %195, !dbg !37
226
+ %212 = fmul float %164, %196, !dbg !37
227
+ %213 = fmul float %165, %197, !dbg !37
228
+ %214 = fmul float %166, %198, !dbg !37
229
+ %215 = fmul float %167, %199, !dbg !37
230
+ %216 = fmul float %168, %200, !dbg !37
231
+ %217 = fmul float %169, %201, !dbg !37
232
+ %218 = fmul float %170, %202, !dbg !37
233
+ %219 = fadd float %203, 0.000000e+00, !dbg !38
234
+ %220 = fadd float %204, 0.000000e+00, !dbg !38
235
+ %221 = fadd float %205, 0.000000e+00, !dbg !38
236
+ %222 = fadd float %206, 0.000000e+00, !dbg !38
237
+ %223 = fadd float %207, 0.000000e+00, !dbg !38
238
+ %224 = fadd float %208, 0.000000e+00, !dbg !38
239
+ %225 = fadd float %209, 0.000000e+00, !dbg !38
240
+ %226 = fadd float %210, 0.000000e+00, !dbg !38
241
+ %227 = fadd float %211, 0.000000e+00, !dbg !38
242
+ %228 = fadd float %212, 0.000000e+00, !dbg !38
243
+ %229 = fadd float %213, 0.000000e+00, !dbg !38
244
+ %230 = fadd float %214, 0.000000e+00, !dbg !38
245
+ %231 = fadd float %215, 0.000000e+00, !dbg !38
246
+ %232 = fadd float %216, 0.000000e+00, !dbg !38
247
+ %233 = fadd float %217, 0.000000e+00, !dbg !38
248
+ %234 = fadd float %218, 0.000000e+00, !dbg !38
249
+ %235 = fsub float %172, %171, !dbg !39
250
+ %236 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 2.000000e+00) #6, !dbg !43
251
+ %237 = fmul float %236, %235, !dbg !44
252
+ %238 = fadd float %171, %237, !dbg !45
253
+ %239 = fadd float %219, %220, !dbg !46
254
+ %240 = fmul float %235, %235, !dbg !47
255
+ %241 = fmul float %236, %240, !dbg !48
256
+ %242 = fadd float %241, %239, !dbg !49
257
+ %243 = fsub float %173, %238, !dbg !39
258
+ %244 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 3.000000e+00) #6, !dbg !43
259
+ %245 = fmul float %244, %243, !dbg !44
260
+ %246 = fadd float %238, %245, !dbg !45
261
+ %247 = fadd float %221, %242, !dbg !46
262
+ %248 = fmul float %243, %243, !dbg !47
263
+ %249 = fmul float %248, 2.000000e+00, !dbg !50
264
+ %250 = fmul float %244, %249, !dbg !48
265
+ %251 = fadd float %247, %250, !dbg !49
266
+ %252 = fsub float %174, %246, !dbg !39
267
+ %253 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 4.000000e+00) #6, !dbg !43
268
+ %254 = fmul float %253, %252, !dbg !44
269
+ %255 = fadd float %246, %254, !dbg !45
270
+ %256 = fadd float %222, %251, !dbg !46
271
+ %257 = fmul float %252, %252, !dbg !47
272
+ %258 = fmul float %257, 3.000000e+00, !dbg !50
273
+ %259 = fmul float %253, %258, !dbg !48
274
+ %260 = fadd float %256, %259, !dbg !49
275
+ %261 = fsub float %175, %255, !dbg !39
276
+ %262 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 5.000000e+00) #6, !dbg !43
277
+ %263 = fmul float %262, %261, !dbg !44
278
+ %264 = fadd float %255, %263, !dbg !45
279
+ %265 = fadd float %223, %260, !dbg !46
280
+ %266 = fmul float %261, %261, !dbg !47
281
+ %267 = fmul float %266, 4.000000e+00, !dbg !50
282
+ %268 = fmul float %262, %267, !dbg !48
283
+ %269 = fadd float %265, %268, !dbg !49
284
+ %270 = fsub float %176, %264, !dbg !39
285
+ %271 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 6.000000e+00) #6, !dbg !43
286
+ %272 = fmul float %271, %270, !dbg !44
287
+ %273 = fadd float %264, %272, !dbg !45
288
+ %274 = fadd float %224, %269, !dbg !46
289
+ %275 = fmul float %270, %270, !dbg !47
290
+ %276 = fmul float %275, 5.000000e+00, !dbg !50
291
+ %277 = fmul float %271, %276, !dbg !48
292
+ %278 = fadd float %274, %277, !dbg !49
293
+ %279 = fsub float %177, %273, !dbg !39
294
+ %280 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 7.000000e+00) #6, !dbg !43
295
+ %281 = fmul float %280, %279, !dbg !44
296
+ %282 = fadd float %273, %281, !dbg !45
297
+ %283 = fadd float %225, %278, !dbg !46
298
+ %284 = fmul float %279, %279, !dbg !47
299
+ %285 = fmul float %284, 6.000000e+00, !dbg !50
300
+ %286 = fmul float %280, %285, !dbg !48
301
+ %287 = fadd float %283, %286, !dbg !49
302
+ %288 = fsub float %178, %282, !dbg !39
303
+ %289 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 8.000000e+00) #6, !dbg !43
304
+ %290 = fmul float %289, %288, !dbg !44
305
+ %291 = fadd float %282, %290, !dbg !45
306
+ %292 = fadd float %226, %287, !dbg !46
307
+ %293 = fmul float %288, %288, !dbg !47
308
+ %294 = fmul float %293, 7.000000e+00, !dbg !50
309
+ %295 = fmul float %289, %294, !dbg !48
310
+ %296 = fadd float %292, %295, !dbg !49
311
+ %297 = fsub float %180, %179, !dbg !39
312
+ %298 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 2.000000e+00) #6, !dbg !43
313
+ %299 = fmul float %297, %298, !dbg !44
314
+ %300 = fadd float %179, %299, !dbg !45
315
+ %301 = fadd float %227, %228, !dbg !46
316
+ %302 = fmul float %297, %297, !dbg !47
317
+ %303 = fmul float %302, %298, !dbg !48
318
+ %304 = fadd float %301, %303, !dbg !49
319
+ %305 = fsub float %181, %300, !dbg !39
320
+ %306 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 3.000000e+00) #6, !dbg !43
321
+ %307 = fmul float %306, %305, !dbg !44
322
+ %308 = fadd float %300, %307, !dbg !45
323
+ %309 = fadd float %229, %304, !dbg !46
324
+ %310 = fmul float %305, %305, !dbg !47
325
+ %311 = fmul float %310, 2.000000e+00, !dbg !50
326
+ %312 = fmul float %306, %311, !dbg !48
327
+ %313 = fadd float %309, %312, !dbg !49
328
+ %314 = fsub float %182, %308, !dbg !39
329
+ %315 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 4.000000e+00) #6, !dbg !43
330
+ %316 = fmul float %315, %314, !dbg !44
331
+ %317 = fadd float %308, %316, !dbg !45
332
+ %318 = fadd float %230, %313, !dbg !46
333
+ %319 = fmul float %314, %314, !dbg !47
334
+ %320 = fmul float %319, 3.000000e+00, !dbg !50
335
+ %321 = fmul float %315, %320, !dbg !48
336
+ %322 = fadd float %318, %321, !dbg !49
337
+ %323 = fsub float %183, %317, !dbg !39
338
+ %324 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 5.000000e+00) #6, !dbg !43
339
+ %325 = fmul float %324, %323, !dbg !44
340
+ %326 = fadd float %317, %325, !dbg !45
341
+ %327 = fadd float %231, %322, !dbg !46
342
+ %328 = fmul float %323, %323, !dbg !47
343
+ %329 = fmul float %328, 4.000000e+00, !dbg !50
344
+ %330 = fmul float %324, %329, !dbg !48
345
+ %331 = fadd float %327, %330, !dbg !49
346
+ %332 = fsub float %184, %326, !dbg !39
347
+ %333 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 6.000000e+00) #6, !dbg !43
348
+ %334 = fmul float %333, %332, !dbg !44
349
+ %335 = fadd float %326, %334, !dbg !45
350
+ %336 = fadd float %232, %331, !dbg !46
351
+ %337 = fmul float %332, %332, !dbg !47
352
+ %338 = fmul float %337, 5.000000e+00, !dbg !50
353
+ %339 = fmul float %333, %338, !dbg !48
354
+ %340 = fadd float %336, %339, !dbg !49
355
+ %341 = fsub float %185, %335, !dbg !39
356
+ %342 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 7.000000e+00) #6, !dbg !43
357
+ %343 = fmul float %342, %341, !dbg !44
358
+ %344 = fadd float %335, %343, !dbg !45
359
+ %345 = fadd float %233, %340, !dbg !46
360
+ %346 = fmul float %341, %341, !dbg !47
361
+ %347 = fmul float %346, 6.000000e+00, !dbg !50
362
+ %348 = fmul float %342, %347, !dbg !48
363
+ %349 = fadd float %345, %348, !dbg !49
364
+ %350 = fsub float %186, %344, !dbg !39
365
+ %351 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 8.000000e+00) #6, !dbg !43
366
+ %352 = fmul float %351, %350, !dbg !44
367
+ %353 = fadd float %344, %352, !dbg !45
368
+ %354 = fadd float %234, %349, !dbg !46
369
+ %355 = fmul float %350, %350, !dbg !47
370
+ %356 = fmul float %355, 7.000000e+00, !dbg !50
371
+ %357 = fmul float %351, %356, !dbg !48
372
+ %358 = fadd float %354, %357, !dbg !49
373
+ %359 = bitcast float %291 to i32, !dbg !51
374
+ %360 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %359, i32 16, i32 31), !dbg !51
375
+ %361 = bitcast i32 %360 to float, !dbg !51
376
+ %362 = bitcast float %296 to i32, !dbg !51
377
+ %363 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %362, i32 16, i32 31), !dbg !51
378
+ %364 = bitcast i32 %363 to float, !dbg !51
379
+ %365 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 1090519040, i32 16, i32 31), !dbg !51
380
+ %366 = bitcast i32 %365 to float, !dbg !51
381
+ %367 = fsub float %361, %291, !dbg !39
382
+ %368 = fadd float %366, 8.000000e+00, !dbg !53
383
+ %369 = fcmp oeq float %368, 0.000000e+00, !dbg !54
384
+ %370 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %366, float %368) #6, !dbg !43
385
+ %371 = select i1 %369, float 0.000000e+00, float %370, !dbg !55
386
+ %372 = fmul float %371, %367, !dbg !44
387
+ %373 = fadd float %291, %372, !dbg !45
388
+ %374 = fadd float %296, %364, !dbg !46
389
+ %375 = fmul float %367, %367, !dbg !47
390
+ %376 = fmul float %375, 8.000000e+00, !dbg !50
391
+ %377 = fmul float %371, %376, !dbg !48
392
+ %378 = fadd float %374, %377, !dbg !49
393
+ %379 = bitcast float %373 to i32, !dbg !51
394
+ %380 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %379, i32 8, i32 31), !dbg !51
395
+ %381 = bitcast i32 %380 to float, !dbg !51
396
+ %382 = bitcast float %378 to i32, !dbg !51
397
+ %383 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %382, i32 8, i32 31), !dbg !51
398
+ %384 = bitcast i32 %383 to float, !dbg !51
399
+ %385 = bitcast float %368 to i32, !dbg !51
400
+ %386 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %385, i32 8, i32 31), !dbg !51
401
+ %387 = bitcast i32 %386 to float, !dbg !51
402
+ %388 = fsub float %381, %373, !dbg !39
403
+ %389 = fadd float %368, %387, !dbg !53
404
+ %390 = fcmp oeq float %389, 0.000000e+00, !dbg !54
405
+ %391 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %387, float %389) #6, !dbg !43
406
+ %392 = select i1 %390, float 0.000000e+00, float %391, !dbg !55
407
+ %393 = fmul float %392, %388, !dbg !44
408
+ %394 = fadd float %373, %393, !dbg !45
409
+ %395 = fadd float %378, %384, !dbg !46
410
+ %396 = fmul float %388, %388, !dbg !47
411
+ %397 = fmul float %368, %396, !dbg !50
412
+ %398 = fmul float %392, %397, !dbg !48
413
+ %399 = fadd float %395, %398, !dbg !49
414
+ %400 = bitcast float %394 to i32, !dbg !51
415
+ %401 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %400, i32 4, i32 31), !dbg !51
416
+ %402 = bitcast i32 %401 to float, !dbg !51
417
+ %403 = bitcast float %399 to i32, !dbg !51
418
+ %404 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %403, i32 4, i32 31), !dbg !51
419
+ %405 = bitcast i32 %404 to float, !dbg !51
420
+ %406 = bitcast float %389 to i32, !dbg !51
421
+ %407 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %406, i32 4, i32 31), !dbg !51
422
+ %408 = bitcast i32 %407 to float, !dbg !51
423
+ %409 = fsub float %402, %394, !dbg !39
424
+ %410 = fadd float %389, %408, !dbg !53
425
+ %411 = fcmp oeq float %410, 0.000000e+00, !dbg !54
426
+ %412 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %408, float %410) #6, !dbg !43
427
+ %413 = select i1 %411, float 0.000000e+00, float %412, !dbg !55
428
+ %414 = fmul float %409, %413, !dbg !44
429
+ %415 = fadd float %394, %414, !dbg !45
430
+ %416 = fadd float %399, %405, !dbg !46
431
+ %417 = fmul float %409, %409, !dbg !47
432
+ %418 = fmul float %389, %417, !dbg !50
433
+ %419 = fmul float %413, %418, !dbg !48
434
+ %420 = fadd float %416, %419, !dbg !49
435
+ %421 = bitcast float %415 to i32, !dbg !51
436
+ %422 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %421, i32 2, i32 31), !dbg !51
437
+ %423 = bitcast i32 %422 to float, !dbg !51
438
+ %424 = bitcast float %420 to i32, !dbg !51
439
+ %425 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %424, i32 2, i32 31), !dbg !51
440
+ %426 = bitcast i32 %425 to float, !dbg !51
441
+ %427 = bitcast float %410 to i32, !dbg !51
442
+ %428 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %427, i32 2, i32 31), !dbg !51
443
+ %429 = bitcast i32 %428 to float, !dbg !51
444
+ %430 = fsub float %423, %415, !dbg !39
445
+ %431 = fadd float %410, %429, !dbg !53
446
+ %432 = fcmp oeq float %431, 0.000000e+00, !dbg !54
447
+ %433 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %429, float %431) #6, !dbg !43
448
+ %434 = select i1 %432, float 0.000000e+00, float %433, !dbg !55
449
+ %435 = fmul float %430, %434, !dbg !44
450
+ %436 = fadd float %415, %435, !dbg !45
451
+ %437 = fadd float %420, %426, !dbg !46
452
+ %438 = fmul float %430, %430, !dbg !47
453
+ %439 = fmul float %410, %438, !dbg !50
454
+ %440 = fmul float %434, %439, !dbg !48
455
+ %441 = fadd float %437, %440, !dbg !49
456
+ %442 = bitcast float %436 to i32, !dbg !51
457
+ %443 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %442, i32 1, i32 31), !dbg !51
458
+ %444 = bitcast float %441 to i32, !dbg !51
459
+ %445 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %444, i32 1, i32 31), !dbg !51
460
+ %446 = bitcast float %431 to i32, !dbg !51
461
+ %447 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %446, i32 1, i32 31), !dbg !51
462
+ %448 = bitcast i32 %447 to float, !dbg !51
463
+ %449 = fadd float %431, %448, !dbg !53
464
+ %450 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %448, float %449) #6, !dbg !43
465
+ %451 = bitcast float %353 to i32, !dbg !51
466
+ %452 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %451, i32 16, i32 31), !dbg !51
467
+ %453 = bitcast i32 %452 to float, !dbg !51
468
+ %454 = bitcast float %358 to i32, !dbg !51
469
+ %455 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %454, i32 16, i32 31), !dbg !51
470
+ %456 = bitcast i32 %455 to float, !dbg !51
471
+ %457 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 1090519040, i32 16, i32 31), !dbg !51
472
+ %458 = bitcast i32 %457 to float, !dbg !51
473
+ %459 = fsub float %453, %353, !dbg !39
474
+ %460 = fadd float %458, 8.000000e+00, !dbg !53
475
+ %461 = fcmp oeq float %460, 0.000000e+00, !dbg !54
476
+ %462 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %458, float %460) #6, !dbg !43
477
+ %463 = select i1 %461, float 0.000000e+00, float %462, !dbg !55
478
+ %464 = fmul float %459, %463, !dbg !44
479
+ %465 = fadd float %353, %464, !dbg !45
480
+ %466 = fadd float %358, %456, !dbg !46
481
+ %467 = fmul float %459, %459, !dbg !47
482
+ %468 = fmul float %467, 8.000000e+00, !dbg !50
483
+ %469 = fmul float %468, %463, !dbg !48
484
+ %470 = fadd float %466, %469, !dbg !49
485
+ %471 = bitcast float %465 to i32, !dbg !51
486
+ %472 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %471, i32 8, i32 31), !dbg !51
487
+ %473 = bitcast i32 %472 to float, !dbg !51
488
+ %474 = bitcast float %470 to i32, !dbg !51
489
+ %475 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %474, i32 8, i32 31), !dbg !51
490
+ %476 = bitcast i32 %475 to float, !dbg !51
491
+ %477 = bitcast float %460 to i32, !dbg !51
492
+ %478 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %477, i32 8, i32 31), !dbg !51
493
+ %479 = bitcast i32 %478 to float, !dbg !51
494
+ %480 = fsub float %473, %465, !dbg !39
495
+ %481 = fadd float %460, %479, !dbg !53
496
+ %482 = fcmp oeq float %481, 0.000000e+00, !dbg !54
497
+ %483 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %479, float %481) #6, !dbg !43
498
+ %484 = select i1 %482, float 0.000000e+00, float %483, !dbg !55
499
+ %485 = fmul float %480, %484, !dbg !44
500
+ %486 = fadd float %465, %485, !dbg !45
501
+ %487 = fadd float %470, %476, !dbg !46
502
+ %488 = fmul float %480, %480, !dbg !47
503
+ %489 = fmul float %460, %488, !dbg !50
504
+ %490 = fmul float %484, %489, !dbg !48
505
+ %491 = fadd float %487, %490, !dbg !49
506
+ %492 = bitcast float %486 to i32, !dbg !51
507
+ %493 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %492, i32 4, i32 31), !dbg !51
508
+ %494 = bitcast i32 %493 to float, !dbg !51
509
+ %495 = bitcast float %491 to i32, !dbg !51
510
+ %496 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %495, i32 4, i32 31), !dbg !51
511
+ %497 = bitcast i32 %496 to float, !dbg !51
512
+ %498 = bitcast float %481 to i32, !dbg !51
513
+ %499 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %498, i32 4, i32 31), !dbg !51
514
+ %500 = bitcast i32 %499 to float, !dbg !51
515
+ %501 = fsub float %494, %486, !dbg !39
516
+ %502 = fadd float %481, %500, !dbg !53
517
+ %503 = fcmp oeq float %502, 0.000000e+00, !dbg !54
518
+ %504 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %500, float %502) #6, !dbg !43
519
+ %505 = select i1 %503, float 0.000000e+00, float %504, !dbg !55
520
+ %506 = fmul float %501, %505, !dbg !44
521
+ %507 = fadd float %486, %506, !dbg !45
522
+ %508 = fadd float %491, %497, !dbg !46
523
+ %509 = fmul float %501, %501, !dbg !47
524
+ %510 = fmul float %481, %509, !dbg !50
525
+ %511 = fmul float %505, %510, !dbg !48
526
+ %512 = fadd float %508, %511, !dbg !49
527
+ %513 = bitcast float %507 to i32, !dbg !51
528
+ %514 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %513, i32 2, i32 31), !dbg !51
529
+ %515 = bitcast i32 %514 to float, !dbg !51
530
+ %516 = bitcast float %512 to i32, !dbg !51
531
+ %517 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %516, i32 2, i32 31), !dbg !51
532
+ %518 = bitcast i32 %517 to float, !dbg !51
533
+ %519 = bitcast float %502 to i32, !dbg !51
534
+ %520 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %519, i32 2, i32 31), !dbg !51
535
+ %521 = bitcast i32 %520 to float, !dbg !51
536
+ %522 = fsub float %515, %507, !dbg !39
537
+ %523 = fadd float %502, %521, !dbg !53
538
+ %524 = fcmp oeq float %523, 0.000000e+00, !dbg !54
539
+ %525 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %521, float %523) #6, !dbg !43
540
+ %526 = select i1 %524, float 0.000000e+00, float %525, !dbg !55
541
+ %527 = fmul float %522, %526, !dbg !44
542
+ %528 = fadd float %507, %527, !dbg !45
543
+ %529 = fadd float %512, %518, !dbg !46
544
+ %530 = fmul float %522, %522, !dbg !47
545
+ %531 = fmul float %502, %530, !dbg !50
546
+ %532 = fmul float %526, %531, !dbg !48
547
+ %533 = fadd float %529, %532, !dbg !49
548
+ %534 = bitcast float %528 to i32, !dbg !51
549
+ %535 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %534, i32 1, i32 31), !dbg !51
550
+ %536 = bitcast float %533 to i32, !dbg !51
551
+ %537 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %536, i32 1, i32 31), !dbg !51
552
+ %538 = bitcast float %523 to i32, !dbg !51
553
+ %539 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %538, i32 1, i32 31), !dbg !51
554
+ %540 = bitcast i32 %539 to float, !dbg !51
555
+ %541 = fadd float %523, %540, !dbg !53
556
+ %542 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %540, float %541) #6, !dbg !43
557
+ %543 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %52, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !56
558
+ %544 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %54, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !56
559
+ %545 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %56, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !56
560
+ %546 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %58, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !56
561
+ %547 = zext nneg i32 %urem to i64, !dbg !57
562
+ %548 = getelementptr float, ptr addrspace(1) %3, i64 %547, !dbg !57
563
+ %549 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %548, i1 true, i32 0, i1 true) #6, !dbg !58
564
+ br i1 %100, label %550, label %551, !dbg !59
565
+
566
+ 550: ; preds = %102
567
+ tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 883, ptr nonnull @assertFunc_1, i64 1), !dbg !59
568
+ br label %551, !dbg !59
569
+
570
+ 551: ; preds = %550, %102
571
+ %552 = bitcast i32 %537 to float, !dbg !51
572
+ %553 = fadd float %533, %552, !dbg !46
573
+ %554 = bitcast i32 %535 to float, !dbg !51
574
+ %555 = fsub float %554, %528, !dbg !39
575
+ %556 = fmul float %555, %555, !dbg !47
576
+ %557 = fmul float %523, %556, !dbg !50
577
+ %558 = fcmp oeq float %541, 0.000000e+00, !dbg !54
578
+ %559 = select i1 %558, float 0.000000e+00, float %542, !dbg !55
579
+ %560 = fmul float %559, %557, !dbg !48
580
+ %561 = fadd float %553, %560, !dbg !49
581
+ %562 = bitcast i32 %445 to float, !dbg !51
582
+ %563 = fadd float %441, %562, !dbg !46
583
+ %564 = bitcast i32 %443 to float, !dbg !51
584
+ %565 = fsub float %564, %436, !dbg !39
585
+ %566 = fmul float %565, %565, !dbg !47
586
+ %567 = fmul float %431, %566, !dbg !50
587
+ %568 = fcmp oeq float %449, 0.000000e+00, !dbg !54
588
+ %569 = select i1 %568, float 0.000000e+00, float %450, !dbg !55
589
+ %570 = fmul float %569, %567, !dbg !48
590
+ %571 = fadd float %563, %570, !dbg !49
591
+ %572 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %115, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !60
592
+ %573 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %116, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !60
593
+ %574 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %117, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !60
594
+ %575 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %118, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !60
595
+ %576 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %571, float 2.560000e+02) #6, !dbg !61
596
+ %577 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %571, float 2.560000e+02) #6, !dbg !61
597
+ %578 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %571, float 2.560000e+02) #6, !dbg !61
598
+ %579 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %571, float 2.560000e+02) #6, !dbg !61
599
+ %580 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %571, float 2.560000e+02) #6, !dbg !61
600
+ %581 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %571, float 2.560000e+02) #6, !dbg !61
601
+ %582 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %571, float 2.560000e+02) #6, !dbg !61
602
+ %583 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %571, float 2.560000e+02) #6, !dbg !61
603
+ %584 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %561, float 2.560000e+02) #6, !dbg !61
604
+ %585 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %561, float 2.560000e+02) #6, !dbg !61
605
+ %586 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %561, float 2.560000e+02) #6, !dbg !61
606
+ %587 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %561, float 2.560000e+02) #6, !dbg !61
607
+ %588 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %561, float 2.560000e+02) #6, !dbg !61
608
+ %589 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %561, float 2.560000e+02) #6, !dbg !61
609
+ %590 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %561, float 2.560000e+02) #6, !dbg !61
610
+ %591 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %561, float 2.560000e+02) #6, !dbg !61
611
+ %592 = fadd float %576, 0x3EE4F8B580000000, !dbg !62
612
+ %593 = fadd float %584, 0x3EE4F8B580000000, !dbg !62
613
+ %594 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
614
+ %.not.i = icmp eq i32 %594, 0, !dbg !63
615
+ br i1 %.not.i, label %597, label %595, !dbg !63
616
+
617
+ 595: ; preds = %551
618
+ %596 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %592), !dbg !63
619
+ br label %__nv_rsqrtf.exit, !dbg !63
620
+
621
+ 597: ; preds = %551
622
+ %598 = tail call float @llvm.nvvm.rsqrt.approx.f(float %592), !dbg !63
623
+ br label %__nv_rsqrtf.exit, !dbg !63
624
+
625
+ __nv_rsqrtf.exit: ; preds = %595, %597
626
+ %.0.i = phi float [ %596, %595 ], [ %598, %597 ], !dbg !63
627
+ %599 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
628
+ %600 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
629
+ %601 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
630
+ %602 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
631
+ %603 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
632
+ %604 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
633
+ %605 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
634
+ %606 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
635
+ %.not.i22 = icmp eq i32 %606, 0, !dbg !63
636
+ br i1 %.not.i22, label %609, label %607, !dbg !63
637
+
638
+ 607: ; preds = %__nv_rsqrtf.exit
639
+ %608 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %593), !dbg !63
640
+ br label %__nv_rsqrtf.exit24, !dbg !63
641
+
642
+ 609: ; preds = %__nv_rsqrtf.exit
643
+ %610 = tail call float @llvm.nvvm.rsqrt.approx.f(float %593), !dbg !63
644
+ br label %__nv_rsqrtf.exit24, !dbg !63
645
+
646
+ __nv_rsqrtf.exit24: ; preds = %607, %609
647
+ %.0.i23 = phi float [ %608, %607 ], [ %610, %609 ], !dbg !63
648
+ %611 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
649
+ %612 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
650
+ %613 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
651
+ %614 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
652
+ %615 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
653
+ %616 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
654
+ %617 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
655
+ %618 = extractvalue { i32, i32, i32, i32 } %575, 3, !dbg !60
656
+ %619 = bitcast i32 %618 to float, !dbg !60
657
+ %620 = extractvalue { i32, i32, i32, i32 } %546, 3, !dbg !56
658
+ %621 = bitcast i32 %620 to float, !dbg !56
659
+ %622 = fadd float %621, %619, !dbg !64
660
+ %623 = fmul float %555, %559, !dbg !44
661
+ %624 = fadd float %528, %623, !dbg !45
662
+ %625 = fsub float %622, %624, !dbg !65
663
+ %626 = extractvalue { i32, i32, i32, i32 } %575, 2, !dbg !60
664
+ %627 = bitcast i32 %626 to float, !dbg !60
665
+ %628 = extractvalue { i32, i32, i32, i32 } %546, 2, !dbg !56
666
+ %629 = bitcast i32 %628 to float, !dbg !56
667
+ %630 = fadd float %629, %627, !dbg !64
668
+ %631 = fsub float %630, %624, !dbg !65
669
+ %632 = extractvalue { i32, i32, i32, i32 } %575, 1, !dbg !60
670
+ %633 = bitcast i32 %632 to float, !dbg !60
671
+ %634 = extractvalue { i32, i32, i32, i32 } %546, 1, !dbg !56
672
+ %635 = bitcast i32 %634 to float, !dbg !56
673
+ %636 = fadd float %635, %633, !dbg !64
674
+ %637 = fsub float %636, %624, !dbg !65
675
+ %638 = extractvalue { i32, i32, i32, i32 } %575, 0, !dbg !60
676
+ %639 = bitcast i32 %638 to float, !dbg !60
677
+ %640 = extractvalue { i32, i32, i32, i32 } %546, 0, !dbg !56
678
+ %641 = bitcast i32 %640 to float, !dbg !56
679
+ %642 = fadd float %641, %639, !dbg !64
680
+ %643 = fsub float %642, %624, !dbg !65
681
+ %644 = extractvalue { i32, i32, i32, i32 } %574, 3, !dbg !60
682
+ %645 = bitcast i32 %644 to float, !dbg !60
683
+ %646 = extractvalue { i32, i32, i32, i32 } %545, 3, !dbg !56
684
+ %647 = bitcast i32 %646 to float, !dbg !56
685
+ %648 = fadd float %647, %645, !dbg !64
686
+ %649 = fsub float %648, %624, !dbg !65
687
+ %650 = extractvalue { i32, i32, i32, i32 } %574, 2, !dbg !60
688
+ %651 = bitcast i32 %650 to float, !dbg !60
689
+ %652 = extractvalue { i32, i32, i32, i32 } %545, 2, !dbg !56
690
+ %653 = bitcast i32 %652 to float, !dbg !56
691
+ %654 = fadd float %653, %651, !dbg !64
692
+ %655 = fsub float %654, %624, !dbg !65
693
+ %656 = extractvalue { i32, i32, i32, i32 } %574, 1, !dbg !60
694
+ %657 = bitcast i32 %656 to float, !dbg !60
695
+ %658 = extractvalue { i32, i32, i32, i32 } %545, 1, !dbg !56
696
+ %659 = bitcast i32 %658 to float, !dbg !56
697
+ %660 = fadd float %659, %657, !dbg !64
698
+ %661 = fsub float %660, %624, !dbg !65
699
+ %662 = extractvalue { i32, i32, i32, i32 } %574, 0, !dbg !60
700
+ %663 = bitcast i32 %662 to float, !dbg !60
701
+ %664 = extractvalue { i32, i32, i32, i32 } %545, 0, !dbg !56
702
+ %665 = bitcast i32 %664 to float, !dbg !56
703
+ %666 = fadd float %665, %663, !dbg !64
704
+ %667 = fsub float %666, %624, !dbg !65
705
+ %668 = extractvalue { i32, i32, i32, i32 } %573, 3, !dbg !60
706
+ %669 = bitcast i32 %668 to float, !dbg !60
707
+ %670 = extractvalue { i32, i32, i32, i32 } %544, 3, !dbg !56
708
+ %671 = bitcast i32 %670 to float, !dbg !56
709
+ %672 = fadd float %671, %669, !dbg !64
710
+ %673 = fmul float %565, %569, !dbg !44
711
+ %674 = fadd float %436, %673, !dbg !45
712
+ %675 = fsub float %672, %674, !dbg !65
713
+ %676 = extractvalue { i32, i32, i32, i32 } %573, 2, !dbg !60
714
+ %677 = bitcast i32 %676 to float, !dbg !60
715
+ %678 = extractvalue { i32, i32, i32, i32 } %544, 2, !dbg !56
716
+ %679 = bitcast i32 %678 to float, !dbg !56
717
+ %680 = fadd float %679, %677, !dbg !64
718
+ %681 = fsub float %680, %674, !dbg !65
719
+ %682 = extractvalue { i32, i32, i32, i32 } %573, 1, !dbg !60
720
+ %683 = bitcast i32 %682 to float, !dbg !60
721
+ %684 = extractvalue { i32, i32, i32, i32 } %544, 1, !dbg !56
722
+ %685 = bitcast i32 %684 to float, !dbg !56
723
+ %686 = fadd float %685, %683, !dbg !64
724
+ %687 = fsub float %686, %674, !dbg !65
725
+ %688 = extractvalue { i32, i32, i32, i32 } %573, 0, !dbg !60
726
+ %689 = bitcast i32 %688 to float, !dbg !60
727
+ %690 = extractvalue { i32, i32, i32, i32 } %544, 0, !dbg !56
728
+ %691 = bitcast i32 %690 to float, !dbg !56
729
+ %692 = fadd float %691, %689, !dbg !64
730
+ %693 = fsub float %692, %674, !dbg !65
731
+ %694 = extractvalue { i32, i32, i32, i32 } %572, 3, !dbg !60
732
+ %695 = bitcast i32 %694 to float, !dbg !60
733
+ %696 = extractvalue { i32, i32, i32, i32 } %543, 3, !dbg !56
734
+ %697 = bitcast i32 %696 to float, !dbg !56
735
+ %698 = fadd float %697, %695, !dbg !64
736
+ %699 = fsub float %698, %674, !dbg !65
737
+ %700 = extractvalue { i32, i32, i32, i32 } %572, 2, !dbg !60
738
+ %701 = bitcast i32 %700 to float, !dbg !60
739
+ %702 = extractvalue { i32, i32, i32, i32 } %543, 2, !dbg !56
740
+ %703 = bitcast i32 %702 to float, !dbg !56
741
+ %704 = fadd float %703, %701, !dbg !64
742
+ %705 = fsub float %704, %674, !dbg !65
743
+ %706 = extractvalue { i32, i32, i32, i32 } %572, 1, !dbg !60
744
+ %707 = bitcast i32 %706 to float, !dbg !60
745
+ %708 = extractvalue { i32, i32, i32, i32 } %543, 1, !dbg !56
746
+ %709 = bitcast i32 %708 to float, !dbg !56
747
+ %710 = fadd float %709, %707, !dbg !64
748
+ %711 = fsub float %710, %674, !dbg !65
749
+ %712 = extractvalue { i32, i32, i32, i32 } %572, 0, !dbg !60
750
+ %713 = bitcast i32 %712 to float, !dbg !60
751
+ %714 = extractvalue { i32, i32, i32, i32 } %543, 0, !dbg !56
752
+ %715 = bitcast i32 %714 to float, !dbg !56
753
+ %716 = fadd float %715, %713, !dbg !64
754
+ %717 = fsub float %716, %674, !dbg !65
755
+ %718 = fmul float %717, %.0.i, !dbg !66
756
+ %719 = fmul float %711, %.0.i, !dbg !66
757
+ %720 = fmul float %705, %.0.i, !dbg !66
758
+ %721 = fmul float %699, %.0.i, !dbg !66
759
+ %722 = fmul float %693, %.0.i, !dbg !66
760
+ %723 = fmul float %687, %.0.i, !dbg !66
761
+ %724 = fmul float %681, %.0.i, !dbg !66
762
+ %725 = fmul float %675, %.0.i, !dbg !66
763
+ %726 = fmul float %667, %.0.i23, !dbg !66
764
+ %727 = fmul float %661, %.0.i23, !dbg !66
765
+ %728 = fmul float %655, %.0.i23, !dbg !66
766
+ %729 = fmul float %649, %.0.i23, !dbg !66
767
+ %730 = fmul float %643, %.0.i23, !dbg !66
768
+ %731 = fmul float %637, %.0.i23, !dbg !66
769
+ %732 = fmul float %631, %.0.i23, !dbg !66
770
+ %733 = fmul float %625, %.0.i23, !dbg !66
771
+ %734 = getelementptr float, ptr addrspace(3) @global_smem, i64 %547, !dbg !67
772
+ store i32 %549, ptr addrspace(3) %734, align 4, !dbg !67
773
+ tail call void @llvm.nvvm.barrier0(), !dbg !67
774
+ %735 = getelementptr float, ptr addrspace(3) @global_smem, i64 %109, !dbg !67
775
+ %736 = load float, ptr addrspace(3) %735, align 32, !dbg !67
776
+ %737 = getelementptr inbounds <8 x float>, ptr addrspace(3) %735, i64 0, i64 1, !dbg !67
777
+ %738 = load float, ptr addrspace(3) %737, align 4, !dbg !67
778
+ %739 = getelementptr inbounds <8 x float>, ptr addrspace(3) %735, i64 0, i64 2, !dbg !67
779
+ %740 = load float, ptr addrspace(3) %739, align 8, !dbg !67
780
+ %741 = getelementptr inbounds <8 x float>, ptr addrspace(3) %735, i64 0, i64 3, !dbg !67
781
+ %742 = load float, ptr addrspace(3) %741, align 4, !dbg !67
782
+ %743 = getelementptr inbounds <8 x float>, ptr addrspace(3) %735, i64 0, i64 4, !dbg !67
783
+ %744 = load float, ptr addrspace(3) %743, align 16, !dbg !67
784
+ %745 = getelementptr inbounds <8 x float>, ptr addrspace(3) %735, i64 0, i64 5, !dbg !67
785
+ %746 = load float, ptr addrspace(3) %745, align 4, !dbg !67
786
+ %747 = getelementptr inbounds <8 x float>, ptr addrspace(3) %735, i64 0, i64 6, !dbg !67
787
+ %748 = load float, ptr addrspace(3) %747, align 8, !dbg !67
788
+ %749 = getelementptr inbounds <8 x float>, ptr addrspace(3) %735, i64 0, i64 7, !dbg !67
789
+ %750 = load float, ptr addrspace(3) %749, align 4, !dbg !67
790
+ %751 = fmul float %718, %736, !dbg !67
791
+ %752 = fmul float %719, %738, !dbg !67
792
+ %753 = fmul float %720, %740, !dbg !67
793
+ %754 = fmul float %721, %742, !dbg !67
794
+ %755 = fmul float %722, %744, !dbg !67
795
+ %756 = fmul float %723, %746, !dbg !67
796
+ %757 = fmul float %724, %748, !dbg !67
797
+ %758 = fmul float %725, %750, !dbg !67
798
+ %759 = fmul float %726, %736, !dbg !67
799
+ %760 = fmul float %727, %738, !dbg !67
800
+ %761 = fmul float %728, %740, !dbg !67
801
+ %762 = fmul float %729, %742, !dbg !67
802
+ %763 = fmul float %730, %744, !dbg !67
803
+ %764 = fmul float %731, %746, !dbg !67
804
+ %765 = fmul float %732, %748, !dbg !67
805
+ %766 = fmul float %733, %750, !dbg !67
806
+ %767 = shl i32 %17, 8, !dbg !68
807
+ %768 = shl i32 %18, 8, !dbg !68
808
+ %769 = or i32 %767, %13, !dbg !69
809
+ %770 = or i32 %768, %13, !dbg !69
810
+ %771 = sext i32 %769 to i64, !dbg !70
811
+ %772 = getelementptr i16, ptr addrspace(1) %4, i64 %771, !dbg !70
812
+ %773 = sext i32 %770 to i64, !dbg !70
813
+ %774 = getelementptr i16, ptr addrspace(1) %4, i64 %773, !dbg !70
814
+ %775 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %751) #6, !dbg !71
815
+ %776 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %752) #6, !dbg !71
816
+ %777 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %753) #6, !dbg !71
817
+ %778 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %754) #6, !dbg !71
818
+ %779 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %755) #6, !dbg !71
819
+ %780 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %756) #6, !dbg !71
820
+ %781 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %757) #6, !dbg !71
821
+ %782 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %758) #6, !dbg !71
822
+ %783 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %759) #6, !dbg !71
823
+ %784 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %760) #6, !dbg !71
824
+ %785 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %761) #6, !dbg !71
825
+ %786 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %762) #6, !dbg !71
826
+ %787 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %763) #6, !dbg !71
827
+ %788 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %764) #6, !dbg !71
828
+ %789 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %765) #6, !dbg !71
829
+ %790 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %766) #6, !dbg !71
830
+ %791 = insertelement <2 x i16> undef, i16 %775, i64 0, !dbg !71
831
+ %792 = insertelement <2 x i16> %791, i16 %776, i64 1, !dbg !71
832
+ %793 = bitcast <2 x i16> %792 to i32, !dbg !71
833
+ %794 = insertelement <2 x i16> undef, i16 %777, i64 0, !dbg !71
834
+ %795 = insertelement <2 x i16> %794, i16 %778, i64 1, !dbg !71
835
+ %796 = bitcast <2 x i16> %795 to i32, !dbg !71
836
+ %797 = insertelement <2 x i16> undef, i16 %779, i64 0, !dbg !71
837
+ %798 = insertelement <2 x i16> %797, i16 %780, i64 1, !dbg !71
838
+ %799 = bitcast <2 x i16> %798 to i32, !dbg !71
839
+ %800 = insertelement <2 x i16> undef, i16 %781, i64 0, !dbg !71
840
+ %801 = insertelement <2 x i16> %800, i16 %782, i64 1, !dbg !71
841
+ %802 = bitcast <2 x i16> %801 to i32, !dbg !71
842
+ tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %793, i32 %796, i32 %799, i32 %802, ptr addrspace(1) %772, i1 true) #6, !dbg !71
843
+ %803 = insertelement <2 x i16> undef, i16 %783, i64 0, !dbg !71
844
+ %804 = insertelement <2 x i16> %803, i16 %784, i64 1, !dbg !71
845
+ %805 = bitcast <2 x i16> %804 to i32, !dbg !71
846
+ %806 = insertelement <2 x i16> undef, i16 %785, i64 0, !dbg !71
847
+ %807 = insertelement <2 x i16> %806, i16 %786, i64 1, !dbg !71
848
+ %808 = bitcast <2 x i16> %807 to i32, !dbg !71
849
+ %809 = insertelement <2 x i16> undef, i16 %787, i64 0, !dbg !71
850
+ %810 = insertelement <2 x i16> %809, i16 %788, i64 1, !dbg !71
851
+ %811 = bitcast <2 x i16> %810 to i32, !dbg !71
852
+ %812 = insertelement <2 x i16> undef, i16 %789, i64 0, !dbg !71
853
+ %813 = insertelement <2 x i16> %812, i16 %790, i64 1, !dbg !71
854
+ %814 = bitcast <2 x i16> %813 to i32, !dbg !71
855
+ tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %805, i32 %808, i32 %811, i32 %814, ptr addrspace(1) %774, i1 true) #6, !dbg !71
856
+ ret void, !dbg !72
857
+ }
858
+
859
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
860
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
861
+
862
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
863
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
864
+
865
+ ; Function Attrs: convergent nocallback nounwind
866
+ declare void @llvm.nvvm.barrier0() #2
867
+
868
+ ; Function Attrs: alwaysinline nounwind
869
+ define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
870
+ %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
871
+ %.not = icmp eq i32 %1, 0
872
+ br i1 %.not, label %4, label %2
873
+
874
+ 2: ; preds = %0
875
+ %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
876
+ br label %6
877
+
878
+ 4: ; preds = %0
879
+ %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
880
+ br label %6
881
+
882
+ 6: ; preds = %4, %2
883
+ %.0 = phi float [ %3, %2 ], [ %5, %4 ]
884
+ ret float %.0
885
+ }
886
+
887
+ declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
888
+
889
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
890
+ declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
891
+
892
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
893
+ declare float @llvm.nvvm.rsqrt.approx.f(float) #5
894
+
895
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
896
+ attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
897
+ attributes #2 = { convergent nocallback nounwind }
898
+ attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
899
+ attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
900
+ attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
901
+ attributes #6 = { nounwind }
902
+
903
+ !llvm.module.flags = !{!0, !1}
904
+ !llvm.dbg.cu = !{!2}
905
+ !nvvm.annotations = !{!4, !5, !5, !4}
906
+ !llvm.ident = !{!6}
907
+
908
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
909
+ !1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
910
+ !2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
911
+ !3 = !DIFile(filename: "cgx5lxpuexpindj4dsmjz5x42uhyy7iskevq7ovzpwagb3t5powj.py", directory: "/tmp/torchinductor_root/gx")
912
+ !4 = !{ptr @triton__0d1d2d3d4d5de6de, !"kernel", i32 1}
913
+ !5 = !{ptr @triton__0d1d2d3d4d5de6de, !"maxntidx", i32 256}
914
+ !6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
915
+ !7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5de6de", linkageName: "triton__0d1d2d3d4d5de6de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
916
+ !8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
917
+ !9 = !{}
918
+ !10 = !DILocation(line: 22, column: 44, scope: !7)
919
+ !11 = !DILocation(line: 24, column: 33, scope: !7)
920
+ !12 = !DILocation(line: 21, column: 28, scope: !7)
921
+ !13 = !DILocation(line: 21, column: 33, scope: !7)
922
+ !14 = !DILocation(line: 22, column: 23, scope: !7)
923
+ !15 = !DILocation(line: 26, column: 30, scope: !7)
924
+ !16 = !DILocation(line: 26, column: 35, scope: !7)
925
+ !17 = !DILocation(line: 27, column: 18, scope: !7)
926
+ !18 = !DILocation(line: 35, column: 44, scope: !7)
927
+ !19 = !DILocation(line: 35, column: 40, scope: !7)
928
+ !20 = !DILocation(line: 35, column: 34, scope: !7)
929
+ !21 = !DILocation(line: 35, column: 50, scope: !7)
930
+ !22 = !DILocation(line: 36, column: 22, scope: !7)
931
+ !23 = !DILocation(line: 37, column: 22, scope: !7)
932
+ !24 = !DILocation(line: 38, column: 36, scope: !7)
933
+ !25 = !DILocation(line: 39, column: 40, scope: !7)
934
+ !26 = !DILocation(line: 39, column: 55, scope: !7)
935
+ !27 = !DILocation(line: 40, column: 44, scope: !7)
936
+ !28 = !DILocation(line: 40, column: 40, scope: !7)
937
+ !29 = !DILocation(line: 40, column: 34, scope: !7)
938
+ !30 = !DILocation(line: 40, column: 52, scope: !7)
939
+ !31 = !DILocation(line: 41, column: 22, scope: !7)
940
+ !32 = !DILocation(line: 98, column: 22, scope: !33, inlinedAt: !35)
941
+ !33 = distinct !DILexicalBlockFile(scope: !7, file: !34, discriminator: 0)
942
+ !34 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
943
+ !35 = !DILocation(line: 44, column: 38, scope: !33)
944
+ !36 = !DILocation(line: 101, column: 30, scope: !33, inlinedAt: !35)
945
+ !37 = !DILocation(line: 101, column: 22, scope: !33, inlinedAt: !35)
946
+ !38 = !DILocation(line: 101, column: 13, scope: !33, inlinedAt: !35)
947
+ !39 = !DILocation(line: 108, column: 21, scope: !40, inlinedAt: !41)
948
+ !40 = distinct !DILexicalBlockFile(scope: !33, file: !34, discriminator: 0)
949
+ !41 = !DILocation(line: 120, column: 46, scope: !40, inlinedAt: !42)
950
+ !42 = !DILocation(line: 50, column: 41, scope: !40)
951
+ !43 = !DILocation(line: 110, column: 60, scope: !40, inlinedAt: !41)
952
+ !44 = !DILocation(line: 112, column: 25, scope: !40, inlinedAt: !41)
953
+ !45 = !DILocation(line: 112, column: 17, scope: !40, inlinedAt: !41)
954
+ !46 = !DILocation(line: 113, column: 15, scope: !40, inlinedAt: !41)
955
+ !47 = !DILocation(line: 113, column: 30, scope: !40, inlinedAt: !41)
956
+ !48 = !DILocation(line: 113, column: 49, scope: !40, inlinedAt: !41)
957
+ !49 = !DILocation(line: 113, column: 22, scope: !40, inlinedAt: !41)
958
+ !50 = !DILocation(line: 113, column: 38, scope: !40, inlinedAt: !41)
959
+ !51 = !DILocation(line: 120, column: 46, scope: !33, inlinedAt: !52)
960
+ !52 = !DILocation(line: 50, column: 41, scope: !33)
961
+ !53 = !DILocation(line: 109, column: 28, scope: !40, inlinedAt: !41)
962
+ !54 = !DILocation(line: 110, column: 39, scope: !40, inlinedAt: !41)
963
+ !55 = !DILocation(line: 110, column: 49, scope: !40, inlinedAt: !41)
964
+ !56 = !DILocation(line: 59, column: 51, scope: !7)
965
+ !57 = !DILocation(line: 60, column: 35, scope: !7)
966
+ !58 = !DILocation(line: 60, column: 40, scope: !7)
967
+ !59 = !DILocation(line: 64, column: 57, scope: !7)
968
+ !60 = !DILocation(line: 65, column: 54, scope: !7)
969
+ !61 = !DILocation(line: 69, column: 23, scope: !7)
970
+ !62 = !DILocation(line: 71, column: 24, scope: !7)
971
+ !63 = !DILocation(line: 72, column: 30, scope: !7)
972
+ !64 = !DILocation(line: 66, column: 24, scope: !7)
973
+ !65 = !DILocation(line: 67, column: 24, scope: !7)
974
+ !66 = !DILocation(line: 73, column: 24, scope: !7)
975
+ !67 = !DILocation(line: 74, column: 24, scope: !7)
976
+ !68 = !DILocation(line: 76, column: 39, scope: !7)
977
+ !69 = !DILocation(line: 76, column: 35, scope: !7)
978
+ !70 = !DILocation(line: 76, column: 29, scope: !7)
979
+ !71 = !DILocation(line: 76, column: 52, scope: !7)
980
+ !72 = !DILocation(line: 55, column: 4, scope: !7)
.triton/dump/345a87a492fd703c73ab83265a21fcb6/triton_.ptx ADDED
@@ -0,0 +1,1654 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3d4d5de6de
10
+ .extern .func __assertfail
11
+ (
12
+ .param .b64 __assertfail_param_0,
13
+ .param .b64 __assertfail_param_1,
14
+ .param .b32 __assertfail_param_2,
15
+ .param .b64 __assertfail_param_3,
16
+ .param .b64 __assertfail_param_4
17
+ )
18
+ ;
19
+ .global .align 1 .b8 assertFunc_1[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
20
+ .global .align 1 .b8 assertFile_1[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
21
+ .global .align 1 .b8 assertMessage_1[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 51, 32, 60, 32, 53, 48, 50, 53, 55};
22
+ .global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
23
+ .global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
24
+ .global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55};
25
+ .extern .shared .align 1 .b8 global_smem[];
26
+ .global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
27
+
28
+ .visible .entry triton__0d1d2d3d4d5de6de(
29
+ .param .u64 triton__0d1d2d3d4d5de6de_param_0,
30
+ .param .u64 triton__0d1d2d3d4d5de6de_param_1,
31
+ .param .u64 triton__0d1d2d3d4d5de6de_param_2,
32
+ .param .u64 triton__0d1d2d3d4d5de6de_param_3,
33
+ .param .u64 triton__0d1d2d3d4d5de6de_param_4,
34
+ .param .u32 triton__0d1d2d3d4d5de6de_param_5,
35
+ .param .u32 triton__0d1d2d3d4d5de6de_param_6
36
+ )
37
+ .maxntid 256, 1, 1
38
+ {
39
+ .reg .pred %p<117>;
40
+ .reg .b16 %rs<17>;
41
+ .reg .b32 %r<375>;
42
+ .reg .f32 %f<423>;
43
+ .reg .b64 %rd<113>;
44
+ .loc 1 18 0
45
+ $L__func_begin0:
46
+ .loc 1 18 0
47
+
48
+ ld.param.u64 %rd13, [triton__0d1d2d3d4d5de6de_param_3];
49
+ ld.param.u64 %rd12, [triton__0d1d2d3d4d5de6de_param_1];
50
+ ld.param.u64 %rd53, [triton__0d1d2d3d4d5de6de_param_0];
51
+ $L__tmp0:
52
+ .loc 1 22 44
53
+ mov.u32 %r59, %tid.x;
54
+ ld.param.u64 %rd54, [triton__0d1d2d3d4d5de6de_param_2];
55
+ bfe.u32 %r60, %r59, 5, 3;
56
+ and.b32 %r61, %r59, 15;
57
+ .loc 1 24 33
58
+ shl.b32 %r62, %r59, 3;
59
+ and.b32 %r1, %r62, 248;
60
+ and.b32 %r2, %r59, 255;
61
+ .loc 1 21 28
62
+ mov.u32 %r26, %ctaid.x;
63
+ .loc 1 21 33
64
+ shl.b32 %r63, %r26, 4;
65
+ .loc 1 22 23
66
+ or.b32 %r3, %r63, %r60;
67
+ or.b32 %r4, %r3, 8;
68
+ or.b32 %r64, %r63, %r61;
69
+ .loc 1 26 30
70
+ mul.wide.s32 %rd55, %r3, 8;
71
+ add.s64 %rd16, %rd53, %rd55;
72
+ add.s64 %rd32, %rd16, 64;
73
+ mul.wide.s32 %rd56, %r64, 8;
74
+ add.s64 %rd48, %rd53, %rd56;
75
+ mov.pred %p93, -1;
76
+ .loc 1 26 35
77
+ mov.u64 %rd15, 0x0;
78
+ @%p93 ld.global.L1::evict_last.b64 { %rd15 }, [ %rd16 + 0 ];
79
+ mov.u64 %rd17, 0x0;
80
+ @%p93 ld.global.L1::evict_last.b64 { %rd17 }, [ %rd16 + 0 ];
81
+ mov.u64 %rd19, 0x0;
82
+ @%p93 ld.global.L1::evict_last.b64 { %rd19 }, [ %rd16 + 0 ];
83
+ mov.u64 %rd21, 0x0;
84
+ @%p93 ld.global.L1::evict_last.b64 { %rd21 }, [ %rd16 + 0 ];
85
+ mov.u64 %rd23, 0x0;
86
+ @%p93 ld.global.L1::evict_last.b64 { %rd23 }, [ %rd16 + 0 ];
87
+ mov.u64 %rd25, 0x0;
88
+ @%p93 ld.global.L1::evict_last.b64 { %rd25 }, [ %rd16 + 0 ];
89
+ mov.u64 %rd27, 0x0;
90
+ @%p93 ld.global.L1::evict_last.b64 { %rd27 }, [ %rd16 + 0 ];
91
+ mov.u64 %rd29, 0x0;
92
+ @%p93 ld.global.L1::evict_last.b64 { %rd29 }, [ %rd16 + 0 ];
93
+ mov.u64 %rd31, 0x0;
94
+ @%p93 ld.global.L1::evict_last.b64 { %rd31 }, [ %rd32 + 0 ];
95
+ mov.u64 %rd33, 0x0;
96
+ @%p93 ld.global.L1::evict_last.b64 { %rd33 }, [ %rd32 + 0 ];
97
+ mov.u64 %rd35, 0x0;
98
+ @%p93 ld.global.L1::evict_last.b64 { %rd35 }, [ %rd32 + 0 ];
99
+ mov.u64 %rd37, 0x0;
100
+ @%p93 ld.global.L1::evict_last.b64 { %rd37 }, [ %rd32 + 0 ];
101
+ mov.u64 %rd39, 0x0;
102
+ @%p93 ld.global.L1::evict_last.b64 { %rd39 }, [ %rd32 + 0 ];
103
+ mov.u64 %rd41, 0x0;
104
+ @%p93 ld.global.L1::evict_last.b64 { %rd41 }, [ %rd32 + 0 ];
105
+ mov.u64 %rd43, 0x0;
106
+ @%p93 ld.global.L1::evict_last.b64 { %rd43 }, [ %rd32 + 0 ];
107
+ mov.u64 %rd45, 0x0;
108
+ @%p93 ld.global.L1::evict_last.b64 { %rd45 }, [ %rd32 + 0 ];
109
+ mov.u64 %rd47, 0x0;
110
+ @%p93 ld.global.L1::evict_last.b64 { %rd47 }, [ %rd48 + 0 ];
111
+ .loc 1 27 18
112
+ bfe.s32 %r65, %r26, 27, 1;
113
+ shr.u32 %r66, %r65, 23;
114
+ add.s32 %r67, %r3, %r66;
115
+ and.b32 %r68, %r67, 16776704;
116
+ sub.s32 %r69, %r3, %r68;
117
+ add.s32 %r70, %r4, %r66;
118
+ and.b32 %r71, %r70, 16776704;
119
+ sub.s32 %r72, %r4, %r71;
120
+ .loc 1 35 44
121
+ shl.b32 %r73, %r69, 8;
122
+ shl.b32 %r74, %r72, 8;
123
+ .loc 1 35 40
124
+ or.b32 %r75, %r73, %r1;
125
+ or.b32 %r76, %r74, %r1;
126
+ .loc 1 35 34
127
+ mul.wide.s32 %rd57, %r75, 4;
128
+ add.s64 %rd80, %rd54, %rd57;
129
+ cvt.s64.s32 %rd58, %r73;
130
+ cvt.u64.u32 %rd59, %r1;
131
+ or.b64 %rd60, %rd58, %rd59;
132
+ shl.b64 %rd61, %rd60, 2;
133
+ add.s64 %rd62, %rd54, %rd61;
134
+ add.s64 %rd81, %rd62, 16;
135
+ mul.wide.s32 %rd63, %r76, 4;
136
+ add.s64 %rd82, %rd54, %rd63;
137
+ cvt.s64.s32 %rd64, %r74;
138
+ or.b64 %rd65, %rd64, %rd59;
139
+ shl.b64 %rd66, %rd65, 2;
140
+ add.s64 %rd67, %rd54, %rd66;
141
+ add.s64 %rd83, %rd67, 16;
142
+ mov.b32 %r257, 0;
143
+ .loc 1 35 50
144
+ mov.u32 %r27, 0x0;
145
+ mov.u32 %r28, 0x0;
146
+ mov.u32 %r29, 0x0;
147
+ mov.u32 %r30, 0x0;
148
+ @%p93 ld.global.L1::evict_last.v4.b32 { %r27, %r28, %r29, %r30 }, [ %rd80 + 0 ];
149
+ @!%p93 mov.u32 %r27, %r257;
150
+ @!%p93 mov.u32 %r28, %r257;
151
+ @!%p93 mov.u32 %r29, %r257;
152
+ @!%p93 mov.u32 %r30, %r257;
153
+ mov.b32 %f1, %r27;
154
+ mov.b32 %f2, %r28;
155
+ mov.b32 %f3, %r29;
156
+ mov.b32 %f4, %r30;
157
+ mov.u32 %r35, 0x0;
158
+ mov.u32 %r36, 0x0;
159
+ mov.u32 %r37, 0x0;
160
+ mov.u32 %r38, 0x0;
161
+ @%p93 ld.global.L1::evict_last.v4.b32 { %r35, %r36, %r37, %r38 }, [ %rd81 + 0 ];
162
+ @!%p93 mov.u32 %r35, %r257;
163
+ @!%p93 mov.u32 %r36, %r257;
164
+ @!%p93 mov.u32 %r37, %r257;
165
+ @!%p93 mov.u32 %r38, %r257;
166
+ mov.b32 %f5, %r35;
167
+ mov.b32 %f6, %r36;
168
+ mov.b32 %f7, %r37;
169
+ mov.b32 %f8, %r38;
170
+ mov.u32 %r43, 0x0;
171
+ mov.u32 %r44, 0x0;
172
+ mov.u32 %r45, 0x0;
173
+ mov.u32 %r46, 0x0;
174
+ @%p93 ld.global.L1::evict_last.v4.b32 { %r43, %r44, %r45, %r46 }, [ %rd82 + 0 ];
175
+ @!%p93 mov.u32 %r43, %r257;
176
+ @!%p93 mov.u32 %r44, %r257;
177
+ @!%p93 mov.u32 %r45, %r257;
178
+ @!%p93 mov.u32 %r46, %r257;
179
+ mov.b32 %f9, %r43;
180
+ mov.b32 %f10, %r44;
181
+ mov.b32 %f11, %r45;
182
+ mov.b32 %f12, %r46;
183
+ mov.u32 %r51, 0x0;
184
+ mov.u32 %r52, 0x0;
185
+ mov.u32 %r53, 0x0;
186
+ mov.u32 %r54, 0x0;
187
+ @%p93 ld.global.L1::evict_last.v4.b32 { %r51, %r52, %r53, %r54 }, [ %rd83 + 0 ];
188
+ @!%p93 mov.u32 %r51, %r257;
189
+ @!%p93 mov.u32 %r52, %r257;
190
+ @!%p93 mov.u32 %r53, %r257;
191
+ @!%p93 mov.u32 %r54, %r257;
192
+ mov.b32 %f13, %r51;
193
+ mov.b32 %f14, %r52;
194
+ mov.b32 %f15, %r53;
195
+ mov.b32 %f16, %r54;
196
+ .loc 1 36 22
197
+ add.s64 %rd68, %rd47, 50257;
198
+ .loc 1 37 22
199
+ setp.lt.s64 %p38, %rd47, 0;
200
+ .loc 1 38 36
201
+ selp.b64 %rd7, %rd68, %rd47, %p38;
202
+ .loc 1 39 40
203
+ setp.lt.u64 %p39, %rd7, 50257;
204
+ mov.b32 %r374, 883;
205
+ mov.u64 %rd112, 1;
206
+ .loc 1 39 55
207
+ @%p39 bra $L__BB0_2;
208
+ mov.u64 %rd69, assertMessage_0;
209
+ cvta.global.u64 %rd70, %rd69;
210
+ mov.u64 %rd71, assertFile_0;
211
+ cvta.global.u64 %rd72, %rd71;
212
+ mov.u64 %rd73, assertFunc_0;
213
+ cvta.global.u64 %rd74, %rd73;
214
+ { // callseq 8, 0
215
+ .reg .b32 temp_param_reg;
216
+ .param .b64 param0;
217
+ st.param.b64 [param0+0], %rd70;
218
+ .param .b64 param1;
219
+ st.param.b64 [param1+0], %rd72;
220
+ .param .b32 param2;
221
+ st.param.b32 [param2+0], %r374;
222
+ .param .b64 param3;
223
+ st.param.b64 [param3+0], %rd74;
224
+ .param .b64 param4;
225
+ st.param.b64 [param4+0], %rd112;
226
+ call.uni
227
+ __assertfail,
228
+ (
229
+ param0,
230
+ param1,
231
+ param2,
232
+ param3,
233
+ param4
234
+ );
235
+ } // callseq 8
236
+ $L__BB0_2:
237
+ .loc 1 0 55
238
+ ld.param.u64 %rd14, [triton__0d1d2d3d4d5de6de_param_4];
239
+ .loc 1 37 22
240
+ setp.lt.s64 %p83, %rd31, 0;
241
+ setp.lt.s64 %p84, %rd15, 0;
242
+ .loc 1 40 44
243
+ shl.b64 %rd85, %rd15, 8;
244
+ add.s64 %rd86, %rd85, 12865792;
245
+ selp.b64 %rd87, %rd86, %rd85, %p84;
246
+ shl.b64 %rd88, %rd31, 8;
247
+ add.s64 %rd89, %rd88, 12865792;
248
+ selp.b64 %rd90, %rd89, %rd88, %p83;
249
+ .loc 1 40 40
250
+ or.b64 %rd92, %rd87, %rd59;
251
+ or.b64 %rd93, %rd90, %rd59;
252
+ .loc 1 40 34
253
+ shl.b64 %rd94, %rd92, 2;
254
+ add.s64 %rd104, %rd12, %rd94;
255
+ add.s64 %rd105, %rd104, 16;
256
+ shl.b64 %rd95, %rd93, 2;
257
+ add.s64 %rd106, %rd12, %rd95;
258
+ add.s64 %rd107, %rd106, 16;
259
+ .loc 1 40 52
260
+ mov.u32 %r78, 0x0;
261
+ mov.u32 %r79, 0x0;
262
+ mov.u32 %r80, 0x0;
263
+ mov.u32 %r81, 0x0;
264
+ @%p93 ld.global.L1::evict_last.v4.b32 { %r78, %r79, %r80, %r81 }, [ %rd104 + 0 ];
265
+ @!%p93 mov.u32 %r78, %r257;
266
+ @!%p93 mov.u32 %r79, %r257;
267
+ @!%p93 mov.u32 %r80, %r257;
268
+ @!%p93 mov.u32 %r81, %r257;
269
+ mov.b32 %f27, %r78;
270
+ mov.b32 %f28, %r79;
271
+ mov.b32 %f29, %r80;
272
+ mov.b32 %f30, %r81;
273
+ mov.u32 %r86, 0x0;
274
+ mov.u32 %r87, 0x0;
275
+ mov.u32 %r88, 0x0;
276
+ mov.u32 %r89, 0x0;
277
+ @%p93 ld.global.L1::evict_last.v4.b32 { %r86, %r87, %r88, %r89 }, [ %rd105 + 0 ];
278
+ @!%p93 mov.u32 %r86, %r257;
279
+ @!%p93 mov.u32 %r87, %r257;
280
+ @!%p93 mov.u32 %r88, %r257;
281
+ @!%p93 mov.u32 %r89, %r257;
282
+ mov.b32 %f31, %r86;
283
+ mov.b32 %f32, %r87;
284
+ mov.b32 %f33, %r88;
285
+ mov.b32 %f34, %r89;
286
+ mov.u32 %r94, 0x0;
287
+ mov.u32 %r95, 0x0;
288
+ mov.u32 %r96, 0x0;
289
+ mov.u32 %r97, 0x0;
290
+ @%p93 ld.global.L1::evict_last.v4.b32 { %r94, %r95, %r96, %r97 }, [ %rd106 + 0 ];
291
+ @!%p93 mov.u32 %r94, %r257;
292
+ @!%p93 mov.u32 %r95, %r257;
293
+ @!%p93 mov.u32 %r96, %r257;
294
+ @!%p93 mov.u32 %r97, %r257;
295
+ mov.b32 %f35, %r94;
296
+ mov.b32 %f36, %r95;
297
+ mov.b32 %f37, %r96;
298
+ mov.b32 %f38, %r97;
299
+ mov.u32 %r102, 0x0;
300
+ mov.u32 %r103, 0x0;
301
+ mov.u32 %r104, 0x0;
302
+ mov.u32 %r105, 0x0;
303
+ @%p93 ld.global.L1::evict_last.v4.b32 { %r102, %r103, %r104, %r105 }, [ %rd107 + 0 ];
304
+ @!%p93 mov.u32 %r102, %r257;
305
+ @!%p93 mov.u32 %r103, %r257;
306
+ @!%p93 mov.u32 %r104, %r257;
307
+ @!%p93 mov.u32 %r105, %r257;
308
+ mov.b32 %f39, %r102;
309
+ mov.b32 %f40, %r103;
310
+ mov.b32 %f41, %r104;
311
+ mov.b32 %f42, %r105;
312
+ .loc 1 41 22
313
+ add.f32 %f43, %f1, %f27;
314
+ add.f32 %f44, %f2, %f28;
315
+ add.f32 %f45, %f3, %f29;
316
+ add.f32 %f46, %f4, %f30;
317
+ add.f32 %f47, %f5, %f31;
318
+ add.f32 %f48, %f6, %f32;
319
+ add.f32 %f49, %f7, %f33;
320
+ add.f32 %f50, %f8, %f34;
321
+ add.f32 %f51, %f9, %f35;
322
+ add.f32 %f52, %f10, %f36;
323
+ add.f32 %f53, %f11, %f37;
324
+ add.f32 %f54, %f12, %f38;
325
+ add.f32 %f55, %f13, %f39;
326
+ add.f32 %f56, %f14, %f40;
327
+ add.f32 %f57, %f15, %f41;
328
+ add.f32 %f58, %f16, %f42;
329
+ $L__tmp1:
330
+ .loc 2 98 22
331
+ add.f32 %f59, %f43, 0f00000000;
332
+ add.f32 %f60, %f44, 0f00000000;
333
+ add.f32 %f61, %f45, 0f00000000;
334
+ add.f32 %f62, %f46, 0f00000000;
335
+ add.f32 %f63, %f47, 0f00000000;
336
+ add.f32 %f64, %f48, 0f00000000;
337
+ add.f32 %f65, %f49, 0f00000000;
338
+ add.f32 %f66, %f50, 0f00000000;
339
+ add.f32 %f67, %f51, 0f00000000;
340
+ add.f32 %f68, %f52, 0f00000000;
341
+ add.f32 %f69, %f53, 0f00000000;
342
+ add.f32 %f70, %f54, 0f00000000;
343
+ add.f32 %f71, %f55, 0f00000000;
344
+ add.f32 %f72, %f56, 0f00000000;
345
+ add.f32 %f73, %f57, 0f00000000;
346
+ add.f32 %f74, %f58, 0f00000000;
347
+ .loc 2 101 30
348
+ sub.f32 %f75, %f43, %f59;
349
+ sub.f32 %f76, %f44, %f60;
350
+ sub.f32 %f77, %f45, %f61;
351
+ sub.f32 %f78, %f46, %f62;
352
+ sub.f32 %f79, %f47, %f63;
353
+ sub.f32 %f80, %f48, %f64;
354
+ sub.f32 %f81, %f49, %f65;
355
+ sub.f32 %f82, %f50, %f66;
356
+ sub.f32 %f83, %f51, %f67;
357
+ sub.f32 %f84, %f52, %f68;
358
+ sub.f32 %f85, %f53, %f69;
359
+ sub.f32 %f86, %f54, %f70;
360
+ sub.f32 %f87, %f55, %f71;
361
+ sub.f32 %f88, %f56, %f72;
362
+ sub.f32 %f89, %f57, %f73;
363
+ sub.f32 %f90, %f58, %f74;
364
+ .loc 2 101 13
365
+ fma.rn.f32 %f91, %f43, %f75, 0f00000000;
366
+ fma.rn.f32 %f92, %f44, %f76, 0f00000000;
367
+ fma.rn.f32 %f93, %f45, %f77, 0f00000000;
368
+ fma.rn.f32 %f94, %f46, %f78, 0f00000000;
369
+ fma.rn.f32 %f95, %f47, %f79, 0f00000000;
370
+ fma.rn.f32 %f96, %f48, %f80, 0f00000000;
371
+ fma.rn.f32 %f97, %f49, %f81, 0f00000000;
372
+ fma.rn.f32 %f98, %f50, %f82, 0f00000000;
373
+ fma.rn.f32 %f99, %f51, %f83, 0f00000000;
374
+ fma.rn.f32 %f100, %f52, %f84, 0f00000000;
375
+ fma.rn.f32 %f101, %f53, %f85, 0f00000000;
376
+ fma.rn.f32 %f102, %f54, %f86, 0f00000000;
377
+ fma.rn.f32 %f103, %f55, %f87, 0f00000000;
378
+ fma.rn.f32 %f104, %f56, %f88, 0f00000000;
379
+ fma.rn.f32 %f105, %f57, %f89, 0f00000000;
380
+ fma.rn.f32 %f106, %f58, %f90, 0f00000000;
381
+ $L__tmp2:
382
+ .loc 2 108 21
383
+ sub.f32 %f107, %f60, %f59;
384
+ mov.b32 %r111, 1065353216;
385
+ mov.b32 %r112, 1073741824;
386
+ .loc 2 110 60
387
+ div.full.f32 %r110, %r111, %r112;
388
+ mov.b32 %f108, %r110;
389
+ .loc 2 112 17
390
+ fma.rn.f32 %f109, %f108, %f107, %f59;
391
+ .loc 2 113 15
392
+ add.f32 %f110, %f91, %f92;
393
+ .loc 2 113 30
394
+ mul.f32 %f111, %f107, %f107;
395
+ .loc 2 113 22
396
+ fma.rn.f32 %f112, %f108, %f111, %f110;
397
+ .loc 2 108 21
398
+ sub.f32 %f113, %f61, %f109;
399
+ mov.b32 %r115, 1077936128;
400
+ .loc 2 110 60
401
+ div.full.f32 %r113, %r111, %r115;
402
+ mov.b32 %f114, %r113;
403
+ .loc 2 112 17
404
+ fma.rn.f32 %f115, %f114, %f113, %f109;
405
+ .loc 2 113 15
406
+ add.f32 %f116, %f93, %f112;
407
+ .loc 2 113 30
408
+ mul.f32 %f117, %f113, %f113;
409
+ .loc 2 113 38
410
+ fma.rn.f32 %f118, %f113, %f113, %f117;
411
+ .loc 2 113 22
412
+ fma.rn.f32 %f119, %f114, %f118, %f116;
413
+ .loc 2 108 21
414
+ sub.f32 %f120, %f62, %f115;
415
+ mov.b32 %r118, 1082130432;
416
+ .loc 2 110 60
417
+ div.full.f32 %r116, %r111, %r118;
418
+ mov.b32 %f121, %r116;
419
+ .loc 2 112 17
420
+ fma.rn.f32 %f122, %f121, %f120, %f115;
421
+ .loc 2 113 15
422
+ add.f32 %f123, %f94, %f119;
423
+ .loc 2 113 30
424
+ mul.f32 %f124, %f120, %f120;
425
+ .loc 2 113 38
426
+ mul.f32 %f125, %f124, 0f40400000;
427
+ .loc 2 113 22
428
+ fma.rn.f32 %f126, %f121, %f125, %f123;
429
+ .loc 2 108 21
430
+ sub.f32 %f127, %f63, %f122;
431
+ mov.b32 %r121, 1084227584;
432
+ .loc 2 110 60
433
+ div.full.f32 %r119, %r111, %r121;
434
+ mov.b32 %f128, %r119;
435
+ .loc 2 112 17
436
+ fma.rn.f32 %f129, %f128, %f127, %f122;
437
+ .loc 2 113 15
438
+ add.f32 %f130, %f95, %f126;
439
+ .loc 2 113 30
440
+ mul.f32 %f131, %f127, %f127;
441
+ .loc 2 113 38
442
+ mul.f32 %f132, %f131, 0f40800000;
443
+ .loc 2 113 22
444
+ fma.rn.f32 %f133, %f128, %f132, %f130;
445
+ .loc 2 108 21
446
+ sub.f32 %f134, %f64, %f129;
447
+ mov.b32 %r124, 1086324736;
448
+ .loc 2 110 60
449
+ div.full.f32 %r122, %r111, %r124;
450
+ mov.b32 %f135, %r122;
451
+ .loc 2 112 17
452
+ fma.rn.f32 %f136, %f135, %f134, %f129;
453
+ .loc 2 113 15
454
+ add.f32 %f137, %f96, %f133;
455
+ .loc 2 113 30
456
+ mul.f32 %f138, %f134, %f134;
457
+ .loc 2 113 38
458
+ mul.f32 %f139, %f138, 0f40A00000;
459
+ .loc 2 113 22
460
+ fma.rn.f32 %f140, %f135, %f139, %f137;
461
+ .loc 2 108 21
462
+ sub.f32 %f141, %f65, %f136;
463
+ mov.b32 %r127, 1088421888;
464
+ .loc 2 110 60
465
+ div.full.f32 %r125, %r111, %r127;
466
+ mov.b32 %f142, %r125;
467
+ .loc 2 112 17
468
+ fma.rn.f32 %f143, %f142, %f141, %f136;
469
+ .loc 2 113 15
470
+ add.f32 %f144, %f97, %f140;
471
+ .loc 2 113 30
472
+ mul.f32 %f145, %f141, %f141;
473
+ .loc 2 113 38
474
+ mul.f32 %f146, %f145, 0f40C00000;
475
+ .loc 2 113 22
476
+ fma.rn.f32 %f147, %f142, %f146, %f144;
477
+ .loc 2 108 21
478
+ sub.f32 %f148, %f66, %f143;
479
+ mov.b32 %r130, 1090519040;
480
+ .loc 2 110 60
481
+ div.full.f32 %r128, %r111, %r130;
482
+ mov.b32 %f149, %r128;
483
+ .loc 2 112 17
484
+ fma.rn.f32 %f150, %f149, %f148, %f143;
485
+ .loc 2 113 15
486
+ add.f32 %f151, %f98, %f147;
487
+ .loc 2 113 30
488
+ mul.f32 %f152, %f148, %f148;
489
+ .loc 2 113 38
490
+ mul.f32 %f153, %f152, 0f40E00000;
491
+ .loc 2 113 22
492
+ fma.rn.f32 %f154, %f149, %f153, %f151;
493
+ .loc 2 108 21
494
+ sub.f32 %f155, %f68, %f67;
495
+ .loc 2 110 60
496
+ div.full.f32 %r131, %r111, %r112;
497
+ mov.b32 %f156, %r131;
498
+ .loc 2 112 17
499
+ fma.rn.f32 %f157, %f155, %f156, %f67;
500
+ .loc 2 113 15
501
+ add.f32 %f158, %f99, %f100;
502
+ .loc 2 113 30
503
+ mul.f32 %f159, %f155, %f155;
504
+ .loc 2 113 22
505
+ fma.rn.f32 %f160, %f159, %f156, %f158;
506
+ .loc 2 108 21
507
+ sub.f32 %f161, %f69, %f157;
508
+ .loc 2 110 60
509
+ div.full.f32 %r134, %r111, %r115;
510
+ mov.b32 %f162, %r134;
511
+ .loc 2 112 17
512
+ fma.rn.f32 %f163, %f162, %f161, %f157;
513
+ .loc 2 113 15
514
+ add.f32 %f164, %f101, %f160;
515
+ .loc 2 113 30
516
+ mul.f32 %f165, %f161, %f161;
517
+ .loc 2 113 38
518
+ fma.rn.f32 %f166, %f161, %f161, %f165;
519
+ .loc 2 113 22
520
+ fma.rn.f32 %f167, %f162, %f166, %f164;
521
+ .loc 2 108 21
522
+ sub.f32 %f168, %f70, %f163;
523
+ .loc 2 110 60
524
+ div.full.f32 %r137, %r111, %r118;
525
+ mov.b32 %f169, %r137;
526
+ .loc 2 112 17
527
+ fma.rn.f32 %f170, %f169, %f168, %f163;
528
+ .loc 2 113 15
529
+ add.f32 %f171, %f102, %f167;
530
+ .loc 2 113 30
531
+ mul.f32 %f172, %f168, %f168;
532
+ .loc 2 113 38
533
+ mul.f32 %f173, %f172, 0f40400000;
534
+ .loc 2 113 22
535
+ fma.rn.f32 %f174, %f169, %f173, %f171;
536
+ .loc 2 108 21
537
+ sub.f32 %f175, %f71, %f170;
538
+ .loc 2 110 60
539
+ div.full.f32 %r140, %r111, %r121;
540
+ mov.b32 %f176, %r140;
541
+ .loc 2 112 17
542
+ fma.rn.f32 %f177, %f176, %f175, %f170;
543
+ .loc 2 113 15
544
+ add.f32 %f178, %f103, %f174;
545
+ .loc 2 113 30
546
+ mul.f32 %f179, %f175, %f175;
547
+ .loc 2 113 38
548
+ mul.f32 %f180, %f179, 0f40800000;
549
+ .loc 2 113 22
550
+ fma.rn.f32 %f181, %f176, %f180, %f178;
551
+ .loc 2 108 21
552
+ sub.f32 %f182, %f72, %f177;
553
+ .loc 2 110 60
554
+ div.full.f32 %r143, %r111, %r124;
555
+ mov.b32 %f183, %r143;
556
+ .loc 2 112 17
557
+ fma.rn.f32 %f184, %f183, %f182, %f177;
558
+ .loc 2 113 15
559
+ add.f32 %f185, %f104, %f181;
560
+ .loc 2 113 30
561
+ mul.f32 %f186, %f182, %f182;
562
+ .loc 2 113 38
563
+ mul.f32 %f187, %f186, 0f40A00000;
564
+ .loc 2 113 22
565
+ fma.rn.f32 %f188, %f183, %f187, %f185;
566
+ .loc 2 108 21
567
+ sub.f32 %f189, %f73, %f184;
568
+ .loc 2 110 60
569
+ div.full.f32 %r146, %r111, %r127;
570
+ mov.b32 %f190, %r146;
571
+ .loc 2 112 17
572
+ fma.rn.f32 %f191, %f190, %f189, %f184;
573
+ .loc 2 113 15
574
+ add.f32 %f192, %f105, %f188;
575
+ .loc 2 113 30
576
+ mul.f32 %f193, %f189, %f189;
577
+ .loc 2 113 38
578
+ mul.f32 %f194, %f193, 0f40C00000;
579
+ .loc 2 113 22
580
+ fma.rn.f32 %f195, %f190, %f194, %f192;
581
+ .loc 2 108 21
582
+ sub.f32 %f196, %f74, %f191;
583
+ .loc 2 110 60
584
+ div.full.f32 %r149, %r111, %r130;
585
+ mov.b32 %f197, %r149;
586
+ .loc 2 112 17
587
+ fma.rn.f32 %f198, %f197, %f196, %f191;
588
+ .loc 2 113 15
589
+ add.f32 %f199, %f106, %f195;
590
+ .loc 2 113 30
591
+ mul.f32 %f200, %f196, %f196;
592
+ .loc 2 113 38
593
+ mul.f32 %f201, %f200, 0f40E00000;
594
+ .loc 2 113 22
595
+ fma.rn.f32 %f202, %f197, %f201, %f199;
596
+ $L__tmp3:
597
+ .loc 2 120 46
598
+ mov.b32 %r216, %f150;
599
+ shfl.sync.bfly.b32 %r217, %r216, 16, 31, -1;
600
+ mov.b32 %f203, %r217;
601
+ mov.b32 %r218, %f154;
602
+ shfl.sync.bfly.b32 %r219, %r218, 16, 31, -1;
603
+ mov.b32 %f204, %r219;
604
+ shfl.sync.bfly.b32 %r153, %r130, 16, 31, -1;
605
+ mov.b32 %f205, %r153;
606
+ $L__tmp4:
607
+ .loc 2 108 21
608
+ sub.f32 %f206, %f203, %f150;
609
+ .loc 2 109 28
610
+ add.f32 %f207, %f205, 0f41000000;
611
+ .loc 2 110 39
612
+ setp.eq.f32 %p85, %f207, 0f00000000;
613
+ .loc 2 110 60
614
+ mov.b32 %r154, %f207;
615
+ div.full.f32 %r152, %r153, %r154;
616
+ mov.b32 %f208, %r152;
617
+ .loc 2 110 49
618
+ selp.f32 %f209, 0f00000000, %f208, %p85;
619
+ .loc 2 112 17
620
+ fma.rn.f32 %f210, %f209, %f206, %f150;
621
+ .loc 2 113 15
622
+ add.f32 %f211, %f154, %f204;
623
+ .loc 2 113 30
624
+ mul.f32 %f212, %f206, %f206;
625
+ .loc 2 113 38
626
+ mul.f32 %f213, %f212, 0f41000000;
627
+ .loc 2 113 22
628
+ fma.rn.f32 %f214, %f209, %f213, %f211;
629
+ $L__tmp5:
630
+ .loc 2 120 46
631
+ mov.b32 %r220, %f210;
632
+ shfl.sync.bfly.b32 %r221, %r220, 8, 31, -1;
633
+ mov.b32 %f215, %r221;
634
+ mov.b32 %r222, %f214;
635
+ shfl.sync.bfly.b32 %r223, %r222, 8, 31, -1;
636
+ mov.b32 %f216, %r223;
637
+ shfl.sync.bfly.b32 %r156, %r154, 8, 31, -1;
638
+ mov.b32 %f217, %r156;
639
+ $L__tmp6:
640
+ .loc 2 108 21
641
+ sub.f32 %f218, %f215, %f210;
642
+ .loc 2 109 28
643
+ add.f32 %f219, %f207, %f217;
644
+ .loc 2 110 39
645
+ setp.eq.f32 %p86, %f219, 0f00000000;
646
+ .loc 2 110 60
647
+ mov.b32 %r157, %f219;
648
+ div.full.f32 %r155, %r156, %r157;
649
+ mov.b32 %f220, %r155;
650
+ .loc 2 110 49
651
+ selp.f32 %f221, 0f00000000, %f220, %p86;
652
+ .loc 2 112 17
653
+ fma.rn.f32 %f222, %f221, %f218, %f210;
654
+ .loc 2 113 15
655
+ add.f32 %f223, %f214, %f216;
656
+ .loc 2 113 30
657
+ mul.f32 %f224, %f218, %f218;
658
+ .loc 2 113 38
659
+ mul.f32 %f225, %f207, %f224;
660
+ .loc 2 113 22
661
+ fma.rn.f32 %f226, %f221, %f225, %f223;
662
+ $L__tmp7:
663
+ .loc 2 120 46
664
+ mov.b32 %r224, %f222;
665
+ shfl.sync.bfly.b32 %r225, %r224, 4, 31, -1;
666
+ mov.b32 %f227, %r225;
667
+ mov.b32 %r226, %f226;
668
+ shfl.sync.bfly.b32 %r227, %r226, 4, 31, -1;
669
+ mov.b32 %f228, %r227;
670
+ shfl.sync.bfly.b32 %r159, %r157, 4, 31, -1;
671
+ mov.b32 %f229, %r159;
672
+ $L__tmp8:
673
+ .loc 2 108 21
674
+ sub.f32 %f230, %f227, %f222;
675
+ .loc 2 109 28
676
+ add.f32 %f231, %f219, %f229;
677
+ .loc 2 110 39
678
+ setp.eq.f32 %p87, %f231, 0f00000000;
679
+ .loc 2 110 60
680
+ mov.b32 %r160, %f231;
681
+ div.full.f32 %r158, %r159, %r160;
682
+ mov.b32 %f232, %r158;
683
+ .loc 2 110 49
684
+ selp.f32 %f233, 0f00000000, %f232, %p87;
685
+ .loc 2 112 17
686
+ fma.rn.f32 %f234, %f230, %f233, %f222;
687
+ .loc 2 113 15
688
+ add.f32 %f235, %f226, %f228;
689
+ .loc 2 113 30
690
+ mul.f32 %f236, %f230, %f230;
691
+ .loc 2 113 38
692
+ mul.f32 %f237, %f219, %f236;
693
+ .loc 2 113 22
694
+ fma.rn.f32 %f238, %f233, %f237, %f235;
695
+ $L__tmp9:
696
+ .loc 2 120 46
697
+ mov.b32 %r228, %f234;
698
+ shfl.sync.bfly.b32 %r229, %r228, 2, 31, -1;
699
+ mov.b32 %f239, %r229;
700
+ mov.b32 %r230, %f238;
701
+ shfl.sync.bfly.b32 %r231, %r230, 2, 31, -1;
702
+ mov.b32 %f240, %r231;
703
+ shfl.sync.bfly.b32 %r162, %r160, 2, 31, -1;
704
+ mov.b32 %f241, %r162;
705
+ $L__tmp10:
706
+ .loc 2 108 21
707
+ sub.f32 %f242, %f239, %f234;
708
+ .loc 2 109 28
709
+ add.f32 %f17, %f231, %f241;
710
+ .loc 2 110 39
711
+ setp.eq.f32 %p88, %f17, 0f00000000;
712
+ .loc 2 110 60
713
+ mov.b32 %r163, %f17;
714
+ div.full.f32 %r161, %r162, %r163;
715
+ mov.b32 %f243, %r161;
716
+ .loc 2 110 49
717
+ selp.f32 %f244, 0f00000000, %f243, %p88;
718
+ .loc 2 112 17
719
+ fma.rn.f32 %f18, %f242, %f244, %f234;
720
+ .loc 2 113 15
721
+ add.f32 %f245, %f238, %f240;
722
+ .loc 2 113 30
723
+ mul.f32 %f246, %f242, %f242;
724
+ .loc 2 113 38
725
+ mul.f32 %f247, %f231, %f246;
726
+ .loc 2 113 22
727
+ fma.rn.f32 %f19, %f244, %f247, %f245;
728
+ $L__tmp11:
729
+ .loc 2 120 46
730
+ mov.b32 %r232, %f18;
731
+ shfl.sync.bfly.b32 %r5, %r232, 1, 31, -1;
732
+ mov.b32 %r233, %f19;
733
+ shfl.sync.bfly.b32 %r6, %r233, 1, 31, -1;
734
+ shfl.sync.bfly.b32 %r165, %r163, 1, 31, -1;
735
+ mov.b32 %f248, %r165;
736
+ $L__tmp12:
737
+ .loc 2 109 28
738
+ add.f32 %f20, %f17, %f248;
739
+ .loc 2 110 60
740
+ mov.b32 %r166, %f20;
741
+ div.full.f32 %r164, %r165, %r166;
742
+ mov.b32 %f21, %r164;
743
+ $L__tmp13:
744
+ .loc 2 120 46
745
+ mov.b32 %r234, %f198;
746
+ shfl.sync.bfly.b32 %r235, %r234, 16, 31, -1;
747
+ mov.b32 %f249, %r235;
748
+ mov.b32 %r236, %f202;
749
+ shfl.sync.bfly.b32 %r237, %r236, 16, 31, -1;
750
+ mov.b32 %f250, %r237;
751
+ shfl.sync.bfly.b32 %r168, %r130, 16, 31, -1;
752
+ mov.b32 %f251, %r168;
753
+ $L__tmp14:
754
+ .loc 2 108 21
755
+ sub.f32 %f252, %f249, %f198;
756
+ .loc 2 109 28
757
+ add.f32 %f253, %f251, 0f41000000;
758
+ .loc 2 110 39
759
+ setp.eq.f32 %p89, %f253, 0f00000000;
760
+ .loc 2 110 60
761
+ mov.b32 %r169, %f253;
762
+ div.full.f32 %r167, %r168, %r169;
763
+ mov.b32 %f254, %r167;
764
+ .loc 2 110 49
765
+ selp.f32 %f255, 0f00000000, %f254, %p89;
766
+ .loc 2 112 17
767
+ fma.rn.f32 %f256, %f252, %f255, %f198;
768
+ .loc 2 113 15
769
+ add.f32 %f257, %f202, %f250;
770
+ .loc 2 113 30
771
+ mul.f32 %f258, %f252, %f252;
772
+ .loc 2 113 38
773
+ mul.f32 %f259, %f258, 0f41000000;
774
+ .loc 2 113 22
775
+ fma.rn.f32 %f260, %f259, %f255, %f257;
776
+ $L__tmp15:
777
+ .loc 2 120 46
778
+ mov.b32 %r238, %f256;
779
+ shfl.sync.bfly.b32 %r239, %r238, 8, 31, -1;
780
+ mov.b32 %f261, %r239;
781
+ mov.b32 %r240, %f260;
782
+ shfl.sync.bfly.b32 %r241, %r240, 8, 31, -1;
783
+ mov.b32 %f262, %r241;
784
+ shfl.sync.bfly.b32 %r171, %r169, 8, 31, -1;
785
+ mov.b32 %f263, %r171;
786
+ $L__tmp16:
787
+ .loc 2 108 21
788
+ sub.f32 %f264, %f261, %f256;
789
+ .loc 2 109 28
790
+ add.f32 %f265, %f253, %f263;
791
+ .loc 2 110 39
792
+ setp.eq.f32 %p90, %f265, 0f00000000;
793
+ .loc 2 110 60
794
+ mov.b32 %r172, %f265;
795
+ div.full.f32 %r170, %r171, %r172;
796
+ mov.b32 %f266, %r170;
797
+ .loc 2 110 49
798
+ selp.f32 %f267, 0f00000000, %f266, %p90;
799
+ .loc 2 112 17
800
+ fma.rn.f32 %f268, %f264, %f267, %f256;
801
+ .loc 2 113 15
802
+ add.f32 %f269, %f260, %f262;
803
+ .loc 2 113 30
804
+ mul.f32 %f270, %f264, %f264;
805
+ .loc 2 113 38
806
+ mul.f32 %f271, %f253, %f270;
807
+ .loc 2 113 22
808
+ fma.rn.f32 %f272, %f267, %f271, %f269;
809
+ $L__tmp17:
810
+ .loc 2 120 46
811
+ mov.b32 %r242, %f268;
812
+ shfl.sync.bfly.b32 %r243, %r242, 4, 31, -1;
813
+ mov.b32 %f273, %r243;
814
+ mov.b32 %r244, %f272;
815
+ shfl.sync.bfly.b32 %r245, %r244, 4, 31, -1;
816
+ mov.b32 %f274, %r245;
817
+ shfl.sync.bfly.b32 %r174, %r172, 4, 31, -1;
818
+ mov.b32 %f275, %r174;
819
+ $L__tmp18:
820
+ .loc 2 108 21
821
+ sub.f32 %f276, %f273, %f268;
822
+ .loc 2 109 28
823
+ add.f32 %f277, %f265, %f275;
824
+ .loc 2 110 39
825
+ setp.eq.f32 %p91, %f277, 0f00000000;
826
+ .loc 2 110 60
827
+ mov.b32 %r175, %f277;
828
+ div.full.f32 %r173, %r174, %r175;
829
+ mov.b32 %f278, %r173;
830
+ .loc 2 110 49
831
+ selp.f32 %f279, 0f00000000, %f278, %p91;
832
+ .loc 2 112 17
833
+ fma.rn.f32 %f280, %f276, %f279, %f268;
834
+ .loc 2 113 15
835
+ add.f32 %f281, %f272, %f274;
836
+ .loc 2 113 30
837
+ mul.f32 %f282, %f276, %f276;
838
+ .loc 2 113 38
839
+ mul.f32 %f283, %f265, %f282;
840
+ .loc 2 113 22
841
+ fma.rn.f32 %f284, %f279, %f283, %f281;
842
+ $L__tmp19:
843
+ .loc 2 120 46
844
+ mov.b32 %r246, %f280;
845
+ shfl.sync.bfly.b32 %r247, %r246, 2, 31, -1;
846
+ mov.b32 %f285, %r247;
847
+ mov.b32 %r248, %f284;
848
+ shfl.sync.bfly.b32 %r249, %r248, 2, 31, -1;
849
+ mov.b32 %f286, %r249;
850
+ shfl.sync.bfly.b32 %r177, %r175, 2, 31, -1;
851
+ mov.b32 %f287, %r177;
852
+ $L__tmp20:
853
+ .loc 2 108 21
854
+ sub.f32 %f288, %f285, %f280;
855
+ .loc 2 109 28
856
+ add.f32 %f22, %f277, %f287;
857
+ .loc 2 110 39
858
+ setp.eq.f32 %p92, %f22, 0f00000000;
859
+ .loc 2 110 60
860
+ mov.b32 %r178, %f22;
861
+ div.full.f32 %r176, %r177, %r178;
862
+ mov.b32 %f289, %r176;
863
+ .loc 2 110 49
864
+ selp.f32 %f290, 0f00000000, %f289, %p92;
865
+ .loc 2 112 17
866
+ fma.rn.f32 %f23, %f288, %f290, %f280;
867
+ .loc 2 113 15
868
+ add.f32 %f291, %f284, %f286;
869
+ .loc 2 113 30
870
+ mul.f32 %f292, %f288, %f288;
871
+ .loc 2 113 38
872
+ mul.f32 %f293, %f277, %f292;
873
+ .loc 2 113 22
874
+ fma.rn.f32 %f24, %f290, %f293, %f291;
875
+ $L__tmp21:
876
+ .loc 2 120 46
877
+ mov.b32 %r250, %f23;
878
+ shfl.sync.bfly.b32 %r7, %r250, 1, 31, -1;
879
+ mov.b32 %r251, %f24;
880
+ shfl.sync.bfly.b32 %r8, %r251, 1, 31, -1;
881
+ shfl.sync.bfly.b32 %r180, %r178, 1, 31, -1;
882
+ mov.b32 %f294, %r180;
883
+ $L__tmp22:
884
+ .loc 2 109 28
885
+ add.f32 %f25, %f22, %f294;
886
+ .loc 2 110 60
887
+ mov.b32 %r181, %f25;
888
+ div.full.f32 %r179, %r180, %r181;
889
+ mov.b32 %f26, %r179;
890
+ $L__tmp23:
891
+ .loc 1 59 51
892
+ mov.u32 %r182, 0x0;
893
+ mov.u32 %r183, 0x0;
894
+ mov.u32 %r184, 0x0;
895
+ mov.u32 %r185, 0x0;
896
+ @%p93 ld.global.L1::evict_last.v4.b32 { %r182, %r183, %r184, %r185 }, [ %rd80 + 0 ];
897
+ @!%p93 mov.u32 %r182, %r257;
898
+ @!%p93 mov.u32 %r183, %r257;
899
+ @!%p93 mov.u32 %r184, %r257;
900
+ @!%p93 mov.u32 %r185, %r257;
901
+ mov.u32 %r190, 0x0;
902
+ mov.u32 %r191, 0x0;
903
+ mov.u32 %r192, 0x0;
904
+ mov.u32 %r193, 0x0;
905
+ @%p93 ld.global.L1::evict_last.v4.b32 { %r190, %r191, %r192, %r193 }, [ %rd81 + 0 ];
906
+ @!%p93 mov.u32 %r190, %r257;
907
+ @!%p93 mov.u32 %r191, %r257;
908
+ @!%p93 mov.u32 %r192, %r257;
909
+ @!%p93 mov.u32 %r193, %r257;
910
+ mov.u32 %r198, 0x0;
911
+ mov.u32 %r199, 0x0;
912
+ mov.u32 %r200, 0x0;
913
+ mov.u32 %r201, 0x0;
914
+ @%p93 ld.global.L1::evict_last.v4.b32 { %r198, %r199, %r200, %r201 }, [ %rd82 + 0 ];
915
+ @!%p93 mov.u32 %r198, %r257;
916
+ @!%p93 mov.u32 %r199, %r257;
917
+ @!%p93 mov.u32 %r200, %r257;
918
+ @!%p93 mov.u32 %r201, %r257;
919
+ mov.u32 %r206, 0x0;
920
+ mov.u32 %r207, 0x0;
921
+ mov.u32 %r208, 0x0;
922
+ mov.u32 %r209, 0x0;
923
+ @%p93 ld.global.L1::evict_last.v4.b32 { %r206, %r207, %r208, %r209 }, [ %rd83 + 0 ];
924
+ @!%p93 mov.u32 %r206, %r257;
925
+ @!%p93 mov.u32 %r207, %r257;
926
+ @!%p93 mov.u32 %r208, %r257;
927
+ @!%p93 mov.u32 %r209, %r257;
928
+ .loc 1 60 35
929
+ mul.wide.u32 %rd96, %r2, 4;
930
+ add.s64 %rd84, %rd13, %rd96;
931
+ .loc 1 60 40
932
+ mov.u32 %r214, 0x0;
933
+ @%p93 ld.global.L1::evict_last.b32 { %r214 }, [ %rd84 + 0 ];
934
+ @!%p93 mov.u32 %r214, %r257;
935
+ .loc 1 64 57
936
+ @%p39 bra $L__BB0_4;
937
+ mov.u64 %rd97, assertMessage_1;
938
+ cvta.global.u64 %rd98, %rd97;
939
+ mov.u64 %rd99, assertFile_1;
940
+ cvta.global.u64 %rd100, %rd99;
941
+ mov.u64 %rd101, assertFunc_1;
942
+ cvta.global.u64 %rd102, %rd101;
943
+ { // callseq 9, 0
944
+ .reg .b32 temp_param_reg;
945
+ .param .b64 param0;
946
+ st.param.b64 [param0+0], %rd98;
947
+ .param .b64 param1;
948
+ st.param.b64 [param1+0], %rd100;
949
+ .param .b32 param2;
950
+ st.param.b32 [param2+0], %r374;
951
+ .param .b64 param3;
952
+ st.param.b64 [param3+0], %rd102;
953
+ .param .b64 param4;
954
+ st.param.b64 [param4+0], %rd112;
955
+ call.uni
956
+ __assertfail,
957
+ (
958
+ param0,
959
+ param1,
960
+ param2,
961
+ param3,
962
+ param4
963
+ );
964
+ } // callseq 9
965
+ $L__BB0_4:
966
+ $L__tmp24:
967
+ .loc 2 120 46
968
+ mov.b32 %f295, %r8;
969
+ $L__tmp25:
970
+ .loc 2 113 15
971
+ add.f32 %f296, %f24, %f295;
972
+ $L__tmp26:
973
+ .loc 2 120 46
974
+ mov.b32 %f297, %r7;
975
+ $L__tmp27:
976
+ .loc 2 108 21
977
+ sub.f32 %f298, %f297, %f23;
978
+ .loc 2 113 30
979
+ mul.f32 %f299, %f298, %f298;
980
+ .loc 2 113 38
981
+ mul.f32 %f300, %f22, %f299;
982
+ .loc 2 110 39
983
+ setp.eq.f32 %p115, %f25, 0f00000000;
984
+ .loc 2 110 49
985
+ selp.f32 %f301, 0f00000000, %f26, %p115;
986
+ .loc 2 113 22
987
+ fma.rn.f32 %f302, %f301, %f300, %f296;
988
+ $L__tmp28:
989
+ .loc 2 120 46
990
+ mov.b32 %f303, %r6;
991
+ $L__tmp29:
992
+ .loc 2 113 15
993
+ add.f32 %f304, %f19, %f303;
994
+ $L__tmp30:
995
+ .loc 2 120 46
996
+ mov.b32 %f305, %r5;
997
+ $L__tmp31:
998
+ .loc 2 108 21
999
+ sub.f32 %f306, %f305, %f18;
1000
+ .loc 2 113 30
1001
+ mul.f32 %f307, %f306, %f306;
1002
+ .loc 2 113 38
1003
+ mul.f32 %f308, %f17, %f307;
1004
+ .loc 2 110 39
1005
+ setp.eq.f32 %p116, %f20, 0f00000000;
1006
+ .loc 2 110 49
1007
+ selp.f32 %f309, 0f00000000, %f21, %p116;
1008
+ .loc 2 113 22
1009
+ fma.rn.f32 %f310, %f309, %f308, %f304;
1010
+ $L__tmp32:
1011
+ .loc 1 65 54
1012
+ mov.u32 %r253, 0x0;
1013
+ mov.u32 %r254, 0x0;
1014
+ mov.u32 %r255, 0x0;
1015
+ mov.u32 %r256, 0x0;
1016
+ @%p93 ld.global.L1::evict_first.v4.b32 { %r253, %r254, %r255, %r256 }, [ %rd104 + 0 ];
1017
+ @!%p93 mov.u32 %r253, %r257;
1018
+ @!%p93 mov.u32 %r254, %r257;
1019
+ @!%p93 mov.u32 %r255, %r257;
1020
+ @!%p93 mov.u32 %r256, %r257;
1021
+ mov.u32 %r261, 0x0;
1022
+ mov.u32 %r262, 0x0;
1023
+ mov.u32 %r263, 0x0;
1024
+ mov.u32 %r264, 0x0;
1025
+ @%p93 ld.global.L1::evict_first.v4.b32 { %r261, %r262, %r263, %r264 }, [ %rd105 + 0 ];
1026
+ @!%p93 mov.u32 %r261, %r257;
1027
+ @!%p93 mov.u32 %r262, %r257;
1028
+ @!%p93 mov.u32 %r263, %r257;
1029
+ @!%p93 mov.u32 %r264, %r257;
1030
+ mov.u32 %r269, 0x0;
1031
+ mov.u32 %r270, 0x0;
1032
+ mov.u32 %r271, 0x0;
1033
+ mov.u32 %r272, 0x0;
1034
+ @%p93 ld.global.L1::evict_first.v4.b32 { %r269, %r270, %r271, %r272 }, [ %rd106 + 0 ];
1035
+ @!%p93 mov.u32 %r269, %r257;
1036
+ @!%p93 mov.u32 %r270, %r257;
1037
+ @!%p93 mov.u32 %r271, %r257;
1038
+ @!%p93 mov.u32 %r272, %r257;
1039
+ mov.u32 %r277, 0x0;
1040
+ mov.u32 %r278, 0x0;
1041
+ mov.u32 %r279, 0x0;
1042
+ mov.u32 %r280, 0x0;
1043
+ @%p93 ld.global.L1::evict_first.v4.b32 { %r277, %r278, %r279, %r280 }, [ %rd107 + 0 ];
1044
+ @!%p93 mov.u32 %r277, %r257;
1045
+ @!%p93 mov.u32 %r278, %r257;
1046
+ @!%p93 mov.u32 %r279, %r257;
1047
+ @!%p93 mov.u32 %r280, %r257;
1048
+ .loc 1 69 23
1049
+ mov.b32 %r286, %f310;
1050
+ mov.b32 %r287, 1132462080;
1051
+ div.full.f32 %r285, %r286, %r287;
1052
+ mov.b32 %f311, %r285;
1053
+ mov.b32 %r310, %f302;
1054
+ div.full.f32 %r309, %r310, %r287;
1055
+ mov.b32 %f312, %r309;
1056
+ .loc 1 71 24
1057
+ add.f32 %f313, %f311, 0f3727C5AC;
1058
+ add.f32 %f314, %f312, 0f3727C5AC;
1059
+ .loc 1 72 30
1060
+ rsqrt.approx.ftz.f32 %f315, %f313;
1061
+ rsqrt.approx.ftz.f32 %f316, %f314;
1062
+ .loc 1 65 54
1063
+ mov.b32 %f317, %r280;
1064
+ .loc 1 59 51
1065
+ mov.b32 %f318, %r209;
1066
+ .loc 1 66 24
1067
+ add.f32 %f319, %f318, %f317;
1068
+ $L__tmp33:
1069
+ .loc 2 112 17
1070
+ fma.rn.f32 %f320, %f298, %f301, %f23;
1071
+ $L__tmp34:
1072
+ .loc 1 67 24
1073
+ sub.f32 %f321, %f319, %f320;
1074
+ .loc 1 65 54
1075
+ mov.b32 %f322, %r279;
1076
+ .loc 1 59 51
1077
+ mov.b32 %f323, %r208;
1078
+ .loc 1 66 24
1079
+ add.f32 %f324, %f323, %f322;
1080
+ .loc 1 67 24
1081
+ sub.f32 %f325, %f324, %f320;
1082
+ .loc 1 65 54
1083
+ mov.b32 %f326, %r278;
1084
+ .loc 1 59 51
1085
+ mov.b32 %f327, %r207;
1086
+ .loc 1 66 24
1087
+ add.f32 %f328, %f327, %f326;
1088
+ .loc 1 67 24
1089
+ sub.f32 %f329, %f328, %f320;
1090
+ .loc 1 65 54
1091
+ mov.b32 %f330, %r277;
1092
+ .loc 1 59 51
1093
+ mov.b32 %f331, %r206;
1094
+ .loc 1 66 24
1095
+ add.f32 %f332, %f331, %f330;
1096
+ .loc 1 67 24
1097
+ sub.f32 %f333, %f332, %f320;
1098
+ .loc 1 65 54
1099
+ mov.b32 %f334, %r272;
1100
+ .loc 1 59 51
1101
+ mov.b32 %f335, %r201;
1102
+ .loc 1 66 24
1103
+ add.f32 %f336, %f335, %f334;
1104
+ .loc 1 67 24
1105
+ sub.f32 %f337, %f336, %f320;
1106
+ .loc 1 65 54
1107
+ mov.b32 %f338, %r271;
1108
+ .loc 1 59 51
1109
+ mov.b32 %f339, %r200;
1110
+ .loc 1 66 24
1111
+ add.f32 %f340, %f339, %f338;
1112
+ .loc 1 67 24
1113
+ sub.f32 %f341, %f340, %f320;
1114
+ .loc 1 65 54
1115
+ mov.b32 %f342, %r270;
1116
+ .loc 1 59 51
1117
+ mov.b32 %f343, %r199;
1118
+ .loc 1 66 24
1119
+ add.f32 %f344, %f343, %f342;
1120
+ .loc 1 67 24
1121
+ sub.f32 %f345, %f344, %f320;
1122
+ .loc 1 65 54
1123
+ mov.b32 %f346, %r269;
1124
+ .loc 1 59 51
1125
+ mov.b32 %f347, %r198;
1126
+ .loc 1 66 24
1127
+ add.f32 %f348, %f347, %f346;
1128
+ .loc 1 67 24
1129
+ sub.f32 %f349, %f348, %f320;
1130
+ .loc 1 65 54
1131
+ mov.b32 %f350, %r264;
1132
+ .loc 1 59 51
1133
+ mov.b32 %f351, %r193;
1134
+ .loc 1 66 24
1135
+ add.f32 %f352, %f351, %f350;
1136
+ $L__tmp35:
1137
+ .loc 2 112 17
1138
+ fma.rn.f32 %f353, %f306, %f309, %f18;
1139
+ $L__tmp36:
1140
+ .loc 1 67 24
1141
+ sub.f32 %f354, %f352, %f353;
1142
+ .loc 1 65 54
1143
+ mov.b32 %f355, %r263;
1144
+ .loc 1 59 51
1145
+ mov.b32 %f356, %r192;
1146
+ .loc 1 66 24
1147
+ add.f32 %f357, %f356, %f355;
1148
+ .loc 1 67 24
1149
+ sub.f32 %f358, %f357, %f353;
1150
+ .loc 1 65 54
1151
+ mov.b32 %f359, %r262;
1152
+ .loc 1 59 51
1153
+ mov.b32 %f360, %r191;
1154
+ .loc 1 66 24
1155
+ add.f32 %f361, %f360, %f359;
1156
+ .loc 1 67 24
1157
+ sub.f32 %f362, %f361, %f353;
1158
+ .loc 1 65 54
1159
+ mov.b32 %f363, %r261;
1160
+ .loc 1 59 51
1161
+ mov.b32 %f364, %r190;
1162
+ .loc 1 66 24
1163
+ add.f32 %f365, %f364, %f363;
1164
+ .loc 1 67 24
1165
+ sub.f32 %f366, %f365, %f353;
1166
+ .loc 1 65 54
1167
+ mov.b32 %f367, %r256;
1168
+ .loc 1 59 51
1169
+ mov.b32 %f368, %r185;
1170
+ .loc 1 66 24
1171
+ add.f32 %f369, %f368, %f367;
1172
+ .loc 1 67 24
1173
+ sub.f32 %f370, %f369, %f353;
1174
+ .loc 1 65 54
1175
+ mov.b32 %f371, %r255;
1176
+ .loc 1 59 51
1177
+ mov.b32 %f372, %r184;
1178
+ .loc 1 66 24
1179
+ add.f32 %f373, %f372, %f371;
1180
+ .loc 1 67 24
1181
+ sub.f32 %f374, %f373, %f353;
1182
+ .loc 1 65 54
1183
+ mov.b32 %f375, %r254;
1184
+ .loc 1 59 51
1185
+ mov.b32 %f376, %r183;
1186
+ .loc 1 66 24
1187
+ add.f32 %f377, %f376, %f375;
1188
+ .loc 1 67 24
1189
+ sub.f32 %f378, %f377, %f353;
1190
+ .loc 1 65 54
1191
+ mov.b32 %f379, %r253;
1192
+ .loc 1 59 51
1193
+ mov.b32 %f380, %r182;
1194
+ .loc 1 66 24
1195
+ add.f32 %f381, %f380, %f379;
1196
+ .loc 1 67 24
1197
+ sub.f32 %f382, %f381, %f353;
1198
+ .loc 1 73 24
1199
+ mul.f32 %f383, %f382, %f315;
1200
+ mul.f32 %f384, %f378, %f315;
1201
+ mul.f32 %f385, %f374, %f315;
1202
+ mul.f32 %f386, %f370, %f315;
1203
+ mul.f32 %f387, %f366, %f315;
1204
+ mul.f32 %f388, %f362, %f315;
1205
+ mul.f32 %f389, %f358, %f315;
1206
+ mul.f32 %f390, %f354, %f315;
1207
+ mul.f32 %f391, %f349, %f316;
1208
+ mul.f32 %f392, %f345, %f316;
1209
+ mul.f32 %f393, %f341, %f316;
1210
+ mul.f32 %f394, %f337, %f316;
1211
+ mul.f32 %f395, %f333, %f316;
1212
+ mul.f32 %f396, %f329, %f316;
1213
+ mul.f32 %f397, %f325, %f316;
1214
+ mul.f32 %f398, %f321, %f316;
1215
+ .loc 1 74 24
1216
+ shl.b32 %r357, %r2, 2;
1217
+ mov.u32 %r358, global_smem;
1218
+ add.s32 %r359, %r358, %r357;
1219
+ st.shared.u32 [%r359], %r214;
1220
+ bar.sync 0;
1221
+ shl.b32 %r360, %r1, 2;
1222
+ add.s32 %r361, %r358, %r360;
1223
+ ld.shared.v4.f32 {%f399, %f400, %f401, %f402}, [%r361];
1224
+ ld.shared.v4.f32 {%f403, %f404, %f405, %f406}, [%r361+16];
1225
+ mul.f32 %f407, %f383, %f399;
1226
+ mul.f32 %f408, %f384, %f400;
1227
+ mul.f32 %f409, %f385, %f401;
1228
+ mul.f32 %f410, %f386, %f402;
1229
+ mul.f32 %f411, %f387, %f403;
1230
+ mul.f32 %f412, %f388, %f404;
1231
+ mul.f32 %f413, %f389, %f405;
1232
+ mul.f32 %f414, %f390, %f406;
1233
+ mul.f32 %f415, %f391, %f399;
1234
+ mul.f32 %f416, %f392, %f400;
1235
+ mul.f32 %f417, %f393, %f401;
1236
+ mul.f32 %f418, %f394, %f402;
1237
+ mul.f32 %f419, %f395, %f403;
1238
+ mul.f32 %f420, %f396, %f404;
1239
+ mul.f32 %f421, %f397, %f405;
1240
+ mul.f32 %f422, %f398, %f406;
1241
+ .loc 1 76 39
1242
+ shl.b32 %r362, %r3, 8;
1243
+ shl.b32 %r363, %r4, 8;
1244
+ .loc 1 76 35
1245
+ or.b32 %r364, %r362, %r1;
1246
+ or.b32 %r365, %r363, %r1;
1247
+ .loc 1 76 29
1248
+ mul.wide.s32 %rd110, %r364, 2;
1249
+ add.s64 %rd108, %rd14, %rd110;
1250
+ mul.wide.s32 %rd111, %r365, 2;
1251
+ add.s64 %rd109, %rd14, %rd111;
1252
+ .loc 1 76 52
1253
+ mov.b32 %r333, %f407;
1254
+ cvt.rn.bf16.f32 %rs1, %r333;
1255
+ mov.b32 %r334, %f408;
1256
+ cvt.rn.bf16.f32 %rs2, %r334;
1257
+ mov.b32 %r335, %f409;
1258
+ cvt.rn.bf16.f32 %rs3, %r335;
1259
+ mov.b32 %r336, %f410;
1260
+ cvt.rn.bf16.f32 %rs4, %r336;
1261
+ mov.b32 %r337, %f411;
1262
+ cvt.rn.bf16.f32 %rs5, %r337;
1263
+ mov.b32 %r338, %f412;
1264
+ cvt.rn.bf16.f32 %rs6, %r338;
1265
+ mov.b32 %r339, %f413;
1266
+ cvt.rn.bf16.f32 %rs7, %r339;
1267
+ mov.b32 %r340, %f414;
1268
+ cvt.rn.bf16.f32 %rs8, %r340;
1269
+ mov.b32 %r341, %f415;
1270
+ cvt.rn.bf16.f32 %rs9, %r341;
1271
+ mov.b32 %r342, %f416;
1272
+ cvt.rn.bf16.f32 %rs10, %r342;
1273
+ mov.b32 %r343, %f417;
1274
+ cvt.rn.bf16.f32 %rs11, %r343;
1275
+ mov.b32 %r344, %f418;
1276
+ cvt.rn.bf16.f32 %rs12, %r344;
1277
+ mov.b32 %r345, %f419;
1278
+ cvt.rn.bf16.f32 %rs13, %r345;
1279
+ mov.b32 %r346, %f420;
1280
+ cvt.rn.bf16.f32 %rs14, %r346;
1281
+ mov.b32 %r347, %f421;
1282
+ cvt.rn.bf16.f32 %rs15, %r347;
1283
+ mov.b32 %r348, %f422;
1284
+ cvt.rn.bf16.f32 %rs16, %r348;
1285
+ mov.b32 %r366, {%rs1, %rs2};
1286
+ mov.b32 %r367, {%rs3, %rs4};
1287
+ mov.b32 %r368, {%rs5, %rs6};
1288
+ mov.b32 %r369, {%rs7, %rs8};
1289
+ @%p93 st.global.v4.b32 [ %rd108 + 0 ], { %r366, %r367, %r368, %r369 };
1290
+ mov.b32 %r370, {%rs9, %rs10};
1291
+ mov.b32 %r371, {%rs11, %rs12};
1292
+ mov.b32 %r372, {%rs13, %rs14};
1293
+ mov.b32 %r373, {%rs15, %rs16};
1294
+ @%p93 st.global.v4.b32 [ %rd109 + 0 ], { %r370, %r371, %r372, %r373 };
1295
+ .loc 1 55 4
1296
+ ret;
1297
+ $L__tmp37:
1298
+ $L__func_end0:
1299
+
1300
+ }
1301
+ // .globl __nv_rsqrtf
1302
+ .visible .func (.param .b32 func_retval0) __nv_rsqrtf(
1303
+ .param .b32 __nv_rsqrtf_param_0
1304
+ )
1305
+ {
1306
+ .reg .f32 %f<3>;
1307
+ $L__func_begin1:
1308
+
1309
+ ld.param.f32 %f1, [__nv_rsqrtf_param_0];
1310
+ rsqrt.approx.ftz.f32 %f2, %f1;
1311
+ st.param.f32 [func_retval0+0], %f2;
1312
+ ret;
1313
+ $L__func_end1:
1314
+
1315
+ }
1316
+ .file 1 "/tmp/torchinductor_root/gx/cgx5lxpuexpindj4dsmjz5x42uhyy7iskevq7ovzpwagb3t5powj.py"
1317
+ .file 2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
1318
+ .section .debug_abbrev
1319
+ {
1320
+ .b8 1
1321
+ .b8 17
1322
+ .b8 1
1323
+ .b8 37
1324
+ .b8 8
1325
+ .b8 19
1326
+ .b8 5
1327
+ .b8 3
1328
+ .b8 8
1329
+ .b8 16
1330
+ .b8 6
1331
+ .b8 27
1332
+ .b8 8
1333
+ .b8 180
1334
+ .b8 66
1335
+ .b8 12
1336
+ .b8 17
1337
+ .b8 1
1338
+ .b8 18
1339
+ .b8 1
1340
+ .b8 0
1341
+ .b8 0
1342
+ .b8 2
1343
+ .b8 46
1344
+ .b8 0
1345
+ .b8 135
1346
+ .b8 64
1347
+ .b8 8
1348
+ .b8 3
1349
+ .b8 8
1350
+ .b8 58
1351
+ .b8 11
1352
+ .b8 59
1353
+ .b8 11
1354
+ .b8 63
1355
+ .b8 12
1356
+ .b8 32
1357
+ .b8 11
1358
+ .b8 0
1359
+ .b8 0
1360
+ .b8 3
1361
+ .b8 46
1362
+ .b8 1
1363
+ .b8 17
1364
+ .b8 1
1365
+ .b8 18
1366
+ .b8 1
1367
+ .b8 64
1368
+ .b8 10
1369
+ .b8 49
1370
+ .b8 19
1371
+ .b8 0
1372
+ .b8 0
1373
+ .b8 4
1374
+ .b8 29
1375
+ .b8 0
1376
+ .b8 49
1377
+ .b8 19
1378
+ .b8 17
1379
+ .b8 1
1380
+ .b8 18
1381
+ .b8 1
1382
+ .b8 88
1383
+ .b8 11
1384
+ .b8 89
1385
+ .b8 11
1386
+ .b8 87
1387
+ .b8 11
1388
+ .b8 0
1389
+ .b8 0
1390
+ .b8 5
1391
+ .b8 29
1392
+ .b8 1
1393
+ .b8 49
1394
+ .b8 19
1395
+ .b8 17
1396
+ .b8 1
1397
+ .b8 18
1398
+ .b8 1
1399
+ .b8 88
1400
+ .b8 11
1401
+ .b8 89
1402
+ .b8 11
1403
+ .b8 87
1404
+ .b8 11
1405
+ .b8 0
1406
+ .b8 0
1407
+ .b8 0
1408
+ }
1409
+ .section .debug_info
1410
+ {
1411
+ .b32 298
1412
+ .b8 2
1413
+ .b8 0
1414
+ .b32 .debug_abbrev
1415
+ .b8 8
1416
+ .b8 1
1417
+ .b8 116
1418
+ .b8 114
1419
+ .b8 105
1420
+ .b8 116
1421
+ .b8 111
1422
+ .b8 110
1423
+ .b8 0
1424
+ .b8 2
1425
+ .b8 0
1426
+ .b8 99
1427
+ .b8 103
1428
+ .b8 120
1429
+ .b8 53
1430
+ .b8 108
1431
+ .b8 120
1432
+ .b8 112
1433
+ .b8 117
1434
+ .b8 101
1435
+ .b8 120
1436
+ .b8 112
1437
+ .b8 105
1438
+ .b8 110
1439
+ .b8 100
1440
+ .b8 106
1441
+ .b8 52
1442
+ .b8 100
1443
+ .b8 115
1444
+ .b8 109
1445
+ .b8 106
1446
+ .b8 122
1447
+ .b8 53
1448
+ .b8 120
1449
+ .b8 52
1450
+ .b8 50
1451
+ .b8 117
1452
+ .b8 104
1453
+ .b8 121
1454
+ .b8 121
1455
+ .b8 55
1456
+ .b8 105
1457
+ .b8 115
1458
+ .b8 107
1459
+ .b8 101
1460
+ .b8 118
1461
+ .b8 113
1462
+ .b8 55
1463
+ .b8 111
1464
+ .b8 118
1465
+ .b8 122
1466
+ .b8 112
1467
+ .b8 119
1468
+ .b8 97
1469
+ .b8 103
1470
+ .b8 98
1471
+ .b8 51
1472
+ .b8 116
1473
+ .b8 53
1474
+ .b8 112
1475
+ .b8 111
1476
+ .b8 119
1477
+ .b8 106
1478
+ .b8 46
1479
+ .b8 112
1480
+ .b8 121
1481
+ .b8 0
1482
+ .b32 .debug_line
1483
+ .b8 47
1484
+ .b8 116
1485
+ .b8 109
1486
+ .b8 112
1487
+ .b8 47
1488
+ .b8 116
1489
+ .b8 111
1490
+ .b8 114
1491
+ .b8 99
1492
+ .b8 104
1493
+ .b8 105
1494
+ .b8 110
1495
+ .b8 100
1496
+ .b8 117
1497
+ .b8 99
1498
+ .b8 116
1499
+ .b8 111
1500
+ .b8 114
1501
+ .b8 95
1502
+ .b8 114
1503
+ .b8 111
1504
+ .b8 111
1505
+ .b8 116
1506
+ .b8 47
1507
+ .b8 103
1508
+ .b8 120
1509
+ .b8 0
1510
+ .b8 1
1511
+ .b64 $L__func_begin0
1512
+ .b64 $L__func_end0
1513
+ .b8 2
1514
+ .b8 116
1515
+ .b8 114
1516
+ .b8 105
1517
+ .b8 116
1518
+ .b8 111
1519
+ .b8 110
1520
+ .b8 95
1521
+ .b8 95
1522
+ .b8 48
1523
+ .b8 100
1524
+ .b8 49
1525
+ .b8 100
1526
+ .b8 50
1527
+ .b8 100
1528
+ .b8 51
1529
+ .b8 100
1530
+ .b8 52
1531
+ .b8 100
1532
+ .b8 53
1533
+ .b8 100
1534
+ .b8 101
1535
+ .b8 54
1536
+ .b8 100
1537
+ .b8 101
1538
+ .b8 0
1539
+ .b8 116
1540
+ .b8 114
1541
+ .b8 105
1542
+ .b8 116
1543
+ .b8 111
1544
+ .b8 110
1545
+ .b8 95
1546
+ .b8 95
1547
+ .b8 48
1548
+ .b8 100
1549
+ .b8 49
1550
+ .b8 100
1551
+ .b8 50
1552
+ .b8 100
1553
+ .b8 51
1554
+ .b8 100
1555
+ .b8 52
1556
+ .b8 100
1557
+ .b8 53
1558
+ .b8 100
1559
+ .b8 101
1560
+ .b8 54
1561
+ .b8 100
1562
+ .b8 101
1563
+ .b8 0
1564
+ .b8 1
1565
+ .b8 18
1566
+ .b8 1
1567
+ .b8 1
1568
+ .b8 3
1569
+ .b64 $L__func_begin0
1570
+ .b64 $L__func_end0
1571
+ .b8 1
1572
+ .b8 156
1573
+ .b32 125
1574
+ .b8 4
1575
+ .b32 125
1576
+ .b64 $L__tmp1
1577
+ .b64 $L__tmp2
1578
+ .b8 2
1579
+ .b8 44
1580
+ .b8 38
1581
+ .b8 5
1582
+ .b32 125
1583
+ .b64 $L__tmp2
1584
+ .b64 $L__tmp36
1585
+ .b8 2
1586
+ .b8 50
1587
+ .b8 41
1588
+ .b8 4
1589
+ .b32 125
1590
+ .b64 $L__tmp2
1591
+ .b64 $L__tmp36
1592
+ .b8 2
1593
+ .b8 120
1594
+ .b8 46
1595
+ .b8 0
1596
+ .b8 4
1597
+ .b32 125
1598
+ .b64 $L__tmp3
1599
+ .b64 $L__tmp31
1600
+ .b8 2
1601
+ .b8 50
1602
+ .b8 41
1603
+ .b8 0
1604
+ .b8 0
1605
+ }
1606
+ .section .debug_pubnames
1607
+ {
1608
+ .b32 $L__pubNames_end0-$L__pubNames_start0
1609
+ $L__pubNames_start0:
1610
+ .b8 2
1611
+ .b8 0
1612
+ .b32 .debug_info
1613
+ .b32 302
1614
+ .b32 125
1615
+ .b8 116
1616
+ .b8 114
1617
+ .b8 105
1618
+ .b8 116
1619
+ .b8 111
1620
+ .b8 110
1621
+ .b8 95
1622
+ .b8 95
1623
+ .b8 48
1624
+ .b8 100
1625
+ .b8 49
1626
+ .b8 100
1627
+ .b8 50
1628
+ .b8 100
1629
+ .b8 51
1630
+ .b8 100
1631
+ .b8 52
1632
+ .b8 100
1633
+ .b8 53
1634
+ .b8 100
1635
+ .b8 101
1636
+ .b8 54
1637
+ .b8 100
1638
+ .b8 101
1639
+ .b8 0
1640
+ .b32 0
1641
+ $L__pubNames_end0:
1642
+ }
1643
+ .section .debug_pubtypes
1644
+ {
1645
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
1646
+ $L__pubTypes_start0:
1647
+ .b8 2
1648
+ .b8 0
1649
+ .b32 .debug_info
1650
+ .b32 302
1651
+ .b32 0
1652
+ $L__pubTypes_end0:
1653
+ }
1654
+ .section .debug_loc { }
.triton/dump/345a87a492fd703c73ab83265a21fcb6/triton_.ttgir ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 32], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
2
+ #blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
3
+ #blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 8], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
4
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
5
+ tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
6
+ %cst = arith.constant dense<512> : tensor<16x1xi32, #blocked>
7
+ %cst_0 = arith.constant dense<256> : tensor<1x256xi32, #blocked>
8
+ %cst_1 = arith.constant dense<256> : tensor<16x1xi32, #blocked>
9
+ %cst_2 = arith.constant dense<1.000000e+00> : tensor<1x256xf32, #blocked>
10
+ %cst_3 = arith.constant dense<0.000000e+00> : tensor<1x256xf32, #blocked>
11
+ %cst_4 = arith.constant dense<256> : tensor<16x1xi64, #blocked>
12
+ %cst_5 = arith.constant dense<50257> : tensor<16x1xi64, #blocked>
13
+ %cst_6 = arith.constant dense<0> : tensor<16x1xi64, #blocked>
14
+ %cst_7 = arith.constant dense<0> : tensor<16x1xi64, #blocked1>
15
+ %cst_8 = arith.constant dense<50257> : tensor<16x1xi64, #blocked1>
16
+ %cst_9 = arith.constant 0.000000e+00 : f32
17
+ %cst_10 = arith.constant dense<0.000000e+00> : tensor<1x256xf32, #blocked2>
18
+ %cst_11 = arith.constant dense<256> : tensor<1x256xi32, #blocked2>
19
+ %cst_12 = arith.constant dense<9.99999974E-6> : tensor<16x1xf32, #blocked>
20
+ %cst_13 = arith.constant dense<2.560000e+02> : tensor<16x1xf32, #blocked>
21
+ %cst_14 = arith.constant dense<0.000000e+00> : tensor<16x256xf32, #blocked>
22
+ %c16_i32 = arith.constant 16 : i32
23
+ %0 = tt.get_program_id x : i32
24
+ %1 = arith.muli %0, %c16_i32 : i32
25
+ %2 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
26
+ %3 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
27
+ %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16x1xi32, #blocked>
28
+ %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<16x1xi32, #blocked1>
29
+ %6 = tt.splat %1 : (i32) -> tensor<16x1xi32, #blocked>
30
+ %7 = tt.splat %1 : (i32) -> tensor<16x1xi32, #blocked1>
31
+ %8 = arith.addi %6, %4 : tensor<16x1xi32, #blocked>
32
+ %9 = arith.addi %7, %5 : tensor<16x1xi32, #blocked1>
33
+ %10 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
34
+ %11 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>
35
+ %12 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x256xi32, #blocked>
36
+ %13 = tt.expand_dims %11 {axis = 0 : i32} : (tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>) -> tensor<1x256xi32, #blocked2>
37
+ %14 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<16x1x!tt.ptr<i64, 1>, #blocked>
38
+ %15 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<16x1x!tt.ptr<i64, 1>, #blocked1>
39
+ %16 = tt.addptr %14, %8 : tensor<16x1x!tt.ptr<i64, 1>, #blocked>, tensor<16x1xi32, #blocked>
40
+ %17 = tt.addptr %15, %9 : tensor<16x1x!tt.ptr<i64, 1>, #blocked1>, tensor<16x1xi32, #blocked1>
41
+ %18 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x1xi64, #blocked>
42
+ %19 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x1xi64, #blocked1>
43
+ %20 = arith.remsi %8, %cst : tensor<16x1xi32, #blocked>
44
+ %21 = arith.cmpi slt, %12, %cst_0 : tensor<1x256xi32, #blocked>
45
+ %22 = arith.cmpi slt, %13, %cst_11 : tensor<1x256xi32, #blocked2>
46
+ %23 = arith.muli %20, %cst_1 : tensor<16x1xi32, #blocked>
47
+ %24 = tt.broadcast %12 : (tensor<1x256xi32, #blocked>) -> tensor<16x256xi32, #blocked>
48
+ %25 = tt.broadcast %23 : (tensor<16x1xi32, #blocked>) -> tensor<16x256xi32, #blocked>
49
+ %26 = arith.addi %24, %25 : tensor<16x256xi32, #blocked>
50
+ %27 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<16x256x!tt.ptr<f32, 1>, #blocked>
51
+ %28 = tt.addptr %27, %26 : tensor<16x256x!tt.ptr<f32, 1>, #blocked>, tensor<16x256xi32, #blocked>
52
+ %29 = tt.broadcast %21 : (tensor<1x256xi1, #blocked>) -> tensor<16x256xi1, #blocked>
53
+ %30 = tt.load %28, %29, %cst_14 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x256xf32, #blocked>
54
+ %31 = arith.addi %18, %cst_5 : tensor<16x1xi64, #blocked>
55
+ %32 = arith.addi %19, %cst_8 : tensor<16x1xi64, #blocked1>
56
+ %33 = arith.cmpi slt, %18, %cst_6 : tensor<16x1xi64, #blocked>
57
+ %34 = arith.cmpi slt, %19, %cst_7 : tensor<16x1xi64, #blocked1>
58
+ %35 = arith.select %33, %31, %18 : tensor<16x1xi1, #blocked>, tensor<16x1xi64, #blocked>
59
+ %36 = arith.select %34, %32, %19 : tensor<16x1xi1, #blocked1>, tensor<16x1xi64, #blocked1>
60
+ %37 = arith.cmpi sge, %36, %cst_7 : tensor<16x1xi64, #blocked1>
61
+ %38 = arith.cmpi slt, %36, %cst_8 : tensor<16x1xi64, #blocked1>
62
+ %39 = arith.andi %37, %38 : tensor<16x1xi1, #blocked1>
63
+ tt.assert %39, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<16x1xi1, #blocked1>
64
+ %40 = arith.muli %35, %cst_4 : tensor<16x1xi64, #blocked>
65
+ %41 = tt.broadcast %40 : (tensor<16x1xi64, #blocked>) -> tensor<16x256xi64, #blocked>
66
+ %42 = arith.extsi %12 : tensor<1x256xi32, #blocked> to tensor<1x256xi64, #blocked>
67
+ %43 = tt.broadcast %42 : (tensor<1x256xi64, #blocked>) -> tensor<16x256xi64, #blocked>
68
+ %44 = arith.addi %43, %41 : tensor<16x256xi64, #blocked>
69
+ %45 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<16x256x!tt.ptr<f32, 1>, #blocked>
70
+ %46 = tt.addptr %45, %44 : tensor<16x256x!tt.ptr<f32, 1>, #blocked>, tensor<16x256xi64, #blocked>
71
+ %47 = tt.load %46, %29, %cst_14 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x256xf32, #blocked>
72
+ %48 = arith.addf %47, %30 : tensor<16x256xf32, #blocked>
73
+ %49 = arith.addf %48, %cst_14 : tensor<16x256xf32, #blocked>
74
+ %50 = arith.subf %48, %49 : tensor<16x256xf32, #blocked>
75
+ %51 = arith.mulf %48, %50 : tensor<16x256xf32, #blocked>
76
+ %52 = arith.addf %51, %cst_14 : tensor<16x256xf32, #blocked>
77
+ %53 = arith.select %29, %49, %cst_14 : tensor<16x256xi1, #blocked>, tensor<16x256xf32, #blocked>
78
+ %54 = arith.select %29, %52, %cst_14 : tensor<16x256xi1, #blocked>, tensor<16x256xf32, #blocked>
79
+ %55 = arith.select %21, %cst_2, %cst_3 : tensor<1x256xi1, #blocked>, tensor<1x256xf32, #blocked>
80
+ %56 = tt.broadcast %55 : (tensor<1x256xf32, #blocked>) -> tensor<16x256xf32, #blocked>
81
+ %57:3 = "tt.reduce"(%53, %54, %56) <{axis = 1 : i32}> ({
82
+ ^bb0(%arg7: f32, %arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32):
83
+ %82 = arith.subf %arg10, %arg7 : f32
84
+ %83 = arith.addf %arg9, %arg12 : f32
85
+ %84 = arith.cmpf oeq, %83, %cst_9 : f32
86
+ %85 = arith.divf %arg12, %83 : f32
87
+ %86 = arith.select %84, %cst_9, %85 : f32
88
+ %87 = arith.mulf %82, %86 : f32
89
+ %88 = arith.addf %arg7, %87 : f32
90
+ %89 = arith.addf %arg8, %arg11 : f32
91
+ %90 = arith.mulf %82, %82 : f32
92
+ %91 = arith.mulf %90, %arg9 : f32
93
+ %92 = arith.mulf %91, %86 : f32
94
+ %93 = arith.addf %89, %92 : f32
95
+ tt.reduce.return %88, %93, %83 : f32, f32, f32
96
+ }) : (tensor<16x256xf32, #blocked>, tensor<16x256xf32, #blocked>, tensor<16x256xf32, #blocked>) -> (tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>)
97
+ %58 = tt.expand_dims %57#0 {axis = 1 : i32} : (tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16x1xf32, #blocked>
98
+ %59 = tt.expand_dims %57#1 {axis = 1 : i32} : (tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16x1xf32, #blocked>
99
+ %60 = tt.load %28, %29, %cst_14 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x256xf32, #blocked>
100
+ %61 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<1x256x!tt.ptr<f32, 1>, #blocked2>
101
+ %62 = tt.addptr %61, %13 : tensor<1x256x!tt.ptr<f32, 1>, #blocked2>, tensor<1x256xi32, #blocked2>
102
+ %63 = tt.load %62, %22, %cst_10 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32, #blocked2>
103
+ tt.assert %39, "index out of bounds: 0 <= tmp13 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<16x1xi1, #blocked1>
104
+ %64 = tt.load %46, %29, %cst_14 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<16x256xf32, #blocked>
105
+ %65 = arith.addf %64, %60 : tensor<16x256xf32, #blocked>
106
+ %66 = tt.broadcast %58 : (tensor<16x1xf32, #blocked>) -> tensor<16x256xf32, #blocked>
107
+ %67 = arith.subf %65, %66 : tensor<16x256xf32, #blocked>
108
+ %68 = arith.divf %59, %cst_13 : tensor<16x1xf32, #blocked>
109
+ %69 = arith.addf %68, %cst_12 : tensor<16x1xf32, #blocked>
110
+ %70 = tt.extern_elementwise %69 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<16x1xf32, #blocked>) -> tensor<16x1xf32, #blocked>
111
+ %71 = tt.broadcast %70 : (tensor<16x1xf32, #blocked>) -> tensor<16x256xf32, #blocked>
112
+ %72 = arith.mulf %67, %71 : tensor<16x256xf32, #blocked>
113
+ %73 = triton_gpu.convert_layout %63 : (tensor<1x256xf32, #blocked2>) -> tensor<1x256xf32, #blocked>
114
+ %74 = tt.broadcast %73 : (tensor<1x256xf32, #blocked>) -> tensor<16x256xf32, #blocked>
115
+ %75 = arith.mulf %72, %74 : tensor<16x256xf32, #blocked>
116
+ %76 = arith.muli %8, %cst_1 : tensor<16x1xi32, #blocked>
117
+ %77 = tt.broadcast %76 : (tensor<16x1xi32, #blocked>) -> tensor<16x256xi32, #blocked>
118
+ %78 = arith.addi %24, %77 : tensor<16x256xi32, #blocked>
119
+ %79 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<16x256x!tt.ptr<bf16, 1>, #blocked>
120
+ %80 = tt.addptr %79, %78 : tensor<16x256x!tt.ptr<bf16, 1>, #blocked>, tensor<16x256xi32, #blocked>
121
+ %81 = arith.truncf %75 : tensor<16x256xf32, #blocked> to tensor<16x256xbf16, #blocked>
122
+ tt.store %80, %81, %29 {cache = 1 : i32, evict = 1 : i32} : tensor<16x256xbf16, #blocked>
123
+ tt.return
124
+ }
125
+ }
.triton/dump/345a87a492fd703c73ab83265a21fcb6/triton_.ttir ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %cst = arith.constant dense<1.000000e+00> : tensor<1x256xf32>
4
+ %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x256xf32>
5
+ %cst_1 = arith.constant 0.000000e+00 : f32
6
+ %cst_2 = arith.constant dense<256> : tensor<16x1xi64>
7
+ %cst_3 = arith.constant dense<50257> : tensor<16x1xi64>
8
+ %cst_4 = arith.constant dense<0> : tensor<16x1xi64>
9
+ %cst_5 = arith.constant dense<9.99999974E-6> : tensor<16x1xf32>
10
+ %cst_6 = arith.constant dense<2.560000e+02> : tensor<16x1xf32>
11
+ %cst_7 = arith.constant dense<0.000000e+00> : tensor<16x256xf32>
12
+ %cst_8 = arith.constant dense<256> : tensor<16x1xi32>
13
+ %cst_9 = arith.constant dense<256> : tensor<1x256xi32>
14
+ %cst_10 = arith.constant dense<512> : tensor<16x1xi32>
15
+ %c16_i32 = arith.constant 16 : i32
16
+ %0 = tt.get_program_id x : i32
17
+ %1 = arith.muli %0, %c16_i32 : i32
18
+ %2 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32>
19
+ %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<16xi32>) -> tensor<16x1xi32>
20
+ %4 = tt.splat %1 : (i32) -> tensor<16x1xi32>
21
+ %5 = arith.addi %4, %3 : tensor<16x1xi32>
22
+ %6 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
23
+ %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<256xi32>) -> tensor<1x256xi32>
24
+ %8 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<16x1x!tt.ptr<i64, 1>>
25
+ %9 = tt.addptr %8, %5 : tensor<16x1x!tt.ptr<i64, 1>>, tensor<16x1xi32>
26
+ %10 = tt.load %9 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x1xi64>
27
+ %11 = arith.remsi %5, %cst_10 : tensor<16x1xi32>
28
+ %12 = arith.cmpi slt, %7, %cst_9 : tensor<1x256xi32>
29
+ %13 = arith.muli %11, %cst_8 : tensor<16x1xi32>
30
+ %14 = tt.broadcast %7 : (tensor<1x256xi32>) -> tensor<16x256xi32>
31
+ %15 = tt.broadcast %13 : (tensor<16x1xi32>) -> tensor<16x256xi32>
32
+ %16 = arith.addi %14, %15 : tensor<16x256xi32>
33
+ %17 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<16x256x!tt.ptr<f32, 1>>
34
+ %18 = tt.addptr %17, %16 : tensor<16x256x!tt.ptr<f32, 1>>, tensor<16x256xi32>
35
+ %19 = tt.broadcast %12 : (tensor<1x256xi1>) -> tensor<16x256xi1>
36
+ %20 = tt.load %18, %19, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x256xf32>
37
+ %21 = arith.addi %10, %cst_3 : tensor<16x1xi64>
38
+ %22 = arith.cmpi slt, %10, %cst_4 : tensor<16x1xi64>
39
+ %23 = arith.select %22, %21, %10 : tensor<16x1xi1>, tensor<16x1xi64>
40
+ %24 = arith.cmpi sge, %23, %cst_4 : tensor<16x1xi64>
41
+ %25 = arith.cmpi slt, %23, %cst_3 : tensor<16x1xi64>
42
+ %26 = arith.andi %24, %25 : tensor<16x1xi1>
43
+ tt.assert %26, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<16x1xi1>
44
+ %27 = arith.muli %23, %cst_2 : tensor<16x1xi64>
45
+ %28 = tt.broadcast %27 : (tensor<16x1xi64>) -> tensor<16x256xi64>
46
+ %29 = arith.extsi %7 : tensor<1x256xi32> to tensor<1x256xi64>
47
+ %30 = tt.broadcast %29 : (tensor<1x256xi64>) -> tensor<16x256xi64>
48
+ %31 = arith.addi %30, %28 : tensor<16x256xi64>
49
+ %32 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<16x256x!tt.ptr<f32, 1>>
50
+ %33 = tt.addptr %32, %31 : tensor<16x256x!tt.ptr<f32, 1>>, tensor<16x256xi64>
51
+ %34 = tt.load %33, %19, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x256xf32>
52
+ %35 = arith.addf %34, %20 : tensor<16x256xf32>
53
+ %36 = arith.addf %35, %cst_7 : tensor<16x256xf32>
54
+ %37 = arith.subf %35, %36 : tensor<16x256xf32>
55
+ %38 = arith.mulf %35, %37 : tensor<16x256xf32>
56
+ %39 = arith.addf %38, %cst_7 : tensor<16x256xf32>
57
+ %40 = arith.select %19, %36, %cst_7 : tensor<16x256xi1>, tensor<16x256xf32>
58
+ %41 = arith.select %19, %39, %cst_7 : tensor<16x256xi1>, tensor<16x256xf32>
59
+ %42 = arith.select %12, %cst, %cst_0 : tensor<1x256xi1>, tensor<1x256xf32>
60
+ %43 = tt.broadcast %42 : (tensor<1x256xf32>) -> tensor<16x256xf32>
61
+ %44:3 = "tt.reduce"(%40, %41, %43) <{axis = 1 : i32}> ({
62
+ ^bb0(%arg7: f32, %arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32):
63
+ %68 = arith.subf %arg10, %arg7 : f32
64
+ %69 = arith.addf %arg9, %arg12 : f32
65
+ %70 = arith.cmpf oeq, %69, %cst_1 : f32
66
+ %71 = arith.divf %arg12, %69 : f32
67
+ %72 = arith.select %70, %cst_1, %71 : f32
68
+ %73 = arith.mulf %68, %72 : f32
69
+ %74 = arith.addf %arg7, %73 : f32
70
+ %75 = arith.addf %arg8, %arg11 : f32
71
+ %76 = arith.mulf %68, %68 : f32
72
+ %77 = arith.mulf %76, %arg9 : f32
73
+ %78 = arith.mulf %77, %72 : f32
74
+ %79 = arith.addf %75, %78 : f32
75
+ tt.reduce.return %74, %79, %69 : f32, f32, f32
76
+ }) : (tensor<16x256xf32>, tensor<16x256xf32>, tensor<16x256xf32>) -> (tensor<16xf32>, tensor<16xf32>, tensor<16xf32>)
77
+ %45 = tt.expand_dims %44#0 {axis = 1 : i32} : (tensor<16xf32>) -> tensor<16x1xf32>
78
+ %46 = tt.expand_dims %44#1 {axis = 1 : i32} : (tensor<16xf32>) -> tensor<16x1xf32>
79
+ %47 = tt.load %18, %19, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x256xf32>
80
+ %48 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<1x256x!tt.ptr<f32, 1>>
81
+ %49 = tt.addptr %48, %7 : tensor<1x256x!tt.ptr<f32, 1>>, tensor<1x256xi32>
82
+ %50 = tt.load %49, %12, %cst_0 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32>
83
+ tt.assert %26, "index out of bounds: 0 <= tmp13 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<16x1xi1>
84
+ %51 = tt.load %33, %19, %cst_7 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<16x256xf32>
85
+ %52 = arith.addf %51, %47 : tensor<16x256xf32>
86
+ %53 = tt.broadcast %45 : (tensor<16x1xf32>) -> tensor<16x256xf32>
87
+ %54 = arith.subf %52, %53 : tensor<16x256xf32>
88
+ %55 = arith.divf %46, %cst_6 : tensor<16x1xf32>
89
+ %56 = arith.addf %55, %cst_5 : tensor<16x1xf32>
90
+ %57 = tt.extern_elementwise %56 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<16x1xf32>) -> tensor<16x1xf32>
91
+ %58 = tt.broadcast %57 : (tensor<16x1xf32>) -> tensor<16x256xf32>
92
+ %59 = arith.mulf %54, %58 : tensor<16x256xf32>
93
+ %60 = tt.broadcast %50 : (tensor<1x256xf32>) -> tensor<16x256xf32>
94
+ %61 = arith.mulf %59, %60 : tensor<16x256xf32>
95
+ %62 = arith.muli %5, %cst_8 : tensor<16x1xi32>
96
+ %63 = tt.broadcast %62 : (tensor<16x1xi32>) -> tensor<16x256xi32>
97
+ %64 = arith.addi %14, %63 : tensor<16x256xi32>
98
+ %65 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<16x256x!tt.ptr<bf16, 1>>
99
+ %66 = tt.addptr %65, %64 : tensor<16x256x!tt.ptr<bf16, 1>>, tensor<16x256xi32>
100
+ %67 = arith.truncf %61 : tensor<16x256xf32> to tensor<16x256xbf16>
101
+ tt.store %66, %67, %19 {cache = 1 : i32, evict = 1 : i32} : tensor<16x256xbf16>
102
+ tt.return
103
+ }
104
+ }
.triton/dump/3cd3b6d7993c56f7d0340d40c84f737c/triton_.llir ADDED
@@ -0,0 +1,355 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @assertFunc_1 = internal constant [25 x i8] c"_call_with_frames_removed"
5
+ @assertFile_1 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
6
+ @assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp16 < 50257"
7
+ @assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed"
8
+ @assertFile_0 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
9
+ @assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257"
10
+ @.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
11
+
12
+ declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr
13
+
14
+ define void @triton__0d1d2d3d4d5d6de7de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, i32 %6, i32 %7) local_unnamed_addr !dbg !7 {
15
+ %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
16
+ %10 = lshr i32 %9, 2, !dbg !10
17
+ %11 = and i32 %10, 63, !dbg !10
18
+ %12 = and i32 %9, 63, !dbg !10
19
+ %13 = and i32 %9, 3, !dbg !11
20
+ %14 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #5, !dbg !12
21
+ %15 = shl i32 %14, 6, !dbg !13
22
+ %16 = or i32 %15, %11, !dbg !14
23
+ %17 = or i32 %15, %12, !dbg !14
24
+ %18 = sext i32 %16 to i64, !dbg !15
25
+ %19 = getelementptr i64, ptr addrspace(1) %0, i64 %18, !dbg !15
26
+ %20 = sext i32 %17 to i64, !dbg !15
27
+ %21 = getelementptr i64, ptr addrspace(1) %0, i64 %20, !dbg !15
28
+ %22 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %19, i1 true) #5, !dbg !16
29
+ %23 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %21, i1 true) #5, !dbg !16
30
+ %24 = srem i32 %16, 512, !dbg !17
31
+ %25 = shl nsw i32 %24, 8, !dbg !18
32
+ %26 = shl i32 %16, 8, !dbg !19
33
+ %27 = add i64 %23, 50257, !dbg !20
34
+ %28 = icmp slt i64 %22, 0, !dbg !21
35
+ %29 = icmp slt i64 %23, 0, !dbg !21
36
+ %30 = select i1 %29, i64 %27, i64 %23, !dbg !22
37
+ %.fr8 = freeze i64 %30, !dbg !23
38
+ %31 = icmp ugt i64 %.fr8, 50256, !dbg !23
39
+ %32 = shl i64 %22, 8, !dbg !24
40
+ %33 = add i64 %32, 12865792, !dbg !24
41
+ %34 = select i1 %28, i64 %33, i64 %32, !dbg !24
42
+ %35 = getelementptr float, ptr addrspace(1) %1, i64 %34
43
+ br i1 %31, label %.split.us, label %.split, !dbg !25
44
+
45
+ .split.us: ; preds = %8, %.split.us
46
+ %36 = phi float [ %58, %.split.us ], [ 0.000000e+00, %8 ]
47
+ %37 = phi float [ %63, %.split.us ], [ 0.000000e+00, %8 ]
48
+ %38 = phi float [ %60, %.split.us ], [ 0.000000e+00, %8 ]
49
+ %39 = phi i32 [ %64, %.split.us ], [ 0, %8 ]
50
+ %40 = or i32 %39, %13, !dbg !26
51
+ %41 = add i32 %40, %25, !dbg !27
52
+ %42 = sext i32 %41 to i64, !dbg !28
53
+ %43 = getelementptr float, ptr addrspace(1) %2, i64 %42, !dbg !28
54
+ %44 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %43, i1 true, i32 0, i1 true) #5, !dbg !29
55
+ %45 = bitcast i32 %44 to float, !dbg !29
56
+ %46 = add i32 %40, %26, !dbg !30
57
+ %47 = sext i32 %46 to i64, !dbg !31
58
+ %48 = getelementptr i16, ptr addrspace(1) %3, i64 %47, !dbg !31
59
+ %49 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %48, i1 true, i16 0, i1 true) #5, !dbg !32
60
+ %50 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %49) #5, !dbg !33
61
+ tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !25
62
+ %51 = zext nneg i32 %40 to i64, !dbg !34
63
+ %52 = getelementptr float, ptr addrspace(1) %35, i64 %51, !dbg !35
64
+ %53 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %52, i1 true, i32 0, i1 true) #5, !dbg !36
65
+ %54 = bitcast i32 %53 to float, !dbg !36
66
+ %55 = fadd float %45, %54, !dbg !37
67
+ %56 = fadd float %50, %55, !dbg !38
68
+ %57 = fsub float %56, %38, !dbg !39
69
+ %58 = fadd float %36, 1.000000e+00, !dbg !43
70
+ %59 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %57, float %58) #5, !dbg !44
71
+ %60 = fadd float %38, %59, !dbg !45
72
+ %61 = fsub float %56, %60, !dbg !46
73
+ %62 = fmul float %57, %61, !dbg !47
74
+ %63 = fadd float %37, %62, !dbg !48
75
+ %64 = add nuw nsw i32 %39, 4, !dbg !49
76
+ %65 = icmp ult i32 %39, 252, !dbg !49
77
+ br i1 %65, label %.split.us, label %.split5.us, !dbg !49
78
+
79
+ .split: ; preds = %8, %.split
80
+ %66 = phi float [ %88, %.split ], [ 0.000000e+00, %8 ]
81
+ %67 = phi float [ %93, %.split ], [ 0.000000e+00, %8 ]
82
+ %68 = phi float [ %90, %.split ], [ 0.000000e+00, %8 ]
83
+ %69 = phi i32 [ %94, %.split ], [ 0, %8 ]
84
+ %70 = or i32 %69, %13, !dbg !26
85
+ %71 = add i32 %70, %25, !dbg !27
86
+ %72 = sext i32 %71 to i64, !dbg !28
87
+ %73 = getelementptr float, ptr addrspace(1) %2, i64 %72, !dbg !28
88
+ %74 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %73, i1 true, i32 0, i1 true) #5, !dbg !29
89
+ %75 = bitcast i32 %74 to float, !dbg !29
90
+ %76 = add i32 %70, %26, !dbg !30
91
+ %77 = sext i32 %76 to i64, !dbg !31
92
+ %78 = getelementptr i16, ptr addrspace(1) %3, i64 %77, !dbg !31
93
+ %79 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %78, i1 true, i16 0, i1 true) #5, !dbg !32
94
+ %80 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %79) #5, !dbg !33
95
+ %81 = zext nneg i32 %70 to i64, !dbg !34
96
+ %82 = getelementptr float, ptr addrspace(1) %35, i64 %81, !dbg !35
97
+ %83 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %82, i1 true, i32 0, i1 true) #5, !dbg !36
98
+ %84 = bitcast i32 %83 to float, !dbg !36
99
+ %85 = fadd float %75, %84, !dbg !37
100
+ %86 = fadd float %80, %85, !dbg !38
101
+ %87 = fsub float %86, %68, !dbg !39
102
+ %88 = fadd float %66, 1.000000e+00, !dbg !43
103
+ %89 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %87, float %88) #5, !dbg !44
104
+ %90 = fadd float %68, %89, !dbg !45
105
+ %91 = fsub float %86, %90, !dbg !46
106
+ %92 = fmul float %87, %91, !dbg !47
107
+ %93 = fadd float %67, %92, !dbg !48
108
+ %94 = add nuw nsw i32 %69, 4, !dbg !49
109
+ %95 = icmp ult i32 %69, 252, !dbg !49
110
+ br i1 %95, label %.split, label %.split5.us, !dbg !49
111
+
112
+ .split5.us: ; preds = %.split, %.split.us
113
+ %.us-phi = phi float [ %60, %.split.us ], [ %90, %.split ]
114
+ %.us-phi6 = phi float [ %63, %.split.us ], [ %93, %.split ]
115
+ %.us-phi7 = phi float [ %58, %.split.us ], [ %88, %.split ]
116
+ %96 = bitcast float %.us-phi to i32, !dbg !50
117
+ %97 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %96, i32 2, i32 31), !dbg !50
118
+ %98 = bitcast i32 %97 to float, !dbg !50
119
+ %99 = bitcast float %.us-phi6 to i32, !dbg !50
120
+ %100 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %99, i32 2, i32 31), !dbg !50
121
+ %101 = bitcast i32 %100 to float, !dbg !50
122
+ %102 = bitcast float %.us-phi7 to i32, !dbg !50
123
+ %103 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %102, i32 2, i32 31), !dbg !50
124
+ %104 = bitcast i32 %103 to float, !dbg !50
125
+ %105 = fsub float %98, %.us-phi, !dbg !52
126
+ %106 = fadd float %.us-phi7, %104, !dbg !56
127
+ %107 = fcmp oeq float %106, 0.000000e+00, !dbg !57
128
+ %108 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %104, float %106) #5, !dbg !58
129
+ %109 = select i1 %107, float 0.000000e+00, float %108, !dbg !59
130
+ %110 = fmul float %105, %109, !dbg !60
131
+ %111 = fadd float %.us-phi, %110, !dbg !61
132
+ %112 = fadd float %.us-phi6, %101, !dbg !62
133
+ %113 = fmul float %105, %105, !dbg !63
134
+ %114 = fmul float %.us-phi7, %113, !dbg !64
135
+ %115 = fmul float %114, %109, !dbg !65
136
+ %116 = fadd float %112, %115, !dbg !66
137
+ %117 = bitcast float %111 to i32, !dbg !50
138
+ %118 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %117, i32 1, i32 31), !dbg !50
139
+ %119 = bitcast i32 %118 to float, !dbg !50
140
+ %120 = bitcast float %116 to i32, !dbg !50
141
+ %121 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %120, i32 1, i32 31), !dbg !50
142
+ %122 = bitcast i32 %121 to float, !dbg !50
143
+ %123 = bitcast float %106 to i32, !dbg !50
144
+ %124 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %123, i32 1, i32 31), !dbg !50
145
+ %125 = bitcast i32 %124 to float, !dbg !50
146
+ %126 = fsub float %119, %111, !dbg !52
147
+ %127 = fadd float %106, %125, !dbg !56
148
+ %128 = fcmp oeq float %127, 0.000000e+00, !dbg !57
149
+ %129 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %125, float %127) #5, !dbg !58
150
+ %130 = select i1 %128, float 0.000000e+00, float %129, !dbg !59
151
+ %131 = fmul float %126, %130, !dbg !60
152
+ %132 = fadd float %111, %131, !dbg !61
153
+ %133 = fadd float %116, %122, !dbg !62
154
+ %134 = fmul float %126, %126, !dbg !63
155
+ %135 = fmul float %106, %134, !dbg !64
156
+ %136 = fmul float %130, %135, !dbg !65
157
+ %137 = fadd float %133, %136, !dbg !66
158
+ %138 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %137, float 2.560000e+02) #5, !dbg !67
159
+ %139 = fadd float %138, 0x3EE4F8B580000000, !dbg !68
160
+ br label %140, !dbg !69
161
+
162
+ 140: ; preds = %.split5.us, %__nv_rsqrtf.exit
163
+ %141 = phi i32 [ 0, %.split5.us ], [ %174, %__nv_rsqrtf.exit ]
164
+ %142 = or i32 %141, %13, !dbg !70
165
+ %143 = add i32 %142, %25, !dbg !71
166
+ %144 = sext i32 %143 to i64, !dbg !72
167
+ %145 = getelementptr float, ptr addrspace(1) %2, i64 %144, !dbg !72
168
+ %146 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %145, i1 true, i32 0, i1 true) #5, !dbg !73
169
+ %147 = bitcast i32 %146 to float, !dbg !73
170
+ %148 = add i32 %142, %26, !dbg !74
171
+ %149 = sext i32 %148 to i64, !dbg !75
172
+ %150 = getelementptr i16, ptr addrspace(1) %3, i64 %149, !dbg !75
173
+ %151 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %150, i1 true, i16 0, i1 true) #5, !dbg !76
174
+ %152 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %151) #5, !dbg !77
175
+ %153 = zext nneg i32 %142 to i64, !dbg !78
176
+ %154 = getelementptr float, ptr addrspace(1) %4, i64 %153, !dbg !78
177
+ %155 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %154, i1 true, i32 0, i1 true) #5, !dbg !79
178
+ %156 = bitcast i32 %155 to float, !dbg !79
179
+ br i1 %31, label %157, label %158, !dbg !80
180
+
181
+ 157: ; preds = %140
182
+ tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 883, ptr nonnull @assertFunc_1, i64 1), !dbg !80
183
+ br label %158, !dbg !80
184
+
185
+ 158: ; preds = %157, %140
186
+ %159 = getelementptr float, ptr addrspace(1) %35, i64 %153, !dbg !81
187
+ %160 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %159, i1 true, i32 0, i1 true) #5, !dbg !82
188
+ %161 = bitcast i32 %160 to float, !dbg !82
189
+ %162 = fadd float %147, %161, !dbg !83
190
+ %163 = fadd float %152, %162, !dbg !84
191
+ %164 = fsub float %163, %132, !dbg !85
192
+ %165 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !86
193
+ %.not.i = icmp eq i32 %165, 0, !dbg !86
194
+ br i1 %.not.i, label %168, label %166, !dbg !86
195
+
196
+ 166: ; preds = %158
197
+ %167 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %139), !dbg !86
198
+ br label %__nv_rsqrtf.exit, !dbg !86
199
+
200
+ 168: ; preds = %158
201
+ %169 = tail call float @llvm.nvvm.rsqrt.approx.f(float %139), !dbg !86
202
+ br label %__nv_rsqrtf.exit, !dbg !86
203
+
204
+ __nv_rsqrtf.exit: ; preds = %166, %168
205
+ %.0.i = phi float [ %167, %166 ], [ %169, %168 ], !dbg !86
206
+ %170 = fmul float %164, %.0.i, !dbg !87
207
+ %171 = fmul float %170, %156, !dbg !88
208
+ %172 = getelementptr i16, ptr addrspace(1) %5, i64 %149, !dbg !89
209
+ %173 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %171) #5, !dbg !90
210
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %173, ptr addrspace(1) %172, i1 true) #5, !dbg !90
211
+ %174 = add nuw nsw i32 %141, 4, !dbg !69
212
+ %175 = icmp ult i32 %141, 252, !dbg !69
213
+ br i1 %175, label %140, label %176, !dbg !69
214
+
215
+ 176: ; preds = %__nv_rsqrtf.exit
216
+ ret void, !dbg !91
217
+ }
218
+
219
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
220
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
221
+
222
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
223
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
224
+
225
+ ; Function Attrs: alwaysinline nounwind
226
+ define float @__nv_rsqrtf(float %x) local_unnamed_addr #2 {
227
+ %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5
228
+ %.not = icmp eq i32 %1, 0
229
+ br i1 %.not, label %4, label %2
230
+
231
+ 2: ; preds = %0
232
+ %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
233
+ br label %6
234
+
235
+ 4: ; preds = %0
236
+ %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
237
+ br label %6
238
+
239
+ 6: ; preds = %4, %2
240
+ %.0 = phi float [ %3, %2 ], [ %5, %4 ]
241
+ ret float %.0
242
+ }
243
+
244
+ declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #3
245
+
246
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
247
+ declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #4
248
+
249
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
250
+ declare float @llvm.nvvm.rsqrt.approx.f(float) #4
251
+
252
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
253
+ attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
254
+ attributes #2 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
255
+ attributes #3 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
256
+ attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
257
+ attributes #5 = { nounwind }
258
+
259
+ !llvm.module.flags = !{!0, !1}
260
+ !llvm.dbg.cu = !{!2}
261
+ !nvvm.annotations = !{!4, !5, !5, !4}
262
+ !llvm.ident = !{!6}
263
+
264
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
265
+ !1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
266
+ !2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
267
+ !3 = !DIFile(filename: "cpn3lawg65lpi63gv6c6pn4oikhg6qva2h2qjdpxe6qj4lvttwez.py", directory: "/tmp/torchinductor_root/pn")
268
+ !4 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"kernel", i32 1}
269
+ !5 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"maxntidx", i32 256}
270
+ !6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
271
+ !7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6de7de", linkageName: "triton__0d1d2d3d4d5d6de7de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
272
+ !8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
273
+ !9 = !{}
274
+ !10 = !DILocation(line: 22, column: 44, scope: !7)
275
+ !11 = !DILocation(line: 24, column: 33, scope: !7)
276
+ !12 = !DILocation(line: 21, column: 28, scope: !7)
277
+ !13 = !DILocation(line: 21, column: 33, scope: !7)
278
+ !14 = !DILocation(line: 22, column: 23, scope: !7)
279
+ !15 = !DILocation(line: 26, column: 30, scope: !7)
280
+ !16 = !DILocation(line: 26, column: 35, scope: !7)
281
+ !17 = !DILocation(line: 27, column: 18, scope: !7)
282
+ !18 = !DILocation(line: 35, column: 44, scope: !7)
283
+ !19 = !DILocation(line: 36, column: 44, scope: !7)
284
+ !20 = !DILocation(line: 37, column: 22, scope: !7)
285
+ !21 = !DILocation(line: 38, column: 22, scope: !7)
286
+ !22 = !DILocation(line: 39, column: 36, scope: !7)
287
+ !23 = !DILocation(line: 40, column: 40, scope: !7)
288
+ !24 = !DILocation(line: 41, column: 44, scope: !7)
289
+ !25 = !DILocation(line: 40, column: 55, scope: !7)
290
+ !26 = !DILocation(line: 32, column: 27, scope: !7)
291
+ !27 = !DILocation(line: 35, column: 40, scope: !7)
292
+ !28 = !DILocation(line: 35, column: 34, scope: !7)
293
+ !29 = !DILocation(line: 35, column: 50, scope: !7)
294
+ !30 = !DILocation(line: 36, column: 40, scope: !7)
295
+ !31 = !DILocation(line: 36, column: 34, scope: !7)
296
+ !32 = !DILocation(line: 36, column: 50, scope: !7)
297
+ !33 = !DILocation(line: 36, column: 101, scope: !7)
298
+ !34 = !DILocation(line: 41, column: 40, scope: !7)
299
+ !35 = !DILocation(line: 41, column: 34, scope: !7)
300
+ !36 = !DILocation(line: 41, column: 52, scope: !7)
301
+ !37 = !DILocation(line: 42, column: 22, scope: !7)
302
+ !38 = !DILocation(line: 44, column: 22, scope: !7)
303
+ !39 = !DILocation(line: 96, column: 20, scope: !40, inlinedAt: !42)
304
+ !40 = distinct !DILexicalBlockFile(scope: !7, file: !41, discriminator: 0)
305
+ !41 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
306
+ !42 = !DILocation(line: 47, column: 41, scope: !40)
307
+ !43 = !DILocation(line: 97, column: 26, scope: !40, inlinedAt: !42)
308
+ !44 = !DILocation(line: 98, column: 30, scope: !40, inlinedAt: !42)
309
+ !45 = !DILocation(line: 98, column: 22, scope: !40, inlinedAt: !42)
310
+ !46 = !DILocation(line: 101, column: 30, scope: !40, inlinedAt: !42)
311
+ !47 = !DILocation(line: 101, column: 22, scope: !40, inlinedAt: !42)
312
+ !48 = !DILocation(line: 50, column: 50, scope: !7)
313
+ !49 = !DILocation(line: 31, column: 36, scope: !7)
314
+ !50 = !DILocation(line: 120, column: 46, scope: !40, inlinedAt: !51)
315
+ !51 = !DILocation(line: 53, column: 44, scope: !40)
316
+ !52 = !DILocation(line: 108, column: 21, scope: !53, inlinedAt: !54)
317
+ !53 = distinct !DILexicalBlockFile(scope: !40, file: !41, discriminator: 0)
318
+ !54 = !DILocation(line: 120, column: 46, scope: !53, inlinedAt: !55)
319
+ !55 = !DILocation(line: 53, column: 44, scope: !53)
320
+ !56 = !DILocation(line: 109, column: 28, scope: !53, inlinedAt: !54)
321
+ !57 = !DILocation(line: 110, column: 39, scope: !53, inlinedAt: !54)
322
+ !58 = !DILocation(line: 110, column: 60, scope: !53, inlinedAt: !54)
323
+ !59 = !DILocation(line: 110, column: 49, scope: !53, inlinedAt: !54)
324
+ !60 = !DILocation(line: 112, column: 25, scope: !53, inlinedAt: !54)
325
+ !61 = !DILocation(line: 112, column: 17, scope: !53, inlinedAt: !54)
326
+ !62 = !DILocation(line: 113, column: 15, scope: !53, inlinedAt: !54)
327
+ !63 = !DILocation(line: 113, column: 30, scope: !53, inlinedAt: !54)
328
+ !64 = !DILocation(line: 113, column: 38, scope: !53, inlinedAt: !54)
329
+ !65 = !DILocation(line: 113, column: 49, scope: !53, inlinedAt: !54)
330
+ !66 = !DILocation(line: 113, column: 22, scope: !53, inlinedAt: !54)
331
+ !67 = !DILocation(line: 75, column: 24, scope: !7)
332
+ !68 = !DILocation(line: 77, column: 24, scope: !7)
333
+ !69 = !DILocation(line: 58, column: 36, scope: !7)
334
+ !70 = !DILocation(line: 59, column: 27, scope: !7)
335
+ !71 = !DILocation(line: 62, column: 41, scope: !7)
336
+ !72 = !DILocation(line: 62, column: 35, scope: !7)
337
+ !73 = !DILocation(line: 62, column: 51, scope: !7)
338
+ !74 = !DILocation(line: 63, column: 41, scope: !7)
339
+ !75 = !DILocation(line: 63, column: 35, scope: !7)
340
+ !76 = !DILocation(line: 63, column: 51, scope: !7)
341
+ !77 = !DILocation(line: 63, column: 103, scope: !7)
342
+ !78 = !DILocation(line: 64, column: 35, scope: !7)
343
+ !79 = !DILocation(line: 64, column: 40, scope: !7)
344
+ !80 = !DILocation(line: 68, column: 57, scope: !7)
345
+ !81 = !DILocation(line: 69, column: 35, scope: !7)
346
+ !82 = !DILocation(line: 69, column: 54, scope: !7)
347
+ !83 = !DILocation(line: 70, column: 24, scope: !7)
348
+ !84 = !DILocation(line: 72, column: 24, scope: !7)
349
+ !85 = !DILocation(line: 73, column: 24, scope: !7)
350
+ !86 = !DILocation(line: 78, column: 30, scope: !7)
351
+ !87 = !DILocation(line: 79, column: 24, scope: !7)
352
+ !88 = !DILocation(line: 80, column: 24, scope: !7)
353
+ !89 = !DILocation(line: 82, column: 29, scope: !7)
354
+ !90 = !DILocation(line: 82, column: 52, scope: !7)
355
+ !91 = !DILocation(line: 58, column: 4, scope: !7)
.triton/dump/4710f23a3addbad00b260d7a02366fe0/triton_.ttgir ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
2
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
3
+ tt.func public @triton__0d1d2d34e(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg3: i32, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} {
4
+ %cst = arith.constant dense<0> : tensor<1x8xi64, #blocked>
5
+ %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x8xf32, #blocked>
6
+ %cst_1 = arith.constant dense<8> : tensor<1x8xi32, #blocked>
7
+ %c0_i32 = arith.constant 0 : i32
8
+ %0 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
9
+ %1 = tt.expand_dims %0 {axis = 0 : i32} : (tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x8xi32, #blocked>
10
+ %2 = arith.cmpi slt, %1, %cst_1 : tensor<1x8xi32, #blocked>
11
+ %3 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<1x8x!tt.ptr<f32, 1>, #blocked>
12
+ %4 = tt.addptr %3, %1 : tensor<1x8x!tt.ptr<f32, 1>, #blocked>, tensor<1x8xi32, #blocked>
13
+ %5 = tt.load %4, %2, %cst_0 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x8xf32, #blocked>
14
+ %6 = tt.splat %arg2 : (!tt.ptr<i64, 1>) -> tensor<1x8x!tt.ptr<i64, 1>, #blocked>
15
+ %7 = tt.addptr %6, %1 : tensor<1x8x!tt.ptr<i64, 1>, #blocked>, tensor<1x8xi32, #blocked>
16
+ %8 = tt.load %7, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x8xi64, #blocked>
17
+ %9 = arith.select %2, %5, %cst_0 : tensor<1x8xi1, #blocked>, tensor<1x8xf32, #blocked>
18
+ %10 = "tt.reduce"(%9) <{axis = 1 : i32}> ({
19
+ ^bb0(%arg5: f32, %arg6: f32):
20
+ %19 = arith.addf %arg5, %arg6 : f32
21
+ tt.reduce.return %19 : f32
22
+ }) : (tensor<1x8xf32, #blocked>) -> tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
23
+ %11 = tt.expand_dims %10 {axis = 1 : i32} : (tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<1x1xf32, #blocked>
24
+ %12 = arith.select %2, %8, %cst : tensor<1x8xi1, #blocked>, tensor<1x8xi64, #blocked>
25
+ %13 = "tt.reduce"(%12) <{axis = 1 : i32}> ({
26
+ ^bb0(%arg5: i64, %arg6: i64):
27
+ %19 = arith.addi %arg5, %arg6 : i64
28
+ tt.reduce.return %19 : i64
29
+ }) : (tensor<1x8xi64, #blocked>) -> tensor<1xi64, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
30
+ %14 = tt.expand_dims %13 {axis = 1 : i32} : (tensor<1xi64, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<1x1xi64, #blocked>
31
+ %15 = arith.sitofp %14 : tensor<1x1xi64, #blocked> to tensor<1x1xf32, #blocked>
32
+ %16 = arith.divf %11, %15 : tensor<1x1xf32, #blocked>
33
+ gpu.barrier
34
+ %17 = tt.addptr %arg0, %c0_i32 : !tt.ptr<f32, 1>, i32
35
+ %18 = tt.splat %17 : (!tt.ptr<f32, 1>) -> tensor<1x1x!tt.ptr<f32, 1>, #blocked>
36
+ tt.store %18, %16 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xf32, #blocked>
37
+ tt.return
38
+ }
39
+ }
.triton/dump/53075505618c3af0ef6ce61f3300cdcb/triton_.cubin ADDED
Binary file (73.7 kB). View file
 
.triton/dump/53075505618c3af0ef6ce61f3300cdcb/triton_.ptx ADDED
@@ -0,0 +1,2004 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3d4d5d6de7de
10
+ .extern .func __assertfail
11
+ (
12
+ .param .b64 __assertfail_param_0,
13
+ .param .b64 __assertfail_param_1,
14
+ .param .b32 __assertfail_param_2,
15
+ .param .b64 __assertfail_param_3,
16
+ .param .b64 __assertfail_param_4
17
+ )
18
+ ;
19
+ .global .align 1 .b8 assertFunc_1[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
20
+ .global .align 1 .b8 assertFile_1[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
21
+ .global .align 1 .b8 assertMessage_1[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 54, 32, 60, 32, 53, 48, 50, 53, 55};
22
+ .global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
23
+ .global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
24
+ .global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55};
25
+ .extern .shared .align 1 .b8 global_smem[];
26
+ .global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
27
+
28
+ .visible .entry triton__0d1d2d3d4d5d6de7de(
29
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_0,
30
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_1,
31
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_2,
32
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_3,
33
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_4,
34
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_5,
35
+ .param .u32 triton__0d1d2d3d4d5d6de7de_param_6,
36
+ .param .u32 triton__0d1d2d3d4d5d6de7de_param_7
37
+ )
38
+ .maxntid 256, 1, 1
39
+ {
40
+ .reg .pred %p<157>;
41
+ .reg .b16 %rs<49>;
42
+ .reg .b32 %r<474>;
43
+ .reg .f32 %f<678>;
44
+ .reg .b64 %rd<118>;
45
+ .loc 1 18 0
46
+ $L__func_begin0:
47
+ .loc 1 18 0
48
+
49
+ ld.param.u64 %rd16, [triton__0d1d2d3d4d5d6de7de_param_5];
50
+ ld.param.u64 %rd15, [triton__0d1d2d3d4d5d6de7de_param_4];
51
+ ld.param.u64 %rd14, [triton__0d1d2d3d4d5d6de7de_param_3];
52
+ ld.param.u64 %rd52, [triton__0d1d2d3d4d5d6de7de_param_0];
53
+ ld.param.u64 %rd53, [triton__0d1d2d3d4d5d6de7de_param_1];
54
+ $L__tmp0:
55
+ .loc 1 22 44
56
+ mov.u32 %r12, %tid.x;
57
+ ld.param.u64 %rd54, [triton__0d1d2d3d4d5d6de7de_param_2];
58
+ bfe.u32 %r1, %r12, 3, 5;
59
+ and.b32 %r2, %r12, 63;
60
+ .loc 1 24 33
61
+ shl.b32 %r13, %r12, 3;
62
+ and.b32 %r3, %r13, 56;
63
+ .loc 1 31 36
64
+ shr.u32 %r4, %r12, 6;
65
+ .loc 1 21 28
66
+ mov.u32 %r10, %ctaid.x;
67
+ .loc 1 21 33
68
+ shl.b32 %r14, %r10, 6;
69
+ .loc 1 22 23
70
+ or.b32 %r15, %r14, %r1;
71
+ or.b32 %r16, %r15, 32;
72
+ or.b32 %r17, %r14, %r2;
73
+ .loc 1 26 30
74
+ mul.wide.s32 %rd55, %r15, 8;
75
+ add.s64 %rd18, %rd52, %rd55;
76
+ add.s64 %rd34, %rd18, 256;
77
+ mul.wide.s32 %rd56, %r17, 8;
78
+ add.s64 %rd50, %rd52, %rd56;
79
+ mov.pred %p1, -1;
80
+ .loc 1 26 35
81
+ mov.u64 %rd17, 0x0;
82
+ @%p1 ld.global.L1::evict_last.b64 { %rd17 }, [ %rd18 + 0 ];
83
+ mov.u64 %rd19, 0x0;
84
+ @%p1 ld.global.L1::evict_last.b64 { %rd19 }, [ %rd18 + 0 ];
85
+ mov.u64 %rd21, 0x0;
86
+ @%p1 ld.global.L1::evict_last.b64 { %rd21 }, [ %rd18 + 0 ];
87
+ mov.u64 %rd23, 0x0;
88
+ @%p1 ld.global.L1::evict_last.b64 { %rd23 }, [ %rd18 + 0 ];
89
+ mov.u64 %rd25, 0x0;
90
+ @%p1 ld.global.L1::evict_last.b64 { %rd25 }, [ %rd18 + 0 ];
91
+ mov.u64 %rd27, 0x0;
92
+ @%p1 ld.global.L1::evict_last.b64 { %rd27 }, [ %rd18 + 0 ];
93
+ mov.u64 %rd29, 0x0;
94
+ @%p1 ld.global.L1::evict_last.b64 { %rd29 }, [ %rd18 + 0 ];
95
+ mov.u64 %rd31, 0x0;
96
+ @%p1 ld.global.L1::evict_last.b64 { %rd31 }, [ %rd18 + 0 ];
97
+ mov.u64 %rd33, 0x0;
98
+ @%p1 ld.global.L1::evict_last.b64 { %rd33 }, [ %rd34 + 0 ];
99
+ mov.u64 %rd35, 0x0;
100
+ @%p1 ld.global.L1::evict_last.b64 { %rd35 }, [ %rd34 + 0 ];
101
+ mov.u64 %rd37, 0x0;
102
+ @%p1 ld.global.L1::evict_last.b64 { %rd37 }, [ %rd34 + 0 ];
103
+ mov.u64 %rd39, 0x0;
104
+ @%p1 ld.global.L1::evict_last.b64 { %rd39 }, [ %rd34 + 0 ];
105
+ mov.u64 %rd41, 0x0;
106
+ @%p1 ld.global.L1::evict_last.b64 { %rd41 }, [ %rd34 + 0 ];
107
+ mov.u64 %rd43, 0x0;
108
+ @%p1 ld.global.L1::evict_last.b64 { %rd43 }, [ %rd34 + 0 ];
109
+ mov.u64 %rd45, 0x0;
110
+ @%p1 ld.global.L1::evict_last.b64 { %rd45 }, [ %rd34 + 0 ];
111
+ mov.u64 %rd47, 0x0;
112
+ @%p1 ld.global.L1::evict_last.b64 { %rd47 }, [ %rd34 + 0 ];
113
+ mov.u64 %rd49, 0x0;
114
+ @%p1 ld.global.L1::evict_last.b64 { %rd49 }, [ %rd50 + 0 ];
115
+ .loc 1 27 18
116
+ bfe.s32 %r18, %r10, 25, 1;
117
+ shr.u32 %r19, %r18, 23;
118
+ add.s32 %r20, %r15, %r19;
119
+ and.b32 %r21, %r20, 16776704;
120
+ sub.s32 %r22, %r15, %r21;
121
+ add.s32 %r23, %r16, %r19;
122
+ and.b32 %r24, %r23, 16776704;
123
+ sub.s32 %r25, %r16, %r24;
124
+ .loc 1 35 44
125
+ shl.b32 %r26, %r22, 8;
126
+ shl.b32 %r27, %r25, 8;
127
+ .loc 1 37 22
128
+ add.s64 %rd57, %rd49, 50257;
129
+ .loc 1 38 22
130
+ setp.lt.s64 %p18, %rd17, 0;
131
+ setp.lt.s64 %p19, %rd33, 0;
132
+ setp.lt.s64 %p20, %rd49, 0;
133
+ .loc 1 39 36
134
+ selp.b64 %rd1, %rd57, %rd49, %p20;
135
+ .loc 1 41 44
136
+ shl.b64 %rd58, %rd17, 8;
137
+ add.s64 %rd59, %rd58, 12865792;
138
+ selp.b64 %rd60, %rd59, %rd58, %p18;
139
+ shl.b64 %rd61, %rd33, 8;
140
+ add.s64 %rd62, %rd61, 12865792;
141
+ selp.b64 %rd63, %rd62, %rd61, %p19;
142
+ .loc 1 31 36
143
+ and.b32 %r28, %r12, 7;
144
+ mul.wide.u32 %rd2, %r28, 32;
145
+ shl.b64 %rd64, %rd63, 2;
146
+ or.b64 %rd65, %rd2, %rd64;
147
+ add.s64 %rd3, %rd53, %rd65;
148
+ shl.b64 %rd66, %rd60, 2;
149
+ or.b64 %rd67, %rd2, %rd66;
150
+ add.s64 %rd4, %rd53, %rd67;
151
+ or.b32 %r29, %r27, %r3;
152
+ mul.wide.s32 %rd68, %r29, 4;
153
+ add.s64 %rd5, %rd54, %rd68;
154
+ or.b32 %r30, %r26, %r3;
155
+ mul.wide.s32 %rd69, %r30, 4;
156
+ add.s64 %rd6, %rd54, %rd69;
157
+ shl.b32 %r31, %r10, 14;
158
+ shl.b32 %r32, %r1, 8;
159
+ or.b32 %r33, %r31, %r32;
160
+ or.b32 %r5, %r33, %r3;
161
+ mov.f32 %f614, 0f00000000;
162
+ mov.u64 %rd116, 0;
163
+ mov.b32 %r472, -64;
164
+ mov.f32 %f615, %f614;
165
+ mov.f32 %f616, %f614;
166
+ mov.f32 %f617, %f614;
167
+ mov.f32 %f618, %f614;
168
+ mov.f32 %f619, %f614;
169
+ mov.f32 %f620, %f614;
170
+ mov.f32 %f621, %f614;
171
+ mov.f32 %f622, %f614;
172
+ mov.f32 %f623, %f614;
173
+ mov.f32 %f624, %f614;
174
+ mov.f32 %f625, %f614;
175
+ mov.f32 %f626, %f614;
176
+ mov.f32 %f627, %f614;
177
+ mov.f32 %f628, %f614;
178
+ mov.f32 %f629, %f614;
179
+ mov.f32 %f630, %f614;
180
+ mov.f32 %f631, %f614;
181
+ mov.f32 %f632, %f614;
182
+ mov.f32 %f633, %f614;
183
+ mov.f32 %f634, %f614;
184
+ mov.f32 %f635, %f614;
185
+ mov.f32 %f636, %f614;
186
+ mov.f32 %f637, %f614;
187
+ mov.f32 %f638, %f614;
188
+ mov.f32 %f639, %f614;
189
+ mov.f32 %f640, %f614;
190
+ mov.f32 %f641, %f614;
191
+ mov.f32 %f642, %f614;
192
+ mov.f32 %f643, %f614;
193
+ mov.f32 %f644, %f614;
194
+ mov.f32 %f645, %f614;
195
+ mov.f32 %f646, %f614;
196
+ mov.f32 %f647, %f614;
197
+ mov.f32 %f648, %f614;
198
+ mov.f32 %f649, %f614;
199
+ mov.f32 %f650, %f614;
200
+ mov.f32 %f651, %f614;
201
+ mov.f32 %f652, %f614;
202
+ mov.f32 %f653, %f614;
203
+ mov.f32 %f654, %f614;
204
+ mov.f32 %f655, %f614;
205
+ mov.f32 %f656, %f614;
206
+ mov.f32 %f657, %f614;
207
+ mov.f32 %f658, %f614;
208
+ mov.f32 %f659, %f614;
209
+ mov.f32 %f660, %f614;
210
+ mov.f32 %f661, %f614;
211
+ mov.f32 %f662, %f614;
212
+ mov.f32 %f663, %f614;
213
+ mov.f32 %f664, %f614;
214
+ mov.f32 %f665, %f614;
215
+ mov.f32 %f666, %f614;
216
+ mov.f32 %f667, %f614;
217
+ mov.f32 %f668, %f614;
218
+ mov.f32 %f669, %f614;
219
+ mov.f32 %f670, %f614;
220
+ mov.f32 %f671, %f614;
221
+ mov.f32 %f672, %f614;
222
+ mov.f32 %f673, %f614;
223
+ mov.f32 %f674, %f614;
224
+ mov.f32 %f675, %f614;
225
+ mov.f32 %f676, %f614;
226
+ mov.f32 %f677, %f614;
227
+ bra.uni $L__BB0_1;
228
+ $L__BB0_3:
229
+ .loc 1 41 40
230
+ add.s64 %rd85, %rd4, %rd116;
231
+ .loc 1 41 34
232
+ add.s64 %rd86, %rd85, 16;
233
+ add.s64 %rd87, %rd3, %rd116;
234
+ .loc 1 41 52
235
+ add.s64 %rd88, %rd87, 16;
236
+ mov.u32 %r102, 0x0;
237
+ mov.u32 %r103, 0x0;
238
+ mov.u32 %r104, 0x0;
239
+ mov.u32 %r105, 0x0;
240
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r102, %r103, %r104, %r105 }, [ %rd85 + 0 ];
241
+ @!%p1 mov.u32 %r102, %r411;
242
+ @!%p1 mov.u32 %r103, %r411;
243
+ @!%p1 mov.u32 %r104, %r411;
244
+ @!%p1 mov.u32 %r105, %r411;
245
+ mov.b32 %f206, %r102;
246
+ mov.b32 %f207, %r103;
247
+ mov.b32 %f208, %r104;
248
+ mov.b32 %f209, %r105;
249
+ mov.u32 %r110, 0x0;
250
+ mov.u32 %r111, 0x0;
251
+ mov.u32 %r112, 0x0;
252
+ mov.u32 %r113, 0x0;
253
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r110, %r111, %r112, %r113 }, [ %rd86 + 0 ];
254
+ @!%p1 mov.u32 %r110, %r411;
255
+ @!%p1 mov.u32 %r111, %r411;
256
+ @!%p1 mov.u32 %r112, %r411;
257
+ @!%p1 mov.u32 %r113, %r411;
258
+ mov.b32 %f210, %r110;
259
+ mov.b32 %f211, %r111;
260
+ mov.b32 %f212, %r112;
261
+ mov.b32 %f213, %r113;
262
+ mov.u32 %r118, 0x0;
263
+ mov.u32 %r119, 0x0;
264
+ mov.u32 %r120, 0x0;
265
+ mov.u32 %r121, 0x0;
266
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r118, %r119, %r120, %r121 }, [ %rd87 + 0 ];
267
+ @!%p1 mov.u32 %r118, %r411;
268
+ @!%p1 mov.u32 %r119, %r411;
269
+ @!%p1 mov.u32 %r120, %r411;
270
+ @!%p1 mov.u32 %r121, %r411;
271
+ mov.b32 %f214, %r118;
272
+ mov.b32 %f215, %r119;
273
+ mov.b32 %f216, %r120;
274
+ mov.b32 %f217, %r121;
275
+ mov.u32 %r126, 0x0;
276
+ mov.u32 %r127, 0x0;
277
+ mov.u32 %r128, 0x0;
278
+ mov.u32 %r129, 0x0;
279
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r126, %r127, %r128, %r129 }, [ %rd88 + 0 ];
280
+ @!%p1 mov.u32 %r126, %r411;
281
+ @!%p1 mov.u32 %r127, %r411;
282
+ @!%p1 mov.u32 %r128, %r411;
283
+ @!%p1 mov.u32 %r129, %r411;
284
+ mov.b32 %f218, %r126;
285
+ mov.b32 %f219, %r127;
286
+ mov.b32 %f220, %r128;
287
+ mov.b32 %f221, %r129;
288
+ .loc 1 42 22
289
+ add.f32 %f222, %f65, %f206;
290
+ add.f32 %f223, %f66, %f207;
291
+ add.f32 %f224, %f67, %f208;
292
+ add.f32 %f225, %f68, %f209;
293
+ add.f32 %f226, %f69, %f210;
294
+ add.f32 %f227, %f70, %f211;
295
+ add.f32 %f228, %f71, %f212;
296
+ add.f32 %f229, %f72, %f213;
297
+ add.f32 %f230, %f73, %f214;
298
+ add.f32 %f231, %f74, %f215;
299
+ add.f32 %f232, %f75, %f216;
300
+ add.f32 %f233, %f76, %f217;
301
+ add.f32 %f234, %f77, %f218;
302
+ add.f32 %f235, %f78, %f219;
303
+ add.f32 %f236, %f79, %f220;
304
+ add.f32 %f237, %f80, %f221;
305
+ .loc 1 44 22
306
+ add.f32 %f238, %f81, %f222;
307
+ add.f32 %f239, %f82, %f223;
308
+ add.f32 %f240, %f83, %f224;
309
+ add.f32 %f241, %f84, %f225;
310
+ add.f32 %f242, %f85, %f226;
311
+ add.f32 %f243, %f86, %f227;
312
+ add.f32 %f244, %f87, %f228;
313
+ add.f32 %f245, %f88, %f229;
314
+ add.f32 %f246, %f89, %f230;
315
+ add.f32 %f247, %f90, %f231;
316
+ add.f32 %f248, %f91, %f232;
317
+ add.f32 %f249, %f92, %f233;
318
+ add.f32 %f250, %f93, %f234;
319
+ add.f32 %f251, %f94, %f235;
320
+ add.f32 %f252, %f95, %f236;
321
+ add.f32 %f253, %f96, %f237;
322
+ $L__tmp1:
323
+ .loc 2 96 20
324
+ sub.f32 %f254, %f238, %f662;
325
+ sub.f32 %f255, %f239, %f663;
326
+ sub.f32 %f256, %f240, %f664;
327
+ sub.f32 %f257, %f241, %f665;
328
+ sub.f32 %f258, %f242, %f666;
329
+ sub.f32 %f259, %f243, %f667;
330
+ sub.f32 %f260, %f244, %f668;
331
+ sub.f32 %f261, %f245, %f669;
332
+ sub.f32 %f262, %f246, %f670;
333
+ sub.f32 %f263, %f247, %f671;
334
+ sub.f32 %f264, %f248, %f672;
335
+ sub.f32 %f265, %f249, %f673;
336
+ sub.f32 %f266, %f250, %f674;
337
+ sub.f32 %f267, %f251, %f675;
338
+ sub.f32 %f268, %f252, %f676;
339
+ sub.f32 %f269, %f253, %f677;
340
+ .loc 2 97 26
341
+ add.f32 %f614, %f614, 0f3F800000;
342
+ add.f32 %f615, %f615, 0f3F800000;
343
+ add.f32 %f616, %f616, 0f3F800000;
344
+ add.f32 %f617, %f617, 0f3F800000;
345
+ add.f32 %f618, %f618, 0f3F800000;
346
+ add.f32 %f619, %f619, 0f3F800000;
347
+ add.f32 %f620, %f620, 0f3F800000;
348
+ add.f32 %f621, %f621, 0f3F800000;
349
+ add.f32 %f622, %f622, 0f3F800000;
350
+ add.f32 %f623, %f623, 0f3F800000;
351
+ add.f32 %f624, %f624, 0f3F800000;
352
+ add.f32 %f625, %f625, 0f3F800000;
353
+ add.f32 %f626, %f626, 0f3F800000;
354
+ add.f32 %f627, %f627, 0f3F800000;
355
+ add.f32 %f628, %f628, 0f3F800000;
356
+ add.f32 %f629, %f629, 0f3F800000;
357
+ add.f32 %f630, %f630, 0f3F800000;
358
+ add.f32 %f631, %f631, 0f3F800000;
359
+ add.f32 %f632, %f632, 0f3F800000;
360
+ add.f32 %f633, %f633, 0f3F800000;
361
+ add.f32 %f634, %f634, 0f3F800000;
362
+ add.f32 %f635, %f635, 0f3F800000;
363
+ add.f32 %f636, %f636, 0f3F800000;
364
+ add.f32 %f637, %f637, 0f3F800000;
365
+ add.f32 %f638, %f638, 0f3F800000;
366
+ add.f32 %f639, %f639, 0f3F800000;
367
+ add.f32 %f640, %f640, 0f3F800000;
368
+ add.f32 %f641, %f641, 0f3F800000;
369
+ add.f32 %f642, %f642, 0f3F800000;
370
+ add.f32 %f643, %f643, 0f3F800000;
371
+ add.f32 %f644, %f644, 0f3F800000;
372
+ add.f32 %f645, %f645, 0f3F800000;
373
+ .loc 2 98 30
374
+ mov.b32 %r135, %f254;
375
+ mov.b32 %r136, %f614;
376
+ div.full.f32 %r134, %r135, %r136;
377
+ mov.b32 %f270, %r134;
378
+ mov.b32 %r138, %f255;
379
+ mov.b32 %r139, %f615;
380
+ div.full.f32 %r137, %r138, %r139;
381
+ mov.b32 %f271, %r137;
382
+ mov.b32 %r141, %f256;
383
+ mov.b32 %r142, %f616;
384
+ div.full.f32 %r140, %r141, %r142;
385
+ mov.b32 %f272, %r140;
386
+ mov.b32 %r144, %f257;
387
+ mov.b32 %r145, %f617;
388
+ div.full.f32 %r143, %r144, %r145;
389
+ mov.b32 %f273, %r143;
390
+ mov.b32 %r147, %f258;
391
+ mov.b32 %r148, %f618;
392
+ div.full.f32 %r146, %r147, %r148;
393
+ mov.b32 %f274, %r146;
394
+ mov.b32 %r150, %f259;
395
+ mov.b32 %r151, %f619;
396
+ div.full.f32 %r149, %r150, %r151;
397
+ mov.b32 %f275, %r149;
398
+ mov.b32 %r153, %f260;
399
+ mov.b32 %r154, %f620;
400
+ div.full.f32 %r152, %r153, %r154;
401
+ mov.b32 %f276, %r152;
402
+ mov.b32 %r156, %f261;
403
+ mov.b32 %r157, %f621;
404
+ div.full.f32 %r155, %r156, %r157;
405
+ mov.b32 %f277, %r155;
406
+ mov.b32 %r159, %f262;
407
+ mov.b32 %r160, %f622;
408
+ div.full.f32 %r158, %r159, %r160;
409
+ mov.b32 %f278, %r158;
410
+ mov.b32 %r162, %f263;
411
+ mov.b32 %r163, %f623;
412
+ div.full.f32 %r161, %r162, %r163;
413
+ mov.b32 %f279, %r161;
414
+ mov.b32 %r165, %f264;
415
+ mov.b32 %r166, %f624;
416
+ div.full.f32 %r164, %r165, %r166;
417
+ mov.b32 %f280, %r164;
418
+ mov.b32 %r168, %f265;
419
+ mov.b32 %r169, %f625;
420
+ div.full.f32 %r167, %r168, %r169;
421
+ mov.b32 %f281, %r167;
422
+ mov.b32 %r171, %f266;
423
+ mov.b32 %r172, %f626;
424
+ div.full.f32 %r170, %r171, %r172;
425
+ mov.b32 %f282, %r170;
426
+ mov.b32 %r174, %f267;
427
+ mov.b32 %r175, %f627;
428
+ div.full.f32 %r173, %r174, %r175;
429
+ mov.b32 %f283, %r173;
430
+ mov.b32 %r177, %f268;
431
+ mov.b32 %r178, %f628;
432
+ div.full.f32 %r176, %r177, %r178;
433
+ mov.b32 %f284, %r176;
434
+ mov.b32 %r180, %f269;
435
+ mov.b32 %r181, %f629;
436
+ div.full.f32 %r179, %r180, %r181;
437
+ mov.b32 %f285, %r179;
438
+ .loc 2 98 22
439
+ add.f32 %f662, %f662, %f270;
440
+ add.f32 %f663, %f663, %f271;
441
+ add.f32 %f664, %f664, %f272;
442
+ add.f32 %f665, %f665, %f273;
443
+ add.f32 %f666, %f666, %f274;
444
+ add.f32 %f667, %f667, %f275;
445
+ add.f32 %f668, %f668, %f276;
446
+ add.f32 %f669, %f669, %f277;
447
+ add.f32 %f670, %f670, %f278;
448
+ add.f32 %f671, %f671, %f279;
449
+ add.f32 %f672, %f672, %f280;
450
+ add.f32 %f673, %f673, %f281;
451
+ add.f32 %f674, %f674, %f282;
452
+ add.f32 %f675, %f675, %f283;
453
+ add.f32 %f676, %f676, %f284;
454
+ add.f32 %f677, %f677, %f285;
455
+ .loc 2 101 30
456
+ sub.f32 %f286, %f238, %f662;
457
+ sub.f32 %f287, %f239, %f663;
458
+ sub.f32 %f288, %f240, %f664;
459
+ sub.f32 %f289, %f241, %f665;
460
+ sub.f32 %f290, %f242, %f666;
461
+ sub.f32 %f291, %f243, %f667;
462
+ sub.f32 %f292, %f244, %f668;
463
+ sub.f32 %f293, %f245, %f669;
464
+ sub.f32 %f294, %f246, %f670;
465
+ sub.f32 %f295, %f247, %f671;
466
+ sub.f32 %f296, %f248, %f672;
467
+ sub.f32 %f297, %f249, %f673;
468
+ sub.f32 %f298, %f250, %f674;
469
+ sub.f32 %f299, %f251, %f675;
470
+ sub.f32 %f300, %f252, %f676;
471
+ sub.f32 %f301, %f253, %f677;
472
+ $L__tmp2:
473
+ .loc 1 50 50
474
+ fma.rn.f32 %f646, %f254, %f286, %f646;
475
+ fma.rn.f32 %f647, %f255, %f287, %f647;
476
+ fma.rn.f32 %f648, %f256, %f288, %f648;
477
+ fma.rn.f32 %f649, %f257, %f289, %f649;
478
+ fma.rn.f32 %f650, %f258, %f290, %f650;
479
+ fma.rn.f32 %f651, %f259, %f291, %f651;
480
+ fma.rn.f32 %f652, %f260, %f292, %f652;
481
+ fma.rn.f32 %f653, %f261, %f293, %f653;
482
+ fma.rn.f32 %f654, %f262, %f294, %f654;
483
+ fma.rn.f32 %f655, %f263, %f295, %f655;
484
+ fma.rn.f32 %f656, %f264, %f296, %f656;
485
+ fma.rn.f32 %f657, %f265, %f297, %f657;
486
+ fma.rn.f32 %f658, %f266, %f298, %f658;
487
+ fma.rn.f32 %f659, %f267, %f299, %f659;
488
+ fma.rn.f32 %f660, %f268, %f300, %f660;
489
+ fma.rn.f32 %f661, %f269, %f301, %f661;
490
+ .loc 1 31 36
491
+ add.s64 %rd116, %rd116, 256;
492
+ add.s32 %r472, %r472, 64;
493
+ setp.lt.u32 %p72, %r472, 192;
494
+ @%p72 bra $L__BB0_1;
495
+ bra.uni $L__BB0_4;
496
+ $L__BB0_1:
497
+ .loc 1 40 40
498
+ setp.lt.u64 %p51, %rd1, 50257;
499
+ .loc 1 35 34
500
+ add.s64 %rd70, %rd6, %rd116;
501
+ add.s64 %rd71, %rd70, 16;
502
+ add.s64 %rd72, %rd5, %rd116;
503
+ .loc 1 35 50
504
+ add.s64 %rd73, %rd72, 16;
505
+ mov.b32 %r411, 0;
506
+ mov.u32 %r34, 0x0;
507
+ mov.u32 %r35, 0x0;
508
+ mov.u32 %r36, 0x0;
509
+ mov.u32 %r37, 0x0;
510
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r34, %r35, %r36, %r37 }, [ %rd70 + 0 ];
511
+ @!%p1 mov.u32 %r34, %r411;
512
+ @!%p1 mov.u32 %r35, %r411;
513
+ @!%p1 mov.u32 %r36, %r411;
514
+ @!%p1 mov.u32 %r37, %r411;
515
+ mov.b32 %f65, %r34;
516
+ mov.b32 %f66, %r35;
517
+ mov.b32 %f67, %r36;
518
+ mov.b32 %f68, %r37;
519
+ mov.u32 %r42, 0x0;
520
+ mov.u32 %r43, 0x0;
521
+ mov.u32 %r44, 0x0;
522
+ mov.u32 %r45, 0x0;
523
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r42, %r43, %r44, %r45 }, [ %rd71 + 0 ];
524
+ @!%p1 mov.u32 %r42, %r411;
525
+ @!%p1 mov.u32 %r43, %r411;
526
+ @!%p1 mov.u32 %r44, %r411;
527
+ @!%p1 mov.u32 %r45, %r411;
528
+ mov.b32 %f69, %r42;
529
+ mov.b32 %f70, %r43;
530
+ mov.b32 %f71, %r44;
531
+ mov.b32 %f72, %r45;
532
+ mov.u32 %r50, 0x0;
533
+ mov.u32 %r51, 0x0;
534
+ mov.u32 %r52, 0x0;
535
+ mov.u32 %r53, 0x0;
536
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r50, %r51, %r52, %r53 }, [ %rd72 + 0 ];
537
+ @!%p1 mov.u32 %r50, %r411;
538
+ @!%p1 mov.u32 %r51, %r411;
539
+ @!%p1 mov.u32 %r52, %r411;
540
+ @!%p1 mov.u32 %r53, %r411;
541
+ mov.b32 %f73, %r50;
542
+ mov.b32 %f74, %r51;
543
+ mov.b32 %f75, %r52;
544
+ mov.b32 %f76, %r53;
545
+ mov.u32 %r58, 0x0;
546
+ mov.u32 %r59, 0x0;
547
+ mov.u32 %r60, 0x0;
548
+ mov.u32 %r61, 0x0;
549
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r58, %r59, %r60, %r61 }, [ %rd73 + 0 ];
550
+ @!%p1 mov.u32 %r58, %r411;
551
+ @!%p1 mov.u32 %r59, %r411;
552
+ @!%p1 mov.u32 %r60, %r411;
553
+ @!%p1 mov.u32 %r61, %r411;
554
+ mov.b32 %f77, %r58;
555
+ mov.b32 %f78, %r59;
556
+ mov.b32 %f79, %r60;
557
+ mov.b32 %f80, %r61;
558
+ .loc 1 36 40
559
+ add.s32 %r98, %r5, %r472;
560
+ add.s32 %r99, %r98, 64;
561
+ .loc 1 36 34
562
+ add.s32 %r100, %r98, 8256;
563
+ mul.wide.s32 %rd76, %r99, 2;
564
+ add.s64 %rd74, %rd14, %rd76;
565
+ mul.wide.s32 %rd77, %r100, 2;
566
+ add.s64 %rd75, %rd14, %rd77;
567
+ .loc 1 36 50
568
+ mov.u32 %r66, 0x0;
569
+ mov.u32 %r67, 0x0;
570
+ mov.u32 %r68, 0x0;
571
+ mov.u32 %r69, 0x0;
572
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r66, %r67, %r68, %r69 }, [ %rd74 + 0 ];
573
+ @!%p1 mov.u32 %r66, %r411;
574
+ @!%p1 mov.u32 %r67, %r411;
575
+ @!%p1 mov.u32 %r68, %r411;
576
+ @!%p1 mov.u32 %r69, %r411;
577
+ cvt.u16.u32 %rs1, %r66;
578
+ { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r66; }
579
+ cvt.u16.u32 %rs3, %r67;
580
+ { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r67; }
581
+ cvt.u16.u32 %rs5, %r68;
582
+ { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r68; }
583
+ cvt.u16.u32 %rs7, %r69;
584
+ { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r69; }
585
+ mov.u32 %r74, 0x0;
586
+ mov.u32 %r75, 0x0;
587
+ mov.u32 %r76, 0x0;
588
+ mov.u32 %r77, 0x0;
589
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r74, %r75, %r76, %r77 }, [ %rd75 + 0 ];
590
+ @!%p1 mov.u32 %r74, %r411;
591
+ @!%p1 mov.u32 %r75, %r411;
592
+ @!%p1 mov.u32 %r76, %r411;
593
+ @!%p1 mov.u32 %r77, %r411;
594
+ cvt.u16.u32 %rs9, %r74;
595
+ { .reg .b16 tmp; mov.b32 {tmp, %rs10}, %r74; }
596
+ cvt.u16.u32 %rs11, %r75;
597
+ { .reg .b16 tmp; mov.b32 {tmp, %rs12}, %r75; }
598
+ cvt.u16.u32 %rs13, %r76;
599
+ { .reg .b16 tmp; mov.b32 {tmp, %rs14}, %r76; }
600
+ cvt.u16.u32 %rs15, %r77;
601
+ { .reg .b16 tmp; mov.b32 {tmp, %rs16}, %r77; }
602
+ .loc 1 36 101
603
+ cvt.f32.bf16 %r82, %rs1;
604
+ mov.b32 %f81, %r82;
605
+ cvt.f32.bf16 %r83, %rs2;
606
+ mov.b32 %f82, %r83;
607
+ cvt.f32.bf16 %r84, %rs3;
608
+ mov.b32 %f83, %r84;
609
+ cvt.f32.bf16 %r85, %rs4;
610
+ mov.b32 %f84, %r85;
611
+ cvt.f32.bf16 %r86, %rs5;
612
+ mov.b32 %f85, %r86;
613
+ cvt.f32.bf16 %r87, %rs6;
614
+ mov.b32 %f86, %r87;
615
+ cvt.f32.bf16 %r88, %rs7;
616
+ mov.b32 %f87, %r88;
617
+ cvt.f32.bf16 %r89, %rs8;
618
+ mov.b32 %f88, %r89;
619
+ cvt.f32.bf16 %r90, %rs9;
620
+ mov.b32 %f89, %r90;
621
+ cvt.f32.bf16 %r91, %rs10;
622
+ mov.b32 %f90, %r91;
623
+ cvt.f32.bf16 %r92, %rs11;
624
+ mov.b32 %f91, %r92;
625
+ cvt.f32.bf16 %r93, %rs12;
626
+ mov.b32 %f92, %r93;
627
+ cvt.f32.bf16 %r94, %rs13;
628
+ mov.b32 %f93, %r94;
629
+ cvt.f32.bf16 %r95, %rs14;
630
+ mov.b32 %f94, %r95;
631
+ cvt.f32.bf16 %r96, %rs15;
632
+ mov.b32 %f95, %r96;
633
+ cvt.f32.bf16 %r97, %rs16;
634
+ mov.b32 %f96, %r97;
635
+ mov.b32 %r471, 883;
636
+ mov.u64 %rd115, 1;
637
+ .loc 1 40 55
638
+ @%p51 bra $L__BB0_3;
639
+ mov.u64 %rd78, assertMessage_0;
640
+ cvta.global.u64 %rd79, %rd78;
641
+ mov.u64 %rd80, assertFile_0;
642
+ cvta.global.u64 %rd81, %rd80;
643
+ mov.u64 %rd82, assertFunc_0;
644
+ cvta.global.u64 %rd83, %rd82;
645
+ { // callseq 6, 0
646
+ .reg .b32 temp_param_reg;
647
+ .param .b64 param0;
648
+ st.param.b64 [param0+0], %rd79;
649
+ .param .b64 param1;
650
+ st.param.b64 [param1+0], %rd81;
651
+ .param .b32 param2;
652
+ st.param.b32 [param2+0], %r471;
653
+ .param .b64 param3;
654
+ st.param.b64 [param3+0], %rd83;
655
+ .param .b64 param4;
656
+ st.param.b64 [param4+0], %rd115;
657
+ call.uni
658
+ __assertfail,
659
+ (
660
+ param0,
661
+ param1,
662
+ param2,
663
+ param3,
664
+ param4
665
+ );
666
+ } // callseq 6
667
+ bra.uni $L__BB0_3;
668
+ $L__BB0_4:
669
+ .loc 1 31 36
670
+ and.b32 %r291, %r4, 3;
671
+ mad.lo.s32 %r292, %r291, 72, %r2;
672
+ shl.b32 %r293, %r292, 2;
673
+ mov.u32 %r294, global_smem;
674
+ add.s32 %r295, %r294, %r293;
675
+ st.shared.f32 [%r295], %f630;
676
+ st.shared.f32 [%r295+1152], %f631;
677
+ st.shared.f32 [%r295+2304], %f632;
678
+ st.shared.f32 [%r295+3456], %f633;
679
+ st.shared.f32 [%r295+4608], %f634;
680
+ st.shared.f32 [%r295+5760], %f635;
681
+ st.shared.f32 [%r295+6912], %f636;
682
+ st.shared.f32 [%r295+8064], %f637;
683
+ bar.sync 0;
684
+ mad.lo.s32 %r296, %r1, 72, %r3;
685
+ shl.b32 %r297, %r296, 2;
686
+ add.s32 %r298, %r294, %r297;
687
+ ld.shared.v4.f32 {%f302, %f303, %f304, %f305}, [%r298];
688
+ ld.shared.v4.f32 {%f306, %f307, %f308, %f309}, [%r298+16];
689
+ bar.sync 0;
690
+ st.shared.f32 [%r295], %f638;
691
+ st.shared.f32 [%r295+1152], %f639;
692
+ st.shared.f32 [%r295+2304], %f640;
693
+ st.shared.f32 [%r295+3456], %f641;
694
+ st.shared.f32 [%r295+4608], %f642;
695
+ st.shared.f32 [%r295+5760], %f643;
696
+ st.shared.f32 [%r295+6912], %f644;
697
+ st.shared.f32 [%r295+8064], %f645;
698
+ bar.sync 0;
699
+ ld.shared.v4.f32 {%f310, %f311, %f312, %f313}, [%r298];
700
+ ld.shared.v4.f32 {%f314, %f315, %f316, %f317}, [%r298+16];
701
+ $L__tmp3:
702
+ .loc 2 108 21
703
+ sub.f32 %f318, %f663, %f662;
704
+ .loc 2 109 28
705
+ add.f32 %f319, %f302, %f303;
706
+ .loc 2 110 39
707
+ setp.eq.f32 %p73, %f319, 0f00000000;
708
+ .loc 2 110 60
709
+ mov.b32 %r183, %f303;
710
+ mov.b32 %r184, %f319;
711
+ div.full.f32 %r182, %r183, %r184;
712
+ mov.b32 %f320, %r182;
713
+ .loc 2 110 49
714
+ selp.f32 %f321, 0f00000000, %f320, %p73;
715
+ .loc 2 112 17
716
+ fma.rn.f32 %f322, %f318, %f321, %f662;
717
+ .loc 2 113 15
718
+ add.f32 %f323, %f646, %f647;
719
+ .loc 2 113 30
720
+ mul.f32 %f324, %f318, %f318;
721
+ .loc 2 113 38
722
+ mul.f32 %f325, %f324, %f302;
723
+ .loc 2 113 22
724
+ fma.rn.f32 %f326, %f325, %f321, %f323;
725
+ .loc 2 108 21
726
+ sub.f32 %f327, %f664, %f322;
727
+ .loc 2 109 28
728
+ add.f32 %f328, %f304, %f319;
729
+ .loc 2 110 39
730
+ setp.eq.f32 %p74, %f328, 0f00000000;
731
+ .loc 2 110 60
732
+ mov.b32 %r187, %f328;
733
+ mov.b32 %r186, %f304;
734
+ div.full.f32 %r185, %r186, %r187;
735
+ mov.b32 %f329, %r185;
736
+ .loc 2 110 49
737
+ selp.f32 %f330, 0f00000000, %f329, %p74;
738
+ .loc 2 112 17
739
+ fma.rn.f32 %f331, %f330, %f327, %f322;
740
+ .loc 2 113 15
741
+ add.f32 %f332, %f648, %f326;
742
+ .loc 2 113 30
743
+ mul.f32 %f333, %f327, %f327;
744
+ .loc 2 113 38
745
+ mul.f32 %f334, %f319, %f333;
746
+ .loc 2 113 22
747
+ fma.rn.f32 %f335, %f330, %f334, %f332;
748
+ .loc 2 108 21
749
+ sub.f32 %f336, %f665, %f331;
750
+ .loc 2 109 28
751
+ add.f32 %f337, %f305, %f328;
752
+ .loc 2 110 39
753
+ setp.eq.f32 %p75, %f337, 0f00000000;
754
+ .loc 2 110 60
755
+ mov.b32 %r190, %f337;
756
+ mov.b32 %r189, %f305;
757
+ div.full.f32 %r188, %r189, %r190;
758
+ mov.b32 %f338, %r188;
759
+ .loc 2 110 49
760
+ selp.f32 %f339, 0f00000000, %f338, %p75;
761
+ .loc 2 112 17
762
+ fma.rn.f32 %f340, %f339, %f336, %f331;
763
+ .loc 2 113 15
764
+ add.f32 %f341, %f649, %f335;
765
+ .loc 2 113 30
766
+ mul.f32 %f342, %f336, %f336;
767
+ .loc 2 113 38
768
+ mul.f32 %f343, %f328, %f342;
769
+ .loc 2 113 22
770
+ fma.rn.f32 %f344, %f339, %f343, %f341;
771
+ .loc 2 108 21
772
+ sub.f32 %f345, %f666, %f340;
773
+ .loc 2 109 28
774
+ add.f32 %f346, %f306, %f337;
775
+ .loc 2 110 39
776
+ setp.eq.f32 %p76, %f346, 0f00000000;
777
+ .loc 2 110 60
778
+ mov.b32 %r193, %f346;
779
+ mov.b32 %r192, %f306;
780
+ div.full.f32 %r191, %r192, %r193;
781
+ mov.b32 %f347, %r191;
782
+ .loc 2 110 49
783
+ selp.f32 %f348, 0f00000000, %f347, %p76;
784
+ .loc 2 112 17
785
+ fma.rn.f32 %f349, %f348, %f345, %f340;
786
+ .loc 2 113 15
787
+ add.f32 %f350, %f650, %f344;
788
+ .loc 2 113 30
789
+ mul.f32 %f351, %f345, %f345;
790
+ .loc 2 113 38
791
+ mul.f32 %f352, %f337, %f351;
792
+ .loc 2 113 22
793
+ fma.rn.f32 %f353, %f348, %f352, %f350;
794
+ .loc 2 108 21
795
+ sub.f32 %f354, %f667, %f349;
796
+ .loc 2 109 28
797
+ add.f32 %f355, %f307, %f346;
798
+ .loc 2 110 39
799
+ setp.eq.f32 %p77, %f355, 0f00000000;
800
+ .loc 2 110 60
801
+ mov.b32 %r196, %f355;
802
+ mov.b32 %r195, %f307;
803
+ div.full.f32 %r194, %r195, %r196;
804
+ mov.b32 %f356, %r194;
805
+ .loc 2 110 49
806
+ selp.f32 %f357, 0f00000000, %f356, %p77;
807
+ .loc 2 112 17
808
+ fma.rn.f32 %f358, %f357, %f354, %f349;
809
+ .loc 2 113 15
810
+ add.f32 %f359, %f651, %f353;
811
+ .loc 2 113 30
812
+ mul.f32 %f360, %f354, %f354;
813
+ .loc 2 113 38
814
+ mul.f32 %f361, %f346, %f360;
815
+ .loc 2 113 22
816
+ fma.rn.f32 %f362, %f357, %f361, %f359;
817
+ .loc 2 108 21
818
+ sub.f32 %f363, %f668, %f358;
819
+ .loc 2 109 28
820
+ add.f32 %f364, %f308, %f355;
821
+ .loc 2 110 39
822
+ setp.eq.f32 %p78, %f364, 0f00000000;
823
+ .loc 2 110 60
824
+ mov.b32 %r199, %f364;
825
+ mov.b32 %r198, %f308;
826
+ div.full.f32 %r197, %r198, %r199;
827
+ mov.b32 %f365, %r197;
828
+ .loc 2 110 49
829
+ selp.f32 %f366, 0f00000000, %f365, %p78;
830
+ .loc 2 112 17
831
+ fma.rn.f32 %f367, %f366, %f363, %f358;
832
+ .loc 2 113 15
833
+ add.f32 %f368, %f652, %f362;
834
+ .loc 2 113 30
835
+ mul.f32 %f369, %f363, %f363;
836
+ .loc 2 113 38
837
+ mul.f32 %f370, %f355, %f369;
838
+ .loc 2 113 22
839
+ fma.rn.f32 %f371, %f366, %f370, %f368;
840
+ .loc 2 108 21
841
+ sub.f32 %f372, %f669, %f367;
842
+ .loc 2 109 28
843
+ add.f32 %f373, %f309, %f364;
844
+ .loc 2 110 39
845
+ setp.eq.f32 %p79, %f373, 0f00000000;
846
+ .loc 2 110 60
847
+ mov.b32 %r202, %f373;
848
+ mov.b32 %r201, %f309;
849
+ div.full.f32 %r200, %r201, %r202;
850
+ mov.b32 %f374, %r200;
851
+ .loc 2 110 49
852
+ selp.f32 %f375, 0f00000000, %f374, %p79;
853
+ .loc 2 112 17
854
+ fma.rn.f32 %f376, %f375, %f372, %f367;
855
+ .loc 2 113 15
856
+ add.f32 %f377, %f653, %f371;
857
+ .loc 2 113 30
858
+ mul.f32 %f378, %f372, %f372;
859
+ .loc 2 113 38
860
+ mul.f32 %f379, %f364, %f378;
861
+ .loc 2 113 22
862
+ fma.rn.f32 %f380, %f375, %f379, %f377;
863
+ .loc 2 108 21
864
+ sub.f32 %f381, %f671, %f670;
865
+ .loc 2 109 28
866
+ add.f32 %f382, %f310, %f311;
867
+ .loc 2 110 39
868
+ setp.eq.f32 %p80, %f382, 0f00000000;
869
+ .loc 2 110 60
870
+ mov.b32 %r204, %f311;
871
+ mov.b32 %r205, %f382;
872
+ div.full.f32 %r203, %r204, %r205;
873
+ mov.b32 %f383, %r203;
874
+ .loc 2 110 49
875
+ selp.f32 %f384, 0f00000000, %f383, %p80;
876
+ .loc 2 112 17
877
+ fma.rn.f32 %f385, %f381, %f384, %f670;
878
+ .loc 2 113 15
879
+ add.f32 %f386, %f654, %f655;
880
+ .loc 2 113 30
881
+ mul.f32 %f387, %f381, %f381;
882
+ .loc 2 113 38
883
+ mul.f32 %f388, %f387, %f310;
884
+ .loc 2 113 22
885
+ fma.rn.f32 %f389, %f388, %f384, %f386;
886
+ .loc 2 108 21
887
+ sub.f32 %f390, %f672, %f385;
888
+ .loc 2 109 28
889
+ add.f32 %f391, %f312, %f382;
890
+ .loc 2 110 39
891
+ setp.eq.f32 %p81, %f391, 0f00000000;
892
+ .loc 2 110 60
893
+ mov.b32 %r208, %f391;
894
+ mov.b32 %r207, %f312;
895
+ div.full.f32 %r206, %r207, %r208;
896
+ mov.b32 %f392, %r206;
897
+ .loc 2 110 49
898
+ selp.f32 %f393, 0f00000000, %f392, %p81;
899
+ .loc 2 112 17
900
+ fma.rn.f32 %f394, %f393, %f390, %f385;
901
+ .loc 2 113 15
902
+ add.f32 %f395, %f656, %f389;
903
+ .loc 2 113 30
904
+ mul.f32 %f396, %f390, %f390;
905
+ .loc 2 113 38
906
+ mul.f32 %f397, %f382, %f396;
907
+ .loc 2 113 22
908
+ fma.rn.f32 %f398, %f393, %f397, %f395;
909
+ .loc 2 108 21
910
+ sub.f32 %f399, %f673, %f394;
911
+ .loc 2 109 28
912
+ add.f32 %f400, %f313, %f391;
913
+ .loc 2 110 39
914
+ setp.eq.f32 %p82, %f400, 0f00000000;
915
+ .loc 2 110 60
916
+ mov.b32 %r211, %f400;
917
+ mov.b32 %r210, %f313;
918
+ div.full.f32 %r209, %r210, %r211;
919
+ mov.b32 %f401, %r209;
920
+ .loc 2 110 49
921
+ selp.f32 %f402, 0f00000000, %f401, %p82;
922
+ .loc 2 112 17
923
+ fma.rn.f32 %f403, %f402, %f399, %f394;
924
+ .loc 2 113 15
925
+ add.f32 %f404, %f657, %f398;
926
+ .loc 2 113 30
927
+ mul.f32 %f405, %f399, %f399;
928
+ .loc 2 113 38
929
+ mul.f32 %f406, %f391, %f405;
930
+ .loc 2 113 22
931
+ fma.rn.f32 %f407, %f402, %f406, %f404;
932
+ .loc 2 108 21
933
+ sub.f32 %f408, %f674, %f403;
934
+ .loc 2 109 28
935
+ add.f32 %f409, %f314, %f400;
936
+ .loc 2 110 39
937
+ setp.eq.f32 %p83, %f409, 0f00000000;
938
+ .loc 2 110 60
939
+ mov.b32 %r214, %f409;
940
+ mov.b32 %r213, %f314;
941
+ div.full.f32 %r212, %r213, %r214;
942
+ mov.b32 %f410, %r212;
943
+ .loc 2 110 49
944
+ selp.f32 %f411, 0f00000000, %f410, %p83;
945
+ .loc 2 112 17
946
+ fma.rn.f32 %f412, %f411, %f408, %f403;
947
+ .loc 2 113 15
948
+ add.f32 %f413, %f658, %f407;
949
+ .loc 2 113 30
950
+ mul.f32 %f414, %f408, %f408;
951
+ .loc 2 113 38
952
+ mul.f32 %f415, %f400, %f414;
953
+ .loc 2 113 22
954
+ fma.rn.f32 %f416, %f411, %f415, %f413;
955
+ .loc 2 108 21
956
+ sub.f32 %f417, %f675, %f412;
957
+ .loc 2 109 28
958
+ add.f32 %f418, %f315, %f409;
959
+ .loc 2 110 39
960
+ setp.eq.f32 %p84, %f418, 0f00000000;
961
+ .loc 2 110 60
962
+ mov.b32 %r217, %f418;
963
+ mov.b32 %r216, %f315;
964
+ div.full.f32 %r215, %r216, %r217;
965
+ mov.b32 %f419, %r215;
966
+ .loc 2 110 49
967
+ selp.f32 %f420, 0f00000000, %f419, %p84;
968
+ .loc 2 112 17
969
+ fma.rn.f32 %f421, %f420, %f417, %f412;
970
+ .loc 2 113 15
971
+ add.f32 %f422, %f659, %f416;
972
+ .loc 2 113 30
973
+ mul.f32 %f423, %f417, %f417;
974
+ .loc 2 113 38
975
+ mul.f32 %f424, %f409, %f423;
976
+ .loc 2 113 22
977
+ fma.rn.f32 %f425, %f420, %f424, %f422;
978
+ .loc 2 108 21
979
+ sub.f32 %f426, %f676, %f421;
980
+ .loc 2 109 28
981
+ add.f32 %f427, %f316, %f418;
982
+ .loc 2 110 39
983
+ setp.eq.f32 %p85, %f427, 0f00000000;
984
+ .loc 2 110 60
985
+ mov.b32 %r220, %f427;
986
+ mov.b32 %r219, %f316;
987
+ div.full.f32 %r218, %r219, %r220;
988
+ mov.b32 %f428, %r218;
989
+ .loc 2 110 49
990
+ selp.f32 %f429, 0f00000000, %f428, %p85;
991
+ .loc 2 112 17
992
+ fma.rn.f32 %f430, %f429, %f426, %f421;
993
+ .loc 2 113 15
994
+ add.f32 %f431, %f660, %f425;
995
+ .loc 2 113 30
996
+ mul.f32 %f432, %f426, %f426;
997
+ .loc 2 113 38
998
+ mul.f32 %f433, %f418, %f432;
999
+ .loc 2 113 22
1000
+ fma.rn.f32 %f434, %f429, %f433, %f431;
1001
+ .loc 2 108 21
1002
+ sub.f32 %f435, %f677, %f430;
1003
+ .loc 2 109 28
1004
+ add.f32 %f436, %f317, %f427;
1005
+ .loc 2 110 39
1006
+ setp.eq.f32 %p86, %f436, 0f00000000;
1007
+ .loc 2 110 60
1008
+ mov.b32 %r223, %f436;
1009
+ mov.b32 %r222, %f317;
1010
+ div.full.f32 %r221, %r222, %r223;
1011
+ mov.b32 %f437, %r221;
1012
+ .loc 2 110 49
1013
+ selp.f32 %f438, 0f00000000, %f437, %p86;
1014
+ .loc 2 112 17
1015
+ fma.rn.f32 %f439, %f438, %f435, %f430;
1016
+ .loc 2 113 15
1017
+ add.f32 %f440, %f661, %f434;
1018
+ .loc 2 113 30
1019
+ mul.f32 %f441, %f435, %f435;
1020
+ .loc 2 113 38
1021
+ mul.f32 %f442, %f427, %f441;
1022
+ .loc 2 113 22
1023
+ fma.rn.f32 %f443, %f438, %f442, %f440;
1024
+ $L__tmp4:
1025
+ .loc 2 120 46
1026
+ mov.b32 %r299, %f376;
1027
+ shfl.sync.bfly.b32 %r300, %r299, 4, 31, -1;
1028
+ mov.b32 %f444, %r300;
1029
+ mov.b32 %r301, %f380;
1030
+ shfl.sync.bfly.b32 %r302, %r301, 4, 31, -1;
1031
+ mov.b32 %f445, %r302;
1032
+ shfl.sync.bfly.b32 %r225, %r202, 4, 31, -1;
1033
+ mov.b32 %f446, %r225;
1034
+ $L__tmp5:
1035
+ .loc 2 108 21
1036
+ sub.f32 %f447, %f444, %f376;
1037
+ .loc 2 109 28
1038
+ add.f32 %f448, %f373, %f446;
1039
+ .loc 2 110 39
1040
+ setp.eq.f32 %p87, %f448, 0f00000000;
1041
+ .loc 2 110 60
1042
+ mov.b32 %r226, %f448;
1043
+ div.full.f32 %r224, %r225, %r226;
1044
+ mov.b32 %f449, %r224;
1045
+ .loc 2 110 49
1046
+ selp.f32 %f450, 0f00000000, %f449, %p87;
1047
+ .loc 2 112 17
1048
+ fma.rn.f32 %f451, %f450, %f447, %f376;
1049
+ .loc 2 113 15
1050
+ add.f32 %f452, %f380, %f445;
1051
+ .loc 2 113 30
1052
+ mul.f32 %f453, %f447, %f447;
1053
+ .loc 2 113 38
1054
+ mul.f32 %f454, %f373, %f453;
1055
+ .loc 2 113 22
1056
+ fma.rn.f32 %f455, %f450, %f454, %f452;
1057
+ $L__tmp6:
1058
+ .loc 2 120 46
1059
+ mov.b32 %r303, %f451;
1060
+ shfl.sync.bfly.b32 %r304, %r303, 2, 31, -1;
1061
+ mov.b32 %f456, %r304;
1062
+ mov.b32 %r305, %f455;
1063
+ shfl.sync.bfly.b32 %r306, %r305, 2, 31, -1;
1064
+ mov.b32 %f457, %r306;
1065
+ shfl.sync.bfly.b32 %r228, %r226, 2, 31, -1;
1066
+ mov.b32 %f458, %r228;
1067
+ $L__tmp7:
1068
+ .loc 2 108 21
1069
+ sub.f32 %f459, %f456, %f451;
1070
+ .loc 2 109 28
1071
+ add.f32 %f460, %f448, %f458;
1072
+ .loc 2 110 39
1073
+ setp.eq.f32 %p88, %f460, 0f00000000;
1074
+ .loc 2 110 60
1075
+ mov.b32 %r229, %f460;
1076
+ div.full.f32 %r227, %r228, %r229;
1077
+ mov.b32 %f461, %r227;
1078
+ .loc 2 110 49
1079
+ selp.f32 %f462, 0f00000000, %f461, %p88;
1080
+ .loc 2 112 17
1081
+ fma.rn.f32 %f463, %f462, %f459, %f451;
1082
+ .loc 2 113 15
1083
+ add.f32 %f464, %f455, %f457;
1084
+ .loc 2 113 30
1085
+ mul.f32 %f465, %f459, %f459;
1086
+ .loc 2 113 38
1087
+ mul.f32 %f466, %f448, %f465;
1088
+ .loc 2 113 22
1089
+ fma.rn.f32 %f467, %f462, %f466, %f464;
1090
+ $L__tmp8:
1091
+ .loc 2 120 46
1092
+ mov.b32 %r307, %f463;
1093
+ shfl.sync.bfly.b32 %r308, %r307, 1, 31, -1;
1094
+ mov.b32 %f468, %r308;
1095
+ mov.b32 %r309, %f467;
1096
+ shfl.sync.bfly.b32 %r310, %r309, 1, 31, -1;
1097
+ mov.b32 %f469, %r310;
1098
+ shfl.sync.bfly.b32 %r231, %r229, 1, 31, -1;
1099
+ mov.b32 %f470, %r231;
1100
+ $L__tmp9:
1101
+ .loc 2 108 21
1102
+ sub.f32 %f471, %f468, %f463;
1103
+ .loc 2 109 28
1104
+ add.f32 %f472, %f460, %f470;
1105
+ .loc 2 110 39
1106
+ setp.eq.f32 %p89, %f472, 0f00000000;
1107
+ .loc 2 110 60
1108
+ mov.b32 %r232, %f472;
1109
+ div.full.f32 %r230, %r231, %r232;
1110
+ mov.b32 %f473, %r230;
1111
+ .loc 2 110 49
1112
+ selp.f32 %f474, 0f00000000, %f473, %p89;
1113
+ .loc 2 112 17
1114
+ fma.rn.f32 %f161, %f471, %f474, %f463;
1115
+ .loc 2 113 15
1116
+ add.f32 %f475, %f467, %f469;
1117
+ .loc 2 113 30
1118
+ mul.f32 %f476, %f471, %f471;
1119
+ .loc 2 113 38
1120
+ mul.f32 %f477, %f460, %f476;
1121
+ .loc 2 113 22
1122
+ fma.rn.f32 %f478, %f474, %f477, %f475;
1123
+ $L__tmp10:
1124
+ .loc 2 120 46
1125
+ mov.b32 %r311, %f439;
1126
+ shfl.sync.bfly.b32 %r312, %r311, 4, 31, -1;
1127
+ mov.b32 %f479, %r312;
1128
+ mov.b32 %r313, %f443;
1129
+ shfl.sync.bfly.b32 %r314, %r313, 4, 31, -1;
1130
+ mov.b32 %f480, %r314;
1131
+ shfl.sync.bfly.b32 %r234, %r223, 4, 31, -1;
1132
+ mov.b32 %f481, %r234;
1133
+ $L__tmp11:
1134
+ .loc 2 108 21
1135
+ sub.f32 %f482, %f479, %f439;
1136
+ .loc 2 109 28
1137
+ add.f32 %f483, %f436, %f481;
1138
+ .loc 2 110 39
1139
+ setp.eq.f32 %p90, %f483, 0f00000000;
1140
+ .loc 2 110 60
1141
+ mov.b32 %r235, %f483;
1142
+ div.full.f32 %r233, %r234, %r235;
1143
+ mov.b32 %f484, %r233;
1144
+ .loc 2 110 49
1145
+ selp.f32 %f485, 0f00000000, %f484, %p90;
1146
+ .loc 2 112 17
1147
+ fma.rn.f32 %f486, %f482, %f485, %f439;
1148
+ .loc 2 113 15
1149
+ add.f32 %f487, %f443, %f480;
1150
+ .loc 2 113 30
1151
+ mul.f32 %f488, %f482, %f482;
1152
+ .loc 2 113 38
1153
+ mul.f32 %f489, %f436, %f488;
1154
+ .loc 2 113 22
1155
+ fma.rn.f32 %f490, %f489, %f485, %f487;
1156
+ $L__tmp12:
1157
+ .loc 2 120 46
1158
+ mov.b32 %r315, %f486;
1159
+ shfl.sync.bfly.b32 %r316, %r315, 2, 31, -1;
1160
+ mov.b32 %f491, %r316;
1161
+ mov.b32 %r317, %f490;
1162
+ shfl.sync.bfly.b32 %r318, %r317, 2, 31, -1;
1163
+ mov.b32 %f492, %r318;
1164
+ shfl.sync.bfly.b32 %r237, %r235, 2, 31, -1;
1165
+ mov.b32 %f493, %r237;
1166
+ $L__tmp13:
1167
+ .loc 2 108 21
1168
+ sub.f32 %f494, %f491, %f486;
1169
+ .loc 2 109 28
1170
+ add.f32 %f495, %f483, %f493;
1171
+ .loc 2 110 39
1172
+ setp.eq.f32 %p91, %f495, 0f00000000;
1173
+ .loc 2 110 60
1174
+ mov.b32 %r238, %f495;
1175
+ div.full.f32 %r236, %r237, %r238;
1176
+ mov.b32 %f496, %r236;
1177
+ .loc 2 110 49
1178
+ selp.f32 %f497, 0f00000000, %f496, %p91;
1179
+ .loc 2 112 17
1180
+ fma.rn.f32 %f498, %f494, %f497, %f486;
1181
+ .loc 2 113 15
1182
+ add.f32 %f499, %f490, %f492;
1183
+ .loc 2 113 30
1184
+ mul.f32 %f500, %f494, %f494;
1185
+ .loc 2 113 38
1186
+ mul.f32 %f501, %f483, %f500;
1187
+ .loc 2 113 22
1188
+ fma.rn.f32 %f502, %f497, %f501, %f499;
1189
+ $L__tmp14:
1190
+ .loc 2 120 46
1191
+ mov.b32 %r319, %f498;
1192
+ shfl.sync.bfly.b32 %r320, %r319, 1, 31, -1;
1193
+ mov.b32 %f503, %r320;
1194
+ mov.b32 %r321, %f502;
1195
+ shfl.sync.bfly.b32 %r322, %r321, 1, 31, -1;
1196
+ mov.b32 %f504, %r322;
1197
+ shfl.sync.bfly.b32 %r240, %r238, 1, 31, -1;
1198
+ mov.b32 %f505, %r240;
1199
+ $L__tmp15:
1200
+ .loc 2 108 21
1201
+ sub.f32 %f506, %f503, %f498;
1202
+ .loc 2 109 28
1203
+ add.f32 %f507, %f495, %f505;
1204
+ .loc 2 110 39
1205
+ setp.eq.f32 %p92, %f507, 0f00000000;
1206
+ .loc 2 110 60
1207
+ mov.b32 %r241, %f507;
1208
+ div.full.f32 %r239, %r240, %r241;
1209
+ mov.b32 %f508, %r239;
1210
+ .loc 2 110 49
1211
+ selp.f32 %f509, 0f00000000, %f508, %p92;
1212
+ .loc 2 112 17
1213
+ fma.rn.f32 %f162, %f506, %f509, %f498;
1214
+ .loc 2 113 15
1215
+ add.f32 %f510, %f502, %f504;
1216
+ .loc 2 113 30
1217
+ mul.f32 %f511, %f506, %f506;
1218
+ .loc 2 113 38
1219
+ mul.f32 %f512, %f495, %f511;
1220
+ .loc 2 113 22
1221
+ fma.rn.f32 %f513, %f509, %f512, %f510;
1222
+ $L__tmp16:
1223
+ .loc 1 75 24
1224
+ mov.b32 %r243, %f478;
1225
+ mov.b32 %r244, 1132462080;
1226
+ div.full.f32 %r242, %r243, %r244;
1227
+ mov.b32 %f514, %r242;
1228
+ mov.b32 %r267, %f513;
1229
+ div.full.f32 %r266, %r267, %r244;
1230
+ mov.b32 %f515, %r266;
1231
+ .loc 1 77 24
1232
+ add.f32 %f163, %f514, 0f3727C5AC;
1233
+ add.f32 %f164, %f515, 0f3727C5AC;
1234
+ .loc 1 58 36
1235
+ add.s64 %rd9, %rd15, %rd2;
1236
+ mov.u64 %rd117, 0;
1237
+ mov.b32 %r473, -64;
1238
+ rsqrt.approx.ftz.f32 %f580, %f163;
1239
+ rsqrt.approx.ftz.f32 %f581, %f164;
1240
+ bra.uni $L__BB0_5;
1241
+ $L__BB0_7:
1242
+ .loc 1 69 35
1243
+ add.s64 %rd107, %rd4, %rd117;
1244
+ add.s64 %rd108, %rd107, 16;
1245
+ add.s64 %rd109, %rd3, %rd117;
1246
+ .loc 1 69 54
1247
+ add.s64 %rd110, %rd109, 16;
1248
+ mov.u32 %r407, 0x0;
1249
+ mov.u32 %r408, 0x0;
1250
+ mov.u32 %r409, 0x0;
1251
+ mov.u32 %r410, 0x0;
1252
+ @%p1 ld.global.L1::evict_first.v4.b32 { %r407, %r408, %r409, %r410 }, [ %rd107 + 0 ];
1253
+ @!%p1 mov.u32 %r407, %r411;
1254
+ @!%p1 mov.u32 %r408, %r411;
1255
+ @!%p1 mov.u32 %r409, %r411;
1256
+ @!%p1 mov.u32 %r410, %r411;
1257
+ mov.b32 %f516, %r407;
1258
+ mov.b32 %f517, %r408;
1259
+ mov.b32 %f518, %r409;
1260
+ mov.b32 %f519, %r410;
1261
+ mov.u32 %r415, 0x0;
1262
+ mov.u32 %r416, 0x0;
1263
+ mov.u32 %r417, 0x0;
1264
+ mov.u32 %r418, 0x0;
1265
+ @%p1 ld.global.L1::evict_first.v4.b32 { %r415, %r416, %r417, %r418 }, [ %rd108 + 0 ];
1266
+ @!%p1 mov.u32 %r415, %r411;
1267
+ @!%p1 mov.u32 %r416, %r411;
1268
+ @!%p1 mov.u32 %r417, %r411;
1269
+ @!%p1 mov.u32 %r418, %r411;
1270
+ mov.b32 %f520, %r415;
1271
+ mov.b32 %f521, %r416;
1272
+ mov.b32 %f522, %r417;
1273
+ mov.b32 %f523, %r418;
1274
+ mov.u32 %r423, 0x0;
1275
+ mov.u32 %r424, 0x0;
1276
+ mov.u32 %r425, 0x0;
1277
+ mov.u32 %r426, 0x0;
1278
+ @%p1 ld.global.L1::evict_first.v4.b32 { %r423, %r424, %r425, %r426 }, [ %rd109 + 0 ];
1279
+ @!%p1 mov.u32 %r423, %r411;
1280
+ @!%p1 mov.u32 %r424, %r411;
1281
+ @!%p1 mov.u32 %r425, %r411;
1282
+ @!%p1 mov.u32 %r426, %r411;
1283
+ mov.b32 %f524, %r423;
1284
+ mov.b32 %f525, %r424;
1285
+ mov.b32 %f526, %r425;
1286
+ mov.b32 %f527, %r426;
1287
+ mov.u32 %r431, 0x0;
1288
+ mov.u32 %r432, 0x0;
1289
+ mov.u32 %r433, 0x0;
1290
+ mov.u32 %r434, 0x0;
1291
+ @%p1 ld.global.L1::evict_first.v4.b32 { %r431, %r432, %r433, %r434 }, [ %rd110 + 0 ];
1292
+ @!%p1 mov.u32 %r431, %r411;
1293
+ @!%p1 mov.u32 %r432, %r411;
1294
+ @!%p1 mov.u32 %r433, %r411;
1295
+ @!%p1 mov.u32 %r434, %r411;
1296
+ mov.b32 %f528, %r431;
1297
+ mov.b32 %f529, %r432;
1298
+ mov.b32 %f530, %r433;
1299
+ mov.b32 %f531, %r434;
1300
+ .loc 1 70 24
1301
+ add.f32 %f532, %f165, %f516;
1302
+ add.f32 %f533, %f166, %f517;
1303
+ add.f32 %f534, %f167, %f518;
1304
+ add.f32 %f535, %f168, %f519;
1305
+ add.f32 %f536, %f169, %f520;
1306
+ add.f32 %f537, %f170, %f521;
1307
+ add.f32 %f538, %f171, %f522;
1308
+ add.f32 %f539, %f172, %f523;
1309
+ add.f32 %f540, %f173, %f524;
1310
+ add.f32 %f541, %f174, %f525;
1311
+ add.f32 %f542, %f175, %f526;
1312
+ add.f32 %f543, %f176, %f527;
1313
+ add.f32 %f544, %f177, %f528;
1314
+ add.f32 %f545, %f178, %f529;
1315
+ add.f32 %f546, %f179, %f530;
1316
+ add.f32 %f547, %f180, %f531;
1317
+ .loc 1 72 24
1318
+ add.f32 %f548, %f181, %f532;
1319
+ add.f32 %f549, %f182, %f533;
1320
+ add.f32 %f550, %f183, %f534;
1321
+ add.f32 %f551, %f184, %f535;
1322
+ add.f32 %f552, %f185, %f536;
1323
+ add.f32 %f553, %f186, %f537;
1324
+ add.f32 %f554, %f187, %f538;
1325
+ add.f32 %f555, %f188, %f539;
1326
+ add.f32 %f556, %f189, %f540;
1327
+ add.f32 %f557, %f190, %f541;
1328
+ add.f32 %f558, %f191, %f542;
1329
+ add.f32 %f559, %f192, %f543;
1330
+ add.f32 %f560, %f193, %f544;
1331
+ add.f32 %f561, %f194, %f545;
1332
+ add.f32 %f562, %f195, %f546;
1333
+ add.f32 %f563, %f196, %f547;
1334
+ .loc 1 73 24
1335
+ sub.f32 %f564, %f548, %f161;
1336
+ sub.f32 %f565, %f549, %f161;
1337
+ sub.f32 %f566, %f550, %f161;
1338
+ sub.f32 %f567, %f551, %f161;
1339
+ sub.f32 %f568, %f552, %f161;
1340
+ sub.f32 %f569, %f553, %f161;
1341
+ sub.f32 %f570, %f554, %f161;
1342
+ sub.f32 %f571, %f555, %f161;
1343
+ sub.f32 %f572, %f556, %f162;
1344
+ sub.f32 %f573, %f557, %f162;
1345
+ sub.f32 %f574, %f558, %f162;
1346
+ sub.f32 %f575, %f559, %f162;
1347
+ sub.f32 %f576, %f560, %f162;
1348
+ sub.f32 %f577, %f561, %f162;
1349
+ sub.f32 %f578, %f562, %f162;
1350
+ sub.f32 %f579, %f563, %f162;
1351
+ .loc 1 79 24
1352
+ mul.f32 %f582, %f564, %f580;
1353
+ mul.f32 %f583, %f565, %f580;
1354
+ mul.f32 %f584, %f566, %f580;
1355
+ mul.f32 %f585, %f567, %f580;
1356
+ mul.f32 %f586, %f568, %f580;
1357
+ mul.f32 %f587, %f569, %f580;
1358
+ mul.f32 %f588, %f570, %f580;
1359
+ mul.f32 %f589, %f571, %f580;
1360
+ mul.f32 %f590, %f572, %f581;
1361
+ mul.f32 %f591, %f573, %f581;
1362
+ mul.f32 %f592, %f574, %f581;
1363
+ mul.f32 %f593, %f575, %f581;
1364
+ mul.f32 %f594, %f576, %f581;
1365
+ mul.f32 %f595, %f577, %f581;
1366
+ mul.f32 %f596, %f578, %f581;
1367
+ mul.f32 %f597, %f579, %f581;
1368
+ .loc 1 80 24
1369
+ mul.f32 %f598, %f582, %f197;
1370
+ mul.f32 %f599, %f583, %f198;
1371
+ mul.f32 %f600, %f584, %f199;
1372
+ mul.f32 %f601, %f585, %f200;
1373
+ mul.f32 %f602, %f586, %f201;
1374
+ mul.f32 %f603, %f587, %f202;
1375
+ mul.f32 %f604, %f588, %f203;
1376
+ mul.f32 %f605, %f589, %f204;
1377
+ mul.f32 %f606, %f590, %f197;
1378
+ mul.f32 %f607, %f591, %f198;
1379
+ mul.f32 %f608, %f592, %f199;
1380
+ mul.f32 %f609, %f593, %f200;
1381
+ mul.f32 %f610, %f594, %f201;
1382
+ mul.f32 %f611, %f595, %f202;
1383
+ mul.f32 %f612, %f596, %f203;
1384
+ mul.f32 %f613, %f597, %f204;
1385
+ .loc 1 82 29
1386
+ shl.b64 %rd113, %rd11, 1;
1387
+ add.s64 %rd111, %rd16, %rd113;
1388
+ shl.b64 %rd114, %rd12, 1;
1389
+ add.s64 %rd112, %rd16, %rd114;
1390
+ .loc 1 82 52
1391
+ mov.b32 %r439, %f598;
1392
+ cvt.rn.bf16.f32 %rs33, %r439;
1393
+ mov.b32 %r440, %f599;
1394
+ cvt.rn.bf16.f32 %rs34, %r440;
1395
+ mov.b32 %r441, %f600;
1396
+ cvt.rn.bf16.f32 %rs35, %r441;
1397
+ mov.b32 %r442, %f601;
1398
+ cvt.rn.bf16.f32 %rs36, %r442;
1399
+ mov.b32 %r443, %f602;
1400
+ cvt.rn.bf16.f32 %rs37, %r443;
1401
+ mov.b32 %r444, %f603;
1402
+ cvt.rn.bf16.f32 %rs38, %r444;
1403
+ mov.b32 %r445, %f604;
1404
+ cvt.rn.bf16.f32 %rs39, %r445;
1405
+ mov.b32 %r446, %f605;
1406
+ cvt.rn.bf16.f32 %rs40, %r446;
1407
+ mov.b32 %r447, %f606;
1408
+ cvt.rn.bf16.f32 %rs41, %r447;
1409
+ mov.b32 %r448, %f607;
1410
+ cvt.rn.bf16.f32 %rs42, %r448;
1411
+ mov.b32 %r449, %f608;
1412
+ cvt.rn.bf16.f32 %rs43, %r449;
1413
+ mov.b32 %r450, %f609;
1414
+ cvt.rn.bf16.f32 %rs44, %r450;
1415
+ mov.b32 %r451, %f610;
1416
+ cvt.rn.bf16.f32 %rs45, %r451;
1417
+ mov.b32 %r452, %f611;
1418
+ cvt.rn.bf16.f32 %rs46, %r452;
1419
+ mov.b32 %r453, %f612;
1420
+ cvt.rn.bf16.f32 %rs47, %r453;
1421
+ mov.b32 %r454, %f613;
1422
+ cvt.rn.bf16.f32 %rs48, %r454;
1423
+ mov.b32 %r463, {%rs33, %rs34};
1424
+ mov.b32 %r464, {%rs35, %rs36};
1425
+ mov.b32 %r465, {%rs37, %rs38};
1426
+ mov.b32 %r466, {%rs39, %rs40};
1427
+ @%p1 st.global.v4.b32 [ %rd111 + 0 ], { %r463, %r464, %r465, %r466 };
1428
+ mov.b32 %r467, {%rs41, %rs42};
1429
+ mov.b32 %r468, {%rs43, %rs44};
1430
+ mov.b32 %r469, {%rs45, %rs46};
1431
+ mov.b32 %r470, {%rs47, %rs48};
1432
+ @%p1 st.global.v4.b32 [ %rd112 + 0 ], { %r467, %r468, %r469, %r470 };
1433
+ .loc 1 58 36
1434
+ add.s64 %rd117, %rd117, 256;
1435
+ add.s32 %r473, %r473, 64;
1436
+ setp.lt.u32 %p156, %r473, 192;
1437
+ @%p156 bra $L__BB0_5;
1438
+ bra.uni $L__BB0_8;
1439
+ $L__BB0_5:
1440
+ .loc 1 62 35
1441
+ add.s64 %rd90, %rd6, %rd117;
1442
+ add.s64 %rd91, %rd90, 16;
1443
+ add.s64 %rd92, %rd5, %rd117;
1444
+ .loc 1 62 51
1445
+ add.s64 %rd93, %rd92, 16;
1446
+ mov.u32 %r323, 0x0;
1447
+ mov.u32 %r324, 0x0;
1448
+ mov.u32 %r325, 0x0;
1449
+ mov.u32 %r326, 0x0;
1450
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r323, %r324, %r325, %r326 }, [ %rd90 + 0 ];
1451
+ @!%p1 mov.u32 %r323, %r411;
1452
+ @!%p1 mov.u32 %r324, %r411;
1453
+ @!%p1 mov.u32 %r325, %r411;
1454
+ @!%p1 mov.u32 %r326, %r411;
1455
+ mov.b32 %f165, %r323;
1456
+ mov.b32 %f166, %r324;
1457
+ mov.b32 %f167, %r325;
1458
+ mov.b32 %f168, %r326;
1459
+ mov.u32 %r331, 0x0;
1460
+ mov.u32 %r332, 0x0;
1461
+ mov.u32 %r333, 0x0;
1462
+ mov.u32 %r334, 0x0;
1463
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r331, %r332, %r333, %r334 }, [ %rd91 + 0 ];
1464
+ @!%p1 mov.u32 %r331, %r411;
1465
+ @!%p1 mov.u32 %r332, %r411;
1466
+ @!%p1 mov.u32 %r333, %r411;
1467
+ @!%p1 mov.u32 %r334, %r411;
1468
+ mov.b32 %f169, %r331;
1469
+ mov.b32 %f170, %r332;
1470
+ mov.b32 %f171, %r333;
1471
+ mov.b32 %f172, %r334;
1472
+ mov.u32 %r339, 0x0;
1473
+ mov.u32 %r340, 0x0;
1474
+ mov.u32 %r341, 0x0;
1475
+ mov.u32 %r342, 0x0;
1476
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r339, %r340, %r341, %r342 }, [ %rd92 + 0 ];
1477
+ @!%p1 mov.u32 %r339, %r411;
1478
+ @!%p1 mov.u32 %r340, %r411;
1479
+ @!%p1 mov.u32 %r341, %r411;
1480
+ @!%p1 mov.u32 %r342, %r411;
1481
+ mov.b32 %f173, %r339;
1482
+ mov.b32 %f174, %r340;
1483
+ mov.b32 %f175, %r341;
1484
+ mov.b32 %f176, %r342;
1485
+ mov.u32 %r347, 0x0;
1486
+ mov.u32 %r348, 0x0;
1487
+ mov.u32 %r349, 0x0;
1488
+ mov.u32 %r350, 0x0;
1489
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r347, %r348, %r349, %r350 }, [ %rd93 + 0 ];
1490
+ @!%p1 mov.u32 %r347, %r411;
1491
+ @!%p1 mov.u32 %r348, %r411;
1492
+ @!%p1 mov.u32 %r349, %r411;
1493
+ @!%p1 mov.u32 %r350, %r411;
1494
+ mov.b32 %f177, %r347;
1495
+ mov.b32 %f178, %r348;
1496
+ mov.b32 %f179, %r349;
1497
+ mov.b32 %f180, %r350;
1498
+ .loc 1 63 41
1499
+ add.s32 %r403, %r5, %r473;
1500
+ add.s32 %r404, %r403, 64;
1501
+ .loc 1 63 35
1502
+ add.s32 %r405, %r403, 8256;
1503
+ cvt.s64.s32 %rd11, %r404;
1504
+ mul.wide.s32 %rd98, %r404, 2;
1505
+ add.s64 %rd94, %rd14, %rd98;
1506
+ cvt.s64.s32 %rd12, %r405;
1507
+ mul.wide.s32 %rd99, %r405, 2;
1508
+ add.s64 %rd95, %rd14, %rd99;
1509
+ .loc 1 63 51
1510
+ mov.u32 %r355, 0x0;
1511
+ mov.u32 %r356, 0x0;
1512
+ mov.u32 %r357, 0x0;
1513
+ mov.u32 %r358, 0x0;
1514
+ @%p1 ld.global.L1::evict_first.v4.b32 { %r355, %r356, %r357, %r358 }, [ %rd94 + 0 ];
1515
+ @!%p1 mov.u32 %r355, %r411;
1516
+ @!%p1 mov.u32 %r356, %r411;
1517
+ @!%p1 mov.u32 %r357, %r411;
1518
+ @!%p1 mov.u32 %r358, %r411;
1519
+ cvt.u16.u32 %rs17, %r355;
1520
+ { .reg .b16 tmp; mov.b32 {tmp, %rs18}, %r355; }
1521
+ cvt.u16.u32 %rs19, %r356;
1522
+ { .reg .b16 tmp; mov.b32 {tmp, %rs20}, %r356; }
1523
+ cvt.u16.u32 %rs21, %r357;
1524
+ { .reg .b16 tmp; mov.b32 {tmp, %rs22}, %r357; }
1525
+ cvt.u16.u32 %rs23, %r358;
1526
+ { .reg .b16 tmp; mov.b32 {tmp, %rs24}, %r358; }
1527
+ mov.u32 %r363, 0x0;
1528
+ mov.u32 %r364, 0x0;
1529
+ mov.u32 %r365, 0x0;
1530
+ mov.u32 %r366, 0x0;
1531
+ @%p1 ld.global.L1::evict_first.v4.b32 { %r363, %r364, %r365, %r366 }, [ %rd95 + 0 ];
1532
+ @!%p1 mov.u32 %r363, %r411;
1533
+ @!%p1 mov.u32 %r364, %r411;
1534
+ @!%p1 mov.u32 %r365, %r411;
1535
+ @!%p1 mov.u32 %r366, %r411;
1536
+ cvt.u16.u32 %rs25, %r363;
1537
+ { .reg .b16 tmp; mov.b32 {tmp, %rs26}, %r363; }
1538
+ cvt.u16.u32 %rs27, %r364;
1539
+ { .reg .b16 tmp; mov.b32 {tmp, %rs28}, %r364; }
1540
+ cvt.u16.u32 %rs29, %r365;
1541
+ { .reg .b16 tmp; mov.b32 {tmp, %rs30}, %r365; }
1542
+ cvt.u16.u32 %rs31, %r366;
1543
+ { .reg .b16 tmp; mov.b32 {tmp, %rs32}, %r366; }
1544
+ .loc 1 63 103
1545
+ cvt.f32.bf16 %r371, %rs17;
1546
+ mov.b32 %f181, %r371;
1547
+ cvt.f32.bf16 %r372, %rs18;
1548
+ mov.b32 %f182, %r372;
1549
+ cvt.f32.bf16 %r373, %rs19;
1550
+ mov.b32 %f183, %r373;
1551
+ cvt.f32.bf16 %r374, %rs20;
1552
+ mov.b32 %f184, %r374;
1553
+ cvt.f32.bf16 %r375, %rs21;
1554
+ mov.b32 %f185, %r375;
1555
+ cvt.f32.bf16 %r376, %rs22;
1556
+ mov.b32 %f186, %r376;
1557
+ cvt.f32.bf16 %r377, %rs23;
1558
+ mov.b32 %f187, %r377;
1559
+ cvt.f32.bf16 %r378, %rs24;
1560
+ mov.b32 %f188, %r378;
1561
+ cvt.f32.bf16 %r379, %rs25;
1562
+ mov.b32 %f189, %r379;
1563
+ cvt.f32.bf16 %r380, %rs26;
1564
+ mov.b32 %f190, %r380;
1565
+ cvt.f32.bf16 %r381, %rs27;
1566
+ mov.b32 %f191, %r381;
1567
+ cvt.f32.bf16 %r382, %rs28;
1568
+ mov.b32 %f192, %r382;
1569
+ cvt.f32.bf16 %r383, %rs29;
1570
+ mov.b32 %f193, %r383;
1571
+ cvt.f32.bf16 %r384, %rs30;
1572
+ mov.b32 %f194, %r384;
1573
+ cvt.f32.bf16 %r385, %rs31;
1574
+ mov.b32 %f195, %r385;
1575
+ cvt.f32.bf16 %r386, %rs32;
1576
+ mov.b32 %f196, %r386;
1577
+ .loc 1 64 35
1578
+ add.s64 %rd96, %rd9, %rd117;
1579
+ .loc 1 64 40
1580
+ add.s64 %rd97, %rd96, 16;
1581
+ mov.u32 %r387, 0x0;
1582
+ mov.u32 %r388, 0x0;
1583
+ mov.u32 %r389, 0x0;
1584
+ mov.u32 %r390, 0x0;
1585
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r387, %r388, %r389, %r390 }, [ %rd96 + 0 ];
1586
+ @!%p1 mov.u32 %r387, %r411;
1587
+ @!%p1 mov.u32 %r388, %r411;
1588
+ @!%p1 mov.u32 %r389, %r411;
1589
+ @!%p1 mov.u32 %r390, %r411;
1590
+ mov.b32 %f197, %r387;
1591
+ mov.b32 %f198, %r388;
1592
+ mov.b32 %f199, %r389;
1593
+ mov.b32 %f200, %r390;
1594
+ mov.u32 %r395, 0x0;
1595
+ mov.u32 %r396, 0x0;
1596
+ mov.u32 %r397, 0x0;
1597
+ mov.u32 %r398, 0x0;
1598
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r395, %r396, %r397, %r398 }, [ %rd97 + 0 ];
1599
+ @!%p1 mov.u32 %r395, %r411;
1600
+ @!%p1 mov.u32 %r396, %r411;
1601
+ @!%p1 mov.u32 %r397, %r411;
1602
+ @!%p1 mov.u32 %r398, %r411;
1603
+ mov.b32 %f201, %r395;
1604
+ mov.b32 %f202, %r396;
1605
+ mov.b32 %f203, %r397;
1606
+ mov.b32 %f204, %r398;
1607
+ .loc 1 68 57
1608
+ @%p51 bra $L__BB0_7;
1609
+ mov.u64 %rd100, assertMessage_1;
1610
+ cvta.global.u64 %rd101, %rd100;
1611
+ mov.u64 %rd102, assertFile_1;
1612
+ cvta.global.u64 %rd103, %rd102;
1613
+ mov.u64 %rd104, assertFunc_1;
1614
+ cvta.global.u64 %rd105, %rd104;
1615
+ { // callseq 7, 0
1616
+ .reg .b32 temp_param_reg;
1617
+ .param .b64 param0;
1618
+ st.param.b64 [param0+0], %rd101;
1619
+ .param .b64 param1;
1620
+ st.param.b64 [param1+0], %rd103;
1621
+ .param .b32 param2;
1622
+ st.param.b32 [param2+0], %r471;
1623
+ .param .b64 param3;
1624
+ st.param.b64 [param3+0], %rd105;
1625
+ .param .b64 param4;
1626
+ st.param.b64 [param4+0], %rd115;
1627
+ call.uni
1628
+ __assertfail,
1629
+ (
1630
+ param0,
1631
+ param1,
1632
+ param2,
1633
+ param3,
1634
+ param4
1635
+ );
1636
+ } // callseq 7
1637
+ bra.uni $L__BB0_7;
1638
+ $L__BB0_8:
1639
+ .loc 1 58 4
1640
+ ret;
1641
+ $L__tmp17:
1642
+ $L__func_end0:
1643
+
1644
+ }
1645
+ // .globl __nv_rsqrtf
1646
+ .visible .func (.param .b32 func_retval0) __nv_rsqrtf(
1647
+ .param .b32 __nv_rsqrtf_param_0
1648
+ )
1649
+ {
1650
+ .reg .f32 %f<3>;
1651
+ $L__func_begin1:
1652
+
1653
+ ld.param.f32 %f1, [__nv_rsqrtf_param_0];
1654
+ rsqrt.approx.ftz.f32 %f2, %f1;
1655
+ st.param.f32 [func_retval0+0], %f2;
1656
+ ret;
1657
+ $L__func_end1:
1658
+
1659
+ }
1660
+ .file 1 "/tmp/torchinductor_root/pn/cpn3lawg65lpi63gv6c6pn4oikhg6qva2h2qjdpxe6qj4lvttwez.py"
1661
+ .file 2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
1662
+ .section .debug_abbrev
1663
+ {
1664
+ .b8 1
1665
+ .b8 17
1666
+ .b8 1
1667
+ .b8 37
1668
+ .b8 8
1669
+ .b8 19
1670
+ .b8 5
1671
+ .b8 3
1672
+ .b8 8
1673
+ .b8 16
1674
+ .b8 6
1675
+ .b8 27
1676
+ .b8 8
1677
+ .b8 180
1678
+ .b8 66
1679
+ .b8 12
1680
+ .b8 17
1681
+ .b8 1
1682
+ .b8 18
1683
+ .b8 1
1684
+ .b8 0
1685
+ .b8 0
1686
+ .b8 2
1687
+ .b8 46
1688
+ .b8 0
1689
+ .b8 135
1690
+ .b8 64
1691
+ .b8 8
1692
+ .b8 3
1693
+ .b8 8
1694
+ .b8 58
1695
+ .b8 11
1696
+ .b8 59
1697
+ .b8 11
1698
+ .b8 63
1699
+ .b8 12
1700
+ .b8 32
1701
+ .b8 11
1702
+ .b8 0
1703
+ .b8 0
1704
+ .b8 3
1705
+ .b8 46
1706
+ .b8 1
1707
+ .b8 17
1708
+ .b8 1
1709
+ .b8 18
1710
+ .b8 1
1711
+ .b8 64
1712
+ .b8 10
1713
+ .b8 49
1714
+ .b8 19
1715
+ .b8 0
1716
+ .b8 0
1717
+ .b8 4
1718
+ .b8 29
1719
+ .b8 0
1720
+ .b8 49
1721
+ .b8 19
1722
+ .b8 17
1723
+ .b8 1
1724
+ .b8 18
1725
+ .b8 1
1726
+ .b8 88
1727
+ .b8 11
1728
+ .b8 89
1729
+ .b8 11
1730
+ .b8 87
1731
+ .b8 11
1732
+ .b8 0
1733
+ .b8 0
1734
+ .b8 5
1735
+ .b8 29
1736
+ .b8 1
1737
+ .b8 49
1738
+ .b8 19
1739
+ .b8 17
1740
+ .b8 1
1741
+ .b8 18
1742
+ .b8 1
1743
+ .b8 88
1744
+ .b8 11
1745
+ .b8 89
1746
+ .b8 11
1747
+ .b8 87
1748
+ .b8 11
1749
+ .b8 0
1750
+ .b8 0
1751
+ .b8 0
1752
+ }
1753
+ .section .debug_info
1754
+ {
1755
+ .b32 302
1756
+ .b8 2
1757
+ .b8 0
1758
+ .b32 .debug_abbrev
1759
+ .b8 8
1760
+ .b8 1
1761
+ .b8 116
1762
+ .b8 114
1763
+ .b8 105
1764
+ .b8 116
1765
+ .b8 111
1766
+ .b8 110
1767
+ .b8 0
1768
+ .b8 2
1769
+ .b8 0
1770
+ .b8 99
1771
+ .b8 112
1772
+ .b8 110
1773
+ .b8 51
1774
+ .b8 108
1775
+ .b8 97
1776
+ .b8 119
1777
+ .b8 103
1778
+ .b8 54
1779
+ .b8 53
1780
+ .b8 108
1781
+ .b8 112
1782
+ .b8 105
1783
+ .b8 54
1784
+ .b8 51
1785
+ .b8 103
1786
+ .b8 118
1787
+ .b8 54
1788
+ .b8 99
1789
+ .b8 54
1790
+ .b8 112
1791
+ .b8 110
1792
+ .b8 52
1793
+ .b8 111
1794
+ .b8 105
1795
+ .b8 107
1796
+ .b8 104
1797
+ .b8 103
1798
+ .b8 54
1799
+ .b8 113
1800
+ .b8 118
1801
+ .b8 97
1802
+ .b8 50
1803
+ .b8 104
1804
+ .b8 50
1805
+ .b8 113
1806
+ .b8 106
1807
+ .b8 100
1808
+ .b8 112
1809
+ .b8 120
1810
+ .b8 101
1811
+ .b8 54
1812
+ .b8 113
1813
+ .b8 106
1814
+ .b8 52
1815
+ .b8 108
1816
+ .b8 118
1817
+ .b8 116
1818
+ .b8 116
1819
+ .b8 119
1820
+ .b8 101
1821
+ .b8 122
1822
+ .b8 46
1823
+ .b8 112
1824
+ .b8 121
1825
+ .b8 0
1826
+ .b32 .debug_line
1827
+ .b8 47
1828
+ .b8 116
1829
+ .b8 109
1830
+ .b8 112
1831
+ .b8 47
1832
+ .b8 116
1833
+ .b8 111
1834
+ .b8 114
1835
+ .b8 99
1836
+ .b8 104
1837
+ .b8 105
1838
+ .b8 110
1839
+ .b8 100
1840
+ .b8 117
1841
+ .b8 99
1842
+ .b8 116
1843
+ .b8 111
1844
+ .b8 114
1845
+ .b8 95
1846
+ .b8 114
1847
+ .b8 111
1848
+ .b8 111
1849
+ .b8 116
1850
+ .b8 47
1851
+ .b8 112
1852
+ .b8 110
1853
+ .b8 0
1854
+ .b8 1
1855
+ .b64 $L__func_begin0
1856
+ .b64 $L__func_end0
1857
+ .b8 2
1858
+ .b8 116
1859
+ .b8 114
1860
+ .b8 105
1861
+ .b8 116
1862
+ .b8 111
1863
+ .b8 110
1864
+ .b8 95
1865
+ .b8 95
1866
+ .b8 48
1867
+ .b8 100
1868
+ .b8 49
1869
+ .b8 100
1870
+ .b8 50
1871
+ .b8 100
1872
+ .b8 51
1873
+ .b8 100
1874
+ .b8 52
1875
+ .b8 100
1876
+ .b8 53
1877
+ .b8 100
1878
+ .b8 54
1879
+ .b8 100
1880
+ .b8 101
1881
+ .b8 55
1882
+ .b8 100
1883
+ .b8 101
1884
+ .b8 0
1885
+ .b8 116
1886
+ .b8 114
1887
+ .b8 105
1888
+ .b8 116
1889
+ .b8 111
1890
+ .b8 110
1891
+ .b8 95
1892
+ .b8 95
1893
+ .b8 48
1894
+ .b8 100
1895
+ .b8 49
1896
+ .b8 100
1897
+ .b8 50
1898
+ .b8 100
1899
+ .b8 51
1900
+ .b8 100
1901
+ .b8 52
1902
+ .b8 100
1903
+ .b8 53
1904
+ .b8 100
1905
+ .b8 54
1906
+ .b8 100
1907
+ .b8 101
1908
+ .b8 55
1909
+ .b8 100
1910
+ .b8 101
1911
+ .b8 0
1912
+ .b8 1
1913
+ .b8 18
1914
+ .b8 1
1915
+ .b8 1
1916
+ .b8 3
1917
+ .b64 $L__func_begin0
1918
+ .b64 $L__func_end0
1919
+ .b8 1
1920
+ .b8 156
1921
+ .b32 125
1922
+ .b8 4
1923
+ .b32 125
1924
+ .b64 $L__tmp1
1925
+ .b64 $L__tmp2
1926
+ .b8 2
1927
+ .b8 47
1928
+ .b8 41
1929
+ .b8 5
1930
+ .b32 125
1931
+ .b64 $L__tmp3
1932
+ .b64 $L__tmp16
1933
+ .b8 2
1934
+ .b8 53
1935
+ .b8 44
1936
+ .b8 4
1937
+ .b32 125
1938
+ .b64 $L__tmp3
1939
+ .b64 $L__tmp16
1940
+ .b8 2
1941
+ .b8 120
1942
+ .b8 46
1943
+ .b8 0
1944
+ .b8 4
1945
+ .b32 125
1946
+ .b64 $L__tmp4
1947
+ .b64 $L__tmp15
1948
+ .b8 2
1949
+ .b8 53
1950
+ .b8 44
1951
+ .b8 0
1952
+ .b8 0
1953
+ }
1954
+ .section .debug_pubnames
1955
+ {
1956
+ .b32 $L__pubNames_end0-$L__pubNames_start0
1957
+ $L__pubNames_start0:
1958
+ .b8 2
1959
+ .b8 0
1960
+ .b32 .debug_info
1961
+ .b32 306
1962
+ .b32 125
1963
+ .b8 116
1964
+ .b8 114
1965
+ .b8 105
1966
+ .b8 116
1967
+ .b8 111
1968
+ .b8 110
1969
+ .b8 95
1970
+ .b8 95
1971
+ .b8 48
1972
+ .b8 100
1973
+ .b8 49
1974
+ .b8 100
1975
+ .b8 50
1976
+ .b8 100
1977
+ .b8 51
1978
+ .b8 100
1979
+ .b8 52
1980
+ .b8 100
1981
+ .b8 53
1982
+ .b8 100
1983
+ .b8 54
1984
+ .b8 100
1985
+ .b8 101
1986
+ .b8 55
1987
+ .b8 100
1988
+ .b8 101
1989
+ .b8 0
1990
+ .b32 0
1991
+ $L__pubNames_end0:
1992
+ }
1993
+ .section .debug_pubtypes
1994
+ {
1995
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
1996
+ $L__pubTypes_start0:
1997
+ .b8 2
1998
+ .b8 0
1999
+ .b32 .debug_info
2000
+ .b32 306
2001
+ .b32 0
2002
+ $L__pubTypes_end0:
2003
+ }
2004
+ .section .debug_loc { }
.triton/dump/cde66a00b7594f6428e4f4dcdfa88537/triton_.ttgir ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [2, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
2
+ #blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
3
+ #blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [2, 4], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
4
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
5
+ tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
6
+ %cst = arith.constant dense<512> : tensor<16x1xi32, #blocked>
7
+ %cst_0 = arith.constant dense<256> : tensor<1x128xi32, #blocked>
8
+ %cst_1 = arith.constant dense<256> : tensor<16x1xi32, #blocked>
9
+ %cst_2 = arith.constant dense<0.000000e+00> : tensor<16x128xf32, #blocked>
10
+ %cst_3 = arith.constant dense<0.000000e+00> : tensor<1x128xf32, #blocked>
11
+ %cst_4 = arith.constant dense<1.000000e+00> : tensor<16x128xf32, #blocked>
12
+ %cst_5 = arith.constant dense<256> : tensor<16x1xi64, #blocked>
13
+ %cst_6 = arith.constant dense<0> : tensor<16x1xi64, #blocked>
14
+ %cst_7 = arith.constant dense<50257> : tensor<16x1xi64, #blocked>
15
+ %cst_8 = arith.constant dense<50257> : tensor<16x1xi64, #blocked1>
16
+ %cst_9 = arith.constant dense<0> : tensor<16x1xi64, #blocked1>
17
+ %c0_i32 = arith.constant 0 : i32
18
+ %c128_i32 = arith.constant 128 : i32
19
+ %c256_i32 = arith.constant 256 : i32
20
+ %cst_10 = arith.constant dense<1.000000e+00> : tensor<16x128xf32, #blocked2>
21
+ %cst_11 = arith.constant 0.000000e+00 : f32
22
+ %cst_12 = arith.constant dense<0.000000e+00> : tensor<16x128xf32, #blocked2>
23
+ %cst_13 = arith.constant dense<256> : tensor<1x128xi32, #blocked2>
24
+ %cst_14 = arith.constant dense<9.99999974E-6> : tensor<16x1xf32, #blocked>
25
+ %cst_15 = arith.constant dense<2.560000e+02> : tensor<16x1xf32, #blocked>
26
+ %c16_i32 = arith.constant 16 : i32
27
+ %0 = tt.get_program_id x : i32
28
+ %1 = arith.muli %0, %c16_i32 : i32
29
+ %2 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
30
+ %3 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
31
+ %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16x1xi32, #blocked>
32
+ %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<16x1xi32, #blocked1>
33
+ %6 = tt.splat %1 : (i32) -> tensor<16x1xi32, #blocked>
34
+ %7 = tt.splat %1 : (i32) -> tensor<16x1xi32, #blocked1>
35
+ %8 = arith.addi %6, %4 : tensor<16x1xi32, #blocked>
36
+ %9 = arith.addi %7, %5 : tensor<16x1xi32, #blocked1>
37
+ %10 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
38
+ %11 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>
39
+ %12 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x128xi32, #blocked>
40
+ %13 = tt.expand_dims %11 {axis = 0 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>) -> tensor<1x128xi32, #blocked2>
41
+ %14 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<16x1x!tt.ptr<i64, 1>, #blocked>
42
+ %15 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<16x1x!tt.ptr<i64, 1>, #blocked1>
43
+ %16 = tt.addptr %14, %8 : tensor<16x1x!tt.ptr<i64, 1>, #blocked>, tensor<16x1xi32, #blocked>
44
+ %17 = tt.addptr %15, %9 : tensor<16x1x!tt.ptr<i64, 1>, #blocked1>, tensor<16x1xi32, #blocked1>
45
+ %18 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x1xi64, #blocked>
46
+ %19 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x1xi64, #blocked1>
47
+ %20 = arith.remsi %8, %cst : tensor<16x1xi32, #blocked>
48
+ %21 = arith.muli %20, %cst_1 : tensor<16x1xi32, #blocked>
49
+ %22 = tt.broadcast %21 : (tensor<16x1xi32, #blocked>) -> tensor<16x128xi32, #blocked>
50
+ %23 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<16x128x!tt.ptr<f32, 1>, #blocked>
51
+ %24 = arith.addi %18, %cst_7 : tensor<16x1xi64, #blocked>
52
+ %25 = arith.addi %19, %cst_8 : tensor<16x1xi64, #blocked1>
53
+ %26 = arith.cmpi slt, %18, %cst_6 : tensor<16x1xi64, #blocked>
54
+ %27 = arith.cmpi slt, %19, %cst_9 : tensor<16x1xi64, #blocked1>
55
+ %28 = arith.select %26, %24, %18 : tensor<16x1xi1, #blocked>, tensor<16x1xi64, #blocked>
56
+ %29 = arith.select %27, %25, %19 : tensor<16x1xi1, #blocked1>, tensor<16x1xi64, #blocked1>
57
+ %30 = arith.cmpi sge, %29, %cst_9 : tensor<16x1xi64, #blocked1>
58
+ %31 = arith.cmpi slt, %29, %cst_8 : tensor<16x1xi64, #blocked1>
59
+ %32 = arith.andi %30, %31 : tensor<16x1xi1, #blocked1>
60
+ %33 = arith.muli %28, %cst_5 : tensor<16x1xi64, #blocked>
61
+ %34 = tt.broadcast %33 : (tensor<16x1xi64, #blocked>) -> tensor<16x128xi64, #blocked>
62
+ %35 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<16x128x!tt.ptr<f32, 1>, #blocked>
63
+ %36:4 = scf.for %arg7 = %c0_i32 to %c256_i32 step %c128_i32 iter_args(%arg8 = %cst_2, %arg9 = %cst_2, %arg10 = %cst_12, %arg11 = %cst_2) -> (tensor<16x128xf32, #blocked>, tensor<16x128xf32, #blocked>, tensor<16x128xf32, #blocked2>, tensor<16x128xf32, #blocked>) : i32 {
64
+ %48 = tt.splat %arg7 : (i32) -> tensor<1x128xi32, #blocked>
65
+ %49 = tt.splat %arg7 : (i32) -> tensor<1x128xi32, #blocked2>
66
+ %50 = arith.addi %48, %12 : tensor<1x128xi32, #blocked>
67
+ %51 = arith.addi %49, %13 : tensor<1x128xi32, #blocked2>
68
+ %52 = arith.cmpi slt, %50, %cst_0 : tensor<1x128xi32, #blocked>
69
+ %53 = arith.cmpi slt, %51, %cst_13 : tensor<1x128xi32, #blocked2>
70
+ %54 = tt.broadcast %50 : (tensor<1x128xi32, #blocked>) -> tensor<16x128xi32, #blocked>
71
+ %55 = arith.addi %54, %22 : tensor<16x128xi32, #blocked>
72
+ %56 = tt.addptr %23, %55 : tensor<16x128x!tt.ptr<f32, 1>, #blocked>, tensor<16x128xi32, #blocked>
73
+ %57 = tt.broadcast %52 : (tensor<1x128xi1, #blocked>) -> tensor<16x128xi1, #blocked>
74
+ %58 = tt.broadcast %53 : (tensor<1x128xi1, #blocked2>) -> tensor<16x128xi1, #blocked2>
75
+ %59 = tt.load %56, %57, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x128xf32, #blocked>
76
+ tt.assert %32, "index out of bounds: 0 <= tmp3 < 50257", "/usr/local/lib/python3.10/dist-packages/torch/_inductor/codecache.py", "<module>", 1892 : tensor<16x1xi1, #blocked1>
77
+ %60 = arith.extsi %50 : tensor<1x128xi32, #blocked> to tensor<1x128xi64, #blocked>
78
+ %61 = tt.broadcast %60 : (tensor<1x128xi64, #blocked>) -> tensor<16x128xi64, #blocked>
79
+ %62 = arith.addi %61, %34 : tensor<16x128xi64, #blocked>
80
+ %63 = tt.addptr %35, %62 : tensor<16x128x!tt.ptr<f32, 1>, #blocked>, tensor<16x128xi64, #blocked>
81
+ %64 = tt.load %63, %57, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x128xf32, #blocked>
82
+ %65 = arith.addf %64, %59 : tensor<16x128xf32, #blocked>
83
+ %66 = arith.subf %65, %arg8 : tensor<16x128xf32, #blocked>
84
+ %67 = arith.addf %arg11, %cst_4 : tensor<16x128xf32, #blocked>
85
+ %68 = arith.addf %arg10, %cst_10 : tensor<16x128xf32, #blocked2>
86
+ %69 = arith.divf %66, %67 : tensor<16x128xf32, #blocked>
87
+ %70 = arith.addf %arg8, %69 : tensor<16x128xf32, #blocked>
88
+ %71 = arith.subf %65, %70 : tensor<16x128xf32, #blocked>
89
+ %72 = arith.mulf %66, %71 : tensor<16x128xf32, #blocked>
90
+ %73 = arith.addf %arg9, %72 : tensor<16x128xf32, #blocked>
91
+ %74 = arith.select %57, %70, %arg8 : tensor<16x128xi1, #blocked>, tensor<16x128xf32, #blocked>
92
+ %75 = arith.select %57, %73, %arg9 : tensor<16x128xi1, #blocked>, tensor<16x128xf32, #blocked>
93
+ %76 = arith.select %57, %67, %arg11 : tensor<16x128xi1, #blocked>, tensor<16x128xf32, #blocked>
94
+ %77 = arith.select %58, %68, %arg10 : tensor<16x128xi1, #blocked2>, tensor<16x128xf32, #blocked2>
95
+ scf.yield %74, %75, %77, %76 : tensor<16x128xf32, #blocked>, tensor<16x128xf32, #blocked>, tensor<16x128xf32, #blocked2>, tensor<16x128xf32, #blocked>
96
+ }
97
+ %37 = triton_gpu.convert_layout %36#2 : (tensor<16x128xf32, #blocked2>) -> tensor<16x128xf32, #blocked>
98
+ %38:3 = "tt.reduce"(%36#0, %36#1, %37) <{axis = 1 : i32}> ({
99
+ ^bb0(%arg7: f32, %arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32):
100
+ %48 = arith.subf %arg10, %arg7 : f32
101
+ %49 = arith.addf %arg9, %arg12 : f32
102
+ %50 = arith.cmpf oeq, %49, %cst_11 : f32
103
+ %51 = arith.divf %arg12, %49 : f32
104
+ %52 = arith.select %50, %cst_11, %51 : f32
105
+ %53 = arith.mulf %48, %52 : f32
106
+ %54 = arith.addf %arg7, %53 : f32
107
+ %55 = arith.addf %arg8, %arg11 : f32
108
+ %56 = arith.mulf %48, %48 : f32
109
+ %57 = arith.mulf %56, %arg9 : f32
110
+ %58 = arith.mulf %57, %52 : f32
111
+ %59 = arith.addf %55, %58 : f32
112
+ tt.reduce.return %54, %59, %49 : f32, f32, f32
113
+ }) : (tensor<16x128xf32, #blocked>, tensor<16x128xf32, #blocked>, tensor<16x128xf32, #blocked>) -> (tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>)
114
+ %39 = tt.expand_dims %38#0 {axis = 1 : i32} : (tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16x1xf32, #blocked>
115
+ %40 = tt.expand_dims %38#1 {axis = 1 : i32} : (tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16x1xf32, #blocked>
116
+ %41 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<1x128x!tt.ptr<f32, 1>, #blocked>
117
+ %42 = tt.broadcast %39 : (tensor<16x1xf32, #blocked>) -> tensor<16x128xf32, #blocked>
118
+ %43 = arith.divf %40, %cst_15 : tensor<16x1xf32, #blocked>
119
+ %44 = arith.addf %43, %cst_14 : tensor<16x1xf32, #blocked>
120
+ %45 = arith.muli %8, %cst_1 : tensor<16x1xi32, #blocked>
121
+ %46 = tt.broadcast %45 : (tensor<16x1xi32, #blocked>) -> tensor<16x128xi32, #blocked>
122
+ %47 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<16x128x!tt.ptr<bf16, 1>, #blocked>
123
+ scf.for %arg7 = %c0_i32 to %c256_i32 step %c128_i32 : i32 {
124
+ %48 = tt.splat %arg7 : (i32) -> tensor<1x128xi32, #blocked>
125
+ %49 = arith.addi %48, %12 : tensor<1x128xi32, #blocked>
126
+ %50 = arith.cmpi slt, %49, %cst_0 : tensor<1x128xi32, #blocked>
127
+ %51 = tt.broadcast %49 : (tensor<1x128xi32, #blocked>) -> tensor<16x128xi32, #blocked>
128
+ %52 = arith.addi %51, %22 : tensor<16x128xi32, #blocked>
129
+ %53 = tt.addptr %23, %52 : tensor<16x128x!tt.ptr<f32, 1>, #blocked>, tensor<16x128xi32, #blocked>
130
+ %54 = tt.broadcast %50 : (tensor<1x128xi1, #blocked>) -> tensor<16x128xi1, #blocked>
131
+ %55 = tt.load %53, %54, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x128xf32, #blocked>
132
+ %56 = tt.addptr %41, %49 : tensor<1x128x!tt.ptr<f32, 1>, #blocked>, tensor<1x128xi32, #blocked>
133
+ %57 = tt.load %56, %50, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x128xf32, #blocked>
134
+ tt.assert %32, "index out of bounds: 0 <= tmp13 < 50257", "/usr/local/lib/python3.10/dist-packages/torch/_inductor/codecache.py", "<module>", 1892 : tensor<16x1xi1, #blocked1>
135
+ %58 = arith.extsi %49 : tensor<1x128xi32, #blocked> to tensor<1x128xi64, #blocked>
136
+ %59 = tt.broadcast %58 : (tensor<1x128xi64, #blocked>) -> tensor<16x128xi64, #blocked>
137
+ %60 = arith.addi %59, %34 : tensor<16x128xi64, #blocked>
138
+ %61 = tt.addptr %35, %60 : tensor<16x128x!tt.ptr<f32, 1>, #blocked>, tensor<16x128xi64, #blocked>
139
+ %62 = tt.load %61, %54, %cst_2 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<16x128xf32, #blocked>
140
+ %63 = arith.addf %62, %55 : tensor<16x128xf32, #blocked>
141
+ %64 = arith.subf %63, %42 : tensor<16x128xf32, #blocked>
142
+ %65 = tt.extern_elementwise %44 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<16x1xf32, #blocked>) -> tensor<16x1xf32, #blocked>
143
+ %66 = tt.broadcast %65 : (tensor<16x1xf32, #blocked>) -> tensor<16x128xf32, #blocked>
144
+ %67 = arith.mulf %64, %66 : tensor<16x128xf32, #blocked>
145
+ %68 = tt.broadcast %57 : (tensor<1x128xf32, #blocked>) -> tensor<16x128xf32, #blocked>
146
+ %69 = arith.mulf %67, %68 : tensor<16x128xf32, #blocked>
147
+ %70 = arith.addi %51, %46 : tensor<16x128xi32, #blocked>
148
+ %71 = tt.addptr %47, %70 : tensor<16x128x!tt.ptr<bf16, 1>, #blocked>, tensor<16x128xi32, #blocked>
149
+ %72 = arith.truncf %69 : tensor<16x128xf32, #blocked> to tensor<16x128xbf16, #blocked>
150
+ tt.store %71, %72, %54 {cache = 1 : i32, evict = 1 : i32} : tensor<16x128xbf16, #blocked>
151
+ }
152
+ tt.return
153
+ }
154
+ }
wandb/run-20240926_055222-14kj2390/run-14kj2390.wandb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a41d34844187b603a549bc59bb4411ea5dc13a1b984865be55df2318b2e7a8c3
3
+ size 28769629
wandb/run-20240926_124123-zc6s8e8w/run-zc6s8e8w.wandb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb8a97e2da032f4f531a370d36d1202e0c3415accd568812cc10f04ca7663010
3
+ size 28283808
wandb/run-20240926_192831-378lr5yg/run-378lr5yg.wandb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6072638d705ca84f37480a4e2d52557f77495381d8a7857a0fe95ba3c6b6d88b
3
+ size 28266657
wandb/run-20240927_021423-clesd0p8/run-clesd0p8.wandb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:80ce69a6ed252398eac1d3fc86876dd099d2ecff12750606bc8be6129112bf24
3
+ size 27656192