Add files using upload-large-folder tool
Browse files- .gitattributes +4 -0
- .triton/dump/0db70b0f0846c3c6c38c4ccb3ef979e3/triton_.llir +1121 -0
- .triton/dump/174400122b6dbc99e086544aa1856b9f/triton_.llir +330 -0
- .triton/dump/21d0195c63fb062bfc567b79c9bb2771/triton_.llir +321 -0
- .triton/dump/21d0195c63fb062bfc567b79c9bb2771/triton_.ttir +87 -0
- .triton/dump/345a87a492fd703c73ab83265a21fcb6/triton_.cubin +0 -0
- .triton/dump/345a87a492fd703c73ab83265a21fcb6/triton_.llir +980 -0
- .triton/dump/345a87a492fd703c73ab83265a21fcb6/triton_.ptx +1654 -0
- .triton/dump/345a87a492fd703c73ab83265a21fcb6/triton_.ttgir +125 -0
- .triton/dump/345a87a492fd703c73ab83265a21fcb6/triton_.ttir +104 -0
- .triton/dump/3cd3b6d7993c56f7d0340d40c84f737c/triton_.llir +355 -0
- .triton/dump/4710f23a3addbad00b260d7a02366fe0/triton_.ttgir +39 -0
- .triton/dump/53075505618c3af0ef6ce61f3300cdcb/triton_.cubin +0 -0
- .triton/dump/53075505618c3af0ef6ce61f3300cdcb/triton_.ptx +2004 -0
- .triton/dump/cde66a00b7594f6428e4f4dcdfa88537/triton_.ttgir +154 -0
- wandb/run-20240926_055222-14kj2390/run-14kj2390.wandb +3 -0
- wandb/run-20240926_124123-zc6s8e8w/run-zc6s8e8w.wandb +3 -0
- wandb/run-20240926_192831-378lr5yg/run-378lr5yg.wandb +3 -0
- wandb/run-20240927_021423-clesd0p8/run-clesd0p8.wandb +3 -0
.gitattributes
CHANGED
@@ -49,3 +49,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
49 |
.local/share/jupyter/nbextensions/nbTranslate/demo2.gif filter=lfs diff=lfs merge=lfs -text
|
50 |
.local/share/jupyter/nbextensions/scratchpad/demo.gif filter=lfs diff=lfs merge=lfs -text
|
51 |
.local/share/jupyter/nbextensions/toc2/demo.gif filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
49 |
.local/share/jupyter/nbextensions/nbTranslate/demo2.gif filter=lfs diff=lfs merge=lfs -text
|
50 |
.local/share/jupyter/nbextensions/scratchpad/demo.gif filter=lfs diff=lfs merge=lfs -text
|
51 |
.local/share/jupyter/nbextensions/toc2/demo.gif filter=lfs diff=lfs merge=lfs -text
|
52 |
+
wandb/run-20240926_055222-14kj2390/run-14kj2390.wandb filter=lfs diff=lfs merge=lfs -text
|
53 |
+
wandb/run-20240926_124123-zc6s8e8w/run-zc6s8e8w.wandb filter=lfs diff=lfs merge=lfs -text
|
54 |
+
wandb/run-20240926_192831-378lr5yg/run-378lr5yg.wandb filter=lfs diff=lfs merge=lfs -text
|
55 |
+
wandb/run-20240927_021423-clesd0p8/run-clesd0p8.wandb filter=lfs diff=lfs merge=lfs -text
|
.triton/dump/0db70b0f0846c3c6c38c4ccb3ef979e3/triton_.llir
ADDED
@@ -0,0 +1,1121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
@assertFunc_1 = internal constant [25 x i8] c"_call_with_frames_removed"
|
5 |
+
@assertFile_1 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
|
6 |
+
@assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp16 < 50257"
|
7 |
+
@assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed"
|
8 |
+
@assertFile_0 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
|
9 |
+
@assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257"
|
10 |
+
@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8]
|
11 |
+
@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
|
12 |
+
|
13 |
+
declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr
|
14 |
+
|
15 |
+
define void @triton__0d1d2d3d4d5d6de7de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, i32 %6, i32 %7) local_unnamed_addr !dbg !7 {
|
16 |
+
%9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
|
17 |
+
%10 = lshr i32 %9, 5, !dbg !10
|
18 |
+
%11 = and i32 %10, 7, !dbg !10
|
19 |
+
%12 = and i32 %9, 15, !dbg !10
|
20 |
+
%13 = shl i32 %9, 3, !dbg !11
|
21 |
+
%14 = and i32 %13, 248, !dbg !11
|
22 |
+
%15 = or i32 %14, 4, !dbg !11
|
23 |
+
%urem = and i32 %9, 255, !dbg !11
|
24 |
+
%16 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !12
|
25 |
+
%17 = shl i32 %16, 4, !dbg !13
|
26 |
+
%18 = or i32 %17, %11, !dbg !14
|
27 |
+
%19 = or i32 %18, 8, !dbg !14
|
28 |
+
%20 = or i32 %17, %12, !dbg !14
|
29 |
+
%21 = sext i32 %18 to i64, !dbg !15
|
30 |
+
%22 = getelementptr i64, ptr addrspace(1) %0, i64 %21, !dbg !15
|
31 |
+
%23 = sext i32 %19 to i64, !dbg !15
|
32 |
+
%24 = getelementptr i64, ptr addrspace(1) %0, i64 %23, !dbg !15
|
33 |
+
%25 = sext i32 %20 to i64, !dbg !15
|
34 |
+
%26 = getelementptr i64, ptr addrspace(1) %0, i64 %25, !dbg !15
|
35 |
+
%27 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !16
|
36 |
+
%28 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !16
|
37 |
+
%29 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !16
|
38 |
+
%30 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !16
|
39 |
+
%31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !16
|
40 |
+
%32 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !16
|
41 |
+
%33 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !16
|
42 |
+
%34 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !16
|
43 |
+
%35 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16
|
44 |
+
%36 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16
|
45 |
+
%37 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16
|
46 |
+
%38 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16
|
47 |
+
%39 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16
|
48 |
+
%40 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16
|
49 |
+
%41 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16
|
50 |
+
%42 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16
|
51 |
+
%43 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %26, i1 true) #6, !dbg !16
|
52 |
+
%44 = srem i32 %18, 512, !dbg !17
|
53 |
+
%45 = srem i32 %19, 512, !dbg !17
|
54 |
+
%46 = shl nsw i32 %44, 8, !dbg !18
|
55 |
+
%47 = shl nsw i32 %45, 8, !dbg !18
|
56 |
+
%48 = or i32 %46, %14, !dbg !19
|
57 |
+
%49 = or i32 %46, %15, !dbg !19
|
58 |
+
%50 = or i32 %47, %14, !dbg !19
|
59 |
+
%51 = or i32 %47, %15, !dbg !19
|
60 |
+
%52 = sext i32 %48 to i64, !dbg !20
|
61 |
+
%53 = getelementptr float, ptr addrspace(1) %2, i64 %52, !dbg !20
|
62 |
+
%54 = sext i32 %49 to i64, !dbg !20
|
63 |
+
%55 = getelementptr float, ptr addrspace(1) %2, i64 %54, !dbg !20
|
64 |
+
%56 = sext i32 %50 to i64, !dbg !20
|
65 |
+
%57 = getelementptr float, ptr addrspace(1) %2, i64 %56, !dbg !20
|
66 |
+
%58 = sext i32 %51 to i64, !dbg !20
|
67 |
+
%59 = getelementptr float, ptr addrspace(1) %2, i64 %58, !dbg !20
|
68 |
+
%60 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %53, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !21
|
69 |
+
%61 = extractvalue { i32, i32, i32, i32 } %60, 0, !dbg !21
|
70 |
+
%62 = extractvalue { i32, i32, i32, i32 } %60, 1, !dbg !21
|
71 |
+
%63 = extractvalue { i32, i32, i32, i32 } %60, 2, !dbg !21
|
72 |
+
%64 = extractvalue { i32, i32, i32, i32 } %60, 3, !dbg !21
|
73 |
+
%65 = bitcast i32 %61 to float, !dbg !21
|
74 |
+
%66 = bitcast i32 %62 to float, !dbg !21
|
75 |
+
%67 = bitcast i32 %63 to float, !dbg !21
|
76 |
+
%68 = bitcast i32 %64 to float, !dbg !21
|
77 |
+
%69 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %55, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !21
|
78 |
+
%70 = extractvalue { i32, i32, i32, i32 } %69, 0, !dbg !21
|
79 |
+
%71 = extractvalue { i32, i32, i32, i32 } %69, 1, !dbg !21
|
80 |
+
%72 = extractvalue { i32, i32, i32, i32 } %69, 2, !dbg !21
|
81 |
+
%73 = extractvalue { i32, i32, i32, i32 } %69, 3, !dbg !21
|
82 |
+
%74 = bitcast i32 %70 to float, !dbg !21
|
83 |
+
%75 = bitcast i32 %71 to float, !dbg !21
|
84 |
+
%76 = bitcast i32 %72 to float, !dbg !21
|
85 |
+
%77 = bitcast i32 %73 to float, !dbg !21
|
86 |
+
%78 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %57, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !21
|
87 |
+
%79 = extractvalue { i32, i32, i32, i32 } %78, 0, !dbg !21
|
88 |
+
%80 = extractvalue { i32, i32, i32, i32 } %78, 1, !dbg !21
|
89 |
+
%81 = extractvalue { i32, i32, i32, i32 } %78, 2, !dbg !21
|
90 |
+
%82 = extractvalue { i32, i32, i32, i32 } %78, 3, !dbg !21
|
91 |
+
%83 = bitcast i32 %79 to float, !dbg !21
|
92 |
+
%84 = bitcast i32 %80 to float, !dbg !21
|
93 |
+
%85 = bitcast i32 %81 to float, !dbg !21
|
94 |
+
%86 = bitcast i32 %82 to float, !dbg !21
|
95 |
+
%87 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %59, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !21
|
96 |
+
%88 = extractvalue { i32, i32, i32, i32 } %87, 0, !dbg !21
|
97 |
+
%89 = extractvalue { i32, i32, i32, i32 } %87, 1, !dbg !21
|
98 |
+
%90 = extractvalue { i32, i32, i32, i32 } %87, 2, !dbg !21
|
99 |
+
%91 = extractvalue { i32, i32, i32, i32 } %87, 3, !dbg !21
|
100 |
+
%92 = bitcast i32 %88 to float, !dbg !21
|
101 |
+
%93 = bitcast i32 %89 to float, !dbg !21
|
102 |
+
%94 = bitcast i32 %90 to float, !dbg !21
|
103 |
+
%95 = bitcast i32 %91 to float, !dbg !21
|
104 |
+
%96 = shl i32 %18, 8, !dbg !22
|
105 |
+
%97 = shl i32 %19, 8, !dbg !22
|
106 |
+
%98 = or i32 %96, %14, !dbg !23
|
107 |
+
%99 = or i32 %97, %14, !dbg !23
|
108 |
+
%100 = sext i32 %98 to i64, !dbg !24
|
109 |
+
%101 = getelementptr i16, ptr addrspace(1) %3, i64 %100, !dbg !24
|
110 |
+
%102 = sext i32 %99 to i64, !dbg !24
|
111 |
+
%103 = getelementptr i16, ptr addrspace(1) %3, i64 %102, !dbg !24
|
112 |
+
%104 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %101, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !25
|
113 |
+
%105 = extractvalue { i32, i32, i32, i32 } %104, 0, !dbg !25
|
114 |
+
%106 = extractvalue { i32, i32, i32, i32 } %104, 1, !dbg !25
|
115 |
+
%107 = extractvalue { i32, i32, i32, i32 } %104, 2, !dbg !25
|
116 |
+
%108 = extractvalue { i32, i32, i32, i32 } %104, 3, !dbg !25
|
117 |
+
%109 = trunc i32 %105 to i16, !dbg !25
|
118 |
+
%extelt.offset = lshr i32 %105, 16, !dbg !25
|
119 |
+
%110 = trunc i32 %extelt.offset to i16, !dbg !25
|
120 |
+
%111 = trunc i32 %106 to i16, !dbg !25
|
121 |
+
%extelt.offset1 = lshr i32 %106, 16, !dbg !25
|
122 |
+
%112 = trunc i32 %extelt.offset1 to i16, !dbg !25
|
123 |
+
%113 = trunc i32 %107 to i16, !dbg !25
|
124 |
+
%extelt.offset2 = lshr i32 %107, 16, !dbg !25
|
125 |
+
%114 = trunc i32 %extelt.offset2 to i16, !dbg !25
|
126 |
+
%115 = trunc i32 %108 to i16, !dbg !25
|
127 |
+
%extelt.offset3 = lshr i32 %108, 16, !dbg !25
|
128 |
+
%116 = trunc i32 %extelt.offset3 to i16, !dbg !25
|
129 |
+
%117 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %103, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !25
|
130 |
+
%118 = extractvalue { i32, i32, i32, i32 } %117, 0, !dbg !25
|
131 |
+
%119 = extractvalue { i32, i32, i32, i32 } %117, 1, !dbg !25
|
132 |
+
%120 = extractvalue { i32, i32, i32, i32 } %117, 2, !dbg !25
|
133 |
+
%121 = extractvalue { i32, i32, i32, i32 } %117, 3, !dbg !25
|
134 |
+
%122 = trunc i32 %118 to i16, !dbg !25
|
135 |
+
%extelt.offset4 = lshr i32 %118, 16, !dbg !25
|
136 |
+
%123 = trunc i32 %extelt.offset4 to i16, !dbg !25
|
137 |
+
%124 = trunc i32 %119 to i16, !dbg !25
|
138 |
+
%extelt.offset5 = lshr i32 %119, 16, !dbg !25
|
139 |
+
%125 = trunc i32 %extelt.offset5 to i16, !dbg !25
|
140 |
+
%126 = trunc i32 %120 to i16, !dbg !25
|
141 |
+
%extelt.offset6 = lshr i32 %120, 16, !dbg !25
|
142 |
+
%127 = trunc i32 %extelt.offset6 to i16, !dbg !25
|
143 |
+
%128 = trunc i32 %121 to i16, !dbg !25
|
144 |
+
%extelt.offset7 = lshr i32 %121, 16, !dbg !25
|
145 |
+
%129 = trunc i32 %extelt.offset7 to i16, !dbg !25
|
146 |
+
%130 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %109) #6, !dbg !26
|
147 |
+
%131 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %110) #6, !dbg !26
|
148 |
+
%132 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %111) #6, !dbg !26
|
149 |
+
%133 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %112) #6, !dbg !26
|
150 |
+
%134 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %113) #6, !dbg !26
|
151 |
+
%135 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %114) #6, !dbg !26
|
152 |
+
%136 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %115) #6, !dbg !26
|
153 |
+
%137 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %116) #6, !dbg !26
|
154 |
+
%138 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %122) #6, !dbg !26
|
155 |
+
%139 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %123) #6, !dbg !26
|
156 |
+
%140 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %124) #6, !dbg !26
|
157 |
+
%141 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %125) #6, !dbg !26
|
158 |
+
%142 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %126) #6, !dbg !26
|
159 |
+
%143 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %127) #6, !dbg !26
|
160 |
+
%144 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %128) #6, !dbg !26
|
161 |
+
%145 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %129) #6, !dbg !26
|
162 |
+
%146 = add i64 %43, 50257, !dbg !27
|
163 |
+
%147 = icmp slt i64 %27, 0, !dbg !28
|
164 |
+
%148 = icmp slt i64 %35, 0, !dbg !28
|
165 |
+
%149 = icmp slt i64 %43, 0, !dbg !28
|
166 |
+
%150 = select i1 %149, i64 %146, i64 %43, !dbg !29
|
167 |
+
%151 = icmp ugt i64 %150, 50256, !dbg !30
|
168 |
+
br i1 %151, label %152, label %153, !dbg !31
|
169 |
+
|
170 |
+
152: ; preds = %8
|
171 |
+
tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !31
|
172 |
+
br label %153, !dbg !31
|
173 |
+
|
174 |
+
153: ; preds = %152, %8
|
175 |
+
%154 = shl i64 %27, 8, !dbg !32
|
176 |
+
%155 = add i64 %154, 12865792, !dbg !32
|
177 |
+
%156 = select i1 %147, i64 %155, i64 %154, !dbg !32
|
178 |
+
%157 = shl i64 %35, 8, !dbg !32
|
179 |
+
%158 = add i64 %157, 12865792, !dbg !32
|
180 |
+
%159 = select i1 %148, i64 %158, i64 %157, !dbg !32
|
181 |
+
%160 = zext nneg i32 %14 to i64
|
182 |
+
%161 = zext nneg i32 %15 to i64
|
183 |
+
%162 = or i64 %156, %160, !dbg !33
|
184 |
+
%163 = or i64 %156, %161, !dbg !33
|
185 |
+
%164 = or i64 %159, %160, !dbg !33
|
186 |
+
%165 = or i64 %159, %161, !dbg !33
|
187 |
+
%166 = getelementptr float, ptr addrspace(1) %1, i64 %162, !dbg !34
|
188 |
+
%167 = getelementptr float, ptr addrspace(1) %1, i64 %163, !dbg !34
|
189 |
+
%168 = getelementptr float, ptr addrspace(1) %1, i64 %164, !dbg !34
|
190 |
+
%169 = getelementptr float, ptr addrspace(1) %1, i64 %165, !dbg !34
|
191 |
+
%170 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %166, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !35
|
192 |
+
%171 = extractvalue { i32, i32, i32, i32 } %170, 0, !dbg !35
|
193 |
+
%172 = extractvalue { i32, i32, i32, i32 } %170, 1, !dbg !35
|
194 |
+
%173 = extractvalue { i32, i32, i32, i32 } %170, 2, !dbg !35
|
195 |
+
%174 = extractvalue { i32, i32, i32, i32 } %170, 3, !dbg !35
|
196 |
+
%175 = bitcast i32 %171 to float, !dbg !35
|
197 |
+
%176 = bitcast i32 %172 to float, !dbg !35
|
198 |
+
%177 = bitcast i32 %173 to float, !dbg !35
|
199 |
+
%178 = bitcast i32 %174 to float, !dbg !35
|
200 |
+
%179 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %167, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !35
|
201 |
+
%180 = extractvalue { i32, i32, i32, i32 } %179, 0, !dbg !35
|
202 |
+
%181 = extractvalue { i32, i32, i32, i32 } %179, 1, !dbg !35
|
203 |
+
%182 = extractvalue { i32, i32, i32, i32 } %179, 2, !dbg !35
|
204 |
+
%183 = extractvalue { i32, i32, i32, i32 } %179, 3, !dbg !35
|
205 |
+
%184 = bitcast i32 %180 to float, !dbg !35
|
206 |
+
%185 = bitcast i32 %181 to float, !dbg !35
|
207 |
+
%186 = bitcast i32 %182 to float, !dbg !35
|
208 |
+
%187 = bitcast i32 %183 to float, !dbg !35
|
209 |
+
%188 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %168, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !35
|
210 |
+
%189 = extractvalue { i32, i32, i32, i32 } %188, 0, !dbg !35
|
211 |
+
%190 = extractvalue { i32, i32, i32, i32 } %188, 1, !dbg !35
|
212 |
+
%191 = extractvalue { i32, i32, i32, i32 } %188, 2, !dbg !35
|
213 |
+
%192 = extractvalue { i32, i32, i32, i32 } %188, 3, !dbg !35
|
214 |
+
%193 = bitcast i32 %189 to float, !dbg !35
|
215 |
+
%194 = bitcast i32 %190 to float, !dbg !35
|
216 |
+
%195 = bitcast i32 %191 to float, !dbg !35
|
217 |
+
%196 = bitcast i32 %192 to float, !dbg !35
|
218 |
+
%197 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %169, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !35
|
219 |
+
%198 = extractvalue { i32, i32, i32, i32 } %197, 0, !dbg !35
|
220 |
+
%199 = extractvalue { i32, i32, i32, i32 } %197, 1, !dbg !35
|
221 |
+
%200 = extractvalue { i32, i32, i32, i32 } %197, 2, !dbg !35
|
222 |
+
%201 = extractvalue { i32, i32, i32, i32 } %197, 3, !dbg !35
|
223 |
+
%202 = bitcast i32 %198 to float, !dbg !35
|
224 |
+
%203 = bitcast i32 %199 to float, !dbg !35
|
225 |
+
%204 = bitcast i32 %200 to float, !dbg !35
|
226 |
+
%205 = bitcast i32 %201 to float, !dbg !35
|
227 |
+
%206 = fadd float %65, %175, !dbg !36
|
228 |
+
%207 = fadd float %66, %176, !dbg !36
|
229 |
+
%208 = fadd float %67, %177, !dbg !36
|
230 |
+
%209 = fadd float %68, %178, !dbg !36
|
231 |
+
%210 = fadd float %74, %184, !dbg !36
|
232 |
+
%211 = fadd float %75, %185, !dbg !36
|
233 |
+
%212 = fadd float %76, %186, !dbg !36
|
234 |
+
%213 = fadd float %77, %187, !dbg !36
|
235 |
+
%214 = fadd float %83, %193, !dbg !36
|
236 |
+
%215 = fadd float %84, %194, !dbg !36
|
237 |
+
%216 = fadd float %85, %195, !dbg !36
|
238 |
+
%217 = fadd float %86, %196, !dbg !36
|
239 |
+
%218 = fadd float %92, %202, !dbg !36
|
240 |
+
%219 = fadd float %93, %203, !dbg !36
|
241 |
+
%220 = fadd float %94, %204, !dbg !36
|
242 |
+
%221 = fadd float %95, %205, !dbg !36
|
243 |
+
%222 = fadd float %130, %206, !dbg !37
|
244 |
+
%223 = fadd float %131, %207, !dbg !37
|
245 |
+
%224 = fadd float %132, %208, !dbg !37
|
246 |
+
%225 = fadd float %133, %209, !dbg !37
|
247 |
+
%226 = fadd float %134, %210, !dbg !37
|
248 |
+
%227 = fadd float %135, %211, !dbg !37
|
249 |
+
%228 = fadd float %136, %212, !dbg !37
|
250 |
+
%229 = fadd float %137, %213, !dbg !37
|
251 |
+
%230 = fadd float %138, %214, !dbg !37
|
252 |
+
%231 = fadd float %139, %215, !dbg !37
|
253 |
+
%232 = fadd float %140, %216, !dbg !37
|
254 |
+
%233 = fadd float %141, %217, !dbg !37
|
255 |
+
%234 = fadd float %142, %218, !dbg !37
|
256 |
+
%235 = fadd float %143, %219, !dbg !37
|
257 |
+
%236 = fadd float %144, %220, !dbg !37
|
258 |
+
%237 = fadd float %145, %221, !dbg !37
|
259 |
+
%238 = fadd float %222, 0.000000e+00, !dbg !38
|
260 |
+
%239 = fadd float %223, 0.000000e+00, !dbg !38
|
261 |
+
%240 = fadd float %224, 0.000000e+00, !dbg !38
|
262 |
+
%241 = fadd float %225, 0.000000e+00, !dbg !38
|
263 |
+
%242 = fadd float %226, 0.000000e+00, !dbg !38
|
264 |
+
%243 = fadd float %227, 0.000000e+00, !dbg !38
|
265 |
+
%244 = fadd float %228, 0.000000e+00, !dbg !38
|
266 |
+
%245 = fadd float %229, 0.000000e+00, !dbg !38
|
267 |
+
%246 = fadd float %230, 0.000000e+00, !dbg !38
|
268 |
+
%247 = fadd float %231, 0.000000e+00, !dbg !38
|
269 |
+
%248 = fadd float %232, 0.000000e+00, !dbg !38
|
270 |
+
%249 = fadd float %233, 0.000000e+00, !dbg !38
|
271 |
+
%250 = fadd float %234, 0.000000e+00, !dbg !38
|
272 |
+
%251 = fadd float %235, 0.000000e+00, !dbg !38
|
273 |
+
%252 = fadd float %236, 0.000000e+00, !dbg !38
|
274 |
+
%253 = fadd float %237, 0.000000e+00, !dbg !38
|
275 |
+
%254 = fsub float %222, %238, !dbg !42
|
276 |
+
%255 = fsub float %223, %239, !dbg !42
|
277 |
+
%256 = fsub float %224, %240, !dbg !42
|
278 |
+
%257 = fsub float %225, %241, !dbg !42
|
279 |
+
%258 = fsub float %226, %242, !dbg !42
|
280 |
+
%259 = fsub float %227, %243, !dbg !42
|
281 |
+
%260 = fsub float %228, %244, !dbg !42
|
282 |
+
%261 = fsub float %229, %245, !dbg !42
|
283 |
+
%262 = fsub float %230, %246, !dbg !42
|
284 |
+
%263 = fsub float %231, %247, !dbg !42
|
285 |
+
%264 = fsub float %232, %248, !dbg !42
|
286 |
+
%265 = fsub float %233, %249, !dbg !42
|
287 |
+
%266 = fsub float %234, %250, !dbg !42
|
288 |
+
%267 = fsub float %235, %251, !dbg !42
|
289 |
+
%268 = fsub float %236, %252, !dbg !42
|
290 |
+
%269 = fsub float %237, %253, !dbg !42
|
291 |
+
%270 = fmul float %222, %254, !dbg !43
|
292 |
+
%271 = fmul float %223, %255, !dbg !43
|
293 |
+
%272 = fmul float %224, %256, !dbg !43
|
294 |
+
%273 = fmul float %225, %257, !dbg !43
|
295 |
+
%274 = fmul float %226, %258, !dbg !43
|
296 |
+
%275 = fmul float %227, %259, !dbg !43
|
297 |
+
%276 = fmul float %228, %260, !dbg !43
|
298 |
+
%277 = fmul float %229, %261, !dbg !43
|
299 |
+
%278 = fmul float %230, %262, !dbg !43
|
300 |
+
%279 = fmul float %231, %263, !dbg !43
|
301 |
+
%280 = fmul float %232, %264, !dbg !43
|
302 |
+
%281 = fmul float %233, %265, !dbg !43
|
303 |
+
%282 = fmul float %234, %266, !dbg !43
|
304 |
+
%283 = fmul float %235, %267, !dbg !43
|
305 |
+
%284 = fmul float %236, %268, !dbg !43
|
306 |
+
%285 = fmul float %237, %269, !dbg !43
|
307 |
+
%286 = fadd float %270, 0.000000e+00, !dbg !44
|
308 |
+
%287 = fadd float %271, 0.000000e+00, !dbg !44
|
309 |
+
%288 = fadd float %272, 0.000000e+00, !dbg !44
|
310 |
+
%289 = fadd float %273, 0.000000e+00, !dbg !44
|
311 |
+
%290 = fadd float %274, 0.000000e+00, !dbg !44
|
312 |
+
%291 = fadd float %275, 0.000000e+00, !dbg !44
|
313 |
+
%292 = fadd float %276, 0.000000e+00, !dbg !44
|
314 |
+
%293 = fadd float %277, 0.000000e+00, !dbg !44
|
315 |
+
%294 = fadd float %278, 0.000000e+00, !dbg !44
|
316 |
+
%295 = fadd float %279, 0.000000e+00, !dbg !44
|
317 |
+
%296 = fadd float %280, 0.000000e+00, !dbg !44
|
318 |
+
%297 = fadd float %281, 0.000000e+00, !dbg !44
|
319 |
+
%298 = fadd float %282, 0.000000e+00, !dbg !44
|
320 |
+
%299 = fadd float %283, 0.000000e+00, !dbg !44
|
321 |
+
%300 = fadd float %284, 0.000000e+00, !dbg !44
|
322 |
+
%301 = fadd float %285, 0.000000e+00, !dbg !44
|
323 |
+
%302 = fsub float %239, %238, !dbg !45
|
324 |
+
%303 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 2.000000e+00) #6, !dbg !49
|
325 |
+
%304 = fmul float %303, %302, !dbg !50
|
326 |
+
%305 = fadd float %238, %304, !dbg !51
|
327 |
+
%306 = fadd float %286, %287, !dbg !52
|
328 |
+
%307 = fmul float %302, %302, !dbg !53
|
329 |
+
%308 = fmul float %303, %307, !dbg !54
|
330 |
+
%309 = fadd float %308, %306, !dbg !55
|
331 |
+
%310 = fsub float %240, %305, !dbg !45
|
332 |
+
%311 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 3.000000e+00) #6, !dbg !49
|
333 |
+
%312 = fmul float %311, %310, !dbg !50
|
334 |
+
%313 = fadd float %305, %312, !dbg !51
|
335 |
+
%314 = fadd float %288, %309, !dbg !52
|
336 |
+
%315 = fmul float %310, %310, !dbg !53
|
337 |
+
%316 = fmul float %315, 2.000000e+00, !dbg !56
|
338 |
+
%317 = fmul float %311, %316, !dbg !54
|
339 |
+
%318 = fadd float %314, %317, !dbg !55
|
340 |
+
%319 = fsub float %241, %313, !dbg !45
|
341 |
+
%320 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 4.000000e+00) #6, !dbg !49
|
342 |
+
%321 = fmul float %320, %319, !dbg !50
|
343 |
+
%322 = fadd float %313, %321, !dbg !51
|
344 |
+
%323 = fadd float %289, %318, !dbg !52
|
345 |
+
%324 = fmul float %319, %319, !dbg !53
|
346 |
+
%325 = fmul float %324, 3.000000e+00, !dbg !56
|
347 |
+
%326 = fmul float %320, %325, !dbg !54
|
348 |
+
%327 = fadd float %323, %326, !dbg !55
|
349 |
+
%328 = fsub float %242, %322, !dbg !45
|
350 |
+
%329 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 5.000000e+00) #6, !dbg !49
|
351 |
+
%330 = fmul float %329, %328, !dbg !50
|
352 |
+
%331 = fadd float %322, %330, !dbg !51
|
353 |
+
%332 = fadd float %290, %327, !dbg !52
|
354 |
+
%333 = fmul float %328, %328, !dbg !53
|
355 |
+
%334 = fmul float %333, 4.000000e+00, !dbg !56
|
356 |
+
%335 = fmul float %329, %334, !dbg !54
|
357 |
+
%336 = fadd float %332, %335, !dbg !55
|
358 |
+
%337 = fsub float %243, %331, !dbg !45
|
359 |
+
%338 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 6.000000e+00) #6, !dbg !49
|
360 |
+
%339 = fmul float %338, %337, !dbg !50
|
361 |
+
%340 = fadd float %331, %339, !dbg !51
|
362 |
+
%341 = fadd float %291, %336, !dbg !52
|
363 |
+
%342 = fmul float %337, %337, !dbg !53
|
364 |
+
%343 = fmul float %342, 5.000000e+00, !dbg !56
|
365 |
+
%344 = fmul float %338, %343, !dbg !54
|
366 |
+
%345 = fadd float %341, %344, !dbg !55
|
367 |
+
%346 = fsub float %244, %340, !dbg !45
|
368 |
+
%347 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 7.000000e+00) #6, !dbg !49
|
369 |
+
%348 = fmul float %347, %346, !dbg !50
|
370 |
+
%349 = fadd float %340, %348, !dbg !51
|
371 |
+
%350 = fadd float %292, %345, !dbg !52
|
372 |
+
%351 = fmul float %346, %346, !dbg !53
|
373 |
+
%352 = fmul float %351, 6.000000e+00, !dbg !56
|
374 |
+
%353 = fmul float %347, %352, !dbg !54
|
375 |
+
%354 = fadd float %350, %353, !dbg !55
|
376 |
+
%355 = fsub float %245, %349, !dbg !45
|
377 |
+
%356 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 8.000000e+00) #6, !dbg !49
|
378 |
+
%357 = fmul float %356, %355, !dbg !50
|
379 |
+
%358 = fadd float %349, %357, !dbg !51
|
380 |
+
%359 = fadd float %293, %354, !dbg !52
|
381 |
+
%360 = fmul float %355, %355, !dbg !53
|
382 |
+
%361 = fmul float %360, 7.000000e+00, !dbg !56
|
383 |
+
%362 = fmul float %356, %361, !dbg !54
|
384 |
+
%363 = fadd float %359, %362, !dbg !55
|
385 |
+
%364 = fsub float %247, %246, !dbg !45
|
386 |
+
%365 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 2.000000e+00) #6, !dbg !49
|
387 |
+
%366 = fmul float %364, %365, !dbg !50
|
388 |
+
%367 = fadd float %246, %366, !dbg !51
|
389 |
+
%368 = fadd float %294, %295, !dbg !52
|
390 |
+
%369 = fmul float %364, %364, !dbg !53
|
391 |
+
%370 = fmul float %369, %365, !dbg !54
|
392 |
+
%371 = fadd float %368, %370, !dbg !55
|
393 |
+
%372 = fsub float %248, %367, !dbg !45
|
394 |
+
%373 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 3.000000e+00) #6, !dbg !49
|
395 |
+
%374 = fmul float %373, %372, !dbg !50
|
396 |
+
%375 = fadd float %367, %374, !dbg !51
|
397 |
+
%376 = fadd float %296, %371, !dbg !52
|
398 |
+
%377 = fmul float %372, %372, !dbg !53
|
399 |
+
%378 = fmul float %377, 2.000000e+00, !dbg !56
|
400 |
+
%379 = fmul float %373, %378, !dbg !54
|
401 |
+
%380 = fadd float %376, %379, !dbg !55
|
402 |
+
%381 = fsub float %249, %375, !dbg !45
|
403 |
+
%382 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 4.000000e+00) #6, !dbg !49
|
404 |
+
%383 = fmul float %382, %381, !dbg !50
|
405 |
+
%384 = fadd float %375, %383, !dbg !51
|
406 |
+
%385 = fadd float %297, %380, !dbg !52
|
407 |
+
%386 = fmul float %381, %381, !dbg !53
|
408 |
+
%387 = fmul float %386, 3.000000e+00, !dbg !56
|
409 |
+
%388 = fmul float %382, %387, !dbg !54
|
410 |
+
%389 = fadd float %385, %388, !dbg !55
|
411 |
+
%390 = fsub float %250, %384, !dbg !45
|
412 |
+
%391 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 5.000000e+00) #6, !dbg !49
|
413 |
+
%392 = fmul float %391, %390, !dbg !50
|
414 |
+
%393 = fadd float %384, %392, !dbg !51
|
415 |
+
%394 = fadd float %298, %389, !dbg !52
|
416 |
+
%395 = fmul float %390, %390, !dbg !53
|
417 |
+
%396 = fmul float %395, 4.000000e+00, !dbg !56
|
418 |
+
%397 = fmul float %391, %396, !dbg !54
|
419 |
+
%398 = fadd float %394, %397, !dbg !55
|
420 |
+
%399 = fsub float %251, %393, !dbg !45
|
421 |
+
%400 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 6.000000e+00) #6, !dbg !49
|
422 |
+
%401 = fmul float %400, %399, !dbg !50
|
423 |
+
%402 = fadd float %393, %401, !dbg !51
|
424 |
+
%403 = fadd float %299, %398, !dbg !52
|
425 |
+
%404 = fmul float %399, %399, !dbg !53
|
426 |
+
%405 = fmul float %404, 5.000000e+00, !dbg !56
|
427 |
+
%406 = fmul float %400, %405, !dbg !54
|
428 |
+
%407 = fadd float %403, %406, !dbg !55
|
429 |
+
%408 = fsub float %252, %402, !dbg !45
|
430 |
+
%409 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 7.000000e+00) #6, !dbg !49
|
431 |
+
%410 = fmul float %409, %408, !dbg !50
|
432 |
+
%411 = fadd float %402, %410, !dbg !51
|
433 |
+
%412 = fadd float %300, %407, !dbg !52
|
434 |
+
%413 = fmul float %408, %408, !dbg !53
|
435 |
+
%414 = fmul float %413, 6.000000e+00, !dbg !56
|
436 |
+
%415 = fmul float %409, %414, !dbg !54
|
437 |
+
%416 = fadd float %412, %415, !dbg !55
|
438 |
+
%417 = fsub float %253, %411, !dbg !45
|
439 |
+
%418 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 8.000000e+00) #6, !dbg !49
|
440 |
+
%419 = fmul float %418, %417, !dbg !50
|
441 |
+
%420 = fadd float %411, %419, !dbg !51
|
442 |
+
%421 = fadd float %301, %416, !dbg !52
|
443 |
+
%422 = fmul float %417, %417, !dbg !53
|
444 |
+
%423 = fmul float %422, 7.000000e+00, !dbg !56
|
445 |
+
%424 = fmul float %418, %423, !dbg !54
|
446 |
+
%425 = fadd float %421, %424, !dbg !55
|
447 |
+
%426 = bitcast float %358 to i32, !dbg !57
|
448 |
+
%427 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %426, i32 16, i32 31), !dbg !57
|
449 |
+
%428 = bitcast i32 %427 to float, !dbg !57
|
450 |
+
%429 = bitcast float %363 to i32, !dbg !57
|
451 |
+
%430 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %429, i32 16, i32 31), !dbg !57
|
452 |
+
%431 = bitcast i32 %430 to float, !dbg !57
|
453 |
+
%432 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 1090519040, i32 16, i32 31), !dbg !57
|
454 |
+
%433 = bitcast i32 %432 to float, !dbg !57
|
455 |
+
%434 = fsub float %428, %358, !dbg !45
|
456 |
+
%435 = fadd float %433, 8.000000e+00, !dbg !59
|
457 |
+
%436 = fcmp oeq float %435, 0.000000e+00, !dbg !60
|
458 |
+
%437 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %433, float %435) #6, !dbg !49
|
459 |
+
%438 = select i1 %436, float 0.000000e+00, float %437, !dbg !61
|
460 |
+
%439 = fmul float %438, %434, !dbg !50
|
461 |
+
%440 = fadd float %358, %439, !dbg !51
|
462 |
+
%441 = fadd float %363, %431, !dbg !52
|
463 |
+
%442 = fmul float %434, %434, !dbg !53
|
464 |
+
%443 = fmul float %442, 8.000000e+00, !dbg !56
|
465 |
+
%444 = fmul float %438, %443, !dbg !54
|
466 |
+
%445 = fadd float %441, %444, !dbg !55
|
467 |
+
%446 = bitcast float %440 to i32, !dbg !57
|
468 |
+
%447 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %446, i32 8, i32 31), !dbg !57
|
469 |
+
%448 = bitcast i32 %447 to float, !dbg !57
|
470 |
+
%449 = bitcast float %445 to i32, !dbg !57
|
471 |
+
%450 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %449, i32 8, i32 31), !dbg !57
|
472 |
+
%451 = bitcast i32 %450 to float, !dbg !57
|
473 |
+
%452 = bitcast float %435 to i32, !dbg !57
|
474 |
+
%453 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %452, i32 8, i32 31), !dbg !57
|
475 |
+
%454 = bitcast i32 %453 to float, !dbg !57
|
476 |
+
%455 = fsub float %448, %440, !dbg !45
|
477 |
+
%456 = fadd float %435, %454, !dbg !59
|
478 |
+
%457 = fcmp oeq float %456, 0.000000e+00, !dbg !60
|
479 |
+
%458 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %454, float %456) #6, !dbg !49
|
480 |
+
%459 = select i1 %457, float 0.000000e+00, float %458, !dbg !61
|
481 |
+
%460 = fmul float %459, %455, !dbg !50
|
482 |
+
%461 = fadd float %440, %460, !dbg !51
|
483 |
+
%462 = fadd float %445, %451, !dbg !52
|
484 |
+
%463 = fmul float %455, %455, !dbg !53
|
485 |
+
%464 = fmul float %435, %463, !dbg !56
|
486 |
+
%465 = fmul float %459, %464, !dbg !54
|
487 |
+
%466 = fadd float %462, %465, !dbg !55
|
488 |
+
%467 = bitcast float %461 to i32, !dbg !57
|
489 |
+
%468 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %467, i32 4, i32 31), !dbg !57
|
490 |
+
%469 = bitcast i32 %468 to float, !dbg !57
|
491 |
+
%470 = bitcast float %466 to i32, !dbg !57
|
492 |
+
%471 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %470, i32 4, i32 31), !dbg !57
|
493 |
+
%472 = bitcast i32 %471 to float, !dbg !57
|
494 |
+
%473 = bitcast float %456 to i32, !dbg !57
|
495 |
+
%474 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %473, i32 4, i32 31), !dbg !57
|
496 |
+
%475 = bitcast i32 %474 to float, !dbg !57
|
497 |
+
%476 = fsub float %469, %461, !dbg !45
|
498 |
+
%477 = fadd float %456, %475, !dbg !59
|
499 |
+
%478 = fcmp oeq float %477, 0.000000e+00, !dbg !60
|
500 |
+
%479 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %475, float %477) #6, !dbg !49
|
501 |
+
%480 = select i1 %478, float 0.000000e+00, float %479, !dbg !61
|
502 |
+
%481 = fmul float %480, %476, !dbg !50
|
503 |
+
%482 = fadd float %461, %481, !dbg !51
|
504 |
+
%483 = fadd float %466, %472, !dbg !52
|
505 |
+
%484 = fmul float %476, %476, !dbg !53
|
506 |
+
%485 = fmul float %456, %484, !dbg !56
|
507 |
+
%486 = fmul float %480, %485, !dbg !54
|
508 |
+
%487 = fadd float %483, %486, !dbg !55
|
509 |
+
%488 = bitcast float %482 to i32, !dbg !57
|
510 |
+
%489 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %488, i32 2, i32 31), !dbg !57
|
511 |
+
%490 = bitcast i32 %489 to float, !dbg !57
|
512 |
+
%491 = bitcast float %487 to i32, !dbg !57
|
513 |
+
%492 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %491, i32 2, i32 31), !dbg !57
|
514 |
+
%493 = bitcast i32 %492 to float, !dbg !57
|
515 |
+
%494 = bitcast float %477 to i32, !dbg !57
|
516 |
+
%495 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %494, i32 2, i32 31), !dbg !57
|
517 |
+
%496 = bitcast i32 %495 to float, !dbg !57
|
518 |
+
%497 = fsub float %490, %482, !dbg !45
|
519 |
+
%498 = fadd float %477, %496, !dbg !59
|
520 |
+
%499 = fcmp oeq float %498, 0.000000e+00, !dbg !60
|
521 |
+
%500 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %496, float %498) #6, !dbg !49
|
522 |
+
%501 = select i1 %499, float 0.000000e+00, float %500, !dbg !61
|
523 |
+
%502 = fmul float %497, %501, !dbg !50
|
524 |
+
%503 = fadd float %482, %502, !dbg !51
|
525 |
+
%504 = fadd float %487, %493, !dbg !52
|
526 |
+
%505 = fmul float %497, %497, !dbg !53
|
527 |
+
%506 = fmul float %477, %505, !dbg !56
|
528 |
+
%507 = fmul float %501, %506, !dbg !54
|
529 |
+
%508 = fadd float %504, %507, !dbg !55
|
530 |
+
%509 = bitcast float %503 to i32, !dbg !57
|
531 |
+
%510 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %509, i32 1, i32 31), !dbg !57
|
532 |
+
%511 = bitcast float %508 to i32, !dbg !57
|
533 |
+
%512 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %511, i32 1, i32 31), !dbg !57
|
534 |
+
%513 = bitcast float %498 to i32, !dbg !57
|
535 |
+
%514 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %513, i32 1, i32 31), !dbg !57
|
536 |
+
%515 = bitcast i32 %514 to float, !dbg !57
|
537 |
+
%516 = fadd float %498, %515, !dbg !59
|
538 |
+
%517 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %515, float %516) #6, !dbg !49
|
539 |
+
%518 = bitcast float %420 to i32, !dbg !57
|
540 |
+
%519 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %518, i32 16, i32 31), !dbg !57
|
541 |
+
%520 = bitcast i32 %519 to float, !dbg !57
|
542 |
+
%521 = bitcast float %425 to i32, !dbg !57
|
543 |
+
%522 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %521, i32 16, i32 31), !dbg !57
|
544 |
+
%523 = bitcast i32 %522 to float, !dbg !57
|
545 |
+
%524 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 1090519040, i32 16, i32 31), !dbg !57
|
546 |
+
%525 = bitcast i32 %524 to float, !dbg !57
|
547 |
+
%526 = fsub float %520, %420, !dbg !45
|
548 |
+
%527 = fadd float %525, 8.000000e+00, !dbg !59
|
549 |
+
%528 = fcmp oeq float %527, 0.000000e+00, !dbg !60
|
550 |
+
%529 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %525, float %527) #6, !dbg !49
|
551 |
+
%530 = select i1 %528, float 0.000000e+00, float %529, !dbg !61
|
552 |
+
%531 = fmul float %526, %530, !dbg !50
|
553 |
+
%532 = fadd float %420, %531, !dbg !51
|
554 |
+
%533 = fadd float %425, %523, !dbg !52
|
555 |
+
%534 = fmul float %526, %526, !dbg !53
|
556 |
+
%535 = fmul float %534, 8.000000e+00, !dbg !56
|
557 |
+
%536 = fmul float %535, %530, !dbg !54
|
558 |
+
%537 = fadd float %533, %536, !dbg !55
|
559 |
+
%538 = bitcast float %532 to i32, !dbg !57
|
560 |
+
%539 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %538, i32 8, i32 31), !dbg !57
|
561 |
+
%540 = bitcast i32 %539 to float, !dbg !57
|
562 |
+
%541 = bitcast float %537 to i32, !dbg !57
|
563 |
+
%542 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %541, i32 8, i32 31), !dbg !57
|
564 |
+
%543 = bitcast i32 %542 to float, !dbg !57
|
565 |
+
%544 = bitcast float %527 to i32, !dbg !57
|
566 |
+
%545 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %544, i32 8, i32 31), !dbg !57
|
567 |
+
%546 = bitcast i32 %545 to float, !dbg !57
|
568 |
+
%547 = fsub float %540, %532, !dbg !45
|
569 |
+
%548 = fadd float %527, %546, !dbg !59
|
570 |
+
%549 = fcmp oeq float %548, 0.000000e+00, !dbg !60
|
571 |
+
%550 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %546, float %548) #6, !dbg !49
|
572 |
+
%551 = select i1 %549, float 0.000000e+00, float %550, !dbg !61
|
573 |
+
%552 = fmul float %547, %551, !dbg !50
|
574 |
+
%553 = fadd float %532, %552, !dbg !51
|
575 |
+
%554 = fadd float %537, %543, !dbg !52
|
576 |
+
%555 = fmul float %547, %547, !dbg !53
|
577 |
+
%556 = fmul float %527, %555, !dbg !56
|
578 |
+
%557 = fmul float %551, %556, !dbg !54
|
579 |
+
%558 = fadd float %554, %557, !dbg !55
|
580 |
+
%559 = bitcast float %553 to i32, !dbg !57
|
581 |
+
%560 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %559, i32 4, i32 31), !dbg !57
|
582 |
+
%561 = bitcast i32 %560 to float, !dbg !57
|
583 |
+
%562 = bitcast float %558 to i32, !dbg !57
|
584 |
+
%563 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %562, i32 4, i32 31), !dbg !57
|
585 |
+
%564 = bitcast i32 %563 to float, !dbg !57
|
586 |
+
%565 = bitcast float %548 to i32, !dbg !57
|
587 |
+
%566 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %565, i32 4, i32 31), !dbg !57
|
588 |
+
%567 = bitcast i32 %566 to float, !dbg !57
|
589 |
+
%568 = fsub float %561, %553, !dbg !45
|
590 |
+
%569 = fadd float %548, %567, !dbg !59
|
591 |
+
%570 = fcmp oeq float %569, 0.000000e+00, !dbg !60
|
592 |
+
%571 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %567, float %569) #6, !dbg !49
|
593 |
+
%572 = select i1 %570, float 0.000000e+00, float %571, !dbg !61
|
594 |
+
%573 = fmul float %568, %572, !dbg !50
|
595 |
+
%574 = fadd float %553, %573, !dbg !51
|
596 |
+
%575 = fadd float %558, %564, !dbg !52
|
597 |
+
%576 = fmul float %568, %568, !dbg !53
|
598 |
+
%577 = fmul float %548, %576, !dbg !56
|
599 |
+
%578 = fmul float %572, %577, !dbg !54
|
600 |
+
%579 = fadd float %575, %578, !dbg !55
|
601 |
+
%580 = bitcast float %574 to i32, !dbg !57
|
602 |
+
%581 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %580, i32 2, i32 31), !dbg !57
|
603 |
+
%582 = bitcast i32 %581 to float, !dbg !57
|
604 |
+
%583 = bitcast float %579 to i32, !dbg !57
|
605 |
+
%584 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %583, i32 2, i32 31), !dbg !57
|
606 |
+
%585 = bitcast i32 %584 to float, !dbg !57
|
607 |
+
%586 = bitcast float %569 to i32, !dbg !57
|
608 |
+
%587 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %586, i32 2, i32 31), !dbg !57
|
609 |
+
%588 = bitcast i32 %587 to float, !dbg !57
|
610 |
+
%589 = fsub float %582, %574, !dbg !45
|
611 |
+
%590 = fadd float %569, %588, !dbg !59
|
612 |
+
%591 = fcmp oeq float %590, 0.000000e+00, !dbg !60
|
613 |
+
%592 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %588, float %590) #6, !dbg !49
|
614 |
+
%593 = select i1 %591, float 0.000000e+00, float %592, !dbg !61
|
615 |
+
%594 = fmul float %589, %593, !dbg !50
|
616 |
+
%595 = fadd float %574, %594, !dbg !51
|
617 |
+
%596 = fadd float %579, %585, !dbg !52
|
618 |
+
%597 = fmul float %589, %589, !dbg !53
|
619 |
+
%598 = fmul float %569, %597, !dbg !56
|
620 |
+
%599 = fmul float %593, %598, !dbg !54
|
621 |
+
%600 = fadd float %596, %599, !dbg !55
|
622 |
+
%601 = bitcast float %595 to i32, !dbg !57
|
623 |
+
%602 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %601, i32 1, i32 31), !dbg !57
|
624 |
+
%603 = bitcast float %600 to i32, !dbg !57
|
625 |
+
%604 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %603, i32 1, i32 31), !dbg !57
|
626 |
+
%605 = bitcast float %590 to i32, !dbg !57
|
627 |
+
%606 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %605, i32 1, i32 31), !dbg !57
|
628 |
+
%607 = bitcast i32 %606 to float, !dbg !57
|
629 |
+
%608 = fadd float %590, %607, !dbg !59
|
630 |
+
%609 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %607, float %608) #6, !dbg !49
|
631 |
+
%610 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %53, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !62
|
632 |
+
%611 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %55, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !62
|
633 |
+
%612 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %57, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !62
|
634 |
+
%613 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %59, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !62
|
635 |
+
%614 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %101, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !63
|
636 |
+
%615 = extractvalue { i32, i32, i32, i32 } %614, 0, !dbg !63
|
637 |
+
%616 = extractvalue { i32, i32, i32, i32 } %614, 1, !dbg !63
|
638 |
+
%617 = extractvalue { i32, i32, i32, i32 } %614, 2, !dbg !63
|
639 |
+
%618 = extractvalue { i32, i32, i32, i32 } %614, 3, !dbg !63
|
640 |
+
%619 = trunc i32 %615 to i16, !dbg !63
|
641 |
+
%extelt.offset8 = lshr i32 %615, 16, !dbg !63
|
642 |
+
%620 = trunc i32 %extelt.offset8 to i16, !dbg !63
|
643 |
+
%621 = trunc i32 %616 to i16, !dbg !63
|
644 |
+
%extelt.offset9 = lshr i32 %616, 16, !dbg !63
|
645 |
+
%622 = trunc i32 %extelt.offset9 to i16, !dbg !63
|
646 |
+
%623 = trunc i32 %617 to i16, !dbg !63
|
647 |
+
%extelt.offset10 = lshr i32 %617, 16, !dbg !63
|
648 |
+
%624 = trunc i32 %extelt.offset10 to i16, !dbg !63
|
649 |
+
%625 = trunc i32 %618 to i16, !dbg !63
|
650 |
+
%extelt.offset11 = lshr i32 %618, 16, !dbg !63
|
651 |
+
%626 = trunc i32 %extelt.offset11 to i16, !dbg !63
|
652 |
+
%627 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %103, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !63
|
653 |
+
%628 = extractvalue { i32, i32, i32, i32 } %627, 0, !dbg !63
|
654 |
+
%629 = extractvalue { i32, i32, i32, i32 } %627, 1, !dbg !63
|
655 |
+
%630 = extractvalue { i32, i32, i32, i32 } %627, 2, !dbg !63
|
656 |
+
%631 = extractvalue { i32, i32, i32, i32 } %627, 3, !dbg !63
|
657 |
+
%632 = trunc i32 %628 to i16, !dbg !63
|
658 |
+
%extelt.offset12 = lshr i32 %628, 16, !dbg !63
|
659 |
+
%633 = trunc i32 %extelt.offset12 to i16, !dbg !63
|
660 |
+
%634 = trunc i32 %629 to i16, !dbg !63
|
661 |
+
%extelt.offset13 = lshr i32 %629, 16, !dbg !63
|
662 |
+
%635 = trunc i32 %extelt.offset13 to i16, !dbg !63
|
663 |
+
%636 = trunc i32 %630 to i16, !dbg !63
|
664 |
+
%extelt.offset14 = lshr i32 %630, 16, !dbg !63
|
665 |
+
%637 = trunc i32 %extelt.offset14 to i16, !dbg !63
|
666 |
+
%638 = trunc i32 %631 to i16, !dbg !63
|
667 |
+
%extelt.offset15 = lshr i32 %631, 16, !dbg !63
|
668 |
+
%639 = trunc i32 %extelt.offset15 to i16, !dbg !63
|
669 |
+
%640 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %619) #6, !dbg !64
|
670 |
+
%641 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %620) #6, !dbg !64
|
671 |
+
%642 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %621) #6, !dbg !64
|
672 |
+
%643 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %622) #6, !dbg !64
|
673 |
+
%644 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %623) #6, !dbg !64
|
674 |
+
%645 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %624) #6, !dbg !64
|
675 |
+
%646 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %625) #6, !dbg !64
|
676 |
+
%647 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %626) #6, !dbg !64
|
677 |
+
%648 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %632) #6, !dbg !64
|
678 |
+
%649 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %633) #6, !dbg !64
|
679 |
+
%650 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %634) #6, !dbg !64
|
680 |
+
%651 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %635) #6, !dbg !64
|
681 |
+
%652 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %636) #6, !dbg !64
|
682 |
+
%653 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %637) #6, !dbg !64
|
683 |
+
%654 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %638) #6, !dbg !64
|
684 |
+
%655 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %639) #6, !dbg !64
|
685 |
+
%656 = zext nneg i32 %urem to i64, !dbg !65
|
686 |
+
%657 = getelementptr float, ptr addrspace(1) %4, i64 %656, !dbg !65
|
687 |
+
%658 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %657, i1 true, i32 0, i1 true) #6, !dbg !66
|
688 |
+
br i1 %151, label %659, label %660, !dbg !67
|
689 |
+
|
690 |
+
659: ; preds = %153
|
691 |
+
tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 883, ptr nonnull @assertFunc_1, i64 1), !dbg !67
|
692 |
+
br label %660, !dbg !67
|
693 |
+
|
694 |
+
660: ; preds = %659, %153
|
695 |
+
%661 = bitcast i32 %604 to float, !dbg !57
|
696 |
+
%662 = fadd float %600, %661, !dbg !52
|
697 |
+
%663 = bitcast i32 %602 to float, !dbg !57
|
698 |
+
%664 = fsub float %663, %595, !dbg !45
|
699 |
+
%665 = fmul float %664, %664, !dbg !53
|
700 |
+
%666 = fmul float %590, %665, !dbg !56
|
701 |
+
%667 = fcmp oeq float %608, 0.000000e+00, !dbg !60
|
702 |
+
%668 = select i1 %667, float 0.000000e+00, float %609, !dbg !61
|
703 |
+
%669 = fmul float %668, %666, !dbg !54
|
704 |
+
%670 = fadd float %662, %669, !dbg !55
|
705 |
+
%671 = bitcast i32 %512 to float, !dbg !57
|
706 |
+
%672 = fadd float %508, %671, !dbg !52
|
707 |
+
%673 = bitcast i32 %510 to float, !dbg !57
|
708 |
+
%674 = fsub float %673, %503, !dbg !45
|
709 |
+
%675 = fmul float %674, %674, !dbg !53
|
710 |
+
%676 = fmul float %498, %675, !dbg !56
|
711 |
+
%677 = fcmp oeq float %516, 0.000000e+00, !dbg !60
|
712 |
+
%678 = select i1 %677, float 0.000000e+00, float %517, !dbg !61
|
713 |
+
%679 = fmul float %678, %676, !dbg !54
|
714 |
+
%680 = fadd float %672, %679, !dbg !55
|
715 |
+
%681 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %166, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68
|
716 |
+
%682 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %167, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68
|
717 |
+
%683 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %168, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68
|
718 |
+
%684 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %169, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68
|
719 |
+
%685 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %680, float 2.560000e+02) #6, !dbg !69
|
720 |
+
%686 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %680, float 2.560000e+02) #6, !dbg !69
|
721 |
+
%687 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %680, float 2.560000e+02) #6, !dbg !69
|
722 |
+
%688 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %680, float 2.560000e+02) #6, !dbg !69
|
723 |
+
%689 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %680, float 2.560000e+02) #6, !dbg !69
|
724 |
+
%690 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %680, float 2.560000e+02) #6, !dbg !69
|
725 |
+
%691 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %680, float 2.560000e+02) #6, !dbg !69
|
726 |
+
%692 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %680, float 2.560000e+02) #6, !dbg !69
|
727 |
+
%693 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %670, float 2.560000e+02) #6, !dbg !69
|
728 |
+
%694 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %670, float 2.560000e+02) #6, !dbg !69
|
729 |
+
%695 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %670, float 2.560000e+02) #6, !dbg !69
|
730 |
+
%696 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %670, float 2.560000e+02) #6, !dbg !69
|
731 |
+
%697 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %670, float 2.560000e+02) #6, !dbg !69
|
732 |
+
%698 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %670, float 2.560000e+02) #6, !dbg !69
|
733 |
+
%699 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %670, float 2.560000e+02) #6, !dbg !69
|
734 |
+
%700 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %670, float 2.560000e+02) #6, !dbg !69
|
735 |
+
%701 = fadd float %685, 0x3EE4F8B580000000, !dbg !70
|
736 |
+
%702 = fadd float %693, 0x3EE4F8B580000000, !dbg !70
|
737 |
+
%703 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
|
738 |
+
%.not.i = icmp eq i32 %703, 0, !dbg !71
|
739 |
+
br i1 %.not.i, label %706, label %704, !dbg !71
|
740 |
+
|
741 |
+
704: ; preds = %660
|
742 |
+
%705 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %701), !dbg !71
|
743 |
+
br label %__nv_rsqrtf.exit, !dbg !71
|
744 |
+
|
745 |
+
706: ; preds = %660
|
746 |
+
%707 = tail call float @llvm.nvvm.rsqrt.approx.f(float %701), !dbg !71
|
747 |
+
br label %__nv_rsqrtf.exit, !dbg !71
|
748 |
+
|
749 |
+
__nv_rsqrtf.exit: ; preds = %704, %706
|
750 |
+
%.0.i = phi float [ %705, %704 ], [ %707, %706 ], !dbg !71
|
751 |
+
%708 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
|
752 |
+
%709 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
|
753 |
+
%710 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
|
754 |
+
%711 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
|
755 |
+
%712 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
|
756 |
+
%713 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
|
757 |
+
%714 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
|
758 |
+
%715 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
|
759 |
+
%.not.i37 = icmp eq i32 %715, 0, !dbg !71
|
760 |
+
br i1 %.not.i37, label %718, label %716, !dbg !71
|
761 |
+
|
762 |
+
716: ; preds = %__nv_rsqrtf.exit
|
763 |
+
%717 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %702), !dbg !71
|
764 |
+
br label %__nv_rsqrtf.exit39, !dbg !71
|
765 |
+
|
766 |
+
718: ; preds = %__nv_rsqrtf.exit
|
767 |
+
%719 = tail call float @llvm.nvvm.rsqrt.approx.f(float %702), !dbg !71
|
768 |
+
br label %__nv_rsqrtf.exit39, !dbg !71
|
769 |
+
|
770 |
+
__nv_rsqrtf.exit39: ; preds = %716, %718
|
771 |
+
%.0.i38 = phi float [ %717, %716 ], [ %719, %718 ], !dbg !71
|
772 |
+
%720 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
|
773 |
+
%721 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
|
774 |
+
%722 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
|
775 |
+
%723 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
|
776 |
+
%724 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
|
777 |
+
%725 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
|
778 |
+
%726 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
|
779 |
+
%727 = extractvalue { i32, i32, i32, i32 } %684, 3, !dbg !68
|
780 |
+
%728 = bitcast i32 %727 to float, !dbg !68
|
781 |
+
%729 = extractvalue { i32, i32, i32, i32 } %613, 3, !dbg !62
|
782 |
+
%730 = bitcast i32 %729 to float, !dbg !62
|
783 |
+
%731 = fadd float %730, %728, !dbg !72
|
784 |
+
%732 = fadd float %655, %731, !dbg !73
|
785 |
+
%733 = fmul float %664, %668, !dbg !50
|
786 |
+
%734 = fadd float %595, %733, !dbg !51
|
787 |
+
%735 = fsub float %732, %734, !dbg !74
|
788 |
+
%736 = extractvalue { i32, i32, i32, i32 } %684, 2, !dbg !68
|
789 |
+
%737 = bitcast i32 %736 to float, !dbg !68
|
790 |
+
%738 = extractvalue { i32, i32, i32, i32 } %613, 2, !dbg !62
|
791 |
+
%739 = bitcast i32 %738 to float, !dbg !62
|
792 |
+
%740 = fadd float %739, %737, !dbg !72
|
793 |
+
%741 = fadd float %654, %740, !dbg !73
|
794 |
+
%742 = fsub float %741, %734, !dbg !74
|
795 |
+
%743 = extractvalue { i32, i32, i32, i32 } %684, 1, !dbg !68
|
796 |
+
%744 = bitcast i32 %743 to float, !dbg !68
|
797 |
+
%745 = extractvalue { i32, i32, i32, i32 } %613, 1, !dbg !62
|
798 |
+
%746 = bitcast i32 %745 to float, !dbg !62
|
799 |
+
%747 = fadd float %746, %744, !dbg !72
|
800 |
+
%748 = fadd float %653, %747, !dbg !73
|
801 |
+
%749 = fsub float %748, %734, !dbg !74
|
802 |
+
%750 = extractvalue { i32, i32, i32, i32 } %684, 0, !dbg !68
|
803 |
+
%751 = bitcast i32 %750 to float, !dbg !68
|
804 |
+
%752 = extractvalue { i32, i32, i32, i32 } %613, 0, !dbg !62
|
805 |
+
%753 = bitcast i32 %752 to float, !dbg !62
|
806 |
+
%754 = fadd float %753, %751, !dbg !72
|
807 |
+
%755 = fadd float %652, %754, !dbg !73
|
808 |
+
%756 = fsub float %755, %734, !dbg !74
|
809 |
+
%757 = extractvalue { i32, i32, i32, i32 } %683, 3, !dbg !68
|
810 |
+
%758 = bitcast i32 %757 to float, !dbg !68
|
811 |
+
%759 = extractvalue { i32, i32, i32, i32 } %612, 3, !dbg !62
|
812 |
+
%760 = bitcast i32 %759 to float, !dbg !62
|
813 |
+
%761 = fadd float %760, %758, !dbg !72
|
814 |
+
%762 = fadd float %651, %761, !dbg !73
|
815 |
+
%763 = fsub float %762, %734, !dbg !74
|
816 |
+
%764 = extractvalue { i32, i32, i32, i32 } %683, 2, !dbg !68
|
817 |
+
%765 = bitcast i32 %764 to float, !dbg !68
|
818 |
+
%766 = extractvalue { i32, i32, i32, i32 } %612, 2, !dbg !62
|
819 |
+
%767 = bitcast i32 %766 to float, !dbg !62
|
820 |
+
%768 = fadd float %767, %765, !dbg !72
|
821 |
+
%769 = fadd float %650, %768, !dbg !73
|
822 |
+
%770 = fsub float %769, %734, !dbg !74
|
823 |
+
%771 = extractvalue { i32, i32, i32, i32 } %683, 1, !dbg !68
|
824 |
+
%772 = bitcast i32 %771 to float, !dbg !68
|
825 |
+
%773 = extractvalue { i32, i32, i32, i32 } %612, 1, !dbg !62
|
826 |
+
%774 = bitcast i32 %773 to float, !dbg !62
|
827 |
+
%775 = fadd float %774, %772, !dbg !72
|
828 |
+
%776 = fadd float %649, %775, !dbg !73
|
829 |
+
%777 = fsub float %776, %734, !dbg !74
|
830 |
+
%778 = extractvalue { i32, i32, i32, i32 } %683, 0, !dbg !68
|
831 |
+
%779 = bitcast i32 %778 to float, !dbg !68
|
832 |
+
%780 = extractvalue { i32, i32, i32, i32 } %612, 0, !dbg !62
|
833 |
+
%781 = bitcast i32 %780 to float, !dbg !62
|
834 |
+
%782 = fadd float %781, %779, !dbg !72
|
835 |
+
%783 = fadd float %648, %782, !dbg !73
|
836 |
+
%784 = fsub float %783, %734, !dbg !74
|
837 |
+
%785 = extractvalue { i32, i32, i32, i32 } %682, 3, !dbg !68
|
838 |
+
%786 = bitcast i32 %785 to float, !dbg !68
|
839 |
+
%787 = extractvalue { i32, i32, i32, i32 } %611, 3, !dbg !62
|
840 |
+
%788 = bitcast i32 %787 to float, !dbg !62
|
841 |
+
%789 = fadd float %788, %786, !dbg !72
|
842 |
+
%790 = fadd float %647, %789, !dbg !73
|
843 |
+
%791 = fmul float %674, %678, !dbg !50
|
844 |
+
%792 = fadd float %503, %791, !dbg !51
|
845 |
+
%793 = fsub float %790, %792, !dbg !74
|
846 |
+
%794 = extractvalue { i32, i32, i32, i32 } %682, 2, !dbg !68
|
847 |
+
%795 = bitcast i32 %794 to float, !dbg !68
|
848 |
+
%796 = extractvalue { i32, i32, i32, i32 } %611, 2, !dbg !62
|
849 |
+
%797 = bitcast i32 %796 to float, !dbg !62
|
850 |
+
%798 = fadd float %797, %795, !dbg !72
|
851 |
+
%799 = fadd float %646, %798, !dbg !73
|
852 |
+
%800 = fsub float %799, %792, !dbg !74
|
853 |
+
%801 = extractvalue { i32, i32, i32, i32 } %682, 1, !dbg !68
|
854 |
+
%802 = bitcast i32 %801 to float, !dbg !68
|
855 |
+
%803 = extractvalue { i32, i32, i32, i32 } %611, 1, !dbg !62
|
856 |
+
%804 = bitcast i32 %803 to float, !dbg !62
|
857 |
+
%805 = fadd float %804, %802, !dbg !72
|
858 |
+
%806 = fadd float %645, %805, !dbg !73
|
859 |
+
%807 = fsub float %806, %792, !dbg !74
|
860 |
+
%808 = extractvalue { i32, i32, i32, i32 } %682, 0, !dbg !68
|
861 |
+
%809 = bitcast i32 %808 to float, !dbg !68
|
862 |
+
%810 = extractvalue { i32, i32, i32, i32 } %611, 0, !dbg !62
|
863 |
+
%811 = bitcast i32 %810 to float, !dbg !62
|
864 |
+
%812 = fadd float %811, %809, !dbg !72
|
865 |
+
%813 = fadd float %644, %812, !dbg !73
|
866 |
+
%814 = fsub float %813, %792, !dbg !74
|
867 |
+
%815 = extractvalue { i32, i32, i32, i32 } %681, 3, !dbg !68
|
868 |
+
%816 = bitcast i32 %815 to float, !dbg !68
|
869 |
+
%817 = extractvalue { i32, i32, i32, i32 } %610, 3, !dbg !62
|
870 |
+
%818 = bitcast i32 %817 to float, !dbg !62
|
871 |
+
%819 = fadd float %818, %816, !dbg !72
|
872 |
+
%820 = fadd float %643, %819, !dbg !73
|
873 |
+
%821 = fsub float %820, %792, !dbg !74
|
874 |
+
%822 = extractvalue { i32, i32, i32, i32 } %681, 2, !dbg !68
|
875 |
+
%823 = bitcast i32 %822 to float, !dbg !68
|
876 |
+
%824 = extractvalue { i32, i32, i32, i32 } %610, 2, !dbg !62
|
877 |
+
%825 = bitcast i32 %824 to float, !dbg !62
|
878 |
+
%826 = fadd float %825, %823, !dbg !72
|
879 |
+
%827 = fadd float %642, %826, !dbg !73
|
880 |
+
%828 = fsub float %827, %792, !dbg !74
|
881 |
+
%829 = extractvalue { i32, i32, i32, i32 } %681, 1, !dbg !68
|
882 |
+
%830 = bitcast i32 %829 to float, !dbg !68
|
883 |
+
%831 = extractvalue { i32, i32, i32, i32 } %610, 1, !dbg !62
|
884 |
+
%832 = bitcast i32 %831 to float, !dbg !62
|
885 |
+
%833 = fadd float %832, %830, !dbg !72
|
886 |
+
%834 = fadd float %641, %833, !dbg !73
|
887 |
+
%835 = fsub float %834, %792, !dbg !74
|
888 |
+
%836 = extractvalue { i32, i32, i32, i32 } %681, 0, !dbg !68
|
889 |
+
%837 = bitcast i32 %836 to float, !dbg !68
|
890 |
+
%838 = extractvalue { i32, i32, i32, i32 } %610, 0, !dbg !62
|
891 |
+
%839 = bitcast i32 %838 to float, !dbg !62
|
892 |
+
%840 = fadd float %839, %837, !dbg !72
|
893 |
+
%841 = fadd float %640, %840, !dbg !73
|
894 |
+
%842 = fsub float %841, %792, !dbg !74
|
895 |
+
%843 = fmul float %842, %.0.i, !dbg !75
|
896 |
+
%844 = fmul float %835, %.0.i, !dbg !75
|
897 |
+
%845 = fmul float %828, %.0.i, !dbg !75
|
898 |
+
%846 = fmul float %821, %.0.i, !dbg !75
|
899 |
+
%847 = fmul float %814, %.0.i, !dbg !75
|
900 |
+
%848 = fmul float %807, %.0.i, !dbg !75
|
901 |
+
%849 = fmul float %800, %.0.i, !dbg !75
|
902 |
+
%850 = fmul float %793, %.0.i, !dbg !75
|
903 |
+
%851 = fmul float %784, %.0.i38, !dbg !75
|
904 |
+
%852 = fmul float %777, %.0.i38, !dbg !75
|
905 |
+
%853 = fmul float %770, %.0.i38, !dbg !75
|
906 |
+
%854 = fmul float %763, %.0.i38, !dbg !75
|
907 |
+
%855 = fmul float %756, %.0.i38, !dbg !75
|
908 |
+
%856 = fmul float %749, %.0.i38, !dbg !75
|
909 |
+
%857 = fmul float %742, %.0.i38, !dbg !75
|
910 |
+
%858 = fmul float %735, %.0.i38, !dbg !75
|
911 |
+
%859 = getelementptr float, ptr addrspace(3) @global_smem, i64 %656, !dbg !76
|
912 |
+
store i32 %658, ptr addrspace(3) %859, align 4, !dbg !76
|
913 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !76
|
914 |
+
%860 = getelementptr float, ptr addrspace(3) @global_smem, i64 %160, !dbg !76
|
915 |
+
%861 = load float, ptr addrspace(3) %860, align 32, !dbg !76
|
916 |
+
%862 = getelementptr inbounds <8 x float>, ptr addrspace(3) %860, i64 0, i64 1, !dbg !76
|
917 |
+
%863 = load float, ptr addrspace(3) %862, align 4, !dbg !76
|
918 |
+
%864 = getelementptr inbounds <8 x float>, ptr addrspace(3) %860, i64 0, i64 2, !dbg !76
|
919 |
+
%865 = load float, ptr addrspace(3) %864, align 8, !dbg !76
|
920 |
+
%866 = getelementptr inbounds <8 x float>, ptr addrspace(3) %860, i64 0, i64 3, !dbg !76
|
921 |
+
%867 = load float, ptr addrspace(3) %866, align 4, !dbg !76
|
922 |
+
%868 = getelementptr inbounds <8 x float>, ptr addrspace(3) %860, i64 0, i64 4, !dbg !76
|
923 |
+
%869 = load float, ptr addrspace(3) %868, align 16, !dbg !76
|
924 |
+
%870 = getelementptr inbounds <8 x float>, ptr addrspace(3) %860, i64 0, i64 5, !dbg !76
|
925 |
+
%871 = load float, ptr addrspace(3) %870, align 4, !dbg !76
|
926 |
+
%872 = getelementptr inbounds <8 x float>, ptr addrspace(3) %860, i64 0, i64 6, !dbg !76
|
927 |
+
%873 = load float, ptr addrspace(3) %872, align 8, !dbg !76
|
928 |
+
%874 = getelementptr inbounds <8 x float>, ptr addrspace(3) %860, i64 0, i64 7, !dbg !76
|
929 |
+
%875 = load float, ptr addrspace(3) %874, align 4, !dbg !76
|
930 |
+
%876 = fmul float %843, %861, !dbg !76
|
931 |
+
%877 = fmul float %844, %863, !dbg !76
|
932 |
+
%878 = fmul float %845, %865, !dbg !76
|
933 |
+
%879 = fmul float %846, %867, !dbg !76
|
934 |
+
%880 = fmul float %847, %869, !dbg !76
|
935 |
+
%881 = fmul float %848, %871, !dbg !76
|
936 |
+
%882 = fmul float %849, %873, !dbg !76
|
937 |
+
%883 = fmul float %850, %875, !dbg !76
|
938 |
+
%884 = fmul float %851, %861, !dbg !76
|
939 |
+
%885 = fmul float %852, %863, !dbg !76
|
940 |
+
%886 = fmul float %853, %865, !dbg !76
|
941 |
+
%887 = fmul float %854, %867, !dbg !76
|
942 |
+
%888 = fmul float %855, %869, !dbg !76
|
943 |
+
%889 = fmul float %856, %871, !dbg !76
|
944 |
+
%890 = fmul float %857, %873, !dbg !76
|
945 |
+
%891 = fmul float %858, %875, !dbg !76
|
946 |
+
%892 = getelementptr i16, ptr addrspace(1) %5, i64 %100, !dbg !77
|
947 |
+
%893 = getelementptr i16, ptr addrspace(1) %5, i64 %102, !dbg !77
|
948 |
+
%894 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %876) #6, !dbg !78
|
949 |
+
%895 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %877) #6, !dbg !78
|
950 |
+
%896 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %878) #6, !dbg !78
|
951 |
+
%897 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %879) #6, !dbg !78
|
952 |
+
%898 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %880) #6, !dbg !78
|
953 |
+
%899 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %881) #6, !dbg !78
|
954 |
+
%900 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %882) #6, !dbg !78
|
955 |
+
%901 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %883) #6, !dbg !78
|
956 |
+
%902 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %884) #6, !dbg !78
|
957 |
+
%903 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %885) #6, !dbg !78
|
958 |
+
%904 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %886) #6, !dbg !78
|
959 |
+
%905 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %887) #6, !dbg !78
|
960 |
+
%906 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %888) #6, !dbg !78
|
961 |
+
%907 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %889) #6, !dbg !78
|
962 |
+
%908 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %890) #6, !dbg !78
|
963 |
+
%909 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %891) #6, !dbg !78
|
964 |
+
%910 = insertelement <2 x i16> undef, i16 %894, i64 0, !dbg !78
|
965 |
+
%911 = insertelement <2 x i16> %910, i16 %895, i64 1, !dbg !78
|
966 |
+
%912 = bitcast <2 x i16> %911 to i32, !dbg !78
|
967 |
+
%913 = insertelement <2 x i16> undef, i16 %896, i64 0, !dbg !78
|
968 |
+
%914 = insertelement <2 x i16> %913, i16 %897, i64 1, !dbg !78
|
969 |
+
%915 = bitcast <2 x i16> %914 to i32, !dbg !78
|
970 |
+
%916 = insertelement <2 x i16> undef, i16 %898, i64 0, !dbg !78
|
971 |
+
%917 = insertelement <2 x i16> %916, i16 %899, i64 1, !dbg !78
|
972 |
+
%918 = bitcast <2 x i16> %917 to i32, !dbg !78
|
973 |
+
%919 = insertelement <2 x i16> undef, i16 %900, i64 0, !dbg !78
|
974 |
+
%920 = insertelement <2 x i16> %919, i16 %901, i64 1, !dbg !78
|
975 |
+
%921 = bitcast <2 x i16> %920 to i32, !dbg !78
|
976 |
+
tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %912, i32 %915, i32 %918, i32 %921, ptr addrspace(1) %892, i1 true) #6, !dbg !78
|
977 |
+
%922 = insertelement <2 x i16> undef, i16 %902, i64 0, !dbg !78
|
978 |
+
%923 = insertelement <2 x i16> %922, i16 %903, i64 1, !dbg !78
|
979 |
+
%924 = bitcast <2 x i16> %923 to i32, !dbg !78
|
980 |
+
%925 = insertelement <2 x i16> undef, i16 %904, i64 0, !dbg !78
|
981 |
+
%926 = insertelement <2 x i16> %925, i16 %905, i64 1, !dbg !78
|
982 |
+
%927 = bitcast <2 x i16> %926 to i32, !dbg !78
|
983 |
+
%928 = insertelement <2 x i16> undef, i16 %906, i64 0, !dbg !78
|
984 |
+
%929 = insertelement <2 x i16> %928, i16 %907, i64 1, !dbg !78
|
985 |
+
%930 = bitcast <2 x i16> %929 to i32, !dbg !78
|
986 |
+
%931 = insertelement <2 x i16> undef, i16 %908, i64 0, !dbg !78
|
987 |
+
%932 = insertelement <2 x i16> %931, i16 %909, i64 1, !dbg !78
|
988 |
+
%933 = bitcast <2 x i16> %932 to i32, !dbg !78
|
989 |
+
tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %924, i32 %927, i32 %930, i32 %933, ptr addrspace(1) %893, i1 true) #6, !dbg !78
|
990 |
+
ret void, !dbg !79
|
991 |
+
}
|
992 |
+
|
993 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
994 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
995 |
+
|
996 |
+
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
|
997 |
+
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
|
998 |
+
|
999 |
+
; Function Attrs: convergent nocallback nounwind
|
1000 |
+
declare void @llvm.nvvm.barrier0() #2
|
1001 |
+
|
1002 |
+
; Function Attrs: alwaysinline nounwind
|
1003 |
+
define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
|
1004 |
+
%1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
|
1005 |
+
%.not = icmp eq i32 %1, 0
|
1006 |
+
br i1 %.not, label %4, label %2
|
1007 |
+
|
1008 |
+
2: ; preds = %0
|
1009 |
+
%3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
|
1010 |
+
br label %6
|
1011 |
+
|
1012 |
+
4: ; preds = %0
|
1013 |
+
%5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
|
1014 |
+
br label %6
|
1015 |
+
|
1016 |
+
6: ; preds = %4, %2
|
1017 |
+
%.0 = phi float [ %3, %2 ], [ %5, %4 ]
|
1018 |
+
ret float %.0
|
1019 |
+
}
|
1020 |
+
|
1021 |
+
declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
|
1022 |
+
|
1023 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
|
1024 |
+
declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
|
1025 |
+
|
1026 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
|
1027 |
+
declare float @llvm.nvvm.rsqrt.approx.f(float) #5
|
1028 |
+
|
1029 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
1030 |
+
attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
|
1031 |
+
attributes #2 = { convergent nocallback nounwind }
|
1032 |
+
attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
1033 |
+
attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
1034 |
+
attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
|
1035 |
+
attributes #6 = { nounwind }
|
1036 |
+
|
1037 |
+
!llvm.module.flags = !{!0, !1}
|
1038 |
+
!llvm.dbg.cu = !{!2}
|
1039 |
+
!nvvm.annotations = !{!4, !5, !5, !4}
|
1040 |
+
!llvm.ident = !{!6}
|
1041 |
+
|
1042 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
1043 |
+
!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
|
1044 |
+
!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
1045 |
+
!3 = !DIFile(filename: "cpn3lawg65lpi63gv6c6pn4oikhg6qva2h2qjdpxe6qj4lvttwez.py", directory: "/tmp/torchinductor_root/pn")
|
1046 |
+
!4 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"kernel", i32 1}
|
1047 |
+
!5 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"maxntidx", i32 256}
|
1048 |
+
!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
|
1049 |
+
!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6de7de", linkageName: "triton__0d1d2d3d4d5d6de7de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
|
1050 |
+
!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
|
1051 |
+
!9 = !{}
|
1052 |
+
!10 = !DILocation(line: 22, column: 44, scope: !7)
|
1053 |
+
!11 = !DILocation(line: 24, column: 33, scope: !7)
|
1054 |
+
!12 = !DILocation(line: 21, column: 28, scope: !7)
|
1055 |
+
!13 = !DILocation(line: 21, column: 33, scope: !7)
|
1056 |
+
!14 = !DILocation(line: 22, column: 23, scope: !7)
|
1057 |
+
!15 = !DILocation(line: 26, column: 30, scope: !7)
|
1058 |
+
!16 = !DILocation(line: 26, column: 35, scope: !7)
|
1059 |
+
!17 = !DILocation(line: 27, column: 18, scope: !7)
|
1060 |
+
!18 = !DILocation(line: 35, column: 44, scope: !7)
|
1061 |
+
!19 = !DILocation(line: 35, column: 40, scope: !7)
|
1062 |
+
!20 = !DILocation(line: 35, column: 34, scope: !7)
|
1063 |
+
!21 = !DILocation(line: 35, column: 50, scope: !7)
|
1064 |
+
!22 = !DILocation(line: 36, column: 44, scope: !7)
|
1065 |
+
!23 = !DILocation(line: 36, column: 40, scope: !7)
|
1066 |
+
!24 = !DILocation(line: 36, column: 34, scope: !7)
|
1067 |
+
!25 = !DILocation(line: 36, column: 50, scope: !7)
|
1068 |
+
!26 = !DILocation(line: 36, column: 101, scope: !7)
|
1069 |
+
!27 = !DILocation(line: 37, column: 22, scope: !7)
|
1070 |
+
!28 = !DILocation(line: 38, column: 22, scope: !7)
|
1071 |
+
!29 = !DILocation(line: 39, column: 36, scope: !7)
|
1072 |
+
!30 = !DILocation(line: 40, column: 40, scope: !7)
|
1073 |
+
!31 = !DILocation(line: 40, column: 55, scope: !7)
|
1074 |
+
!32 = !DILocation(line: 41, column: 44, scope: !7)
|
1075 |
+
!33 = !DILocation(line: 41, column: 40, scope: !7)
|
1076 |
+
!34 = !DILocation(line: 41, column: 34, scope: !7)
|
1077 |
+
!35 = !DILocation(line: 41, column: 52, scope: !7)
|
1078 |
+
!36 = !DILocation(line: 42, column: 22, scope: !7)
|
1079 |
+
!37 = !DILocation(line: 44, column: 22, scope: !7)
|
1080 |
+
!38 = !DILocation(line: 98, column: 22, scope: !39, inlinedAt: !41)
|
1081 |
+
!39 = distinct !DILexicalBlockFile(scope: !7, file: !40, discriminator: 0)
|
1082 |
+
!40 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
|
1083 |
+
!41 = !DILocation(line: 47, column: 41, scope: !39)
|
1084 |
+
!42 = !DILocation(line: 101, column: 30, scope: !39, inlinedAt: !41)
|
1085 |
+
!43 = !DILocation(line: 101, column: 22, scope: !39, inlinedAt: !41)
|
1086 |
+
!44 = !DILocation(line: 101, column: 13, scope: !39, inlinedAt: !41)
|
1087 |
+
!45 = !DILocation(line: 108, column: 21, scope: !46, inlinedAt: !47)
|
1088 |
+
!46 = distinct !DILexicalBlockFile(scope: !39, file: !40, discriminator: 0)
|
1089 |
+
!47 = !DILocation(line: 120, column: 46, scope: !46, inlinedAt: !48)
|
1090 |
+
!48 = !DILocation(line: 53, column: 44, scope: !46)
|
1091 |
+
!49 = !DILocation(line: 110, column: 60, scope: !46, inlinedAt: !47)
|
1092 |
+
!50 = !DILocation(line: 112, column: 25, scope: !46, inlinedAt: !47)
|
1093 |
+
!51 = !DILocation(line: 112, column: 17, scope: !46, inlinedAt: !47)
|
1094 |
+
!52 = !DILocation(line: 113, column: 15, scope: !46, inlinedAt: !47)
|
1095 |
+
!53 = !DILocation(line: 113, column: 30, scope: !46, inlinedAt: !47)
|
1096 |
+
!54 = !DILocation(line: 113, column: 49, scope: !46, inlinedAt: !47)
|
1097 |
+
!55 = !DILocation(line: 113, column: 22, scope: !46, inlinedAt: !47)
|
1098 |
+
!56 = !DILocation(line: 113, column: 38, scope: !46, inlinedAt: !47)
|
1099 |
+
!57 = !DILocation(line: 120, column: 46, scope: !39, inlinedAt: !58)
|
1100 |
+
!58 = !DILocation(line: 53, column: 44, scope: !39)
|
1101 |
+
!59 = !DILocation(line: 109, column: 28, scope: !46, inlinedAt: !47)
|
1102 |
+
!60 = !DILocation(line: 110, column: 39, scope: !46, inlinedAt: !47)
|
1103 |
+
!61 = !DILocation(line: 110, column: 49, scope: !46, inlinedAt: !47)
|
1104 |
+
!62 = !DILocation(line: 62, column: 51, scope: !7)
|
1105 |
+
!63 = !DILocation(line: 63, column: 51, scope: !7)
|
1106 |
+
!64 = !DILocation(line: 63, column: 103, scope: !7)
|
1107 |
+
!65 = !DILocation(line: 64, column: 35, scope: !7)
|
1108 |
+
!66 = !DILocation(line: 64, column: 40, scope: !7)
|
1109 |
+
!67 = !DILocation(line: 68, column: 57, scope: !7)
|
1110 |
+
!68 = !DILocation(line: 69, column: 54, scope: !7)
|
1111 |
+
!69 = !DILocation(line: 75, column: 24, scope: !7)
|
1112 |
+
!70 = !DILocation(line: 77, column: 24, scope: !7)
|
1113 |
+
!71 = !DILocation(line: 78, column: 30, scope: !7)
|
1114 |
+
!72 = !DILocation(line: 70, column: 24, scope: !7)
|
1115 |
+
!73 = !DILocation(line: 72, column: 24, scope: !7)
|
1116 |
+
!74 = !DILocation(line: 73, column: 24, scope: !7)
|
1117 |
+
!75 = !DILocation(line: 79, column: 24, scope: !7)
|
1118 |
+
!76 = !DILocation(line: 80, column: 24, scope: !7)
|
1119 |
+
!77 = !DILocation(line: 82, column: 29, scope: !7)
|
1120 |
+
!78 = !DILocation(line: 82, column: 52, scope: !7)
|
1121 |
+
!79 = !DILocation(line: 58, column: 4, scope: !7)
|
.triton/dump/174400122b6dbc99e086544aa1856b9f/triton_.llir
ADDED
@@ -0,0 +1,330 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
@assertFunc_1 = internal constant [25 x i8] c"_call_with_frames_removed"
|
5 |
+
@assertFile_1 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
|
6 |
+
@assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp13 < 50257"
|
7 |
+
@assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed"
|
8 |
+
@assertFile_0 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
|
9 |
+
@assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257"
|
10 |
+
@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
|
11 |
+
|
12 |
+
declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr
|
13 |
+
|
14 |
+
define void @triton__0d1d2d3d4d5de6de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i32 %5, i32 %6) local_unnamed_addr !dbg !7 {
|
15 |
+
%8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
|
16 |
+
%9 = lshr i32 %8, 2, !dbg !10
|
17 |
+
%10 = and i32 %9, 63, !dbg !10
|
18 |
+
%11 = and i32 %8, 63, !dbg !10
|
19 |
+
%12 = and i32 %8, 3, !dbg !11
|
20 |
+
%13 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #5, !dbg !12
|
21 |
+
%14 = shl i32 %13, 6, !dbg !13
|
22 |
+
%15 = or i32 %14, %10, !dbg !14
|
23 |
+
%16 = or i32 %14, %11, !dbg !14
|
24 |
+
%17 = sext i32 %15 to i64, !dbg !15
|
25 |
+
%18 = getelementptr i64, ptr addrspace(1) %0, i64 %17, !dbg !15
|
26 |
+
%19 = sext i32 %16 to i64, !dbg !15
|
27 |
+
%20 = getelementptr i64, ptr addrspace(1) %0, i64 %19, !dbg !15
|
28 |
+
%21 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %18, i1 true) #5, !dbg !16
|
29 |
+
%22 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %20, i1 true) #5, !dbg !16
|
30 |
+
%23 = srem i32 %15, 512, !dbg !17
|
31 |
+
%24 = shl nsw i32 %23, 8, !dbg !18
|
32 |
+
%25 = add i64 %22, 50257, !dbg !19
|
33 |
+
%26 = icmp slt i64 %21, 0, !dbg !20
|
34 |
+
%27 = icmp slt i64 %22, 0, !dbg !20
|
35 |
+
%28 = select i1 %27, i64 %25, i64 %22, !dbg !21
|
36 |
+
%.fr8 = freeze i64 %28, !dbg !22
|
37 |
+
%29 = icmp ugt i64 %.fr8, 50256, !dbg !22
|
38 |
+
%30 = shl i64 %21, 8, !dbg !23
|
39 |
+
%31 = add i64 %30, 12865792, !dbg !23
|
40 |
+
%32 = select i1 %26, i64 %31, i64 %30, !dbg !23
|
41 |
+
%33 = getelementptr float, ptr addrspace(1) %1, i64 %32
|
42 |
+
br i1 %29, label %.split.us, label %.split, !dbg !24
|
43 |
+
|
44 |
+
.split.us: ; preds = %7, %.split.us
|
45 |
+
%34 = phi float [ %50, %.split.us ], [ 0.000000e+00, %7 ]
|
46 |
+
%35 = phi float [ %55, %.split.us ], [ 0.000000e+00, %7 ]
|
47 |
+
%36 = phi float [ %52, %.split.us ], [ 0.000000e+00, %7 ]
|
48 |
+
%37 = phi i32 [ %56, %.split.us ], [ 0, %7 ]
|
49 |
+
%38 = or i32 %37, %12, !dbg !25
|
50 |
+
%39 = add i32 %38, %24, !dbg !26
|
51 |
+
%40 = sext i32 %39 to i64, !dbg !27
|
52 |
+
%41 = getelementptr float, ptr addrspace(1) %2, i64 %40, !dbg !27
|
53 |
+
%42 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %41, i1 true, i32 0, i1 true) #5, !dbg !28
|
54 |
+
%43 = bitcast i32 %42 to float, !dbg !28
|
55 |
+
tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !24
|
56 |
+
%44 = zext nneg i32 %38 to i64, !dbg !29
|
57 |
+
%45 = getelementptr float, ptr addrspace(1) %33, i64 %44, !dbg !30
|
58 |
+
%46 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %45, i1 true, i32 0, i1 true) #5, !dbg !31
|
59 |
+
%47 = bitcast i32 %46 to float, !dbg !31
|
60 |
+
%48 = fadd float %43, %47, !dbg !32
|
61 |
+
%49 = fsub float %48, %36, !dbg !33
|
62 |
+
%50 = fadd float %34, 1.000000e+00, !dbg !37
|
63 |
+
%51 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %49, float %50) #5, !dbg !38
|
64 |
+
%52 = fadd float %36, %51, !dbg !39
|
65 |
+
%53 = fsub float %48, %52, !dbg !40
|
66 |
+
%54 = fmul float %49, %53, !dbg !41
|
67 |
+
%55 = fadd float %35, %54, !dbg !42
|
68 |
+
%56 = add nuw nsw i32 %37, 4, !dbg !43
|
69 |
+
%57 = icmp ult i32 %37, 252, !dbg !43
|
70 |
+
br i1 %57, label %.split.us, label %.split5.us, !dbg !43
|
71 |
+
|
72 |
+
.split: ; preds = %7, %.split
|
73 |
+
%58 = phi float [ %74, %.split ], [ 0.000000e+00, %7 ]
|
74 |
+
%59 = phi float [ %79, %.split ], [ 0.000000e+00, %7 ]
|
75 |
+
%60 = phi float [ %76, %.split ], [ 0.000000e+00, %7 ]
|
76 |
+
%61 = phi i32 [ %80, %.split ], [ 0, %7 ]
|
77 |
+
%62 = or i32 %61, %12, !dbg !25
|
78 |
+
%63 = add i32 %62, %24, !dbg !26
|
79 |
+
%64 = sext i32 %63 to i64, !dbg !27
|
80 |
+
%65 = getelementptr float, ptr addrspace(1) %2, i64 %64, !dbg !27
|
81 |
+
%66 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %65, i1 true, i32 0, i1 true) #5, !dbg !28
|
82 |
+
%67 = bitcast i32 %66 to float, !dbg !28
|
83 |
+
%68 = zext nneg i32 %62 to i64, !dbg !29
|
84 |
+
%69 = getelementptr float, ptr addrspace(1) %33, i64 %68, !dbg !30
|
85 |
+
%70 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %69, i1 true, i32 0, i1 true) #5, !dbg !31
|
86 |
+
%71 = bitcast i32 %70 to float, !dbg !31
|
87 |
+
%72 = fadd float %67, %71, !dbg !32
|
88 |
+
%73 = fsub float %72, %60, !dbg !33
|
89 |
+
%74 = fadd float %58, 1.000000e+00, !dbg !37
|
90 |
+
%75 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %73, float %74) #5, !dbg !38
|
91 |
+
%76 = fadd float %60, %75, !dbg !39
|
92 |
+
%77 = fsub float %72, %76, !dbg !40
|
93 |
+
%78 = fmul float %73, %77, !dbg !41
|
94 |
+
%79 = fadd float %59, %78, !dbg !42
|
95 |
+
%80 = add nuw nsw i32 %61, 4, !dbg !43
|
96 |
+
%81 = icmp ult i32 %61, 252, !dbg !43
|
97 |
+
br i1 %81, label %.split, label %.split5.us, !dbg !43
|
98 |
+
|
99 |
+
.split5.us: ; preds = %.split, %.split.us
|
100 |
+
%.us-phi = phi float [ %52, %.split.us ], [ %76, %.split ]
|
101 |
+
%.us-phi6 = phi float [ %55, %.split.us ], [ %79, %.split ]
|
102 |
+
%.us-phi7 = phi float [ %50, %.split.us ], [ %74, %.split ]
|
103 |
+
%82 = bitcast float %.us-phi to i32, !dbg !44
|
104 |
+
%83 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %82, i32 2, i32 31), !dbg !44
|
105 |
+
%84 = bitcast i32 %83 to float, !dbg !44
|
106 |
+
%85 = bitcast float %.us-phi6 to i32, !dbg !44
|
107 |
+
%86 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %85, i32 2, i32 31), !dbg !44
|
108 |
+
%87 = bitcast i32 %86 to float, !dbg !44
|
109 |
+
%88 = bitcast float %.us-phi7 to i32, !dbg !44
|
110 |
+
%89 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %88, i32 2, i32 31), !dbg !44
|
111 |
+
%90 = bitcast i32 %89 to float, !dbg !44
|
112 |
+
%91 = fsub float %84, %.us-phi, !dbg !46
|
113 |
+
%92 = fadd float %.us-phi7, %90, !dbg !50
|
114 |
+
%93 = fcmp oeq float %92, 0.000000e+00, !dbg !51
|
115 |
+
%94 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %90, float %92) #5, !dbg !52
|
116 |
+
%95 = select i1 %93, float 0.000000e+00, float %94, !dbg !53
|
117 |
+
%96 = fmul float %91, %95, !dbg !54
|
118 |
+
%97 = fadd float %.us-phi, %96, !dbg !55
|
119 |
+
%98 = fadd float %.us-phi6, %87, !dbg !56
|
120 |
+
%99 = fmul float %91, %91, !dbg !57
|
121 |
+
%100 = fmul float %.us-phi7, %99, !dbg !58
|
122 |
+
%101 = fmul float %100, %95, !dbg !59
|
123 |
+
%102 = fadd float %98, %101, !dbg !60
|
124 |
+
%103 = bitcast float %97 to i32, !dbg !44
|
125 |
+
%104 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %103, i32 1, i32 31), !dbg !44
|
126 |
+
%105 = bitcast i32 %104 to float, !dbg !44
|
127 |
+
%106 = bitcast float %102 to i32, !dbg !44
|
128 |
+
%107 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %106, i32 1, i32 31), !dbg !44
|
129 |
+
%108 = bitcast i32 %107 to float, !dbg !44
|
130 |
+
%109 = bitcast float %92 to i32, !dbg !44
|
131 |
+
%110 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %109, i32 1, i32 31), !dbg !44
|
132 |
+
%111 = bitcast i32 %110 to float, !dbg !44
|
133 |
+
%112 = fsub float %105, %97, !dbg !46
|
134 |
+
%113 = fadd float %92, %111, !dbg !50
|
135 |
+
%114 = fcmp oeq float %113, 0.000000e+00, !dbg !51
|
136 |
+
%115 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %111, float %113) #5, !dbg !52
|
137 |
+
%116 = select i1 %114, float 0.000000e+00, float %115, !dbg !53
|
138 |
+
%117 = fmul float %112, %116, !dbg !54
|
139 |
+
%118 = fadd float %97, %117, !dbg !55
|
140 |
+
%119 = fadd float %102, %108, !dbg !56
|
141 |
+
%120 = fmul float %112, %112, !dbg !57
|
142 |
+
%121 = fmul float %92, %120, !dbg !58
|
143 |
+
%122 = fmul float %116, %121, !dbg !59
|
144 |
+
%123 = fadd float %119, %122, !dbg !60
|
145 |
+
%124 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %123, float 2.560000e+02) #5, !dbg !61
|
146 |
+
%125 = fadd float %124, 0x3EE4F8B580000000, !dbg !62
|
147 |
+
%126 = shl i32 %15, 8, !dbg !63
|
148 |
+
br label %127, !dbg !64
|
149 |
+
|
150 |
+
127: ; preds = %.split5.us, %__nv_rsqrtf.exit
|
151 |
+
%128 = phi i32 [ 0, %.split5.us ], [ %157, %__nv_rsqrtf.exit ]
|
152 |
+
%129 = or i32 %128, %12, !dbg !65
|
153 |
+
%130 = add i32 %129, %24, !dbg !66
|
154 |
+
%131 = sext i32 %130 to i64, !dbg !67
|
155 |
+
%132 = getelementptr float, ptr addrspace(1) %2, i64 %131, !dbg !67
|
156 |
+
%133 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %132, i1 true, i32 0, i1 true) #5, !dbg !68
|
157 |
+
%134 = bitcast i32 %133 to float, !dbg !68
|
158 |
+
%135 = zext nneg i32 %129 to i64, !dbg !69
|
159 |
+
%136 = getelementptr float, ptr addrspace(1) %3, i64 %135, !dbg !69
|
160 |
+
%137 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %136, i1 true, i32 0, i1 true) #5, !dbg !70
|
161 |
+
%138 = bitcast i32 %137 to float, !dbg !70
|
162 |
+
br i1 %29, label %139, label %140, !dbg !71
|
163 |
+
|
164 |
+
139: ; preds = %127
|
165 |
+
tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 883, ptr nonnull @assertFunc_1, i64 1), !dbg !71
|
166 |
+
br label %140, !dbg !71
|
167 |
+
|
168 |
+
140: ; preds = %139, %127
|
169 |
+
%141 = getelementptr float, ptr addrspace(1) %33, i64 %135, !dbg !72
|
170 |
+
%142 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %141, i1 true, i32 0, i1 true) #5, !dbg !73
|
171 |
+
%143 = bitcast i32 %142 to float, !dbg !73
|
172 |
+
%144 = fadd float %134, %143, !dbg !74
|
173 |
+
%145 = fsub float %144, %118, !dbg !75
|
174 |
+
%146 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !76
|
175 |
+
%.not.i = icmp eq i32 %146, 0, !dbg !76
|
176 |
+
br i1 %.not.i, label %149, label %147, !dbg !76
|
177 |
+
|
178 |
+
147: ; preds = %140
|
179 |
+
%148 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %125), !dbg !76
|
180 |
+
br label %__nv_rsqrtf.exit, !dbg !76
|
181 |
+
|
182 |
+
149: ; preds = %140
|
183 |
+
%150 = tail call float @llvm.nvvm.rsqrt.approx.f(float %125), !dbg !76
|
184 |
+
br label %__nv_rsqrtf.exit, !dbg !76
|
185 |
+
|
186 |
+
__nv_rsqrtf.exit: ; preds = %147, %149
|
187 |
+
%.0.i = phi float [ %148, %147 ], [ %150, %149 ], !dbg !76
|
188 |
+
%151 = fmul float %145, %.0.i, !dbg !77
|
189 |
+
%152 = fmul float %151, %138, !dbg !78
|
190 |
+
%153 = add i32 %129, %126, !dbg !79
|
191 |
+
%154 = sext i32 %153 to i64, !dbg !80
|
192 |
+
%155 = getelementptr i16, ptr addrspace(1) %4, i64 %154, !dbg !80
|
193 |
+
%156 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %152) #5, !dbg !81
|
194 |
+
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %156, ptr addrspace(1) %155, i1 true) #5, !dbg !81
|
195 |
+
%157 = add nuw nsw i32 %128, 4, !dbg !64
|
196 |
+
%158 = icmp ult i32 %128, 252, !dbg !64
|
197 |
+
br i1 %158, label %127, label %159, !dbg !64
|
198 |
+
|
199 |
+
159: ; preds = %__nv_rsqrtf.exit
|
200 |
+
ret void, !dbg !82
|
201 |
+
}
|
202 |
+
|
203 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
204 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
205 |
+
|
206 |
+
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
|
207 |
+
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
|
208 |
+
|
209 |
+
; Function Attrs: alwaysinline nounwind
|
210 |
+
define float @__nv_rsqrtf(float %x) local_unnamed_addr #2 {
|
211 |
+
%1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5
|
212 |
+
%.not = icmp eq i32 %1, 0
|
213 |
+
br i1 %.not, label %4, label %2
|
214 |
+
|
215 |
+
2: ; preds = %0
|
216 |
+
%3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
|
217 |
+
br label %6
|
218 |
+
|
219 |
+
4: ; preds = %0
|
220 |
+
%5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
|
221 |
+
br label %6
|
222 |
+
|
223 |
+
6: ; preds = %4, %2
|
224 |
+
%.0 = phi float [ %3, %2 ], [ %5, %4 ]
|
225 |
+
ret float %.0
|
226 |
+
}
|
227 |
+
|
228 |
+
declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #3
|
229 |
+
|
230 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
|
231 |
+
declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #4
|
232 |
+
|
233 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
|
234 |
+
declare float @llvm.nvvm.rsqrt.approx.f(float) #4
|
235 |
+
|
236 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
237 |
+
attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
|
238 |
+
attributes #2 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
239 |
+
attributes #3 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
240 |
+
attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
|
241 |
+
attributes #5 = { nounwind }
|
242 |
+
|
243 |
+
!llvm.module.flags = !{!0, !1}
|
244 |
+
!llvm.dbg.cu = !{!2}
|
245 |
+
!nvvm.annotations = !{!4, !5, !5, !4}
|
246 |
+
!llvm.ident = !{!6}
|
247 |
+
|
248 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
249 |
+
!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
|
250 |
+
!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
251 |
+
!3 = !DIFile(filename: "cgx5lxpuexpindj4dsmjz5x42uhyy7iskevq7ovzpwagb3t5powj.py", directory: "/tmp/torchinductor_root/gx")
|
252 |
+
!4 = !{ptr @triton__0d1d2d3d4d5de6de, !"kernel", i32 1}
|
253 |
+
!5 = !{ptr @triton__0d1d2d3d4d5de6de, !"maxntidx", i32 256}
|
254 |
+
!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
|
255 |
+
!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5de6de", linkageName: "triton__0d1d2d3d4d5de6de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
|
256 |
+
!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
|
257 |
+
!9 = !{}
|
258 |
+
!10 = !DILocation(line: 22, column: 44, scope: !7)
|
259 |
+
!11 = !DILocation(line: 24, column: 33, scope: !7)
|
260 |
+
!12 = !DILocation(line: 21, column: 28, scope: !7)
|
261 |
+
!13 = !DILocation(line: 21, column: 33, scope: !7)
|
262 |
+
!14 = !DILocation(line: 22, column: 23, scope: !7)
|
263 |
+
!15 = !DILocation(line: 26, column: 30, scope: !7)
|
264 |
+
!16 = !DILocation(line: 26, column: 35, scope: !7)
|
265 |
+
!17 = !DILocation(line: 27, column: 18, scope: !7)
|
266 |
+
!18 = !DILocation(line: 35, column: 44, scope: !7)
|
267 |
+
!19 = !DILocation(line: 36, column: 22, scope: !7)
|
268 |
+
!20 = !DILocation(line: 37, column: 22, scope: !7)
|
269 |
+
!21 = !DILocation(line: 38, column: 36, scope: !7)
|
270 |
+
!22 = !DILocation(line: 39, column: 40, scope: !7)
|
271 |
+
!23 = !DILocation(line: 40, column: 44, scope: !7)
|
272 |
+
!24 = !DILocation(line: 39, column: 55, scope: !7)
|
273 |
+
!25 = !DILocation(line: 32, column: 27, scope: !7)
|
274 |
+
!26 = !DILocation(line: 35, column: 40, scope: !7)
|
275 |
+
!27 = !DILocation(line: 35, column: 34, scope: !7)
|
276 |
+
!28 = !DILocation(line: 35, column: 50, scope: !7)
|
277 |
+
!29 = !DILocation(line: 40, column: 40, scope: !7)
|
278 |
+
!30 = !DILocation(line: 40, column: 34, scope: !7)
|
279 |
+
!31 = !DILocation(line: 40, column: 52, scope: !7)
|
280 |
+
!32 = !DILocation(line: 41, column: 22, scope: !7)
|
281 |
+
!33 = !DILocation(line: 96, column: 20, scope: !34, inlinedAt: !36)
|
282 |
+
!34 = distinct !DILexicalBlockFile(scope: !7, file: !35, discriminator: 0)
|
283 |
+
!35 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
|
284 |
+
!36 = !DILocation(line: 44, column: 38, scope: !34)
|
285 |
+
!37 = !DILocation(line: 97, column: 26, scope: !34, inlinedAt: !36)
|
286 |
+
!38 = !DILocation(line: 98, column: 30, scope: !34, inlinedAt: !36)
|
287 |
+
!39 = !DILocation(line: 98, column: 22, scope: !34, inlinedAt: !36)
|
288 |
+
!40 = !DILocation(line: 101, column: 30, scope: !34, inlinedAt: !36)
|
289 |
+
!41 = !DILocation(line: 101, column: 22, scope: !34, inlinedAt: !36)
|
290 |
+
!42 = !DILocation(line: 47, column: 48, scope: !7)
|
291 |
+
!43 = !DILocation(line: 31, column: 36, scope: !7)
|
292 |
+
!44 = !DILocation(line: 120, column: 46, scope: !34, inlinedAt: !45)
|
293 |
+
!45 = !DILocation(line: 50, column: 41, scope: !34)
|
294 |
+
!46 = !DILocation(line: 108, column: 21, scope: !47, inlinedAt: !48)
|
295 |
+
!47 = distinct !DILexicalBlockFile(scope: !34, file: !35, discriminator: 0)
|
296 |
+
!48 = !DILocation(line: 120, column: 46, scope: !47, inlinedAt: !49)
|
297 |
+
!49 = !DILocation(line: 50, column: 41, scope: !47)
|
298 |
+
!50 = !DILocation(line: 109, column: 28, scope: !47, inlinedAt: !48)
|
299 |
+
!51 = !DILocation(line: 110, column: 39, scope: !47, inlinedAt: !48)
|
300 |
+
!52 = !DILocation(line: 110, column: 60, scope: !47, inlinedAt: !48)
|
301 |
+
!53 = !DILocation(line: 110, column: 49, scope: !47, inlinedAt: !48)
|
302 |
+
!54 = !DILocation(line: 112, column: 25, scope: !47, inlinedAt: !48)
|
303 |
+
!55 = !DILocation(line: 112, column: 17, scope: !47, inlinedAt: !48)
|
304 |
+
!56 = !DILocation(line: 113, column: 15, scope: !47, inlinedAt: !48)
|
305 |
+
!57 = !DILocation(line: 113, column: 30, scope: !47, inlinedAt: !48)
|
306 |
+
!58 = !DILocation(line: 113, column: 38, scope: !47, inlinedAt: !48)
|
307 |
+
!59 = !DILocation(line: 113, column: 49, scope: !47, inlinedAt: !48)
|
308 |
+
!60 = !DILocation(line: 113, column: 22, scope: !47, inlinedAt: !48)
|
309 |
+
!61 = !DILocation(line: 69, column: 23, scope: !7)
|
310 |
+
!62 = !DILocation(line: 71, column: 24, scope: !7)
|
311 |
+
!63 = !DILocation(line: 76, column: 39, scope: !7)
|
312 |
+
!64 = !DILocation(line: 55, column: 36, scope: !7)
|
313 |
+
!65 = !DILocation(line: 56, column: 27, scope: !7)
|
314 |
+
!66 = !DILocation(line: 59, column: 41, scope: !7)
|
315 |
+
!67 = !DILocation(line: 59, column: 35, scope: !7)
|
316 |
+
!68 = !DILocation(line: 59, column: 51, scope: !7)
|
317 |
+
!69 = !DILocation(line: 60, column: 35, scope: !7)
|
318 |
+
!70 = !DILocation(line: 60, column: 40, scope: !7)
|
319 |
+
!71 = !DILocation(line: 64, column: 57, scope: !7)
|
320 |
+
!72 = !DILocation(line: 65, column: 35, scope: !7)
|
321 |
+
!73 = !DILocation(line: 65, column: 54, scope: !7)
|
322 |
+
!74 = !DILocation(line: 66, column: 24, scope: !7)
|
323 |
+
!75 = !DILocation(line: 67, column: 24, scope: !7)
|
324 |
+
!76 = !DILocation(line: 72, column: 30, scope: !7)
|
325 |
+
!77 = !DILocation(line: 73, column: 24, scope: !7)
|
326 |
+
!78 = !DILocation(line: 74, column: 24, scope: !7)
|
327 |
+
!79 = !DILocation(line: 76, column: 35, scope: !7)
|
328 |
+
!80 = !DILocation(line: 76, column: 29, scope: !7)
|
329 |
+
!81 = !DILocation(line: 76, column: 52, scope: !7)
|
330 |
+
!82 = !DILocation(line: 55, column: 4, scope: !7)
|
.triton/dump/21d0195c63fb062bfc567b79c9bb2771/triton_.llir
ADDED
@@ -0,0 +1,321 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
@global_smem = external addrspace(3) global [0 x i8]
|
5 |
+
|
6 |
+
define void @triton__0d1d2d3d4d5d6d7d8de9de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, ptr addrspace(1) %7, i32 %8, i32 %9) local_unnamed_addr !dbg !5 {
|
7 |
+
%11 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
|
8 |
+
%12 = and i32 %11, 31, !dbg !8
|
9 |
+
%13 = lshr i32 %11, 5, !dbg !8
|
10 |
+
%14 = and i32 %13, 1, !dbg !8
|
11 |
+
%urem = shl i32 %11, 2, !dbg !8
|
12 |
+
%15 = and i32 %urem, 252, !dbg !8
|
13 |
+
%16 = or i32 %15, 1, !dbg !8
|
14 |
+
%17 = or i32 %15, 2, !dbg !8
|
15 |
+
%18 = or i32 %15, 3, !dbg !8
|
16 |
+
%19 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !9
|
17 |
+
%20 = shl i32 %19, 8, !dbg !10
|
18 |
+
%21 = or i32 %20, %15, !dbg !11
|
19 |
+
%22 = sext i32 %21 to i64, !dbg !12
|
20 |
+
%23 = getelementptr i16, ptr addrspace(1) %1, i64 %22, !dbg !12
|
21 |
+
%24 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %23, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !13
|
22 |
+
%25 = extractvalue { i32, i32 } %24, 0, !dbg !13
|
23 |
+
%26 = extractvalue { i32, i32 } %24, 1, !dbg !13
|
24 |
+
%27 = trunc i32 %25 to i16, !dbg !13
|
25 |
+
%extelt.offset = lshr i32 %25, 16, !dbg !13
|
26 |
+
%28 = trunc i32 %extelt.offset to i16, !dbg !13
|
27 |
+
%29 = trunc i32 %26 to i16, !dbg !13
|
28 |
+
%extelt.offset1 = lshr i32 %26, 16, !dbg !13
|
29 |
+
%30 = trunc i32 %extelt.offset1 to i16, !dbg !13
|
30 |
+
%31 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %27) #3, !dbg !14
|
31 |
+
%32 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %28) #3, !dbg !14
|
32 |
+
%33 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %29) #3, !dbg !14
|
33 |
+
%34 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %30) #3, !dbg !14
|
34 |
+
%35 = zext nneg i32 %15 to i64, !dbg !15
|
35 |
+
%36 = getelementptr float, ptr addrspace(1) %2, i64 %35, !dbg !15
|
36 |
+
%37 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %36, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !16
|
37 |
+
%38 = extractvalue { i32, i32, i32, i32 } %37, 0, !dbg !16
|
38 |
+
%39 = extractvalue { i32, i32, i32, i32 } %37, 1, !dbg !16
|
39 |
+
%40 = extractvalue { i32, i32, i32, i32 } %37, 2, !dbg !16
|
40 |
+
%41 = extractvalue { i32, i32, i32, i32 } %37, 3, !dbg !16
|
41 |
+
%42 = bitcast i32 %38 to float, !dbg !16
|
42 |
+
%43 = bitcast i32 %39 to float, !dbg !16
|
43 |
+
%44 = bitcast i32 %40 to float, !dbg !16
|
44 |
+
%45 = bitcast i32 %41 to float, !dbg !16
|
45 |
+
%46 = getelementptr float, ptr addrspace(1) %3, i64 %22, !dbg !17
|
46 |
+
%47 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %46, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !18
|
47 |
+
%48 = extractvalue { i32, i32, i32, i32 } %47, 0, !dbg !18
|
48 |
+
%49 = extractvalue { i32, i32, i32, i32 } %47, 1, !dbg !18
|
49 |
+
%50 = extractvalue { i32, i32, i32, i32 } %47, 2, !dbg !18
|
50 |
+
%51 = extractvalue { i32, i32, i32, i32 } %47, 3, !dbg !18
|
51 |
+
%52 = bitcast i32 %48 to float, !dbg !18
|
52 |
+
%53 = bitcast i32 %49 to float, !dbg !18
|
53 |
+
%54 = bitcast i32 %50 to float, !dbg !18
|
54 |
+
%55 = bitcast i32 %51 to float, !dbg !18
|
55 |
+
%56 = sext i32 %19 to i64, !dbg !19
|
56 |
+
%57 = getelementptr float, ptr addrspace(1) %4, i64 %56, !dbg !19
|
57 |
+
%58 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %57, i1 true) #3, !dbg !20
|
58 |
+
%59 = bitcast i32 %58 to float, !dbg !20
|
59 |
+
%60 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %57, i1 true) #3, !dbg !20
|
60 |
+
%61 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %57, i1 true) #3, !dbg !20
|
61 |
+
%62 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %57, i1 true) #3, !dbg !20
|
62 |
+
%63 = getelementptr float, ptr addrspace(1) %5, i64 %56, !dbg !21
|
63 |
+
%64 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %63, i1 true) #3, !dbg !22
|
64 |
+
%65 = bitcast i32 %64 to float, !dbg !22
|
65 |
+
%66 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %63, i1 true) #3, !dbg !22
|
66 |
+
%67 = bitcast i32 %66 to float, !dbg !22
|
67 |
+
%68 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %63, i1 true) #3, !dbg !22
|
68 |
+
%69 = bitcast i32 %68 to float, !dbg !22
|
69 |
+
%70 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %63, i1 true) #3, !dbg !22
|
70 |
+
%71 = bitcast i32 %70 to float, !dbg !22
|
71 |
+
%72 = getelementptr i64, ptr addrspace(1) %6, i64 %56, !dbg !23
|
72 |
+
%73 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %72, i1 true) #3, !dbg !24
|
73 |
+
%74 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %72, i1 true) #3, !dbg !24
|
74 |
+
%75 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %72, i1 true) #3, !dbg !24
|
75 |
+
%76 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %72, i1 true) #3, !dbg !24
|
76 |
+
%77 = getelementptr float, ptr addrspace(1) %0, i64 %22, !dbg !25
|
77 |
+
%78 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %77, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !26
|
78 |
+
%79 = extractvalue { i32, i32, i32, i32 } %78, 0, !dbg !26
|
79 |
+
%80 = extractvalue { i32, i32, i32, i32 } %78, 1, !dbg !26
|
80 |
+
%81 = extractvalue { i32, i32, i32, i32 } %78, 2, !dbg !26
|
81 |
+
%82 = extractvalue { i32, i32, i32, i32 } %78, 3, !dbg !26
|
82 |
+
%83 = bitcast i32 %79 to float, !dbg !26
|
83 |
+
%84 = bitcast i32 %80 to float, !dbg !26
|
84 |
+
%85 = bitcast i32 %81 to float, !dbg !26
|
85 |
+
%86 = bitcast i32 %82 to float, !dbg !26
|
86 |
+
%87 = fmul float %31, %42, !dbg !27
|
87 |
+
%88 = fmul float %32, %43, !dbg !27
|
88 |
+
%89 = fmul float %33, %44, !dbg !27
|
89 |
+
%90 = fmul float %34, %45, !dbg !27
|
90 |
+
%91 = fadd float %87, %88, !dbg !28
|
91 |
+
%92 = fadd float %89, %91, !dbg !28
|
92 |
+
%93 = fadd float %90, %92, !dbg !28
|
93 |
+
%94 = bitcast float %93 to i32, !dbg !34
|
94 |
+
%95 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %94, i32 16, i32 31), !dbg !34
|
95 |
+
%96 = bitcast i32 %95 to float, !dbg !34
|
96 |
+
%97 = fadd float %93, %96, !dbg !28
|
97 |
+
%98 = bitcast float %97 to i32, !dbg !34
|
98 |
+
%99 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %98, i32 8, i32 31), !dbg !34
|
99 |
+
%100 = bitcast i32 %99 to float, !dbg !34
|
100 |
+
%101 = fadd float %97, %100, !dbg !28
|
101 |
+
%102 = bitcast float %101 to i32, !dbg !34
|
102 |
+
%103 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %102, i32 4, i32 31), !dbg !34
|
103 |
+
%104 = bitcast i32 %103 to float, !dbg !34
|
104 |
+
%105 = fadd float %101, %104, !dbg !28
|
105 |
+
%106 = bitcast float %105 to i32, !dbg !34
|
106 |
+
%107 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %106, i32 2, i32 31), !dbg !34
|
107 |
+
%108 = bitcast i32 %107 to float, !dbg !34
|
108 |
+
%109 = fadd float %105, %108, !dbg !28
|
109 |
+
%110 = bitcast float %109 to i32, !dbg !34
|
110 |
+
%111 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %110, i32 1, i32 31), !dbg !34
|
111 |
+
%112 = bitcast i32 %111 to float, !dbg !34
|
112 |
+
%113 = fadd float %109, %112, !dbg !28
|
113 |
+
%114 = icmp eq i32 %12, 0, !dbg !34
|
114 |
+
%115 = zext nneg i32 %14 to i64, !dbg !34
|
115 |
+
%116 = getelementptr float, ptr addrspace(3) @global_smem, i64 %115, !dbg !34
|
116 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %116, float %113, i1 %114) #3, !dbg !34
|
117 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !34
|
118 |
+
%117 = icmp slt i32 %11, 2, !dbg !34
|
119 |
+
%118 = sext i32 %11 to i64, !dbg !34
|
120 |
+
%119 = getelementptr float, ptr addrspace(3) @global_smem, i64 %118, !dbg !34
|
121 |
+
%120 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %119, i1 %117) #3, !dbg !34
|
122 |
+
%121 = bitcast float %120 to i32, !dbg !34
|
123 |
+
%122 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %121, i32 1, i32 31), !dbg !34
|
124 |
+
%123 = bitcast i32 %122 to float, !dbg !34
|
125 |
+
%124 = fadd float %120, %123, !dbg !28
|
126 |
+
%125 = and i32 %11, 1, !dbg !34
|
127 |
+
%126 = icmp eq i32 %125, 0, !dbg !34
|
128 |
+
%127 = and i1 %117, %126, !dbg !34
|
129 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %119, float %124, i1 %127) #3, !dbg !34
|
130 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !34
|
131 |
+
%128 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !34
|
132 |
+
%129 = fadd float %128, 0.000000e+00, !dbg !36
|
133 |
+
%130 = fsub float %52, %59, !dbg !40
|
134 |
+
%131 = fsub float %53, %59, !dbg !40
|
135 |
+
%132 = fsub float %54, %59, !dbg !40
|
136 |
+
%133 = fsub float %55, %59, !dbg !40
|
137 |
+
%134 = fmul float %130, %65, !dbg !41
|
138 |
+
%135 = fmul float %131, %65, !dbg !41
|
139 |
+
%136 = fmul float %132, %65, !dbg !41
|
140 |
+
%137 = fmul float %133, %65, !dbg !41
|
141 |
+
%138 = fmul float %87, %134, !dbg !42
|
142 |
+
%139 = fmul float %88, %135, !dbg !42
|
143 |
+
%140 = fmul float %89, %136, !dbg !42
|
144 |
+
%141 = fmul float %90, %137, !dbg !42
|
145 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !43
|
146 |
+
%142 = fadd float %138, %139, !dbg !45
|
147 |
+
%143 = fadd float %140, %142, !dbg !45
|
148 |
+
%144 = fadd float %141, %143, !dbg !45
|
149 |
+
%145 = bitcast float %144 to i32, !dbg !43
|
150 |
+
%146 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %145, i32 16, i32 31), !dbg !43
|
151 |
+
%147 = bitcast i32 %146 to float, !dbg !43
|
152 |
+
%148 = fadd float %144, %147, !dbg !45
|
153 |
+
%149 = bitcast float %148 to i32, !dbg !43
|
154 |
+
%150 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %149, i32 8, i32 31), !dbg !43
|
155 |
+
%151 = bitcast i32 %150 to float, !dbg !43
|
156 |
+
%152 = fadd float %148, %151, !dbg !45
|
157 |
+
%153 = bitcast float %152 to i32, !dbg !43
|
158 |
+
%154 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %153, i32 4, i32 31), !dbg !43
|
159 |
+
%155 = bitcast i32 %154 to float, !dbg !43
|
160 |
+
%156 = fadd float %152, %155, !dbg !45
|
161 |
+
%157 = bitcast float %156 to i32, !dbg !43
|
162 |
+
%158 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %157, i32 2, i32 31), !dbg !43
|
163 |
+
%159 = bitcast i32 %158 to float, !dbg !43
|
164 |
+
%160 = fadd float %156, %159, !dbg !45
|
165 |
+
%161 = bitcast float %160 to i32, !dbg !43
|
166 |
+
%162 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %161, i32 1, i32 31), !dbg !43
|
167 |
+
%163 = bitcast i32 %162 to float, !dbg !43
|
168 |
+
%164 = fadd float %160, %163, !dbg !45
|
169 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %116, float %164, i1 %114) #3, !dbg !43
|
170 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !43
|
171 |
+
%165 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %119, i1 %117) #3, !dbg !43
|
172 |
+
%166 = bitcast float %165 to i32, !dbg !43
|
173 |
+
%167 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %166, i32 1, i32 31), !dbg !43
|
174 |
+
%168 = bitcast i32 %167 to float, !dbg !43
|
175 |
+
%169 = fadd float %165, %168, !dbg !45
|
176 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %119, float %169, i1 %127) #3, !dbg !43
|
177 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !43
|
178 |
+
%170 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !43
|
179 |
+
%171 = fadd float %170, 0.000000e+00, !dbg !48
|
180 |
+
%172 = icmp eq i64 %73, -1, !dbg !50
|
181 |
+
%173 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %65, float 2.560000e+02) #3, !dbg !51
|
182 |
+
%174 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %67, float 2.560000e+02) #3, !dbg !51
|
183 |
+
%175 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %69, float 2.560000e+02) #3, !dbg !51
|
184 |
+
%176 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %71, float 2.560000e+02) #3, !dbg !51
|
185 |
+
%177 = fmul float %87, 2.560000e+02, !dbg !52
|
186 |
+
%178 = fmul float %88, 2.560000e+02, !dbg !52
|
187 |
+
%179 = fmul float %89, 2.560000e+02, !dbg !52
|
188 |
+
%180 = fmul float %90, 2.560000e+02, !dbg !52
|
189 |
+
%181 = fsub float %177, %129, !dbg !53
|
190 |
+
%182 = fsub float %178, %129, !dbg !53
|
191 |
+
%183 = fsub float %179, %129, !dbg !53
|
192 |
+
%184 = fsub float %180, %129, !dbg !53
|
193 |
+
%185 = fmul float %134, %171, !dbg !54
|
194 |
+
%186 = fmul float %135, %171, !dbg !54
|
195 |
+
%187 = fmul float %136, %171, !dbg !54
|
196 |
+
%188 = fmul float %137, %171, !dbg !54
|
197 |
+
%189 = fsub float %181, %185, !dbg !55
|
198 |
+
%190 = fsub float %182, %186, !dbg !55
|
199 |
+
%191 = fsub float %183, %187, !dbg !55
|
200 |
+
%192 = fsub float %184, %188, !dbg !55
|
201 |
+
%193 = fmul float %173, %189, !dbg !56
|
202 |
+
%194 = fmul float %173, %190, !dbg !56
|
203 |
+
%195 = fmul float %173, %191, !dbg !56
|
204 |
+
%196 = fmul float %173, %192, !dbg !56
|
205 |
+
%197 = fadd float %193, %83, !dbg !57
|
206 |
+
%198 = fadd float %194, %84, !dbg !57
|
207 |
+
%199 = fadd float %195, %85, !dbg !57
|
208 |
+
%200 = fadd float %196, %86, !dbg !57
|
209 |
+
%201 = select i1 %172, float 0.000000e+00, float %197, !dbg !58
|
210 |
+
%202 = select i1 %172, float 0.000000e+00, float %198, !dbg !58
|
211 |
+
%203 = select i1 %172, float 0.000000e+00, float %199, !dbg !58
|
212 |
+
%204 = select i1 %172, float 0.000000e+00, float %200, !dbg !58
|
213 |
+
%205 = icmp slt i64 %73, 0, !dbg !59
|
214 |
+
%206 = shl i64 %73, 8, !dbg !60
|
215 |
+
%207 = add i64 %206, 12865792, !dbg !60
|
216 |
+
%208 = select i1 %205, i64 %207, i64 %206, !dbg !60
|
217 |
+
%209 = zext nneg i32 %16 to i64
|
218 |
+
%210 = zext nneg i32 %17 to i64
|
219 |
+
%211 = zext nneg i32 %18 to i64
|
220 |
+
%212 = or i64 %208, %35, !dbg !61
|
221 |
+
%213 = or i64 %208, %209, !dbg !61
|
222 |
+
%214 = or i64 %208, %210, !dbg !61
|
223 |
+
%215 = or i64 %208, %211, !dbg !61
|
224 |
+
%216 = getelementptr float, ptr addrspace(1) %7, i64 %212, !dbg !62
|
225 |
+
%217 = getelementptr float, ptr addrspace(1) %7, i64 %213, !dbg !62
|
226 |
+
%218 = getelementptr float, ptr addrspace(1) %7, i64 %214, !dbg !62
|
227 |
+
%219 = getelementptr float, ptr addrspace(1) %7, i64 %215, !dbg !62
|
228 |
+
%220 = insertelement <1 x float> undef, float %201, i64 0, !dbg !63
|
229 |
+
%221 = tail call float asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 atom.global.gpu.acq_rel.add.f32 $0, [ $1 + 0 ], $2;", "=r,l,r,b"(ptr addrspace(1) %216, <1 x float> %220, i1 true) #3, !dbg !63
|
230 |
+
%222 = insertelement <1 x float> undef, float %202, i64 0, !dbg !63
|
231 |
+
%223 = tail call float asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 atom.global.gpu.acq_rel.add.f32 $0, [ $1 + 0 ], $2;", "=r,l,r,b"(ptr addrspace(1) %217, <1 x float> %222, i1 true) #3, !dbg !63
|
232 |
+
%224 = insertelement <1 x float> undef, float %203, i64 0, !dbg !63
|
233 |
+
%225 = tail call float asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 atom.global.gpu.acq_rel.add.f32 $0, [ $1 + 0 ], $2;", "=r,l,r,b"(ptr addrspace(1) %218, <1 x float> %224, i1 true) #3, !dbg !63
|
234 |
+
%226 = insertelement <1 x float> undef, float %204, i64 0, !dbg !63
|
235 |
+
%227 = tail call float asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 atom.global.gpu.acq_rel.add.f32 $0, [ $1 + 0 ], $2;", "=r,l,r,b"(ptr addrspace(1) %219, <1 x float> %226, i1 true) #3, !dbg !63
|
236 |
+
ret void, !dbg !64
|
237 |
+
}
|
238 |
+
|
239 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
240 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
241 |
+
|
242 |
+
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
|
243 |
+
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
|
244 |
+
|
245 |
+
; Function Attrs: convergent nocallback nounwind
|
246 |
+
declare void @llvm.nvvm.barrier0() #2
|
247 |
+
|
248 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
249 |
+
attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
|
250 |
+
attributes #2 = { convergent nocallback nounwind }
|
251 |
+
attributes #3 = { nounwind }
|
252 |
+
|
253 |
+
!llvm.module.flags = !{!0}
|
254 |
+
!llvm.dbg.cu = !{!1}
|
255 |
+
!nvvm.annotations = !{!3, !4, !4, !3}
|
256 |
+
|
257 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
258 |
+
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
259 |
+
!2 = !DIFile(filename: "cqryxm46jcxyr3qdktqirn53eap7h3pjjqiqavyqqyvflabjpvmd.py", directory: "/tmp/torchinductor_root/qr")
|
260 |
+
!3 = !{ptr @triton__0d1d2d3d4d5d6d7d8de9de, !"kernel", i32 1}
|
261 |
+
!4 = !{ptr @triton__0d1d2d3d4d5d6d7d8de9de, !"maxntidx", i32 64}
|
262 |
+
!5 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6d7d8de9de", linkageName: "triton__0d1d2d3d4d5d6d7d8de9de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
|
263 |
+
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
|
264 |
+
!7 = !{}
|
265 |
+
!8 = !DILocation(line: 26, column: 26, scope: !5)
|
266 |
+
!9 = !DILocation(line: 23, column: 28, scope: !5)
|
267 |
+
!10 = !DILocation(line: 30, column: 40, scope: !5)
|
268 |
+
!11 = !DILocation(line: 30, column: 36, scope: !5)
|
269 |
+
!12 = !DILocation(line: 30, column: 30, scope: !5)
|
270 |
+
!13 = !DILocation(line: 30, column: 46, scope: !5)
|
271 |
+
!14 = !DILocation(line: 30, column: 67, scope: !5)
|
272 |
+
!15 = !DILocation(line: 31, column: 30, scope: !5)
|
273 |
+
!16 = !DILocation(line: 31, column: 35, scope: !5)
|
274 |
+
!17 = !DILocation(line: 32, column: 30, scope: !5)
|
275 |
+
!18 = !DILocation(line: 32, column: 46, scope: !5)
|
276 |
+
!19 = !DILocation(line: 33, column: 30, scope: !5)
|
277 |
+
!20 = !DILocation(line: 33, column: 35, scope: !5)
|
278 |
+
!21 = !DILocation(line: 34, column: 31, scope: !5)
|
279 |
+
!22 = !DILocation(line: 34, column: 36, scope: !5)
|
280 |
+
!23 = !DILocation(line: 35, column: 31, scope: !5)
|
281 |
+
!24 = !DILocation(line: 35, column: 36, scope: !5)
|
282 |
+
!25 = !DILocation(line: 36, column: 35, scope: !5)
|
283 |
+
!26 = !DILocation(line: 36, column: 51, scope: !5)
|
284 |
+
!27 = !DILocation(line: 38, column: 18, scope: !5)
|
285 |
+
!28 = !DILocation(line: 233, column: 15, scope: !29, inlinedAt: !32)
|
286 |
+
!29 = distinct !DILexicalBlockFile(scope: !31, file: !30, discriminator: 0)
|
287 |
+
!30 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
|
288 |
+
!31 = distinct !DILexicalBlockFile(scope: !5, file: !30, discriminator: 0)
|
289 |
+
!32 = !DILocation(line: 243, column: 36, scope: !29, inlinedAt: !33)
|
290 |
+
!33 = !DILocation(line: 41, column: 57, scope: !29)
|
291 |
+
!34 = !DILocation(line: 243, column: 36, scope: !31, inlinedAt: !35)
|
292 |
+
!35 = !DILocation(line: 41, column: 57, scope: !31)
|
293 |
+
!36 = !DILocation(line: 8, column: 15, scope: !37, inlinedAt: !39)
|
294 |
+
!37 = distinct !DILexicalBlockFile(scope: !5, file: !38, discriminator: 0)
|
295 |
+
!38 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
|
296 |
+
!39 = !DILocation(line: 41, column: 44, scope: !37)
|
297 |
+
!40 = !DILocation(line: 42, column: 19, scope: !5)
|
298 |
+
!41 = !DILocation(line: 43, column: 20, scope: !5)
|
299 |
+
!42 = !DILocation(line: 44, column: 19, scope: !5)
|
300 |
+
!43 = !DILocation(line: 243, column: 36, scope: !31, inlinedAt: !44)
|
301 |
+
!44 = !DILocation(line: 47, column: 59, scope: !31)
|
302 |
+
!45 = !DILocation(line: 233, column: 15, scope: !29, inlinedAt: !46)
|
303 |
+
!46 = !DILocation(line: 243, column: 36, scope: !29, inlinedAt: !47)
|
304 |
+
!47 = !DILocation(line: 47, column: 59, scope: !29)
|
305 |
+
!48 = !DILocation(line: 8, column: 15, scope: !37, inlinedAt: !49)
|
306 |
+
!49 = !DILocation(line: 47, column: 45, scope: !37)
|
307 |
+
!50 = !DILocation(line: 49, column: 21, scope: !5)
|
308 |
+
!51 = !DILocation(line: 51, column: 20, scope: !5)
|
309 |
+
!52 = !DILocation(line: 52, column: 19, scope: !5)
|
310 |
+
!53 = !DILocation(line: 53, column: 20, scope: !5)
|
311 |
+
!54 = !DILocation(line: 54, column: 20, scope: !5)
|
312 |
+
!55 = !DILocation(line: 55, column: 20, scope: !5)
|
313 |
+
!56 = !DILocation(line: 56, column: 20, scope: !5)
|
314 |
+
!57 = !DILocation(line: 57, column: 20, scope: !5)
|
315 |
+
!58 = !DILocation(line: 59, column: 35, scope: !5)
|
316 |
+
!59 = !DILocation(line: 61, column: 20, scope: !5)
|
317 |
+
!60 = !DILocation(line: 63, column: 56, scope: !5)
|
318 |
+
!61 = !DILocation(line: 63, column: 52, scope: !5)
|
319 |
+
!62 = !DILocation(line: 63, column: 30, scope: !5)
|
320 |
+
!63 = !DILocation(line: 63, column: 83, scope: !5)
|
321 |
+
!64 = !DILocation(line: 63, column: 4, scope: !5)
|
.triton/dump/21d0195c63fb062bfc567b79c9bb2771/triton_.ttir
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1d2d3d4d5d6d7d8de9de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg7: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg9: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
3 |
+
%c256_i32 = arith.constant 256 : i32
|
4 |
+
%cst = arith.constant dense<0.000000e+00> : tensor<256xbf16>
|
5 |
+
%cst_0 = arith.constant 0.000000e+00 : f32
|
6 |
+
%cst_1 = arith.constant dense<256> : tensor<1xi64>
|
7 |
+
%cst_2 = arith.constant dense<0> : tensor<1xi64>
|
8 |
+
%cst_3 = arith.constant dense<50257> : tensor<1xi64>
|
9 |
+
%cst_4 = arith.constant dense<0.000000e+00> : tensor<256xf32>
|
10 |
+
%cst_5 = arith.constant dense<2.560000e+02> : tensor<256xf32>
|
11 |
+
%cst_6 = arith.constant dense<2.560000e+02> : tensor<1xf32>
|
12 |
+
%cst_7 = arith.constant dense<-1> : tensor<1xi64>
|
13 |
+
%cst_8 = arith.constant dense<256> : tensor<256xi32>
|
14 |
+
%0 = tt.get_program_id x : i32
|
15 |
+
%1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
|
16 |
+
%2 = arith.cmpi slt, %1, %cst_8 : tensor<256xi32>
|
17 |
+
%3 = arith.muli %0, %c256_i32 : i32
|
18 |
+
%4 = tt.splat %3 : (i32) -> tensor<256xi32>
|
19 |
+
%5 = arith.addi %1, %4 : tensor<256xi32>
|
20 |
+
%6 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
|
21 |
+
%7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
|
22 |
+
%8 = tt.load %7, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
|
23 |
+
%9 = arith.extf %8 : tensor<256xbf16> to tensor<256xf32>
|
24 |
+
%10 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
|
25 |
+
%11 = tt.addptr %10, %1 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
|
26 |
+
%12 = tt.load %11, %2, %cst_4 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32>
|
27 |
+
%13 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
|
28 |
+
%14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
|
29 |
+
%15 = tt.load %14, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32>
|
30 |
+
%16 = tt.addptr %arg4, %0 : !tt.ptr<f32, 1>, i32
|
31 |
+
%17 = tt.splat %16 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>>
|
32 |
+
%18 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32>
|
33 |
+
%19 = tt.addptr %arg5, %0 : !tt.ptr<f32, 1>, i32
|
34 |
+
%20 = tt.splat %19 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>>
|
35 |
+
%21 = tt.load %20 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32>
|
36 |
+
%22 = tt.addptr %arg6, %0 : !tt.ptr<i64, 1>, i32
|
37 |
+
%23 = tt.splat %22 : (!tt.ptr<i64, 1>) -> tensor<1x!tt.ptr<i64, 1>>
|
38 |
+
%24 = tt.load %23 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xi64>
|
39 |
+
%25 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
|
40 |
+
%26 = tt.addptr %25, %5 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
|
41 |
+
%27 = tt.load %26, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32>
|
42 |
+
%28 = arith.mulf %9, %12 : tensor<256xf32>
|
43 |
+
%29 = arith.select %2, %28, %cst_4 : tensor<256xi1>, tensor<256xf32>
|
44 |
+
%30 = "tt.reduce"(%29) <{axis = 0 : i32}> ({
|
45 |
+
^bb0(%arg10: f32, %arg11: f32):
|
46 |
+
%63 = arith.addf %arg10, %arg11 : f32
|
47 |
+
tt.reduce.return %63 : f32
|
48 |
+
}) : (tensor<256xf32>) -> f32
|
49 |
+
%31 = arith.addf %30, %cst_0 : f32
|
50 |
+
%32 = tt.broadcast %18 : (tensor<1xf32>) -> tensor<256xf32>
|
51 |
+
%33 = arith.subf %15, %32 : tensor<256xf32>
|
52 |
+
%34 = tt.broadcast %21 : (tensor<1xf32>) -> tensor<256xf32>
|
53 |
+
%35 = arith.mulf %33, %34 : tensor<256xf32>
|
54 |
+
%36 = arith.mulf %28, %35 : tensor<256xf32>
|
55 |
+
%37 = arith.select %2, %36, %cst_4 : tensor<256xi1>, tensor<256xf32>
|
56 |
+
%38 = "tt.reduce"(%37) <{axis = 0 : i32}> ({
|
57 |
+
^bb0(%arg10: f32, %arg11: f32):
|
58 |
+
%63 = arith.addf %arg10, %arg11 : f32
|
59 |
+
tt.reduce.return %63 : f32
|
60 |
+
}) : (tensor<256xf32>) -> f32
|
61 |
+
%39 = arith.addf %38, %cst_0 : f32
|
62 |
+
%40 = arith.cmpi eq, %24, %cst_7 : tensor<1xi64>
|
63 |
+
%41 = arith.divf %21, %cst_6 : tensor<1xf32>
|
64 |
+
%42 = arith.mulf %28, %cst_5 : tensor<256xf32>
|
65 |
+
%43 = tt.splat %31 : (f32) -> tensor<256xf32>
|
66 |
+
%44 = arith.subf %42, %43 : tensor<256xf32>
|
67 |
+
%45 = tt.splat %39 : (f32) -> tensor<256xf32>
|
68 |
+
%46 = arith.mulf %35, %45 : tensor<256xf32>
|
69 |
+
%47 = arith.subf %44, %46 : tensor<256xf32>
|
70 |
+
%48 = tt.broadcast %41 : (tensor<1xf32>) -> tensor<256xf32>
|
71 |
+
%49 = arith.mulf %48, %47 : tensor<256xf32>
|
72 |
+
%50 = arith.addf %27, %49 : tensor<256xf32>
|
73 |
+
%51 = tt.broadcast %40 : (tensor<1xi1>) -> tensor<256xi1>
|
74 |
+
%52 = arith.select %51, %cst_4, %50 : tensor<256xi1>, tensor<256xf32>
|
75 |
+
%53 = arith.addi %24, %cst_3 : tensor<1xi64>
|
76 |
+
%54 = arith.cmpi slt, %24, %cst_2 : tensor<1xi64>
|
77 |
+
%55 = arith.select %54, %53, %24 : tensor<1xi1>, tensor<1xi64>
|
78 |
+
%56 = arith.muli %55, %cst_1 : tensor<1xi64>
|
79 |
+
%57 = tt.broadcast %56 : (tensor<1xi64>) -> tensor<256xi64>
|
80 |
+
%58 = arith.extsi %1 : tensor<256xi32> to tensor<256xi64>
|
81 |
+
%59 = arith.addi %58, %57 : tensor<256xi64>
|
82 |
+
%60 = tt.splat %arg7 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
|
83 |
+
%61 = tt.addptr %60, %59 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi64>
|
84 |
+
%62 = "tt.atomic_rmw"(%61, %52, %2) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<256x!tt.ptr<f32, 1>>, tensor<256xf32>, tensor<256xi1>) -> tensor<256xf32>
|
85 |
+
tt.return
|
86 |
+
}
|
87 |
+
}
|
.triton/dump/345a87a492fd703c73ab83265a21fcb6/triton_.cubin
ADDED
Binary file (52.2 kB). View file
|
|
.triton/dump/345a87a492fd703c73ab83265a21fcb6/triton_.llir
ADDED
@@ -0,0 +1,980 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
@assertFunc_1 = internal constant [25 x i8] c"_call_with_frames_removed"
|
5 |
+
@assertFile_1 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
|
6 |
+
@assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp13 < 50257"
|
7 |
+
@assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed"
|
8 |
+
@assertFile_0 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
|
9 |
+
@assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257"
|
10 |
+
@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8]
|
11 |
+
@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
|
12 |
+
|
13 |
+
declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr
|
14 |
+
|
15 |
+
define void @triton__0d1d2d3d4d5de6de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i32 %5, i32 %6) local_unnamed_addr !dbg !7 {
|
16 |
+
%8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
|
17 |
+
%9 = lshr i32 %8, 5, !dbg !10
|
18 |
+
%10 = and i32 %9, 7, !dbg !10
|
19 |
+
%11 = and i32 %8, 15, !dbg !10
|
20 |
+
%12 = shl i32 %8, 3, !dbg !11
|
21 |
+
%13 = and i32 %12, 248, !dbg !11
|
22 |
+
%14 = or i32 %13, 4, !dbg !11
|
23 |
+
%urem = and i32 %8, 255, !dbg !11
|
24 |
+
%15 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !12
|
25 |
+
%16 = shl i32 %15, 4, !dbg !13
|
26 |
+
%17 = or i32 %16, %10, !dbg !14
|
27 |
+
%18 = or i32 %17, 8, !dbg !14
|
28 |
+
%19 = or i32 %16, %11, !dbg !14
|
29 |
+
%20 = sext i32 %17 to i64, !dbg !15
|
30 |
+
%21 = getelementptr i64, ptr addrspace(1) %0, i64 %20, !dbg !15
|
31 |
+
%22 = sext i32 %18 to i64, !dbg !15
|
32 |
+
%23 = getelementptr i64, ptr addrspace(1) %0, i64 %22, !dbg !15
|
33 |
+
%24 = sext i32 %19 to i64, !dbg !15
|
34 |
+
%25 = getelementptr i64, ptr addrspace(1) %0, i64 %24, !dbg !15
|
35 |
+
%26 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %21, i1 true) #6, !dbg !16
|
36 |
+
%27 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %21, i1 true) #6, !dbg !16
|
37 |
+
%28 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %21, i1 true) #6, !dbg !16
|
38 |
+
%29 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %21, i1 true) #6, !dbg !16
|
39 |
+
%30 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %21, i1 true) #6, !dbg !16
|
40 |
+
%31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %21, i1 true) #6, !dbg !16
|
41 |
+
%32 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %21, i1 true) #6, !dbg !16
|
42 |
+
%33 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %21, i1 true) #6, !dbg !16
|
43 |
+
%34 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !16
|
44 |
+
%35 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !16
|
45 |
+
%36 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !16
|
46 |
+
%37 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !16
|
47 |
+
%38 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !16
|
48 |
+
%39 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !16
|
49 |
+
%40 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !16
|
50 |
+
%41 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !16
|
51 |
+
%42 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #6, !dbg !16
|
52 |
+
%43 = srem i32 %17, 512, !dbg !17
|
53 |
+
%44 = srem i32 %18, 512, !dbg !17
|
54 |
+
%45 = shl nsw i32 %43, 8, !dbg !18
|
55 |
+
%46 = shl nsw i32 %44, 8, !dbg !18
|
56 |
+
%47 = or i32 %45, %13, !dbg !19
|
57 |
+
%48 = or i32 %45, %14, !dbg !19
|
58 |
+
%49 = or i32 %46, %13, !dbg !19
|
59 |
+
%50 = or i32 %46, %14, !dbg !19
|
60 |
+
%51 = sext i32 %47 to i64, !dbg !20
|
61 |
+
%52 = getelementptr float, ptr addrspace(1) %2, i64 %51, !dbg !20
|
62 |
+
%53 = sext i32 %48 to i64, !dbg !20
|
63 |
+
%54 = getelementptr float, ptr addrspace(1) %2, i64 %53, !dbg !20
|
64 |
+
%55 = sext i32 %49 to i64, !dbg !20
|
65 |
+
%56 = getelementptr float, ptr addrspace(1) %2, i64 %55, !dbg !20
|
66 |
+
%57 = sext i32 %50 to i64, !dbg !20
|
67 |
+
%58 = getelementptr float, ptr addrspace(1) %2, i64 %57, !dbg !20
|
68 |
+
%59 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %52, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !21
|
69 |
+
%60 = extractvalue { i32, i32, i32, i32 } %59, 0, !dbg !21
|
70 |
+
%61 = extractvalue { i32, i32, i32, i32 } %59, 1, !dbg !21
|
71 |
+
%62 = extractvalue { i32, i32, i32, i32 } %59, 2, !dbg !21
|
72 |
+
%63 = extractvalue { i32, i32, i32, i32 } %59, 3, !dbg !21
|
73 |
+
%64 = bitcast i32 %60 to float, !dbg !21
|
74 |
+
%65 = bitcast i32 %61 to float, !dbg !21
|
75 |
+
%66 = bitcast i32 %62 to float, !dbg !21
|
76 |
+
%67 = bitcast i32 %63 to float, !dbg !21
|
77 |
+
%68 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %54, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !21
|
78 |
+
%69 = extractvalue { i32, i32, i32, i32 } %68, 0, !dbg !21
|
79 |
+
%70 = extractvalue { i32, i32, i32, i32 } %68, 1, !dbg !21
|
80 |
+
%71 = extractvalue { i32, i32, i32, i32 } %68, 2, !dbg !21
|
81 |
+
%72 = extractvalue { i32, i32, i32, i32 } %68, 3, !dbg !21
|
82 |
+
%73 = bitcast i32 %69 to float, !dbg !21
|
83 |
+
%74 = bitcast i32 %70 to float, !dbg !21
|
84 |
+
%75 = bitcast i32 %71 to float, !dbg !21
|
85 |
+
%76 = bitcast i32 %72 to float, !dbg !21
|
86 |
+
%77 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %56, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !21
|
87 |
+
%78 = extractvalue { i32, i32, i32, i32 } %77, 0, !dbg !21
|
88 |
+
%79 = extractvalue { i32, i32, i32, i32 } %77, 1, !dbg !21
|
89 |
+
%80 = extractvalue { i32, i32, i32, i32 } %77, 2, !dbg !21
|
90 |
+
%81 = extractvalue { i32, i32, i32, i32 } %77, 3, !dbg !21
|
91 |
+
%82 = bitcast i32 %78 to float, !dbg !21
|
92 |
+
%83 = bitcast i32 %79 to float, !dbg !21
|
93 |
+
%84 = bitcast i32 %80 to float, !dbg !21
|
94 |
+
%85 = bitcast i32 %81 to float, !dbg !21
|
95 |
+
%86 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %58, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !21
|
96 |
+
%87 = extractvalue { i32, i32, i32, i32 } %86, 0, !dbg !21
|
97 |
+
%88 = extractvalue { i32, i32, i32, i32 } %86, 1, !dbg !21
|
98 |
+
%89 = extractvalue { i32, i32, i32, i32 } %86, 2, !dbg !21
|
99 |
+
%90 = extractvalue { i32, i32, i32, i32 } %86, 3, !dbg !21
|
100 |
+
%91 = bitcast i32 %87 to float, !dbg !21
|
101 |
+
%92 = bitcast i32 %88 to float, !dbg !21
|
102 |
+
%93 = bitcast i32 %89 to float, !dbg !21
|
103 |
+
%94 = bitcast i32 %90 to float, !dbg !21
|
104 |
+
%95 = add i64 %42, 50257, !dbg !22
|
105 |
+
%96 = icmp slt i64 %26, 0, !dbg !23
|
106 |
+
%97 = icmp slt i64 %34, 0, !dbg !23
|
107 |
+
%98 = icmp slt i64 %42, 0, !dbg !23
|
108 |
+
%99 = select i1 %98, i64 %95, i64 %42, !dbg !24
|
109 |
+
%100 = icmp ugt i64 %99, 50256, !dbg !25
|
110 |
+
br i1 %100, label %101, label %102, !dbg !26
|
111 |
+
|
112 |
+
101: ; preds = %7
|
113 |
+
tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !26
|
114 |
+
br label %102, !dbg !26
|
115 |
+
|
116 |
+
102: ; preds = %101, %7
|
117 |
+
%103 = shl i64 %26, 8, !dbg !27
|
118 |
+
%104 = add i64 %103, 12865792, !dbg !27
|
119 |
+
%105 = select i1 %96, i64 %104, i64 %103, !dbg !27
|
120 |
+
%106 = shl i64 %34, 8, !dbg !27
|
121 |
+
%107 = add i64 %106, 12865792, !dbg !27
|
122 |
+
%108 = select i1 %97, i64 %107, i64 %106, !dbg !27
|
123 |
+
%109 = zext nneg i32 %13 to i64
|
124 |
+
%110 = zext nneg i32 %14 to i64
|
125 |
+
%111 = or i64 %105, %109, !dbg !28
|
126 |
+
%112 = or i64 %105, %110, !dbg !28
|
127 |
+
%113 = or i64 %108, %109, !dbg !28
|
128 |
+
%114 = or i64 %108, %110, !dbg !28
|
129 |
+
%115 = getelementptr float, ptr addrspace(1) %1, i64 %111, !dbg !29
|
130 |
+
%116 = getelementptr float, ptr addrspace(1) %1, i64 %112, !dbg !29
|
131 |
+
%117 = getelementptr float, ptr addrspace(1) %1, i64 %113, !dbg !29
|
132 |
+
%118 = getelementptr float, ptr addrspace(1) %1, i64 %114, !dbg !29
|
133 |
+
%119 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %115, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !30
|
134 |
+
%120 = extractvalue { i32, i32, i32, i32 } %119, 0, !dbg !30
|
135 |
+
%121 = extractvalue { i32, i32, i32, i32 } %119, 1, !dbg !30
|
136 |
+
%122 = extractvalue { i32, i32, i32, i32 } %119, 2, !dbg !30
|
137 |
+
%123 = extractvalue { i32, i32, i32, i32 } %119, 3, !dbg !30
|
138 |
+
%124 = bitcast i32 %120 to float, !dbg !30
|
139 |
+
%125 = bitcast i32 %121 to float, !dbg !30
|
140 |
+
%126 = bitcast i32 %122 to float, !dbg !30
|
141 |
+
%127 = bitcast i32 %123 to float, !dbg !30
|
142 |
+
%128 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %116, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !30
|
143 |
+
%129 = extractvalue { i32, i32, i32, i32 } %128, 0, !dbg !30
|
144 |
+
%130 = extractvalue { i32, i32, i32, i32 } %128, 1, !dbg !30
|
145 |
+
%131 = extractvalue { i32, i32, i32, i32 } %128, 2, !dbg !30
|
146 |
+
%132 = extractvalue { i32, i32, i32, i32 } %128, 3, !dbg !30
|
147 |
+
%133 = bitcast i32 %129 to float, !dbg !30
|
148 |
+
%134 = bitcast i32 %130 to float, !dbg !30
|
149 |
+
%135 = bitcast i32 %131 to float, !dbg !30
|
150 |
+
%136 = bitcast i32 %132 to float, !dbg !30
|
151 |
+
%137 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %117, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !30
|
152 |
+
%138 = extractvalue { i32, i32, i32, i32 } %137, 0, !dbg !30
|
153 |
+
%139 = extractvalue { i32, i32, i32, i32 } %137, 1, !dbg !30
|
154 |
+
%140 = extractvalue { i32, i32, i32, i32 } %137, 2, !dbg !30
|
155 |
+
%141 = extractvalue { i32, i32, i32, i32 } %137, 3, !dbg !30
|
156 |
+
%142 = bitcast i32 %138 to float, !dbg !30
|
157 |
+
%143 = bitcast i32 %139 to float, !dbg !30
|
158 |
+
%144 = bitcast i32 %140 to float, !dbg !30
|
159 |
+
%145 = bitcast i32 %141 to float, !dbg !30
|
160 |
+
%146 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %118, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !30
|
161 |
+
%147 = extractvalue { i32, i32, i32, i32 } %146, 0, !dbg !30
|
162 |
+
%148 = extractvalue { i32, i32, i32, i32 } %146, 1, !dbg !30
|
163 |
+
%149 = extractvalue { i32, i32, i32, i32 } %146, 2, !dbg !30
|
164 |
+
%150 = extractvalue { i32, i32, i32, i32 } %146, 3, !dbg !30
|
165 |
+
%151 = bitcast i32 %147 to float, !dbg !30
|
166 |
+
%152 = bitcast i32 %148 to float, !dbg !30
|
167 |
+
%153 = bitcast i32 %149 to float, !dbg !30
|
168 |
+
%154 = bitcast i32 %150 to float, !dbg !30
|
169 |
+
%155 = fadd float %64, %124, !dbg !31
|
170 |
+
%156 = fadd float %65, %125, !dbg !31
|
171 |
+
%157 = fadd float %66, %126, !dbg !31
|
172 |
+
%158 = fadd float %67, %127, !dbg !31
|
173 |
+
%159 = fadd float %73, %133, !dbg !31
|
174 |
+
%160 = fadd float %74, %134, !dbg !31
|
175 |
+
%161 = fadd float %75, %135, !dbg !31
|
176 |
+
%162 = fadd float %76, %136, !dbg !31
|
177 |
+
%163 = fadd float %82, %142, !dbg !31
|
178 |
+
%164 = fadd float %83, %143, !dbg !31
|
179 |
+
%165 = fadd float %84, %144, !dbg !31
|
180 |
+
%166 = fadd float %85, %145, !dbg !31
|
181 |
+
%167 = fadd float %91, %151, !dbg !31
|
182 |
+
%168 = fadd float %92, %152, !dbg !31
|
183 |
+
%169 = fadd float %93, %153, !dbg !31
|
184 |
+
%170 = fadd float %94, %154, !dbg !31
|
185 |
+
%171 = fadd float %155, 0.000000e+00, !dbg !32
|
186 |
+
%172 = fadd float %156, 0.000000e+00, !dbg !32
|
187 |
+
%173 = fadd float %157, 0.000000e+00, !dbg !32
|
188 |
+
%174 = fadd float %158, 0.000000e+00, !dbg !32
|
189 |
+
%175 = fadd float %159, 0.000000e+00, !dbg !32
|
190 |
+
%176 = fadd float %160, 0.000000e+00, !dbg !32
|
191 |
+
%177 = fadd float %161, 0.000000e+00, !dbg !32
|
192 |
+
%178 = fadd float %162, 0.000000e+00, !dbg !32
|
193 |
+
%179 = fadd float %163, 0.000000e+00, !dbg !32
|
194 |
+
%180 = fadd float %164, 0.000000e+00, !dbg !32
|
195 |
+
%181 = fadd float %165, 0.000000e+00, !dbg !32
|
196 |
+
%182 = fadd float %166, 0.000000e+00, !dbg !32
|
197 |
+
%183 = fadd float %167, 0.000000e+00, !dbg !32
|
198 |
+
%184 = fadd float %168, 0.000000e+00, !dbg !32
|
199 |
+
%185 = fadd float %169, 0.000000e+00, !dbg !32
|
200 |
+
%186 = fadd float %170, 0.000000e+00, !dbg !32
|
201 |
+
%187 = fsub float %155, %171, !dbg !36
|
202 |
+
%188 = fsub float %156, %172, !dbg !36
|
203 |
+
%189 = fsub float %157, %173, !dbg !36
|
204 |
+
%190 = fsub float %158, %174, !dbg !36
|
205 |
+
%191 = fsub float %159, %175, !dbg !36
|
206 |
+
%192 = fsub float %160, %176, !dbg !36
|
207 |
+
%193 = fsub float %161, %177, !dbg !36
|
208 |
+
%194 = fsub float %162, %178, !dbg !36
|
209 |
+
%195 = fsub float %163, %179, !dbg !36
|
210 |
+
%196 = fsub float %164, %180, !dbg !36
|
211 |
+
%197 = fsub float %165, %181, !dbg !36
|
212 |
+
%198 = fsub float %166, %182, !dbg !36
|
213 |
+
%199 = fsub float %167, %183, !dbg !36
|
214 |
+
%200 = fsub float %168, %184, !dbg !36
|
215 |
+
%201 = fsub float %169, %185, !dbg !36
|
216 |
+
%202 = fsub float %170, %186, !dbg !36
|
217 |
+
%203 = fmul float %155, %187, !dbg !37
|
218 |
+
%204 = fmul float %156, %188, !dbg !37
|
219 |
+
%205 = fmul float %157, %189, !dbg !37
|
220 |
+
%206 = fmul float %158, %190, !dbg !37
|
221 |
+
%207 = fmul float %159, %191, !dbg !37
|
222 |
+
%208 = fmul float %160, %192, !dbg !37
|
223 |
+
%209 = fmul float %161, %193, !dbg !37
|
224 |
+
%210 = fmul float %162, %194, !dbg !37
|
225 |
+
%211 = fmul float %163, %195, !dbg !37
|
226 |
+
%212 = fmul float %164, %196, !dbg !37
|
227 |
+
%213 = fmul float %165, %197, !dbg !37
|
228 |
+
%214 = fmul float %166, %198, !dbg !37
|
229 |
+
%215 = fmul float %167, %199, !dbg !37
|
230 |
+
%216 = fmul float %168, %200, !dbg !37
|
231 |
+
%217 = fmul float %169, %201, !dbg !37
|
232 |
+
%218 = fmul float %170, %202, !dbg !37
|
233 |
+
%219 = fadd float %203, 0.000000e+00, !dbg !38
|
234 |
+
%220 = fadd float %204, 0.000000e+00, !dbg !38
|
235 |
+
%221 = fadd float %205, 0.000000e+00, !dbg !38
|
236 |
+
%222 = fadd float %206, 0.000000e+00, !dbg !38
|
237 |
+
%223 = fadd float %207, 0.000000e+00, !dbg !38
|
238 |
+
%224 = fadd float %208, 0.000000e+00, !dbg !38
|
239 |
+
%225 = fadd float %209, 0.000000e+00, !dbg !38
|
240 |
+
%226 = fadd float %210, 0.000000e+00, !dbg !38
|
241 |
+
%227 = fadd float %211, 0.000000e+00, !dbg !38
|
242 |
+
%228 = fadd float %212, 0.000000e+00, !dbg !38
|
243 |
+
%229 = fadd float %213, 0.000000e+00, !dbg !38
|
244 |
+
%230 = fadd float %214, 0.000000e+00, !dbg !38
|
245 |
+
%231 = fadd float %215, 0.000000e+00, !dbg !38
|
246 |
+
%232 = fadd float %216, 0.000000e+00, !dbg !38
|
247 |
+
%233 = fadd float %217, 0.000000e+00, !dbg !38
|
248 |
+
%234 = fadd float %218, 0.000000e+00, !dbg !38
|
249 |
+
%235 = fsub float %172, %171, !dbg !39
|
250 |
+
%236 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 2.000000e+00) #6, !dbg !43
|
251 |
+
%237 = fmul float %236, %235, !dbg !44
|
252 |
+
%238 = fadd float %171, %237, !dbg !45
|
253 |
+
%239 = fadd float %219, %220, !dbg !46
|
254 |
+
%240 = fmul float %235, %235, !dbg !47
|
255 |
+
%241 = fmul float %236, %240, !dbg !48
|
256 |
+
%242 = fadd float %241, %239, !dbg !49
|
257 |
+
%243 = fsub float %173, %238, !dbg !39
|
258 |
+
%244 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 3.000000e+00) #6, !dbg !43
|
259 |
+
%245 = fmul float %244, %243, !dbg !44
|
260 |
+
%246 = fadd float %238, %245, !dbg !45
|
261 |
+
%247 = fadd float %221, %242, !dbg !46
|
262 |
+
%248 = fmul float %243, %243, !dbg !47
|
263 |
+
%249 = fmul float %248, 2.000000e+00, !dbg !50
|
264 |
+
%250 = fmul float %244, %249, !dbg !48
|
265 |
+
%251 = fadd float %247, %250, !dbg !49
|
266 |
+
%252 = fsub float %174, %246, !dbg !39
|
267 |
+
%253 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 4.000000e+00) #6, !dbg !43
|
268 |
+
%254 = fmul float %253, %252, !dbg !44
|
269 |
+
%255 = fadd float %246, %254, !dbg !45
|
270 |
+
%256 = fadd float %222, %251, !dbg !46
|
271 |
+
%257 = fmul float %252, %252, !dbg !47
|
272 |
+
%258 = fmul float %257, 3.000000e+00, !dbg !50
|
273 |
+
%259 = fmul float %253, %258, !dbg !48
|
274 |
+
%260 = fadd float %256, %259, !dbg !49
|
275 |
+
%261 = fsub float %175, %255, !dbg !39
|
276 |
+
%262 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 5.000000e+00) #6, !dbg !43
|
277 |
+
%263 = fmul float %262, %261, !dbg !44
|
278 |
+
%264 = fadd float %255, %263, !dbg !45
|
279 |
+
%265 = fadd float %223, %260, !dbg !46
|
280 |
+
%266 = fmul float %261, %261, !dbg !47
|
281 |
+
%267 = fmul float %266, 4.000000e+00, !dbg !50
|
282 |
+
%268 = fmul float %262, %267, !dbg !48
|
283 |
+
%269 = fadd float %265, %268, !dbg !49
|
284 |
+
%270 = fsub float %176, %264, !dbg !39
|
285 |
+
%271 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 6.000000e+00) #6, !dbg !43
|
286 |
+
%272 = fmul float %271, %270, !dbg !44
|
287 |
+
%273 = fadd float %264, %272, !dbg !45
|
288 |
+
%274 = fadd float %224, %269, !dbg !46
|
289 |
+
%275 = fmul float %270, %270, !dbg !47
|
290 |
+
%276 = fmul float %275, 5.000000e+00, !dbg !50
|
291 |
+
%277 = fmul float %271, %276, !dbg !48
|
292 |
+
%278 = fadd float %274, %277, !dbg !49
|
293 |
+
%279 = fsub float %177, %273, !dbg !39
|
294 |
+
%280 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 7.000000e+00) #6, !dbg !43
|
295 |
+
%281 = fmul float %280, %279, !dbg !44
|
296 |
+
%282 = fadd float %273, %281, !dbg !45
|
297 |
+
%283 = fadd float %225, %278, !dbg !46
|
298 |
+
%284 = fmul float %279, %279, !dbg !47
|
299 |
+
%285 = fmul float %284, 6.000000e+00, !dbg !50
|
300 |
+
%286 = fmul float %280, %285, !dbg !48
|
301 |
+
%287 = fadd float %283, %286, !dbg !49
|
302 |
+
%288 = fsub float %178, %282, !dbg !39
|
303 |
+
%289 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 8.000000e+00) #6, !dbg !43
|
304 |
+
%290 = fmul float %289, %288, !dbg !44
|
305 |
+
%291 = fadd float %282, %290, !dbg !45
|
306 |
+
%292 = fadd float %226, %287, !dbg !46
|
307 |
+
%293 = fmul float %288, %288, !dbg !47
|
308 |
+
%294 = fmul float %293, 7.000000e+00, !dbg !50
|
309 |
+
%295 = fmul float %289, %294, !dbg !48
|
310 |
+
%296 = fadd float %292, %295, !dbg !49
|
311 |
+
%297 = fsub float %180, %179, !dbg !39
|
312 |
+
%298 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 2.000000e+00) #6, !dbg !43
|
313 |
+
%299 = fmul float %297, %298, !dbg !44
|
314 |
+
%300 = fadd float %179, %299, !dbg !45
|
315 |
+
%301 = fadd float %227, %228, !dbg !46
|
316 |
+
%302 = fmul float %297, %297, !dbg !47
|
317 |
+
%303 = fmul float %302, %298, !dbg !48
|
318 |
+
%304 = fadd float %301, %303, !dbg !49
|
319 |
+
%305 = fsub float %181, %300, !dbg !39
|
320 |
+
%306 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 3.000000e+00) #6, !dbg !43
|
321 |
+
%307 = fmul float %306, %305, !dbg !44
|
322 |
+
%308 = fadd float %300, %307, !dbg !45
|
323 |
+
%309 = fadd float %229, %304, !dbg !46
|
324 |
+
%310 = fmul float %305, %305, !dbg !47
|
325 |
+
%311 = fmul float %310, 2.000000e+00, !dbg !50
|
326 |
+
%312 = fmul float %306, %311, !dbg !48
|
327 |
+
%313 = fadd float %309, %312, !dbg !49
|
328 |
+
%314 = fsub float %182, %308, !dbg !39
|
329 |
+
%315 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 4.000000e+00) #6, !dbg !43
|
330 |
+
%316 = fmul float %315, %314, !dbg !44
|
331 |
+
%317 = fadd float %308, %316, !dbg !45
|
332 |
+
%318 = fadd float %230, %313, !dbg !46
|
333 |
+
%319 = fmul float %314, %314, !dbg !47
|
334 |
+
%320 = fmul float %319, 3.000000e+00, !dbg !50
|
335 |
+
%321 = fmul float %315, %320, !dbg !48
|
336 |
+
%322 = fadd float %318, %321, !dbg !49
|
337 |
+
%323 = fsub float %183, %317, !dbg !39
|
338 |
+
%324 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 5.000000e+00) #6, !dbg !43
|
339 |
+
%325 = fmul float %324, %323, !dbg !44
|
340 |
+
%326 = fadd float %317, %325, !dbg !45
|
341 |
+
%327 = fadd float %231, %322, !dbg !46
|
342 |
+
%328 = fmul float %323, %323, !dbg !47
|
343 |
+
%329 = fmul float %328, 4.000000e+00, !dbg !50
|
344 |
+
%330 = fmul float %324, %329, !dbg !48
|
345 |
+
%331 = fadd float %327, %330, !dbg !49
|
346 |
+
%332 = fsub float %184, %326, !dbg !39
|
347 |
+
%333 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 6.000000e+00) #6, !dbg !43
|
348 |
+
%334 = fmul float %333, %332, !dbg !44
|
349 |
+
%335 = fadd float %326, %334, !dbg !45
|
350 |
+
%336 = fadd float %232, %331, !dbg !46
|
351 |
+
%337 = fmul float %332, %332, !dbg !47
|
352 |
+
%338 = fmul float %337, 5.000000e+00, !dbg !50
|
353 |
+
%339 = fmul float %333, %338, !dbg !48
|
354 |
+
%340 = fadd float %336, %339, !dbg !49
|
355 |
+
%341 = fsub float %185, %335, !dbg !39
|
356 |
+
%342 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 7.000000e+00) #6, !dbg !43
|
357 |
+
%343 = fmul float %342, %341, !dbg !44
|
358 |
+
%344 = fadd float %335, %343, !dbg !45
|
359 |
+
%345 = fadd float %233, %340, !dbg !46
|
360 |
+
%346 = fmul float %341, %341, !dbg !47
|
361 |
+
%347 = fmul float %346, 6.000000e+00, !dbg !50
|
362 |
+
%348 = fmul float %342, %347, !dbg !48
|
363 |
+
%349 = fadd float %345, %348, !dbg !49
|
364 |
+
%350 = fsub float %186, %344, !dbg !39
|
365 |
+
%351 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 8.000000e+00) #6, !dbg !43
|
366 |
+
%352 = fmul float %351, %350, !dbg !44
|
367 |
+
%353 = fadd float %344, %352, !dbg !45
|
368 |
+
%354 = fadd float %234, %349, !dbg !46
|
369 |
+
%355 = fmul float %350, %350, !dbg !47
|
370 |
+
%356 = fmul float %355, 7.000000e+00, !dbg !50
|
371 |
+
%357 = fmul float %351, %356, !dbg !48
|
372 |
+
%358 = fadd float %354, %357, !dbg !49
|
373 |
+
%359 = bitcast float %291 to i32, !dbg !51
|
374 |
+
%360 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %359, i32 16, i32 31), !dbg !51
|
375 |
+
%361 = bitcast i32 %360 to float, !dbg !51
|
376 |
+
%362 = bitcast float %296 to i32, !dbg !51
|
377 |
+
%363 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %362, i32 16, i32 31), !dbg !51
|
378 |
+
%364 = bitcast i32 %363 to float, !dbg !51
|
379 |
+
%365 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 1090519040, i32 16, i32 31), !dbg !51
|
380 |
+
%366 = bitcast i32 %365 to float, !dbg !51
|
381 |
+
%367 = fsub float %361, %291, !dbg !39
|
382 |
+
%368 = fadd float %366, 8.000000e+00, !dbg !53
|
383 |
+
%369 = fcmp oeq float %368, 0.000000e+00, !dbg !54
|
384 |
+
%370 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %366, float %368) #6, !dbg !43
|
385 |
+
%371 = select i1 %369, float 0.000000e+00, float %370, !dbg !55
|
386 |
+
%372 = fmul float %371, %367, !dbg !44
|
387 |
+
%373 = fadd float %291, %372, !dbg !45
|
388 |
+
%374 = fadd float %296, %364, !dbg !46
|
389 |
+
%375 = fmul float %367, %367, !dbg !47
|
390 |
+
%376 = fmul float %375, 8.000000e+00, !dbg !50
|
391 |
+
%377 = fmul float %371, %376, !dbg !48
|
392 |
+
%378 = fadd float %374, %377, !dbg !49
|
393 |
+
%379 = bitcast float %373 to i32, !dbg !51
|
394 |
+
%380 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %379, i32 8, i32 31), !dbg !51
|
395 |
+
%381 = bitcast i32 %380 to float, !dbg !51
|
396 |
+
%382 = bitcast float %378 to i32, !dbg !51
|
397 |
+
%383 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %382, i32 8, i32 31), !dbg !51
|
398 |
+
%384 = bitcast i32 %383 to float, !dbg !51
|
399 |
+
%385 = bitcast float %368 to i32, !dbg !51
|
400 |
+
%386 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %385, i32 8, i32 31), !dbg !51
|
401 |
+
%387 = bitcast i32 %386 to float, !dbg !51
|
402 |
+
%388 = fsub float %381, %373, !dbg !39
|
403 |
+
%389 = fadd float %368, %387, !dbg !53
|
404 |
+
%390 = fcmp oeq float %389, 0.000000e+00, !dbg !54
|
405 |
+
%391 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %387, float %389) #6, !dbg !43
|
406 |
+
%392 = select i1 %390, float 0.000000e+00, float %391, !dbg !55
|
407 |
+
%393 = fmul float %392, %388, !dbg !44
|
408 |
+
%394 = fadd float %373, %393, !dbg !45
|
409 |
+
%395 = fadd float %378, %384, !dbg !46
|
410 |
+
%396 = fmul float %388, %388, !dbg !47
|
411 |
+
%397 = fmul float %368, %396, !dbg !50
|
412 |
+
%398 = fmul float %392, %397, !dbg !48
|
413 |
+
%399 = fadd float %395, %398, !dbg !49
|
414 |
+
%400 = bitcast float %394 to i32, !dbg !51
|
415 |
+
%401 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %400, i32 4, i32 31), !dbg !51
|
416 |
+
%402 = bitcast i32 %401 to float, !dbg !51
|
417 |
+
%403 = bitcast float %399 to i32, !dbg !51
|
418 |
+
%404 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %403, i32 4, i32 31), !dbg !51
|
419 |
+
%405 = bitcast i32 %404 to float, !dbg !51
|
420 |
+
%406 = bitcast float %389 to i32, !dbg !51
|
421 |
+
%407 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %406, i32 4, i32 31), !dbg !51
|
422 |
+
%408 = bitcast i32 %407 to float, !dbg !51
|
423 |
+
%409 = fsub float %402, %394, !dbg !39
|
424 |
+
%410 = fadd float %389, %408, !dbg !53
|
425 |
+
%411 = fcmp oeq float %410, 0.000000e+00, !dbg !54
|
426 |
+
%412 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %408, float %410) #6, !dbg !43
|
427 |
+
%413 = select i1 %411, float 0.000000e+00, float %412, !dbg !55
|
428 |
+
%414 = fmul float %409, %413, !dbg !44
|
429 |
+
%415 = fadd float %394, %414, !dbg !45
|
430 |
+
%416 = fadd float %399, %405, !dbg !46
|
431 |
+
%417 = fmul float %409, %409, !dbg !47
|
432 |
+
%418 = fmul float %389, %417, !dbg !50
|
433 |
+
%419 = fmul float %413, %418, !dbg !48
|
434 |
+
%420 = fadd float %416, %419, !dbg !49
|
435 |
+
%421 = bitcast float %415 to i32, !dbg !51
|
436 |
+
%422 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %421, i32 2, i32 31), !dbg !51
|
437 |
+
%423 = bitcast i32 %422 to float, !dbg !51
|
438 |
+
%424 = bitcast float %420 to i32, !dbg !51
|
439 |
+
%425 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %424, i32 2, i32 31), !dbg !51
|
440 |
+
%426 = bitcast i32 %425 to float, !dbg !51
|
441 |
+
%427 = bitcast float %410 to i32, !dbg !51
|
442 |
+
%428 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %427, i32 2, i32 31), !dbg !51
|
443 |
+
%429 = bitcast i32 %428 to float, !dbg !51
|
444 |
+
%430 = fsub float %423, %415, !dbg !39
|
445 |
+
%431 = fadd float %410, %429, !dbg !53
|
446 |
+
%432 = fcmp oeq float %431, 0.000000e+00, !dbg !54
|
447 |
+
%433 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %429, float %431) #6, !dbg !43
|
448 |
+
%434 = select i1 %432, float 0.000000e+00, float %433, !dbg !55
|
449 |
+
%435 = fmul float %430, %434, !dbg !44
|
450 |
+
%436 = fadd float %415, %435, !dbg !45
|
451 |
+
%437 = fadd float %420, %426, !dbg !46
|
452 |
+
%438 = fmul float %430, %430, !dbg !47
|
453 |
+
%439 = fmul float %410, %438, !dbg !50
|
454 |
+
%440 = fmul float %434, %439, !dbg !48
|
455 |
+
%441 = fadd float %437, %440, !dbg !49
|
456 |
+
%442 = bitcast float %436 to i32, !dbg !51
|
457 |
+
%443 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %442, i32 1, i32 31), !dbg !51
|
458 |
+
%444 = bitcast float %441 to i32, !dbg !51
|
459 |
+
%445 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %444, i32 1, i32 31), !dbg !51
|
460 |
+
%446 = bitcast float %431 to i32, !dbg !51
|
461 |
+
%447 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %446, i32 1, i32 31), !dbg !51
|
462 |
+
%448 = bitcast i32 %447 to float, !dbg !51
|
463 |
+
%449 = fadd float %431, %448, !dbg !53
|
464 |
+
%450 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %448, float %449) #6, !dbg !43
|
465 |
+
%451 = bitcast float %353 to i32, !dbg !51
|
466 |
+
%452 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %451, i32 16, i32 31), !dbg !51
|
467 |
+
%453 = bitcast i32 %452 to float, !dbg !51
|
468 |
+
%454 = bitcast float %358 to i32, !dbg !51
|
469 |
+
%455 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %454, i32 16, i32 31), !dbg !51
|
470 |
+
%456 = bitcast i32 %455 to float, !dbg !51
|
471 |
+
%457 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 1090519040, i32 16, i32 31), !dbg !51
|
472 |
+
%458 = bitcast i32 %457 to float, !dbg !51
|
473 |
+
%459 = fsub float %453, %353, !dbg !39
|
474 |
+
%460 = fadd float %458, 8.000000e+00, !dbg !53
|
475 |
+
%461 = fcmp oeq float %460, 0.000000e+00, !dbg !54
|
476 |
+
%462 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %458, float %460) #6, !dbg !43
|
477 |
+
%463 = select i1 %461, float 0.000000e+00, float %462, !dbg !55
|
478 |
+
%464 = fmul float %459, %463, !dbg !44
|
479 |
+
%465 = fadd float %353, %464, !dbg !45
|
480 |
+
%466 = fadd float %358, %456, !dbg !46
|
481 |
+
%467 = fmul float %459, %459, !dbg !47
|
482 |
+
%468 = fmul float %467, 8.000000e+00, !dbg !50
|
483 |
+
%469 = fmul float %468, %463, !dbg !48
|
484 |
+
%470 = fadd float %466, %469, !dbg !49
|
485 |
+
%471 = bitcast float %465 to i32, !dbg !51
|
486 |
+
%472 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %471, i32 8, i32 31), !dbg !51
|
487 |
+
%473 = bitcast i32 %472 to float, !dbg !51
|
488 |
+
%474 = bitcast float %470 to i32, !dbg !51
|
489 |
+
%475 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %474, i32 8, i32 31), !dbg !51
|
490 |
+
%476 = bitcast i32 %475 to float, !dbg !51
|
491 |
+
%477 = bitcast float %460 to i32, !dbg !51
|
492 |
+
%478 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %477, i32 8, i32 31), !dbg !51
|
493 |
+
%479 = bitcast i32 %478 to float, !dbg !51
|
494 |
+
%480 = fsub float %473, %465, !dbg !39
|
495 |
+
%481 = fadd float %460, %479, !dbg !53
|
496 |
+
%482 = fcmp oeq float %481, 0.000000e+00, !dbg !54
|
497 |
+
%483 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %479, float %481) #6, !dbg !43
|
498 |
+
%484 = select i1 %482, float 0.000000e+00, float %483, !dbg !55
|
499 |
+
%485 = fmul float %480, %484, !dbg !44
|
500 |
+
%486 = fadd float %465, %485, !dbg !45
|
501 |
+
%487 = fadd float %470, %476, !dbg !46
|
502 |
+
%488 = fmul float %480, %480, !dbg !47
|
503 |
+
%489 = fmul float %460, %488, !dbg !50
|
504 |
+
%490 = fmul float %484, %489, !dbg !48
|
505 |
+
%491 = fadd float %487, %490, !dbg !49
|
506 |
+
%492 = bitcast float %486 to i32, !dbg !51
|
507 |
+
%493 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %492, i32 4, i32 31), !dbg !51
|
508 |
+
%494 = bitcast i32 %493 to float, !dbg !51
|
509 |
+
%495 = bitcast float %491 to i32, !dbg !51
|
510 |
+
%496 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %495, i32 4, i32 31), !dbg !51
|
511 |
+
%497 = bitcast i32 %496 to float, !dbg !51
|
512 |
+
%498 = bitcast float %481 to i32, !dbg !51
|
513 |
+
%499 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %498, i32 4, i32 31), !dbg !51
|
514 |
+
%500 = bitcast i32 %499 to float, !dbg !51
|
515 |
+
%501 = fsub float %494, %486, !dbg !39
|
516 |
+
%502 = fadd float %481, %500, !dbg !53
|
517 |
+
%503 = fcmp oeq float %502, 0.000000e+00, !dbg !54
|
518 |
+
%504 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %500, float %502) #6, !dbg !43
|
519 |
+
%505 = select i1 %503, float 0.000000e+00, float %504, !dbg !55
|
520 |
+
%506 = fmul float %501, %505, !dbg !44
|
521 |
+
%507 = fadd float %486, %506, !dbg !45
|
522 |
+
%508 = fadd float %491, %497, !dbg !46
|
523 |
+
%509 = fmul float %501, %501, !dbg !47
|
524 |
+
%510 = fmul float %481, %509, !dbg !50
|
525 |
+
%511 = fmul float %505, %510, !dbg !48
|
526 |
+
%512 = fadd float %508, %511, !dbg !49
|
527 |
+
%513 = bitcast float %507 to i32, !dbg !51
|
528 |
+
%514 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %513, i32 2, i32 31), !dbg !51
|
529 |
+
%515 = bitcast i32 %514 to float, !dbg !51
|
530 |
+
%516 = bitcast float %512 to i32, !dbg !51
|
531 |
+
%517 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %516, i32 2, i32 31), !dbg !51
|
532 |
+
%518 = bitcast i32 %517 to float, !dbg !51
|
533 |
+
%519 = bitcast float %502 to i32, !dbg !51
|
534 |
+
%520 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %519, i32 2, i32 31), !dbg !51
|
535 |
+
%521 = bitcast i32 %520 to float, !dbg !51
|
536 |
+
%522 = fsub float %515, %507, !dbg !39
|
537 |
+
%523 = fadd float %502, %521, !dbg !53
|
538 |
+
%524 = fcmp oeq float %523, 0.000000e+00, !dbg !54
|
539 |
+
%525 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %521, float %523) #6, !dbg !43
|
540 |
+
%526 = select i1 %524, float 0.000000e+00, float %525, !dbg !55
|
541 |
+
%527 = fmul float %522, %526, !dbg !44
|
542 |
+
%528 = fadd float %507, %527, !dbg !45
|
543 |
+
%529 = fadd float %512, %518, !dbg !46
|
544 |
+
%530 = fmul float %522, %522, !dbg !47
|
545 |
+
%531 = fmul float %502, %530, !dbg !50
|
546 |
+
%532 = fmul float %526, %531, !dbg !48
|
547 |
+
%533 = fadd float %529, %532, !dbg !49
|
548 |
+
%534 = bitcast float %528 to i32, !dbg !51
|
549 |
+
%535 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %534, i32 1, i32 31), !dbg !51
|
550 |
+
%536 = bitcast float %533 to i32, !dbg !51
|
551 |
+
%537 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %536, i32 1, i32 31), !dbg !51
|
552 |
+
%538 = bitcast float %523 to i32, !dbg !51
|
553 |
+
%539 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %538, i32 1, i32 31), !dbg !51
|
554 |
+
%540 = bitcast i32 %539 to float, !dbg !51
|
555 |
+
%541 = fadd float %523, %540, !dbg !53
|
556 |
+
%542 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %540, float %541) #6, !dbg !43
|
557 |
+
%543 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %52, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !56
|
558 |
+
%544 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %54, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !56
|
559 |
+
%545 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %56, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !56
|
560 |
+
%546 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %58, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !56
|
561 |
+
%547 = zext nneg i32 %urem to i64, !dbg !57
|
562 |
+
%548 = getelementptr float, ptr addrspace(1) %3, i64 %547, !dbg !57
|
563 |
+
%549 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %548, i1 true, i32 0, i1 true) #6, !dbg !58
|
564 |
+
br i1 %100, label %550, label %551, !dbg !59
|
565 |
+
|
566 |
+
550: ; preds = %102
|
567 |
+
tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 883, ptr nonnull @assertFunc_1, i64 1), !dbg !59
|
568 |
+
br label %551, !dbg !59
|
569 |
+
|
570 |
+
551: ; preds = %550, %102
|
571 |
+
%552 = bitcast i32 %537 to float, !dbg !51
|
572 |
+
%553 = fadd float %533, %552, !dbg !46
|
573 |
+
%554 = bitcast i32 %535 to float, !dbg !51
|
574 |
+
%555 = fsub float %554, %528, !dbg !39
|
575 |
+
%556 = fmul float %555, %555, !dbg !47
|
576 |
+
%557 = fmul float %523, %556, !dbg !50
|
577 |
+
%558 = fcmp oeq float %541, 0.000000e+00, !dbg !54
|
578 |
+
%559 = select i1 %558, float 0.000000e+00, float %542, !dbg !55
|
579 |
+
%560 = fmul float %559, %557, !dbg !48
|
580 |
+
%561 = fadd float %553, %560, !dbg !49
|
581 |
+
%562 = bitcast i32 %445 to float, !dbg !51
|
582 |
+
%563 = fadd float %441, %562, !dbg !46
|
583 |
+
%564 = bitcast i32 %443 to float, !dbg !51
|
584 |
+
%565 = fsub float %564, %436, !dbg !39
|
585 |
+
%566 = fmul float %565, %565, !dbg !47
|
586 |
+
%567 = fmul float %431, %566, !dbg !50
|
587 |
+
%568 = fcmp oeq float %449, 0.000000e+00, !dbg !54
|
588 |
+
%569 = select i1 %568, float 0.000000e+00, float %450, !dbg !55
|
589 |
+
%570 = fmul float %569, %567, !dbg !48
|
590 |
+
%571 = fadd float %563, %570, !dbg !49
|
591 |
+
%572 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %115, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !60
|
592 |
+
%573 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %116, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !60
|
593 |
+
%574 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %117, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !60
|
594 |
+
%575 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %118, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !60
|
595 |
+
%576 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %571, float 2.560000e+02) #6, !dbg !61
|
596 |
+
%577 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %571, float 2.560000e+02) #6, !dbg !61
|
597 |
+
%578 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %571, float 2.560000e+02) #6, !dbg !61
|
598 |
+
%579 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %571, float 2.560000e+02) #6, !dbg !61
|
599 |
+
%580 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %571, float 2.560000e+02) #6, !dbg !61
|
600 |
+
%581 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %571, float 2.560000e+02) #6, !dbg !61
|
601 |
+
%582 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %571, float 2.560000e+02) #6, !dbg !61
|
602 |
+
%583 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %571, float 2.560000e+02) #6, !dbg !61
|
603 |
+
%584 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %561, float 2.560000e+02) #6, !dbg !61
|
604 |
+
%585 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %561, float 2.560000e+02) #6, !dbg !61
|
605 |
+
%586 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %561, float 2.560000e+02) #6, !dbg !61
|
606 |
+
%587 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %561, float 2.560000e+02) #6, !dbg !61
|
607 |
+
%588 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %561, float 2.560000e+02) #6, !dbg !61
|
608 |
+
%589 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %561, float 2.560000e+02) #6, !dbg !61
|
609 |
+
%590 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %561, float 2.560000e+02) #6, !dbg !61
|
610 |
+
%591 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %561, float 2.560000e+02) #6, !dbg !61
|
611 |
+
%592 = fadd float %576, 0x3EE4F8B580000000, !dbg !62
|
612 |
+
%593 = fadd float %584, 0x3EE4F8B580000000, !dbg !62
|
613 |
+
%594 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
|
614 |
+
%.not.i = icmp eq i32 %594, 0, !dbg !63
|
615 |
+
br i1 %.not.i, label %597, label %595, !dbg !63
|
616 |
+
|
617 |
+
595: ; preds = %551
|
618 |
+
%596 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %592), !dbg !63
|
619 |
+
br label %__nv_rsqrtf.exit, !dbg !63
|
620 |
+
|
621 |
+
597: ; preds = %551
|
622 |
+
%598 = tail call float @llvm.nvvm.rsqrt.approx.f(float %592), !dbg !63
|
623 |
+
br label %__nv_rsqrtf.exit, !dbg !63
|
624 |
+
|
625 |
+
__nv_rsqrtf.exit: ; preds = %595, %597
|
626 |
+
%.0.i = phi float [ %596, %595 ], [ %598, %597 ], !dbg !63
|
627 |
+
%599 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
|
628 |
+
%600 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
|
629 |
+
%601 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
|
630 |
+
%602 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
|
631 |
+
%603 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
|
632 |
+
%604 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
|
633 |
+
%605 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
|
634 |
+
%606 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
|
635 |
+
%.not.i22 = icmp eq i32 %606, 0, !dbg !63
|
636 |
+
br i1 %.not.i22, label %609, label %607, !dbg !63
|
637 |
+
|
638 |
+
607: ; preds = %__nv_rsqrtf.exit
|
639 |
+
%608 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %593), !dbg !63
|
640 |
+
br label %__nv_rsqrtf.exit24, !dbg !63
|
641 |
+
|
642 |
+
609: ; preds = %__nv_rsqrtf.exit
|
643 |
+
%610 = tail call float @llvm.nvvm.rsqrt.approx.f(float %593), !dbg !63
|
644 |
+
br label %__nv_rsqrtf.exit24, !dbg !63
|
645 |
+
|
646 |
+
__nv_rsqrtf.exit24: ; preds = %607, %609
|
647 |
+
%.0.i23 = phi float [ %608, %607 ], [ %610, %609 ], !dbg !63
|
648 |
+
%611 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
|
649 |
+
%612 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
|
650 |
+
%613 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
|
651 |
+
%614 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
|
652 |
+
%615 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
|
653 |
+
%616 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
|
654 |
+
%617 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
|
655 |
+
%618 = extractvalue { i32, i32, i32, i32 } %575, 3, !dbg !60
|
656 |
+
%619 = bitcast i32 %618 to float, !dbg !60
|
657 |
+
%620 = extractvalue { i32, i32, i32, i32 } %546, 3, !dbg !56
|
658 |
+
%621 = bitcast i32 %620 to float, !dbg !56
|
659 |
+
%622 = fadd float %621, %619, !dbg !64
|
660 |
+
%623 = fmul float %555, %559, !dbg !44
|
661 |
+
%624 = fadd float %528, %623, !dbg !45
|
662 |
+
%625 = fsub float %622, %624, !dbg !65
|
663 |
+
%626 = extractvalue { i32, i32, i32, i32 } %575, 2, !dbg !60
|
664 |
+
%627 = bitcast i32 %626 to float, !dbg !60
|
665 |
+
%628 = extractvalue { i32, i32, i32, i32 } %546, 2, !dbg !56
|
666 |
+
%629 = bitcast i32 %628 to float, !dbg !56
|
667 |
+
%630 = fadd float %629, %627, !dbg !64
|
668 |
+
%631 = fsub float %630, %624, !dbg !65
|
669 |
+
%632 = extractvalue { i32, i32, i32, i32 } %575, 1, !dbg !60
|
670 |
+
%633 = bitcast i32 %632 to float, !dbg !60
|
671 |
+
%634 = extractvalue { i32, i32, i32, i32 } %546, 1, !dbg !56
|
672 |
+
%635 = bitcast i32 %634 to float, !dbg !56
|
673 |
+
%636 = fadd float %635, %633, !dbg !64
|
674 |
+
%637 = fsub float %636, %624, !dbg !65
|
675 |
+
%638 = extractvalue { i32, i32, i32, i32 } %575, 0, !dbg !60
|
676 |
+
%639 = bitcast i32 %638 to float, !dbg !60
|
677 |
+
%640 = extractvalue { i32, i32, i32, i32 } %546, 0, !dbg !56
|
678 |
+
%641 = bitcast i32 %640 to float, !dbg !56
|
679 |
+
%642 = fadd float %641, %639, !dbg !64
|
680 |
+
%643 = fsub float %642, %624, !dbg !65
|
681 |
+
%644 = extractvalue { i32, i32, i32, i32 } %574, 3, !dbg !60
|
682 |
+
%645 = bitcast i32 %644 to float, !dbg !60
|
683 |
+
%646 = extractvalue { i32, i32, i32, i32 } %545, 3, !dbg !56
|
684 |
+
%647 = bitcast i32 %646 to float, !dbg !56
|
685 |
+
%648 = fadd float %647, %645, !dbg !64
|
686 |
+
%649 = fsub float %648, %624, !dbg !65
|
687 |
+
%650 = extractvalue { i32, i32, i32, i32 } %574, 2, !dbg !60
|
688 |
+
%651 = bitcast i32 %650 to float, !dbg !60
|
689 |
+
%652 = extractvalue { i32, i32, i32, i32 } %545, 2, !dbg !56
|
690 |
+
%653 = bitcast i32 %652 to float, !dbg !56
|
691 |
+
%654 = fadd float %653, %651, !dbg !64
|
692 |
+
%655 = fsub float %654, %624, !dbg !65
|
693 |
+
%656 = extractvalue { i32, i32, i32, i32 } %574, 1, !dbg !60
|
694 |
+
%657 = bitcast i32 %656 to float, !dbg !60
|
695 |
+
%658 = extractvalue { i32, i32, i32, i32 } %545, 1, !dbg !56
|
696 |
+
%659 = bitcast i32 %658 to float, !dbg !56
|
697 |
+
%660 = fadd float %659, %657, !dbg !64
|
698 |
+
%661 = fsub float %660, %624, !dbg !65
|
699 |
+
%662 = extractvalue { i32, i32, i32, i32 } %574, 0, !dbg !60
|
700 |
+
%663 = bitcast i32 %662 to float, !dbg !60
|
701 |
+
%664 = extractvalue { i32, i32, i32, i32 } %545, 0, !dbg !56
|
702 |
+
%665 = bitcast i32 %664 to float, !dbg !56
|
703 |
+
%666 = fadd float %665, %663, !dbg !64
|
704 |
+
%667 = fsub float %666, %624, !dbg !65
|
705 |
+
%668 = extractvalue { i32, i32, i32, i32 } %573, 3, !dbg !60
|
706 |
+
%669 = bitcast i32 %668 to float, !dbg !60
|
707 |
+
%670 = extractvalue { i32, i32, i32, i32 } %544, 3, !dbg !56
|
708 |
+
%671 = bitcast i32 %670 to float, !dbg !56
|
709 |
+
%672 = fadd float %671, %669, !dbg !64
|
710 |
+
%673 = fmul float %565, %569, !dbg !44
|
711 |
+
%674 = fadd float %436, %673, !dbg !45
|
712 |
+
%675 = fsub float %672, %674, !dbg !65
|
713 |
+
%676 = extractvalue { i32, i32, i32, i32 } %573, 2, !dbg !60
|
714 |
+
%677 = bitcast i32 %676 to float, !dbg !60
|
715 |
+
%678 = extractvalue { i32, i32, i32, i32 } %544, 2, !dbg !56
|
716 |
+
%679 = bitcast i32 %678 to float, !dbg !56
|
717 |
+
%680 = fadd float %679, %677, !dbg !64
|
718 |
+
%681 = fsub float %680, %674, !dbg !65
|
719 |
+
%682 = extractvalue { i32, i32, i32, i32 } %573, 1, !dbg !60
|
720 |
+
%683 = bitcast i32 %682 to float, !dbg !60
|
721 |
+
%684 = extractvalue { i32, i32, i32, i32 } %544, 1, !dbg !56
|
722 |
+
%685 = bitcast i32 %684 to float, !dbg !56
|
723 |
+
%686 = fadd float %685, %683, !dbg !64
|
724 |
+
%687 = fsub float %686, %674, !dbg !65
|
725 |
+
%688 = extractvalue { i32, i32, i32, i32 } %573, 0, !dbg !60
|
726 |
+
%689 = bitcast i32 %688 to float, !dbg !60
|
727 |
+
%690 = extractvalue { i32, i32, i32, i32 } %544, 0, !dbg !56
|
728 |
+
%691 = bitcast i32 %690 to float, !dbg !56
|
729 |
+
%692 = fadd float %691, %689, !dbg !64
|
730 |
+
%693 = fsub float %692, %674, !dbg !65
|
731 |
+
%694 = extractvalue { i32, i32, i32, i32 } %572, 3, !dbg !60
|
732 |
+
%695 = bitcast i32 %694 to float, !dbg !60
|
733 |
+
%696 = extractvalue { i32, i32, i32, i32 } %543, 3, !dbg !56
|
734 |
+
%697 = bitcast i32 %696 to float, !dbg !56
|
735 |
+
%698 = fadd float %697, %695, !dbg !64
|
736 |
+
%699 = fsub float %698, %674, !dbg !65
|
737 |
+
%700 = extractvalue { i32, i32, i32, i32 } %572, 2, !dbg !60
|
738 |
+
%701 = bitcast i32 %700 to float, !dbg !60
|
739 |
+
%702 = extractvalue { i32, i32, i32, i32 } %543, 2, !dbg !56
|
740 |
+
%703 = bitcast i32 %702 to float, !dbg !56
|
741 |
+
%704 = fadd float %703, %701, !dbg !64
|
742 |
+
%705 = fsub float %704, %674, !dbg !65
|
743 |
+
%706 = extractvalue { i32, i32, i32, i32 } %572, 1, !dbg !60
|
744 |
+
%707 = bitcast i32 %706 to float, !dbg !60
|
745 |
+
%708 = extractvalue { i32, i32, i32, i32 } %543, 1, !dbg !56
|
746 |
+
%709 = bitcast i32 %708 to float, !dbg !56
|
747 |
+
%710 = fadd float %709, %707, !dbg !64
|
748 |
+
%711 = fsub float %710, %674, !dbg !65
|
749 |
+
%712 = extractvalue { i32, i32, i32, i32 } %572, 0, !dbg !60
|
750 |
+
%713 = bitcast i32 %712 to float, !dbg !60
|
751 |
+
%714 = extractvalue { i32, i32, i32, i32 } %543, 0, !dbg !56
|
752 |
+
%715 = bitcast i32 %714 to float, !dbg !56
|
753 |
+
%716 = fadd float %715, %713, !dbg !64
|
754 |
+
%717 = fsub float %716, %674, !dbg !65
|
755 |
+
%718 = fmul float %717, %.0.i, !dbg !66
|
756 |
+
%719 = fmul float %711, %.0.i, !dbg !66
|
757 |
+
%720 = fmul float %705, %.0.i, !dbg !66
|
758 |
+
%721 = fmul float %699, %.0.i, !dbg !66
|
759 |
+
%722 = fmul float %693, %.0.i, !dbg !66
|
760 |
+
%723 = fmul float %687, %.0.i, !dbg !66
|
761 |
+
%724 = fmul float %681, %.0.i, !dbg !66
|
762 |
+
%725 = fmul float %675, %.0.i, !dbg !66
|
763 |
+
%726 = fmul float %667, %.0.i23, !dbg !66
|
764 |
+
%727 = fmul float %661, %.0.i23, !dbg !66
|
765 |
+
%728 = fmul float %655, %.0.i23, !dbg !66
|
766 |
+
%729 = fmul float %649, %.0.i23, !dbg !66
|
767 |
+
%730 = fmul float %643, %.0.i23, !dbg !66
|
768 |
+
%731 = fmul float %637, %.0.i23, !dbg !66
|
769 |
+
%732 = fmul float %631, %.0.i23, !dbg !66
|
770 |
+
%733 = fmul float %625, %.0.i23, !dbg !66
|
771 |
+
%734 = getelementptr float, ptr addrspace(3) @global_smem, i64 %547, !dbg !67
|
772 |
+
store i32 %549, ptr addrspace(3) %734, align 4, !dbg !67
|
773 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !67
|
774 |
+
%735 = getelementptr float, ptr addrspace(3) @global_smem, i64 %109, !dbg !67
|
775 |
+
%736 = load float, ptr addrspace(3) %735, align 32, !dbg !67
|
776 |
+
%737 = getelementptr inbounds <8 x float>, ptr addrspace(3) %735, i64 0, i64 1, !dbg !67
|
777 |
+
%738 = load float, ptr addrspace(3) %737, align 4, !dbg !67
|
778 |
+
%739 = getelementptr inbounds <8 x float>, ptr addrspace(3) %735, i64 0, i64 2, !dbg !67
|
779 |
+
%740 = load float, ptr addrspace(3) %739, align 8, !dbg !67
|
780 |
+
%741 = getelementptr inbounds <8 x float>, ptr addrspace(3) %735, i64 0, i64 3, !dbg !67
|
781 |
+
%742 = load float, ptr addrspace(3) %741, align 4, !dbg !67
|
782 |
+
%743 = getelementptr inbounds <8 x float>, ptr addrspace(3) %735, i64 0, i64 4, !dbg !67
|
783 |
+
%744 = load float, ptr addrspace(3) %743, align 16, !dbg !67
|
784 |
+
%745 = getelementptr inbounds <8 x float>, ptr addrspace(3) %735, i64 0, i64 5, !dbg !67
|
785 |
+
%746 = load float, ptr addrspace(3) %745, align 4, !dbg !67
|
786 |
+
%747 = getelementptr inbounds <8 x float>, ptr addrspace(3) %735, i64 0, i64 6, !dbg !67
|
787 |
+
%748 = load float, ptr addrspace(3) %747, align 8, !dbg !67
|
788 |
+
%749 = getelementptr inbounds <8 x float>, ptr addrspace(3) %735, i64 0, i64 7, !dbg !67
|
789 |
+
%750 = load float, ptr addrspace(3) %749, align 4, !dbg !67
|
790 |
+
%751 = fmul float %718, %736, !dbg !67
|
791 |
+
%752 = fmul float %719, %738, !dbg !67
|
792 |
+
%753 = fmul float %720, %740, !dbg !67
|
793 |
+
%754 = fmul float %721, %742, !dbg !67
|
794 |
+
%755 = fmul float %722, %744, !dbg !67
|
795 |
+
%756 = fmul float %723, %746, !dbg !67
|
796 |
+
%757 = fmul float %724, %748, !dbg !67
|
797 |
+
%758 = fmul float %725, %750, !dbg !67
|
798 |
+
%759 = fmul float %726, %736, !dbg !67
|
799 |
+
%760 = fmul float %727, %738, !dbg !67
|
800 |
+
%761 = fmul float %728, %740, !dbg !67
|
801 |
+
%762 = fmul float %729, %742, !dbg !67
|
802 |
+
%763 = fmul float %730, %744, !dbg !67
|
803 |
+
%764 = fmul float %731, %746, !dbg !67
|
804 |
+
%765 = fmul float %732, %748, !dbg !67
|
805 |
+
%766 = fmul float %733, %750, !dbg !67
|
806 |
+
%767 = shl i32 %17, 8, !dbg !68
|
807 |
+
%768 = shl i32 %18, 8, !dbg !68
|
808 |
+
%769 = or i32 %767, %13, !dbg !69
|
809 |
+
%770 = or i32 %768, %13, !dbg !69
|
810 |
+
%771 = sext i32 %769 to i64, !dbg !70
|
811 |
+
%772 = getelementptr i16, ptr addrspace(1) %4, i64 %771, !dbg !70
|
812 |
+
%773 = sext i32 %770 to i64, !dbg !70
|
813 |
+
%774 = getelementptr i16, ptr addrspace(1) %4, i64 %773, !dbg !70
|
814 |
+
%775 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %751) #6, !dbg !71
|
815 |
+
%776 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %752) #6, !dbg !71
|
816 |
+
%777 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %753) #6, !dbg !71
|
817 |
+
%778 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %754) #6, !dbg !71
|
818 |
+
%779 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %755) #6, !dbg !71
|
819 |
+
%780 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %756) #6, !dbg !71
|
820 |
+
%781 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %757) #6, !dbg !71
|
821 |
+
%782 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %758) #6, !dbg !71
|
822 |
+
%783 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %759) #6, !dbg !71
|
823 |
+
%784 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %760) #6, !dbg !71
|
824 |
+
%785 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %761) #6, !dbg !71
|
825 |
+
%786 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %762) #6, !dbg !71
|
826 |
+
%787 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %763) #6, !dbg !71
|
827 |
+
%788 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %764) #6, !dbg !71
|
828 |
+
%789 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %765) #6, !dbg !71
|
829 |
+
%790 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %766) #6, !dbg !71
|
830 |
+
%791 = insertelement <2 x i16> undef, i16 %775, i64 0, !dbg !71
|
831 |
+
%792 = insertelement <2 x i16> %791, i16 %776, i64 1, !dbg !71
|
832 |
+
%793 = bitcast <2 x i16> %792 to i32, !dbg !71
|
833 |
+
%794 = insertelement <2 x i16> undef, i16 %777, i64 0, !dbg !71
|
834 |
+
%795 = insertelement <2 x i16> %794, i16 %778, i64 1, !dbg !71
|
835 |
+
%796 = bitcast <2 x i16> %795 to i32, !dbg !71
|
836 |
+
%797 = insertelement <2 x i16> undef, i16 %779, i64 0, !dbg !71
|
837 |
+
%798 = insertelement <2 x i16> %797, i16 %780, i64 1, !dbg !71
|
838 |
+
%799 = bitcast <2 x i16> %798 to i32, !dbg !71
|
839 |
+
%800 = insertelement <2 x i16> undef, i16 %781, i64 0, !dbg !71
|
840 |
+
%801 = insertelement <2 x i16> %800, i16 %782, i64 1, !dbg !71
|
841 |
+
%802 = bitcast <2 x i16> %801 to i32, !dbg !71
|
842 |
+
tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %793, i32 %796, i32 %799, i32 %802, ptr addrspace(1) %772, i1 true) #6, !dbg !71
|
843 |
+
%803 = insertelement <2 x i16> undef, i16 %783, i64 0, !dbg !71
|
844 |
+
%804 = insertelement <2 x i16> %803, i16 %784, i64 1, !dbg !71
|
845 |
+
%805 = bitcast <2 x i16> %804 to i32, !dbg !71
|
846 |
+
%806 = insertelement <2 x i16> undef, i16 %785, i64 0, !dbg !71
|
847 |
+
%807 = insertelement <2 x i16> %806, i16 %786, i64 1, !dbg !71
|
848 |
+
%808 = bitcast <2 x i16> %807 to i32, !dbg !71
|
849 |
+
%809 = insertelement <2 x i16> undef, i16 %787, i64 0, !dbg !71
|
850 |
+
%810 = insertelement <2 x i16> %809, i16 %788, i64 1, !dbg !71
|
851 |
+
%811 = bitcast <2 x i16> %810 to i32, !dbg !71
|
852 |
+
%812 = insertelement <2 x i16> undef, i16 %789, i64 0, !dbg !71
|
853 |
+
%813 = insertelement <2 x i16> %812, i16 %790, i64 1, !dbg !71
|
854 |
+
%814 = bitcast <2 x i16> %813 to i32, !dbg !71
|
855 |
+
tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %805, i32 %808, i32 %811, i32 %814, ptr addrspace(1) %774, i1 true) #6, !dbg !71
|
856 |
+
ret void, !dbg !72
|
857 |
+
}
|
858 |
+
|
859 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
860 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
861 |
+
|
862 |
+
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
|
863 |
+
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
|
864 |
+
|
865 |
+
; Function Attrs: convergent nocallback nounwind
|
866 |
+
declare void @llvm.nvvm.barrier0() #2
|
867 |
+
|
868 |
+
; Function Attrs: alwaysinline nounwind
|
869 |
+
define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
|
870 |
+
%1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
|
871 |
+
%.not = icmp eq i32 %1, 0
|
872 |
+
br i1 %.not, label %4, label %2
|
873 |
+
|
874 |
+
2: ; preds = %0
|
875 |
+
%3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
|
876 |
+
br label %6
|
877 |
+
|
878 |
+
4: ; preds = %0
|
879 |
+
%5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
|
880 |
+
br label %6
|
881 |
+
|
882 |
+
6: ; preds = %4, %2
|
883 |
+
%.0 = phi float [ %3, %2 ], [ %5, %4 ]
|
884 |
+
ret float %.0
|
885 |
+
}
|
886 |
+
|
887 |
+
declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
|
888 |
+
|
889 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
|
890 |
+
declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
|
891 |
+
|
892 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
|
893 |
+
declare float @llvm.nvvm.rsqrt.approx.f(float) #5
|
894 |
+
|
895 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
896 |
+
attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
|
897 |
+
attributes #2 = { convergent nocallback nounwind }
|
898 |
+
attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
899 |
+
attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
900 |
+
attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
|
901 |
+
attributes #6 = { nounwind }
|
902 |
+
|
903 |
+
!llvm.module.flags = !{!0, !1}
|
904 |
+
!llvm.dbg.cu = !{!2}
|
905 |
+
!nvvm.annotations = !{!4, !5, !5, !4}
|
906 |
+
!llvm.ident = !{!6}
|
907 |
+
|
908 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
909 |
+
!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
|
910 |
+
!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
911 |
+
!3 = !DIFile(filename: "cgx5lxpuexpindj4dsmjz5x42uhyy7iskevq7ovzpwagb3t5powj.py", directory: "/tmp/torchinductor_root/gx")
|
912 |
+
!4 = !{ptr @triton__0d1d2d3d4d5de6de, !"kernel", i32 1}
|
913 |
+
!5 = !{ptr @triton__0d1d2d3d4d5de6de, !"maxntidx", i32 256}
|
914 |
+
!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
|
915 |
+
!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5de6de", linkageName: "triton__0d1d2d3d4d5de6de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
|
916 |
+
!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
|
917 |
+
!9 = !{}
|
918 |
+
!10 = !DILocation(line: 22, column: 44, scope: !7)
|
919 |
+
!11 = !DILocation(line: 24, column: 33, scope: !7)
|
920 |
+
!12 = !DILocation(line: 21, column: 28, scope: !7)
|
921 |
+
!13 = !DILocation(line: 21, column: 33, scope: !7)
|
922 |
+
!14 = !DILocation(line: 22, column: 23, scope: !7)
|
923 |
+
!15 = !DILocation(line: 26, column: 30, scope: !7)
|
924 |
+
!16 = !DILocation(line: 26, column: 35, scope: !7)
|
925 |
+
!17 = !DILocation(line: 27, column: 18, scope: !7)
|
926 |
+
!18 = !DILocation(line: 35, column: 44, scope: !7)
|
927 |
+
!19 = !DILocation(line: 35, column: 40, scope: !7)
|
928 |
+
!20 = !DILocation(line: 35, column: 34, scope: !7)
|
929 |
+
!21 = !DILocation(line: 35, column: 50, scope: !7)
|
930 |
+
!22 = !DILocation(line: 36, column: 22, scope: !7)
|
931 |
+
!23 = !DILocation(line: 37, column: 22, scope: !7)
|
932 |
+
!24 = !DILocation(line: 38, column: 36, scope: !7)
|
933 |
+
!25 = !DILocation(line: 39, column: 40, scope: !7)
|
934 |
+
!26 = !DILocation(line: 39, column: 55, scope: !7)
|
935 |
+
!27 = !DILocation(line: 40, column: 44, scope: !7)
|
936 |
+
!28 = !DILocation(line: 40, column: 40, scope: !7)
|
937 |
+
!29 = !DILocation(line: 40, column: 34, scope: !7)
|
938 |
+
!30 = !DILocation(line: 40, column: 52, scope: !7)
|
939 |
+
!31 = !DILocation(line: 41, column: 22, scope: !7)
|
940 |
+
!32 = !DILocation(line: 98, column: 22, scope: !33, inlinedAt: !35)
|
941 |
+
!33 = distinct !DILexicalBlockFile(scope: !7, file: !34, discriminator: 0)
|
942 |
+
!34 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
|
943 |
+
!35 = !DILocation(line: 44, column: 38, scope: !33)
|
944 |
+
!36 = !DILocation(line: 101, column: 30, scope: !33, inlinedAt: !35)
|
945 |
+
!37 = !DILocation(line: 101, column: 22, scope: !33, inlinedAt: !35)
|
946 |
+
!38 = !DILocation(line: 101, column: 13, scope: !33, inlinedAt: !35)
|
947 |
+
!39 = !DILocation(line: 108, column: 21, scope: !40, inlinedAt: !41)
|
948 |
+
!40 = distinct !DILexicalBlockFile(scope: !33, file: !34, discriminator: 0)
|
949 |
+
!41 = !DILocation(line: 120, column: 46, scope: !40, inlinedAt: !42)
|
950 |
+
!42 = !DILocation(line: 50, column: 41, scope: !40)
|
951 |
+
!43 = !DILocation(line: 110, column: 60, scope: !40, inlinedAt: !41)
|
952 |
+
!44 = !DILocation(line: 112, column: 25, scope: !40, inlinedAt: !41)
|
953 |
+
!45 = !DILocation(line: 112, column: 17, scope: !40, inlinedAt: !41)
|
954 |
+
!46 = !DILocation(line: 113, column: 15, scope: !40, inlinedAt: !41)
|
955 |
+
!47 = !DILocation(line: 113, column: 30, scope: !40, inlinedAt: !41)
|
956 |
+
!48 = !DILocation(line: 113, column: 49, scope: !40, inlinedAt: !41)
|
957 |
+
!49 = !DILocation(line: 113, column: 22, scope: !40, inlinedAt: !41)
|
958 |
+
!50 = !DILocation(line: 113, column: 38, scope: !40, inlinedAt: !41)
|
959 |
+
!51 = !DILocation(line: 120, column: 46, scope: !33, inlinedAt: !52)
|
960 |
+
!52 = !DILocation(line: 50, column: 41, scope: !33)
|
961 |
+
!53 = !DILocation(line: 109, column: 28, scope: !40, inlinedAt: !41)
|
962 |
+
!54 = !DILocation(line: 110, column: 39, scope: !40, inlinedAt: !41)
|
963 |
+
!55 = !DILocation(line: 110, column: 49, scope: !40, inlinedAt: !41)
|
964 |
+
!56 = !DILocation(line: 59, column: 51, scope: !7)
|
965 |
+
!57 = !DILocation(line: 60, column: 35, scope: !7)
|
966 |
+
!58 = !DILocation(line: 60, column: 40, scope: !7)
|
967 |
+
!59 = !DILocation(line: 64, column: 57, scope: !7)
|
968 |
+
!60 = !DILocation(line: 65, column: 54, scope: !7)
|
969 |
+
!61 = !DILocation(line: 69, column: 23, scope: !7)
|
970 |
+
!62 = !DILocation(line: 71, column: 24, scope: !7)
|
971 |
+
!63 = !DILocation(line: 72, column: 30, scope: !7)
|
972 |
+
!64 = !DILocation(line: 66, column: 24, scope: !7)
|
973 |
+
!65 = !DILocation(line: 67, column: 24, scope: !7)
|
974 |
+
!66 = !DILocation(line: 73, column: 24, scope: !7)
|
975 |
+
!67 = !DILocation(line: 74, column: 24, scope: !7)
|
976 |
+
!68 = !DILocation(line: 76, column: 39, scope: !7)
|
977 |
+
!69 = !DILocation(line: 76, column: 35, scope: !7)
|
978 |
+
!70 = !DILocation(line: 76, column: 29, scope: !7)
|
979 |
+
!71 = !DILocation(line: 76, column: 52, scope: !7)
|
980 |
+
!72 = !DILocation(line: 55, column: 4, scope: !7)
|
.triton/dump/345a87a492fd703c73ab83265a21fcb6/triton_.ptx
ADDED
@@ -0,0 +1,1654 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2d3d4d5de6de
|
10 |
+
.extern .func __assertfail
|
11 |
+
(
|
12 |
+
.param .b64 __assertfail_param_0,
|
13 |
+
.param .b64 __assertfail_param_1,
|
14 |
+
.param .b32 __assertfail_param_2,
|
15 |
+
.param .b64 __assertfail_param_3,
|
16 |
+
.param .b64 __assertfail_param_4
|
17 |
+
)
|
18 |
+
;
|
19 |
+
.global .align 1 .b8 assertFunc_1[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
|
20 |
+
.global .align 1 .b8 assertFile_1[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
|
21 |
+
.global .align 1 .b8 assertMessage_1[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 51, 32, 60, 32, 53, 48, 50, 53, 55};
|
22 |
+
.global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
|
23 |
+
.global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
|
24 |
+
.global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55};
|
25 |
+
.extern .shared .align 1 .b8 global_smem[];
|
26 |
+
.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
|
27 |
+
|
28 |
+
.visible .entry triton__0d1d2d3d4d5de6de(
|
29 |
+
.param .u64 triton__0d1d2d3d4d5de6de_param_0,
|
30 |
+
.param .u64 triton__0d1d2d3d4d5de6de_param_1,
|
31 |
+
.param .u64 triton__0d1d2d3d4d5de6de_param_2,
|
32 |
+
.param .u64 triton__0d1d2d3d4d5de6de_param_3,
|
33 |
+
.param .u64 triton__0d1d2d3d4d5de6de_param_4,
|
34 |
+
.param .u32 triton__0d1d2d3d4d5de6de_param_5,
|
35 |
+
.param .u32 triton__0d1d2d3d4d5de6de_param_6
|
36 |
+
)
|
37 |
+
.maxntid 256, 1, 1
|
38 |
+
{
|
39 |
+
.reg .pred %p<117>;
|
40 |
+
.reg .b16 %rs<17>;
|
41 |
+
.reg .b32 %r<375>;
|
42 |
+
.reg .f32 %f<423>;
|
43 |
+
.reg .b64 %rd<113>;
|
44 |
+
.loc 1 18 0
|
45 |
+
$L__func_begin0:
|
46 |
+
.loc 1 18 0
|
47 |
+
|
48 |
+
ld.param.u64 %rd13, [triton__0d1d2d3d4d5de6de_param_3];
|
49 |
+
ld.param.u64 %rd12, [triton__0d1d2d3d4d5de6de_param_1];
|
50 |
+
ld.param.u64 %rd53, [triton__0d1d2d3d4d5de6de_param_0];
|
51 |
+
$L__tmp0:
|
52 |
+
.loc 1 22 44
|
53 |
+
mov.u32 %r59, %tid.x;
|
54 |
+
ld.param.u64 %rd54, [triton__0d1d2d3d4d5de6de_param_2];
|
55 |
+
bfe.u32 %r60, %r59, 5, 3;
|
56 |
+
and.b32 %r61, %r59, 15;
|
57 |
+
.loc 1 24 33
|
58 |
+
shl.b32 %r62, %r59, 3;
|
59 |
+
and.b32 %r1, %r62, 248;
|
60 |
+
and.b32 %r2, %r59, 255;
|
61 |
+
.loc 1 21 28
|
62 |
+
mov.u32 %r26, %ctaid.x;
|
63 |
+
.loc 1 21 33
|
64 |
+
shl.b32 %r63, %r26, 4;
|
65 |
+
.loc 1 22 23
|
66 |
+
or.b32 %r3, %r63, %r60;
|
67 |
+
or.b32 %r4, %r3, 8;
|
68 |
+
or.b32 %r64, %r63, %r61;
|
69 |
+
.loc 1 26 30
|
70 |
+
mul.wide.s32 %rd55, %r3, 8;
|
71 |
+
add.s64 %rd16, %rd53, %rd55;
|
72 |
+
add.s64 %rd32, %rd16, 64;
|
73 |
+
mul.wide.s32 %rd56, %r64, 8;
|
74 |
+
add.s64 %rd48, %rd53, %rd56;
|
75 |
+
mov.pred %p93, -1;
|
76 |
+
.loc 1 26 35
|
77 |
+
mov.u64 %rd15, 0x0;
|
78 |
+
@%p93 ld.global.L1::evict_last.b64 { %rd15 }, [ %rd16 + 0 ];
|
79 |
+
mov.u64 %rd17, 0x0;
|
80 |
+
@%p93 ld.global.L1::evict_last.b64 { %rd17 }, [ %rd16 + 0 ];
|
81 |
+
mov.u64 %rd19, 0x0;
|
82 |
+
@%p93 ld.global.L1::evict_last.b64 { %rd19 }, [ %rd16 + 0 ];
|
83 |
+
mov.u64 %rd21, 0x0;
|
84 |
+
@%p93 ld.global.L1::evict_last.b64 { %rd21 }, [ %rd16 + 0 ];
|
85 |
+
mov.u64 %rd23, 0x0;
|
86 |
+
@%p93 ld.global.L1::evict_last.b64 { %rd23 }, [ %rd16 + 0 ];
|
87 |
+
mov.u64 %rd25, 0x0;
|
88 |
+
@%p93 ld.global.L1::evict_last.b64 { %rd25 }, [ %rd16 + 0 ];
|
89 |
+
mov.u64 %rd27, 0x0;
|
90 |
+
@%p93 ld.global.L1::evict_last.b64 { %rd27 }, [ %rd16 + 0 ];
|
91 |
+
mov.u64 %rd29, 0x0;
|
92 |
+
@%p93 ld.global.L1::evict_last.b64 { %rd29 }, [ %rd16 + 0 ];
|
93 |
+
mov.u64 %rd31, 0x0;
|
94 |
+
@%p93 ld.global.L1::evict_last.b64 { %rd31 }, [ %rd32 + 0 ];
|
95 |
+
mov.u64 %rd33, 0x0;
|
96 |
+
@%p93 ld.global.L1::evict_last.b64 { %rd33 }, [ %rd32 + 0 ];
|
97 |
+
mov.u64 %rd35, 0x0;
|
98 |
+
@%p93 ld.global.L1::evict_last.b64 { %rd35 }, [ %rd32 + 0 ];
|
99 |
+
mov.u64 %rd37, 0x0;
|
100 |
+
@%p93 ld.global.L1::evict_last.b64 { %rd37 }, [ %rd32 + 0 ];
|
101 |
+
mov.u64 %rd39, 0x0;
|
102 |
+
@%p93 ld.global.L1::evict_last.b64 { %rd39 }, [ %rd32 + 0 ];
|
103 |
+
mov.u64 %rd41, 0x0;
|
104 |
+
@%p93 ld.global.L1::evict_last.b64 { %rd41 }, [ %rd32 + 0 ];
|
105 |
+
mov.u64 %rd43, 0x0;
|
106 |
+
@%p93 ld.global.L1::evict_last.b64 { %rd43 }, [ %rd32 + 0 ];
|
107 |
+
mov.u64 %rd45, 0x0;
|
108 |
+
@%p93 ld.global.L1::evict_last.b64 { %rd45 }, [ %rd32 + 0 ];
|
109 |
+
mov.u64 %rd47, 0x0;
|
110 |
+
@%p93 ld.global.L1::evict_last.b64 { %rd47 }, [ %rd48 + 0 ];
|
111 |
+
.loc 1 27 18
|
112 |
+
bfe.s32 %r65, %r26, 27, 1;
|
113 |
+
shr.u32 %r66, %r65, 23;
|
114 |
+
add.s32 %r67, %r3, %r66;
|
115 |
+
and.b32 %r68, %r67, 16776704;
|
116 |
+
sub.s32 %r69, %r3, %r68;
|
117 |
+
add.s32 %r70, %r4, %r66;
|
118 |
+
and.b32 %r71, %r70, 16776704;
|
119 |
+
sub.s32 %r72, %r4, %r71;
|
120 |
+
.loc 1 35 44
|
121 |
+
shl.b32 %r73, %r69, 8;
|
122 |
+
shl.b32 %r74, %r72, 8;
|
123 |
+
.loc 1 35 40
|
124 |
+
or.b32 %r75, %r73, %r1;
|
125 |
+
or.b32 %r76, %r74, %r1;
|
126 |
+
.loc 1 35 34
|
127 |
+
mul.wide.s32 %rd57, %r75, 4;
|
128 |
+
add.s64 %rd80, %rd54, %rd57;
|
129 |
+
cvt.s64.s32 %rd58, %r73;
|
130 |
+
cvt.u64.u32 %rd59, %r1;
|
131 |
+
or.b64 %rd60, %rd58, %rd59;
|
132 |
+
shl.b64 %rd61, %rd60, 2;
|
133 |
+
add.s64 %rd62, %rd54, %rd61;
|
134 |
+
add.s64 %rd81, %rd62, 16;
|
135 |
+
mul.wide.s32 %rd63, %r76, 4;
|
136 |
+
add.s64 %rd82, %rd54, %rd63;
|
137 |
+
cvt.s64.s32 %rd64, %r74;
|
138 |
+
or.b64 %rd65, %rd64, %rd59;
|
139 |
+
shl.b64 %rd66, %rd65, 2;
|
140 |
+
add.s64 %rd67, %rd54, %rd66;
|
141 |
+
add.s64 %rd83, %rd67, 16;
|
142 |
+
mov.b32 %r257, 0;
|
143 |
+
.loc 1 35 50
|
144 |
+
mov.u32 %r27, 0x0;
|
145 |
+
mov.u32 %r28, 0x0;
|
146 |
+
mov.u32 %r29, 0x0;
|
147 |
+
mov.u32 %r30, 0x0;
|
148 |
+
@%p93 ld.global.L1::evict_last.v4.b32 { %r27, %r28, %r29, %r30 }, [ %rd80 + 0 ];
|
149 |
+
@!%p93 mov.u32 %r27, %r257;
|
150 |
+
@!%p93 mov.u32 %r28, %r257;
|
151 |
+
@!%p93 mov.u32 %r29, %r257;
|
152 |
+
@!%p93 mov.u32 %r30, %r257;
|
153 |
+
mov.b32 %f1, %r27;
|
154 |
+
mov.b32 %f2, %r28;
|
155 |
+
mov.b32 %f3, %r29;
|
156 |
+
mov.b32 %f4, %r30;
|
157 |
+
mov.u32 %r35, 0x0;
|
158 |
+
mov.u32 %r36, 0x0;
|
159 |
+
mov.u32 %r37, 0x0;
|
160 |
+
mov.u32 %r38, 0x0;
|
161 |
+
@%p93 ld.global.L1::evict_last.v4.b32 { %r35, %r36, %r37, %r38 }, [ %rd81 + 0 ];
|
162 |
+
@!%p93 mov.u32 %r35, %r257;
|
163 |
+
@!%p93 mov.u32 %r36, %r257;
|
164 |
+
@!%p93 mov.u32 %r37, %r257;
|
165 |
+
@!%p93 mov.u32 %r38, %r257;
|
166 |
+
mov.b32 %f5, %r35;
|
167 |
+
mov.b32 %f6, %r36;
|
168 |
+
mov.b32 %f7, %r37;
|
169 |
+
mov.b32 %f8, %r38;
|
170 |
+
mov.u32 %r43, 0x0;
|
171 |
+
mov.u32 %r44, 0x0;
|
172 |
+
mov.u32 %r45, 0x0;
|
173 |
+
mov.u32 %r46, 0x0;
|
174 |
+
@%p93 ld.global.L1::evict_last.v4.b32 { %r43, %r44, %r45, %r46 }, [ %rd82 + 0 ];
|
175 |
+
@!%p93 mov.u32 %r43, %r257;
|
176 |
+
@!%p93 mov.u32 %r44, %r257;
|
177 |
+
@!%p93 mov.u32 %r45, %r257;
|
178 |
+
@!%p93 mov.u32 %r46, %r257;
|
179 |
+
mov.b32 %f9, %r43;
|
180 |
+
mov.b32 %f10, %r44;
|
181 |
+
mov.b32 %f11, %r45;
|
182 |
+
mov.b32 %f12, %r46;
|
183 |
+
mov.u32 %r51, 0x0;
|
184 |
+
mov.u32 %r52, 0x0;
|
185 |
+
mov.u32 %r53, 0x0;
|
186 |
+
mov.u32 %r54, 0x0;
|
187 |
+
@%p93 ld.global.L1::evict_last.v4.b32 { %r51, %r52, %r53, %r54 }, [ %rd83 + 0 ];
|
188 |
+
@!%p93 mov.u32 %r51, %r257;
|
189 |
+
@!%p93 mov.u32 %r52, %r257;
|
190 |
+
@!%p93 mov.u32 %r53, %r257;
|
191 |
+
@!%p93 mov.u32 %r54, %r257;
|
192 |
+
mov.b32 %f13, %r51;
|
193 |
+
mov.b32 %f14, %r52;
|
194 |
+
mov.b32 %f15, %r53;
|
195 |
+
mov.b32 %f16, %r54;
|
196 |
+
.loc 1 36 22
|
197 |
+
add.s64 %rd68, %rd47, 50257;
|
198 |
+
.loc 1 37 22
|
199 |
+
setp.lt.s64 %p38, %rd47, 0;
|
200 |
+
.loc 1 38 36
|
201 |
+
selp.b64 %rd7, %rd68, %rd47, %p38;
|
202 |
+
.loc 1 39 40
|
203 |
+
setp.lt.u64 %p39, %rd7, 50257;
|
204 |
+
mov.b32 %r374, 883;
|
205 |
+
mov.u64 %rd112, 1;
|
206 |
+
.loc 1 39 55
|
207 |
+
@%p39 bra $L__BB0_2;
|
208 |
+
mov.u64 %rd69, assertMessage_0;
|
209 |
+
cvta.global.u64 %rd70, %rd69;
|
210 |
+
mov.u64 %rd71, assertFile_0;
|
211 |
+
cvta.global.u64 %rd72, %rd71;
|
212 |
+
mov.u64 %rd73, assertFunc_0;
|
213 |
+
cvta.global.u64 %rd74, %rd73;
|
214 |
+
{ // callseq 8, 0
|
215 |
+
.reg .b32 temp_param_reg;
|
216 |
+
.param .b64 param0;
|
217 |
+
st.param.b64 [param0+0], %rd70;
|
218 |
+
.param .b64 param1;
|
219 |
+
st.param.b64 [param1+0], %rd72;
|
220 |
+
.param .b32 param2;
|
221 |
+
st.param.b32 [param2+0], %r374;
|
222 |
+
.param .b64 param3;
|
223 |
+
st.param.b64 [param3+0], %rd74;
|
224 |
+
.param .b64 param4;
|
225 |
+
st.param.b64 [param4+0], %rd112;
|
226 |
+
call.uni
|
227 |
+
__assertfail,
|
228 |
+
(
|
229 |
+
param0,
|
230 |
+
param1,
|
231 |
+
param2,
|
232 |
+
param3,
|
233 |
+
param4
|
234 |
+
);
|
235 |
+
} // callseq 8
|
236 |
+
$L__BB0_2:
|
237 |
+
.loc 1 0 55
|
238 |
+
ld.param.u64 %rd14, [triton__0d1d2d3d4d5de6de_param_4];
|
239 |
+
.loc 1 37 22
|
240 |
+
setp.lt.s64 %p83, %rd31, 0;
|
241 |
+
setp.lt.s64 %p84, %rd15, 0;
|
242 |
+
.loc 1 40 44
|
243 |
+
shl.b64 %rd85, %rd15, 8;
|
244 |
+
add.s64 %rd86, %rd85, 12865792;
|
245 |
+
selp.b64 %rd87, %rd86, %rd85, %p84;
|
246 |
+
shl.b64 %rd88, %rd31, 8;
|
247 |
+
add.s64 %rd89, %rd88, 12865792;
|
248 |
+
selp.b64 %rd90, %rd89, %rd88, %p83;
|
249 |
+
.loc 1 40 40
|
250 |
+
or.b64 %rd92, %rd87, %rd59;
|
251 |
+
or.b64 %rd93, %rd90, %rd59;
|
252 |
+
.loc 1 40 34
|
253 |
+
shl.b64 %rd94, %rd92, 2;
|
254 |
+
add.s64 %rd104, %rd12, %rd94;
|
255 |
+
add.s64 %rd105, %rd104, 16;
|
256 |
+
shl.b64 %rd95, %rd93, 2;
|
257 |
+
add.s64 %rd106, %rd12, %rd95;
|
258 |
+
add.s64 %rd107, %rd106, 16;
|
259 |
+
.loc 1 40 52
|
260 |
+
mov.u32 %r78, 0x0;
|
261 |
+
mov.u32 %r79, 0x0;
|
262 |
+
mov.u32 %r80, 0x0;
|
263 |
+
mov.u32 %r81, 0x0;
|
264 |
+
@%p93 ld.global.L1::evict_last.v4.b32 { %r78, %r79, %r80, %r81 }, [ %rd104 + 0 ];
|
265 |
+
@!%p93 mov.u32 %r78, %r257;
|
266 |
+
@!%p93 mov.u32 %r79, %r257;
|
267 |
+
@!%p93 mov.u32 %r80, %r257;
|
268 |
+
@!%p93 mov.u32 %r81, %r257;
|
269 |
+
mov.b32 %f27, %r78;
|
270 |
+
mov.b32 %f28, %r79;
|
271 |
+
mov.b32 %f29, %r80;
|
272 |
+
mov.b32 %f30, %r81;
|
273 |
+
mov.u32 %r86, 0x0;
|
274 |
+
mov.u32 %r87, 0x0;
|
275 |
+
mov.u32 %r88, 0x0;
|
276 |
+
mov.u32 %r89, 0x0;
|
277 |
+
@%p93 ld.global.L1::evict_last.v4.b32 { %r86, %r87, %r88, %r89 }, [ %rd105 + 0 ];
|
278 |
+
@!%p93 mov.u32 %r86, %r257;
|
279 |
+
@!%p93 mov.u32 %r87, %r257;
|
280 |
+
@!%p93 mov.u32 %r88, %r257;
|
281 |
+
@!%p93 mov.u32 %r89, %r257;
|
282 |
+
mov.b32 %f31, %r86;
|
283 |
+
mov.b32 %f32, %r87;
|
284 |
+
mov.b32 %f33, %r88;
|
285 |
+
mov.b32 %f34, %r89;
|
286 |
+
mov.u32 %r94, 0x0;
|
287 |
+
mov.u32 %r95, 0x0;
|
288 |
+
mov.u32 %r96, 0x0;
|
289 |
+
mov.u32 %r97, 0x0;
|
290 |
+
@%p93 ld.global.L1::evict_last.v4.b32 { %r94, %r95, %r96, %r97 }, [ %rd106 + 0 ];
|
291 |
+
@!%p93 mov.u32 %r94, %r257;
|
292 |
+
@!%p93 mov.u32 %r95, %r257;
|
293 |
+
@!%p93 mov.u32 %r96, %r257;
|
294 |
+
@!%p93 mov.u32 %r97, %r257;
|
295 |
+
mov.b32 %f35, %r94;
|
296 |
+
mov.b32 %f36, %r95;
|
297 |
+
mov.b32 %f37, %r96;
|
298 |
+
mov.b32 %f38, %r97;
|
299 |
+
mov.u32 %r102, 0x0;
|
300 |
+
mov.u32 %r103, 0x0;
|
301 |
+
mov.u32 %r104, 0x0;
|
302 |
+
mov.u32 %r105, 0x0;
|
303 |
+
@%p93 ld.global.L1::evict_last.v4.b32 { %r102, %r103, %r104, %r105 }, [ %rd107 + 0 ];
|
304 |
+
@!%p93 mov.u32 %r102, %r257;
|
305 |
+
@!%p93 mov.u32 %r103, %r257;
|
306 |
+
@!%p93 mov.u32 %r104, %r257;
|
307 |
+
@!%p93 mov.u32 %r105, %r257;
|
308 |
+
mov.b32 %f39, %r102;
|
309 |
+
mov.b32 %f40, %r103;
|
310 |
+
mov.b32 %f41, %r104;
|
311 |
+
mov.b32 %f42, %r105;
|
312 |
+
.loc 1 41 22
|
313 |
+
add.f32 %f43, %f1, %f27;
|
314 |
+
add.f32 %f44, %f2, %f28;
|
315 |
+
add.f32 %f45, %f3, %f29;
|
316 |
+
add.f32 %f46, %f4, %f30;
|
317 |
+
add.f32 %f47, %f5, %f31;
|
318 |
+
add.f32 %f48, %f6, %f32;
|
319 |
+
add.f32 %f49, %f7, %f33;
|
320 |
+
add.f32 %f50, %f8, %f34;
|
321 |
+
add.f32 %f51, %f9, %f35;
|
322 |
+
add.f32 %f52, %f10, %f36;
|
323 |
+
add.f32 %f53, %f11, %f37;
|
324 |
+
add.f32 %f54, %f12, %f38;
|
325 |
+
add.f32 %f55, %f13, %f39;
|
326 |
+
add.f32 %f56, %f14, %f40;
|
327 |
+
add.f32 %f57, %f15, %f41;
|
328 |
+
add.f32 %f58, %f16, %f42;
|
329 |
+
$L__tmp1:
|
330 |
+
.loc 2 98 22
|
331 |
+
add.f32 %f59, %f43, 0f00000000;
|
332 |
+
add.f32 %f60, %f44, 0f00000000;
|
333 |
+
add.f32 %f61, %f45, 0f00000000;
|
334 |
+
add.f32 %f62, %f46, 0f00000000;
|
335 |
+
add.f32 %f63, %f47, 0f00000000;
|
336 |
+
add.f32 %f64, %f48, 0f00000000;
|
337 |
+
add.f32 %f65, %f49, 0f00000000;
|
338 |
+
add.f32 %f66, %f50, 0f00000000;
|
339 |
+
add.f32 %f67, %f51, 0f00000000;
|
340 |
+
add.f32 %f68, %f52, 0f00000000;
|
341 |
+
add.f32 %f69, %f53, 0f00000000;
|
342 |
+
add.f32 %f70, %f54, 0f00000000;
|
343 |
+
add.f32 %f71, %f55, 0f00000000;
|
344 |
+
add.f32 %f72, %f56, 0f00000000;
|
345 |
+
add.f32 %f73, %f57, 0f00000000;
|
346 |
+
add.f32 %f74, %f58, 0f00000000;
|
347 |
+
.loc 2 101 30
|
348 |
+
sub.f32 %f75, %f43, %f59;
|
349 |
+
sub.f32 %f76, %f44, %f60;
|
350 |
+
sub.f32 %f77, %f45, %f61;
|
351 |
+
sub.f32 %f78, %f46, %f62;
|
352 |
+
sub.f32 %f79, %f47, %f63;
|
353 |
+
sub.f32 %f80, %f48, %f64;
|
354 |
+
sub.f32 %f81, %f49, %f65;
|
355 |
+
sub.f32 %f82, %f50, %f66;
|
356 |
+
sub.f32 %f83, %f51, %f67;
|
357 |
+
sub.f32 %f84, %f52, %f68;
|
358 |
+
sub.f32 %f85, %f53, %f69;
|
359 |
+
sub.f32 %f86, %f54, %f70;
|
360 |
+
sub.f32 %f87, %f55, %f71;
|
361 |
+
sub.f32 %f88, %f56, %f72;
|
362 |
+
sub.f32 %f89, %f57, %f73;
|
363 |
+
sub.f32 %f90, %f58, %f74;
|
364 |
+
.loc 2 101 13
|
365 |
+
fma.rn.f32 %f91, %f43, %f75, 0f00000000;
|
366 |
+
fma.rn.f32 %f92, %f44, %f76, 0f00000000;
|
367 |
+
fma.rn.f32 %f93, %f45, %f77, 0f00000000;
|
368 |
+
fma.rn.f32 %f94, %f46, %f78, 0f00000000;
|
369 |
+
fma.rn.f32 %f95, %f47, %f79, 0f00000000;
|
370 |
+
fma.rn.f32 %f96, %f48, %f80, 0f00000000;
|
371 |
+
fma.rn.f32 %f97, %f49, %f81, 0f00000000;
|
372 |
+
fma.rn.f32 %f98, %f50, %f82, 0f00000000;
|
373 |
+
fma.rn.f32 %f99, %f51, %f83, 0f00000000;
|
374 |
+
fma.rn.f32 %f100, %f52, %f84, 0f00000000;
|
375 |
+
fma.rn.f32 %f101, %f53, %f85, 0f00000000;
|
376 |
+
fma.rn.f32 %f102, %f54, %f86, 0f00000000;
|
377 |
+
fma.rn.f32 %f103, %f55, %f87, 0f00000000;
|
378 |
+
fma.rn.f32 %f104, %f56, %f88, 0f00000000;
|
379 |
+
fma.rn.f32 %f105, %f57, %f89, 0f00000000;
|
380 |
+
fma.rn.f32 %f106, %f58, %f90, 0f00000000;
|
381 |
+
$L__tmp2:
|
382 |
+
.loc 2 108 21
|
383 |
+
sub.f32 %f107, %f60, %f59;
|
384 |
+
mov.b32 %r111, 1065353216;
|
385 |
+
mov.b32 %r112, 1073741824;
|
386 |
+
.loc 2 110 60
|
387 |
+
div.full.f32 %r110, %r111, %r112;
|
388 |
+
mov.b32 %f108, %r110;
|
389 |
+
.loc 2 112 17
|
390 |
+
fma.rn.f32 %f109, %f108, %f107, %f59;
|
391 |
+
.loc 2 113 15
|
392 |
+
add.f32 %f110, %f91, %f92;
|
393 |
+
.loc 2 113 30
|
394 |
+
mul.f32 %f111, %f107, %f107;
|
395 |
+
.loc 2 113 22
|
396 |
+
fma.rn.f32 %f112, %f108, %f111, %f110;
|
397 |
+
.loc 2 108 21
|
398 |
+
sub.f32 %f113, %f61, %f109;
|
399 |
+
mov.b32 %r115, 1077936128;
|
400 |
+
.loc 2 110 60
|
401 |
+
div.full.f32 %r113, %r111, %r115;
|
402 |
+
mov.b32 %f114, %r113;
|
403 |
+
.loc 2 112 17
|
404 |
+
fma.rn.f32 %f115, %f114, %f113, %f109;
|
405 |
+
.loc 2 113 15
|
406 |
+
add.f32 %f116, %f93, %f112;
|
407 |
+
.loc 2 113 30
|
408 |
+
mul.f32 %f117, %f113, %f113;
|
409 |
+
.loc 2 113 38
|
410 |
+
fma.rn.f32 %f118, %f113, %f113, %f117;
|
411 |
+
.loc 2 113 22
|
412 |
+
fma.rn.f32 %f119, %f114, %f118, %f116;
|
413 |
+
.loc 2 108 21
|
414 |
+
sub.f32 %f120, %f62, %f115;
|
415 |
+
mov.b32 %r118, 1082130432;
|
416 |
+
.loc 2 110 60
|
417 |
+
div.full.f32 %r116, %r111, %r118;
|
418 |
+
mov.b32 %f121, %r116;
|
419 |
+
.loc 2 112 17
|
420 |
+
fma.rn.f32 %f122, %f121, %f120, %f115;
|
421 |
+
.loc 2 113 15
|
422 |
+
add.f32 %f123, %f94, %f119;
|
423 |
+
.loc 2 113 30
|
424 |
+
mul.f32 %f124, %f120, %f120;
|
425 |
+
.loc 2 113 38
|
426 |
+
mul.f32 %f125, %f124, 0f40400000;
|
427 |
+
.loc 2 113 22
|
428 |
+
fma.rn.f32 %f126, %f121, %f125, %f123;
|
429 |
+
.loc 2 108 21
|
430 |
+
sub.f32 %f127, %f63, %f122;
|
431 |
+
mov.b32 %r121, 1084227584;
|
432 |
+
.loc 2 110 60
|
433 |
+
div.full.f32 %r119, %r111, %r121;
|
434 |
+
mov.b32 %f128, %r119;
|
435 |
+
.loc 2 112 17
|
436 |
+
fma.rn.f32 %f129, %f128, %f127, %f122;
|
437 |
+
.loc 2 113 15
|
438 |
+
add.f32 %f130, %f95, %f126;
|
439 |
+
.loc 2 113 30
|
440 |
+
mul.f32 %f131, %f127, %f127;
|
441 |
+
.loc 2 113 38
|
442 |
+
mul.f32 %f132, %f131, 0f40800000;
|
443 |
+
.loc 2 113 22
|
444 |
+
fma.rn.f32 %f133, %f128, %f132, %f130;
|
445 |
+
.loc 2 108 21
|
446 |
+
sub.f32 %f134, %f64, %f129;
|
447 |
+
mov.b32 %r124, 1086324736;
|
448 |
+
.loc 2 110 60
|
449 |
+
div.full.f32 %r122, %r111, %r124;
|
450 |
+
mov.b32 %f135, %r122;
|
451 |
+
.loc 2 112 17
|
452 |
+
fma.rn.f32 %f136, %f135, %f134, %f129;
|
453 |
+
.loc 2 113 15
|
454 |
+
add.f32 %f137, %f96, %f133;
|
455 |
+
.loc 2 113 30
|
456 |
+
mul.f32 %f138, %f134, %f134;
|
457 |
+
.loc 2 113 38
|
458 |
+
mul.f32 %f139, %f138, 0f40A00000;
|
459 |
+
.loc 2 113 22
|
460 |
+
fma.rn.f32 %f140, %f135, %f139, %f137;
|
461 |
+
.loc 2 108 21
|
462 |
+
sub.f32 %f141, %f65, %f136;
|
463 |
+
mov.b32 %r127, 1088421888;
|
464 |
+
.loc 2 110 60
|
465 |
+
div.full.f32 %r125, %r111, %r127;
|
466 |
+
mov.b32 %f142, %r125;
|
467 |
+
.loc 2 112 17
|
468 |
+
fma.rn.f32 %f143, %f142, %f141, %f136;
|
469 |
+
.loc 2 113 15
|
470 |
+
add.f32 %f144, %f97, %f140;
|
471 |
+
.loc 2 113 30
|
472 |
+
mul.f32 %f145, %f141, %f141;
|
473 |
+
.loc 2 113 38
|
474 |
+
mul.f32 %f146, %f145, 0f40C00000;
|
475 |
+
.loc 2 113 22
|
476 |
+
fma.rn.f32 %f147, %f142, %f146, %f144;
|
477 |
+
.loc 2 108 21
|
478 |
+
sub.f32 %f148, %f66, %f143;
|
479 |
+
mov.b32 %r130, 1090519040;
|
480 |
+
.loc 2 110 60
|
481 |
+
div.full.f32 %r128, %r111, %r130;
|
482 |
+
mov.b32 %f149, %r128;
|
483 |
+
.loc 2 112 17
|
484 |
+
fma.rn.f32 %f150, %f149, %f148, %f143;
|
485 |
+
.loc 2 113 15
|
486 |
+
add.f32 %f151, %f98, %f147;
|
487 |
+
.loc 2 113 30
|
488 |
+
mul.f32 %f152, %f148, %f148;
|
489 |
+
.loc 2 113 38
|
490 |
+
mul.f32 %f153, %f152, 0f40E00000;
|
491 |
+
.loc 2 113 22
|
492 |
+
fma.rn.f32 %f154, %f149, %f153, %f151;
|
493 |
+
.loc 2 108 21
|
494 |
+
sub.f32 %f155, %f68, %f67;
|
495 |
+
.loc 2 110 60
|
496 |
+
div.full.f32 %r131, %r111, %r112;
|
497 |
+
mov.b32 %f156, %r131;
|
498 |
+
.loc 2 112 17
|
499 |
+
fma.rn.f32 %f157, %f155, %f156, %f67;
|
500 |
+
.loc 2 113 15
|
501 |
+
add.f32 %f158, %f99, %f100;
|
502 |
+
.loc 2 113 30
|
503 |
+
mul.f32 %f159, %f155, %f155;
|
504 |
+
.loc 2 113 22
|
505 |
+
fma.rn.f32 %f160, %f159, %f156, %f158;
|
506 |
+
.loc 2 108 21
|
507 |
+
sub.f32 %f161, %f69, %f157;
|
508 |
+
.loc 2 110 60
|
509 |
+
div.full.f32 %r134, %r111, %r115;
|
510 |
+
mov.b32 %f162, %r134;
|
511 |
+
.loc 2 112 17
|
512 |
+
fma.rn.f32 %f163, %f162, %f161, %f157;
|
513 |
+
.loc 2 113 15
|
514 |
+
add.f32 %f164, %f101, %f160;
|
515 |
+
.loc 2 113 30
|
516 |
+
mul.f32 %f165, %f161, %f161;
|
517 |
+
.loc 2 113 38
|
518 |
+
fma.rn.f32 %f166, %f161, %f161, %f165;
|
519 |
+
.loc 2 113 22
|
520 |
+
fma.rn.f32 %f167, %f162, %f166, %f164;
|
521 |
+
.loc 2 108 21
|
522 |
+
sub.f32 %f168, %f70, %f163;
|
523 |
+
.loc 2 110 60
|
524 |
+
div.full.f32 %r137, %r111, %r118;
|
525 |
+
mov.b32 %f169, %r137;
|
526 |
+
.loc 2 112 17
|
527 |
+
fma.rn.f32 %f170, %f169, %f168, %f163;
|
528 |
+
.loc 2 113 15
|
529 |
+
add.f32 %f171, %f102, %f167;
|
530 |
+
.loc 2 113 30
|
531 |
+
mul.f32 %f172, %f168, %f168;
|
532 |
+
.loc 2 113 38
|
533 |
+
mul.f32 %f173, %f172, 0f40400000;
|
534 |
+
.loc 2 113 22
|
535 |
+
fma.rn.f32 %f174, %f169, %f173, %f171;
|
536 |
+
.loc 2 108 21
|
537 |
+
sub.f32 %f175, %f71, %f170;
|
538 |
+
.loc 2 110 60
|
539 |
+
div.full.f32 %r140, %r111, %r121;
|
540 |
+
mov.b32 %f176, %r140;
|
541 |
+
.loc 2 112 17
|
542 |
+
fma.rn.f32 %f177, %f176, %f175, %f170;
|
543 |
+
.loc 2 113 15
|
544 |
+
add.f32 %f178, %f103, %f174;
|
545 |
+
.loc 2 113 30
|
546 |
+
mul.f32 %f179, %f175, %f175;
|
547 |
+
.loc 2 113 38
|
548 |
+
mul.f32 %f180, %f179, 0f40800000;
|
549 |
+
.loc 2 113 22
|
550 |
+
fma.rn.f32 %f181, %f176, %f180, %f178;
|
551 |
+
.loc 2 108 21
|
552 |
+
sub.f32 %f182, %f72, %f177;
|
553 |
+
.loc 2 110 60
|
554 |
+
div.full.f32 %r143, %r111, %r124;
|
555 |
+
mov.b32 %f183, %r143;
|
556 |
+
.loc 2 112 17
|
557 |
+
fma.rn.f32 %f184, %f183, %f182, %f177;
|
558 |
+
.loc 2 113 15
|
559 |
+
add.f32 %f185, %f104, %f181;
|
560 |
+
.loc 2 113 30
|
561 |
+
mul.f32 %f186, %f182, %f182;
|
562 |
+
.loc 2 113 38
|
563 |
+
mul.f32 %f187, %f186, 0f40A00000;
|
564 |
+
.loc 2 113 22
|
565 |
+
fma.rn.f32 %f188, %f183, %f187, %f185;
|
566 |
+
.loc 2 108 21
|
567 |
+
sub.f32 %f189, %f73, %f184;
|
568 |
+
.loc 2 110 60
|
569 |
+
div.full.f32 %r146, %r111, %r127;
|
570 |
+
mov.b32 %f190, %r146;
|
571 |
+
.loc 2 112 17
|
572 |
+
fma.rn.f32 %f191, %f190, %f189, %f184;
|
573 |
+
.loc 2 113 15
|
574 |
+
add.f32 %f192, %f105, %f188;
|
575 |
+
.loc 2 113 30
|
576 |
+
mul.f32 %f193, %f189, %f189;
|
577 |
+
.loc 2 113 38
|
578 |
+
mul.f32 %f194, %f193, 0f40C00000;
|
579 |
+
.loc 2 113 22
|
580 |
+
fma.rn.f32 %f195, %f190, %f194, %f192;
|
581 |
+
.loc 2 108 21
|
582 |
+
sub.f32 %f196, %f74, %f191;
|
583 |
+
.loc 2 110 60
|
584 |
+
div.full.f32 %r149, %r111, %r130;
|
585 |
+
mov.b32 %f197, %r149;
|
586 |
+
.loc 2 112 17
|
587 |
+
fma.rn.f32 %f198, %f197, %f196, %f191;
|
588 |
+
.loc 2 113 15
|
589 |
+
add.f32 %f199, %f106, %f195;
|
590 |
+
.loc 2 113 30
|
591 |
+
mul.f32 %f200, %f196, %f196;
|
592 |
+
.loc 2 113 38
|
593 |
+
mul.f32 %f201, %f200, 0f40E00000;
|
594 |
+
.loc 2 113 22
|
595 |
+
fma.rn.f32 %f202, %f197, %f201, %f199;
|
596 |
+
$L__tmp3:
|
597 |
+
.loc 2 120 46
|
598 |
+
mov.b32 %r216, %f150;
|
599 |
+
shfl.sync.bfly.b32 %r217, %r216, 16, 31, -1;
|
600 |
+
mov.b32 %f203, %r217;
|
601 |
+
mov.b32 %r218, %f154;
|
602 |
+
shfl.sync.bfly.b32 %r219, %r218, 16, 31, -1;
|
603 |
+
mov.b32 %f204, %r219;
|
604 |
+
shfl.sync.bfly.b32 %r153, %r130, 16, 31, -1;
|
605 |
+
mov.b32 %f205, %r153;
|
606 |
+
$L__tmp4:
|
607 |
+
.loc 2 108 21
|
608 |
+
sub.f32 %f206, %f203, %f150;
|
609 |
+
.loc 2 109 28
|
610 |
+
add.f32 %f207, %f205, 0f41000000;
|
611 |
+
.loc 2 110 39
|
612 |
+
setp.eq.f32 %p85, %f207, 0f00000000;
|
613 |
+
.loc 2 110 60
|
614 |
+
mov.b32 %r154, %f207;
|
615 |
+
div.full.f32 %r152, %r153, %r154;
|
616 |
+
mov.b32 %f208, %r152;
|
617 |
+
.loc 2 110 49
|
618 |
+
selp.f32 %f209, 0f00000000, %f208, %p85;
|
619 |
+
.loc 2 112 17
|
620 |
+
fma.rn.f32 %f210, %f209, %f206, %f150;
|
621 |
+
.loc 2 113 15
|
622 |
+
add.f32 %f211, %f154, %f204;
|
623 |
+
.loc 2 113 30
|
624 |
+
mul.f32 %f212, %f206, %f206;
|
625 |
+
.loc 2 113 38
|
626 |
+
mul.f32 %f213, %f212, 0f41000000;
|
627 |
+
.loc 2 113 22
|
628 |
+
fma.rn.f32 %f214, %f209, %f213, %f211;
|
629 |
+
$L__tmp5:
|
630 |
+
.loc 2 120 46
|
631 |
+
mov.b32 %r220, %f210;
|
632 |
+
shfl.sync.bfly.b32 %r221, %r220, 8, 31, -1;
|
633 |
+
mov.b32 %f215, %r221;
|
634 |
+
mov.b32 %r222, %f214;
|
635 |
+
shfl.sync.bfly.b32 %r223, %r222, 8, 31, -1;
|
636 |
+
mov.b32 %f216, %r223;
|
637 |
+
shfl.sync.bfly.b32 %r156, %r154, 8, 31, -1;
|
638 |
+
mov.b32 %f217, %r156;
|
639 |
+
$L__tmp6:
|
640 |
+
.loc 2 108 21
|
641 |
+
sub.f32 %f218, %f215, %f210;
|
642 |
+
.loc 2 109 28
|
643 |
+
add.f32 %f219, %f207, %f217;
|
644 |
+
.loc 2 110 39
|
645 |
+
setp.eq.f32 %p86, %f219, 0f00000000;
|
646 |
+
.loc 2 110 60
|
647 |
+
mov.b32 %r157, %f219;
|
648 |
+
div.full.f32 %r155, %r156, %r157;
|
649 |
+
mov.b32 %f220, %r155;
|
650 |
+
.loc 2 110 49
|
651 |
+
selp.f32 %f221, 0f00000000, %f220, %p86;
|
652 |
+
.loc 2 112 17
|
653 |
+
fma.rn.f32 %f222, %f221, %f218, %f210;
|
654 |
+
.loc 2 113 15
|
655 |
+
add.f32 %f223, %f214, %f216;
|
656 |
+
.loc 2 113 30
|
657 |
+
mul.f32 %f224, %f218, %f218;
|
658 |
+
.loc 2 113 38
|
659 |
+
mul.f32 %f225, %f207, %f224;
|
660 |
+
.loc 2 113 22
|
661 |
+
fma.rn.f32 %f226, %f221, %f225, %f223;
|
662 |
+
$L__tmp7:
|
663 |
+
.loc 2 120 46
|
664 |
+
mov.b32 %r224, %f222;
|
665 |
+
shfl.sync.bfly.b32 %r225, %r224, 4, 31, -1;
|
666 |
+
mov.b32 %f227, %r225;
|
667 |
+
mov.b32 %r226, %f226;
|
668 |
+
shfl.sync.bfly.b32 %r227, %r226, 4, 31, -1;
|
669 |
+
mov.b32 %f228, %r227;
|
670 |
+
shfl.sync.bfly.b32 %r159, %r157, 4, 31, -1;
|
671 |
+
mov.b32 %f229, %r159;
|
672 |
+
$L__tmp8:
|
673 |
+
.loc 2 108 21
|
674 |
+
sub.f32 %f230, %f227, %f222;
|
675 |
+
.loc 2 109 28
|
676 |
+
add.f32 %f231, %f219, %f229;
|
677 |
+
.loc 2 110 39
|
678 |
+
setp.eq.f32 %p87, %f231, 0f00000000;
|
679 |
+
.loc 2 110 60
|
680 |
+
mov.b32 %r160, %f231;
|
681 |
+
div.full.f32 %r158, %r159, %r160;
|
682 |
+
mov.b32 %f232, %r158;
|
683 |
+
.loc 2 110 49
|
684 |
+
selp.f32 %f233, 0f00000000, %f232, %p87;
|
685 |
+
.loc 2 112 17
|
686 |
+
fma.rn.f32 %f234, %f230, %f233, %f222;
|
687 |
+
.loc 2 113 15
|
688 |
+
add.f32 %f235, %f226, %f228;
|
689 |
+
.loc 2 113 30
|
690 |
+
mul.f32 %f236, %f230, %f230;
|
691 |
+
.loc 2 113 38
|
692 |
+
mul.f32 %f237, %f219, %f236;
|
693 |
+
.loc 2 113 22
|
694 |
+
fma.rn.f32 %f238, %f233, %f237, %f235;
|
695 |
+
$L__tmp9:
|
696 |
+
.loc 2 120 46
|
697 |
+
mov.b32 %r228, %f234;
|
698 |
+
shfl.sync.bfly.b32 %r229, %r228, 2, 31, -1;
|
699 |
+
mov.b32 %f239, %r229;
|
700 |
+
mov.b32 %r230, %f238;
|
701 |
+
shfl.sync.bfly.b32 %r231, %r230, 2, 31, -1;
|
702 |
+
mov.b32 %f240, %r231;
|
703 |
+
shfl.sync.bfly.b32 %r162, %r160, 2, 31, -1;
|
704 |
+
mov.b32 %f241, %r162;
|
705 |
+
$L__tmp10:
|
706 |
+
.loc 2 108 21
|
707 |
+
sub.f32 %f242, %f239, %f234;
|
708 |
+
.loc 2 109 28
|
709 |
+
add.f32 %f17, %f231, %f241;
|
710 |
+
.loc 2 110 39
|
711 |
+
setp.eq.f32 %p88, %f17, 0f00000000;
|
712 |
+
.loc 2 110 60
|
713 |
+
mov.b32 %r163, %f17;
|
714 |
+
div.full.f32 %r161, %r162, %r163;
|
715 |
+
mov.b32 %f243, %r161;
|
716 |
+
.loc 2 110 49
|
717 |
+
selp.f32 %f244, 0f00000000, %f243, %p88;
|
718 |
+
.loc 2 112 17
|
719 |
+
fma.rn.f32 %f18, %f242, %f244, %f234;
|
720 |
+
.loc 2 113 15
|
721 |
+
add.f32 %f245, %f238, %f240;
|
722 |
+
.loc 2 113 30
|
723 |
+
mul.f32 %f246, %f242, %f242;
|
724 |
+
.loc 2 113 38
|
725 |
+
mul.f32 %f247, %f231, %f246;
|
726 |
+
.loc 2 113 22
|
727 |
+
fma.rn.f32 %f19, %f244, %f247, %f245;
|
728 |
+
$L__tmp11:
|
729 |
+
.loc 2 120 46
|
730 |
+
mov.b32 %r232, %f18;
|
731 |
+
shfl.sync.bfly.b32 %r5, %r232, 1, 31, -1;
|
732 |
+
mov.b32 %r233, %f19;
|
733 |
+
shfl.sync.bfly.b32 %r6, %r233, 1, 31, -1;
|
734 |
+
shfl.sync.bfly.b32 %r165, %r163, 1, 31, -1;
|
735 |
+
mov.b32 %f248, %r165;
|
736 |
+
$L__tmp12:
|
737 |
+
.loc 2 109 28
|
738 |
+
add.f32 %f20, %f17, %f248;
|
739 |
+
.loc 2 110 60
|
740 |
+
mov.b32 %r166, %f20;
|
741 |
+
div.full.f32 %r164, %r165, %r166;
|
742 |
+
mov.b32 %f21, %r164;
|
743 |
+
$L__tmp13:
|
744 |
+
.loc 2 120 46
|
745 |
+
mov.b32 %r234, %f198;
|
746 |
+
shfl.sync.bfly.b32 %r235, %r234, 16, 31, -1;
|
747 |
+
mov.b32 %f249, %r235;
|
748 |
+
mov.b32 %r236, %f202;
|
749 |
+
shfl.sync.bfly.b32 %r237, %r236, 16, 31, -1;
|
750 |
+
mov.b32 %f250, %r237;
|
751 |
+
shfl.sync.bfly.b32 %r168, %r130, 16, 31, -1;
|
752 |
+
mov.b32 %f251, %r168;
|
753 |
+
$L__tmp14:
|
754 |
+
.loc 2 108 21
|
755 |
+
sub.f32 %f252, %f249, %f198;
|
756 |
+
.loc 2 109 28
|
757 |
+
add.f32 %f253, %f251, 0f41000000;
|
758 |
+
.loc 2 110 39
|
759 |
+
setp.eq.f32 %p89, %f253, 0f00000000;
|
760 |
+
.loc 2 110 60
|
761 |
+
mov.b32 %r169, %f253;
|
762 |
+
div.full.f32 %r167, %r168, %r169;
|
763 |
+
mov.b32 %f254, %r167;
|
764 |
+
.loc 2 110 49
|
765 |
+
selp.f32 %f255, 0f00000000, %f254, %p89;
|
766 |
+
.loc 2 112 17
|
767 |
+
fma.rn.f32 %f256, %f252, %f255, %f198;
|
768 |
+
.loc 2 113 15
|
769 |
+
add.f32 %f257, %f202, %f250;
|
770 |
+
.loc 2 113 30
|
771 |
+
mul.f32 %f258, %f252, %f252;
|
772 |
+
.loc 2 113 38
|
773 |
+
mul.f32 %f259, %f258, 0f41000000;
|
774 |
+
.loc 2 113 22
|
775 |
+
fma.rn.f32 %f260, %f259, %f255, %f257;
|
776 |
+
$L__tmp15:
|
777 |
+
.loc 2 120 46
|
778 |
+
mov.b32 %r238, %f256;
|
779 |
+
shfl.sync.bfly.b32 %r239, %r238, 8, 31, -1;
|
780 |
+
mov.b32 %f261, %r239;
|
781 |
+
mov.b32 %r240, %f260;
|
782 |
+
shfl.sync.bfly.b32 %r241, %r240, 8, 31, -1;
|
783 |
+
mov.b32 %f262, %r241;
|
784 |
+
shfl.sync.bfly.b32 %r171, %r169, 8, 31, -1;
|
785 |
+
mov.b32 %f263, %r171;
|
786 |
+
$L__tmp16:
|
787 |
+
.loc 2 108 21
|
788 |
+
sub.f32 %f264, %f261, %f256;
|
789 |
+
.loc 2 109 28
|
790 |
+
add.f32 %f265, %f253, %f263;
|
791 |
+
.loc 2 110 39
|
792 |
+
setp.eq.f32 %p90, %f265, 0f00000000;
|
793 |
+
.loc 2 110 60
|
794 |
+
mov.b32 %r172, %f265;
|
795 |
+
div.full.f32 %r170, %r171, %r172;
|
796 |
+
mov.b32 %f266, %r170;
|
797 |
+
.loc 2 110 49
|
798 |
+
selp.f32 %f267, 0f00000000, %f266, %p90;
|
799 |
+
.loc 2 112 17
|
800 |
+
fma.rn.f32 %f268, %f264, %f267, %f256;
|
801 |
+
.loc 2 113 15
|
802 |
+
add.f32 %f269, %f260, %f262;
|
803 |
+
.loc 2 113 30
|
804 |
+
mul.f32 %f270, %f264, %f264;
|
805 |
+
.loc 2 113 38
|
806 |
+
mul.f32 %f271, %f253, %f270;
|
807 |
+
.loc 2 113 22
|
808 |
+
fma.rn.f32 %f272, %f267, %f271, %f269;
|
809 |
+
$L__tmp17:
|
810 |
+
.loc 2 120 46
|
811 |
+
mov.b32 %r242, %f268;
|
812 |
+
shfl.sync.bfly.b32 %r243, %r242, 4, 31, -1;
|
813 |
+
mov.b32 %f273, %r243;
|
814 |
+
mov.b32 %r244, %f272;
|
815 |
+
shfl.sync.bfly.b32 %r245, %r244, 4, 31, -1;
|
816 |
+
mov.b32 %f274, %r245;
|
817 |
+
shfl.sync.bfly.b32 %r174, %r172, 4, 31, -1;
|
818 |
+
mov.b32 %f275, %r174;
|
819 |
+
$L__tmp18:
|
820 |
+
.loc 2 108 21
|
821 |
+
sub.f32 %f276, %f273, %f268;
|
822 |
+
.loc 2 109 28
|
823 |
+
add.f32 %f277, %f265, %f275;
|
824 |
+
.loc 2 110 39
|
825 |
+
setp.eq.f32 %p91, %f277, 0f00000000;
|
826 |
+
.loc 2 110 60
|
827 |
+
mov.b32 %r175, %f277;
|
828 |
+
div.full.f32 %r173, %r174, %r175;
|
829 |
+
mov.b32 %f278, %r173;
|
830 |
+
.loc 2 110 49
|
831 |
+
selp.f32 %f279, 0f00000000, %f278, %p91;
|
832 |
+
.loc 2 112 17
|
833 |
+
fma.rn.f32 %f280, %f276, %f279, %f268;
|
834 |
+
.loc 2 113 15
|
835 |
+
add.f32 %f281, %f272, %f274;
|
836 |
+
.loc 2 113 30
|
837 |
+
mul.f32 %f282, %f276, %f276;
|
838 |
+
.loc 2 113 38
|
839 |
+
mul.f32 %f283, %f265, %f282;
|
840 |
+
.loc 2 113 22
|
841 |
+
fma.rn.f32 %f284, %f279, %f283, %f281;
|
842 |
+
$L__tmp19:
|
843 |
+
.loc 2 120 46
|
844 |
+
mov.b32 %r246, %f280;
|
845 |
+
shfl.sync.bfly.b32 %r247, %r246, 2, 31, -1;
|
846 |
+
mov.b32 %f285, %r247;
|
847 |
+
mov.b32 %r248, %f284;
|
848 |
+
shfl.sync.bfly.b32 %r249, %r248, 2, 31, -1;
|
849 |
+
mov.b32 %f286, %r249;
|
850 |
+
shfl.sync.bfly.b32 %r177, %r175, 2, 31, -1;
|
851 |
+
mov.b32 %f287, %r177;
|
852 |
+
$L__tmp20:
|
853 |
+
.loc 2 108 21
|
854 |
+
sub.f32 %f288, %f285, %f280;
|
855 |
+
.loc 2 109 28
|
856 |
+
add.f32 %f22, %f277, %f287;
|
857 |
+
.loc 2 110 39
|
858 |
+
setp.eq.f32 %p92, %f22, 0f00000000;
|
859 |
+
.loc 2 110 60
|
860 |
+
mov.b32 %r178, %f22;
|
861 |
+
div.full.f32 %r176, %r177, %r178;
|
862 |
+
mov.b32 %f289, %r176;
|
863 |
+
.loc 2 110 49
|
864 |
+
selp.f32 %f290, 0f00000000, %f289, %p92;
|
865 |
+
.loc 2 112 17
|
866 |
+
fma.rn.f32 %f23, %f288, %f290, %f280;
|
867 |
+
.loc 2 113 15
|
868 |
+
add.f32 %f291, %f284, %f286;
|
869 |
+
.loc 2 113 30
|
870 |
+
mul.f32 %f292, %f288, %f288;
|
871 |
+
.loc 2 113 38
|
872 |
+
mul.f32 %f293, %f277, %f292;
|
873 |
+
.loc 2 113 22
|
874 |
+
fma.rn.f32 %f24, %f290, %f293, %f291;
|
875 |
+
$L__tmp21:
|
876 |
+
.loc 2 120 46
|
877 |
+
mov.b32 %r250, %f23;
|
878 |
+
shfl.sync.bfly.b32 %r7, %r250, 1, 31, -1;
|
879 |
+
mov.b32 %r251, %f24;
|
880 |
+
shfl.sync.bfly.b32 %r8, %r251, 1, 31, -1;
|
881 |
+
shfl.sync.bfly.b32 %r180, %r178, 1, 31, -1;
|
882 |
+
mov.b32 %f294, %r180;
|
883 |
+
$L__tmp22:
|
884 |
+
.loc 2 109 28
|
885 |
+
add.f32 %f25, %f22, %f294;
|
886 |
+
.loc 2 110 60
|
887 |
+
mov.b32 %r181, %f25;
|
888 |
+
div.full.f32 %r179, %r180, %r181;
|
889 |
+
mov.b32 %f26, %r179;
|
890 |
+
$L__tmp23:
|
891 |
+
.loc 1 59 51
|
892 |
+
mov.u32 %r182, 0x0;
|
893 |
+
mov.u32 %r183, 0x0;
|
894 |
+
mov.u32 %r184, 0x0;
|
895 |
+
mov.u32 %r185, 0x0;
|
896 |
+
@%p93 ld.global.L1::evict_last.v4.b32 { %r182, %r183, %r184, %r185 }, [ %rd80 + 0 ];
|
897 |
+
@!%p93 mov.u32 %r182, %r257;
|
898 |
+
@!%p93 mov.u32 %r183, %r257;
|
899 |
+
@!%p93 mov.u32 %r184, %r257;
|
900 |
+
@!%p93 mov.u32 %r185, %r257;
|
901 |
+
mov.u32 %r190, 0x0;
|
902 |
+
mov.u32 %r191, 0x0;
|
903 |
+
mov.u32 %r192, 0x0;
|
904 |
+
mov.u32 %r193, 0x0;
|
905 |
+
@%p93 ld.global.L1::evict_last.v4.b32 { %r190, %r191, %r192, %r193 }, [ %rd81 + 0 ];
|
906 |
+
@!%p93 mov.u32 %r190, %r257;
|
907 |
+
@!%p93 mov.u32 %r191, %r257;
|
908 |
+
@!%p93 mov.u32 %r192, %r257;
|
909 |
+
@!%p93 mov.u32 %r193, %r257;
|
910 |
+
mov.u32 %r198, 0x0;
|
911 |
+
mov.u32 %r199, 0x0;
|
912 |
+
mov.u32 %r200, 0x0;
|
913 |
+
mov.u32 %r201, 0x0;
|
914 |
+
@%p93 ld.global.L1::evict_last.v4.b32 { %r198, %r199, %r200, %r201 }, [ %rd82 + 0 ];
|
915 |
+
@!%p93 mov.u32 %r198, %r257;
|
916 |
+
@!%p93 mov.u32 %r199, %r257;
|
917 |
+
@!%p93 mov.u32 %r200, %r257;
|
918 |
+
@!%p93 mov.u32 %r201, %r257;
|
919 |
+
mov.u32 %r206, 0x0;
|
920 |
+
mov.u32 %r207, 0x0;
|
921 |
+
mov.u32 %r208, 0x0;
|
922 |
+
mov.u32 %r209, 0x0;
|
923 |
+
@%p93 ld.global.L1::evict_last.v4.b32 { %r206, %r207, %r208, %r209 }, [ %rd83 + 0 ];
|
924 |
+
@!%p93 mov.u32 %r206, %r257;
|
925 |
+
@!%p93 mov.u32 %r207, %r257;
|
926 |
+
@!%p93 mov.u32 %r208, %r257;
|
927 |
+
@!%p93 mov.u32 %r209, %r257;
|
928 |
+
.loc 1 60 35
|
929 |
+
mul.wide.u32 %rd96, %r2, 4;
|
930 |
+
add.s64 %rd84, %rd13, %rd96;
|
931 |
+
.loc 1 60 40
|
932 |
+
mov.u32 %r214, 0x0;
|
933 |
+
@%p93 ld.global.L1::evict_last.b32 { %r214 }, [ %rd84 + 0 ];
|
934 |
+
@!%p93 mov.u32 %r214, %r257;
|
935 |
+
.loc 1 64 57
|
936 |
+
@%p39 bra $L__BB0_4;
|
937 |
+
mov.u64 %rd97, assertMessage_1;
|
938 |
+
cvta.global.u64 %rd98, %rd97;
|
939 |
+
mov.u64 %rd99, assertFile_1;
|
940 |
+
cvta.global.u64 %rd100, %rd99;
|
941 |
+
mov.u64 %rd101, assertFunc_1;
|
942 |
+
cvta.global.u64 %rd102, %rd101;
|
943 |
+
{ // callseq 9, 0
|
944 |
+
.reg .b32 temp_param_reg;
|
945 |
+
.param .b64 param0;
|
946 |
+
st.param.b64 [param0+0], %rd98;
|
947 |
+
.param .b64 param1;
|
948 |
+
st.param.b64 [param1+0], %rd100;
|
949 |
+
.param .b32 param2;
|
950 |
+
st.param.b32 [param2+0], %r374;
|
951 |
+
.param .b64 param3;
|
952 |
+
st.param.b64 [param3+0], %rd102;
|
953 |
+
.param .b64 param4;
|
954 |
+
st.param.b64 [param4+0], %rd112;
|
955 |
+
call.uni
|
956 |
+
__assertfail,
|
957 |
+
(
|
958 |
+
param0,
|
959 |
+
param1,
|
960 |
+
param2,
|
961 |
+
param3,
|
962 |
+
param4
|
963 |
+
);
|
964 |
+
} // callseq 9
|
965 |
+
$L__BB0_4:
|
966 |
+
$L__tmp24:
|
967 |
+
.loc 2 120 46
|
968 |
+
mov.b32 %f295, %r8;
|
969 |
+
$L__tmp25:
|
970 |
+
.loc 2 113 15
|
971 |
+
add.f32 %f296, %f24, %f295;
|
972 |
+
$L__tmp26:
|
973 |
+
.loc 2 120 46
|
974 |
+
mov.b32 %f297, %r7;
|
975 |
+
$L__tmp27:
|
976 |
+
.loc 2 108 21
|
977 |
+
sub.f32 %f298, %f297, %f23;
|
978 |
+
.loc 2 113 30
|
979 |
+
mul.f32 %f299, %f298, %f298;
|
980 |
+
.loc 2 113 38
|
981 |
+
mul.f32 %f300, %f22, %f299;
|
982 |
+
.loc 2 110 39
|
983 |
+
setp.eq.f32 %p115, %f25, 0f00000000;
|
984 |
+
.loc 2 110 49
|
985 |
+
selp.f32 %f301, 0f00000000, %f26, %p115;
|
986 |
+
.loc 2 113 22
|
987 |
+
fma.rn.f32 %f302, %f301, %f300, %f296;
|
988 |
+
$L__tmp28:
|
989 |
+
.loc 2 120 46
|
990 |
+
mov.b32 %f303, %r6;
|
991 |
+
$L__tmp29:
|
992 |
+
.loc 2 113 15
|
993 |
+
add.f32 %f304, %f19, %f303;
|
994 |
+
$L__tmp30:
|
995 |
+
.loc 2 120 46
|
996 |
+
mov.b32 %f305, %r5;
|
997 |
+
$L__tmp31:
|
998 |
+
.loc 2 108 21
|
999 |
+
sub.f32 %f306, %f305, %f18;
|
1000 |
+
.loc 2 113 30
|
1001 |
+
mul.f32 %f307, %f306, %f306;
|
1002 |
+
.loc 2 113 38
|
1003 |
+
mul.f32 %f308, %f17, %f307;
|
1004 |
+
.loc 2 110 39
|
1005 |
+
setp.eq.f32 %p116, %f20, 0f00000000;
|
1006 |
+
.loc 2 110 49
|
1007 |
+
selp.f32 %f309, 0f00000000, %f21, %p116;
|
1008 |
+
.loc 2 113 22
|
1009 |
+
fma.rn.f32 %f310, %f309, %f308, %f304;
|
1010 |
+
$L__tmp32:
|
1011 |
+
.loc 1 65 54
|
1012 |
+
mov.u32 %r253, 0x0;
|
1013 |
+
mov.u32 %r254, 0x0;
|
1014 |
+
mov.u32 %r255, 0x0;
|
1015 |
+
mov.u32 %r256, 0x0;
|
1016 |
+
@%p93 ld.global.L1::evict_first.v4.b32 { %r253, %r254, %r255, %r256 }, [ %rd104 + 0 ];
|
1017 |
+
@!%p93 mov.u32 %r253, %r257;
|
1018 |
+
@!%p93 mov.u32 %r254, %r257;
|
1019 |
+
@!%p93 mov.u32 %r255, %r257;
|
1020 |
+
@!%p93 mov.u32 %r256, %r257;
|
1021 |
+
mov.u32 %r261, 0x0;
|
1022 |
+
mov.u32 %r262, 0x0;
|
1023 |
+
mov.u32 %r263, 0x0;
|
1024 |
+
mov.u32 %r264, 0x0;
|
1025 |
+
@%p93 ld.global.L1::evict_first.v4.b32 { %r261, %r262, %r263, %r264 }, [ %rd105 + 0 ];
|
1026 |
+
@!%p93 mov.u32 %r261, %r257;
|
1027 |
+
@!%p93 mov.u32 %r262, %r257;
|
1028 |
+
@!%p93 mov.u32 %r263, %r257;
|
1029 |
+
@!%p93 mov.u32 %r264, %r257;
|
1030 |
+
mov.u32 %r269, 0x0;
|
1031 |
+
mov.u32 %r270, 0x0;
|
1032 |
+
mov.u32 %r271, 0x0;
|
1033 |
+
mov.u32 %r272, 0x0;
|
1034 |
+
@%p93 ld.global.L1::evict_first.v4.b32 { %r269, %r270, %r271, %r272 }, [ %rd106 + 0 ];
|
1035 |
+
@!%p93 mov.u32 %r269, %r257;
|
1036 |
+
@!%p93 mov.u32 %r270, %r257;
|
1037 |
+
@!%p93 mov.u32 %r271, %r257;
|
1038 |
+
@!%p93 mov.u32 %r272, %r257;
|
1039 |
+
mov.u32 %r277, 0x0;
|
1040 |
+
mov.u32 %r278, 0x0;
|
1041 |
+
mov.u32 %r279, 0x0;
|
1042 |
+
mov.u32 %r280, 0x0;
|
1043 |
+
@%p93 ld.global.L1::evict_first.v4.b32 { %r277, %r278, %r279, %r280 }, [ %rd107 + 0 ];
|
1044 |
+
@!%p93 mov.u32 %r277, %r257;
|
1045 |
+
@!%p93 mov.u32 %r278, %r257;
|
1046 |
+
@!%p93 mov.u32 %r279, %r257;
|
1047 |
+
@!%p93 mov.u32 %r280, %r257;
|
1048 |
+
.loc 1 69 23
|
1049 |
+
mov.b32 %r286, %f310;
|
1050 |
+
mov.b32 %r287, 1132462080;
|
1051 |
+
div.full.f32 %r285, %r286, %r287;
|
1052 |
+
mov.b32 %f311, %r285;
|
1053 |
+
mov.b32 %r310, %f302;
|
1054 |
+
div.full.f32 %r309, %r310, %r287;
|
1055 |
+
mov.b32 %f312, %r309;
|
1056 |
+
.loc 1 71 24
|
1057 |
+
add.f32 %f313, %f311, 0f3727C5AC;
|
1058 |
+
add.f32 %f314, %f312, 0f3727C5AC;
|
1059 |
+
.loc 1 72 30
|
1060 |
+
rsqrt.approx.ftz.f32 %f315, %f313;
|
1061 |
+
rsqrt.approx.ftz.f32 %f316, %f314;
|
1062 |
+
.loc 1 65 54
|
1063 |
+
mov.b32 %f317, %r280;
|
1064 |
+
.loc 1 59 51
|
1065 |
+
mov.b32 %f318, %r209;
|
1066 |
+
.loc 1 66 24
|
1067 |
+
add.f32 %f319, %f318, %f317;
|
1068 |
+
$L__tmp33:
|
1069 |
+
.loc 2 112 17
|
1070 |
+
fma.rn.f32 %f320, %f298, %f301, %f23;
|
1071 |
+
$L__tmp34:
|
1072 |
+
.loc 1 67 24
|
1073 |
+
sub.f32 %f321, %f319, %f320;
|
1074 |
+
.loc 1 65 54
|
1075 |
+
mov.b32 %f322, %r279;
|
1076 |
+
.loc 1 59 51
|
1077 |
+
mov.b32 %f323, %r208;
|
1078 |
+
.loc 1 66 24
|
1079 |
+
add.f32 %f324, %f323, %f322;
|
1080 |
+
.loc 1 67 24
|
1081 |
+
sub.f32 %f325, %f324, %f320;
|
1082 |
+
.loc 1 65 54
|
1083 |
+
mov.b32 %f326, %r278;
|
1084 |
+
.loc 1 59 51
|
1085 |
+
mov.b32 %f327, %r207;
|
1086 |
+
.loc 1 66 24
|
1087 |
+
add.f32 %f328, %f327, %f326;
|
1088 |
+
.loc 1 67 24
|
1089 |
+
sub.f32 %f329, %f328, %f320;
|
1090 |
+
.loc 1 65 54
|
1091 |
+
mov.b32 %f330, %r277;
|
1092 |
+
.loc 1 59 51
|
1093 |
+
mov.b32 %f331, %r206;
|
1094 |
+
.loc 1 66 24
|
1095 |
+
add.f32 %f332, %f331, %f330;
|
1096 |
+
.loc 1 67 24
|
1097 |
+
sub.f32 %f333, %f332, %f320;
|
1098 |
+
.loc 1 65 54
|
1099 |
+
mov.b32 %f334, %r272;
|
1100 |
+
.loc 1 59 51
|
1101 |
+
mov.b32 %f335, %r201;
|
1102 |
+
.loc 1 66 24
|
1103 |
+
add.f32 %f336, %f335, %f334;
|
1104 |
+
.loc 1 67 24
|
1105 |
+
sub.f32 %f337, %f336, %f320;
|
1106 |
+
.loc 1 65 54
|
1107 |
+
mov.b32 %f338, %r271;
|
1108 |
+
.loc 1 59 51
|
1109 |
+
mov.b32 %f339, %r200;
|
1110 |
+
.loc 1 66 24
|
1111 |
+
add.f32 %f340, %f339, %f338;
|
1112 |
+
.loc 1 67 24
|
1113 |
+
sub.f32 %f341, %f340, %f320;
|
1114 |
+
.loc 1 65 54
|
1115 |
+
mov.b32 %f342, %r270;
|
1116 |
+
.loc 1 59 51
|
1117 |
+
mov.b32 %f343, %r199;
|
1118 |
+
.loc 1 66 24
|
1119 |
+
add.f32 %f344, %f343, %f342;
|
1120 |
+
.loc 1 67 24
|
1121 |
+
sub.f32 %f345, %f344, %f320;
|
1122 |
+
.loc 1 65 54
|
1123 |
+
mov.b32 %f346, %r269;
|
1124 |
+
.loc 1 59 51
|
1125 |
+
mov.b32 %f347, %r198;
|
1126 |
+
.loc 1 66 24
|
1127 |
+
add.f32 %f348, %f347, %f346;
|
1128 |
+
.loc 1 67 24
|
1129 |
+
sub.f32 %f349, %f348, %f320;
|
1130 |
+
.loc 1 65 54
|
1131 |
+
mov.b32 %f350, %r264;
|
1132 |
+
.loc 1 59 51
|
1133 |
+
mov.b32 %f351, %r193;
|
1134 |
+
.loc 1 66 24
|
1135 |
+
add.f32 %f352, %f351, %f350;
|
1136 |
+
$L__tmp35:
|
1137 |
+
.loc 2 112 17
|
1138 |
+
fma.rn.f32 %f353, %f306, %f309, %f18;
|
1139 |
+
$L__tmp36:
|
1140 |
+
.loc 1 67 24
|
1141 |
+
sub.f32 %f354, %f352, %f353;
|
1142 |
+
.loc 1 65 54
|
1143 |
+
mov.b32 %f355, %r263;
|
1144 |
+
.loc 1 59 51
|
1145 |
+
mov.b32 %f356, %r192;
|
1146 |
+
.loc 1 66 24
|
1147 |
+
add.f32 %f357, %f356, %f355;
|
1148 |
+
.loc 1 67 24
|
1149 |
+
sub.f32 %f358, %f357, %f353;
|
1150 |
+
.loc 1 65 54
|
1151 |
+
mov.b32 %f359, %r262;
|
1152 |
+
.loc 1 59 51
|
1153 |
+
mov.b32 %f360, %r191;
|
1154 |
+
.loc 1 66 24
|
1155 |
+
add.f32 %f361, %f360, %f359;
|
1156 |
+
.loc 1 67 24
|
1157 |
+
sub.f32 %f362, %f361, %f353;
|
1158 |
+
.loc 1 65 54
|
1159 |
+
mov.b32 %f363, %r261;
|
1160 |
+
.loc 1 59 51
|
1161 |
+
mov.b32 %f364, %r190;
|
1162 |
+
.loc 1 66 24
|
1163 |
+
add.f32 %f365, %f364, %f363;
|
1164 |
+
.loc 1 67 24
|
1165 |
+
sub.f32 %f366, %f365, %f353;
|
1166 |
+
.loc 1 65 54
|
1167 |
+
mov.b32 %f367, %r256;
|
1168 |
+
.loc 1 59 51
|
1169 |
+
mov.b32 %f368, %r185;
|
1170 |
+
.loc 1 66 24
|
1171 |
+
add.f32 %f369, %f368, %f367;
|
1172 |
+
.loc 1 67 24
|
1173 |
+
sub.f32 %f370, %f369, %f353;
|
1174 |
+
.loc 1 65 54
|
1175 |
+
mov.b32 %f371, %r255;
|
1176 |
+
.loc 1 59 51
|
1177 |
+
mov.b32 %f372, %r184;
|
1178 |
+
.loc 1 66 24
|
1179 |
+
add.f32 %f373, %f372, %f371;
|
1180 |
+
.loc 1 67 24
|
1181 |
+
sub.f32 %f374, %f373, %f353;
|
1182 |
+
.loc 1 65 54
|
1183 |
+
mov.b32 %f375, %r254;
|
1184 |
+
.loc 1 59 51
|
1185 |
+
mov.b32 %f376, %r183;
|
1186 |
+
.loc 1 66 24
|
1187 |
+
add.f32 %f377, %f376, %f375;
|
1188 |
+
.loc 1 67 24
|
1189 |
+
sub.f32 %f378, %f377, %f353;
|
1190 |
+
.loc 1 65 54
|
1191 |
+
mov.b32 %f379, %r253;
|
1192 |
+
.loc 1 59 51
|
1193 |
+
mov.b32 %f380, %r182;
|
1194 |
+
.loc 1 66 24
|
1195 |
+
add.f32 %f381, %f380, %f379;
|
1196 |
+
.loc 1 67 24
|
1197 |
+
sub.f32 %f382, %f381, %f353;
|
1198 |
+
.loc 1 73 24
|
1199 |
+
mul.f32 %f383, %f382, %f315;
|
1200 |
+
mul.f32 %f384, %f378, %f315;
|
1201 |
+
mul.f32 %f385, %f374, %f315;
|
1202 |
+
mul.f32 %f386, %f370, %f315;
|
1203 |
+
mul.f32 %f387, %f366, %f315;
|
1204 |
+
mul.f32 %f388, %f362, %f315;
|
1205 |
+
mul.f32 %f389, %f358, %f315;
|
1206 |
+
mul.f32 %f390, %f354, %f315;
|
1207 |
+
mul.f32 %f391, %f349, %f316;
|
1208 |
+
mul.f32 %f392, %f345, %f316;
|
1209 |
+
mul.f32 %f393, %f341, %f316;
|
1210 |
+
mul.f32 %f394, %f337, %f316;
|
1211 |
+
mul.f32 %f395, %f333, %f316;
|
1212 |
+
mul.f32 %f396, %f329, %f316;
|
1213 |
+
mul.f32 %f397, %f325, %f316;
|
1214 |
+
mul.f32 %f398, %f321, %f316;
|
1215 |
+
.loc 1 74 24
|
1216 |
+
shl.b32 %r357, %r2, 2;
|
1217 |
+
mov.u32 %r358, global_smem;
|
1218 |
+
add.s32 %r359, %r358, %r357;
|
1219 |
+
st.shared.u32 [%r359], %r214;
|
1220 |
+
bar.sync 0;
|
1221 |
+
shl.b32 %r360, %r1, 2;
|
1222 |
+
add.s32 %r361, %r358, %r360;
|
1223 |
+
ld.shared.v4.f32 {%f399, %f400, %f401, %f402}, [%r361];
|
1224 |
+
ld.shared.v4.f32 {%f403, %f404, %f405, %f406}, [%r361+16];
|
1225 |
+
mul.f32 %f407, %f383, %f399;
|
1226 |
+
mul.f32 %f408, %f384, %f400;
|
1227 |
+
mul.f32 %f409, %f385, %f401;
|
1228 |
+
mul.f32 %f410, %f386, %f402;
|
1229 |
+
mul.f32 %f411, %f387, %f403;
|
1230 |
+
mul.f32 %f412, %f388, %f404;
|
1231 |
+
mul.f32 %f413, %f389, %f405;
|
1232 |
+
mul.f32 %f414, %f390, %f406;
|
1233 |
+
mul.f32 %f415, %f391, %f399;
|
1234 |
+
mul.f32 %f416, %f392, %f400;
|
1235 |
+
mul.f32 %f417, %f393, %f401;
|
1236 |
+
mul.f32 %f418, %f394, %f402;
|
1237 |
+
mul.f32 %f419, %f395, %f403;
|
1238 |
+
mul.f32 %f420, %f396, %f404;
|
1239 |
+
mul.f32 %f421, %f397, %f405;
|
1240 |
+
mul.f32 %f422, %f398, %f406;
|
1241 |
+
.loc 1 76 39
|
1242 |
+
shl.b32 %r362, %r3, 8;
|
1243 |
+
shl.b32 %r363, %r4, 8;
|
1244 |
+
.loc 1 76 35
|
1245 |
+
or.b32 %r364, %r362, %r1;
|
1246 |
+
or.b32 %r365, %r363, %r1;
|
1247 |
+
.loc 1 76 29
|
1248 |
+
mul.wide.s32 %rd110, %r364, 2;
|
1249 |
+
add.s64 %rd108, %rd14, %rd110;
|
1250 |
+
mul.wide.s32 %rd111, %r365, 2;
|
1251 |
+
add.s64 %rd109, %rd14, %rd111;
|
1252 |
+
.loc 1 76 52
|
1253 |
+
mov.b32 %r333, %f407;
|
1254 |
+
cvt.rn.bf16.f32 %rs1, %r333;
|
1255 |
+
mov.b32 %r334, %f408;
|
1256 |
+
cvt.rn.bf16.f32 %rs2, %r334;
|
1257 |
+
mov.b32 %r335, %f409;
|
1258 |
+
cvt.rn.bf16.f32 %rs3, %r335;
|
1259 |
+
mov.b32 %r336, %f410;
|
1260 |
+
cvt.rn.bf16.f32 %rs4, %r336;
|
1261 |
+
mov.b32 %r337, %f411;
|
1262 |
+
cvt.rn.bf16.f32 %rs5, %r337;
|
1263 |
+
mov.b32 %r338, %f412;
|
1264 |
+
cvt.rn.bf16.f32 %rs6, %r338;
|
1265 |
+
mov.b32 %r339, %f413;
|
1266 |
+
cvt.rn.bf16.f32 %rs7, %r339;
|
1267 |
+
mov.b32 %r340, %f414;
|
1268 |
+
cvt.rn.bf16.f32 %rs8, %r340;
|
1269 |
+
mov.b32 %r341, %f415;
|
1270 |
+
cvt.rn.bf16.f32 %rs9, %r341;
|
1271 |
+
mov.b32 %r342, %f416;
|
1272 |
+
cvt.rn.bf16.f32 %rs10, %r342;
|
1273 |
+
mov.b32 %r343, %f417;
|
1274 |
+
cvt.rn.bf16.f32 %rs11, %r343;
|
1275 |
+
mov.b32 %r344, %f418;
|
1276 |
+
cvt.rn.bf16.f32 %rs12, %r344;
|
1277 |
+
mov.b32 %r345, %f419;
|
1278 |
+
cvt.rn.bf16.f32 %rs13, %r345;
|
1279 |
+
mov.b32 %r346, %f420;
|
1280 |
+
cvt.rn.bf16.f32 %rs14, %r346;
|
1281 |
+
mov.b32 %r347, %f421;
|
1282 |
+
cvt.rn.bf16.f32 %rs15, %r347;
|
1283 |
+
mov.b32 %r348, %f422;
|
1284 |
+
cvt.rn.bf16.f32 %rs16, %r348;
|
1285 |
+
mov.b32 %r366, {%rs1, %rs2};
|
1286 |
+
mov.b32 %r367, {%rs3, %rs4};
|
1287 |
+
mov.b32 %r368, {%rs5, %rs6};
|
1288 |
+
mov.b32 %r369, {%rs7, %rs8};
|
1289 |
+
@%p93 st.global.v4.b32 [ %rd108 + 0 ], { %r366, %r367, %r368, %r369 };
|
1290 |
+
mov.b32 %r370, {%rs9, %rs10};
|
1291 |
+
mov.b32 %r371, {%rs11, %rs12};
|
1292 |
+
mov.b32 %r372, {%rs13, %rs14};
|
1293 |
+
mov.b32 %r373, {%rs15, %rs16};
|
1294 |
+
@%p93 st.global.v4.b32 [ %rd109 + 0 ], { %r370, %r371, %r372, %r373 };
|
1295 |
+
.loc 1 55 4
|
1296 |
+
ret;
|
1297 |
+
$L__tmp37:
|
1298 |
+
$L__func_end0:
|
1299 |
+
|
1300 |
+
}
|
1301 |
+
// .globl __nv_rsqrtf
|
1302 |
+
.visible .func (.param .b32 func_retval0) __nv_rsqrtf(
|
1303 |
+
.param .b32 __nv_rsqrtf_param_0
|
1304 |
+
)
|
1305 |
+
{
|
1306 |
+
.reg .f32 %f<3>;
|
1307 |
+
$L__func_begin1:
|
1308 |
+
|
1309 |
+
ld.param.f32 %f1, [__nv_rsqrtf_param_0];
|
1310 |
+
rsqrt.approx.ftz.f32 %f2, %f1;
|
1311 |
+
st.param.f32 [func_retval0+0], %f2;
|
1312 |
+
ret;
|
1313 |
+
$L__func_end1:
|
1314 |
+
|
1315 |
+
}
|
1316 |
+
.file 1 "/tmp/torchinductor_root/gx/cgx5lxpuexpindj4dsmjz5x42uhyy7iskevq7ovzpwagb3t5powj.py"
|
1317 |
+
.file 2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
|
1318 |
+
.section .debug_abbrev
|
1319 |
+
{
|
1320 |
+
.b8 1
|
1321 |
+
.b8 17
|
1322 |
+
.b8 1
|
1323 |
+
.b8 37
|
1324 |
+
.b8 8
|
1325 |
+
.b8 19
|
1326 |
+
.b8 5
|
1327 |
+
.b8 3
|
1328 |
+
.b8 8
|
1329 |
+
.b8 16
|
1330 |
+
.b8 6
|
1331 |
+
.b8 27
|
1332 |
+
.b8 8
|
1333 |
+
.b8 180
|
1334 |
+
.b8 66
|
1335 |
+
.b8 12
|
1336 |
+
.b8 17
|
1337 |
+
.b8 1
|
1338 |
+
.b8 18
|
1339 |
+
.b8 1
|
1340 |
+
.b8 0
|
1341 |
+
.b8 0
|
1342 |
+
.b8 2
|
1343 |
+
.b8 46
|
1344 |
+
.b8 0
|
1345 |
+
.b8 135
|
1346 |
+
.b8 64
|
1347 |
+
.b8 8
|
1348 |
+
.b8 3
|
1349 |
+
.b8 8
|
1350 |
+
.b8 58
|
1351 |
+
.b8 11
|
1352 |
+
.b8 59
|
1353 |
+
.b8 11
|
1354 |
+
.b8 63
|
1355 |
+
.b8 12
|
1356 |
+
.b8 32
|
1357 |
+
.b8 11
|
1358 |
+
.b8 0
|
1359 |
+
.b8 0
|
1360 |
+
.b8 3
|
1361 |
+
.b8 46
|
1362 |
+
.b8 1
|
1363 |
+
.b8 17
|
1364 |
+
.b8 1
|
1365 |
+
.b8 18
|
1366 |
+
.b8 1
|
1367 |
+
.b8 64
|
1368 |
+
.b8 10
|
1369 |
+
.b8 49
|
1370 |
+
.b8 19
|
1371 |
+
.b8 0
|
1372 |
+
.b8 0
|
1373 |
+
.b8 4
|
1374 |
+
.b8 29
|
1375 |
+
.b8 0
|
1376 |
+
.b8 49
|
1377 |
+
.b8 19
|
1378 |
+
.b8 17
|
1379 |
+
.b8 1
|
1380 |
+
.b8 18
|
1381 |
+
.b8 1
|
1382 |
+
.b8 88
|
1383 |
+
.b8 11
|
1384 |
+
.b8 89
|
1385 |
+
.b8 11
|
1386 |
+
.b8 87
|
1387 |
+
.b8 11
|
1388 |
+
.b8 0
|
1389 |
+
.b8 0
|
1390 |
+
.b8 5
|
1391 |
+
.b8 29
|
1392 |
+
.b8 1
|
1393 |
+
.b8 49
|
1394 |
+
.b8 19
|
1395 |
+
.b8 17
|
1396 |
+
.b8 1
|
1397 |
+
.b8 18
|
1398 |
+
.b8 1
|
1399 |
+
.b8 88
|
1400 |
+
.b8 11
|
1401 |
+
.b8 89
|
1402 |
+
.b8 11
|
1403 |
+
.b8 87
|
1404 |
+
.b8 11
|
1405 |
+
.b8 0
|
1406 |
+
.b8 0
|
1407 |
+
.b8 0
|
1408 |
+
}
|
1409 |
+
.section .debug_info
|
1410 |
+
{
|
1411 |
+
.b32 298
|
1412 |
+
.b8 2
|
1413 |
+
.b8 0
|
1414 |
+
.b32 .debug_abbrev
|
1415 |
+
.b8 8
|
1416 |
+
.b8 1
|
1417 |
+
.b8 116
|
1418 |
+
.b8 114
|
1419 |
+
.b8 105
|
1420 |
+
.b8 116
|
1421 |
+
.b8 111
|
1422 |
+
.b8 110
|
1423 |
+
.b8 0
|
1424 |
+
.b8 2
|
1425 |
+
.b8 0
|
1426 |
+
.b8 99
|
1427 |
+
.b8 103
|
1428 |
+
.b8 120
|
1429 |
+
.b8 53
|
1430 |
+
.b8 108
|
1431 |
+
.b8 120
|
1432 |
+
.b8 112
|
1433 |
+
.b8 117
|
1434 |
+
.b8 101
|
1435 |
+
.b8 120
|
1436 |
+
.b8 112
|
1437 |
+
.b8 105
|
1438 |
+
.b8 110
|
1439 |
+
.b8 100
|
1440 |
+
.b8 106
|
1441 |
+
.b8 52
|
1442 |
+
.b8 100
|
1443 |
+
.b8 115
|
1444 |
+
.b8 109
|
1445 |
+
.b8 106
|
1446 |
+
.b8 122
|
1447 |
+
.b8 53
|
1448 |
+
.b8 120
|
1449 |
+
.b8 52
|
1450 |
+
.b8 50
|
1451 |
+
.b8 117
|
1452 |
+
.b8 104
|
1453 |
+
.b8 121
|
1454 |
+
.b8 121
|
1455 |
+
.b8 55
|
1456 |
+
.b8 105
|
1457 |
+
.b8 115
|
1458 |
+
.b8 107
|
1459 |
+
.b8 101
|
1460 |
+
.b8 118
|
1461 |
+
.b8 113
|
1462 |
+
.b8 55
|
1463 |
+
.b8 111
|
1464 |
+
.b8 118
|
1465 |
+
.b8 122
|
1466 |
+
.b8 112
|
1467 |
+
.b8 119
|
1468 |
+
.b8 97
|
1469 |
+
.b8 103
|
1470 |
+
.b8 98
|
1471 |
+
.b8 51
|
1472 |
+
.b8 116
|
1473 |
+
.b8 53
|
1474 |
+
.b8 112
|
1475 |
+
.b8 111
|
1476 |
+
.b8 119
|
1477 |
+
.b8 106
|
1478 |
+
.b8 46
|
1479 |
+
.b8 112
|
1480 |
+
.b8 121
|
1481 |
+
.b8 0
|
1482 |
+
.b32 .debug_line
|
1483 |
+
.b8 47
|
1484 |
+
.b8 116
|
1485 |
+
.b8 109
|
1486 |
+
.b8 112
|
1487 |
+
.b8 47
|
1488 |
+
.b8 116
|
1489 |
+
.b8 111
|
1490 |
+
.b8 114
|
1491 |
+
.b8 99
|
1492 |
+
.b8 104
|
1493 |
+
.b8 105
|
1494 |
+
.b8 110
|
1495 |
+
.b8 100
|
1496 |
+
.b8 117
|
1497 |
+
.b8 99
|
1498 |
+
.b8 116
|
1499 |
+
.b8 111
|
1500 |
+
.b8 114
|
1501 |
+
.b8 95
|
1502 |
+
.b8 114
|
1503 |
+
.b8 111
|
1504 |
+
.b8 111
|
1505 |
+
.b8 116
|
1506 |
+
.b8 47
|
1507 |
+
.b8 103
|
1508 |
+
.b8 120
|
1509 |
+
.b8 0
|
1510 |
+
.b8 1
|
1511 |
+
.b64 $L__func_begin0
|
1512 |
+
.b64 $L__func_end0
|
1513 |
+
.b8 2
|
1514 |
+
.b8 116
|
1515 |
+
.b8 114
|
1516 |
+
.b8 105
|
1517 |
+
.b8 116
|
1518 |
+
.b8 111
|
1519 |
+
.b8 110
|
1520 |
+
.b8 95
|
1521 |
+
.b8 95
|
1522 |
+
.b8 48
|
1523 |
+
.b8 100
|
1524 |
+
.b8 49
|
1525 |
+
.b8 100
|
1526 |
+
.b8 50
|
1527 |
+
.b8 100
|
1528 |
+
.b8 51
|
1529 |
+
.b8 100
|
1530 |
+
.b8 52
|
1531 |
+
.b8 100
|
1532 |
+
.b8 53
|
1533 |
+
.b8 100
|
1534 |
+
.b8 101
|
1535 |
+
.b8 54
|
1536 |
+
.b8 100
|
1537 |
+
.b8 101
|
1538 |
+
.b8 0
|
1539 |
+
.b8 116
|
1540 |
+
.b8 114
|
1541 |
+
.b8 105
|
1542 |
+
.b8 116
|
1543 |
+
.b8 111
|
1544 |
+
.b8 110
|
1545 |
+
.b8 95
|
1546 |
+
.b8 95
|
1547 |
+
.b8 48
|
1548 |
+
.b8 100
|
1549 |
+
.b8 49
|
1550 |
+
.b8 100
|
1551 |
+
.b8 50
|
1552 |
+
.b8 100
|
1553 |
+
.b8 51
|
1554 |
+
.b8 100
|
1555 |
+
.b8 52
|
1556 |
+
.b8 100
|
1557 |
+
.b8 53
|
1558 |
+
.b8 100
|
1559 |
+
.b8 101
|
1560 |
+
.b8 54
|
1561 |
+
.b8 100
|
1562 |
+
.b8 101
|
1563 |
+
.b8 0
|
1564 |
+
.b8 1
|
1565 |
+
.b8 18
|
1566 |
+
.b8 1
|
1567 |
+
.b8 1
|
1568 |
+
.b8 3
|
1569 |
+
.b64 $L__func_begin0
|
1570 |
+
.b64 $L__func_end0
|
1571 |
+
.b8 1
|
1572 |
+
.b8 156
|
1573 |
+
.b32 125
|
1574 |
+
.b8 4
|
1575 |
+
.b32 125
|
1576 |
+
.b64 $L__tmp1
|
1577 |
+
.b64 $L__tmp2
|
1578 |
+
.b8 2
|
1579 |
+
.b8 44
|
1580 |
+
.b8 38
|
1581 |
+
.b8 5
|
1582 |
+
.b32 125
|
1583 |
+
.b64 $L__tmp2
|
1584 |
+
.b64 $L__tmp36
|
1585 |
+
.b8 2
|
1586 |
+
.b8 50
|
1587 |
+
.b8 41
|
1588 |
+
.b8 4
|
1589 |
+
.b32 125
|
1590 |
+
.b64 $L__tmp2
|
1591 |
+
.b64 $L__tmp36
|
1592 |
+
.b8 2
|
1593 |
+
.b8 120
|
1594 |
+
.b8 46
|
1595 |
+
.b8 0
|
1596 |
+
.b8 4
|
1597 |
+
.b32 125
|
1598 |
+
.b64 $L__tmp3
|
1599 |
+
.b64 $L__tmp31
|
1600 |
+
.b8 2
|
1601 |
+
.b8 50
|
1602 |
+
.b8 41
|
1603 |
+
.b8 0
|
1604 |
+
.b8 0
|
1605 |
+
}
|
1606 |
+
.section .debug_pubnames
|
1607 |
+
{
|
1608 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
1609 |
+
$L__pubNames_start0:
|
1610 |
+
.b8 2
|
1611 |
+
.b8 0
|
1612 |
+
.b32 .debug_info
|
1613 |
+
.b32 302
|
1614 |
+
.b32 125
|
1615 |
+
.b8 116
|
1616 |
+
.b8 114
|
1617 |
+
.b8 105
|
1618 |
+
.b8 116
|
1619 |
+
.b8 111
|
1620 |
+
.b8 110
|
1621 |
+
.b8 95
|
1622 |
+
.b8 95
|
1623 |
+
.b8 48
|
1624 |
+
.b8 100
|
1625 |
+
.b8 49
|
1626 |
+
.b8 100
|
1627 |
+
.b8 50
|
1628 |
+
.b8 100
|
1629 |
+
.b8 51
|
1630 |
+
.b8 100
|
1631 |
+
.b8 52
|
1632 |
+
.b8 100
|
1633 |
+
.b8 53
|
1634 |
+
.b8 100
|
1635 |
+
.b8 101
|
1636 |
+
.b8 54
|
1637 |
+
.b8 100
|
1638 |
+
.b8 101
|
1639 |
+
.b8 0
|
1640 |
+
.b32 0
|
1641 |
+
$L__pubNames_end0:
|
1642 |
+
}
|
1643 |
+
.section .debug_pubtypes
|
1644 |
+
{
|
1645 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
1646 |
+
$L__pubTypes_start0:
|
1647 |
+
.b8 2
|
1648 |
+
.b8 0
|
1649 |
+
.b32 .debug_info
|
1650 |
+
.b32 302
|
1651 |
+
.b32 0
|
1652 |
+
$L__pubTypes_end0:
|
1653 |
+
}
|
1654 |
+
.section .debug_loc { }
|
.triton/dump/345a87a492fd703c73ab83265a21fcb6/triton_.ttgir
ADDED
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 32], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
2 |
+
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
3 |
+
#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 8], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
4 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
5 |
+
tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
6 |
+
%cst = arith.constant dense<512> : tensor<16x1xi32, #blocked>
|
7 |
+
%cst_0 = arith.constant dense<256> : tensor<1x256xi32, #blocked>
|
8 |
+
%cst_1 = arith.constant dense<256> : tensor<16x1xi32, #blocked>
|
9 |
+
%cst_2 = arith.constant dense<1.000000e+00> : tensor<1x256xf32, #blocked>
|
10 |
+
%cst_3 = arith.constant dense<0.000000e+00> : tensor<1x256xf32, #blocked>
|
11 |
+
%cst_4 = arith.constant dense<256> : tensor<16x1xi64, #blocked>
|
12 |
+
%cst_5 = arith.constant dense<50257> : tensor<16x1xi64, #blocked>
|
13 |
+
%cst_6 = arith.constant dense<0> : tensor<16x1xi64, #blocked>
|
14 |
+
%cst_7 = arith.constant dense<0> : tensor<16x1xi64, #blocked1>
|
15 |
+
%cst_8 = arith.constant dense<50257> : tensor<16x1xi64, #blocked1>
|
16 |
+
%cst_9 = arith.constant 0.000000e+00 : f32
|
17 |
+
%cst_10 = arith.constant dense<0.000000e+00> : tensor<1x256xf32, #blocked2>
|
18 |
+
%cst_11 = arith.constant dense<256> : tensor<1x256xi32, #blocked2>
|
19 |
+
%cst_12 = arith.constant dense<9.99999974E-6> : tensor<16x1xf32, #blocked>
|
20 |
+
%cst_13 = arith.constant dense<2.560000e+02> : tensor<16x1xf32, #blocked>
|
21 |
+
%cst_14 = arith.constant dense<0.000000e+00> : tensor<16x256xf32, #blocked>
|
22 |
+
%c16_i32 = arith.constant 16 : i32
|
23 |
+
%0 = tt.get_program_id x : i32
|
24 |
+
%1 = arith.muli %0, %c16_i32 : i32
|
25 |
+
%2 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
|
26 |
+
%3 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
|
27 |
+
%4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16x1xi32, #blocked>
|
28 |
+
%5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<16x1xi32, #blocked1>
|
29 |
+
%6 = tt.splat %1 : (i32) -> tensor<16x1xi32, #blocked>
|
30 |
+
%7 = tt.splat %1 : (i32) -> tensor<16x1xi32, #blocked1>
|
31 |
+
%8 = arith.addi %6, %4 : tensor<16x1xi32, #blocked>
|
32 |
+
%9 = arith.addi %7, %5 : tensor<16x1xi32, #blocked1>
|
33 |
+
%10 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
|
34 |
+
%11 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>
|
35 |
+
%12 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x256xi32, #blocked>
|
36 |
+
%13 = tt.expand_dims %11 {axis = 0 : i32} : (tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>) -> tensor<1x256xi32, #blocked2>
|
37 |
+
%14 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<16x1x!tt.ptr<i64, 1>, #blocked>
|
38 |
+
%15 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<16x1x!tt.ptr<i64, 1>, #blocked1>
|
39 |
+
%16 = tt.addptr %14, %8 : tensor<16x1x!tt.ptr<i64, 1>, #blocked>, tensor<16x1xi32, #blocked>
|
40 |
+
%17 = tt.addptr %15, %9 : tensor<16x1x!tt.ptr<i64, 1>, #blocked1>, tensor<16x1xi32, #blocked1>
|
41 |
+
%18 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x1xi64, #blocked>
|
42 |
+
%19 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x1xi64, #blocked1>
|
43 |
+
%20 = arith.remsi %8, %cst : tensor<16x1xi32, #blocked>
|
44 |
+
%21 = arith.cmpi slt, %12, %cst_0 : tensor<1x256xi32, #blocked>
|
45 |
+
%22 = arith.cmpi slt, %13, %cst_11 : tensor<1x256xi32, #blocked2>
|
46 |
+
%23 = arith.muli %20, %cst_1 : tensor<16x1xi32, #blocked>
|
47 |
+
%24 = tt.broadcast %12 : (tensor<1x256xi32, #blocked>) -> tensor<16x256xi32, #blocked>
|
48 |
+
%25 = tt.broadcast %23 : (tensor<16x1xi32, #blocked>) -> tensor<16x256xi32, #blocked>
|
49 |
+
%26 = arith.addi %24, %25 : tensor<16x256xi32, #blocked>
|
50 |
+
%27 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<16x256x!tt.ptr<f32, 1>, #blocked>
|
51 |
+
%28 = tt.addptr %27, %26 : tensor<16x256x!tt.ptr<f32, 1>, #blocked>, tensor<16x256xi32, #blocked>
|
52 |
+
%29 = tt.broadcast %21 : (tensor<1x256xi1, #blocked>) -> tensor<16x256xi1, #blocked>
|
53 |
+
%30 = tt.load %28, %29, %cst_14 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x256xf32, #blocked>
|
54 |
+
%31 = arith.addi %18, %cst_5 : tensor<16x1xi64, #blocked>
|
55 |
+
%32 = arith.addi %19, %cst_8 : tensor<16x1xi64, #blocked1>
|
56 |
+
%33 = arith.cmpi slt, %18, %cst_6 : tensor<16x1xi64, #blocked>
|
57 |
+
%34 = arith.cmpi slt, %19, %cst_7 : tensor<16x1xi64, #blocked1>
|
58 |
+
%35 = arith.select %33, %31, %18 : tensor<16x1xi1, #blocked>, tensor<16x1xi64, #blocked>
|
59 |
+
%36 = arith.select %34, %32, %19 : tensor<16x1xi1, #blocked1>, tensor<16x1xi64, #blocked1>
|
60 |
+
%37 = arith.cmpi sge, %36, %cst_7 : tensor<16x1xi64, #blocked1>
|
61 |
+
%38 = arith.cmpi slt, %36, %cst_8 : tensor<16x1xi64, #blocked1>
|
62 |
+
%39 = arith.andi %37, %38 : tensor<16x1xi1, #blocked1>
|
63 |
+
tt.assert %39, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<16x1xi1, #blocked1>
|
64 |
+
%40 = arith.muli %35, %cst_4 : tensor<16x1xi64, #blocked>
|
65 |
+
%41 = tt.broadcast %40 : (tensor<16x1xi64, #blocked>) -> tensor<16x256xi64, #blocked>
|
66 |
+
%42 = arith.extsi %12 : tensor<1x256xi32, #blocked> to tensor<1x256xi64, #blocked>
|
67 |
+
%43 = tt.broadcast %42 : (tensor<1x256xi64, #blocked>) -> tensor<16x256xi64, #blocked>
|
68 |
+
%44 = arith.addi %43, %41 : tensor<16x256xi64, #blocked>
|
69 |
+
%45 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<16x256x!tt.ptr<f32, 1>, #blocked>
|
70 |
+
%46 = tt.addptr %45, %44 : tensor<16x256x!tt.ptr<f32, 1>, #blocked>, tensor<16x256xi64, #blocked>
|
71 |
+
%47 = tt.load %46, %29, %cst_14 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x256xf32, #blocked>
|
72 |
+
%48 = arith.addf %47, %30 : tensor<16x256xf32, #blocked>
|
73 |
+
%49 = arith.addf %48, %cst_14 : tensor<16x256xf32, #blocked>
|
74 |
+
%50 = arith.subf %48, %49 : tensor<16x256xf32, #blocked>
|
75 |
+
%51 = arith.mulf %48, %50 : tensor<16x256xf32, #blocked>
|
76 |
+
%52 = arith.addf %51, %cst_14 : tensor<16x256xf32, #blocked>
|
77 |
+
%53 = arith.select %29, %49, %cst_14 : tensor<16x256xi1, #blocked>, tensor<16x256xf32, #blocked>
|
78 |
+
%54 = arith.select %29, %52, %cst_14 : tensor<16x256xi1, #blocked>, tensor<16x256xf32, #blocked>
|
79 |
+
%55 = arith.select %21, %cst_2, %cst_3 : tensor<1x256xi1, #blocked>, tensor<1x256xf32, #blocked>
|
80 |
+
%56 = tt.broadcast %55 : (tensor<1x256xf32, #blocked>) -> tensor<16x256xf32, #blocked>
|
81 |
+
%57:3 = "tt.reduce"(%53, %54, %56) <{axis = 1 : i32}> ({
|
82 |
+
^bb0(%arg7: f32, %arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32):
|
83 |
+
%82 = arith.subf %arg10, %arg7 : f32
|
84 |
+
%83 = arith.addf %arg9, %arg12 : f32
|
85 |
+
%84 = arith.cmpf oeq, %83, %cst_9 : f32
|
86 |
+
%85 = arith.divf %arg12, %83 : f32
|
87 |
+
%86 = arith.select %84, %cst_9, %85 : f32
|
88 |
+
%87 = arith.mulf %82, %86 : f32
|
89 |
+
%88 = arith.addf %arg7, %87 : f32
|
90 |
+
%89 = arith.addf %arg8, %arg11 : f32
|
91 |
+
%90 = arith.mulf %82, %82 : f32
|
92 |
+
%91 = arith.mulf %90, %arg9 : f32
|
93 |
+
%92 = arith.mulf %91, %86 : f32
|
94 |
+
%93 = arith.addf %89, %92 : f32
|
95 |
+
tt.reduce.return %88, %93, %83 : f32, f32, f32
|
96 |
+
}) : (tensor<16x256xf32, #blocked>, tensor<16x256xf32, #blocked>, tensor<16x256xf32, #blocked>) -> (tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>)
|
97 |
+
%58 = tt.expand_dims %57#0 {axis = 1 : i32} : (tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16x1xf32, #blocked>
|
98 |
+
%59 = tt.expand_dims %57#1 {axis = 1 : i32} : (tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16x1xf32, #blocked>
|
99 |
+
%60 = tt.load %28, %29, %cst_14 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x256xf32, #blocked>
|
100 |
+
%61 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<1x256x!tt.ptr<f32, 1>, #blocked2>
|
101 |
+
%62 = tt.addptr %61, %13 : tensor<1x256x!tt.ptr<f32, 1>, #blocked2>, tensor<1x256xi32, #blocked2>
|
102 |
+
%63 = tt.load %62, %22, %cst_10 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32, #blocked2>
|
103 |
+
tt.assert %39, "index out of bounds: 0 <= tmp13 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<16x1xi1, #blocked1>
|
104 |
+
%64 = tt.load %46, %29, %cst_14 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<16x256xf32, #blocked>
|
105 |
+
%65 = arith.addf %64, %60 : tensor<16x256xf32, #blocked>
|
106 |
+
%66 = tt.broadcast %58 : (tensor<16x1xf32, #blocked>) -> tensor<16x256xf32, #blocked>
|
107 |
+
%67 = arith.subf %65, %66 : tensor<16x256xf32, #blocked>
|
108 |
+
%68 = arith.divf %59, %cst_13 : tensor<16x1xf32, #blocked>
|
109 |
+
%69 = arith.addf %68, %cst_12 : tensor<16x1xf32, #blocked>
|
110 |
+
%70 = tt.extern_elementwise %69 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<16x1xf32, #blocked>) -> tensor<16x1xf32, #blocked>
|
111 |
+
%71 = tt.broadcast %70 : (tensor<16x1xf32, #blocked>) -> tensor<16x256xf32, #blocked>
|
112 |
+
%72 = arith.mulf %67, %71 : tensor<16x256xf32, #blocked>
|
113 |
+
%73 = triton_gpu.convert_layout %63 : (tensor<1x256xf32, #blocked2>) -> tensor<1x256xf32, #blocked>
|
114 |
+
%74 = tt.broadcast %73 : (tensor<1x256xf32, #blocked>) -> tensor<16x256xf32, #blocked>
|
115 |
+
%75 = arith.mulf %72, %74 : tensor<16x256xf32, #blocked>
|
116 |
+
%76 = arith.muli %8, %cst_1 : tensor<16x1xi32, #blocked>
|
117 |
+
%77 = tt.broadcast %76 : (tensor<16x1xi32, #blocked>) -> tensor<16x256xi32, #blocked>
|
118 |
+
%78 = arith.addi %24, %77 : tensor<16x256xi32, #blocked>
|
119 |
+
%79 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<16x256x!tt.ptr<bf16, 1>, #blocked>
|
120 |
+
%80 = tt.addptr %79, %78 : tensor<16x256x!tt.ptr<bf16, 1>, #blocked>, tensor<16x256xi32, #blocked>
|
121 |
+
%81 = arith.truncf %75 : tensor<16x256xf32, #blocked> to tensor<16x256xbf16, #blocked>
|
122 |
+
tt.store %80, %81, %29 {cache = 1 : i32, evict = 1 : i32} : tensor<16x256xbf16, #blocked>
|
123 |
+
tt.return
|
124 |
+
}
|
125 |
+
}
|
.triton/dump/345a87a492fd703c73ab83265a21fcb6/triton_.ttir
ADDED
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
3 |
+
%cst = arith.constant dense<1.000000e+00> : tensor<1x256xf32>
|
4 |
+
%cst_0 = arith.constant dense<0.000000e+00> : tensor<1x256xf32>
|
5 |
+
%cst_1 = arith.constant 0.000000e+00 : f32
|
6 |
+
%cst_2 = arith.constant dense<256> : tensor<16x1xi64>
|
7 |
+
%cst_3 = arith.constant dense<50257> : tensor<16x1xi64>
|
8 |
+
%cst_4 = arith.constant dense<0> : tensor<16x1xi64>
|
9 |
+
%cst_5 = arith.constant dense<9.99999974E-6> : tensor<16x1xf32>
|
10 |
+
%cst_6 = arith.constant dense<2.560000e+02> : tensor<16x1xf32>
|
11 |
+
%cst_7 = arith.constant dense<0.000000e+00> : tensor<16x256xf32>
|
12 |
+
%cst_8 = arith.constant dense<256> : tensor<16x1xi32>
|
13 |
+
%cst_9 = arith.constant dense<256> : tensor<1x256xi32>
|
14 |
+
%cst_10 = arith.constant dense<512> : tensor<16x1xi32>
|
15 |
+
%c16_i32 = arith.constant 16 : i32
|
16 |
+
%0 = tt.get_program_id x : i32
|
17 |
+
%1 = arith.muli %0, %c16_i32 : i32
|
18 |
+
%2 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32>
|
19 |
+
%3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<16xi32>) -> tensor<16x1xi32>
|
20 |
+
%4 = tt.splat %1 : (i32) -> tensor<16x1xi32>
|
21 |
+
%5 = arith.addi %4, %3 : tensor<16x1xi32>
|
22 |
+
%6 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
|
23 |
+
%7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<256xi32>) -> tensor<1x256xi32>
|
24 |
+
%8 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<16x1x!tt.ptr<i64, 1>>
|
25 |
+
%9 = tt.addptr %8, %5 : tensor<16x1x!tt.ptr<i64, 1>>, tensor<16x1xi32>
|
26 |
+
%10 = tt.load %9 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x1xi64>
|
27 |
+
%11 = arith.remsi %5, %cst_10 : tensor<16x1xi32>
|
28 |
+
%12 = arith.cmpi slt, %7, %cst_9 : tensor<1x256xi32>
|
29 |
+
%13 = arith.muli %11, %cst_8 : tensor<16x1xi32>
|
30 |
+
%14 = tt.broadcast %7 : (tensor<1x256xi32>) -> tensor<16x256xi32>
|
31 |
+
%15 = tt.broadcast %13 : (tensor<16x1xi32>) -> tensor<16x256xi32>
|
32 |
+
%16 = arith.addi %14, %15 : tensor<16x256xi32>
|
33 |
+
%17 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<16x256x!tt.ptr<f32, 1>>
|
34 |
+
%18 = tt.addptr %17, %16 : tensor<16x256x!tt.ptr<f32, 1>>, tensor<16x256xi32>
|
35 |
+
%19 = tt.broadcast %12 : (tensor<1x256xi1>) -> tensor<16x256xi1>
|
36 |
+
%20 = tt.load %18, %19, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x256xf32>
|
37 |
+
%21 = arith.addi %10, %cst_3 : tensor<16x1xi64>
|
38 |
+
%22 = arith.cmpi slt, %10, %cst_4 : tensor<16x1xi64>
|
39 |
+
%23 = arith.select %22, %21, %10 : tensor<16x1xi1>, tensor<16x1xi64>
|
40 |
+
%24 = arith.cmpi sge, %23, %cst_4 : tensor<16x1xi64>
|
41 |
+
%25 = arith.cmpi slt, %23, %cst_3 : tensor<16x1xi64>
|
42 |
+
%26 = arith.andi %24, %25 : tensor<16x1xi1>
|
43 |
+
tt.assert %26, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<16x1xi1>
|
44 |
+
%27 = arith.muli %23, %cst_2 : tensor<16x1xi64>
|
45 |
+
%28 = tt.broadcast %27 : (tensor<16x1xi64>) -> tensor<16x256xi64>
|
46 |
+
%29 = arith.extsi %7 : tensor<1x256xi32> to tensor<1x256xi64>
|
47 |
+
%30 = tt.broadcast %29 : (tensor<1x256xi64>) -> tensor<16x256xi64>
|
48 |
+
%31 = arith.addi %30, %28 : tensor<16x256xi64>
|
49 |
+
%32 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<16x256x!tt.ptr<f32, 1>>
|
50 |
+
%33 = tt.addptr %32, %31 : tensor<16x256x!tt.ptr<f32, 1>>, tensor<16x256xi64>
|
51 |
+
%34 = tt.load %33, %19, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x256xf32>
|
52 |
+
%35 = arith.addf %34, %20 : tensor<16x256xf32>
|
53 |
+
%36 = arith.addf %35, %cst_7 : tensor<16x256xf32>
|
54 |
+
%37 = arith.subf %35, %36 : tensor<16x256xf32>
|
55 |
+
%38 = arith.mulf %35, %37 : tensor<16x256xf32>
|
56 |
+
%39 = arith.addf %38, %cst_7 : tensor<16x256xf32>
|
57 |
+
%40 = arith.select %19, %36, %cst_7 : tensor<16x256xi1>, tensor<16x256xf32>
|
58 |
+
%41 = arith.select %19, %39, %cst_7 : tensor<16x256xi1>, tensor<16x256xf32>
|
59 |
+
%42 = arith.select %12, %cst, %cst_0 : tensor<1x256xi1>, tensor<1x256xf32>
|
60 |
+
%43 = tt.broadcast %42 : (tensor<1x256xf32>) -> tensor<16x256xf32>
|
61 |
+
%44:3 = "tt.reduce"(%40, %41, %43) <{axis = 1 : i32}> ({
|
62 |
+
^bb0(%arg7: f32, %arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32):
|
63 |
+
%68 = arith.subf %arg10, %arg7 : f32
|
64 |
+
%69 = arith.addf %arg9, %arg12 : f32
|
65 |
+
%70 = arith.cmpf oeq, %69, %cst_1 : f32
|
66 |
+
%71 = arith.divf %arg12, %69 : f32
|
67 |
+
%72 = arith.select %70, %cst_1, %71 : f32
|
68 |
+
%73 = arith.mulf %68, %72 : f32
|
69 |
+
%74 = arith.addf %arg7, %73 : f32
|
70 |
+
%75 = arith.addf %arg8, %arg11 : f32
|
71 |
+
%76 = arith.mulf %68, %68 : f32
|
72 |
+
%77 = arith.mulf %76, %arg9 : f32
|
73 |
+
%78 = arith.mulf %77, %72 : f32
|
74 |
+
%79 = arith.addf %75, %78 : f32
|
75 |
+
tt.reduce.return %74, %79, %69 : f32, f32, f32
|
76 |
+
}) : (tensor<16x256xf32>, tensor<16x256xf32>, tensor<16x256xf32>) -> (tensor<16xf32>, tensor<16xf32>, tensor<16xf32>)
|
77 |
+
%45 = tt.expand_dims %44#0 {axis = 1 : i32} : (tensor<16xf32>) -> tensor<16x1xf32>
|
78 |
+
%46 = tt.expand_dims %44#1 {axis = 1 : i32} : (tensor<16xf32>) -> tensor<16x1xf32>
|
79 |
+
%47 = tt.load %18, %19, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x256xf32>
|
80 |
+
%48 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<1x256x!tt.ptr<f32, 1>>
|
81 |
+
%49 = tt.addptr %48, %7 : tensor<1x256x!tt.ptr<f32, 1>>, tensor<1x256xi32>
|
82 |
+
%50 = tt.load %49, %12, %cst_0 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32>
|
83 |
+
tt.assert %26, "index out of bounds: 0 <= tmp13 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<16x1xi1>
|
84 |
+
%51 = tt.load %33, %19, %cst_7 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<16x256xf32>
|
85 |
+
%52 = arith.addf %51, %47 : tensor<16x256xf32>
|
86 |
+
%53 = tt.broadcast %45 : (tensor<16x1xf32>) -> tensor<16x256xf32>
|
87 |
+
%54 = arith.subf %52, %53 : tensor<16x256xf32>
|
88 |
+
%55 = arith.divf %46, %cst_6 : tensor<16x1xf32>
|
89 |
+
%56 = arith.addf %55, %cst_5 : tensor<16x1xf32>
|
90 |
+
%57 = tt.extern_elementwise %56 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<16x1xf32>) -> tensor<16x1xf32>
|
91 |
+
%58 = tt.broadcast %57 : (tensor<16x1xf32>) -> tensor<16x256xf32>
|
92 |
+
%59 = arith.mulf %54, %58 : tensor<16x256xf32>
|
93 |
+
%60 = tt.broadcast %50 : (tensor<1x256xf32>) -> tensor<16x256xf32>
|
94 |
+
%61 = arith.mulf %59, %60 : tensor<16x256xf32>
|
95 |
+
%62 = arith.muli %5, %cst_8 : tensor<16x1xi32>
|
96 |
+
%63 = tt.broadcast %62 : (tensor<16x1xi32>) -> tensor<16x256xi32>
|
97 |
+
%64 = arith.addi %14, %63 : tensor<16x256xi32>
|
98 |
+
%65 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<16x256x!tt.ptr<bf16, 1>>
|
99 |
+
%66 = tt.addptr %65, %64 : tensor<16x256x!tt.ptr<bf16, 1>>, tensor<16x256xi32>
|
100 |
+
%67 = arith.truncf %61 : tensor<16x256xf32> to tensor<16x256xbf16>
|
101 |
+
tt.store %66, %67, %19 {cache = 1 : i32, evict = 1 : i32} : tensor<16x256xbf16>
|
102 |
+
tt.return
|
103 |
+
}
|
104 |
+
}
|
.triton/dump/3cd3b6d7993c56f7d0340d40c84f737c/triton_.llir
ADDED
@@ -0,0 +1,355 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
@assertFunc_1 = internal constant [25 x i8] c"_call_with_frames_removed"
|
5 |
+
@assertFile_1 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
|
6 |
+
@assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp16 < 50257"
|
7 |
+
@assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed"
|
8 |
+
@assertFile_0 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
|
9 |
+
@assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257"
|
10 |
+
@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
|
11 |
+
|
12 |
+
declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr
|
13 |
+
|
14 |
+
define void @triton__0d1d2d3d4d5d6de7de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, i32 %6, i32 %7) local_unnamed_addr !dbg !7 {
|
15 |
+
%9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
|
16 |
+
%10 = lshr i32 %9, 2, !dbg !10
|
17 |
+
%11 = and i32 %10, 63, !dbg !10
|
18 |
+
%12 = and i32 %9, 63, !dbg !10
|
19 |
+
%13 = and i32 %9, 3, !dbg !11
|
20 |
+
%14 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #5, !dbg !12
|
21 |
+
%15 = shl i32 %14, 6, !dbg !13
|
22 |
+
%16 = or i32 %15, %11, !dbg !14
|
23 |
+
%17 = or i32 %15, %12, !dbg !14
|
24 |
+
%18 = sext i32 %16 to i64, !dbg !15
|
25 |
+
%19 = getelementptr i64, ptr addrspace(1) %0, i64 %18, !dbg !15
|
26 |
+
%20 = sext i32 %17 to i64, !dbg !15
|
27 |
+
%21 = getelementptr i64, ptr addrspace(1) %0, i64 %20, !dbg !15
|
28 |
+
%22 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %19, i1 true) #5, !dbg !16
|
29 |
+
%23 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %21, i1 true) #5, !dbg !16
|
30 |
+
%24 = srem i32 %16, 512, !dbg !17
|
31 |
+
%25 = shl nsw i32 %24, 8, !dbg !18
|
32 |
+
%26 = shl i32 %16, 8, !dbg !19
|
33 |
+
%27 = add i64 %23, 50257, !dbg !20
|
34 |
+
%28 = icmp slt i64 %22, 0, !dbg !21
|
35 |
+
%29 = icmp slt i64 %23, 0, !dbg !21
|
36 |
+
%30 = select i1 %29, i64 %27, i64 %23, !dbg !22
|
37 |
+
%.fr8 = freeze i64 %30, !dbg !23
|
38 |
+
%31 = icmp ugt i64 %.fr8, 50256, !dbg !23
|
39 |
+
%32 = shl i64 %22, 8, !dbg !24
|
40 |
+
%33 = add i64 %32, 12865792, !dbg !24
|
41 |
+
%34 = select i1 %28, i64 %33, i64 %32, !dbg !24
|
42 |
+
%35 = getelementptr float, ptr addrspace(1) %1, i64 %34
|
43 |
+
br i1 %31, label %.split.us, label %.split, !dbg !25
|
44 |
+
|
45 |
+
.split.us: ; preds = %8, %.split.us
|
46 |
+
%36 = phi float [ %58, %.split.us ], [ 0.000000e+00, %8 ]
|
47 |
+
%37 = phi float [ %63, %.split.us ], [ 0.000000e+00, %8 ]
|
48 |
+
%38 = phi float [ %60, %.split.us ], [ 0.000000e+00, %8 ]
|
49 |
+
%39 = phi i32 [ %64, %.split.us ], [ 0, %8 ]
|
50 |
+
%40 = or i32 %39, %13, !dbg !26
|
51 |
+
%41 = add i32 %40, %25, !dbg !27
|
52 |
+
%42 = sext i32 %41 to i64, !dbg !28
|
53 |
+
%43 = getelementptr float, ptr addrspace(1) %2, i64 %42, !dbg !28
|
54 |
+
%44 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %43, i1 true, i32 0, i1 true) #5, !dbg !29
|
55 |
+
%45 = bitcast i32 %44 to float, !dbg !29
|
56 |
+
%46 = add i32 %40, %26, !dbg !30
|
57 |
+
%47 = sext i32 %46 to i64, !dbg !31
|
58 |
+
%48 = getelementptr i16, ptr addrspace(1) %3, i64 %47, !dbg !31
|
59 |
+
%49 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %48, i1 true, i16 0, i1 true) #5, !dbg !32
|
60 |
+
%50 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %49) #5, !dbg !33
|
61 |
+
tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !25
|
62 |
+
%51 = zext nneg i32 %40 to i64, !dbg !34
|
63 |
+
%52 = getelementptr float, ptr addrspace(1) %35, i64 %51, !dbg !35
|
64 |
+
%53 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %52, i1 true, i32 0, i1 true) #5, !dbg !36
|
65 |
+
%54 = bitcast i32 %53 to float, !dbg !36
|
66 |
+
%55 = fadd float %45, %54, !dbg !37
|
67 |
+
%56 = fadd float %50, %55, !dbg !38
|
68 |
+
%57 = fsub float %56, %38, !dbg !39
|
69 |
+
%58 = fadd float %36, 1.000000e+00, !dbg !43
|
70 |
+
%59 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %57, float %58) #5, !dbg !44
|
71 |
+
%60 = fadd float %38, %59, !dbg !45
|
72 |
+
%61 = fsub float %56, %60, !dbg !46
|
73 |
+
%62 = fmul float %57, %61, !dbg !47
|
74 |
+
%63 = fadd float %37, %62, !dbg !48
|
75 |
+
%64 = add nuw nsw i32 %39, 4, !dbg !49
|
76 |
+
%65 = icmp ult i32 %39, 252, !dbg !49
|
77 |
+
br i1 %65, label %.split.us, label %.split5.us, !dbg !49
|
78 |
+
|
79 |
+
.split: ; preds = %8, %.split
|
80 |
+
%66 = phi float [ %88, %.split ], [ 0.000000e+00, %8 ]
|
81 |
+
%67 = phi float [ %93, %.split ], [ 0.000000e+00, %8 ]
|
82 |
+
%68 = phi float [ %90, %.split ], [ 0.000000e+00, %8 ]
|
83 |
+
%69 = phi i32 [ %94, %.split ], [ 0, %8 ]
|
84 |
+
%70 = or i32 %69, %13, !dbg !26
|
85 |
+
%71 = add i32 %70, %25, !dbg !27
|
86 |
+
%72 = sext i32 %71 to i64, !dbg !28
|
87 |
+
%73 = getelementptr float, ptr addrspace(1) %2, i64 %72, !dbg !28
|
88 |
+
%74 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %73, i1 true, i32 0, i1 true) #5, !dbg !29
|
89 |
+
%75 = bitcast i32 %74 to float, !dbg !29
|
90 |
+
%76 = add i32 %70, %26, !dbg !30
|
91 |
+
%77 = sext i32 %76 to i64, !dbg !31
|
92 |
+
%78 = getelementptr i16, ptr addrspace(1) %3, i64 %77, !dbg !31
|
93 |
+
%79 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %78, i1 true, i16 0, i1 true) #5, !dbg !32
|
94 |
+
%80 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %79) #5, !dbg !33
|
95 |
+
%81 = zext nneg i32 %70 to i64, !dbg !34
|
96 |
+
%82 = getelementptr float, ptr addrspace(1) %35, i64 %81, !dbg !35
|
97 |
+
%83 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %82, i1 true, i32 0, i1 true) #5, !dbg !36
|
98 |
+
%84 = bitcast i32 %83 to float, !dbg !36
|
99 |
+
%85 = fadd float %75, %84, !dbg !37
|
100 |
+
%86 = fadd float %80, %85, !dbg !38
|
101 |
+
%87 = fsub float %86, %68, !dbg !39
|
102 |
+
%88 = fadd float %66, 1.000000e+00, !dbg !43
|
103 |
+
%89 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %87, float %88) #5, !dbg !44
|
104 |
+
%90 = fadd float %68, %89, !dbg !45
|
105 |
+
%91 = fsub float %86, %90, !dbg !46
|
106 |
+
%92 = fmul float %87, %91, !dbg !47
|
107 |
+
%93 = fadd float %67, %92, !dbg !48
|
108 |
+
%94 = add nuw nsw i32 %69, 4, !dbg !49
|
109 |
+
%95 = icmp ult i32 %69, 252, !dbg !49
|
110 |
+
br i1 %95, label %.split, label %.split5.us, !dbg !49
|
111 |
+
|
112 |
+
.split5.us: ; preds = %.split, %.split.us
|
113 |
+
%.us-phi = phi float [ %60, %.split.us ], [ %90, %.split ]
|
114 |
+
%.us-phi6 = phi float [ %63, %.split.us ], [ %93, %.split ]
|
115 |
+
%.us-phi7 = phi float [ %58, %.split.us ], [ %88, %.split ]
|
116 |
+
%96 = bitcast float %.us-phi to i32, !dbg !50
|
117 |
+
%97 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %96, i32 2, i32 31), !dbg !50
|
118 |
+
%98 = bitcast i32 %97 to float, !dbg !50
|
119 |
+
%99 = bitcast float %.us-phi6 to i32, !dbg !50
|
120 |
+
%100 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %99, i32 2, i32 31), !dbg !50
|
121 |
+
%101 = bitcast i32 %100 to float, !dbg !50
|
122 |
+
%102 = bitcast float %.us-phi7 to i32, !dbg !50
|
123 |
+
%103 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %102, i32 2, i32 31), !dbg !50
|
124 |
+
%104 = bitcast i32 %103 to float, !dbg !50
|
125 |
+
%105 = fsub float %98, %.us-phi, !dbg !52
|
126 |
+
%106 = fadd float %.us-phi7, %104, !dbg !56
|
127 |
+
%107 = fcmp oeq float %106, 0.000000e+00, !dbg !57
|
128 |
+
%108 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %104, float %106) #5, !dbg !58
|
129 |
+
%109 = select i1 %107, float 0.000000e+00, float %108, !dbg !59
|
130 |
+
%110 = fmul float %105, %109, !dbg !60
|
131 |
+
%111 = fadd float %.us-phi, %110, !dbg !61
|
132 |
+
%112 = fadd float %.us-phi6, %101, !dbg !62
|
133 |
+
%113 = fmul float %105, %105, !dbg !63
|
134 |
+
%114 = fmul float %.us-phi7, %113, !dbg !64
|
135 |
+
%115 = fmul float %114, %109, !dbg !65
|
136 |
+
%116 = fadd float %112, %115, !dbg !66
|
137 |
+
%117 = bitcast float %111 to i32, !dbg !50
|
138 |
+
%118 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %117, i32 1, i32 31), !dbg !50
|
139 |
+
%119 = bitcast i32 %118 to float, !dbg !50
|
140 |
+
%120 = bitcast float %116 to i32, !dbg !50
|
141 |
+
%121 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %120, i32 1, i32 31), !dbg !50
|
142 |
+
%122 = bitcast i32 %121 to float, !dbg !50
|
143 |
+
%123 = bitcast float %106 to i32, !dbg !50
|
144 |
+
%124 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %123, i32 1, i32 31), !dbg !50
|
145 |
+
%125 = bitcast i32 %124 to float, !dbg !50
|
146 |
+
%126 = fsub float %119, %111, !dbg !52
|
147 |
+
%127 = fadd float %106, %125, !dbg !56
|
148 |
+
%128 = fcmp oeq float %127, 0.000000e+00, !dbg !57
|
149 |
+
%129 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %125, float %127) #5, !dbg !58
|
150 |
+
%130 = select i1 %128, float 0.000000e+00, float %129, !dbg !59
|
151 |
+
%131 = fmul float %126, %130, !dbg !60
|
152 |
+
%132 = fadd float %111, %131, !dbg !61
|
153 |
+
%133 = fadd float %116, %122, !dbg !62
|
154 |
+
%134 = fmul float %126, %126, !dbg !63
|
155 |
+
%135 = fmul float %106, %134, !dbg !64
|
156 |
+
%136 = fmul float %130, %135, !dbg !65
|
157 |
+
%137 = fadd float %133, %136, !dbg !66
|
158 |
+
%138 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %137, float 2.560000e+02) #5, !dbg !67
|
159 |
+
%139 = fadd float %138, 0x3EE4F8B580000000, !dbg !68
|
160 |
+
br label %140, !dbg !69
|
161 |
+
|
162 |
+
140: ; preds = %.split5.us, %__nv_rsqrtf.exit
|
163 |
+
%141 = phi i32 [ 0, %.split5.us ], [ %174, %__nv_rsqrtf.exit ]
|
164 |
+
%142 = or i32 %141, %13, !dbg !70
|
165 |
+
%143 = add i32 %142, %25, !dbg !71
|
166 |
+
%144 = sext i32 %143 to i64, !dbg !72
|
167 |
+
%145 = getelementptr float, ptr addrspace(1) %2, i64 %144, !dbg !72
|
168 |
+
%146 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %145, i1 true, i32 0, i1 true) #5, !dbg !73
|
169 |
+
%147 = bitcast i32 %146 to float, !dbg !73
|
170 |
+
%148 = add i32 %142, %26, !dbg !74
|
171 |
+
%149 = sext i32 %148 to i64, !dbg !75
|
172 |
+
%150 = getelementptr i16, ptr addrspace(1) %3, i64 %149, !dbg !75
|
173 |
+
%151 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %150, i1 true, i16 0, i1 true) #5, !dbg !76
|
174 |
+
%152 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %151) #5, !dbg !77
|
175 |
+
%153 = zext nneg i32 %142 to i64, !dbg !78
|
176 |
+
%154 = getelementptr float, ptr addrspace(1) %4, i64 %153, !dbg !78
|
177 |
+
%155 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %154, i1 true, i32 0, i1 true) #5, !dbg !79
|
178 |
+
%156 = bitcast i32 %155 to float, !dbg !79
|
179 |
+
br i1 %31, label %157, label %158, !dbg !80
|
180 |
+
|
181 |
+
157: ; preds = %140
|
182 |
+
tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 883, ptr nonnull @assertFunc_1, i64 1), !dbg !80
|
183 |
+
br label %158, !dbg !80
|
184 |
+
|
185 |
+
158: ; preds = %157, %140
|
186 |
+
%159 = getelementptr float, ptr addrspace(1) %35, i64 %153, !dbg !81
|
187 |
+
%160 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %159, i1 true, i32 0, i1 true) #5, !dbg !82
|
188 |
+
%161 = bitcast i32 %160 to float, !dbg !82
|
189 |
+
%162 = fadd float %147, %161, !dbg !83
|
190 |
+
%163 = fadd float %152, %162, !dbg !84
|
191 |
+
%164 = fsub float %163, %132, !dbg !85
|
192 |
+
%165 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !86
|
193 |
+
%.not.i = icmp eq i32 %165, 0, !dbg !86
|
194 |
+
br i1 %.not.i, label %168, label %166, !dbg !86
|
195 |
+
|
196 |
+
166: ; preds = %158
|
197 |
+
%167 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %139), !dbg !86
|
198 |
+
br label %__nv_rsqrtf.exit, !dbg !86
|
199 |
+
|
200 |
+
168: ; preds = %158
|
201 |
+
%169 = tail call float @llvm.nvvm.rsqrt.approx.f(float %139), !dbg !86
|
202 |
+
br label %__nv_rsqrtf.exit, !dbg !86
|
203 |
+
|
204 |
+
__nv_rsqrtf.exit: ; preds = %166, %168
|
205 |
+
%.0.i = phi float [ %167, %166 ], [ %169, %168 ], !dbg !86
|
206 |
+
%170 = fmul float %164, %.0.i, !dbg !87
|
207 |
+
%171 = fmul float %170, %156, !dbg !88
|
208 |
+
%172 = getelementptr i16, ptr addrspace(1) %5, i64 %149, !dbg !89
|
209 |
+
%173 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %171) #5, !dbg !90
|
210 |
+
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %173, ptr addrspace(1) %172, i1 true) #5, !dbg !90
|
211 |
+
%174 = add nuw nsw i32 %141, 4, !dbg !69
|
212 |
+
%175 = icmp ult i32 %141, 252, !dbg !69
|
213 |
+
br i1 %175, label %140, label %176, !dbg !69
|
214 |
+
|
215 |
+
176: ; preds = %__nv_rsqrtf.exit
|
216 |
+
ret void, !dbg !91
|
217 |
+
}
|
218 |
+
|
219 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
220 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
221 |
+
|
222 |
+
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
|
223 |
+
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
|
224 |
+
|
225 |
+
; Function Attrs: alwaysinline nounwind
|
226 |
+
define float @__nv_rsqrtf(float %x) local_unnamed_addr #2 {
|
227 |
+
%1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5
|
228 |
+
%.not = icmp eq i32 %1, 0
|
229 |
+
br i1 %.not, label %4, label %2
|
230 |
+
|
231 |
+
2: ; preds = %0
|
232 |
+
%3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
|
233 |
+
br label %6
|
234 |
+
|
235 |
+
4: ; preds = %0
|
236 |
+
%5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
|
237 |
+
br label %6
|
238 |
+
|
239 |
+
6: ; preds = %4, %2
|
240 |
+
%.0 = phi float [ %3, %2 ], [ %5, %4 ]
|
241 |
+
ret float %.0
|
242 |
+
}
|
243 |
+
|
244 |
+
declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #3
|
245 |
+
|
246 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
|
247 |
+
declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #4
|
248 |
+
|
249 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
|
250 |
+
declare float @llvm.nvvm.rsqrt.approx.f(float) #4
|
251 |
+
|
252 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
253 |
+
attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
|
254 |
+
attributes #2 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
255 |
+
attributes #3 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
256 |
+
attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
|
257 |
+
attributes #5 = { nounwind }
|
258 |
+
|
259 |
+
!llvm.module.flags = !{!0, !1}
|
260 |
+
!llvm.dbg.cu = !{!2}
|
261 |
+
!nvvm.annotations = !{!4, !5, !5, !4}
|
262 |
+
!llvm.ident = !{!6}
|
263 |
+
|
264 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
265 |
+
!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
|
266 |
+
!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
267 |
+
!3 = !DIFile(filename: "cpn3lawg65lpi63gv6c6pn4oikhg6qva2h2qjdpxe6qj4lvttwez.py", directory: "/tmp/torchinductor_root/pn")
|
268 |
+
!4 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"kernel", i32 1}
|
269 |
+
!5 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"maxntidx", i32 256}
|
270 |
+
!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
|
271 |
+
!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6de7de", linkageName: "triton__0d1d2d3d4d5d6de7de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
|
272 |
+
!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
|
273 |
+
!9 = !{}
|
274 |
+
!10 = !DILocation(line: 22, column: 44, scope: !7)
|
275 |
+
!11 = !DILocation(line: 24, column: 33, scope: !7)
|
276 |
+
!12 = !DILocation(line: 21, column: 28, scope: !7)
|
277 |
+
!13 = !DILocation(line: 21, column: 33, scope: !7)
|
278 |
+
!14 = !DILocation(line: 22, column: 23, scope: !7)
|
279 |
+
!15 = !DILocation(line: 26, column: 30, scope: !7)
|
280 |
+
!16 = !DILocation(line: 26, column: 35, scope: !7)
|
281 |
+
!17 = !DILocation(line: 27, column: 18, scope: !7)
|
282 |
+
!18 = !DILocation(line: 35, column: 44, scope: !7)
|
283 |
+
!19 = !DILocation(line: 36, column: 44, scope: !7)
|
284 |
+
!20 = !DILocation(line: 37, column: 22, scope: !7)
|
285 |
+
!21 = !DILocation(line: 38, column: 22, scope: !7)
|
286 |
+
!22 = !DILocation(line: 39, column: 36, scope: !7)
|
287 |
+
!23 = !DILocation(line: 40, column: 40, scope: !7)
|
288 |
+
!24 = !DILocation(line: 41, column: 44, scope: !7)
|
289 |
+
!25 = !DILocation(line: 40, column: 55, scope: !7)
|
290 |
+
!26 = !DILocation(line: 32, column: 27, scope: !7)
|
291 |
+
!27 = !DILocation(line: 35, column: 40, scope: !7)
|
292 |
+
!28 = !DILocation(line: 35, column: 34, scope: !7)
|
293 |
+
!29 = !DILocation(line: 35, column: 50, scope: !7)
|
294 |
+
!30 = !DILocation(line: 36, column: 40, scope: !7)
|
295 |
+
!31 = !DILocation(line: 36, column: 34, scope: !7)
|
296 |
+
!32 = !DILocation(line: 36, column: 50, scope: !7)
|
297 |
+
!33 = !DILocation(line: 36, column: 101, scope: !7)
|
298 |
+
!34 = !DILocation(line: 41, column: 40, scope: !7)
|
299 |
+
!35 = !DILocation(line: 41, column: 34, scope: !7)
|
300 |
+
!36 = !DILocation(line: 41, column: 52, scope: !7)
|
301 |
+
!37 = !DILocation(line: 42, column: 22, scope: !7)
|
302 |
+
!38 = !DILocation(line: 44, column: 22, scope: !7)
|
303 |
+
!39 = !DILocation(line: 96, column: 20, scope: !40, inlinedAt: !42)
|
304 |
+
!40 = distinct !DILexicalBlockFile(scope: !7, file: !41, discriminator: 0)
|
305 |
+
!41 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
|
306 |
+
!42 = !DILocation(line: 47, column: 41, scope: !40)
|
307 |
+
!43 = !DILocation(line: 97, column: 26, scope: !40, inlinedAt: !42)
|
308 |
+
!44 = !DILocation(line: 98, column: 30, scope: !40, inlinedAt: !42)
|
309 |
+
!45 = !DILocation(line: 98, column: 22, scope: !40, inlinedAt: !42)
|
310 |
+
!46 = !DILocation(line: 101, column: 30, scope: !40, inlinedAt: !42)
|
311 |
+
!47 = !DILocation(line: 101, column: 22, scope: !40, inlinedAt: !42)
|
312 |
+
!48 = !DILocation(line: 50, column: 50, scope: !7)
|
313 |
+
!49 = !DILocation(line: 31, column: 36, scope: !7)
|
314 |
+
!50 = !DILocation(line: 120, column: 46, scope: !40, inlinedAt: !51)
|
315 |
+
!51 = !DILocation(line: 53, column: 44, scope: !40)
|
316 |
+
!52 = !DILocation(line: 108, column: 21, scope: !53, inlinedAt: !54)
|
317 |
+
!53 = distinct !DILexicalBlockFile(scope: !40, file: !41, discriminator: 0)
|
318 |
+
!54 = !DILocation(line: 120, column: 46, scope: !53, inlinedAt: !55)
|
319 |
+
!55 = !DILocation(line: 53, column: 44, scope: !53)
|
320 |
+
!56 = !DILocation(line: 109, column: 28, scope: !53, inlinedAt: !54)
|
321 |
+
!57 = !DILocation(line: 110, column: 39, scope: !53, inlinedAt: !54)
|
322 |
+
!58 = !DILocation(line: 110, column: 60, scope: !53, inlinedAt: !54)
|
323 |
+
!59 = !DILocation(line: 110, column: 49, scope: !53, inlinedAt: !54)
|
324 |
+
!60 = !DILocation(line: 112, column: 25, scope: !53, inlinedAt: !54)
|
325 |
+
!61 = !DILocation(line: 112, column: 17, scope: !53, inlinedAt: !54)
|
326 |
+
!62 = !DILocation(line: 113, column: 15, scope: !53, inlinedAt: !54)
|
327 |
+
!63 = !DILocation(line: 113, column: 30, scope: !53, inlinedAt: !54)
|
328 |
+
!64 = !DILocation(line: 113, column: 38, scope: !53, inlinedAt: !54)
|
329 |
+
!65 = !DILocation(line: 113, column: 49, scope: !53, inlinedAt: !54)
|
330 |
+
!66 = !DILocation(line: 113, column: 22, scope: !53, inlinedAt: !54)
|
331 |
+
!67 = !DILocation(line: 75, column: 24, scope: !7)
|
332 |
+
!68 = !DILocation(line: 77, column: 24, scope: !7)
|
333 |
+
!69 = !DILocation(line: 58, column: 36, scope: !7)
|
334 |
+
!70 = !DILocation(line: 59, column: 27, scope: !7)
|
335 |
+
!71 = !DILocation(line: 62, column: 41, scope: !7)
|
336 |
+
!72 = !DILocation(line: 62, column: 35, scope: !7)
|
337 |
+
!73 = !DILocation(line: 62, column: 51, scope: !7)
|
338 |
+
!74 = !DILocation(line: 63, column: 41, scope: !7)
|
339 |
+
!75 = !DILocation(line: 63, column: 35, scope: !7)
|
340 |
+
!76 = !DILocation(line: 63, column: 51, scope: !7)
|
341 |
+
!77 = !DILocation(line: 63, column: 103, scope: !7)
|
342 |
+
!78 = !DILocation(line: 64, column: 35, scope: !7)
|
343 |
+
!79 = !DILocation(line: 64, column: 40, scope: !7)
|
344 |
+
!80 = !DILocation(line: 68, column: 57, scope: !7)
|
345 |
+
!81 = !DILocation(line: 69, column: 35, scope: !7)
|
346 |
+
!82 = !DILocation(line: 69, column: 54, scope: !7)
|
347 |
+
!83 = !DILocation(line: 70, column: 24, scope: !7)
|
348 |
+
!84 = !DILocation(line: 72, column: 24, scope: !7)
|
349 |
+
!85 = !DILocation(line: 73, column: 24, scope: !7)
|
350 |
+
!86 = !DILocation(line: 78, column: 30, scope: !7)
|
351 |
+
!87 = !DILocation(line: 79, column: 24, scope: !7)
|
352 |
+
!88 = !DILocation(line: 80, column: 24, scope: !7)
|
353 |
+
!89 = !DILocation(line: 82, column: 29, scope: !7)
|
354 |
+
!90 = !DILocation(line: 82, column: 52, scope: !7)
|
355 |
+
!91 = !DILocation(line: 58, column: 4, scope: !7)
|
.triton/dump/4710f23a3addbad00b260d7a02366fe0/triton_.ttgir
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
2 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
3 |
+
tt.func public @triton__0d1d2d34e(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg3: i32, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} {
|
4 |
+
%cst = arith.constant dense<0> : tensor<1x8xi64, #blocked>
|
5 |
+
%cst_0 = arith.constant dense<0.000000e+00> : tensor<1x8xf32, #blocked>
|
6 |
+
%cst_1 = arith.constant dense<8> : tensor<1x8xi32, #blocked>
|
7 |
+
%c0_i32 = arith.constant 0 : i32
|
8 |
+
%0 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
|
9 |
+
%1 = tt.expand_dims %0 {axis = 0 : i32} : (tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x8xi32, #blocked>
|
10 |
+
%2 = arith.cmpi slt, %1, %cst_1 : tensor<1x8xi32, #blocked>
|
11 |
+
%3 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<1x8x!tt.ptr<f32, 1>, #blocked>
|
12 |
+
%4 = tt.addptr %3, %1 : tensor<1x8x!tt.ptr<f32, 1>, #blocked>, tensor<1x8xi32, #blocked>
|
13 |
+
%5 = tt.load %4, %2, %cst_0 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x8xf32, #blocked>
|
14 |
+
%6 = tt.splat %arg2 : (!tt.ptr<i64, 1>) -> tensor<1x8x!tt.ptr<i64, 1>, #blocked>
|
15 |
+
%7 = tt.addptr %6, %1 : tensor<1x8x!tt.ptr<i64, 1>, #blocked>, tensor<1x8xi32, #blocked>
|
16 |
+
%8 = tt.load %7, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x8xi64, #blocked>
|
17 |
+
%9 = arith.select %2, %5, %cst_0 : tensor<1x8xi1, #blocked>, tensor<1x8xf32, #blocked>
|
18 |
+
%10 = "tt.reduce"(%9) <{axis = 1 : i32}> ({
|
19 |
+
^bb0(%arg5: f32, %arg6: f32):
|
20 |
+
%19 = arith.addf %arg5, %arg6 : f32
|
21 |
+
tt.reduce.return %19 : f32
|
22 |
+
}) : (tensor<1x8xf32, #blocked>) -> tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
|
23 |
+
%11 = tt.expand_dims %10 {axis = 1 : i32} : (tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<1x1xf32, #blocked>
|
24 |
+
%12 = arith.select %2, %8, %cst : tensor<1x8xi1, #blocked>, tensor<1x8xi64, #blocked>
|
25 |
+
%13 = "tt.reduce"(%12) <{axis = 1 : i32}> ({
|
26 |
+
^bb0(%arg5: i64, %arg6: i64):
|
27 |
+
%19 = arith.addi %arg5, %arg6 : i64
|
28 |
+
tt.reduce.return %19 : i64
|
29 |
+
}) : (tensor<1x8xi64, #blocked>) -> tensor<1xi64, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
|
30 |
+
%14 = tt.expand_dims %13 {axis = 1 : i32} : (tensor<1xi64, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<1x1xi64, #blocked>
|
31 |
+
%15 = arith.sitofp %14 : tensor<1x1xi64, #blocked> to tensor<1x1xf32, #blocked>
|
32 |
+
%16 = arith.divf %11, %15 : tensor<1x1xf32, #blocked>
|
33 |
+
gpu.barrier
|
34 |
+
%17 = tt.addptr %arg0, %c0_i32 : !tt.ptr<f32, 1>, i32
|
35 |
+
%18 = tt.splat %17 : (!tt.ptr<f32, 1>) -> tensor<1x1x!tt.ptr<f32, 1>, #blocked>
|
36 |
+
tt.store %18, %16 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xf32, #blocked>
|
37 |
+
tt.return
|
38 |
+
}
|
39 |
+
}
|
.triton/dump/53075505618c3af0ef6ce61f3300cdcb/triton_.cubin
ADDED
Binary file (73.7 kB). View file
|
|
.triton/dump/53075505618c3af0ef6ce61f3300cdcb/triton_.ptx
ADDED
@@ -0,0 +1,2004 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2d3d4d5d6de7de
|
10 |
+
.extern .func __assertfail
|
11 |
+
(
|
12 |
+
.param .b64 __assertfail_param_0,
|
13 |
+
.param .b64 __assertfail_param_1,
|
14 |
+
.param .b32 __assertfail_param_2,
|
15 |
+
.param .b64 __assertfail_param_3,
|
16 |
+
.param .b64 __assertfail_param_4
|
17 |
+
)
|
18 |
+
;
|
19 |
+
.global .align 1 .b8 assertFunc_1[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
|
20 |
+
.global .align 1 .b8 assertFile_1[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
|
21 |
+
.global .align 1 .b8 assertMessage_1[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 54, 32, 60, 32, 53, 48, 50, 53, 55};
|
22 |
+
.global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
|
23 |
+
.global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
|
24 |
+
.global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55};
|
25 |
+
.extern .shared .align 1 .b8 global_smem[];
|
26 |
+
.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
|
27 |
+
|
28 |
+
.visible .entry triton__0d1d2d3d4d5d6de7de(
|
29 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_0,
|
30 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_1,
|
31 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_2,
|
32 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_3,
|
33 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_4,
|
34 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_5,
|
35 |
+
.param .u32 triton__0d1d2d3d4d5d6de7de_param_6,
|
36 |
+
.param .u32 triton__0d1d2d3d4d5d6de7de_param_7
|
37 |
+
)
|
38 |
+
.maxntid 256, 1, 1
|
39 |
+
{
|
40 |
+
.reg .pred %p<157>;
|
41 |
+
.reg .b16 %rs<49>;
|
42 |
+
.reg .b32 %r<474>;
|
43 |
+
.reg .f32 %f<678>;
|
44 |
+
.reg .b64 %rd<118>;
|
45 |
+
.loc 1 18 0
|
46 |
+
$L__func_begin0:
|
47 |
+
.loc 1 18 0
|
48 |
+
|
49 |
+
ld.param.u64 %rd16, [triton__0d1d2d3d4d5d6de7de_param_5];
|
50 |
+
ld.param.u64 %rd15, [triton__0d1d2d3d4d5d6de7de_param_4];
|
51 |
+
ld.param.u64 %rd14, [triton__0d1d2d3d4d5d6de7de_param_3];
|
52 |
+
ld.param.u64 %rd52, [triton__0d1d2d3d4d5d6de7de_param_0];
|
53 |
+
ld.param.u64 %rd53, [triton__0d1d2d3d4d5d6de7de_param_1];
|
54 |
+
$L__tmp0:
|
55 |
+
.loc 1 22 44
|
56 |
+
mov.u32 %r12, %tid.x;
|
57 |
+
ld.param.u64 %rd54, [triton__0d1d2d3d4d5d6de7de_param_2];
|
58 |
+
bfe.u32 %r1, %r12, 3, 5;
|
59 |
+
and.b32 %r2, %r12, 63;
|
60 |
+
.loc 1 24 33
|
61 |
+
shl.b32 %r13, %r12, 3;
|
62 |
+
and.b32 %r3, %r13, 56;
|
63 |
+
.loc 1 31 36
|
64 |
+
shr.u32 %r4, %r12, 6;
|
65 |
+
.loc 1 21 28
|
66 |
+
mov.u32 %r10, %ctaid.x;
|
67 |
+
.loc 1 21 33
|
68 |
+
shl.b32 %r14, %r10, 6;
|
69 |
+
.loc 1 22 23
|
70 |
+
or.b32 %r15, %r14, %r1;
|
71 |
+
or.b32 %r16, %r15, 32;
|
72 |
+
or.b32 %r17, %r14, %r2;
|
73 |
+
.loc 1 26 30
|
74 |
+
mul.wide.s32 %rd55, %r15, 8;
|
75 |
+
add.s64 %rd18, %rd52, %rd55;
|
76 |
+
add.s64 %rd34, %rd18, 256;
|
77 |
+
mul.wide.s32 %rd56, %r17, 8;
|
78 |
+
add.s64 %rd50, %rd52, %rd56;
|
79 |
+
mov.pred %p1, -1;
|
80 |
+
.loc 1 26 35
|
81 |
+
mov.u64 %rd17, 0x0;
|
82 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd17 }, [ %rd18 + 0 ];
|
83 |
+
mov.u64 %rd19, 0x0;
|
84 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd19 }, [ %rd18 + 0 ];
|
85 |
+
mov.u64 %rd21, 0x0;
|
86 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd21 }, [ %rd18 + 0 ];
|
87 |
+
mov.u64 %rd23, 0x0;
|
88 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd23 }, [ %rd18 + 0 ];
|
89 |
+
mov.u64 %rd25, 0x0;
|
90 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd25 }, [ %rd18 + 0 ];
|
91 |
+
mov.u64 %rd27, 0x0;
|
92 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd27 }, [ %rd18 + 0 ];
|
93 |
+
mov.u64 %rd29, 0x0;
|
94 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd29 }, [ %rd18 + 0 ];
|
95 |
+
mov.u64 %rd31, 0x0;
|
96 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd31 }, [ %rd18 + 0 ];
|
97 |
+
mov.u64 %rd33, 0x0;
|
98 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd33 }, [ %rd34 + 0 ];
|
99 |
+
mov.u64 %rd35, 0x0;
|
100 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd35 }, [ %rd34 + 0 ];
|
101 |
+
mov.u64 %rd37, 0x0;
|
102 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd37 }, [ %rd34 + 0 ];
|
103 |
+
mov.u64 %rd39, 0x0;
|
104 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd39 }, [ %rd34 + 0 ];
|
105 |
+
mov.u64 %rd41, 0x0;
|
106 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd41 }, [ %rd34 + 0 ];
|
107 |
+
mov.u64 %rd43, 0x0;
|
108 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd43 }, [ %rd34 + 0 ];
|
109 |
+
mov.u64 %rd45, 0x0;
|
110 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd45 }, [ %rd34 + 0 ];
|
111 |
+
mov.u64 %rd47, 0x0;
|
112 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd47 }, [ %rd34 + 0 ];
|
113 |
+
mov.u64 %rd49, 0x0;
|
114 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd49 }, [ %rd50 + 0 ];
|
115 |
+
.loc 1 27 18
|
116 |
+
bfe.s32 %r18, %r10, 25, 1;
|
117 |
+
shr.u32 %r19, %r18, 23;
|
118 |
+
add.s32 %r20, %r15, %r19;
|
119 |
+
and.b32 %r21, %r20, 16776704;
|
120 |
+
sub.s32 %r22, %r15, %r21;
|
121 |
+
add.s32 %r23, %r16, %r19;
|
122 |
+
and.b32 %r24, %r23, 16776704;
|
123 |
+
sub.s32 %r25, %r16, %r24;
|
124 |
+
.loc 1 35 44
|
125 |
+
shl.b32 %r26, %r22, 8;
|
126 |
+
shl.b32 %r27, %r25, 8;
|
127 |
+
.loc 1 37 22
|
128 |
+
add.s64 %rd57, %rd49, 50257;
|
129 |
+
.loc 1 38 22
|
130 |
+
setp.lt.s64 %p18, %rd17, 0;
|
131 |
+
setp.lt.s64 %p19, %rd33, 0;
|
132 |
+
setp.lt.s64 %p20, %rd49, 0;
|
133 |
+
.loc 1 39 36
|
134 |
+
selp.b64 %rd1, %rd57, %rd49, %p20;
|
135 |
+
.loc 1 41 44
|
136 |
+
shl.b64 %rd58, %rd17, 8;
|
137 |
+
add.s64 %rd59, %rd58, 12865792;
|
138 |
+
selp.b64 %rd60, %rd59, %rd58, %p18;
|
139 |
+
shl.b64 %rd61, %rd33, 8;
|
140 |
+
add.s64 %rd62, %rd61, 12865792;
|
141 |
+
selp.b64 %rd63, %rd62, %rd61, %p19;
|
142 |
+
.loc 1 31 36
|
143 |
+
and.b32 %r28, %r12, 7;
|
144 |
+
mul.wide.u32 %rd2, %r28, 32;
|
145 |
+
shl.b64 %rd64, %rd63, 2;
|
146 |
+
or.b64 %rd65, %rd2, %rd64;
|
147 |
+
add.s64 %rd3, %rd53, %rd65;
|
148 |
+
shl.b64 %rd66, %rd60, 2;
|
149 |
+
or.b64 %rd67, %rd2, %rd66;
|
150 |
+
add.s64 %rd4, %rd53, %rd67;
|
151 |
+
or.b32 %r29, %r27, %r3;
|
152 |
+
mul.wide.s32 %rd68, %r29, 4;
|
153 |
+
add.s64 %rd5, %rd54, %rd68;
|
154 |
+
or.b32 %r30, %r26, %r3;
|
155 |
+
mul.wide.s32 %rd69, %r30, 4;
|
156 |
+
add.s64 %rd6, %rd54, %rd69;
|
157 |
+
shl.b32 %r31, %r10, 14;
|
158 |
+
shl.b32 %r32, %r1, 8;
|
159 |
+
or.b32 %r33, %r31, %r32;
|
160 |
+
or.b32 %r5, %r33, %r3;
|
161 |
+
mov.f32 %f614, 0f00000000;
|
162 |
+
mov.u64 %rd116, 0;
|
163 |
+
mov.b32 %r472, -64;
|
164 |
+
mov.f32 %f615, %f614;
|
165 |
+
mov.f32 %f616, %f614;
|
166 |
+
mov.f32 %f617, %f614;
|
167 |
+
mov.f32 %f618, %f614;
|
168 |
+
mov.f32 %f619, %f614;
|
169 |
+
mov.f32 %f620, %f614;
|
170 |
+
mov.f32 %f621, %f614;
|
171 |
+
mov.f32 %f622, %f614;
|
172 |
+
mov.f32 %f623, %f614;
|
173 |
+
mov.f32 %f624, %f614;
|
174 |
+
mov.f32 %f625, %f614;
|
175 |
+
mov.f32 %f626, %f614;
|
176 |
+
mov.f32 %f627, %f614;
|
177 |
+
mov.f32 %f628, %f614;
|
178 |
+
mov.f32 %f629, %f614;
|
179 |
+
mov.f32 %f630, %f614;
|
180 |
+
mov.f32 %f631, %f614;
|
181 |
+
mov.f32 %f632, %f614;
|
182 |
+
mov.f32 %f633, %f614;
|
183 |
+
mov.f32 %f634, %f614;
|
184 |
+
mov.f32 %f635, %f614;
|
185 |
+
mov.f32 %f636, %f614;
|
186 |
+
mov.f32 %f637, %f614;
|
187 |
+
mov.f32 %f638, %f614;
|
188 |
+
mov.f32 %f639, %f614;
|
189 |
+
mov.f32 %f640, %f614;
|
190 |
+
mov.f32 %f641, %f614;
|
191 |
+
mov.f32 %f642, %f614;
|
192 |
+
mov.f32 %f643, %f614;
|
193 |
+
mov.f32 %f644, %f614;
|
194 |
+
mov.f32 %f645, %f614;
|
195 |
+
mov.f32 %f646, %f614;
|
196 |
+
mov.f32 %f647, %f614;
|
197 |
+
mov.f32 %f648, %f614;
|
198 |
+
mov.f32 %f649, %f614;
|
199 |
+
mov.f32 %f650, %f614;
|
200 |
+
mov.f32 %f651, %f614;
|
201 |
+
mov.f32 %f652, %f614;
|
202 |
+
mov.f32 %f653, %f614;
|
203 |
+
mov.f32 %f654, %f614;
|
204 |
+
mov.f32 %f655, %f614;
|
205 |
+
mov.f32 %f656, %f614;
|
206 |
+
mov.f32 %f657, %f614;
|
207 |
+
mov.f32 %f658, %f614;
|
208 |
+
mov.f32 %f659, %f614;
|
209 |
+
mov.f32 %f660, %f614;
|
210 |
+
mov.f32 %f661, %f614;
|
211 |
+
mov.f32 %f662, %f614;
|
212 |
+
mov.f32 %f663, %f614;
|
213 |
+
mov.f32 %f664, %f614;
|
214 |
+
mov.f32 %f665, %f614;
|
215 |
+
mov.f32 %f666, %f614;
|
216 |
+
mov.f32 %f667, %f614;
|
217 |
+
mov.f32 %f668, %f614;
|
218 |
+
mov.f32 %f669, %f614;
|
219 |
+
mov.f32 %f670, %f614;
|
220 |
+
mov.f32 %f671, %f614;
|
221 |
+
mov.f32 %f672, %f614;
|
222 |
+
mov.f32 %f673, %f614;
|
223 |
+
mov.f32 %f674, %f614;
|
224 |
+
mov.f32 %f675, %f614;
|
225 |
+
mov.f32 %f676, %f614;
|
226 |
+
mov.f32 %f677, %f614;
|
227 |
+
bra.uni $L__BB0_1;
|
228 |
+
$L__BB0_3:
|
229 |
+
.loc 1 41 40
|
230 |
+
add.s64 %rd85, %rd4, %rd116;
|
231 |
+
.loc 1 41 34
|
232 |
+
add.s64 %rd86, %rd85, 16;
|
233 |
+
add.s64 %rd87, %rd3, %rd116;
|
234 |
+
.loc 1 41 52
|
235 |
+
add.s64 %rd88, %rd87, 16;
|
236 |
+
mov.u32 %r102, 0x0;
|
237 |
+
mov.u32 %r103, 0x0;
|
238 |
+
mov.u32 %r104, 0x0;
|
239 |
+
mov.u32 %r105, 0x0;
|
240 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r102, %r103, %r104, %r105 }, [ %rd85 + 0 ];
|
241 |
+
@!%p1 mov.u32 %r102, %r411;
|
242 |
+
@!%p1 mov.u32 %r103, %r411;
|
243 |
+
@!%p1 mov.u32 %r104, %r411;
|
244 |
+
@!%p1 mov.u32 %r105, %r411;
|
245 |
+
mov.b32 %f206, %r102;
|
246 |
+
mov.b32 %f207, %r103;
|
247 |
+
mov.b32 %f208, %r104;
|
248 |
+
mov.b32 %f209, %r105;
|
249 |
+
mov.u32 %r110, 0x0;
|
250 |
+
mov.u32 %r111, 0x0;
|
251 |
+
mov.u32 %r112, 0x0;
|
252 |
+
mov.u32 %r113, 0x0;
|
253 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r110, %r111, %r112, %r113 }, [ %rd86 + 0 ];
|
254 |
+
@!%p1 mov.u32 %r110, %r411;
|
255 |
+
@!%p1 mov.u32 %r111, %r411;
|
256 |
+
@!%p1 mov.u32 %r112, %r411;
|
257 |
+
@!%p1 mov.u32 %r113, %r411;
|
258 |
+
mov.b32 %f210, %r110;
|
259 |
+
mov.b32 %f211, %r111;
|
260 |
+
mov.b32 %f212, %r112;
|
261 |
+
mov.b32 %f213, %r113;
|
262 |
+
mov.u32 %r118, 0x0;
|
263 |
+
mov.u32 %r119, 0x0;
|
264 |
+
mov.u32 %r120, 0x0;
|
265 |
+
mov.u32 %r121, 0x0;
|
266 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r118, %r119, %r120, %r121 }, [ %rd87 + 0 ];
|
267 |
+
@!%p1 mov.u32 %r118, %r411;
|
268 |
+
@!%p1 mov.u32 %r119, %r411;
|
269 |
+
@!%p1 mov.u32 %r120, %r411;
|
270 |
+
@!%p1 mov.u32 %r121, %r411;
|
271 |
+
mov.b32 %f214, %r118;
|
272 |
+
mov.b32 %f215, %r119;
|
273 |
+
mov.b32 %f216, %r120;
|
274 |
+
mov.b32 %f217, %r121;
|
275 |
+
mov.u32 %r126, 0x0;
|
276 |
+
mov.u32 %r127, 0x0;
|
277 |
+
mov.u32 %r128, 0x0;
|
278 |
+
mov.u32 %r129, 0x0;
|
279 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r126, %r127, %r128, %r129 }, [ %rd88 + 0 ];
|
280 |
+
@!%p1 mov.u32 %r126, %r411;
|
281 |
+
@!%p1 mov.u32 %r127, %r411;
|
282 |
+
@!%p1 mov.u32 %r128, %r411;
|
283 |
+
@!%p1 mov.u32 %r129, %r411;
|
284 |
+
mov.b32 %f218, %r126;
|
285 |
+
mov.b32 %f219, %r127;
|
286 |
+
mov.b32 %f220, %r128;
|
287 |
+
mov.b32 %f221, %r129;
|
288 |
+
.loc 1 42 22
|
289 |
+
add.f32 %f222, %f65, %f206;
|
290 |
+
add.f32 %f223, %f66, %f207;
|
291 |
+
add.f32 %f224, %f67, %f208;
|
292 |
+
add.f32 %f225, %f68, %f209;
|
293 |
+
add.f32 %f226, %f69, %f210;
|
294 |
+
add.f32 %f227, %f70, %f211;
|
295 |
+
add.f32 %f228, %f71, %f212;
|
296 |
+
add.f32 %f229, %f72, %f213;
|
297 |
+
add.f32 %f230, %f73, %f214;
|
298 |
+
add.f32 %f231, %f74, %f215;
|
299 |
+
add.f32 %f232, %f75, %f216;
|
300 |
+
add.f32 %f233, %f76, %f217;
|
301 |
+
add.f32 %f234, %f77, %f218;
|
302 |
+
add.f32 %f235, %f78, %f219;
|
303 |
+
add.f32 %f236, %f79, %f220;
|
304 |
+
add.f32 %f237, %f80, %f221;
|
305 |
+
.loc 1 44 22
|
306 |
+
add.f32 %f238, %f81, %f222;
|
307 |
+
add.f32 %f239, %f82, %f223;
|
308 |
+
add.f32 %f240, %f83, %f224;
|
309 |
+
add.f32 %f241, %f84, %f225;
|
310 |
+
add.f32 %f242, %f85, %f226;
|
311 |
+
add.f32 %f243, %f86, %f227;
|
312 |
+
add.f32 %f244, %f87, %f228;
|
313 |
+
add.f32 %f245, %f88, %f229;
|
314 |
+
add.f32 %f246, %f89, %f230;
|
315 |
+
add.f32 %f247, %f90, %f231;
|
316 |
+
add.f32 %f248, %f91, %f232;
|
317 |
+
add.f32 %f249, %f92, %f233;
|
318 |
+
add.f32 %f250, %f93, %f234;
|
319 |
+
add.f32 %f251, %f94, %f235;
|
320 |
+
add.f32 %f252, %f95, %f236;
|
321 |
+
add.f32 %f253, %f96, %f237;
|
322 |
+
$L__tmp1:
|
323 |
+
.loc 2 96 20
|
324 |
+
sub.f32 %f254, %f238, %f662;
|
325 |
+
sub.f32 %f255, %f239, %f663;
|
326 |
+
sub.f32 %f256, %f240, %f664;
|
327 |
+
sub.f32 %f257, %f241, %f665;
|
328 |
+
sub.f32 %f258, %f242, %f666;
|
329 |
+
sub.f32 %f259, %f243, %f667;
|
330 |
+
sub.f32 %f260, %f244, %f668;
|
331 |
+
sub.f32 %f261, %f245, %f669;
|
332 |
+
sub.f32 %f262, %f246, %f670;
|
333 |
+
sub.f32 %f263, %f247, %f671;
|
334 |
+
sub.f32 %f264, %f248, %f672;
|
335 |
+
sub.f32 %f265, %f249, %f673;
|
336 |
+
sub.f32 %f266, %f250, %f674;
|
337 |
+
sub.f32 %f267, %f251, %f675;
|
338 |
+
sub.f32 %f268, %f252, %f676;
|
339 |
+
sub.f32 %f269, %f253, %f677;
|
340 |
+
.loc 2 97 26
|
341 |
+
add.f32 %f614, %f614, 0f3F800000;
|
342 |
+
add.f32 %f615, %f615, 0f3F800000;
|
343 |
+
add.f32 %f616, %f616, 0f3F800000;
|
344 |
+
add.f32 %f617, %f617, 0f3F800000;
|
345 |
+
add.f32 %f618, %f618, 0f3F800000;
|
346 |
+
add.f32 %f619, %f619, 0f3F800000;
|
347 |
+
add.f32 %f620, %f620, 0f3F800000;
|
348 |
+
add.f32 %f621, %f621, 0f3F800000;
|
349 |
+
add.f32 %f622, %f622, 0f3F800000;
|
350 |
+
add.f32 %f623, %f623, 0f3F800000;
|
351 |
+
add.f32 %f624, %f624, 0f3F800000;
|
352 |
+
add.f32 %f625, %f625, 0f3F800000;
|
353 |
+
add.f32 %f626, %f626, 0f3F800000;
|
354 |
+
add.f32 %f627, %f627, 0f3F800000;
|
355 |
+
add.f32 %f628, %f628, 0f3F800000;
|
356 |
+
add.f32 %f629, %f629, 0f3F800000;
|
357 |
+
add.f32 %f630, %f630, 0f3F800000;
|
358 |
+
add.f32 %f631, %f631, 0f3F800000;
|
359 |
+
add.f32 %f632, %f632, 0f3F800000;
|
360 |
+
add.f32 %f633, %f633, 0f3F800000;
|
361 |
+
add.f32 %f634, %f634, 0f3F800000;
|
362 |
+
add.f32 %f635, %f635, 0f3F800000;
|
363 |
+
add.f32 %f636, %f636, 0f3F800000;
|
364 |
+
add.f32 %f637, %f637, 0f3F800000;
|
365 |
+
add.f32 %f638, %f638, 0f3F800000;
|
366 |
+
add.f32 %f639, %f639, 0f3F800000;
|
367 |
+
add.f32 %f640, %f640, 0f3F800000;
|
368 |
+
add.f32 %f641, %f641, 0f3F800000;
|
369 |
+
add.f32 %f642, %f642, 0f3F800000;
|
370 |
+
add.f32 %f643, %f643, 0f3F800000;
|
371 |
+
add.f32 %f644, %f644, 0f3F800000;
|
372 |
+
add.f32 %f645, %f645, 0f3F800000;
|
373 |
+
.loc 2 98 30
|
374 |
+
mov.b32 %r135, %f254;
|
375 |
+
mov.b32 %r136, %f614;
|
376 |
+
div.full.f32 %r134, %r135, %r136;
|
377 |
+
mov.b32 %f270, %r134;
|
378 |
+
mov.b32 %r138, %f255;
|
379 |
+
mov.b32 %r139, %f615;
|
380 |
+
div.full.f32 %r137, %r138, %r139;
|
381 |
+
mov.b32 %f271, %r137;
|
382 |
+
mov.b32 %r141, %f256;
|
383 |
+
mov.b32 %r142, %f616;
|
384 |
+
div.full.f32 %r140, %r141, %r142;
|
385 |
+
mov.b32 %f272, %r140;
|
386 |
+
mov.b32 %r144, %f257;
|
387 |
+
mov.b32 %r145, %f617;
|
388 |
+
div.full.f32 %r143, %r144, %r145;
|
389 |
+
mov.b32 %f273, %r143;
|
390 |
+
mov.b32 %r147, %f258;
|
391 |
+
mov.b32 %r148, %f618;
|
392 |
+
div.full.f32 %r146, %r147, %r148;
|
393 |
+
mov.b32 %f274, %r146;
|
394 |
+
mov.b32 %r150, %f259;
|
395 |
+
mov.b32 %r151, %f619;
|
396 |
+
div.full.f32 %r149, %r150, %r151;
|
397 |
+
mov.b32 %f275, %r149;
|
398 |
+
mov.b32 %r153, %f260;
|
399 |
+
mov.b32 %r154, %f620;
|
400 |
+
div.full.f32 %r152, %r153, %r154;
|
401 |
+
mov.b32 %f276, %r152;
|
402 |
+
mov.b32 %r156, %f261;
|
403 |
+
mov.b32 %r157, %f621;
|
404 |
+
div.full.f32 %r155, %r156, %r157;
|
405 |
+
mov.b32 %f277, %r155;
|
406 |
+
mov.b32 %r159, %f262;
|
407 |
+
mov.b32 %r160, %f622;
|
408 |
+
div.full.f32 %r158, %r159, %r160;
|
409 |
+
mov.b32 %f278, %r158;
|
410 |
+
mov.b32 %r162, %f263;
|
411 |
+
mov.b32 %r163, %f623;
|
412 |
+
div.full.f32 %r161, %r162, %r163;
|
413 |
+
mov.b32 %f279, %r161;
|
414 |
+
mov.b32 %r165, %f264;
|
415 |
+
mov.b32 %r166, %f624;
|
416 |
+
div.full.f32 %r164, %r165, %r166;
|
417 |
+
mov.b32 %f280, %r164;
|
418 |
+
mov.b32 %r168, %f265;
|
419 |
+
mov.b32 %r169, %f625;
|
420 |
+
div.full.f32 %r167, %r168, %r169;
|
421 |
+
mov.b32 %f281, %r167;
|
422 |
+
mov.b32 %r171, %f266;
|
423 |
+
mov.b32 %r172, %f626;
|
424 |
+
div.full.f32 %r170, %r171, %r172;
|
425 |
+
mov.b32 %f282, %r170;
|
426 |
+
mov.b32 %r174, %f267;
|
427 |
+
mov.b32 %r175, %f627;
|
428 |
+
div.full.f32 %r173, %r174, %r175;
|
429 |
+
mov.b32 %f283, %r173;
|
430 |
+
mov.b32 %r177, %f268;
|
431 |
+
mov.b32 %r178, %f628;
|
432 |
+
div.full.f32 %r176, %r177, %r178;
|
433 |
+
mov.b32 %f284, %r176;
|
434 |
+
mov.b32 %r180, %f269;
|
435 |
+
mov.b32 %r181, %f629;
|
436 |
+
div.full.f32 %r179, %r180, %r181;
|
437 |
+
mov.b32 %f285, %r179;
|
438 |
+
.loc 2 98 22
|
439 |
+
add.f32 %f662, %f662, %f270;
|
440 |
+
add.f32 %f663, %f663, %f271;
|
441 |
+
add.f32 %f664, %f664, %f272;
|
442 |
+
add.f32 %f665, %f665, %f273;
|
443 |
+
add.f32 %f666, %f666, %f274;
|
444 |
+
add.f32 %f667, %f667, %f275;
|
445 |
+
add.f32 %f668, %f668, %f276;
|
446 |
+
add.f32 %f669, %f669, %f277;
|
447 |
+
add.f32 %f670, %f670, %f278;
|
448 |
+
add.f32 %f671, %f671, %f279;
|
449 |
+
add.f32 %f672, %f672, %f280;
|
450 |
+
add.f32 %f673, %f673, %f281;
|
451 |
+
add.f32 %f674, %f674, %f282;
|
452 |
+
add.f32 %f675, %f675, %f283;
|
453 |
+
add.f32 %f676, %f676, %f284;
|
454 |
+
add.f32 %f677, %f677, %f285;
|
455 |
+
.loc 2 101 30
|
456 |
+
sub.f32 %f286, %f238, %f662;
|
457 |
+
sub.f32 %f287, %f239, %f663;
|
458 |
+
sub.f32 %f288, %f240, %f664;
|
459 |
+
sub.f32 %f289, %f241, %f665;
|
460 |
+
sub.f32 %f290, %f242, %f666;
|
461 |
+
sub.f32 %f291, %f243, %f667;
|
462 |
+
sub.f32 %f292, %f244, %f668;
|
463 |
+
sub.f32 %f293, %f245, %f669;
|
464 |
+
sub.f32 %f294, %f246, %f670;
|
465 |
+
sub.f32 %f295, %f247, %f671;
|
466 |
+
sub.f32 %f296, %f248, %f672;
|
467 |
+
sub.f32 %f297, %f249, %f673;
|
468 |
+
sub.f32 %f298, %f250, %f674;
|
469 |
+
sub.f32 %f299, %f251, %f675;
|
470 |
+
sub.f32 %f300, %f252, %f676;
|
471 |
+
sub.f32 %f301, %f253, %f677;
|
472 |
+
$L__tmp2:
|
473 |
+
.loc 1 50 50
|
474 |
+
fma.rn.f32 %f646, %f254, %f286, %f646;
|
475 |
+
fma.rn.f32 %f647, %f255, %f287, %f647;
|
476 |
+
fma.rn.f32 %f648, %f256, %f288, %f648;
|
477 |
+
fma.rn.f32 %f649, %f257, %f289, %f649;
|
478 |
+
fma.rn.f32 %f650, %f258, %f290, %f650;
|
479 |
+
fma.rn.f32 %f651, %f259, %f291, %f651;
|
480 |
+
fma.rn.f32 %f652, %f260, %f292, %f652;
|
481 |
+
fma.rn.f32 %f653, %f261, %f293, %f653;
|
482 |
+
fma.rn.f32 %f654, %f262, %f294, %f654;
|
483 |
+
fma.rn.f32 %f655, %f263, %f295, %f655;
|
484 |
+
fma.rn.f32 %f656, %f264, %f296, %f656;
|
485 |
+
fma.rn.f32 %f657, %f265, %f297, %f657;
|
486 |
+
fma.rn.f32 %f658, %f266, %f298, %f658;
|
487 |
+
fma.rn.f32 %f659, %f267, %f299, %f659;
|
488 |
+
fma.rn.f32 %f660, %f268, %f300, %f660;
|
489 |
+
fma.rn.f32 %f661, %f269, %f301, %f661;
|
490 |
+
.loc 1 31 36
|
491 |
+
add.s64 %rd116, %rd116, 256;
|
492 |
+
add.s32 %r472, %r472, 64;
|
493 |
+
setp.lt.u32 %p72, %r472, 192;
|
494 |
+
@%p72 bra $L__BB0_1;
|
495 |
+
bra.uni $L__BB0_4;
|
496 |
+
$L__BB0_1:
|
497 |
+
.loc 1 40 40
|
498 |
+
setp.lt.u64 %p51, %rd1, 50257;
|
499 |
+
.loc 1 35 34
|
500 |
+
add.s64 %rd70, %rd6, %rd116;
|
501 |
+
add.s64 %rd71, %rd70, 16;
|
502 |
+
add.s64 %rd72, %rd5, %rd116;
|
503 |
+
.loc 1 35 50
|
504 |
+
add.s64 %rd73, %rd72, 16;
|
505 |
+
mov.b32 %r411, 0;
|
506 |
+
mov.u32 %r34, 0x0;
|
507 |
+
mov.u32 %r35, 0x0;
|
508 |
+
mov.u32 %r36, 0x0;
|
509 |
+
mov.u32 %r37, 0x0;
|
510 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r34, %r35, %r36, %r37 }, [ %rd70 + 0 ];
|
511 |
+
@!%p1 mov.u32 %r34, %r411;
|
512 |
+
@!%p1 mov.u32 %r35, %r411;
|
513 |
+
@!%p1 mov.u32 %r36, %r411;
|
514 |
+
@!%p1 mov.u32 %r37, %r411;
|
515 |
+
mov.b32 %f65, %r34;
|
516 |
+
mov.b32 %f66, %r35;
|
517 |
+
mov.b32 %f67, %r36;
|
518 |
+
mov.b32 %f68, %r37;
|
519 |
+
mov.u32 %r42, 0x0;
|
520 |
+
mov.u32 %r43, 0x0;
|
521 |
+
mov.u32 %r44, 0x0;
|
522 |
+
mov.u32 %r45, 0x0;
|
523 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r42, %r43, %r44, %r45 }, [ %rd71 + 0 ];
|
524 |
+
@!%p1 mov.u32 %r42, %r411;
|
525 |
+
@!%p1 mov.u32 %r43, %r411;
|
526 |
+
@!%p1 mov.u32 %r44, %r411;
|
527 |
+
@!%p1 mov.u32 %r45, %r411;
|
528 |
+
mov.b32 %f69, %r42;
|
529 |
+
mov.b32 %f70, %r43;
|
530 |
+
mov.b32 %f71, %r44;
|
531 |
+
mov.b32 %f72, %r45;
|
532 |
+
mov.u32 %r50, 0x0;
|
533 |
+
mov.u32 %r51, 0x0;
|
534 |
+
mov.u32 %r52, 0x0;
|
535 |
+
mov.u32 %r53, 0x0;
|
536 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r50, %r51, %r52, %r53 }, [ %rd72 + 0 ];
|
537 |
+
@!%p1 mov.u32 %r50, %r411;
|
538 |
+
@!%p1 mov.u32 %r51, %r411;
|
539 |
+
@!%p1 mov.u32 %r52, %r411;
|
540 |
+
@!%p1 mov.u32 %r53, %r411;
|
541 |
+
mov.b32 %f73, %r50;
|
542 |
+
mov.b32 %f74, %r51;
|
543 |
+
mov.b32 %f75, %r52;
|
544 |
+
mov.b32 %f76, %r53;
|
545 |
+
mov.u32 %r58, 0x0;
|
546 |
+
mov.u32 %r59, 0x0;
|
547 |
+
mov.u32 %r60, 0x0;
|
548 |
+
mov.u32 %r61, 0x0;
|
549 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r58, %r59, %r60, %r61 }, [ %rd73 + 0 ];
|
550 |
+
@!%p1 mov.u32 %r58, %r411;
|
551 |
+
@!%p1 mov.u32 %r59, %r411;
|
552 |
+
@!%p1 mov.u32 %r60, %r411;
|
553 |
+
@!%p1 mov.u32 %r61, %r411;
|
554 |
+
mov.b32 %f77, %r58;
|
555 |
+
mov.b32 %f78, %r59;
|
556 |
+
mov.b32 %f79, %r60;
|
557 |
+
mov.b32 %f80, %r61;
|
558 |
+
.loc 1 36 40
|
559 |
+
add.s32 %r98, %r5, %r472;
|
560 |
+
add.s32 %r99, %r98, 64;
|
561 |
+
.loc 1 36 34
|
562 |
+
add.s32 %r100, %r98, 8256;
|
563 |
+
mul.wide.s32 %rd76, %r99, 2;
|
564 |
+
add.s64 %rd74, %rd14, %rd76;
|
565 |
+
mul.wide.s32 %rd77, %r100, 2;
|
566 |
+
add.s64 %rd75, %rd14, %rd77;
|
567 |
+
.loc 1 36 50
|
568 |
+
mov.u32 %r66, 0x0;
|
569 |
+
mov.u32 %r67, 0x0;
|
570 |
+
mov.u32 %r68, 0x0;
|
571 |
+
mov.u32 %r69, 0x0;
|
572 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r66, %r67, %r68, %r69 }, [ %rd74 + 0 ];
|
573 |
+
@!%p1 mov.u32 %r66, %r411;
|
574 |
+
@!%p1 mov.u32 %r67, %r411;
|
575 |
+
@!%p1 mov.u32 %r68, %r411;
|
576 |
+
@!%p1 mov.u32 %r69, %r411;
|
577 |
+
cvt.u16.u32 %rs1, %r66;
|
578 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r66; }
|
579 |
+
cvt.u16.u32 %rs3, %r67;
|
580 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r67; }
|
581 |
+
cvt.u16.u32 %rs5, %r68;
|
582 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r68; }
|
583 |
+
cvt.u16.u32 %rs7, %r69;
|
584 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r69; }
|
585 |
+
mov.u32 %r74, 0x0;
|
586 |
+
mov.u32 %r75, 0x0;
|
587 |
+
mov.u32 %r76, 0x0;
|
588 |
+
mov.u32 %r77, 0x0;
|
589 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r74, %r75, %r76, %r77 }, [ %rd75 + 0 ];
|
590 |
+
@!%p1 mov.u32 %r74, %r411;
|
591 |
+
@!%p1 mov.u32 %r75, %r411;
|
592 |
+
@!%p1 mov.u32 %r76, %r411;
|
593 |
+
@!%p1 mov.u32 %r77, %r411;
|
594 |
+
cvt.u16.u32 %rs9, %r74;
|
595 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs10}, %r74; }
|
596 |
+
cvt.u16.u32 %rs11, %r75;
|
597 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs12}, %r75; }
|
598 |
+
cvt.u16.u32 %rs13, %r76;
|
599 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs14}, %r76; }
|
600 |
+
cvt.u16.u32 %rs15, %r77;
|
601 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs16}, %r77; }
|
602 |
+
.loc 1 36 101
|
603 |
+
cvt.f32.bf16 %r82, %rs1;
|
604 |
+
mov.b32 %f81, %r82;
|
605 |
+
cvt.f32.bf16 %r83, %rs2;
|
606 |
+
mov.b32 %f82, %r83;
|
607 |
+
cvt.f32.bf16 %r84, %rs3;
|
608 |
+
mov.b32 %f83, %r84;
|
609 |
+
cvt.f32.bf16 %r85, %rs4;
|
610 |
+
mov.b32 %f84, %r85;
|
611 |
+
cvt.f32.bf16 %r86, %rs5;
|
612 |
+
mov.b32 %f85, %r86;
|
613 |
+
cvt.f32.bf16 %r87, %rs6;
|
614 |
+
mov.b32 %f86, %r87;
|
615 |
+
cvt.f32.bf16 %r88, %rs7;
|
616 |
+
mov.b32 %f87, %r88;
|
617 |
+
cvt.f32.bf16 %r89, %rs8;
|
618 |
+
mov.b32 %f88, %r89;
|
619 |
+
cvt.f32.bf16 %r90, %rs9;
|
620 |
+
mov.b32 %f89, %r90;
|
621 |
+
cvt.f32.bf16 %r91, %rs10;
|
622 |
+
mov.b32 %f90, %r91;
|
623 |
+
cvt.f32.bf16 %r92, %rs11;
|
624 |
+
mov.b32 %f91, %r92;
|
625 |
+
cvt.f32.bf16 %r93, %rs12;
|
626 |
+
mov.b32 %f92, %r93;
|
627 |
+
cvt.f32.bf16 %r94, %rs13;
|
628 |
+
mov.b32 %f93, %r94;
|
629 |
+
cvt.f32.bf16 %r95, %rs14;
|
630 |
+
mov.b32 %f94, %r95;
|
631 |
+
cvt.f32.bf16 %r96, %rs15;
|
632 |
+
mov.b32 %f95, %r96;
|
633 |
+
cvt.f32.bf16 %r97, %rs16;
|
634 |
+
mov.b32 %f96, %r97;
|
635 |
+
mov.b32 %r471, 883;
|
636 |
+
mov.u64 %rd115, 1;
|
637 |
+
.loc 1 40 55
|
638 |
+
@%p51 bra $L__BB0_3;
|
639 |
+
mov.u64 %rd78, assertMessage_0;
|
640 |
+
cvta.global.u64 %rd79, %rd78;
|
641 |
+
mov.u64 %rd80, assertFile_0;
|
642 |
+
cvta.global.u64 %rd81, %rd80;
|
643 |
+
mov.u64 %rd82, assertFunc_0;
|
644 |
+
cvta.global.u64 %rd83, %rd82;
|
645 |
+
{ // callseq 6, 0
|
646 |
+
.reg .b32 temp_param_reg;
|
647 |
+
.param .b64 param0;
|
648 |
+
st.param.b64 [param0+0], %rd79;
|
649 |
+
.param .b64 param1;
|
650 |
+
st.param.b64 [param1+0], %rd81;
|
651 |
+
.param .b32 param2;
|
652 |
+
st.param.b32 [param2+0], %r471;
|
653 |
+
.param .b64 param3;
|
654 |
+
st.param.b64 [param3+0], %rd83;
|
655 |
+
.param .b64 param4;
|
656 |
+
st.param.b64 [param4+0], %rd115;
|
657 |
+
call.uni
|
658 |
+
__assertfail,
|
659 |
+
(
|
660 |
+
param0,
|
661 |
+
param1,
|
662 |
+
param2,
|
663 |
+
param3,
|
664 |
+
param4
|
665 |
+
);
|
666 |
+
} // callseq 6
|
667 |
+
bra.uni $L__BB0_3;
|
668 |
+
$L__BB0_4:
|
669 |
+
.loc 1 31 36
|
670 |
+
and.b32 %r291, %r4, 3;
|
671 |
+
mad.lo.s32 %r292, %r291, 72, %r2;
|
672 |
+
shl.b32 %r293, %r292, 2;
|
673 |
+
mov.u32 %r294, global_smem;
|
674 |
+
add.s32 %r295, %r294, %r293;
|
675 |
+
st.shared.f32 [%r295], %f630;
|
676 |
+
st.shared.f32 [%r295+1152], %f631;
|
677 |
+
st.shared.f32 [%r295+2304], %f632;
|
678 |
+
st.shared.f32 [%r295+3456], %f633;
|
679 |
+
st.shared.f32 [%r295+4608], %f634;
|
680 |
+
st.shared.f32 [%r295+5760], %f635;
|
681 |
+
st.shared.f32 [%r295+6912], %f636;
|
682 |
+
st.shared.f32 [%r295+8064], %f637;
|
683 |
+
bar.sync 0;
|
684 |
+
mad.lo.s32 %r296, %r1, 72, %r3;
|
685 |
+
shl.b32 %r297, %r296, 2;
|
686 |
+
add.s32 %r298, %r294, %r297;
|
687 |
+
ld.shared.v4.f32 {%f302, %f303, %f304, %f305}, [%r298];
|
688 |
+
ld.shared.v4.f32 {%f306, %f307, %f308, %f309}, [%r298+16];
|
689 |
+
bar.sync 0;
|
690 |
+
st.shared.f32 [%r295], %f638;
|
691 |
+
st.shared.f32 [%r295+1152], %f639;
|
692 |
+
st.shared.f32 [%r295+2304], %f640;
|
693 |
+
st.shared.f32 [%r295+3456], %f641;
|
694 |
+
st.shared.f32 [%r295+4608], %f642;
|
695 |
+
st.shared.f32 [%r295+5760], %f643;
|
696 |
+
st.shared.f32 [%r295+6912], %f644;
|
697 |
+
st.shared.f32 [%r295+8064], %f645;
|
698 |
+
bar.sync 0;
|
699 |
+
ld.shared.v4.f32 {%f310, %f311, %f312, %f313}, [%r298];
|
700 |
+
ld.shared.v4.f32 {%f314, %f315, %f316, %f317}, [%r298+16];
|
701 |
+
$L__tmp3:
|
702 |
+
.loc 2 108 21
|
703 |
+
sub.f32 %f318, %f663, %f662;
|
704 |
+
.loc 2 109 28
|
705 |
+
add.f32 %f319, %f302, %f303;
|
706 |
+
.loc 2 110 39
|
707 |
+
setp.eq.f32 %p73, %f319, 0f00000000;
|
708 |
+
.loc 2 110 60
|
709 |
+
mov.b32 %r183, %f303;
|
710 |
+
mov.b32 %r184, %f319;
|
711 |
+
div.full.f32 %r182, %r183, %r184;
|
712 |
+
mov.b32 %f320, %r182;
|
713 |
+
.loc 2 110 49
|
714 |
+
selp.f32 %f321, 0f00000000, %f320, %p73;
|
715 |
+
.loc 2 112 17
|
716 |
+
fma.rn.f32 %f322, %f318, %f321, %f662;
|
717 |
+
.loc 2 113 15
|
718 |
+
add.f32 %f323, %f646, %f647;
|
719 |
+
.loc 2 113 30
|
720 |
+
mul.f32 %f324, %f318, %f318;
|
721 |
+
.loc 2 113 38
|
722 |
+
mul.f32 %f325, %f324, %f302;
|
723 |
+
.loc 2 113 22
|
724 |
+
fma.rn.f32 %f326, %f325, %f321, %f323;
|
725 |
+
.loc 2 108 21
|
726 |
+
sub.f32 %f327, %f664, %f322;
|
727 |
+
.loc 2 109 28
|
728 |
+
add.f32 %f328, %f304, %f319;
|
729 |
+
.loc 2 110 39
|
730 |
+
setp.eq.f32 %p74, %f328, 0f00000000;
|
731 |
+
.loc 2 110 60
|
732 |
+
mov.b32 %r187, %f328;
|
733 |
+
mov.b32 %r186, %f304;
|
734 |
+
div.full.f32 %r185, %r186, %r187;
|
735 |
+
mov.b32 %f329, %r185;
|
736 |
+
.loc 2 110 49
|
737 |
+
selp.f32 %f330, 0f00000000, %f329, %p74;
|
738 |
+
.loc 2 112 17
|
739 |
+
fma.rn.f32 %f331, %f330, %f327, %f322;
|
740 |
+
.loc 2 113 15
|
741 |
+
add.f32 %f332, %f648, %f326;
|
742 |
+
.loc 2 113 30
|
743 |
+
mul.f32 %f333, %f327, %f327;
|
744 |
+
.loc 2 113 38
|
745 |
+
mul.f32 %f334, %f319, %f333;
|
746 |
+
.loc 2 113 22
|
747 |
+
fma.rn.f32 %f335, %f330, %f334, %f332;
|
748 |
+
.loc 2 108 21
|
749 |
+
sub.f32 %f336, %f665, %f331;
|
750 |
+
.loc 2 109 28
|
751 |
+
add.f32 %f337, %f305, %f328;
|
752 |
+
.loc 2 110 39
|
753 |
+
setp.eq.f32 %p75, %f337, 0f00000000;
|
754 |
+
.loc 2 110 60
|
755 |
+
mov.b32 %r190, %f337;
|
756 |
+
mov.b32 %r189, %f305;
|
757 |
+
div.full.f32 %r188, %r189, %r190;
|
758 |
+
mov.b32 %f338, %r188;
|
759 |
+
.loc 2 110 49
|
760 |
+
selp.f32 %f339, 0f00000000, %f338, %p75;
|
761 |
+
.loc 2 112 17
|
762 |
+
fma.rn.f32 %f340, %f339, %f336, %f331;
|
763 |
+
.loc 2 113 15
|
764 |
+
add.f32 %f341, %f649, %f335;
|
765 |
+
.loc 2 113 30
|
766 |
+
mul.f32 %f342, %f336, %f336;
|
767 |
+
.loc 2 113 38
|
768 |
+
mul.f32 %f343, %f328, %f342;
|
769 |
+
.loc 2 113 22
|
770 |
+
fma.rn.f32 %f344, %f339, %f343, %f341;
|
771 |
+
.loc 2 108 21
|
772 |
+
sub.f32 %f345, %f666, %f340;
|
773 |
+
.loc 2 109 28
|
774 |
+
add.f32 %f346, %f306, %f337;
|
775 |
+
.loc 2 110 39
|
776 |
+
setp.eq.f32 %p76, %f346, 0f00000000;
|
777 |
+
.loc 2 110 60
|
778 |
+
mov.b32 %r193, %f346;
|
779 |
+
mov.b32 %r192, %f306;
|
780 |
+
div.full.f32 %r191, %r192, %r193;
|
781 |
+
mov.b32 %f347, %r191;
|
782 |
+
.loc 2 110 49
|
783 |
+
selp.f32 %f348, 0f00000000, %f347, %p76;
|
784 |
+
.loc 2 112 17
|
785 |
+
fma.rn.f32 %f349, %f348, %f345, %f340;
|
786 |
+
.loc 2 113 15
|
787 |
+
add.f32 %f350, %f650, %f344;
|
788 |
+
.loc 2 113 30
|
789 |
+
mul.f32 %f351, %f345, %f345;
|
790 |
+
.loc 2 113 38
|
791 |
+
mul.f32 %f352, %f337, %f351;
|
792 |
+
.loc 2 113 22
|
793 |
+
fma.rn.f32 %f353, %f348, %f352, %f350;
|
794 |
+
.loc 2 108 21
|
795 |
+
sub.f32 %f354, %f667, %f349;
|
796 |
+
.loc 2 109 28
|
797 |
+
add.f32 %f355, %f307, %f346;
|
798 |
+
.loc 2 110 39
|
799 |
+
setp.eq.f32 %p77, %f355, 0f00000000;
|
800 |
+
.loc 2 110 60
|
801 |
+
mov.b32 %r196, %f355;
|
802 |
+
mov.b32 %r195, %f307;
|
803 |
+
div.full.f32 %r194, %r195, %r196;
|
804 |
+
mov.b32 %f356, %r194;
|
805 |
+
.loc 2 110 49
|
806 |
+
selp.f32 %f357, 0f00000000, %f356, %p77;
|
807 |
+
.loc 2 112 17
|
808 |
+
fma.rn.f32 %f358, %f357, %f354, %f349;
|
809 |
+
.loc 2 113 15
|
810 |
+
add.f32 %f359, %f651, %f353;
|
811 |
+
.loc 2 113 30
|
812 |
+
mul.f32 %f360, %f354, %f354;
|
813 |
+
.loc 2 113 38
|
814 |
+
mul.f32 %f361, %f346, %f360;
|
815 |
+
.loc 2 113 22
|
816 |
+
fma.rn.f32 %f362, %f357, %f361, %f359;
|
817 |
+
.loc 2 108 21
|
818 |
+
sub.f32 %f363, %f668, %f358;
|
819 |
+
.loc 2 109 28
|
820 |
+
add.f32 %f364, %f308, %f355;
|
821 |
+
.loc 2 110 39
|
822 |
+
setp.eq.f32 %p78, %f364, 0f00000000;
|
823 |
+
.loc 2 110 60
|
824 |
+
mov.b32 %r199, %f364;
|
825 |
+
mov.b32 %r198, %f308;
|
826 |
+
div.full.f32 %r197, %r198, %r199;
|
827 |
+
mov.b32 %f365, %r197;
|
828 |
+
.loc 2 110 49
|
829 |
+
selp.f32 %f366, 0f00000000, %f365, %p78;
|
830 |
+
.loc 2 112 17
|
831 |
+
fma.rn.f32 %f367, %f366, %f363, %f358;
|
832 |
+
.loc 2 113 15
|
833 |
+
add.f32 %f368, %f652, %f362;
|
834 |
+
.loc 2 113 30
|
835 |
+
mul.f32 %f369, %f363, %f363;
|
836 |
+
.loc 2 113 38
|
837 |
+
mul.f32 %f370, %f355, %f369;
|
838 |
+
.loc 2 113 22
|
839 |
+
fma.rn.f32 %f371, %f366, %f370, %f368;
|
840 |
+
.loc 2 108 21
|
841 |
+
sub.f32 %f372, %f669, %f367;
|
842 |
+
.loc 2 109 28
|
843 |
+
add.f32 %f373, %f309, %f364;
|
844 |
+
.loc 2 110 39
|
845 |
+
setp.eq.f32 %p79, %f373, 0f00000000;
|
846 |
+
.loc 2 110 60
|
847 |
+
mov.b32 %r202, %f373;
|
848 |
+
mov.b32 %r201, %f309;
|
849 |
+
div.full.f32 %r200, %r201, %r202;
|
850 |
+
mov.b32 %f374, %r200;
|
851 |
+
.loc 2 110 49
|
852 |
+
selp.f32 %f375, 0f00000000, %f374, %p79;
|
853 |
+
.loc 2 112 17
|
854 |
+
fma.rn.f32 %f376, %f375, %f372, %f367;
|
855 |
+
.loc 2 113 15
|
856 |
+
add.f32 %f377, %f653, %f371;
|
857 |
+
.loc 2 113 30
|
858 |
+
mul.f32 %f378, %f372, %f372;
|
859 |
+
.loc 2 113 38
|
860 |
+
mul.f32 %f379, %f364, %f378;
|
861 |
+
.loc 2 113 22
|
862 |
+
fma.rn.f32 %f380, %f375, %f379, %f377;
|
863 |
+
.loc 2 108 21
|
864 |
+
sub.f32 %f381, %f671, %f670;
|
865 |
+
.loc 2 109 28
|
866 |
+
add.f32 %f382, %f310, %f311;
|
867 |
+
.loc 2 110 39
|
868 |
+
setp.eq.f32 %p80, %f382, 0f00000000;
|
869 |
+
.loc 2 110 60
|
870 |
+
mov.b32 %r204, %f311;
|
871 |
+
mov.b32 %r205, %f382;
|
872 |
+
div.full.f32 %r203, %r204, %r205;
|
873 |
+
mov.b32 %f383, %r203;
|
874 |
+
.loc 2 110 49
|
875 |
+
selp.f32 %f384, 0f00000000, %f383, %p80;
|
876 |
+
.loc 2 112 17
|
877 |
+
fma.rn.f32 %f385, %f381, %f384, %f670;
|
878 |
+
.loc 2 113 15
|
879 |
+
add.f32 %f386, %f654, %f655;
|
880 |
+
.loc 2 113 30
|
881 |
+
mul.f32 %f387, %f381, %f381;
|
882 |
+
.loc 2 113 38
|
883 |
+
mul.f32 %f388, %f387, %f310;
|
884 |
+
.loc 2 113 22
|
885 |
+
fma.rn.f32 %f389, %f388, %f384, %f386;
|
886 |
+
.loc 2 108 21
|
887 |
+
sub.f32 %f390, %f672, %f385;
|
888 |
+
.loc 2 109 28
|
889 |
+
add.f32 %f391, %f312, %f382;
|
890 |
+
.loc 2 110 39
|
891 |
+
setp.eq.f32 %p81, %f391, 0f00000000;
|
892 |
+
.loc 2 110 60
|
893 |
+
mov.b32 %r208, %f391;
|
894 |
+
mov.b32 %r207, %f312;
|
895 |
+
div.full.f32 %r206, %r207, %r208;
|
896 |
+
mov.b32 %f392, %r206;
|
897 |
+
.loc 2 110 49
|
898 |
+
selp.f32 %f393, 0f00000000, %f392, %p81;
|
899 |
+
.loc 2 112 17
|
900 |
+
fma.rn.f32 %f394, %f393, %f390, %f385;
|
901 |
+
.loc 2 113 15
|
902 |
+
add.f32 %f395, %f656, %f389;
|
903 |
+
.loc 2 113 30
|
904 |
+
mul.f32 %f396, %f390, %f390;
|
905 |
+
.loc 2 113 38
|
906 |
+
mul.f32 %f397, %f382, %f396;
|
907 |
+
.loc 2 113 22
|
908 |
+
fma.rn.f32 %f398, %f393, %f397, %f395;
|
909 |
+
.loc 2 108 21
|
910 |
+
sub.f32 %f399, %f673, %f394;
|
911 |
+
.loc 2 109 28
|
912 |
+
add.f32 %f400, %f313, %f391;
|
913 |
+
.loc 2 110 39
|
914 |
+
setp.eq.f32 %p82, %f400, 0f00000000;
|
915 |
+
.loc 2 110 60
|
916 |
+
mov.b32 %r211, %f400;
|
917 |
+
mov.b32 %r210, %f313;
|
918 |
+
div.full.f32 %r209, %r210, %r211;
|
919 |
+
mov.b32 %f401, %r209;
|
920 |
+
.loc 2 110 49
|
921 |
+
selp.f32 %f402, 0f00000000, %f401, %p82;
|
922 |
+
.loc 2 112 17
|
923 |
+
fma.rn.f32 %f403, %f402, %f399, %f394;
|
924 |
+
.loc 2 113 15
|
925 |
+
add.f32 %f404, %f657, %f398;
|
926 |
+
.loc 2 113 30
|
927 |
+
mul.f32 %f405, %f399, %f399;
|
928 |
+
.loc 2 113 38
|
929 |
+
mul.f32 %f406, %f391, %f405;
|
930 |
+
.loc 2 113 22
|
931 |
+
fma.rn.f32 %f407, %f402, %f406, %f404;
|
932 |
+
.loc 2 108 21
|
933 |
+
sub.f32 %f408, %f674, %f403;
|
934 |
+
.loc 2 109 28
|
935 |
+
add.f32 %f409, %f314, %f400;
|
936 |
+
.loc 2 110 39
|
937 |
+
setp.eq.f32 %p83, %f409, 0f00000000;
|
938 |
+
.loc 2 110 60
|
939 |
+
mov.b32 %r214, %f409;
|
940 |
+
mov.b32 %r213, %f314;
|
941 |
+
div.full.f32 %r212, %r213, %r214;
|
942 |
+
mov.b32 %f410, %r212;
|
943 |
+
.loc 2 110 49
|
944 |
+
selp.f32 %f411, 0f00000000, %f410, %p83;
|
945 |
+
.loc 2 112 17
|
946 |
+
fma.rn.f32 %f412, %f411, %f408, %f403;
|
947 |
+
.loc 2 113 15
|
948 |
+
add.f32 %f413, %f658, %f407;
|
949 |
+
.loc 2 113 30
|
950 |
+
mul.f32 %f414, %f408, %f408;
|
951 |
+
.loc 2 113 38
|
952 |
+
mul.f32 %f415, %f400, %f414;
|
953 |
+
.loc 2 113 22
|
954 |
+
fma.rn.f32 %f416, %f411, %f415, %f413;
|
955 |
+
.loc 2 108 21
|
956 |
+
sub.f32 %f417, %f675, %f412;
|
957 |
+
.loc 2 109 28
|
958 |
+
add.f32 %f418, %f315, %f409;
|
959 |
+
.loc 2 110 39
|
960 |
+
setp.eq.f32 %p84, %f418, 0f00000000;
|
961 |
+
.loc 2 110 60
|
962 |
+
mov.b32 %r217, %f418;
|
963 |
+
mov.b32 %r216, %f315;
|
964 |
+
div.full.f32 %r215, %r216, %r217;
|
965 |
+
mov.b32 %f419, %r215;
|
966 |
+
.loc 2 110 49
|
967 |
+
selp.f32 %f420, 0f00000000, %f419, %p84;
|
968 |
+
.loc 2 112 17
|
969 |
+
fma.rn.f32 %f421, %f420, %f417, %f412;
|
970 |
+
.loc 2 113 15
|
971 |
+
add.f32 %f422, %f659, %f416;
|
972 |
+
.loc 2 113 30
|
973 |
+
mul.f32 %f423, %f417, %f417;
|
974 |
+
.loc 2 113 38
|
975 |
+
mul.f32 %f424, %f409, %f423;
|
976 |
+
.loc 2 113 22
|
977 |
+
fma.rn.f32 %f425, %f420, %f424, %f422;
|
978 |
+
.loc 2 108 21
|
979 |
+
sub.f32 %f426, %f676, %f421;
|
980 |
+
.loc 2 109 28
|
981 |
+
add.f32 %f427, %f316, %f418;
|
982 |
+
.loc 2 110 39
|
983 |
+
setp.eq.f32 %p85, %f427, 0f00000000;
|
984 |
+
.loc 2 110 60
|
985 |
+
mov.b32 %r220, %f427;
|
986 |
+
mov.b32 %r219, %f316;
|
987 |
+
div.full.f32 %r218, %r219, %r220;
|
988 |
+
mov.b32 %f428, %r218;
|
989 |
+
.loc 2 110 49
|
990 |
+
selp.f32 %f429, 0f00000000, %f428, %p85;
|
991 |
+
.loc 2 112 17
|
992 |
+
fma.rn.f32 %f430, %f429, %f426, %f421;
|
993 |
+
.loc 2 113 15
|
994 |
+
add.f32 %f431, %f660, %f425;
|
995 |
+
.loc 2 113 30
|
996 |
+
mul.f32 %f432, %f426, %f426;
|
997 |
+
.loc 2 113 38
|
998 |
+
mul.f32 %f433, %f418, %f432;
|
999 |
+
.loc 2 113 22
|
1000 |
+
fma.rn.f32 %f434, %f429, %f433, %f431;
|
1001 |
+
.loc 2 108 21
|
1002 |
+
sub.f32 %f435, %f677, %f430;
|
1003 |
+
.loc 2 109 28
|
1004 |
+
add.f32 %f436, %f317, %f427;
|
1005 |
+
.loc 2 110 39
|
1006 |
+
setp.eq.f32 %p86, %f436, 0f00000000;
|
1007 |
+
.loc 2 110 60
|
1008 |
+
mov.b32 %r223, %f436;
|
1009 |
+
mov.b32 %r222, %f317;
|
1010 |
+
div.full.f32 %r221, %r222, %r223;
|
1011 |
+
mov.b32 %f437, %r221;
|
1012 |
+
.loc 2 110 49
|
1013 |
+
selp.f32 %f438, 0f00000000, %f437, %p86;
|
1014 |
+
.loc 2 112 17
|
1015 |
+
fma.rn.f32 %f439, %f438, %f435, %f430;
|
1016 |
+
.loc 2 113 15
|
1017 |
+
add.f32 %f440, %f661, %f434;
|
1018 |
+
.loc 2 113 30
|
1019 |
+
mul.f32 %f441, %f435, %f435;
|
1020 |
+
.loc 2 113 38
|
1021 |
+
mul.f32 %f442, %f427, %f441;
|
1022 |
+
.loc 2 113 22
|
1023 |
+
fma.rn.f32 %f443, %f438, %f442, %f440;
|
1024 |
+
$L__tmp4:
|
1025 |
+
.loc 2 120 46
|
1026 |
+
mov.b32 %r299, %f376;
|
1027 |
+
shfl.sync.bfly.b32 %r300, %r299, 4, 31, -1;
|
1028 |
+
mov.b32 %f444, %r300;
|
1029 |
+
mov.b32 %r301, %f380;
|
1030 |
+
shfl.sync.bfly.b32 %r302, %r301, 4, 31, -1;
|
1031 |
+
mov.b32 %f445, %r302;
|
1032 |
+
shfl.sync.bfly.b32 %r225, %r202, 4, 31, -1;
|
1033 |
+
mov.b32 %f446, %r225;
|
1034 |
+
$L__tmp5:
|
1035 |
+
.loc 2 108 21
|
1036 |
+
sub.f32 %f447, %f444, %f376;
|
1037 |
+
.loc 2 109 28
|
1038 |
+
add.f32 %f448, %f373, %f446;
|
1039 |
+
.loc 2 110 39
|
1040 |
+
setp.eq.f32 %p87, %f448, 0f00000000;
|
1041 |
+
.loc 2 110 60
|
1042 |
+
mov.b32 %r226, %f448;
|
1043 |
+
div.full.f32 %r224, %r225, %r226;
|
1044 |
+
mov.b32 %f449, %r224;
|
1045 |
+
.loc 2 110 49
|
1046 |
+
selp.f32 %f450, 0f00000000, %f449, %p87;
|
1047 |
+
.loc 2 112 17
|
1048 |
+
fma.rn.f32 %f451, %f450, %f447, %f376;
|
1049 |
+
.loc 2 113 15
|
1050 |
+
add.f32 %f452, %f380, %f445;
|
1051 |
+
.loc 2 113 30
|
1052 |
+
mul.f32 %f453, %f447, %f447;
|
1053 |
+
.loc 2 113 38
|
1054 |
+
mul.f32 %f454, %f373, %f453;
|
1055 |
+
.loc 2 113 22
|
1056 |
+
fma.rn.f32 %f455, %f450, %f454, %f452;
|
1057 |
+
$L__tmp6:
|
1058 |
+
.loc 2 120 46
|
1059 |
+
mov.b32 %r303, %f451;
|
1060 |
+
shfl.sync.bfly.b32 %r304, %r303, 2, 31, -1;
|
1061 |
+
mov.b32 %f456, %r304;
|
1062 |
+
mov.b32 %r305, %f455;
|
1063 |
+
shfl.sync.bfly.b32 %r306, %r305, 2, 31, -1;
|
1064 |
+
mov.b32 %f457, %r306;
|
1065 |
+
shfl.sync.bfly.b32 %r228, %r226, 2, 31, -1;
|
1066 |
+
mov.b32 %f458, %r228;
|
1067 |
+
$L__tmp7:
|
1068 |
+
.loc 2 108 21
|
1069 |
+
sub.f32 %f459, %f456, %f451;
|
1070 |
+
.loc 2 109 28
|
1071 |
+
add.f32 %f460, %f448, %f458;
|
1072 |
+
.loc 2 110 39
|
1073 |
+
setp.eq.f32 %p88, %f460, 0f00000000;
|
1074 |
+
.loc 2 110 60
|
1075 |
+
mov.b32 %r229, %f460;
|
1076 |
+
div.full.f32 %r227, %r228, %r229;
|
1077 |
+
mov.b32 %f461, %r227;
|
1078 |
+
.loc 2 110 49
|
1079 |
+
selp.f32 %f462, 0f00000000, %f461, %p88;
|
1080 |
+
.loc 2 112 17
|
1081 |
+
fma.rn.f32 %f463, %f462, %f459, %f451;
|
1082 |
+
.loc 2 113 15
|
1083 |
+
add.f32 %f464, %f455, %f457;
|
1084 |
+
.loc 2 113 30
|
1085 |
+
mul.f32 %f465, %f459, %f459;
|
1086 |
+
.loc 2 113 38
|
1087 |
+
mul.f32 %f466, %f448, %f465;
|
1088 |
+
.loc 2 113 22
|
1089 |
+
fma.rn.f32 %f467, %f462, %f466, %f464;
|
1090 |
+
$L__tmp8:
|
1091 |
+
.loc 2 120 46
|
1092 |
+
mov.b32 %r307, %f463;
|
1093 |
+
shfl.sync.bfly.b32 %r308, %r307, 1, 31, -1;
|
1094 |
+
mov.b32 %f468, %r308;
|
1095 |
+
mov.b32 %r309, %f467;
|
1096 |
+
shfl.sync.bfly.b32 %r310, %r309, 1, 31, -1;
|
1097 |
+
mov.b32 %f469, %r310;
|
1098 |
+
shfl.sync.bfly.b32 %r231, %r229, 1, 31, -1;
|
1099 |
+
mov.b32 %f470, %r231;
|
1100 |
+
$L__tmp9:
|
1101 |
+
.loc 2 108 21
|
1102 |
+
sub.f32 %f471, %f468, %f463;
|
1103 |
+
.loc 2 109 28
|
1104 |
+
add.f32 %f472, %f460, %f470;
|
1105 |
+
.loc 2 110 39
|
1106 |
+
setp.eq.f32 %p89, %f472, 0f00000000;
|
1107 |
+
.loc 2 110 60
|
1108 |
+
mov.b32 %r232, %f472;
|
1109 |
+
div.full.f32 %r230, %r231, %r232;
|
1110 |
+
mov.b32 %f473, %r230;
|
1111 |
+
.loc 2 110 49
|
1112 |
+
selp.f32 %f474, 0f00000000, %f473, %p89;
|
1113 |
+
.loc 2 112 17
|
1114 |
+
fma.rn.f32 %f161, %f471, %f474, %f463;
|
1115 |
+
.loc 2 113 15
|
1116 |
+
add.f32 %f475, %f467, %f469;
|
1117 |
+
.loc 2 113 30
|
1118 |
+
mul.f32 %f476, %f471, %f471;
|
1119 |
+
.loc 2 113 38
|
1120 |
+
mul.f32 %f477, %f460, %f476;
|
1121 |
+
.loc 2 113 22
|
1122 |
+
fma.rn.f32 %f478, %f474, %f477, %f475;
|
1123 |
+
$L__tmp10:
|
1124 |
+
.loc 2 120 46
|
1125 |
+
mov.b32 %r311, %f439;
|
1126 |
+
shfl.sync.bfly.b32 %r312, %r311, 4, 31, -1;
|
1127 |
+
mov.b32 %f479, %r312;
|
1128 |
+
mov.b32 %r313, %f443;
|
1129 |
+
shfl.sync.bfly.b32 %r314, %r313, 4, 31, -1;
|
1130 |
+
mov.b32 %f480, %r314;
|
1131 |
+
shfl.sync.bfly.b32 %r234, %r223, 4, 31, -1;
|
1132 |
+
mov.b32 %f481, %r234;
|
1133 |
+
$L__tmp11:
|
1134 |
+
.loc 2 108 21
|
1135 |
+
sub.f32 %f482, %f479, %f439;
|
1136 |
+
.loc 2 109 28
|
1137 |
+
add.f32 %f483, %f436, %f481;
|
1138 |
+
.loc 2 110 39
|
1139 |
+
setp.eq.f32 %p90, %f483, 0f00000000;
|
1140 |
+
.loc 2 110 60
|
1141 |
+
mov.b32 %r235, %f483;
|
1142 |
+
div.full.f32 %r233, %r234, %r235;
|
1143 |
+
mov.b32 %f484, %r233;
|
1144 |
+
.loc 2 110 49
|
1145 |
+
selp.f32 %f485, 0f00000000, %f484, %p90;
|
1146 |
+
.loc 2 112 17
|
1147 |
+
fma.rn.f32 %f486, %f482, %f485, %f439;
|
1148 |
+
.loc 2 113 15
|
1149 |
+
add.f32 %f487, %f443, %f480;
|
1150 |
+
.loc 2 113 30
|
1151 |
+
mul.f32 %f488, %f482, %f482;
|
1152 |
+
.loc 2 113 38
|
1153 |
+
mul.f32 %f489, %f436, %f488;
|
1154 |
+
.loc 2 113 22
|
1155 |
+
fma.rn.f32 %f490, %f489, %f485, %f487;
|
1156 |
+
$L__tmp12:
|
1157 |
+
.loc 2 120 46
|
1158 |
+
mov.b32 %r315, %f486;
|
1159 |
+
shfl.sync.bfly.b32 %r316, %r315, 2, 31, -1;
|
1160 |
+
mov.b32 %f491, %r316;
|
1161 |
+
mov.b32 %r317, %f490;
|
1162 |
+
shfl.sync.bfly.b32 %r318, %r317, 2, 31, -1;
|
1163 |
+
mov.b32 %f492, %r318;
|
1164 |
+
shfl.sync.bfly.b32 %r237, %r235, 2, 31, -1;
|
1165 |
+
mov.b32 %f493, %r237;
|
1166 |
+
$L__tmp13:
|
1167 |
+
.loc 2 108 21
|
1168 |
+
sub.f32 %f494, %f491, %f486;
|
1169 |
+
.loc 2 109 28
|
1170 |
+
add.f32 %f495, %f483, %f493;
|
1171 |
+
.loc 2 110 39
|
1172 |
+
setp.eq.f32 %p91, %f495, 0f00000000;
|
1173 |
+
.loc 2 110 60
|
1174 |
+
mov.b32 %r238, %f495;
|
1175 |
+
div.full.f32 %r236, %r237, %r238;
|
1176 |
+
mov.b32 %f496, %r236;
|
1177 |
+
.loc 2 110 49
|
1178 |
+
selp.f32 %f497, 0f00000000, %f496, %p91;
|
1179 |
+
.loc 2 112 17
|
1180 |
+
fma.rn.f32 %f498, %f494, %f497, %f486;
|
1181 |
+
.loc 2 113 15
|
1182 |
+
add.f32 %f499, %f490, %f492;
|
1183 |
+
.loc 2 113 30
|
1184 |
+
mul.f32 %f500, %f494, %f494;
|
1185 |
+
.loc 2 113 38
|
1186 |
+
mul.f32 %f501, %f483, %f500;
|
1187 |
+
.loc 2 113 22
|
1188 |
+
fma.rn.f32 %f502, %f497, %f501, %f499;
|
1189 |
+
$L__tmp14:
|
1190 |
+
.loc 2 120 46
|
1191 |
+
mov.b32 %r319, %f498;
|
1192 |
+
shfl.sync.bfly.b32 %r320, %r319, 1, 31, -1;
|
1193 |
+
mov.b32 %f503, %r320;
|
1194 |
+
mov.b32 %r321, %f502;
|
1195 |
+
shfl.sync.bfly.b32 %r322, %r321, 1, 31, -1;
|
1196 |
+
mov.b32 %f504, %r322;
|
1197 |
+
shfl.sync.bfly.b32 %r240, %r238, 1, 31, -1;
|
1198 |
+
mov.b32 %f505, %r240;
|
1199 |
+
$L__tmp15:
|
1200 |
+
.loc 2 108 21
|
1201 |
+
sub.f32 %f506, %f503, %f498;
|
1202 |
+
.loc 2 109 28
|
1203 |
+
add.f32 %f507, %f495, %f505;
|
1204 |
+
.loc 2 110 39
|
1205 |
+
setp.eq.f32 %p92, %f507, 0f00000000;
|
1206 |
+
.loc 2 110 60
|
1207 |
+
mov.b32 %r241, %f507;
|
1208 |
+
div.full.f32 %r239, %r240, %r241;
|
1209 |
+
mov.b32 %f508, %r239;
|
1210 |
+
.loc 2 110 49
|
1211 |
+
selp.f32 %f509, 0f00000000, %f508, %p92;
|
1212 |
+
.loc 2 112 17
|
1213 |
+
fma.rn.f32 %f162, %f506, %f509, %f498;
|
1214 |
+
.loc 2 113 15
|
1215 |
+
add.f32 %f510, %f502, %f504;
|
1216 |
+
.loc 2 113 30
|
1217 |
+
mul.f32 %f511, %f506, %f506;
|
1218 |
+
.loc 2 113 38
|
1219 |
+
mul.f32 %f512, %f495, %f511;
|
1220 |
+
.loc 2 113 22
|
1221 |
+
fma.rn.f32 %f513, %f509, %f512, %f510;
|
1222 |
+
$L__tmp16:
|
1223 |
+
.loc 1 75 24
|
1224 |
+
mov.b32 %r243, %f478;
|
1225 |
+
mov.b32 %r244, 1132462080;
|
1226 |
+
div.full.f32 %r242, %r243, %r244;
|
1227 |
+
mov.b32 %f514, %r242;
|
1228 |
+
mov.b32 %r267, %f513;
|
1229 |
+
div.full.f32 %r266, %r267, %r244;
|
1230 |
+
mov.b32 %f515, %r266;
|
1231 |
+
.loc 1 77 24
|
1232 |
+
add.f32 %f163, %f514, 0f3727C5AC;
|
1233 |
+
add.f32 %f164, %f515, 0f3727C5AC;
|
1234 |
+
.loc 1 58 36
|
1235 |
+
add.s64 %rd9, %rd15, %rd2;
|
1236 |
+
mov.u64 %rd117, 0;
|
1237 |
+
mov.b32 %r473, -64;
|
1238 |
+
rsqrt.approx.ftz.f32 %f580, %f163;
|
1239 |
+
rsqrt.approx.ftz.f32 %f581, %f164;
|
1240 |
+
bra.uni $L__BB0_5;
|
1241 |
+
$L__BB0_7:
|
1242 |
+
.loc 1 69 35
|
1243 |
+
add.s64 %rd107, %rd4, %rd117;
|
1244 |
+
add.s64 %rd108, %rd107, 16;
|
1245 |
+
add.s64 %rd109, %rd3, %rd117;
|
1246 |
+
.loc 1 69 54
|
1247 |
+
add.s64 %rd110, %rd109, 16;
|
1248 |
+
mov.u32 %r407, 0x0;
|
1249 |
+
mov.u32 %r408, 0x0;
|
1250 |
+
mov.u32 %r409, 0x0;
|
1251 |
+
mov.u32 %r410, 0x0;
|
1252 |
+
@%p1 ld.global.L1::evict_first.v4.b32 { %r407, %r408, %r409, %r410 }, [ %rd107 + 0 ];
|
1253 |
+
@!%p1 mov.u32 %r407, %r411;
|
1254 |
+
@!%p1 mov.u32 %r408, %r411;
|
1255 |
+
@!%p1 mov.u32 %r409, %r411;
|
1256 |
+
@!%p1 mov.u32 %r410, %r411;
|
1257 |
+
mov.b32 %f516, %r407;
|
1258 |
+
mov.b32 %f517, %r408;
|
1259 |
+
mov.b32 %f518, %r409;
|
1260 |
+
mov.b32 %f519, %r410;
|
1261 |
+
mov.u32 %r415, 0x0;
|
1262 |
+
mov.u32 %r416, 0x0;
|
1263 |
+
mov.u32 %r417, 0x0;
|
1264 |
+
mov.u32 %r418, 0x0;
|
1265 |
+
@%p1 ld.global.L1::evict_first.v4.b32 { %r415, %r416, %r417, %r418 }, [ %rd108 + 0 ];
|
1266 |
+
@!%p1 mov.u32 %r415, %r411;
|
1267 |
+
@!%p1 mov.u32 %r416, %r411;
|
1268 |
+
@!%p1 mov.u32 %r417, %r411;
|
1269 |
+
@!%p1 mov.u32 %r418, %r411;
|
1270 |
+
mov.b32 %f520, %r415;
|
1271 |
+
mov.b32 %f521, %r416;
|
1272 |
+
mov.b32 %f522, %r417;
|
1273 |
+
mov.b32 %f523, %r418;
|
1274 |
+
mov.u32 %r423, 0x0;
|
1275 |
+
mov.u32 %r424, 0x0;
|
1276 |
+
mov.u32 %r425, 0x0;
|
1277 |
+
mov.u32 %r426, 0x0;
|
1278 |
+
@%p1 ld.global.L1::evict_first.v4.b32 { %r423, %r424, %r425, %r426 }, [ %rd109 + 0 ];
|
1279 |
+
@!%p1 mov.u32 %r423, %r411;
|
1280 |
+
@!%p1 mov.u32 %r424, %r411;
|
1281 |
+
@!%p1 mov.u32 %r425, %r411;
|
1282 |
+
@!%p1 mov.u32 %r426, %r411;
|
1283 |
+
mov.b32 %f524, %r423;
|
1284 |
+
mov.b32 %f525, %r424;
|
1285 |
+
mov.b32 %f526, %r425;
|
1286 |
+
mov.b32 %f527, %r426;
|
1287 |
+
mov.u32 %r431, 0x0;
|
1288 |
+
mov.u32 %r432, 0x0;
|
1289 |
+
mov.u32 %r433, 0x0;
|
1290 |
+
mov.u32 %r434, 0x0;
|
1291 |
+
@%p1 ld.global.L1::evict_first.v4.b32 { %r431, %r432, %r433, %r434 }, [ %rd110 + 0 ];
|
1292 |
+
@!%p1 mov.u32 %r431, %r411;
|
1293 |
+
@!%p1 mov.u32 %r432, %r411;
|
1294 |
+
@!%p1 mov.u32 %r433, %r411;
|
1295 |
+
@!%p1 mov.u32 %r434, %r411;
|
1296 |
+
mov.b32 %f528, %r431;
|
1297 |
+
mov.b32 %f529, %r432;
|
1298 |
+
mov.b32 %f530, %r433;
|
1299 |
+
mov.b32 %f531, %r434;
|
1300 |
+
.loc 1 70 24
|
1301 |
+
add.f32 %f532, %f165, %f516;
|
1302 |
+
add.f32 %f533, %f166, %f517;
|
1303 |
+
add.f32 %f534, %f167, %f518;
|
1304 |
+
add.f32 %f535, %f168, %f519;
|
1305 |
+
add.f32 %f536, %f169, %f520;
|
1306 |
+
add.f32 %f537, %f170, %f521;
|
1307 |
+
add.f32 %f538, %f171, %f522;
|
1308 |
+
add.f32 %f539, %f172, %f523;
|
1309 |
+
add.f32 %f540, %f173, %f524;
|
1310 |
+
add.f32 %f541, %f174, %f525;
|
1311 |
+
add.f32 %f542, %f175, %f526;
|
1312 |
+
add.f32 %f543, %f176, %f527;
|
1313 |
+
add.f32 %f544, %f177, %f528;
|
1314 |
+
add.f32 %f545, %f178, %f529;
|
1315 |
+
add.f32 %f546, %f179, %f530;
|
1316 |
+
add.f32 %f547, %f180, %f531;
|
1317 |
+
.loc 1 72 24
|
1318 |
+
add.f32 %f548, %f181, %f532;
|
1319 |
+
add.f32 %f549, %f182, %f533;
|
1320 |
+
add.f32 %f550, %f183, %f534;
|
1321 |
+
add.f32 %f551, %f184, %f535;
|
1322 |
+
add.f32 %f552, %f185, %f536;
|
1323 |
+
add.f32 %f553, %f186, %f537;
|
1324 |
+
add.f32 %f554, %f187, %f538;
|
1325 |
+
add.f32 %f555, %f188, %f539;
|
1326 |
+
add.f32 %f556, %f189, %f540;
|
1327 |
+
add.f32 %f557, %f190, %f541;
|
1328 |
+
add.f32 %f558, %f191, %f542;
|
1329 |
+
add.f32 %f559, %f192, %f543;
|
1330 |
+
add.f32 %f560, %f193, %f544;
|
1331 |
+
add.f32 %f561, %f194, %f545;
|
1332 |
+
add.f32 %f562, %f195, %f546;
|
1333 |
+
add.f32 %f563, %f196, %f547;
|
1334 |
+
.loc 1 73 24
|
1335 |
+
sub.f32 %f564, %f548, %f161;
|
1336 |
+
sub.f32 %f565, %f549, %f161;
|
1337 |
+
sub.f32 %f566, %f550, %f161;
|
1338 |
+
sub.f32 %f567, %f551, %f161;
|
1339 |
+
sub.f32 %f568, %f552, %f161;
|
1340 |
+
sub.f32 %f569, %f553, %f161;
|
1341 |
+
sub.f32 %f570, %f554, %f161;
|
1342 |
+
sub.f32 %f571, %f555, %f161;
|
1343 |
+
sub.f32 %f572, %f556, %f162;
|
1344 |
+
sub.f32 %f573, %f557, %f162;
|
1345 |
+
sub.f32 %f574, %f558, %f162;
|
1346 |
+
sub.f32 %f575, %f559, %f162;
|
1347 |
+
sub.f32 %f576, %f560, %f162;
|
1348 |
+
sub.f32 %f577, %f561, %f162;
|
1349 |
+
sub.f32 %f578, %f562, %f162;
|
1350 |
+
sub.f32 %f579, %f563, %f162;
|
1351 |
+
.loc 1 79 24
|
1352 |
+
mul.f32 %f582, %f564, %f580;
|
1353 |
+
mul.f32 %f583, %f565, %f580;
|
1354 |
+
mul.f32 %f584, %f566, %f580;
|
1355 |
+
mul.f32 %f585, %f567, %f580;
|
1356 |
+
mul.f32 %f586, %f568, %f580;
|
1357 |
+
mul.f32 %f587, %f569, %f580;
|
1358 |
+
mul.f32 %f588, %f570, %f580;
|
1359 |
+
mul.f32 %f589, %f571, %f580;
|
1360 |
+
mul.f32 %f590, %f572, %f581;
|
1361 |
+
mul.f32 %f591, %f573, %f581;
|
1362 |
+
mul.f32 %f592, %f574, %f581;
|
1363 |
+
mul.f32 %f593, %f575, %f581;
|
1364 |
+
mul.f32 %f594, %f576, %f581;
|
1365 |
+
mul.f32 %f595, %f577, %f581;
|
1366 |
+
mul.f32 %f596, %f578, %f581;
|
1367 |
+
mul.f32 %f597, %f579, %f581;
|
1368 |
+
.loc 1 80 24
|
1369 |
+
mul.f32 %f598, %f582, %f197;
|
1370 |
+
mul.f32 %f599, %f583, %f198;
|
1371 |
+
mul.f32 %f600, %f584, %f199;
|
1372 |
+
mul.f32 %f601, %f585, %f200;
|
1373 |
+
mul.f32 %f602, %f586, %f201;
|
1374 |
+
mul.f32 %f603, %f587, %f202;
|
1375 |
+
mul.f32 %f604, %f588, %f203;
|
1376 |
+
mul.f32 %f605, %f589, %f204;
|
1377 |
+
mul.f32 %f606, %f590, %f197;
|
1378 |
+
mul.f32 %f607, %f591, %f198;
|
1379 |
+
mul.f32 %f608, %f592, %f199;
|
1380 |
+
mul.f32 %f609, %f593, %f200;
|
1381 |
+
mul.f32 %f610, %f594, %f201;
|
1382 |
+
mul.f32 %f611, %f595, %f202;
|
1383 |
+
mul.f32 %f612, %f596, %f203;
|
1384 |
+
mul.f32 %f613, %f597, %f204;
|
1385 |
+
.loc 1 82 29
|
1386 |
+
shl.b64 %rd113, %rd11, 1;
|
1387 |
+
add.s64 %rd111, %rd16, %rd113;
|
1388 |
+
shl.b64 %rd114, %rd12, 1;
|
1389 |
+
add.s64 %rd112, %rd16, %rd114;
|
1390 |
+
.loc 1 82 52
|
1391 |
+
mov.b32 %r439, %f598;
|
1392 |
+
cvt.rn.bf16.f32 %rs33, %r439;
|
1393 |
+
mov.b32 %r440, %f599;
|
1394 |
+
cvt.rn.bf16.f32 %rs34, %r440;
|
1395 |
+
mov.b32 %r441, %f600;
|
1396 |
+
cvt.rn.bf16.f32 %rs35, %r441;
|
1397 |
+
mov.b32 %r442, %f601;
|
1398 |
+
cvt.rn.bf16.f32 %rs36, %r442;
|
1399 |
+
mov.b32 %r443, %f602;
|
1400 |
+
cvt.rn.bf16.f32 %rs37, %r443;
|
1401 |
+
mov.b32 %r444, %f603;
|
1402 |
+
cvt.rn.bf16.f32 %rs38, %r444;
|
1403 |
+
mov.b32 %r445, %f604;
|
1404 |
+
cvt.rn.bf16.f32 %rs39, %r445;
|
1405 |
+
mov.b32 %r446, %f605;
|
1406 |
+
cvt.rn.bf16.f32 %rs40, %r446;
|
1407 |
+
mov.b32 %r447, %f606;
|
1408 |
+
cvt.rn.bf16.f32 %rs41, %r447;
|
1409 |
+
mov.b32 %r448, %f607;
|
1410 |
+
cvt.rn.bf16.f32 %rs42, %r448;
|
1411 |
+
mov.b32 %r449, %f608;
|
1412 |
+
cvt.rn.bf16.f32 %rs43, %r449;
|
1413 |
+
mov.b32 %r450, %f609;
|
1414 |
+
cvt.rn.bf16.f32 %rs44, %r450;
|
1415 |
+
mov.b32 %r451, %f610;
|
1416 |
+
cvt.rn.bf16.f32 %rs45, %r451;
|
1417 |
+
mov.b32 %r452, %f611;
|
1418 |
+
cvt.rn.bf16.f32 %rs46, %r452;
|
1419 |
+
mov.b32 %r453, %f612;
|
1420 |
+
cvt.rn.bf16.f32 %rs47, %r453;
|
1421 |
+
mov.b32 %r454, %f613;
|
1422 |
+
cvt.rn.bf16.f32 %rs48, %r454;
|
1423 |
+
mov.b32 %r463, {%rs33, %rs34};
|
1424 |
+
mov.b32 %r464, {%rs35, %rs36};
|
1425 |
+
mov.b32 %r465, {%rs37, %rs38};
|
1426 |
+
mov.b32 %r466, {%rs39, %rs40};
|
1427 |
+
@%p1 st.global.v4.b32 [ %rd111 + 0 ], { %r463, %r464, %r465, %r466 };
|
1428 |
+
mov.b32 %r467, {%rs41, %rs42};
|
1429 |
+
mov.b32 %r468, {%rs43, %rs44};
|
1430 |
+
mov.b32 %r469, {%rs45, %rs46};
|
1431 |
+
mov.b32 %r470, {%rs47, %rs48};
|
1432 |
+
@%p1 st.global.v4.b32 [ %rd112 + 0 ], { %r467, %r468, %r469, %r470 };
|
1433 |
+
.loc 1 58 36
|
1434 |
+
add.s64 %rd117, %rd117, 256;
|
1435 |
+
add.s32 %r473, %r473, 64;
|
1436 |
+
setp.lt.u32 %p156, %r473, 192;
|
1437 |
+
@%p156 bra $L__BB0_5;
|
1438 |
+
bra.uni $L__BB0_8;
|
1439 |
+
$L__BB0_5:
|
1440 |
+
.loc 1 62 35
|
1441 |
+
add.s64 %rd90, %rd6, %rd117;
|
1442 |
+
add.s64 %rd91, %rd90, 16;
|
1443 |
+
add.s64 %rd92, %rd5, %rd117;
|
1444 |
+
.loc 1 62 51
|
1445 |
+
add.s64 %rd93, %rd92, 16;
|
1446 |
+
mov.u32 %r323, 0x0;
|
1447 |
+
mov.u32 %r324, 0x0;
|
1448 |
+
mov.u32 %r325, 0x0;
|
1449 |
+
mov.u32 %r326, 0x0;
|
1450 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r323, %r324, %r325, %r326 }, [ %rd90 + 0 ];
|
1451 |
+
@!%p1 mov.u32 %r323, %r411;
|
1452 |
+
@!%p1 mov.u32 %r324, %r411;
|
1453 |
+
@!%p1 mov.u32 %r325, %r411;
|
1454 |
+
@!%p1 mov.u32 %r326, %r411;
|
1455 |
+
mov.b32 %f165, %r323;
|
1456 |
+
mov.b32 %f166, %r324;
|
1457 |
+
mov.b32 %f167, %r325;
|
1458 |
+
mov.b32 %f168, %r326;
|
1459 |
+
mov.u32 %r331, 0x0;
|
1460 |
+
mov.u32 %r332, 0x0;
|
1461 |
+
mov.u32 %r333, 0x0;
|
1462 |
+
mov.u32 %r334, 0x0;
|
1463 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r331, %r332, %r333, %r334 }, [ %rd91 + 0 ];
|
1464 |
+
@!%p1 mov.u32 %r331, %r411;
|
1465 |
+
@!%p1 mov.u32 %r332, %r411;
|
1466 |
+
@!%p1 mov.u32 %r333, %r411;
|
1467 |
+
@!%p1 mov.u32 %r334, %r411;
|
1468 |
+
mov.b32 %f169, %r331;
|
1469 |
+
mov.b32 %f170, %r332;
|
1470 |
+
mov.b32 %f171, %r333;
|
1471 |
+
mov.b32 %f172, %r334;
|
1472 |
+
mov.u32 %r339, 0x0;
|
1473 |
+
mov.u32 %r340, 0x0;
|
1474 |
+
mov.u32 %r341, 0x0;
|
1475 |
+
mov.u32 %r342, 0x0;
|
1476 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r339, %r340, %r341, %r342 }, [ %rd92 + 0 ];
|
1477 |
+
@!%p1 mov.u32 %r339, %r411;
|
1478 |
+
@!%p1 mov.u32 %r340, %r411;
|
1479 |
+
@!%p1 mov.u32 %r341, %r411;
|
1480 |
+
@!%p1 mov.u32 %r342, %r411;
|
1481 |
+
mov.b32 %f173, %r339;
|
1482 |
+
mov.b32 %f174, %r340;
|
1483 |
+
mov.b32 %f175, %r341;
|
1484 |
+
mov.b32 %f176, %r342;
|
1485 |
+
mov.u32 %r347, 0x0;
|
1486 |
+
mov.u32 %r348, 0x0;
|
1487 |
+
mov.u32 %r349, 0x0;
|
1488 |
+
mov.u32 %r350, 0x0;
|
1489 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r347, %r348, %r349, %r350 }, [ %rd93 + 0 ];
|
1490 |
+
@!%p1 mov.u32 %r347, %r411;
|
1491 |
+
@!%p1 mov.u32 %r348, %r411;
|
1492 |
+
@!%p1 mov.u32 %r349, %r411;
|
1493 |
+
@!%p1 mov.u32 %r350, %r411;
|
1494 |
+
mov.b32 %f177, %r347;
|
1495 |
+
mov.b32 %f178, %r348;
|
1496 |
+
mov.b32 %f179, %r349;
|
1497 |
+
mov.b32 %f180, %r350;
|
1498 |
+
.loc 1 63 41
|
1499 |
+
add.s32 %r403, %r5, %r473;
|
1500 |
+
add.s32 %r404, %r403, 64;
|
1501 |
+
.loc 1 63 35
|
1502 |
+
add.s32 %r405, %r403, 8256;
|
1503 |
+
cvt.s64.s32 %rd11, %r404;
|
1504 |
+
mul.wide.s32 %rd98, %r404, 2;
|
1505 |
+
add.s64 %rd94, %rd14, %rd98;
|
1506 |
+
cvt.s64.s32 %rd12, %r405;
|
1507 |
+
mul.wide.s32 %rd99, %r405, 2;
|
1508 |
+
add.s64 %rd95, %rd14, %rd99;
|
1509 |
+
.loc 1 63 51
|
1510 |
+
mov.u32 %r355, 0x0;
|
1511 |
+
mov.u32 %r356, 0x0;
|
1512 |
+
mov.u32 %r357, 0x0;
|
1513 |
+
mov.u32 %r358, 0x0;
|
1514 |
+
@%p1 ld.global.L1::evict_first.v4.b32 { %r355, %r356, %r357, %r358 }, [ %rd94 + 0 ];
|
1515 |
+
@!%p1 mov.u32 %r355, %r411;
|
1516 |
+
@!%p1 mov.u32 %r356, %r411;
|
1517 |
+
@!%p1 mov.u32 %r357, %r411;
|
1518 |
+
@!%p1 mov.u32 %r358, %r411;
|
1519 |
+
cvt.u16.u32 %rs17, %r355;
|
1520 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs18}, %r355; }
|
1521 |
+
cvt.u16.u32 %rs19, %r356;
|
1522 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs20}, %r356; }
|
1523 |
+
cvt.u16.u32 %rs21, %r357;
|
1524 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs22}, %r357; }
|
1525 |
+
cvt.u16.u32 %rs23, %r358;
|
1526 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs24}, %r358; }
|
1527 |
+
mov.u32 %r363, 0x0;
|
1528 |
+
mov.u32 %r364, 0x0;
|
1529 |
+
mov.u32 %r365, 0x0;
|
1530 |
+
mov.u32 %r366, 0x0;
|
1531 |
+
@%p1 ld.global.L1::evict_first.v4.b32 { %r363, %r364, %r365, %r366 }, [ %rd95 + 0 ];
|
1532 |
+
@!%p1 mov.u32 %r363, %r411;
|
1533 |
+
@!%p1 mov.u32 %r364, %r411;
|
1534 |
+
@!%p1 mov.u32 %r365, %r411;
|
1535 |
+
@!%p1 mov.u32 %r366, %r411;
|
1536 |
+
cvt.u16.u32 %rs25, %r363;
|
1537 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs26}, %r363; }
|
1538 |
+
cvt.u16.u32 %rs27, %r364;
|
1539 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs28}, %r364; }
|
1540 |
+
cvt.u16.u32 %rs29, %r365;
|
1541 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs30}, %r365; }
|
1542 |
+
cvt.u16.u32 %rs31, %r366;
|
1543 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs32}, %r366; }
|
1544 |
+
.loc 1 63 103
|
1545 |
+
cvt.f32.bf16 %r371, %rs17;
|
1546 |
+
mov.b32 %f181, %r371;
|
1547 |
+
cvt.f32.bf16 %r372, %rs18;
|
1548 |
+
mov.b32 %f182, %r372;
|
1549 |
+
cvt.f32.bf16 %r373, %rs19;
|
1550 |
+
mov.b32 %f183, %r373;
|
1551 |
+
cvt.f32.bf16 %r374, %rs20;
|
1552 |
+
mov.b32 %f184, %r374;
|
1553 |
+
cvt.f32.bf16 %r375, %rs21;
|
1554 |
+
mov.b32 %f185, %r375;
|
1555 |
+
cvt.f32.bf16 %r376, %rs22;
|
1556 |
+
mov.b32 %f186, %r376;
|
1557 |
+
cvt.f32.bf16 %r377, %rs23;
|
1558 |
+
mov.b32 %f187, %r377;
|
1559 |
+
cvt.f32.bf16 %r378, %rs24;
|
1560 |
+
mov.b32 %f188, %r378;
|
1561 |
+
cvt.f32.bf16 %r379, %rs25;
|
1562 |
+
mov.b32 %f189, %r379;
|
1563 |
+
cvt.f32.bf16 %r380, %rs26;
|
1564 |
+
mov.b32 %f190, %r380;
|
1565 |
+
cvt.f32.bf16 %r381, %rs27;
|
1566 |
+
mov.b32 %f191, %r381;
|
1567 |
+
cvt.f32.bf16 %r382, %rs28;
|
1568 |
+
mov.b32 %f192, %r382;
|
1569 |
+
cvt.f32.bf16 %r383, %rs29;
|
1570 |
+
mov.b32 %f193, %r383;
|
1571 |
+
cvt.f32.bf16 %r384, %rs30;
|
1572 |
+
mov.b32 %f194, %r384;
|
1573 |
+
cvt.f32.bf16 %r385, %rs31;
|
1574 |
+
mov.b32 %f195, %r385;
|
1575 |
+
cvt.f32.bf16 %r386, %rs32;
|
1576 |
+
mov.b32 %f196, %r386;
|
1577 |
+
.loc 1 64 35
|
1578 |
+
add.s64 %rd96, %rd9, %rd117;
|
1579 |
+
.loc 1 64 40
|
1580 |
+
add.s64 %rd97, %rd96, 16;
|
1581 |
+
mov.u32 %r387, 0x0;
|
1582 |
+
mov.u32 %r388, 0x0;
|
1583 |
+
mov.u32 %r389, 0x0;
|
1584 |
+
mov.u32 %r390, 0x0;
|
1585 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r387, %r388, %r389, %r390 }, [ %rd96 + 0 ];
|
1586 |
+
@!%p1 mov.u32 %r387, %r411;
|
1587 |
+
@!%p1 mov.u32 %r388, %r411;
|
1588 |
+
@!%p1 mov.u32 %r389, %r411;
|
1589 |
+
@!%p1 mov.u32 %r390, %r411;
|
1590 |
+
mov.b32 %f197, %r387;
|
1591 |
+
mov.b32 %f198, %r388;
|
1592 |
+
mov.b32 %f199, %r389;
|
1593 |
+
mov.b32 %f200, %r390;
|
1594 |
+
mov.u32 %r395, 0x0;
|
1595 |
+
mov.u32 %r396, 0x0;
|
1596 |
+
mov.u32 %r397, 0x0;
|
1597 |
+
mov.u32 %r398, 0x0;
|
1598 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r395, %r396, %r397, %r398 }, [ %rd97 + 0 ];
|
1599 |
+
@!%p1 mov.u32 %r395, %r411;
|
1600 |
+
@!%p1 mov.u32 %r396, %r411;
|
1601 |
+
@!%p1 mov.u32 %r397, %r411;
|
1602 |
+
@!%p1 mov.u32 %r398, %r411;
|
1603 |
+
mov.b32 %f201, %r395;
|
1604 |
+
mov.b32 %f202, %r396;
|
1605 |
+
mov.b32 %f203, %r397;
|
1606 |
+
mov.b32 %f204, %r398;
|
1607 |
+
.loc 1 68 57
|
1608 |
+
@%p51 bra $L__BB0_7;
|
1609 |
+
mov.u64 %rd100, assertMessage_1;
|
1610 |
+
cvta.global.u64 %rd101, %rd100;
|
1611 |
+
mov.u64 %rd102, assertFile_1;
|
1612 |
+
cvta.global.u64 %rd103, %rd102;
|
1613 |
+
mov.u64 %rd104, assertFunc_1;
|
1614 |
+
cvta.global.u64 %rd105, %rd104;
|
1615 |
+
{ // callseq 7, 0
|
1616 |
+
.reg .b32 temp_param_reg;
|
1617 |
+
.param .b64 param0;
|
1618 |
+
st.param.b64 [param0+0], %rd101;
|
1619 |
+
.param .b64 param1;
|
1620 |
+
st.param.b64 [param1+0], %rd103;
|
1621 |
+
.param .b32 param2;
|
1622 |
+
st.param.b32 [param2+0], %r471;
|
1623 |
+
.param .b64 param3;
|
1624 |
+
st.param.b64 [param3+0], %rd105;
|
1625 |
+
.param .b64 param4;
|
1626 |
+
st.param.b64 [param4+0], %rd115;
|
1627 |
+
call.uni
|
1628 |
+
__assertfail,
|
1629 |
+
(
|
1630 |
+
param0,
|
1631 |
+
param1,
|
1632 |
+
param2,
|
1633 |
+
param3,
|
1634 |
+
param4
|
1635 |
+
);
|
1636 |
+
} // callseq 7
|
1637 |
+
bra.uni $L__BB0_7;
|
1638 |
+
$L__BB0_8:
|
1639 |
+
.loc 1 58 4
|
1640 |
+
ret;
|
1641 |
+
$L__tmp17:
|
1642 |
+
$L__func_end0:
|
1643 |
+
|
1644 |
+
}
|
1645 |
+
// .globl __nv_rsqrtf
|
1646 |
+
.visible .func (.param .b32 func_retval0) __nv_rsqrtf(
|
1647 |
+
.param .b32 __nv_rsqrtf_param_0
|
1648 |
+
)
|
1649 |
+
{
|
1650 |
+
.reg .f32 %f<3>;
|
1651 |
+
$L__func_begin1:
|
1652 |
+
|
1653 |
+
ld.param.f32 %f1, [__nv_rsqrtf_param_0];
|
1654 |
+
rsqrt.approx.ftz.f32 %f2, %f1;
|
1655 |
+
st.param.f32 [func_retval0+0], %f2;
|
1656 |
+
ret;
|
1657 |
+
$L__func_end1:
|
1658 |
+
|
1659 |
+
}
|
1660 |
+
.file 1 "/tmp/torchinductor_root/pn/cpn3lawg65lpi63gv6c6pn4oikhg6qva2h2qjdpxe6qj4lvttwez.py"
|
1661 |
+
.file 2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
|
1662 |
+
.section .debug_abbrev
|
1663 |
+
{
|
1664 |
+
.b8 1
|
1665 |
+
.b8 17
|
1666 |
+
.b8 1
|
1667 |
+
.b8 37
|
1668 |
+
.b8 8
|
1669 |
+
.b8 19
|
1670 |
+
.b8 5
|
1671 |
+
.b8 3
|
1672 |
+
.b8 8
|
1673 |
+
.b8 16
|
1674 |
+
.b8 6
|
1675 |
+
.b8 27
|
1676 |
+
.b8 8
|
1677 |
+
.b8 180
|
1678 |
+
.b8 66
|
1679 |
+
.b8 12
|
1680 |
+
.b8 17
|
1681 |
+
.b8 1
|
1682 |
+
.b8 18
|
1683 |
+
.b8 1
|
1684 |
+
.b8 0
|
1685 |
+
.b8 0
|
1686 |
+
.b8 2
|
1687 |
+
.b8 46
|
1688 |
+
.b8 0
|
1689 |
+
.b8 135
|
1690 |
+
.b8 64
|
1691 |
+
.b8 8
|
1692 |
+
.b8 3
|
1693 |
+
.b8 8
|
1694 |
+
.b8 58
|
1695 |
+
.b8 11
|
1696 |
+
.b8 59
|
1697 |
+
.b8 11
|
1698 |
+
.b8 63
|
1699 |
+
.b8 12
|
1700 |
+
.b8 32
|
1701 |
+
.b8 11
|
1702 |
+
.b8 0
|
1703 |
+
.b8 0
|
1704 |
+
.b8 3
|
1705 |
+
.b8 46
|
1706 |
+
.b8 1
|
1707 |
+
.b8 17
|
1708 |
+
.b8 1
|
1709 |
+
.b8 18
|
1710 |
+
.b8 1
|
1711 |
+
.b8 64
|
1712 |
+
.b8 10
|
1713 |
+
.b8 49
|
1714 |
+
.b8 19
|
1715 |
+
.b8 0
|
1716 |
+
.b8 0
|
1717 |
+
.b8 4
|
1718 |
+
.b8 29
|
1719 |
+
.b8 0
|
1720 |
+
.b8 49
|
1721 |
+
.b8 19
|
1722 |
+
.b8 17
|
1723 |
+
.b8 1
|
1724 |
+
.b8 18
|
1725 |
+
.b8 1
|
1726 |
+
.b8 88
|
1727 |
+
.b8 11
|
1728 |
+
.b8 89
|
1729 |
+
.b8 11
|
1730 |
+
.b8 87
|
1731 |
+
.b8 11
|
1732 |
+
.b8 0
|
1733 |
+
.b8 0
|
1734 |
+
.b8 5
|
1735 |
+
.b8 29
|
1736 |
+
.b8 1
|
1737 |
+
.b8 49
|
1738 |
+
.b8 19
|
1739 |
+
.b8 17
|
1740 |
+
.b8 1
|
1741 |
+
.b8 18
|
1742 |
+
.b8 1
|
1743 |
+
.b8 88
|
1744 |
+
.b8 11
|
1745 |
+
.b8 89
|
1746 |
+
.b8 11
|
1747 |
+
.b8 87
|
1748 |
+
.b8 11
|
1749 |
+
.b8 0
|
1750 |
+
.b8 0
|
1751 |
+
.b8 0
|
1752 |
+
}
|
1753 |
+
.section .debug_info
|
1754 |
+
{
|
1755 |
+
.b32 302
|
1756 |
+
.b8 2
|
1757 |
+
.b8 0
|
1758 |
+
.b32 .debug_abbrev
|
1759 |
+
.b8 8
|
1760 |
+
.b8 1
|
1761 |
+
.b8 116
|
1762 |
+
.b8 114
|
1763 |
+
.b8 105
|
1764 |
+
.b8 116
|
1765 |
+
.b8 111
|
1766 |
+
.b8 110
|
1767 |
+
.b8 0
|
1768 |
+
.b8 2
|
1769 |
+
.b8 0
|
1770 |
+
.b8 99
|
1771 |
+
.b8 112
|
1772 |
+
.b8 110
|
1773 |
+
.b8 51
|
1774 |
+
.b8 108
|
1775 |
+
.b8 97
|
1776 |
+
.b8 119
|
1777 |
+
.b8 103
|
1778 |
+
.b8 54
|
1779 |
+
.b8 53
|
1780 |
+
.b8 108
|
1781 |
+
.b8 112
|
1782 |
+
.b8 105
|
1783 |
+
.b8 54
|
1784 |
+
.b8 51
|
1785 |
+
.b8 103
|
1786 |
+
.b8 118
|
1787 |
+
.b8 54
|
1788 |
+
.b8 99
|
1789 |
+
.b8 54
|
1790 |
+
.b8 112
|
1791 |
+
.b8 110
|
1792 |
+
.b8 52
|
1793 |
+
.b8 111
|
1794 |
+
.b8 105
|
1795 |
+
.b8 107
|
1796 |
+
.b8 104
|
1797 |
+
.b8 103
|
1798 |
+
.b8 54
|
1799 |
+
.b8 113
|
1800 |
+
.b8 118
|
1801 |
+
.b8 97
|
1802 |
+
.b8 50
|
1803 |
+
.b8 104
|
1804 |
+
.b8 50
|
1805 |
+
.b8 113
|
1806 |
+
.b8 106
|
1807 |
+
.b8 100
|
1808 |
+
.b8 112
|
1809 |
+
.b8 120
|
1810 |
+
.b8 101
|
1811 |
+
.b8 54
|
1812 |
+
.b8 113
|
1813 |
+
.b8 106
|
1814 |
+
.b8 52
|
1815 |
+
.b8 108
|
1816 |
+
.b8 118
|
1817 |
+
.b8 116
|
1818 |
+
.b8 116
|
1819 |
+
.b8 119
|
1820 |
+
.b8 101
|
1821 |
+
.b8 122
|
1822 |
+
.b8 46
|
1823 |
+
.b8 112
|
1824 |
+
.b8 121
|
1825 |
+
.b8 0
|
1826 |
+
.b32 .debug_line
|
1827 |
+
.b8 47
|
1828 |
+
.b8 116
|
1829 |
+
.b8 109
|
1830 |
+
.b8 112
|
1831 |
+
.b8 47
|
1832 |
+
.b8 116
|
1833 |
+
.b8 111
|
1834 |
+
.b8 114
|
1835 |
+
.b8 99
|
1836 |
+
.b8 104
|
1837 |
+
.b8 105
|
1838 |
+
.b8 110
|
1839 |
+
.b8 100
|
1840 |
+
.b8 117
|
1841 |
+
.b8 99
|
1842 |
+
.b8 116
|
1843 |
+
.b8 111
|
1844 |
+
.b8 114
|
1845 |
+
.b8 95
|
1846 |
+
.b8 114
|
1847 |
+
.b8 111
|
1848 |
+
.b8 111
|
1849 |
+
.b8 116
|
1850 |
+
.b8 47
|
1851 |
+
.b8 112
|
1852 |
+
.b8 110
|
1853 |
+
.b8 0
|
1854 |
+
.b8 1
|
1855 |
+
.b64 $L__func_begin0
|
1856 |
+
.b64 $L__func_end0
|
1857 |
+
.b8 2
|
1858 |
+
.b8 116
|
1859 |
+
.b8 114
|
1860 |
+
.b8 105
|
1861 |
+
.b8 116
|
1862 |
+
.b8 111
|
1863 |
+
.b8 110
|
1864 |
+
.b8 95
|
1865 |
+
.b8 95
|
1866 |
+
.b8 48
|
1867 |
+
.b8 100
|
1868 |
+
.b8 49
|
1869 |
+
.b8 100
|
1870 |
+
.b8 50
|
1871 |
+
.b8 100
|
1872 |
+
.b8 51
|
1873 |
+
.b8 100
|
1874 |
+
.b8 52
|
1875 |
+
.b8 100
|
1876 |
+
.b8 53
|
1877 |
+
.b8 100
|
1878 |
+
.b8 54
|
1879 |
+
.b8 100
|
1880 |
+
.b8 101
|
1881 |
+
.b8 55
|
1882 |
+
.b8 100
|
1883 |
+
.b8 101
|
1884 |
+
.b8 0
|
1885 |
+
.b8 116
|
1886 |
+
.b8 114
|
1887 |
+
.b8 105
|
1888 |
+
.b8 116
|
1889 |
+
.b8 111
|
1890 |
+
.b8 110
|
1891 |
+
.b8 95
|
1892 |
+
.b8 95
|
1893 |
+
.b8 48
|
1894 |
+
.b8 100
|
1895 |
+
.b8 49
|
1896 |
+
.b8 100
|
1897 |
+
.b8 50
|
1898 |
+
.b8 100
|
1899 |
+
.b8 51
|
1900 |
+
.b8 100
|
1901 |
+
.b8 52
|
1902 |
+
.b8 100
|
1903 |
+
.b8 53
|
1904 |
+
.b8 100
|
1905 |
+
.b8 54
|
1906 |
+
.b8 100
|
1907 |
+
.b8 101
|
1908 |
+
.b8 55
|
1909 |
+
.b8 100
|
1910 |
+
.b8 101
|
1911 |
+
.b8 0
|
1912 |
+
.b8 1
|
1913 |
+
.b8 18
|
1914 |
+
.b8 1
|
1915 |
+
.b8 1
|
1916 |
+
.b8 3
|
1917 |
+
.b64 $L__func_begin0
|
1918 |
+
.b64 $L__func_end0
|
1919 |
+
.b8 1
|
1920 |
+
.b8 156
|
1921 |
+
.b32 125
|
1922 |
+
.b8 4
|
1923 |
+
.b32 125
|
1924 |
+
.b64 $L__tmp1
|
1925 |
+
.b64 $L__tmp2
|
1926 |
+
.b8 2
|
1927 |
+
.b8 47
|
1928 |
+
.b8 41
|
1929 |
+
.b8 5
|
1930 |
+
.b32 125
|
1931 |
+
.b64 $L__tmp3
|
1932 |
+
.b64 $L__tmp16
|
1933 |
+
.b8 2
|
1934 |
+
.b8 53
|
1935 |
+
.b8 44
|
1936 |
+
.b8 4
|
1937 |
+
.b32 125
|
1938 |
+
.b64 $L__tmp3
|
1939 |
+
.b64 $L__tmp16
|
1940 |
+
.b8 2
|
1941 |
+
.b8 120
|
1942 |
+
.b8 46
|
1943 |
+
.b8 0
|
1944 |
+
.b8 4
|
1945 |
+
.b32 125
|
1946 |
+
.b64 $L__tmp4
|
1947 |
+
.b64 $L__tmp15
|
1948 |
+
.b8 2
|
1949 |
+
.b8 53
|
1950 |
+
.b8 44
|
1951 |
+
.b8 0
|
1952 |
+
.b8 0
|
1953 |
+
}
|
1954 |
+
.section .debug_pubnames
|
1955 |
+
{
|
1956 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
1957 |
+
$L__pubNames_start0:
|
1958 |
+
.b8 2
|
1959 |
+
.b8 0
|
1960 |
+
.b32 .debug_info
|
1961 |
+
.b32 306
|
1962 |
+
.b32 125
|
1963 |
+
.b8 116
|
1964 |
+
.b8 114
|
1965 |
+
.b8 105
|
1966 |
+
.b8 116
|
1967 |
+
.b8 111
|
1968 |
+
.b8 110
|
1969 |
+
.b8 95
|
1970 |
+
.b8 95
|
1971 |
+
.b8 48
|
1972 |
+
.b8 100
|
1973 |
+
.b8 49
|
1974 |
+
.b8 100
|
1975 |
+
.b8 50
|
1976 |
+
.b8 100
|
1977 |
+
.b8 51
|
1978 |
+
.b8 100
|
1979 |
+
.b8 52
|
1980 |
+
.b8 100
|
1981 |
+
.b8 53
|
1982 |
+
.b8 100
|
1983 |
+
.b8 54
|
1984 |
+
.b8 100
|
1985 |
+
.b8 101
|
1986 |
+
.b8 55
|
1987 |
+
.b8 100
|
1988 |
+
.b8 101
|
1989 |
+
.b8 0
|
1990 |
+
.b32 0
|
1991 |
+
$L__pubNames_end0:
|
1992 |
+
}
|
1993 |
+
.section .debug_pubtypes
|
1994 |
+
{
|
1995 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
1996 |
+
$L__pubTypes_start0:
|
1997 |
+
.b8 2
|
1998 |
+
.b8 0
|
1999 |
+
.b32 .debug_info
|
2000 |
+
.b32 306
|
2001 |
+
.b32 0
|
2002 |
+
$L__pubTypes_end0:
|
2003 |
+
}
|
2004 |
+
.section .debug_loc { }
|
.triton/dump/cde66a00b7594f6428e4f4dcdfa88537/triton_.ttgir
ADDED
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [2, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
2 |
+
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
3 |
+
#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [2, 4], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
4 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
5 |
+
tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
6 |
+
%cst = arith.constant dense<512> : tensor<16x1xi32, #blocked>
|
7 |
+
%cst_0 = arith.constant dense<256> : tensor<1x128xi32, #blocked>
|
8 |
+
%cst_1 = arith.constant dense<256> : tensor<16x1xi32, #blocked>
|
9 |
+
%cst_2 = arith.constant dense<0.000000e+00> : tensor<16x128xf32, #blocked>
|
10 |
+
%cst_3 = arith.constant dense<0.000000e+00> : tensor<1x128xf32, #blocked>
|
11 |
+
%cst_4 = arith.constant dense<1.000000e+00> : tensor<16x128xf32, #blocked>
|
12 |
+
%cst_5 = arith.constant dense<256> : tensor<16x1xi64, #blocked>
|
13 |
+
%cst_6 = arith.constant dense<0> : tensor<16x1xi64, #blocked>
|
14 |
+
%cst_7 = arith.constant dense<50257> : tensor<16x1xi64, #blocked>
|
15 |
+
%cst_8 = arith.constant dense<50257> : tensor<16x1xi64, #blocked1>
|
16 |
+
%cst_9 = arith.constant dense<0> : tensor<16x1xi64, #blocked1>
|
17 |
+
%c0_i32 = arith.constant 0 : i32
|
18 |
+
%c128_i32 = arith.constant 128 : i32
|
19 |
+
%c256_i32 = arith.constant 256 : i32
|
20 |
+
%cst_10 = arith.constant dense<1.000000e+00> : tensor<16x128xf32, #blocked2>
|
21 |
+
%cst_11 = arith.constant 0.000000e+00 : f32
|
22 |
+
%cst_12 = arith.constant dense<0.000000e+00> : tensor<16x128xf32, #blocked2>
|
23 |
+
%cst_13 = arith.constant dense<256> : tensor<1x128xi32, #blocked2>
|
24 |
+
%cst_14 = arith.constant dense<9.99999974E-6> : tensor<16x1xf32, #blocked>
|
25 |
+
%cst_15 = arith.constant dense<2.560000e+02> : tensor<16x1xf32, #blocked>
|
26 |
+
%c16_i32 = arith.constant 16 : i32
|
27 |
+
%0 = tt.get_program_id x : i32
|
28 |
+
%1 = arith.muli %0, %c16_i32 : i32
|
29 |
+
%2 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
|
30 |
+
%3 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
|
31 |
+
%4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16x1xi32, #blocked>
|
32 |
+
%5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<16x1xi32, #blocked1>
|
33 |
+
%6 = tt.splat %1 : (i32) -> tensor<16x1xi32, #blocked>
|
34 |
+
%7 = tt.splat %1 : (i32) -> tensor<16x1xi32, #blocked1>
|
35 |
+
%8 = arith.addi %6, %4 : tensor<16x1xi32, #blocked>
|
36 |
+
%9 = arith.addi %7, %5 : tensor<16x1xi32, #blocked1>
|
37 |
+
%10 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
|
38 |
+
%11 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>
|
39 |
+
%12 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x128xi32, #blocked>
|
40 |
+
%13 = tt.expand_dims %11 {axis = 0 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>) -> tensor<1x128xi32, #blocked2>
|
41 |
+
%14 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<16x1x!tt.ptr<i64, 1>, #blocked>
|
42 |
+
%15 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<16x1x!tt.ptr<i64, 1>, #blocked1>
|
43 |
+
%16 = tt.addptr %14, %8 : tensor<16x1x!tt.ptr<i64, 1>, #blocked>, tensor<16x1xi32, #blocked>
|
44 |
+
%17 = tt.addptr %15, %9 : tensor<16x1x!tt.ptr<i64, 1>, #blocked1>, tensor<16x1xi32, #blocked1>
|
45 |
+
%18 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x1xi64, #blocked>
|
46 |
+
%19 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x1xi64, #blocked1>
|
47 |
+
%20 = arith.remsi %8, %cst : tensor<16x1xi32, #blocked>
|
48 |
+
%21 = arith.muli %20, %cst_1 : tensor<16x1xi32, #blocked>
|
49 |
+
%22 = tt.broadcast %21 : (tensor<16x1xi32, #blocked>) -> tensor<16x128xi32, #blocked>
|
50 |
+
%23 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<16x128x!tt.ptr<f32, 1>, #blocked>
|
51 |
+
%24 = arith.addi %18, %cst_7 : tensor<16x1xi64, #blocked>
|
52 |
+
%25 = arith.addi %19, %cst_8 : tensor<16x1xi64, #blocked1>
|
53 |
+
%26 = arith.cmpi slt, %18, %cst_6 : tensor<16x1xi64, #blocked>
|
54 |
+
%27 = arith.cmpi slt, %19, %cst_9 : tensor<16x1xi64, #blocked1>
|
55 |
+
%28 = arith.select %26, %24, %18 : tensor<16x1xi1, #blocked>, tensor<16x1xi64, #blocked>
|
56 |
+
%29 = arith.select %27, %25, %19 : tensor<16x1xi1, #blocked1>, tensor<16x1xi64, #blocked1>
|
57 |
+
%30 = arith.cmpi sge, %29, %cst_9 : tensor<16x1xi64, #blocked1>
|
58 |
+
%31 = arith.cmpi slt, %29, %cst_8 : tensor<16x1xi64, #blocked1>
|
59 |
+
%32 = arith.andi %30, %31 : tensor<16x1xi1, #blocked1>
|
60 |
+
%33 = arith.muli %28, %cst_5 : tensor<16x1xi64, #blocked>
|
61 |
+
%34 = tt.broadcast %33 : (tensor<16x1xi64, #blocked>) -> tensor<16x128xi64, #blocked>
|
62 |
+
%35 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<16x128x!tt.ptr<f32, 1>, #blocked>
|
63 |
+
%36:4 = scf.for %arg7 = %c0_i32 to %c256_i32 step %c128_i32 iter_args(%arg8 = %cst_2, %arg9 = %cst_2, %arg10 = %cst_12, %arg11 = %cst_2) -> (tensor<16x128xf32, #blocked>, tensor<16x128xf32, #blocked>, tensor<16x128xf32, #blocked2>, tensor<16x128xf32, #blocked>) : i32 {
|
64 |
+
%48 = tt.splat %arg7 : (i32) -> tensor<1x128xi32, #blocked>
|
65 |
+
%49 = tt.splat %arg7 : (i32) -> tensor<1x128xi32, #blocked2>
|
66 |
+
%50 = arith.addi %48, %12 : tensor<1x128xi32, #blocked>
|
67 |
+
%51 = arith.addi %49, %13 : tensor<1x128xi32, #blocked2>
|
68 |
+
%52 = arith.cmpi slt, %50, %cst_0 : tensor<1x128xi32, #blocked>
|
69 |
+
%53 = arith.cmpi slt, %51, %cst_13 : tensor<1x128xi32, #blocked2>
|
70 |
+
%54 = tt.broadcast %50 : (tensor<1x128xi32, #blocked>) -> tensor<16x128xi32, #blocked>
|
71 |
+
%55 = arith.addi %54, %22 : tensor<16x128xi32, #blocked>
|
72 |
+
%56 = tt.addptr %23, %55 : tensor<16x128x!tt.ptr<f32, 1>, #blocked>, tensor<16x128xi32, #blocked>
|
73 |
+
%57 = tt.broadcast %52 : (tensor<1x128xi1, #blocked>) -> tensor<16x128xi1, #blocked>
|
74 |
+
%58 = tt.broadcast %53 : (tensor<1x128xi1, #blocked2>) -> tensor<16x128xi1, #blocked2>
|
75 |
+
%59 = tt.load %56, %57, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x128xf32, #blocked>
|
76 |
+
tt.assert %32, "index out of bounds: 0 <= tmp3 < 50257", "/usr/local/lib/python3.10/dist-packages/torch/_inductor/codecache.py", "<module>", 1892 : tensor<16x1xi1, #blocked1>
|
77 |
+
%60 = arith.extsi %50 : tensor<1x128xi32, #blocked> to tensor<1x128xi64, #blocked>
|
78 |
+
%61 = tt.broadcast %60 : (tensor<1x128xi64, #blocked>) -> tensor<16x128xi64, #blocked>
|
79 |
+
%62 = arith.addi %61, %34 : tensor<16x128xi64, #blocked>
|
80 |
+
%63 = tt.addptr %35, %62 : tensor<16x128x!tt.ptr<f32, 1>, #blocked>, tensor<16x128xi64, #blocked>
|
81 |
+
%64 = tt.load %63, %57, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x128xf32, #blocked>
|
82 |
+
%65 = arith.addf %64, %59 : tensor<16x128xf32, #blocked>
|
83 |
+
%66 = arith.subf %65, %arg8 : tensor<16x128xf32, #blocked>
|
84 |
+
%67 = arith.addf %arg11, %cst_4 : tensor<16x128xf32, #blocked>
|
85 |
+
%68 = arith.addf %arg10, %cst_10 : tensor<16x128xf32, #blocked2>
|
86 |
+
%69 = arith.divf %66, %67 : tensor<16x128xf32, #blocked>
|
87 |
+
%70 = arith.addf %arg8, %69 : tensor<16x128xf32, #blocked>
|
88 |
+
%71 = arith.subf %65, %70 : tensor<16x128xf32, #blocked>
|
89 |
+
%72 = arith.mulf %66, %71 : tensor<16x128xf32, #blocked>
|
90 |
+
%73 = arith.addf %arg9, %72 : tensor<16x128xf32, #blocked>
|
91 |
+
%74 = arith.select %57, %70, %arg8 : tensor<16x128xi1, #blocked>, tensor<16x128xf32, #blocked>
|
92 |
+
%75 = arith.select %57, %73, %arg9 : tensor<16x128xi1, #blocked>, tensor<16x128xf32, #blocked>
|
93 |
+
%76 = arith.select %57, %67, %arg11 : tensor<16x128xi1, #blocked>, tensor<16x128xf32, #blocked>
|
94 |
+
%77 = arith.select %58, %68, %arg10 : tensor<16x128xi1, #blocked2>, tensor<16x128xf32, #blocked2>
|
95 |
+
scf.yield %74, %75, %77, %76 : tensor<16x128xf32, #blocked>, tensor<16x128xf32, #blocked>, tensor<16x128xf32, #blocked2>, tensor<16x128xf32, #blocked>
|
96 |
+
}
|
97 |
+
%37 = triton_gpu.convert_layout %36#2 : (tensor<16x128xf32, #blocked2>) -> tensor<16x128xf32, #blocked>
|
98 |
+
%38:3 = "tt.reduce"(%36#0, %36#1, %37) <{axis = 1 : i32}> ({
|
99 |
+
^bb0(%arg7: f32, %arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32):
|
100 |
+
%48 = arith.subf %arg10, %arg7 : f32
|
101 |
+
%49 = arith.addf %arg9, %arg12 : f32
|
102 |
+
%50 = arith.cmpf oeq, %49, %cst_11 : f32
|
103 |
+
%51 = arith.divf %arg12, %49 : f32
|
104 |
+
%52 = arith.select %50, %cst_11, %51 : f32
|
105 |
+
%53 = arith.mulf %48, %52 : f32
|
106 |
+
%54 = arith.addf %arg7, %53 : f32
|
107 |
+
%55 = arith.addf %arg8, %arg11 : f32
|
108 |
+
%56 = arith.mulf %48, %48 : f32
|
109 |
+
%57 = arith.mulf %56, %arg9 : f32
|
110 |
+
%58 = arith.mulf %57, %52 : f32
|
111 |
+
%59 = arith.addf %55, %58 : f32
|
112 |
+
tt.reduce.return %54, %59, %49 : f32, f32, f32
|
113 |
+
}) : (tensor<16x128xf32, #blocked>, tensor<16x128xf32, #blocked>, tensor<16x128xf32, #blocked>) -> (tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>)
|
114 |
+
%39 = tt.expand_dims %38#0 {axis = 1 : i32} : (tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16x1xf32, #blocked>
|
115 |
+
%40 = tt.expand_dims %38#1 {axis = 1 : i32} : (tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16x1xf32, #blocked>
|
116 |
+
%41 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<1x128x!tt.ptr<f32, 1>, #blocked>
|
117 |
+
%42 = tt.broadcast %39 : (tensor<16x1xf32, #blocked>) -> tensor<16x128xf32, #blocked>
|
118 |
+
%43 = arith.divf %40, %cst_15 : tensor<16x1xf32, #blocked>
|
119 |
+
%44 = arith.addf %43, %cst_14 : tensor<16x1xf32, #blocked>
|
120 |
+
%45 = arith.muli %8, %cst_1 : tensor<16x1xi32, #blocked>
|
121 |
+
%46 = tt.broadcast %45 : (tensor<16x1xi32, #blocked>) -> tensor<16x128xi32, #blocked>
|
122 |
+
%47 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<16x128x!tt.ptr<bf16, 1>, #blocked>
|
123 |
+
scf.for %arg7 = %c0_i32 to %c256_i32 step %c128_i32 : i32 {
|
124 |
+
%48 = tt.splat %arg7 : (i32) -> tensor<1x128xi32, #blocked>
|
125 |
+
%49 = arith.addi %48, %12 : tensor<1x128xi32, #blocked>
|
126 |
+
%50 = arith.cmpi slt, %49, %cst_0 : tensor<1x128xi32, #blocked>
|
127 |
+
%51 = tt.broadcast %49 : (tensor<1x128xi32, #blocked>) -> tensor<16x128xi32, #blocked>
|
128 |
+
%52 = arith.addi %51, %22 : tensor<16x128xi32, #blocked>
|
129 |
+
%53 = tt.addptr %23, %52 : tensor<16x128x!tt.ptr<f32, 1>, #blocked>, tensor<16x128xi32, #blocked>
|
130 |
+
%54 = tt.broadcast %50 : (tensor<1x128xi1, #blocked>) -> tensor<16x128xi1, #blocked>
|
131 |
+
%55 = tt.load %53, %54, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x128xf32, #blocked>
|
132 |
+
%56 = tt.addptr %41, %49 : tensor<1x128x!tt.ptr<f32, 1>, #blocked>, tensor<1x128xi32, #blocked>
|
133 |
+
%57 = tt.load %56, %50, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x128xf32, #blocked>
|
134 |
+
tt.assert %32, "index out of bounds: 0 <= tmp13 < 50257", "/usr/local/lib/python3.10/dist-packages/torch/_inductor/codecache.py", "<module>", 1892 : tensor<16x1xi1, #blocked1>
|
135 |
+
%58 = arith.extsi %49 : tensor<1x128xi32, #blocked> to tensor<1x128xi64, #blocked>
|
136 |
+
%59 = tt.broadcast %58 : (tensor<1x128xi64, #blocked>) -> tensor<16x128xi64, #blocked>
|
137 |
+
%60 = arith.addi %59, %34 : tensor<16x128xi64, #blocked>
|
138 |
+
%61 = tt.addptr %35, %60 : tensor<16x128x!tt.ptr<f32, 1>, #blocked>, tensor<16x128xi64, #blocked>
|
139 |
+
%62 = tt.load %61, %54, %cst_2 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<16x128xf32, #blocked>
|
140 |
+
%63 = arith.addf %62, %55 : tensor<16x128xf32, #blocked>
|
141 |
+
%64 = arith.subf %63, %42 : tensor<16x128xf32, #blocked>
|
142 |
+
%65 = tt.extern_elementwise %44 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<16x1xf32, #blocked>) -> tensor<16x1xf32, #blocked>
|
143 |
+
%66 = tt.broadcast %65 : (tensor<16x1xf32, #blocked>) -> tensor<16x128xf32, #blocked>
|
144 |
+
%67 = arith.mulf %64, %66 : tensor<16x128xf32, #blocked>
|
145 |
+
%68 = tt.broadcast %57 : (tensor<1x128xf32, #blocked>) -> tensor<16x128xf32, #blocked>
|
146 |
+
%69 = arith.mulf %67, %68 : tensor<16x128xf32, #blocked>
|
147 |
+
%70 = arith.addi %51, %46 : tensor<16x128xi32, #blocked>
|
148 |
+
%71 = tt.addptr %47, %70 : tensor<16x128x!tt.ptr<bf16, 1>, #blocked>, tensor<16x128xi32, #blocked>
|
149 |
+
%72 = arith.truncf %69 : tensor<16x128xf32, #blocked> to tensor<16x128xbf16, #blocked>
|
150 |
+
tt.store %71, %72, %54 {cache = 1 : i32, evict = 1 : i32} : tensor<16x128xbf16, #blocked>
|
151 |
+
}
|
152 |
+
tt.return
|
153 |
+
}
|
154 |
+
}
|
wandb/run-20240926_055222-14kj2390/run-14kj2390.wandb
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a41d34844187b603a549bc59bb4411ea5dc13a1b984865be55df2318b2e7a8c3
|
3 |
+
size 28769629
|
wandb/run-20240926_124123-zc6s8e8w/run-zc6s8e8w.wandb
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fb8a97e2da032f4f531a370d36d1202e0c3415accd568812cc10f04ca7663010
|
3 |
+
size 28283808
|
wandb/run-20240926_192831-378lr5yg/run-378lr5yg.wandb
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6072638d705ca84f37480a4e2d52557f77495381d8a7857a0fe95ba3c6b6d88b
|
3 |
+
size 28266657
|
wandb/run-20240927_021423-clesd0p8/run-clesd0p8.wandb
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:80ce69a6ed252398eac1d3fc86876dd099d2ecff12750606bc8be6129112bf24
|
3 |
+
size 27656192
|