Add files using upload-large-folder tool
Browse files- .triton/dump/1e922bbbab749da355e4bad9c6b245e6/triton_.ptx +446 -0
- .triton/dump/1e922bbbab749da355e4bad9c6b245e6/triton_.ttgir +26 -0
- .triton/dump/1ed98b0d136db679153ca6a42fff755c/triton_.llir +503 -0
- .triton/dump/1ed98b0d136db679153ca6a42fff755c/triton_.ptx +988 -0
- .triton/dump/1ed98b0d136db679153ca6a42fff755c/triton_.ttir +104 -0
- .triton/dump/305a9479aab997a3a16bfe46bb303a50/triton_.llir +524 -0
- .triton/dump/305a9479aab997a3a16bfe46bb303a50/triton_.ttgir +110 -0
- .triton/dump/305a9479aab997a3a16bfe46bb303a50/triton_.ttir +101 -0
- .triton/dump/473cf6e25c3e63117cd59fc0ed04b89f/triton_.llir +550 -0
- .triton/dump/473cf6e25c3e63117cd59fc0ed04b89f/triton_.ttgir +134 -0
- .triton/dump/473cf6e25c3e63117cd59fc0ed04b89f/triton_.ttir +113 -0
- .triton/dump/89f8cc1079aa03024e56dc2aee42813a/triton_.cubin +0 -0
- .triton/dump/99f0a4c15ca0aab38ccdae6c765f7333/triton_.ptx +758 -0
- .triton/dump/d7a12c0ba96f8920b8147157303ee99f/triton_.cubin +0 -0
- .triton/dump/d7a12c0ba96f8920b8147157303ee99f/triton_.llir +310 -0
- .triton/dump/ea4af10b775ebbfe0bdcca18fd6288c8/triton_.ttir +153 -0
- .triton/dump/fac03406d1136fc802dac111a1efea36/triton_.ptx +971 -0
- .triton/dump/fac03406d1136fc802dac111a1efea36/triton_.ttir +79 -0
.triton/dump/1e922bbbab749da355e4bad9c6b245e6/triton_.ptx
ADDED
@@ -0,0 +1,446 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1de
|
10 |
+
.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
|
11 |
+
|
12 |
+
.visible .entry triton__0d1de(
|
13 |
+
.param .u64 triton__0d1de_param_0,
|
14 |
+
.param .u32 triton__0d1de_param_1
|
15 |
+
)
|
16 |
+
.maxntid 256, 1, 1
|
17 |
+
{
|
18 |
+
.reg .pred %p<9>;
|
19 |
+
.reg .b16 %rs<5>;
|
20 |
+
.reg .b32 %r<22>;
|
21 |
+
.reg .f32 %f<113>;
|
22 |
+
.reg .b64 %rd<6>;
|
23 |
+
.loc 1 18 0
|
24 |
+
$L__func_begin0:
|
25 |
+
.loc 1 18 0
|
26 |
+
|
27 |
+
ld.param.u64 %rd3, [triton__0d1de_param_0];
|
28 |
+
$L__tmp0:
|
29 |
+
.loc 1 21 36
|
30 |
+
mov.u32 %r5, %tid.x;
|
31 |
+
shl.b32 %r6, %r5, 1;
|
32 |
+
and.b32 %r7, %r6, 510;
|
33 |
+
.loc 1 20 28
|
34 |
+
mov.u32 %r1, %ctaid.x;
|
35 |
+
.loc 1 20 33
|
36 |
+
shl.b32 %r8, %r1, 9;
|
37 |
+
.loc 1 21 23
|
38 |
+
or.b32 %r9, %r8, %r7;
|
39 |
+
.loc 1 24 34
|
40 |
+
mul.wide.s32 %rd4, %r9, 2;
|
41 |
+
add.s64 %rd5, %rd3, %rd4;
|
42 |
+
mov.pred %p1, -1;
|
43 |
+
.loc 1 24 39
|
44 |
+
mov.u32 %r2, 0x0;
|
45 |
+
@%p1 ld.global.b32 { %r2 }, [ %rd5 + 0 ];
|
46 |
+
cvt.u16.u32 %rs1, %r2;
|
47 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; }
|
48 |
+
.loc 1 24 48
|
49 |
+
cvt.f32.bf16 %r3, %rs1;
|
50 |
+
mov.b32 %f1, %r3;
|
51 |
+
cvt.f32.bf16 %r4, %rs2;
|
52 |
+
mov.b32 %f2, %r4;
|
53 |
+
.loc 1 29 18
|
54 |
+
mul.f32 %f3, %f1, 0f3F3504F3;
|
55 |
+
.loc 1 30 23
|
56 |
+
abs.ftz.f32 %f5, %f3;
|
57 |
+
setp.ge.f32 %p2, %f5, 0f3F8060FE;
|
58 |
+
mov.f32 %f101, 0f3789CA3C;
|
59 |
+
mov.f32 %f100, 0fB9F560B9;
|
60 |
+
mov.f32 %f99, 0f3BAC840B;
|
61 |
+
mov.f32 %f98, 0fBD0C8162;
|
62 |
+
mov.f32 %f97, 0f3E1CF906;
|
63 |
+
mov.f32 %f96, 0f3F6A937E;
|
64 |
+
mov.f32 %f95, 0f3F20D842;
|
65 |
+
mov.f32 %f102, %f5;
|
66 |
+
@%p2 bra $L__BB0_2;
|
67 |
+
.loc 1 0 23
|
68 |
+
mov.f32 %f101, 0f38B1E96A;
|
69 |
+
mov.f32 %f100, 0fBA574D20;
|
70 |
+
mov.f32 %f99, 0f3BAAD5EA;
|
71 |
+
mov.f32 %f98, 0fBCDC1BE7;
|
72 |
+
mov.f32 %f97, 0f3DE718AF;
|
73 |
+
mov.f32 %f96, 0fBEC093AC;
|
74 |
+
mov.f32 %f95, 0f3E0375D3;
|
75 |
+
.loc 1 30 23
|
76 |
+
mul.f32 %f102, %f3, %f3;
|
77 |
+
$L__BB0_2:
|
78 |
+
.loc 1 0 0
|
79 |
+
mul.f32 %f4, %f2, 0f3F3504F3;
|
80 |
+
.loc 1 30 23
|
81 |
+
setp.ltu.f32 %p3, %f5, 0f3F8060FE;
|
82 |
+
fma.rn.ftz.f32 %f45, %f101, %f102, %f100;
|
83 |
+
fma.rn.ftz.f32 %f46, %f45, %f102, %f99;
|
84 |
+
fma.rn.ftz.f32 %f47, %f46, %f102, %f98;
|
85 |
+
fma.rn.ftz.f32 %f48, %f47, %f102, %f97;
|
86 |
+
fma.rn.ftz.f32 %f49, %f48, %f102, %f96;
|
87 |
+
fma.rn.ftz.f32 %f50, %f49, %f102, %f95;
|
88 |
+
neg.f32 %f51, %f102;
|
89 |
+
selp.f32 %f52, %f51, %f3, %p2;
|
90 |
+
fma.rn.ftz.f32 %f103, %f50, %f52, %f52;
|
91 |
+
mov.f32 %f94, 0f3F800000;
|
92 |
+
@%p3 bra $L__BB0_4;
|
93 |
+
ex2.approx.ftz.f32 %f53, %f103;
|
94 |
+
sub.f32 %f55, %f94, %f53;
|
95 |
+
mov.b32 %r10, %f55;
|
96 |
+
mov.b32 %r11, %f3;
|
97 |
+
and.b32 %r12, %r11, -2147483648;
|
98 |
+
or.b32 %r13, %r12, %r10;
|
99 |
+
mov.b32 %f103, %r13;
|
100 |
+
$L__BB0_4:
|
101 |
+
abs.ftz.f32 %f18, %f4;
|
102 |
+
setp.ge.f32 %p5, %f18, 0f3F8060FE;
|
103 |
+
mov.f32 %f110, 0f3789CA3C;
|
104 |
+
mov.f32 %f109, 0fB9F560B9;
|
105 |
+
mov.f32 %f108, 0f3BAC840B;
|
106 |
+
mov.f32 %f107, 0fBD0C8162;
|
107 |
+
mov.f32 %f106, 0f3E1CF906;
|
108 |
+
mov.f32 %f105, 0f3F6A937E;
|
109 |
+
mov.f32 %f104, 0f3F20D842;
|
110 |
+
mov.f32 %f111, %f18;
|
111 |
+
@%p5 bra $L__BB0_6;
|
112 |
+
mul.f32 %f111, %f4, %f4;
|
113 |
+
mov.f32 %f110, 0f38B1E96A;
|
114 |
+
mov.f32 %f109, 0fBA574D20;
|
115 |
+
mov.f32 %f108, 0f3BAAD5EA;
|
116 |
+
mov.f32 %f107, 0fBCDC1BE7;
|
117 |
+
mov.f32 %f106, 0f3DE718AF;
|
118 |
+
mov.f32 %f105, 0fBEC093AC;
|
119 |
+
mov.f32 %f104, 0f3E0375D3;
|
120 |
+
$L__BB0_6:
|
121 |
+
setp.ltu.f32 %p6, %f18, 0f3F8060FE;
|
122 |
+
fma.rn.ftz.f32 %f70, %f110, %f111, %f109;
|
123 |
+
fma.rn.ftz.f32 %f71, %f70, %f111, %f108;
|
124 |
+
fma.rn.ftz.f32 %f72, %f71, %f111, %f107;
|
125 |
+
fma.rn.ftz.f32 %f73, %f72, %f111, %f106;
|
126 |
+
fma.rn.ftz.f32 %f74, %f73, %f111, %f105;
|
127 |
+
fma.rn.ftz.f32 %f75, %f74, %f111, %f104;
|
128 |
+
neg.f32 %f76, %f111;
|
129 |
+
selp.f32 %f77, %f76, %f4, %p5;
|
130 |
+
fma.rn.ftz.f32 %f112, %f75, %f77, %f77;
|
131 |
+
@%p6 bra $L__BB0_8;
|
132 |
+
ex2.approx.ftz.f32 %f78, %f112;
|
133 |
+
sub.f32 %f80, %f94, %f78;
|
134 |
+
mov.b32 %r14, %f80;
|
135 |
+
mov.b32 %r15, %f4;
|
136 |
+
and.b32 %r16, %r15, -2147483648;
|
137 |
+
or.b32 %r17, %r16, %r14;
|
138 |
+
mov.b32 %f112, %r17;
|
139 |
+
$L__BB0_8:
|
140 |
+
.loc 1 27 18
|
141 |
+
mul.f32 %f81, %f2, 0f3F000000;
|
142 |
+
mul.f32 %f82, %f1, 0f3F000000;
|
143 |
+
.loc 1 32 18
|
144 |
+
add.f32 %f83, %f103, 0f3F800000;
|
145 |
+
add.f32 %f84, %f112, 0f3F800000;
|
146 |
+
.loc 1 33 18
|
147 |
+
mul.f32 %f85, %f82, %f83;
|
148 |
+
mul.f32 %f86, %f81, %f84;
|
149 |
+
.loc 1 35 40
|
150 |
+
mov.b32 %r18, %f85;
|
151 |
+
cvt.rn.bf16.f32 %rs3, %r18;
|
152 |
+
mov.b32 %r19, %f86;
|
153 |
+
cvt.rn.bf16.f32 %rs4, %r19;
|
154 |
+
mov.b32 %r21, {%rs3, %rs4};
|
155 |
+
@%p1 st.global.b32 [ %rd5 + 0 ], { %r21 };
|
156 |
+
.loc 1 35 4
|
157 |
+
ret;
|
158 |
+
$L__tmp1:
|
159 |
+
$L__func_end0:
|
160 |
+
|
161 |
+
}
|
162 |
+
// .globl __nv_erff
|
163 |
+
.visible .func (.param .b32 func_retval0) __nv_erff(
|
164 |
+
.param .b32 __nv_erff_param_0
|
165 |
+
)
|
166 |
+
{
|
167 |
+
.reg .pred %p<4>;
|
168 |
+
.reg .b32 %r<5>;
|
169 |
+
.reg .f32 %f<49>;
|
170 |
+
$L__func_begin1:
|
171 |
+
|
172 |
+
ld.param.f32 %f14, [__nv_erff_param_0];
|
173 |
+
abs.ftz.f32 %f1, %f14;
|
174 |
+
setp.ge.f32 %p1, %f1, 0f3F8060FE;
|
175 |
+
mov.f32 %f46, 0f3789CA3C;
|
176 |
+
mov.f32 %f45, 0fB9F560B9;
|
177 |
+
mov.f32 %f44, 0f3BAC840B;
|
178 |
+
mov.f32 %f43, 0fBD0C8162;
|
179 |
+
mov.f32 %f42, 0f3E1CF906;
|
180 |
+
mov.f32 %f41, 0f3F6A937E;
|
181 |
+
mov.f32 %f40, 0f3F20D842;
|
182 |
+
mov.f32 %f47, %f1;
|
183 |
+
@%p1 bra $L__BB1_2;
|
184 |
+
mul.f32 %f47, %f14, %f14;
|
185 |
+
mov.f32 %f46, 0f38B1E96A;
|
186 |
+
mov.f32 %f45, 0fBA574D20;
|
187 |
+
mov.f32 %f44, 0f3BAAD5EA;
|
188 |
+
mov.f32 %f43, 0fBCDC1BE7;
|
189 |
+
mov.f32 %f42, 0f3DE718AF;
|
190 |
+
mov.f32 %f41, 0fBEC093AC;
|
191 |
+
mov.f32 %f40, 0f3E0375D3;
|
192 |
+
$L__BB1_2:
|
193 |
+
setp.ltu.f32 %p2, %f1, 0f3F8060FE;
|
194 |
+
fma.rn.ftz.f32 %f29, %f46, %f47, %f45;
|
195 |
+
fma.rn.ftz.f32 %f30, %f29, %f47, %f44;
|
196 |
+
fma.rn.ftz.f32 %f31, %f30, %f47, %f43;
|
197 |
+
fma.rn.ftz.f32 %f32, %f31, %f47, %f42;
|
198 |
+
fma.rn.ftz.f32 %f33, %f32, %f47, %f41;
|
199 |
+
fma.rn.ftz.f32 %f34, %f33, %f47, %f40;
|
200 |
+
neg.f32 %f35, %f47;
|
201 |
+
selp.f32 %f36, %f35, %f14, %p1;
|
202 |
+
fma.rn.ftz.f32 %f48, %f34, %f36, %f36;
|
203 |
+
@%p2 bra $L__BB1_4;
|
204 |
+
ex2.approx.ftz.f32 %f37, %f48;
|
205 |
+
mov.f32 %f38, 0f3F800000;
|
206 |
+
sub.f32 %f39, %f38, %f37;
|
207 |
+
mov.b32 %r1, %f39;
|
208 |
+
mov.b32 %r2, %f14;
|
209 |
+
and.b32 %r3, %r2, -2147483648;
|
210 |
+
or.b32 %r4, %r3, %r1;
|
211 |
+
mov.b32 %f48, %r4;
|
212 |
+
$L__BB1_4:
|
213 |
+
st.param.f32 [func_retval0+0], %f48;
|
214 |
+
ret;
|
215 |
+
$L__func_end1:
|
216 |
+
|
217 |
+
}
|
218 |
+
.file 1 "/tmp/torchinductor_root/kp/ckphrtdpgsxl7sfarkkzylhv4st3uhmzvg3u6z5excfp6ydybq74.py"
|
219 |
+
.section .debug_abbrev
|
220 |
+
{
|
221 |
+
.b8 1
|
222 |
+
.b8 17
|
223 |
+
.b8 1
|
224 |
+
.b8 37
|
225 |
+
.b8 8
|
226 |
+
.b8 19
|
227 |
+
.b8 5
|
228 |
+
.b8 3
|
229 |
+
.b8 8
|
230 |
+
.b8 16
|
231 |
+
.b8 6
|
232 |
+
.b8 27
|
233 |
+
.b8 8
|
234 |
+
.b8 180
|
235 |
+
.b8 66
|
236 |
+
.b8 12
|
237 |
+
.b8 17
|
238 |
+
.b8 1
|
239 |
+
.b8 18
|
240 |
+
.b8 1
|
241 |
+
.b8 0
|
242 |
+
.b8 0
|
243 |
+
.b8 2
|
244 |
+
.b8 46
|
245 |
+
.b8 0
|
246 |
+
.b8 17
|
247 |
+
.b8 1
|
248 |
+
.b8 18
|
249 |
+
.b8 1
|
250 |
+
.b8 64
|
251 |
+
.b8 10
|
252 |
+
.b8 135
|
253 |
+
.b8 64
|
254 |
+
.b8 8
|
255 |
+
.b8 3
|
256 |
+
.b8 8
|
257 |
+
.b8 58
|
258 |
+
.b8 11
|
259 |
+
.b8 59
|
260 |
+
.b8 11
|
261 |
+
.b8 63
|
262 |
+
.b8 12
|
263 |
+
.b8 0
|
264 |
+
.b8 0
|
265 |
+
.b8 0
|
266 |
+
}
|
267 |
+
.section .debug_info
|
268 |
+
{
|
269 |
+
.b32 172
|
270 |
+
.b8 2
|
271 |
+
.b8 0
|
272 |
+
.b32 .debug_abbrev
|
273 |
+
.b8 8
|
274 |
+
.b8 1
|
275 |
+
.b8 116
|
276 |
+
.b8 114
|
277 |
+
.b8 105
|
278 |
+
.b8 116
|
279 |
+
.b8 111
|
280 |
+
.b8 110
|
281 |
+
.b8 0
|
282 |
+
.b8 2
|
283 |
+
.b8 0
|
284 |
+
.b8 99
|
285 |
+
.b8 107
|
286 |
+
.b8 112
|
287 |
+
.b8 104
|
288 |
+
.b8 114
|
289 |
+
.b8 116
|
290 |
+
.b8 100
|
291 |
+
.b8 112
|
292 |
+
.b8 103
|
293 |
+
.b8 115
|
294 |
+
.b8 120
|
295 |
+
.b8 108
|
296 |
+
.b8 55
|
297 |
+
.b8 115
|
298 |
+
.b8 102
|
299 |
+
.b8 97
|
300 |
+
.b8 114
|
301 |
+
.b8 107
|
302 |
+
.b8 107
|
303 |
+
.b8 122
|
304 |
+
.b8 121
|
305 |
+
.b8 108
|
306 |
+
.b8 104
|
307 |
+
.b8 118
|
308 |
+
.b8 52
|
309 |
+
.b8 115
|
310 |
+
.b8 116
|
311 |
+
.b8 51
|
312 |
+
.b8 117
|
313 |
+
.b8 104
|
314 |
+
.b8 109
|
315 |
+
.b8 122
|
316 |
+
.b8 118
|
317 |
+
.b8 103
|
318 |
+
.b8 51
|
319 |
+
.b8 117
|
320 |
+
.b8 54
|
321 |
+
.b8 122
|
322 |
+
.b8 53
|
323 |
+
.b8 101
|
324 |
+
.b8 120
|
325 |
+
.b8 99
|
326 |
+
.b8 102
|
327 |
+
.b8 112
|
328 |
+
.b8 54
|
329 |
+
.b8 121
|
330 |
+
.b8 100
|
331 |
+
.b8 121
|
332 |
+
.b8 98
|
333 |
+
.b8 113
|
334 |
+
.b8 55
|
335 |
+
.b8 52
|
336 |
+
.b8 46
|
337 |
+
.b8 112
|
338 |
+
.b8 121
|
339 |
+
.b8 0
|
340 |
+
.b32 .debug_line
|
341 |
+
.b8 47
|
342 |
+
.b8 116
|
343 |
+
.b8 109
|
344 |
+
.b8 112
|
345 |
+
.b8 47
|
346 |
+
.b8 116
|
347 |
+
.b8 111
|
348 |
+
.b8 114
|
349 |
+
.b8 99
|
350 |
+
.b8 104
|
351 |
+
.b8 105
|
352 |
+
.b8 110
|
353 |
+
.b8 100
|
354 |
+
.b8 117
|
355 |
+
.b8 99
|
356 |
+
.b8 116
|
357 |
+
.b8 111
|
358 |
+
.b8 114
|
359 |
+
.b8 95
|
360 |
+
.b8 114
|
361 |
+
.b8 111
|
362 |
+
.b8 111
|
363 |
+
.b8 116
|
364 |
+
.b8 47
|
365 |
+
.b8 107
|
366 |
+
.b8 112
|
367 |
+
.b8 0
|
368 |
+
.b8 1
|
369 |
+
.b64 $L__func_begin0
|
370 |
+
.b64 $L__func_end0
|
371 |
+
.b8 2
|
372 |
+
.b64 $L__func_begin0
|
373 |
+
.b64 $L__func_end0
|
374 |
+
.b8 1
|
375 |
+
.b8 156
|
376 |
+
.b8 116
|
377 |
+
.b8 114
|
378 |
+
.b8 105
|
379 |
+
.b8 116
|
380 |
+
.b8 111
|
381 |
+
.b8 110
|
382 |
+
.b8 95
|
383 |
+
.b8 95
|
384 |
+
.b8 48
|
385 |
+
.b8 100
|
386 |
+
.b8 49
|
387 |
+
.b8 100
|
388 |
+
.b8 101
|
389 |
+
.b8 0
|
390 |
+
.b8 116
|
391 |
+
.b8 114
|
392 |
+
.b8 105
|
393 |
+
.b8 116
|
394 |
+
.b8 111
|
395 |
+
.b8 110
|
396 |
+
.b8 95
|
397 |
+
.b8 95
|
398 |
+
.b8 48
|
399 |
+
.b8 100
|
400 |
+
.b8 49
|
401 |
+
.b8 100
|
402 |
+
.b8 101
|
403 |
+
.b8 0
|
404 |
+
.b8 1
|
405 |
+
.b8 18
|
406 |
+
.b8 1
|
407 |
+
.b8 0
|
408 |
+
}
|
409 |
+
.section .debug_pubnames
|
410 |
+
{
|
411 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
412 |
+
$L__pubNames_start0:
|
413 |
+
.b8 2
|
414 |
+
.b8 0
|
415 |
+
.b32 .debug_info
|
416 |
+
.b32 176
|
417 |
+
.b32 125
|
418 |
+
.b8 116
|
419 |
+
.b8 114
|
420 |
+
.b8 105
|
421 |
+
.b8 116
|
422 |
+
.b8 111
|
423 |
+
.b8 110
|
424 |
+
.b8 95
|
425 |
+
.b8 95
|
426 |
+
.b8 48
|
427 |
+
.b8 100
|
428 |
+
.b8 49
|
429 |
+
.b8 100
|
430 |
+
.b8 101
|
431 |
+
.b8 0
|
432 |
+
.b32 0
|
433 |
+
$L__pubNames_end0:
|
434 |
+
}
|
435 |
+
.section .debug_pubtypes
|
436 |
+
{
|
437 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
438 |
+
$L__pubTypes_start0:
|
439 |
+
.b8 2
|
440 |
+
.b8 0
|
441 |
+
.b32 .debug_info
|
442 |
+
.b32 176
|
443 |
+
.b32 0
|
444 |
+
$L__pubTypes_end0:
|
445 |
+
}
|
446 |
+
.section .debug_loc { }
|
.triton/dump/1e922bbbab749da355e4bad9c6b245e6/triton_.ttgir
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
2 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
3 |
+
tt.func public @triton__0d1de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
4 |
+
%cst = arith.constant dense<1.000000e+00> : tensor<512xf32, #blocked>
|
5 |
+
%cst_0 = arith.constant dense<0.707106769> : tensor<512xf32, #blocked>
|
6 |
+
%cst_1 = arith.constant dense<5.000000e-01> : tensor<512xf32, #blocked>
|
7 |
+
%c512_i32 = arith.constant 512 : i32
|
8 |
+
%0 = tt.get_program_id x : i32
|
9 |
+
%1 = arith.muli %0, %c512_i32 : i32
|
10 |
+
%2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked>
|
11 |
+
%3 = tt.splat %1 : (i32) -> tensor<512xi32, #blocked>
|
12 |
+
%4 = arith.addi %3, %2 : tensor<512xi32, #blocked>
|
13 |
+
%5 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<512x!tt.ptr<bf16, 1>, #blocked>
|
14 |
+
%6 = tt.addptr %5, %4 : tensor<512x!tt.ptr<bf16, 1>, #blocked>, tensor<512xi32, #blocked>
|
15 |
+
%7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xbf16, #blocked>
|
16 |
+
%8 = arith.extf %7 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked>
|
17 |
+
%9 = arith.mulf %8, %cst_1 : tensor<512xf32, #blocked>
|
18 |
+
%10 = arith.mulf %8, %cst_0 : tensor<512xf32, #blocked>
|
19 |
+
%11 = tt.extern_elementwise %10 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_erff"} : (tensor<512xf32, #blocked>) -> tensor<512xf32, #blocked>
|
20 |
+
%12 = arith.addf %11, %cst : tensor<512xf32, #blocked>
|
21 |
+
%13 = arith.mulf %9, %12 : tensor<512xf32, #blocked>
|
22 |
+
%14 = arith.truncf %13 : tensor<512xf32, #blocked> to tensor<512xbf16, #blocked>
|
23 |
+
tt.store %6, %14 {cache = 1 : i32, evict = 1 : i32} : tensor<512xbf16, #blocked>
|
24 |
+
tt.return
|
25 |
+
}
|
26 |
+
}
|
.triton/dump/1ed98b0d136db679153ca6a42fff755c/triton_.llir
ADDED
@@ -0,0 +1,503 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
@assertFunc_1 = internal constant [25 x i8] c"_call_with_frames_removed"
|
5 |
+
@assertFile_1 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
|
6 |
+
@assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp13 < 50257"
|
7 |
+
@assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed"
|
8 |
+
@assertFile_0 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
|
9 |
+
@assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257"
|
10 |
+
@global_smem = external addrspace(3) global [0 x i8]
|
11 |
+
@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
|
12 |
+
|
13 |
+
declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr
|
14 |
+
|
15 |
+
define void @triton__0d1d2d3d4d5de6de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i32 %5, i32 %6) local_unnamed_addr !dbg !7 {
|
16 |
+
%8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
|
17 |
+
%9 = and i32 %8, 31, !dbg !10
|
18 |
+
%10 = lshr i32 %8, 5, !dbg !10
|
19 |
+
%11 = lshr i32 %8, 6, !dbg !10
|
20 |
+
%12 = and i32 %11, 1, !dbg !10
|
21 |
+
%13 = and i32 %8, 1, !dbg !10
|
22 |
+
%14 = and i32 %10, 1, !dbg !11
|
23 |
+
%urem = shl i32 %8, 2, !dbg !11
|
24 |
+
%15 = and i32 %urem, 252, !dbg !11
|
25 |
+
%16 = shl i32 %8, 1, !dbg !11
|
26 |
+
%17 = and i32 %16, 254, !dbg !11
|
27 |
+
%18 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !12
|
28 |
+
%19 = shl i32 %18, 1, !dbg !13
|
29 |
+
%20 = or i32 %19, %12, !dbg !14
|
30 |
+
%21 = or i32 %19, %13, !dbg !14
|
31 |
+
%22 = sext i32 %20 to i64, !dbg !15
|
32 |
+
%23 = getelementptr i64, ptr addrspace(1) %0, i64 %22, !dbg !15
|
33 |
+
%24 = sext i32 %21 to i64, !dbg !15
|
34 |
+
%25 = getelementptr i64, ptr addrspace(1) %0, i64 %24, !dbg !15
|
35 |
+
%26 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !16
|
36 |
+
%27 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !16
|
37 |
+
%28 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !16
|
38 |
+
%29 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !16
|
39 |
+
%30 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #6, !dbg !16
|
40 |
+
%31 = srem i32 %20, 512, !dbg !17
|
41 |
+
%32 = shl nsw i32 %31, 8, !dbg !18
|
42 |
+
%33 = or i32 %32, %15, !dbg !19
|
43 |
+
%34 = sext i32 %33 to i64, !dbg !20
|
44 |
+
%35 = getelementptr float, ptr addrspace(1) %2, i64 %34, !dbg !20
|
45 |
+
%36 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %35, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !21
|
46 |
+
%37 = extractvalue { i32, i32, i32, i32 } %36, 0, !dbg !21
|
47 |
+
%38 = extractvalue { i32, i32, i32, i32 } %36, 1, !dbg !21
|
48 |
+
%39 = extractvalue { i32, i32, i32, i32 } %36, 2, !dbg !21
|
49 |
+
%40 = extractvalue { i32, i32, i32, i32 } %36, 3, !dbg !21
|
50 |
+
%41 = bitcast i32 %37 to float, !dbg !21
|
51 |
+
%42 = bitcast i32 %38 to float, !dbg !21
|
52 |
+
%43 = bitcast i32 %39 to float, !dbg !21
|
53 |
+
%44 = bitcast i32 %40 to float, !dbg !21
|
54 |
+
%45 = add i64 %30, 50257, !dbg !22
|
55 |
+
%46 = icmp slt i64 %26, 0, !dbg !23
|
56 |
+
%47 = icmp slt i64 %30, 0, !dbg !23
|
57 |
+
%48 = select i1 %47, i64 %45, i64 %30, !dbg !24
|
58 |
+
%49 = icmp ugt i64 %48, 50256, !dbg !25
|
59 |
+
br i1 %49, label %50, label %51, !dbg !26
|
60 |
+
|
61 |
+
50: ; preds = %7
|
62 |
+
tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !26
|
63 |
+
br label %51, !dbg !26
|
64 |
+
|
65 |
+
51: ; preds = %50, %7
|
66 |
+
%52 = shl i64 %26, 8, !dbg !27
|
67 |
+
%53 = add i64 %52, 12865792, !dbg !27
|
68 |
+
%54 = select i1 %46, i64 %53, i64 %52, !dbg !27
|
69 |
+
%55 = zext nneg i32 %15 to i64
|
70 |
+
%56 = or i64 %54, %55, !dbg !28
|
71 |
+
%57 = getelementptr float, ptr addrspace(1) %1, i64 %56, !dbg !29
|
72 |
+
%58 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %57, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !30
|
73 |
+
%59 = extractvalue { i32, i32, i32, i32 } %58, 0, !dbg !30
|
74 |
+
%60 = extractvalue { i32, i32, i32, i32 } %58, 1, !dbg !30
|
75 |
+
%61 = extractvalue { i32, i32, i32, i32 } %58, 2, !dbg !30
|
76 |
+
%62 = extractvalue { i32, i32, i32, i32 } %58, 3, !dbg !30
|
77 |
+
%63 = bitcast i32 %59 to float, !dbg !30
|
78 |
+
%64 = bitcast i32 %60 to float, !dbg !30
|
79 |
+
%65 = bitcast i32 %61 to float, !dbg !30
|
80 |
+
%66 = bitcast i32 %62 to float, !dbg !30
|
81 |
+
%67 = fadd float %41, %63, !dbg !31
|
82 |
+
%68 = fadd float %42, %64, !dbg !31
|
83 |
+
%69 = fadd float %43, %65, !dbg !31
|
84 |
+
%70 = fadd float %44, %66, !dbg !31
|
85 |
+
%71 = fadd float %67, 0.000000e+00, !dbg !32
|
86 |
+
%72 = fadd float %68, 0.000000e+00, !dbg !32
|
87 |
+
%73 = fadd float %69, 0.000000e+00, !dbg !32
|
88 |
+
%74 = fadd float %70, 0.000000e+00, !dbg !32
|
89 |
+
%75 = fsub float %67, %71, !dbg !36
|
90 |
+
%76 = fsub float %68, %72, !dbg !36
|
91 |
+
%77 = fsub float %69, %73, !dbg !36
|
92 |
+
%78 = fsub float %70, %74, !dbg !36
|
93 |
+
%79 = fmul float %67, %75, !dbg !37
|
94 |
+
%80 = fmul float %68, %76, !dbg !37
|
95 |
+
%81 = fmul float %69, %77, !dbg !37
|
96 |
+
%82 = fmul float %70, %78, !dbg !37
|
97 |
+
%83 = fadd float %79, 0.000000e+00, !dbg !38
|
98 |
+
%84 = fadd float %80, 0.000000e+00, !dbg !38
|
99 |
+
%85 = fadd float %81, 0.000000e+00, !dbg !38
|
100 |
+
%86 = fadd float %82, 0.000000e+00, !dbg !38
|
101 |
+
%87 = fsub float %72, %71, !dbg !39
|
102 |
+
%88 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 2.000000e+00) #6, !dbg !43
|
103 |
+
%89 = fmul float %88, %87, !dbg !44
|
104 |
+
%90 = fadd float %71, %89, !dbg !45
|
105 |
+
%91 = fadd float %83, %84, !dbg !46
|
106 |
+
%92 = fmul float %87, %87, !dbg !47
|
107 |
+
%93 = fmul float %88, %92, !dbg !48
|
108 |
+
%94 = fadd float %93, %91, !dbg !49
|
109 |
+
%95 = fsub float %73, %90, !dbg !39
|
110 |
+
%96 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 3.000000e+00) #6, !dbg !43
|
111 |
+
%97 = fmul float %96, %95, !dbg !44
|
112 |
+
%98 = fadd float %90, %97, !dbg !45
|
113 |
+
%99 = fadd float %85, %94, !dbg !46
|
114 |
+
%100 = fmul float %95, %95, !dbg !47
|
115 |
+
%101 = fmul float %100, 2.000000e+00, !dbg !50
|
116 |
+
%102 = fmul float %96, %101, !dbg !48
|
117 |
+
%103 = fadd float %99, %102, !dbg !49
|
118 |
+
%104 = fsub float %74, %98, !dbg !39
|
119 |
+
%105 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 4.000000e+00) #6, !dbg !43
|
120 |
+
%106 = fmul float %105, %104, !dbg !44
|
121 |
+
%107 = fadd float %98, %106, !dbg !45
|
122 |
+
%108 = fadd float %86, %103, !dbg !46
|
123 |
+
%109 = fmul float %104, %104, !dbg !47
|
124 |
+
%110 = fmul float %109, 3.000000e+00, !dbg !50
|
125 |
+
%111 = fmul float %105, %110, !dbg !48
|
126 |
+
%112 = fadd float %108, %111, !dbg !49
|
127 |
+
%113 = bitcast float %107 to i32, !dbg !51
|
128 |
+
%114 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %113, i32 16, i32 31), !dbg !51
|
129 |
+
%115 = bitcast i32 %114 to float, !dbg !51
|
130 |
+
%116 = bitcast float %112 to i32, !dbg !51
|
131 |
+
%117 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %116, i32 16, i32 31), !dbg !51
|
132 |
+
%118 = bitcast i32 %117 to float, !dbg !51
|
133 |
+
%119 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 1082130432, i32 16, i32 31), !dbg !51
|
134 |
+
%120 = bitcast i32 %119 to float, !dbg !51
|
135 |
+
%121 = fsub float %115, %107, !dbg !39
|
136 |
+
%122 = fadd float %120, 4.000000e+00, !dbg !53
|
137 |
+
%123 = fcmp oeq float %122, 0.000000e+00, !dbg !54
|
138 |
+
%124 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %120, float %122) #6, !dbg !43
|
139 |
+
%125 = select i1 %123, float 0.000000e+00, float %124, !dbg !55
|
140 |
+
%126 = fmul float %125, %121, !dbg !44
|
141 |
+
%127 = fadd float %107, %126, !dbg !45
|
142 |
+
%128 = fadd float %112, %118, !dbg !46
|
143 |
+
%129 = fmul float %121, %121, !dbg !47
|
144 |
+
%130 = fmul float %129, 4.000000e+00, !dbg !50
|
145 |
+
%131 = fmul float %125, %130, !dbg !48
|
146 |
+
%132 = fadd float %128, %131, !dbg !49
|
147 |
+
%133 = bitcast float %127 to i32, !dbg !51
|
148 |
+
%134 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %133, i32 8, i32 31), !dbg !51
|
149 |
+
%135 = bitcast i32 %134 to float, !dbg !51
|
150 |
+
%136 = bitcast float %132 to i32, !dbg !51
|
151 |
+
%137 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %136, i32 8, i32 31), !dbg !51
|
152 |
+
%138 = bitcast i32 %137 to float, !dbg !51
|
153 |
+
%139 = bitcast float %122 to i32, !dbg !51
|
154 |
+
%140 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %139, i32 8, i32 31), !dbg !51
|
155 |
+
%141 = bitcast i32 %140 to float, !dbg !51
|
156 |
+
%142 = fsub float %135, %127, !dbg !39
|
157 |
+
%143 = fadd float %122, %141, !dbg !53
|
158 |
+
%144 = fcmp oeq float %143, 0.000000e+00, !dbg !54
|
159 |
+
%145 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %141, float %143) #6, !dbg !43
|
160 |
+
%146 = select i1 %144, float 0.000000e+00, float %145, !dbg !55
|
161 |
+
%147 = fmul float %146, %142, !dbg !44
|
162 |
+
%148 = fadd float %127, %147, !dbg !45
|
163 |
+
%149 = fadd float %132, %138, !dbg !46
|
164 |
+
%150 = fmul float %142, %142, !dbg !47
|
165 |
+
%151 = fmul float %122, %150, !dbg !50
|
166 |
+
%152 = fmul float %146, %151, !dbg !48
|
167 |
+
%153 = fadd float %149, %152, !dbg !49
|
168 |
+
%154 = bitcast float %148 to i32, !dbg !51
|
169 |
+
%155 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %154, i32 4, i32 31), !dbg !51
|
170 |
+
%156 = bitcast i32 %155 to float, !dbg !51
|
171 |
+
%157 = bitcast float %153 to i32, !dbg !51
|
172 |
+
%158 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %157, i32 4, i32 31), !dbg !51
|
173 |
+
%159 = bitcast i32 %158 to float, !dbg !51
|
174 |
+
%160 = bitcast float %143 to i32, !dbg !51
|
175 |
+
%161 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %160, i32 4, i32 31), !dbg !51
|
176 |
+
%162 = bitcast i32 %161 to float, !dbg !51
|
177 |
+
%163 = fsub float %156, %148, !dbg !39
|
178 |
+
%164 = fadd float %143, %162, !dbg !53
|
179 |
+
%165 = fcmp oeq float %164, 0.000000e+00, !dbg !54
|
180 |
+
%166 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %162, float %164) #6, !dbg !43
|
181 |
+
%167 = select i1 %165, float 0.000000e+00, float %166, !dbg !55
|
182 |
+
%168 = fmul float %167, %163, !dbg !44
|
183 |
+
%169 = fadd float %148, %168, !dbg !45
|
184 |
+
%170 = fadd float %153, %159, !dbg !46
|
185 |
+
%171 = fmul float %163, %163, !dbg !47
|
186 |
+
%172 = fmul float %143, %171, !dbg !50
|
187 |
+
%173 = fmul float %167, %172, !dbg !48
|
188 |
+
%174 = fadd float %170, %173, !dbg !49
|
189 |
+
%175 = bitcast float %169 to i32, !dbg !51
|
190 |
+
%176 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %175, i32 2, i32 31), !dbg !51
|
191 |
+
%177 = bitcast i32 %176 to float, !dbg !51
|
192 |
+
%178 = bitcast float %174 to i32, !dbg !51
|
193 |
+
%179 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %178, i32 2, i32 31), !dbg !51
|
194 |
+
%180 = bitcast i32 %179 to float, !dbg !51
|
195 |
+
%181 = bitcast float %164 to i32, !dbg !51
|
196 |
+
%182 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %181, i32 2, i32 31), !dbg !51
|
197 |
+
%183 = bitcast i32 %182 to float, !dbg !51
|
198 |
+
%184 = fsub float %177, %169, !dbg !39
|
199 |
+
%185 = fadd float %164, %183, !dbg !53
|
200 |
+
%186 = fcmp oeq float %185, 0.000000e+00, !dbg !54
|
201 |
+
%187 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %183, float %185) #6, !dbg !43
|
202 |
+
%188 = select i1 %186, float 0.000000e+00, float %187, !dbg !55
|
203 |
+
%189 = fmul float %188, %184, !dbg !44
|
204 |
+
%190 = fadd float %169, %189, !dbg !45
|
205 |
+
%191 = fadd float %174, %180, !dbg !46
|
206 |
+
%192 = fmul float %184, %184, !dbg !47
|
207 |
+
%193 = fmul float %164, %192, !dbg !50
|
208 |
+
%194 = fmul float %188, %193, !dbg !48
|
209 |
+
%195 = fadd float %191, %194, !dbg !49
|
210 |
+
%196 = bitcast float %190 to i32, !dbg !51
|
211 |
+
%197 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %196, i32 1, i32 31), !dbg !51
|
212 |
+
%198 = bitcast i32 %197 to float, !dbg !51
|
213 |
+
%199 = bitcast float %195 to i32, !dbg !51
|
214 |
+
%200 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %199, i32 1, i32 31), !dbg !51
|
215 |
+
%201 = bitcast i32 %200 to float, !dbg !51
|
216 |
+
%202 = bitcast float %185 to i32, !dbg !51
|
217 |
+
%203 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %202, i32 1, i32 31), !dbg !51
|
218 |
+
%204 = bitcast i32 %203 to float, !dbg !51
|
219 |
+
%205 = fsub float %198, %190, !dbg !39
|
220 |
+
%206 = fadd float %185, %204, !dbg !53
|
221 |
+
%207 = fcmp oeq float %206, 0.000000e+00, !dbg !54
|
222 |
+
%208 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %204, float %206) #6, !dbg !43
|
223 |
+
%209 = select i1 %207, float 0.000000e+00, float %208, !dbg !55
|
224 |
+
%210 = fmul float %205, %209, !dbg !44
|
225 |
+
%211 = fadd float %190, %210, !dbg !45
|
226 |
+
%212 = fadd float %195, %201, !dbg !46
|
227 |
+
%213 = fmul float %205, %205, !dbg !47
|
228 |
+
%214 = fmul float %185, %213, !dbg !50
|
229 |
+
%215 = fmul float %209, %214, !dbg !48
|
230 |
+
%216 = fadd float %212, %215, !dbg !49
|
231 |
+
%217 = icmp eq i32 %9, 0, !dbg !51
|
232 |
+
%218 = shl nuw nsw i32 %12, 1, !dbg !51
|
233 |
+
%219 = or i32 %218, %14, !dbg !51
|
234 |
+
%220 = zext nneg i32 %219 to i64, !dbg !51
|
235 |
+
%221 = getelementptr float, ptr addrspace(3) @global_smem, i64 %220, !dbg !51
|
236 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %221, float %211, i1 %217) #6, !dbg !51
|
237 |
+
%222 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), i64 %220, !dbg !51
|
238 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %222, float %216, i1 %217) #6, !dbg !51
|
239 |
+
%223 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 32), i64 %220, !dbg !51
|
240 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %223, float %206, i1 %217) #6, !dbg !51
|
241 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !51
|
242 |
+
%224 = icmp slt i32 %8, 4, !dbg !51
|
243 |
+
%225 = sext i32 %8 to i64, !dbg !51
|
244 |
+
%226 = getelementptr float, ptr addrspace(3) @global_smem, i64 %225, !dbg !51
|
245 |
+
%227 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %226, i1 %224) #6, !dbg !51
|
246 |
+
%228 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), i64 %225, !dbg !51
|
247 |
+
%229 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %228, i1 %224) #6, !dbg !51
|
248 |
+
%230 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 32), i64 %225, !dbg !51
|
249 |
+
%231 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %230, i1 %224) #6, !dbg !51
|
250 |
+
%232 = bitcast float %227 to i32, !dbg !51
|
251 |
+
%233 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %232, i32 1, i32 31), !dbg !51
|
252 |
+
%234 = bitcast i32 %233 to float, !dbg !51
|
253 |
+
%235 = bitcast float %229 to i32, !dbg !51
|
254 |
+
%236 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %235, i32 1, i32 31), !dbg !51
|
255 |
+
%237 = bitcast i32 %236 to float, !dbg !51
|
256 |
+
%238 = bitcast float %231 to i32, !dbg !51
|
257 |
+
%239 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %238, i32 1, i32 31), !dbg !51
|
258 |
+
%240 = bitcast i32 %239 to float, !dbg !51
|
259 |
+
%241 = fsub float %234, %227, !dbg !39
|
260 |
+
%242 = fadd float %231, %240, !dbg !53
|
261 |
+
%243 = fcmp oeq float %242, 0.000000e+00, !dbg !54
|
262 |
+
%244 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %240, float %242) #6, !dbg !43
|
263 |
+
%245 = select i1 %243, float 0.000000e+00, float %244, !dbg !55
|
264 |
+
%246 = fmul float %241, %245, !dbg !44
|
265 |
+
%247 = fadd float %227, %246, !dbg !45
|
266 |
+
%248 = fadd float %229, %237, !dbg !46
|
267 |
+
%249 = fmul float %241, %241, !dbg !47
|
268 |
+
%250 = fmul float %231, %249, !dbg !50
|
269 |
+
%251 = fmul float %250, %245, !dbg !48
|
270 |
+
%252 = fadd float %248, %251, !dbg !49
|
271 |
+
%253 = icmp eq i32 %13, 0, !dbg !51
|
272 |
+
%254 = and i1 %224, %253, !dbg !51
|
273 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %226, float %247, i1 %254) #6, !dbg !51
|
274 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %228, float %252, i1 %254) #6, !dbg !51
|
275 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %230, float %242, i1 %254) #6, !dbg !51
|
276 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !51
|
277 |
+
%255 = zext nneg i32 %218 to i64, !dbg !51
|
278 |
+
%256 = getelementptr float, ptr addrspace(3) @global_smem, i64 %255, !dbg !51
|
279 |
+
%257 = load float, ptr addrspace(3) %256, align 4, !dbg !51
|
280 |
+
%258 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), i64 %255, !dbg !51
|
281 |
+
%259 = load float, ptr addrspace(3) %258, align 4, !dbg !51
|
282 |
+
%260 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %35, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !56
|
283 |
+
%261 = zext nneg i32 %17 to i64, !dbg !57
|
284 |
+
%262 = getelementptr float, ptr addrspace(1) %3, i64 %261, !dbg !57
|
285 |
+
%263 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_last.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %262, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !58
|
286 |
+
br i1 %49, label %264, label %265, !dbg !59
|
287 |
+
|
288 |
+
264: ; preds = %51
|
289 |
+
tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 883, ptr nonnull @assertFunc_1, i64 1), !dbg !59
|
290 |
+
br label %265, !dbg !59
|
291 |
+
|
292 |
+
265: ; preds = %264, %51
|
293 |
+
%266 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %57, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !60
|
294 |
+
%267 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %259, float 2.560000e+02) #6, !dbg !61
|
295 |
+
%268 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %259, float 2.560000e+02) #6, !dbg !61
|
296 |
+
%269 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %259, float 2.560000e+02) #6, !dbg !61
|
297 |
+
%270 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %259, float 2.560000e+02) #6, !dbg !61
|
298 |
+
%271 = fadd float %267, 0x3EE4F8B580000000, !dbg !62
|
299 |
+
%272 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
|
300 |
+
%.not.i = icmp eq i32 %272, 0, !dbg !63
|
301 |
+
br i1 %.not.i, label %275, label %273, !dbg !63
|
302 |
+
|
303 |
+
273: ; preds = %265
|
304 |
+
%274 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %271), !dbg !63
|
305 |
+
br label %__nv_rsqrtf.exit, !dbg !63
|
306 |
+
|
307 |
+
275: ; preds = %265
|
308 |
+
%276 = tail call float @llvm.nvvm.rsqrt.approx.f(float %271), !dbg !63
|
309 |
+
br label %__nv_rsqrtf.exit, !dbg !63
|
310 |
+
|
311 |
+
__nv_rsqrtf.exit: ; preds = %273, %275
|
312 |
+
%.0.i = phi float [ %274, %273 ], [ %276, %275 ], !dbg !63
|
313 |
+
%277 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
|
314 |
+
%278 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
|
315 |
+
%279 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
|
316 |
+
%280 = extractvalue { i32, i32, i32, i32 } %266, 3, !dbg !60
|
317 |
+
%281 = bitcast i32 %280 to float, !dbg !60
|
318 |
+
%282 = extractvalue { i32, i32, i32, i32 } %260, 3, !dbg !56
|
319 |
+
%283 = bitcast i32 %282 to float, !dbg !56
|
320 |
+
%284 = fadd float %283, %281, !dbg !64
|
321 |
+
%285 = fsub float %284, %257, !dbg !65
|
322 |
+
%286 = extractvalue { i32, i32, i32, i32 } %266, 2, !dbg !60
|
323 |
+
%287 = bitcast i32 %286 to float, !dbg !60
|
324 |
+
%288 = extractvalue { i32, i32, i32, i32 } %260, 2, !dbg !56
|
325 |
+
%289 = bitcast i32 %288 to float, !dbg !56
|
326 |
+
%290 = fadd float %289, %287, !dbg !64
|
327 |
+
%291 = fsub float %290, %257, !dbg !65
|
328 |
+
%292 = extractvalue { i32, i32, i32, i32 } %266, 1, !dbg !60
|
329 |
+
%293 = bitcast i32 %292 to float, !dbg !60
|
330 |
+
%294 = extractvalue { i32, i32, i32, i32 } %260, 1, !dbg !56
|
331 |
+
%295 = bitcast i32 %294 to float, !dbg !56
|
332 |
+
%296 = fadd float %295, %293, !dbg !64
|
333 |
+
%297 = fsub float %296, %257, !dbg !65
|
334 |
+
%298 = extractvalue { i32, i32, i32, i32 } %266, 0, !dbg !60
|
335 |
+
%299 = bitcast i32 %298 to float, !dbg !60
|
336 |
+
%300 = extractvalue { i32, i32, i32, i32 } %260, 0, !dbg !56
|
337 |
+
%301 = bitcast i32 %300 to float, !dbg !56
|
338 |
+
%302 = fadd float %301, %299, !dbg !64
|
339 |
+
%303 = fsub float %302, %257, !dbg !65
|
340 |
+
%304 = extractvalue { i32, i32 } %263, 0, !dbg !58
|
341 |
+
%305 = extractvalue { i32, i32 } %263, 1, !dbg !58
|
342 |
+
%306 = fmul float %303, %.0.i, !dbg !66
|
343 |
+
%307 = fmul float %297, %.0.i, !dbg !66
|
344 |
+
%308 = fmul float %291, %.0.i, !dbg !66
|
345 |
+
%309 = fmul float %285, %.0.i, !dbg !66
|
346 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !67
|
347 |
+
%310 = getelementptr float, ptr addrspace(3) @global_smem, i64 %261, !dbg !67
|
348 |
+
%311 = insertelement <2 x i32> undef, i32 %304, i64 0, !dbg !67
|
349 |
+
%312 = insertelement <2 x i32> %311, i32 %305, i64 1, !dbg !67
|
350 |
+
store <2 x i32> %312, ptr addrspace(3) %310, align 8, !dbg !67
|
351 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !67
|
352 |
+
%313 = getelementptr float, ptr addrspace(3) @global_smem, i64 %55, !dbg !67
|
353 |
+
%314 = load float, ptr addrspace(3) %313, align 16, !dbg !67
|
354 |
+
%315 = getelementptr inbounds <4 x float>, ptr addrspace(3) %313, i64 0, i64 1, !dbg !67
|
355 |
+
%316 = load float, ptr addrspace(3) %315, align 4, !dbg !67
|
356 |
+
%317 = getelementptr inbounds <4 x float>, ptr addrspace(3) %313, i64 0, i64 2, !dbg !67
|
357 |
+
%318 = load float, ptr addrspace(3) %317, align 8, !dbg !67
|
358 |
+
%319 = getelementptr inbounds <4 x float>, ptr addrspace(3) %313, i64 0, i64 3, !dbg !67
|
359 |
+
%320 = load float, ptr addrspace(3) %319, align 4, !dbg !67
|
360 |
+
%321 = fmul float %306, %314, !dbg !67
|
361 |
+
%322 = fmul float %307, %316, !dbg !67
|
362 |
+
%323 = fmul float %308, %318, !dbg !67
|
363 |
+
%324 = fmul float %309, %320, !dbg !67
|
364 |
+
%325 = shl i32 %20, 8, !dbg !68
|
365 |
+
%326 = or i32 %325, %15, !dbg !69
|
366 |
+
%327 = sext i32 %326 to i64, !dbg !70
|
367 |
+
%328 = getelementptr i16, ptr addrspace(1) %4, i64 %327, !dbg !70
|
368 |
+
%329 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %321) #6, !dbg !71
|
369 |
+
%330 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %322) #6, !dbg !71
|
370 |
+
%331 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %323) #6, !dbg !71
|
371 |
+
%332 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %324) #6, !dbg !71
|
372 |
+
%333 = insertelement <2 x i16> undef, i16 %329, i64 0, !dbg !71
|
373 |
+
%334 = insertelement <2 x i16> %333, i16 %330, i64 1, !dbg !71
|
374 |
+
%335 = bitcast <2 x i16> %334 to i32, !dbg !71
|
375 |
+
%336 = insertelement <2 x i16> undef, i16 %331, i64 0, !dbg !71
|
376 |
+
%337 = insertelement <2 x i16> %336, i16 %332, i64 1, !dbg !71
|
377 |
+
%338 = bitcast <2 x i16> %337 to i32, !dbg !71
|
378 |
+
tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %335, i32 %338, ptr addrspace(1) %328, i1 true) #6, !dbg !71
|
379 |
+
ret void, !dbg !72
|
380 |
+
}
|
381 |
+
|
382 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
383 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
384 |
+
|
385 |
+
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
|
386 |
+
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
|
387 |
+
|
388 |
+
; Function Attrs: convergent nocallback nounwind
|
389 |
+
declare void @llvm.nvvm.barrier0() #2
|
390 |
+
|
391 |
+
; Function Attrs: alwaysinline nounwind
|
392 |
+
define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
|
393 |
+
%1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
|
394 |
+
%.not = icmp eq i32 %1, 0
|
395 |
+
br i1 %.not, label %4, label %2
|
396 |
+
|
397 |
+
2: ; preds = %0
|
398 |
+
%3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
|
399 |
+
br label %6
|
400 |
+
|
401 |
+
4: ; preds = %0
|
402 |
+
%5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
|
403 |
+
br label %6
|
404 |
+
|
405 |
+
6: ; preds = %4, %2
|
406 |
+
%.0 = phi float [ %3, %2 ], [ %5, %4 ]
|
407 |
+
ret float %.0
|
408 |
+
}
|
409 |
+
|
410 |
+
declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
|
411 |
+
|
412 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
|
413 |
+
declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
|
414 |
+
|
415 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
|
416 |
+
declare float @llvm.nvvm.rsqrt.approx.f(float) #5
|
417 |
+
|
418 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
419 |
+
attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
|
420 |
+
attributes #2 = { convergent nocallback nounwind }
|
421 |
+
attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
422 |
+
attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
423 |
+
attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
|
424 |
+
attributes #6 = { nounwind }
|
425 |
+
|
426 |
+
!llvm.module.flags = !{!0, !1}
|
427 |
+
!llvm.dbg.cu = !{!2}
|
428 |
+
!nvvm.annotations = !{!4, !5, !5, !4}
|
429 |
+
!llvm.ident = !{!6}
|
430 |
+
|
431 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
432 |
+
!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
|
433 |
+
!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
434 |
+
!3 = !DIFile(filename: "cgx5lxpuexpindj4dsmjz5x42uhyy7iskevq7ovzpwagb3t5powj.py", directory: "/tmp/torchinductor_root/gx")
|
435 |
+
!4 = !{ptr @triton__0d1d2d3d4d5de6de, !"kernel", i32 1}
|
436 |
+
!5 = !{ptr @triton__0d1d2d3d4d5de6de, !"maxntidx", i32 128}
|
437 |
+
!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
|
438 |
+
!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5de6de", linkageName: "triton__0d1d2d3d4d5de6de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
|
439 |
+
!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
|
440 |
+
!9 = !{}
|
441 |
+
!10 = !DILocation(line: 22, column: 44, scope: !7)
|
442 |
+
!11 = !DILocation(line: 24, column: 33, scope: !7)
|
443 |
+
!12 = !DILocation(line: 21, column: 28, scope: !7)
|
444 |
+
!13 = !DILocation(line: 21, column: 33, scope: !7)
|
445 |
+
!14 = !DILocation(line: 22, column: 23, scope: !7)
|
446 |
+
!15 = !DILocation(line: 26, column: 30, scope: !7)
|
447 |
+
!16 = !DILocation(line: 26, column: 35, scope: !7)
|
448 |
+
!17 = !DILocation(line: 27, column: 18, scope: !7)
|
449 |
+
!18 = !DILocation(line: 35, column: 44, scope: !7)
|
450 |
+
!19 = !DILocation(line: 35, column: 40, scope: !7)
|
451 |
+
!20 = !DILocation(line: 35, column: 34, scope: !7)
|
452 |
+
!21 = !DILocation(line: 35, column: 50, scope: !7)
|
453 |
+
!22 = !DILocation(line: 36, column: 22, scope: !7)
|
454 |
+
!23 = !DILocation(line: 37, column: 22, scope: !7)
|
455 |
+
!24 = !DILocation(line: 38, column: 36, scope: !7)
|
456 |
+
!25 = !DILocation(line: 39, column: 40, scope: !7)
|
457 |
+
!26 = !DILocation(line: 39, column: 55, scope: !7)
|
458 |
+
!27 = !DILocation(line: 40, column: 44, scope: !7)
|
459 |
+
!28 = !DILocation(line: 40, column: 40, scope: !7)
|
460 |
+
!29 = !DILocation(line: 40, column: 34, scope: !7)
|
461 |
+
!30 = !DILocation(line: 40, column: 52, scope: !7)
|
462 |
+
!31 = !DILocation(line: 41, column: 22, scope: !7)
|
463 |
+
!32 = !DILocation(line: 98, column: 22, scope: !33, inlinedAt: !35)
|
464 |
+
!33 = distinct !DILexicalBlockFile(scope: !7, file: !34, discriminator: 0)
|
465 |
+
!34 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
|
466 |
+
!35 = !DILocation(line: 44, column: 38, scope: !33)
|
467 |
+
!36 = !DILocation(line: 101, column: 30, scope: !33, inlinedAt: !35)
|
468 |
+
!37 = !DILocation(line: 101, column: 22, scope: !33, inlinedAt: !35)
|
469 |
+
!38 = !DILocation(line: 101, column: 13, scope: !33, inlinedAt: !35)
|
470 |
+
!39 = !DILocation(line: 108, column: 21, scope: !40, inlinedAt: !41)
|
471 |
+
!40 = distinct !DILexicalBlockFile(scope: !33, file: !34, discriminator: 0)
|
472 |
+
!41 = !DILocation(line: 120, column: 46, scope: !40, inlinedAt: !42)
|
473 |
+
!42 = !DILocation(line: 50, column: 41, scope: !40)
|
474 |
+
!43 = !DILocation(line: 110, column: 60, scope: !40, inlinedAt: !41)
|
475 |
+
!44 = !DILocation(line: 112, column: 25, scope: !40, inlinedAt: !41)
|
476 |
+
!45 = !DILocation(line: 112, column: 17, scope: !40, inlinedAt: !41)
|
477 |
+
!46 = !DILocation(line: 113, column: 15, scope: !40, inlinedAt: !41)
|
478 |
+
!47 = !DILocation(line: 113, column: 30, scope: !40, inlinedAt: !41)
|
479 |
+
!48 = !DILocation(line: 113, column: 49, scope: !40, inlinedAt: !41)
|
480 |
+
!49 = !DILocation(line: 113, column: 22, scope: !40, inlinedAt: !41)
|
481 |
+
!50 = !DILocation(line: 113, column: 38, scope: !40, inlinedAt: !41)
|
482 |
+
!51 = !DILocation(line: 120, column: 46, scope: !33, inlinedAt: !52)
|
483 |
+
!52 = !DILocation(line: 50, column: 41, scope: !33)
|
484 |
+
!53 = !DILocation(line: 109, column: 28, scope: !40, inlinedAt: !41)
|
485 |
+
!54 = !DILocation(line: 110, column: 39, scope: !40, inlinedAt: !41)
|
486 |
+
!55 = !DILocation(line: 110, column: 49, scope: !40, inlinedAt: !41)
|
487 |
+
!56 = !DILocation(line: 59, column: 51, scope: !7)
|
488 |
+
!57 = !DILocation(line: 60, column: 35, scope: !7)
|
489 |
+
!58 = !DILocation(line: 60, column: 40, scope: !7)
|
490 |
+
!59 = !DILocation(line: 64, column: 57, scope: !7)
|
491 |
+
!60 = !DILocation(line: 65, column: 54, scope: !7)
|
492 |
+
!61 = !DILocation(line: 69, column: 23, scope: !7)
|
493 |
+
!62 = !DILocation(line: 71, column: 24, scope: !7)
|
494 |
+
!63 = !DILocation(line: 72, column: 30, scope: !7)
|
495 |
+
!64 = !DILocation(line: 66, column: 24, scope: !7)
|
496 |
+
!65 = !DILocation(line: 67, column: 24, scope: !7)
|
497 |
+
!66 = !DILocation(line: 73, column: 24, scope: !7)
|
498 |
+
!67 = !DILocation(line: 74, column: 24, scope: !7)
|
499 |
+
!68 = !DILocation(line: 76, column: 39, scope: !7)
|
500 |
+
!69 = !DILocation(line: 76, column: 35, scope: !7)
|
501 |
+
!70 = !DILocation(line: 76, column: 29, scope: !7)
|
502 |
+
!71 = !DILocation(line: 76, column: 52, scope: !7)
|
503 |
+
!72 = !DILocation(line: 55, column: 4, scope: !7)
|
.triton/dump/1ed98b0d136db679153ca6a42fff755c/triton_.ptx
ADDED
@@ -0,0 +1,988 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2d3d4d5de6de
|
10 |
+
.extern .func __assertfail
|
11 |
+
(
|
12 |
+
.param .b64 __assertfail_param_0,
|
13 |
+
.param .b64 __assertfail_param_1,
|
14 |
+
.param .b32 __assertfail_param_2,
|
15 |
+
.param .b64 __assertfail_param_3,
|
16 |
+
.param .b64 __assertfail_param_4
|
17 |
+
)
|
18 |
+
;
|
19 |
+
.global .align 1 .b8 assertFunc_1[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
|
20 |
+
.global .align 1 .b8 assertFile_1[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
|
21 |
+
.global .align 1 .b8 assertMessage_1[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 51, 32, 60, 32, 53, 48, 50, 53, 55};
|
22 |
+
.global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
|
23 |
+
.global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
|
24 |
+
.global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55};
|
25 |
+
.extern .shared .align 1 .b8 global_smem[];
|
26 |
+
.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
|
27 |
+
|
28 |
+
.visible .entry triton__0d1d2d3d4d5de6de(
|
29 |
+
.param .u64 triton__0d1d2d3d4d5de6de_param_0,
|
30 |
+
.param .u64 triton__0d1d2d3d4d5de6de_param_1,
|
31 |
+
.param .u64 triton__0d1d2d3d4d5de6de_param_2,
|
32 |
+
.param .u64 triton__0d1d2d3d4d5de6de_param_3,
|
33 |
+
.param .u64 triton__0d1d2d3d4d5de6de_param_4,
|
34 |
+
.param .u32 triton__0d1d2d3d4d5de6de_param_5,
|
35 |
+
.param .u32 triton__0d1d2d3d4d5de6de_param_6
|
36 |
+
)
|
37 |
+
.maxntid 128, 1, 1
|
38 |
+
{
|
39 |
+
.reg .pred %p<50>;
|
40 |
+
.reg .b16 %rs<5>;
|
41 |
+
.reg .b32 %r<169>;
|
42 |
+
.reg .f32 %f<153>;
|
43 |
+
.reg .b64 %rd<53>;
|
44 |
+
.loc 1 18 0
|
45 |
+
$L__func_begin0:
|
46 |
+
.loc 1 18 0
|
47 |
+
|
48 |
+
ld.param.u64 %rd6, [triton__0d1d2d3d4d5de6de_param_3];
|
49 |
+
ld.param.u64 %rd5, [triton__0d1d2d3d4d5de6de_param_1];
|
50 |
+
ld.param.u64 %rd19, [triton__0d1d2d3d4d5de6de_param_0];
|
51 |
+
$L__tmp0:
|
52 |
+
.loc 1 22 44
|
53 |
+
mov.u32 %r1, %tid.x;
|
54 |
+
and.b32 %r2, %r1, 31;
|
55 |
+
ld.param.u64 %rd20, [triton__0d1d2d3d4d5de6de_param_2];
|
56 |
+
bfe.u32 %r3, %r1, 6, 1;
|
57 |
+
and.b32 %r4, %r1, 1;
|
58 |
+
.loc 1 24 33
|
59 |
+
bfe.u32 %r5, %r1, 5, 1;
|
60 |
+
shl.b32 %r24, %r1, 2;
|
61 |
+
and.b32 %r6, %r24, 252;
|
62 |
+
shl.b32 %r25, %r1, 1;
|
63 |
+
and.b32 %r7, %r25, 254;
|
64 |
+
.loc 1 21 28
|
65 |
+
mov.u32 %r15, %ctaid.x;
|
66 |
+
.loc 1 21 33
|
67 |
+
shl.b32 %r26, %r15, 1;
|
68 |
+
.loc 1 22 23
|
69 |
+
or.b32 %r8, %r26, %r3;
|
70 |
+
or.b32 %r27, %r26, %r4;
|
71 |
+
.loc 1 26 30
|
72 |
+
mul.wide.s32 %rd21, %r8, 8;
|
73 |
+
add.s64 %rd9, %rd19, %rd21;
|
74 |
+
mul.wide.s32 %rd22, %r27, 8;
|
75 |
+
add.s64 %rd17, %rd19, %rd22;
|
76 |
+
mov.pred %p44, -1;
|
77 |
+
.loc 1 26 35
|
78 |
+
mov.u64 %rd8, 0x0;
|
79 |
+
@%p44 ld.global.L1::evict_last.b64 { %rd8 }, [ %rd9 + 0 ];
|
80 |
+
mov.u64 %rd10, 0x0;
|
81 |
+
@%p44 ld.global.L1::evict_last.b64 { %rd10 }, [ %rd9 + 0 ];
|
82 |
+
mov.u64 %rd12, 0x0;
|
83 |
+
@%p44 ld.global.L1::evict_last.b64 { %rd12 }, [ %rd9 + 0 ];
|
84 |
+
mov.u64 %rd14, 0x0;
|
85 |
+
@%p44 ld.global.L1::evict_last.b64 { %rd14 }, [ %rd9 + 0 ];
|
86 |
+
mov.u64 %rd16, 0x0;
|
87 |
+
@%p44 ld.global.L1::evict_last.b64 { %rd16 }, [ %rd17 + 0 ];
|
88 |
+
.loc 1 27 18
|
89 |
+
bfe.s32 %r28, %r15, 30, 1;
|
90 |
+
shr.u32 %r29, %r28, 23;
|
91 |
+
add.s32 %r30, %r8, %r29;
|
92 |
+
and.b32 %r31, %r30, 16776704;
|
93 |
+
sub.s32 %r32, %r8, %r31;
|
94 |
+
.loc 1 35 44
|
95 |
+
shl.b32 %r33, %r32, 8;
|
96 |
+
.loc 1 35 40
|
97 |
+
or.b32 %r34, %r33, %r6;
|
98 |
+
.loc 1 35 34
|
99 |
+
mul.wide.s32 %rd23, %r34, 4;
|
100 |
+
add.s64 %rd33, %rd20, %rd23;
|
101 |
+
mov.b32 %r137, 0;
|
102 |
+
.loc 1 35 50
|
103 |
+
mov.u32 %r16, 0x0;
|
104 |
+
mov.u32 %r17, 0x0;
|
105 |
+
mov.u32 %r18, 0x0;
|
106 |
+
mov.u32 %r19, 0x0;
|
107 |
+
@%p44 ld.global.L1::evict_last.v4.b32 { %r16, %r17, %r18, %r19 }, [ %rd33 + 0 ];
|
108 |
+
@!%p44 mov.u32 %r16, %r137;
|
109 |
+
@!%p44 mov.u32 %r17, %r137;
|
110 |
+
@!%p44 mov.u32 %r18, %r137;
|
111 |
+
@!%p44 mov.u32 %r19, %r137;
|
112 |
+
mov.b32 %f1, %r16;
|
113 |
+
mov.b32 %f2, %r17;
|
114 |
+
mov.b32 %f3, %r18;
|
115 |
+
mov.b32 %f4, %r19;
|
116 |
+
.loc 1 36 22
|
117 |
+
add.s64 %rd24, %rd16, 50257;
|
118 |
+
.loc 1 37 22
|
119 |
+
setp.lt.s64 %p11, %rd16, 0;
|
120 |
+
.loc 1 38 36
|
121 |
+
selp.b64 %rd3, %rd24, %rd16, %p11;
|
122 |
+
.loc 1 39 40
|
123 |
+
setp.lt.u64 %p12, %rd3, 50257;
|
124 |
+
mov.b32 %r168, 883;
|
125 |
+
mov.u64 %rd52, 1;
|
126 |
+
.loc 1 39 55
|
127 |
+
@%p12 bra $L__BB0_2;
|
128 |
+
mov.u64 %rd25, assertMessage_0;
|
129 |
+
cvta.global.u64 %rd26, %rd25;
|
130 |
+
mov.u64 %rd27, assertFile_0;
|
131 |
+
cvta.global.u64 %rd28, %rd27;
|
132 |
+
mov.u64 %rd29, assertFunc_0;
|
133 |
+
cvta.global.u64 %rd30, %rd29;
|
134 |
+
{ // callseq 4, 0
|
135 |
+
.reg .b32 temp_param_reg;
|
136 |
+
.param .b64 param0;
|
137 |
+
st.param.b64 [param0+0], %rd26;
|
138 |
+
.param .b64 param1;
|
139 |
+
st.param.b64 [param1+0], %rd28;
|
140 |
+
.param .b32 param2;
|
141 |
+
st.param.b32 [param2+0], %r168;
|
142 |
+
.param .b64 param3;
|
143 |
+
st.param.b64 [param3+0], %rd30;
|
144 |
+
.param .b64 param4;
|
145 |
+
st.param.b64 [param4+0], %rd52;
|
146 |
+
call.uni
|
147 |
+
__assertfail,
|
148 |
+
(
|
149 |
+
param0,
|
150 |
+
param1,
|
151 |
+
param2,
|
152 |
+
param3,
|
153 |
+
param4
|
154 |
+
);
|
155 |
+
} // callseq 4
|
156 |
+
$L__BB0_2:
|
157 |
+
.loc 1 0 55
|
158 |
+
ld.param.u64 %rd7, [triton__0d1d2d3d4d5de6de_param_4];
|
159 |
+
.loc 1 37 22
|
160 |
+
setp.lt.s64 %p36, %rd8, 0;
|
161 |
+
.loc 1 40 44
|
162 |
+
shl.b64 %rd35, %rd8, 8;
|
163 |
+
add.s64 %rd36, %rd35, 12865792;
|
164 |
+
selp.b64 %rd37, %rd36, %rd35, %p36;
|
165 |
+
cvt.u64.u32 %rd38, %r6;
|
166 |
+
.loc 1 40 40
|
167 |
+
or.b64 %rd39, %rd37, %rd38;
|
168 |
+
.loc 1 40 34
|
169 |
+
shl.b64 %rd40, %rd39, 2;
|
170 |
+
add.s64 %rd49, %rd5, %rd40;
|
171 |
+
.loc 1 40 52
|
172 |
+
mov.u32 %r36, 0x0;
|
173 |
+
mov.u32 %r37, 0x0;
|
174 |
+
mov.u32 %r38, 0x0;
|
175 |
+
mov.u32 %r39, 0x0;
|
176 |
+
@%p44 ld.global.L1::evict_last.v4.b32 { %r36, %r37, %r38, %r39 }, [ %rd49 + 0 ];
|
177 |
+
@!%p44 mov.u32 %r36, %r137;
|
178 |
+
@!%p44 mov.u32 %r37, %r137;
|
179 |
+
@!%p44 mov.u32 %r38, %r137;
|
180 |
+
@!%p44 mov.u32 %r39, %r137;
|
181 |
+
mov.b32 %f7, %r36;
|
182 |
+
mov.b32 %f8, %r37;
|
183 |
+
mov.b32 %f9, %r38;
|
184 |
+
mov.b32 %f10, %r39;
|
185 |
+
.loc 1 41 22
|
186 |
+
add.f32 %f11, %f1, %f7;
|
187 |
+
add.f32 %f12, %f2, %f8;
|
188 |
+
add.f32 %f13, %f3, %f9;
|
189 |
+
add.f32 %f14, %f4, %f10;
|
190 |
+
$L__tmp1:
|
191 |
+
.loc 2 98 22
|
192 |
+
add.f32 %f15, %f11, 0f00000000;
|
193 |
+
add.f32 %f16, %f12, 0f00000000;
|
194 |
+
add.f32 %f17, %f13, 0f00000000;
|
195 |
+
add.f32 %f18, %f14, 0f00000000;
|
196 |
+
.loc 2 101 30
|
197 |
+
sub.f32 %f19, %f11, %f15;
|
198 |
+
sub.f32 %f20, %f12, %f16;
|
199 |
+
sub.f32 %f21, %f13, %f17;
|
200 |
+
sub.f32 %f22, %f14, %f18;
|
201 |
+
.loc 2 101 13
|
202 |
+
fma.rn.f32 %f23, %f11, %f19, 0f00000000;
|
203 |
+
fma.rn.f32 %f24, %f12, %f20, 0f00000000;
|
204 |
+
fma.rn.f32 %f25, %f13, %f21, 0f00000000;
|
205 |
+
fma.rn.f32 %f26, %f14, %f22, 0f00000000;
|
206 |
+
$L__tmp2:
|
207 |
+
.loc 2 108 21
|
208 |
+
sub.f32 %f27, %f16, %f15;
|
209 |
+
mov.b32 %r45, 1065353216;
|
210 |
+
mov.b32 %r46, 1073741824;
|
211 |
+
.loc 2 110 60
|
212 |
+
div.full.f32 %r44, %r45, %r46;
|
213 |
+
mov.b32 %f28, %r44;
|
214 |
+
.loc 2 112 17
|
215 |
+
fma.rn.f32 %f29, %f28, %f27, %f15;
|
216 |
+
.loc 2 113 15
|
217 |
+
add.f32 %f30, %f23, %f24;
|
218 |
+
.loc 2 113 30
|
219 |
+
mul.f32 %f31, %f27, %f27;
|
220 |
+
.loc 2 113 22
|
221 |
+
fma.rn.f32 %f32, %f28, %f31, %f30;
|
222 |
+
.loc 2 108 21
|
223 |
+
sub.f32 %f33, %f17, %f29;
|
224 |
+
mov.b32 %r49, 1077936128;
|
225 |
+
.loc 2 110 60
|
226 |
+
div.full.f32 %r47, %r45, %r49;
|
227 |
+
mov.b32 %f34, %r47;
|
228 |
+
.loc 2 112 17
|
229 |
+
fma.rn.f32 %f35, %f34, %f33, %f29;
|
230 |
+
.loc 2 113 15
|
231 |
+
add.f32 %f36, %f25, %f32;
|
232 |
+
.loc 2 113 30
|
233 |
+
mul.f32 %f37, %f33, %f33;
|
234 |
+
.loc 2 113 38
|
235 |
+
fma.rn.f32 %f38, %f33, %f33, %f37;
|
236 |
+
.loc 2 113 22
|
237 |
+
fma.rn.f32 %f39, %f34, %f38, %f36;
|
238 |
+
.loc 2 108 21
|
239 |
+
sub.f32 %f40, %f18, %f35;
|
240 |
+
mov.b32 %r52, 1082130432;
|
241 |
+
.loc 2 110 60
|
242 |
+
div.full.f32 %r50, %r45, %r52;
|
243 |
+
mov.b32 %f41, %r50;
|
244 |
+
.loc 2 112 17
|
245 |
+
fma.rn.f32 %f42, %f41, %f40, %f35;
|
246 |
+
.loc 2 113 15
|
247 |
+
add.f32 %f43, %f26, %f39;
|
248 |
+
.loc 2 113 30
|
249 |
+
mul.f32 %f44, %f40, %f40;
|
250 |
+
.loc 2 113 38
|
251 |
+
mul.f32 %f45, %f44, 0f40400000;
|
252 |
+
.loc 2 113 22
|
253 |
+
fma.rn.f32 %f46, %f41, %f45, %f43;
|
254 |
+
$L__tmp3:
|
255 |
+
.loc 2 120 46
|
256 |
+
mov.b32 %r101, %f42;
|
257 |
+
shfl.sync.bfly.b32 %r102, %r101, 16, 31, -1;
|
258 |
+
mov.b32 %f47, %r102;
|
259 |
+
mov.b32 %r103, %f46;
|
260 |
+
shfl.sync.bfly.b32 %r104, %r103, 16, 31, -1;
|
261 |
+
mov.b32 %f48, %r104;
|
262 |
+
shfl.sync.bfly.b32 %r54, %r52, 16, 31, -1;
|
263 |
+
mov.b32 %f49, %r54;
|
264 |
+
$L__tmp4:
|
265 |
+
.loc 2 108 21
|
266 |
+
sub.f32 %f50, %f47, %f42;
|
267 |
+
.loc 2 109 28
|
268 |
+
add.f32 %f51, %f49, 0f40800000;
|
269 |
+
.loc 2 110 39
|
270 |
+
setp.eq.f32 %p37, %f51, 0f00000000;
|
271 |
+
.loc 2 110 60
|
272 |
+
mov.b32 %r55, %f51;
|
273 |
+
div.full.f32 %r53, %r54, %r55;
|
274 |
+
mov.b32 %f52, %r53;
|
275 |
+
.loc 2 110 49
|
276 |
+
selp.f32 %f53, 0f00000000, %f52, %p37;
|
277 |
+
.loc 2 112 17
|
278 |
+
fma.rn.f32 %f54, %f53, %f50, %f42;
|
279 |
+
.loc 2 113 15
|
280 |
+
add.f32 %f55, %f46, %f48;
|
281 |
+
.loc 2 113 30
|
282 |
+
mul.f32 %f56, %f50, %f50;
|
283 |
+
.loc 2 113 38
|
284 |
+
mul.f32 %f57, %f56, 0f40800000;
|
285 |
+
.loc 2 113 22
|
286 |
+
fma.rn.f32 %f58, %f53, %f57, %f55;
|
287 |
+
$L__tmp5:
|
288 |
+
.loc 2 120 46
|
289 |
+
mov.b32 %r105, %f54;
|
290 |
+
shfl.sync.bfly.b32 %r106, %r105, 8, 31, -1;
|
291 |
+
mov.b32 %f59, %r106;
|
292 |
+
mov.b32 %r107, %f58;
|
293 |
+
shfl.sync.bfly.b32 %r108, %r107, 8, 31, -1;
|
294 |
+
mov.b32 %f60, %r108;
|
295 |
+
shfl.sync.bfly.b32 %r57, %r55, 8, 31, -1;
|
296 |
+
mov.b32 %f61, %r57;
|
297 |
+
$L__tmp6:
|
298 |
+
.loc 2 108 21
|
299 |
+
sub.f32 %f62, %f59, %f54;
|
300 |
+
.loc 2 109 28
|
301 |
+
add.f32 %f63, %f51, %f61;
|
302 |
+
.loc 2 110 39
|
303 |
+
setp.eq.f32 %p38, %f63, 0f00000000;
|
304 |
+
.loc 2 110 60
|
305 |
+
mov.b32 %r58, %f63;
|
306 |
+
div.full.f32 %r56, %r57, %r58;
|
307 |
+
mov.b32 %f64, %r56;
|
308 |
+
.loc 2 110 49
|
309 |
+
selp.f32 %f65, 0f00000000, %f64, %p38;
|
310 |
+
.loc 2 112 17
|
311 |
+
fma.rn.f32 %f66, %f65, %f62, %f54;
|
312 |
+
.loc 2 113 15
|
313 |
+
add.f32 %f67, %f58, %f60;
|
314 |
+
.loc 2 113 30
|
315 |
+
mul.f32 %f68, %f62, %f62;
|
316 |
+
.loc 2 113 38
|
317 |
+
mul.f32 %f69, %f51, %f68;
|
318 |
+
.loc 2 113 22
|
319 |
+
fma.rn.f32 %f70, %f65, %f69, %f67;
|
320 |
+
$L__tmp7:
|
321 |
+
.loc 2 120 46
|
322 |
+
mov.b32 %r109, %f66;
|
323 |
+
shfl.sync.bfly.b32 %r110, %r109, 4, 31, -1;
|
324 |
+
mov.b32 %f71, %r110;
|
325 |
+
mov.b32 %r111, %f70;
|
326 |
+
shfl.sync.bfly.b32 %r112, %r111, 4, 31, -1;
|
327 |
+
mov.b32 %f72, %r112;
|
328 |
+
shfl.sync.bfly.b32 %r60, %r58, 4, 31, -1;
|
329 |
+
mov.b32 %f73, %r60;
|
330 |
+
$L__tmp8:
|
331 |
+
.loc 2 108 21
|
332 |
+
sub.f32 %f74, %f71, %f66;
|
333 |
+
.loc 2 109 28
|
334 |
+
add.f32 %f75, %f63, %f73;
|
335 |
+
.loc 2 110 39
|
336 |
+
setp.eq.f32 %p39, %f75, 0f00000000;
|
337 |
+
.loc 2 110 60
|
338 |
+
mov.b32 %r61, %f75;
|
339 |
+
div.full.f32 %r59, %r60, %r61;
|
340 |
+
mov.b32 %f76, %r59;
|
341 |
+
.loc 2 110 49
|
342 |
+
selp.f32 %f77, 0f00000000, %f76, %p39;
|
343 |
+
.loc 2 112 17
|
344 |
+
fma.rn.f32 %f78, %f77, %f74, %f66;
|
345 |
+
.loc 2 113 15
|
346 |
+
add.f32 %f79, %f70, %f72;
|
347 |
+
.loc 2 113 30
|
348 |
+
mul.f32 %f80, %f74, %f74;
|
349 |
+
.loc 2 113 38
|
350 |
+
mul.f32 %f81, %f63, %f80;
|
351 |
+
.loc 2 113 22
|
352 |
+
fma.rn.f32 %f82, %f77, %f81, %f79;
|
353 |
+
$L__tmp9:
|
354 |
+
.loc 2 120 46
|
355 |
+
mov.b32 %r113, %f78;
|
356 |
+
shfl.sync.bfly.b32 %r114, %r113, 2, 31, -1;
|
357 |
+
mov.b32 %f83, %r114;
|
358 |
+
mov.b32 %r115, %f82;
|
359 |
+
shfl.sync.bfly.b32 %r116, %r115, 2, 31, -1;
|
360 |
+
mov.b32 %f84, %r116;
|
361 |
+
shfl.sync.bfly.b32 %r63, %r61, 2, 31, -1;
|
362 |
+
mov.b32 %f85, %r63;
|
363 |
+
$L__tmp10:
|
364 |
+
.loc 2 108 21
|
365 |
+
sub.f32 %f86, %f83, %f78;
|
366 |
+
.loc 2 109 28
|
367 |
+
add.f32 %f87, %f75, %f85;
|
368 |
+
.loc 2 110 39
|
369 |
+
setp.eq.f32 %p40, %f87, 0f00000000;
|
370 |
+
.loc 2 110 60
|
371 |
+
mov.b32 %r64, %f87;
|
372 |
+
div.full.f32 %r62, %r63, %r64;
|
373 |
+
mov.b32 %f88, %r62;
|
374 |
+
.loc 2 110 49
|
375 |
+
selp.f32 %f89, 0f00000000, %f88, %p40;
|
376 |
+
.loc 2 112 17
|
377 |
+
fma.rn.f32 %f90, %f89, %f86, %f78;
|
378 |
+
.loc 2 113 15
|
379 |
+
add.f32 %f91, %f82, %f84;
|
380 |
+
.loc 2 113 30
|
381 |
+
mul.f32 %f92, %f86, %f86;
|
382 |
+
.loc 2 113 38
|
383 |
+
mul.f32 %f93, %f75, %f92;
|
384 |
+
.loc 2 113 22
|
385 |
+
fma.rn.f32 %f94, %f89, %f93, %f91;
|
386 |
+
$L__tmp11:
|
387 |
+
.loc 2 120 46
|
388 |
+
mov.b32 %r117, %f90;
|
389 |
+
shfl.sync.bfly.b32 %r118, %r117, 1, 31, -1;
|
390 |
+
mov.b32 %f95, %r118;
|
391 |
+
mov.b32 %r119, %f94;
|
392 |
+
shfl.sync.bfly.b32 %r120, %r119, 1, 31, -1;
|
393 |
+
mov.b32 %f96, %r120;
|
394 |
+
shfl.sync.bfly.b32 %r66, %r64, 1, 31, -1;
|
395 |
+
mov.b32 %f97, %r66;
|
396 |
+
$L__tmp12:
|
397 |
+
.loc 2 108 21
|
398 |
+
sub.f32 %f98, %f95, %f90;
|
399 |
+
.loc 2 109 28
|
400 |
+
add.f32 %f99, %f87, %f97;
|
401 |
+
.loc 2 110 39
|
402 |
+
setp.eq.f32 %p41, %f99, 0f00000000;
|
403 |
+
.loc 2 110 60
|
404 |
+
mov.b32 %r67, %f99;
|
405 |
+
div.full.f32 %r65, %r66, %r67;
|
406 |
+
mov.b32 %f100, %r65;
|
407 |
+
.loc 2 110 49
|
408 |
+
selp.f32 %f101, 0f00000000, %f100, %p41;
|
409 |
+
.loc 2 112 17
|
410 |
+
fma.rn.f32 %f102, %f98, %f101, %f90;
|
411 |
+
.loc 2 113 15
|
412 |
+
add.f32 %f103, %f94, %f96;
|
413 |
+
.loc 2 113 30
|
414 |
+
mul.f32 %f104, %f98, %f98;
|
415 |
+
.loc 2 113 38
|
416 |
+
mul.f32 %f105, %f87, %f104;
|
417 |
+
.loc 2 113 22
|
418 |
+
fma.rn.f32 %f106, %f101, %f105, %f103;
|
419 |
+
$L__tmp13:
|
420 |
+
.loc 2 120 46
|
421 |
+
setp.eq.s32 %p18, %r2, 0;
|
422 |
+
shl.b32 %r121, %r5, 2;
|
423 |
+
shl.b32 %r122, %r3, 3;
|
424 |
+
or.b32 %r123, %r122, %r121;
|
425 |
+
mov.u32 %r124, global_smem;
|
426 |
+
add.s32 %r68, %r124, %r123;
|
427 |
+
mov.b32 %r69, %f102;
|
428 |
+
@%p18 st.shared.b32 [ %r68 + 0 ], %r69;
|
429 |
+
add.s32 %r125, %r124, 16;
|
430 |
+
add.s32 %r70, %r125, %r123;
|
431 |
+
mov.b32 %r71, %f106;
|
432 |
+
@%p18 st.shared.b32 [ %r70 + 0 ], %r71;
|
433 |
+
add.s32 %r126, %r124, 32;
|
434 |
+
add.s32 %r72, %r126, %r123;
|
435 |
+
@%p18 st.shared.b32 [ %r72 + 0 ], %r67;
|
436 |
+
bar.sync 0;
|
437 |
+
setp.lt.s32 %p21, %r1, 4;
|
438 |
+
add.s32 %r75, %r124, %r24;
|
439 |
+
@%p21 ld.shared.b32 %r74, [ %r75 + 0 ];
|
440 |
+
mov.b32 %f107, %r74;
|
441 |
+
add.s32 %r77, %r125, %r24;
|
442 |
+
@%p21 ld.shared.b32 %r76, [ %r77 + 0 ];
|
443 |
+
mov.b32 %f108, %r76;
|
444 |
+
add.s32 %r79, %r126, %r24;
|
445 |
+
@%p21 ld.shared.b32 %r78, [ %r79 + 0 ];
|
446 |
+
mov.b32 %f109, %r78;
|
447 |
+
shfl.sync.bfly.b32 %r128, %r74, 1, 31, -1;
|
448 |
+
mov.b32 %f110, %r128;
|
449 |
+
shfl.sync.bfly.b32 %r129, %r76, 1, 31, -1;
|
450 |
+
mov.b32 %f111, %r129;
|
451 |
+
shfl.sync.bfly.b32 %r81, %r78, 1, 31, -1;
|
452 |
+
mov.b32 %f112, %r81;
|
453 |
+
$L__tmp14:
|
454 |
+
.loc 2 108 21
|
455 |
+
sub.f32 %f113, %f110, %f107;
|
456 |
+
.loc 2 109 28
|
457 |
+
add.f32 %f114, %f109, %f112;
|
458 |
+
.loc 2 110 39
|
459 |
+
setp.eq.f32 %p42, %f114, 0f00000000;
|
460 |
+
.loc 2 110 60
|
461 |
+
mov.b32 %r82, %f114;
|
462 |
+
div.full.f32 %r80, %r81, %r82;
|
463 |
+
mov.b32 %f115, %r80;
|
464 |
+
.loc 2 110 49
|
465 |
+
selp.f32 %f116, 0f00000000, %f115, %p42;
|
466 |
+
.loc 2 112 17
|
467 |
+
fma.rn.f32 %f117, %f113, %f116, %f107;
|
468 |
+
.loc 2 113 15
|
469 |
+
add.f32 %f118, %f108, %f111;
|
470 |
+
.loc 2 113 30
|
471 |
+
mul.f32 %f119, %f113, %f113;
|
472 |
+
.loc 2 113 38
|
473 |
+
mul.f32 %f120, %f109, %f119;
|
474 |
+
.loc 2 113 22
|
475 |
+
fma.rn.f32 %f121, %f120, %f116, %f118;
|
476 |
+
$L__tmp15:
|
477 |
+
.loc 2 120 46
|
478 |
+
setp.eq.s32 %p43, %r4, 0;
|
479 |
+
and.pred %p24, %p21, %p43;
|
480 |
+
mov.b32 %r84, %f117;
|
481 |
+
@%p24 st.shared.b32 [ %r75 + 0 ], %r84;
|
482 |
+
mov.b32 %r86, %f121;
|
483 |
+
@%p24 st.shared.b32 [ %r77 + 0 ], %r86;
|
484 |
+
@%p24 st.shared.b32 [ %r79 + 0 ], %r82;
|
485 |
+
bar.sync 0;
|
486 |
+
add.s32 %r130, %r124, %r122;
|
487 |
+
ld.shared.f32 %f5, [%r130];
|
488 |
+
add.s32 %r131, %r125, %r122;
|
489 |
+
ld.shared.f32 %f6, [%r131];
|
490 |
+
$L__tmp16:
|
491 |
+
.loc 1 59 51
|
492 |
+
mov.u32 %r89, 0x0;
|
493 |
+
mov.u32 %r90, 0x0;
|
494 |
+
mov.u32 %r91, 0x0;
|
495 |
+
mov.u32 %r92, 0x0;
|
496 |
+
@%p44 ld.global.L1::evict_last.v4.b32 { %r89, %r90, %r91, %r92 }, [ %rd33 + 0 ];
|
497 |
+
@!%p44 mov.u32 %r89, %r137;
|
498 |
+
@!%p44 mov.u32 %r90, %r137;
|
499 |
+
@!%p44 mov.u32 %r91, %r137;
|
500 |
+
@!%p44 mov.u32 %r92, %r137;
|
501 |
+
.loc 1 60 35
|
502 |
+
mul.wide.u32 %rd41, %r7, 4;
|
503 |
+
add.s64 %rd34, %rd6, %rd41;
|
504 |
+
.loc 1 60 40
|
505 |
+
mov.u32 %r97, 0x0;
|
506 |
+
mov.u32 %r98, 0x0;
|
507 |
+
@%p44 ld.global.L1::evict_last.v2.b32 { %r97, %r98 }, [ %rd34 + 0 ];
|
508 |
+
@!%p44 mov.u32 %r97, %r137;
|
509 |
+
@!%p44 mov.u32 %r98, %r137;
|
510 |
+
.loc 1 64 57
|
511 |
+
@%p12 bra $L__BB0_4;
|
512 |
+
mov.u64 %rd42, assertMessage_1;
|
513 |
+
cvta.global.u64 %rd43, %rd42;
|
514 |
+
mov.u64 %rd44, assertFile_1;
|
515 |
+
cvta.global.u64 %rd45, %rd44;
|
516 |
+
mov.u64 %rd46, assertFunc_1;
|
517 |
+
cvta.global.u64 %rd47, %rd46;
|
518 |
+
{ // callseq 5, 0
|
519 |
+
.reg .b32 temp_param_reg;
|
520 |
+
.param .b64 param0;
|
521 |
+
st.param.b64 [param0+0], %rd43;
|
522 |
+
.param .b64 param1;
|
523 |
+
st.param.b64 [param1+0], %rd45;
|
524 |
+
.param .b32 param2;
|
525 |
+
st.param.b32 [param2+0], %r168;
|
526 |
+
.param .b64 param3;
|
527 |
+
st.param.b64 [param3+0], %rd47;
|
528 |
+
.param .b64 param4;
|
529 |
+
st.param.b64 [param4+0], %rd52;
|
530 |
+
call.uni
|
531 |
+
__assertfail,
|
532 |
+
(
|
533 |
+
param0,
|
534 |
+
param1,
|
535 |
+
param2,
|
536 |
+
param3,
|
537 |
+
param4
|
538 |
+
);
|
539 |
+
} // callseq 5
|
540 |
+
$L__BB0_4:
|
541 |
+
.loc 1 65 54
|
542 |
+
mov.u32 %r133, 0x0;
|
543 |
+
mov.u32 %r134, 0x0;
|
544 |
+
mov.u32 %r135, 0x0;
|
545 |
+
mov.u32 %r136, 0x0;
|
546 |
+
@%p44 ld.global.L1::evict_first.v4.b32 { %r133, %r134, %r135, %r136 }, [ %rd49 + 0 ];
|
547 |
+
@!%p44 mov.u32 %r133, %r137;
|
548 |
+
@!%p44 mov.u32 %r134, %r137;
|
549 |
+
@!%p44 mov.u32 %r135, %r137;
|
550 |
+
@!%p44 mov.u32 %r136, %r137;
|
551 |
+
.loc 1 69 23
|
552 |
+
mov.b32 %r142, %f6;
|
553 |
+
mov.b32 %r143, 1132462080;
|
554 |
+
div.full.f32 %r141, %r142, %r143;
|
555 |
+
mov.b32 %f122, %r141;
|
556 |
+
.loc 1 71 24
|
557 |
+
add.f32 %f123, %f122, 0f3727C5AC;
|
558 |
+
.loc 1 72 30
|
559 |
+
rsqrt.approx.ftz.f32 %f124, %f123;
|
560 |
+
.loc 1 65 54
|
561 |
+
mov.b32 %f125, %r136;
|
562 |
+
.loc 1 59 51
|
563 |
+
mov.b32 %f126, %r92;
|
564 |
+
.loc 1 66 24
|
565 |
+
add.f32 %f127, %f126, %f125;
|
566 |
+
.loc 1 67 24
|
567 |
+
sub.f32 %f128, %f127, %f5;
|
568 |
+
.loc 1 65 54
|
569 |
+
mov.b32 %f129, %r135;
|
570 |
+
.loc 1 59 51
|
571 |
+
mov.b32 %f130, %r91;
|
572 |
+
.loc 1 66 24
|
573 |
+
add.f32 %f131, %f130, %f129;
|
574 |
+
.loc 1 67 24
|
575 |
+
sub.f32 %f132, %f131, %f5;
|
576 |
+
.loc 1 65 54
|
577 |
+
mov.b32 %f133, %r134;
|
578 |
+
.loc 1 59 51
|
579 |
+
mov.b32 %f134, %r90;
|
580 |
+
.loc 1 66 24
|
581 |
+
add.f32 %f135, %f134, %f133;
|
582 |
+
.loc 1 67 24
|
583 |
+
sub.f32 %f136, %f135, %f5;
|
584 |
+
.loc 1 65 54
|
585 |
+
mov.b32 %f137, %r133;
|
586 |
+
.loc 1 59 51
|
587 |
+
mov.b32 %f138, %r89;
|
588 |
+
.loc 1 66 24
|
589 |
+
add.f32 %f139, %f138, %f137;
|
590 |
+
.loc 1 67 24
|
591 |
+
sub.f32 %f140, %f139, %f5;
|
592 |
+
.loc 1 73 24
|
593 |
+
mul.f32 %f141, %f140, %f124;
|
594 |
+
mul.f32 %f142, %f136, %f124;
|
595 |
+
mul.f32 %f143, %f132, %f124;
|
596 |
+
mul.f32 %f144, %f128, %f124;
|
597 |
+
.loc 1 74 24
|
598 |
+
bar.sync 0;
|
599 |
+
shl.b32 %r159, %r7, 2;
|
600 |
+
add.s32 %r161, %r124, %r159;
|
601 |
+
st.shared.v2.u32 [%r161], {%r97, %r98};
|
602 |
+
bar.sync 0;
|
603 |
+
shl.b32 %r162, %r6, 2;
|
604 |
+
add.s32 %r163, %r124, %r162;
|
605 |
+
ld.shared.v4.f32 {%f145, %f146, %f147, %f148}, [%r163];
|
606 |
+
mul.f32 %f149, %f141, %f145;
|
607 |
+
mul.f32 %f150, %f142, %f146;
|
608 |
+
mul.f32 %f151, %f143, %f147;
|
609 |
+
mul.f32 %f152, %f144, %f148;
|
610 |
+
.loc 1 76 39
|
611 |
+
shl.b32 %r164, %r8, 8;
|
612 |
+
.loc 1 76 35
|
613 |
+
or.b32 %r165, %r164, %r6;
|
614 |
+
.loc 1 76 29
|
615 |
+
mul.wide.s32 %rd51, %r165, 2;
|
616 |
+
add.s64 %rd50, %rd7, %rd51;
|
617 |
+
.loc 1 76 52
|
618 |
+
mov.b32 %r153, %f149;
|
619 |
+
cvt.rn.bf16.f32 %rs1, %r153;
|
620 |
+
mov.b32 %r154, %f150;
|
621 |
+
cvt.rn.bf16.f32 %rs2, %r154;
|
622 |
+
mov.b32 %r155, %f151;
|
623 |
+
cvt.rn.bf16.f32 %rs3, %r155;
|
624 |
+
mov.b32 %r156, %f152;
|
625 |
+
cvt.rn.bf16.f32 %rs4, %r156;
|
626 |
+
mov.b32 %r166, {%rs1, %rs2};
|
627 |
+
mov.b32 %r167, {%rs3, %rs4};
|
628 |
+
@%p44 st.global.v2.b32 [ %rd50 + 0 ], { %r166, %r167 };
|
629 |
+
.loc 1 55 4
|
630 |
+
ret;
|
631 |
+
$L__tmp17:
|
632 |
+
$L__func_end0:
|
633 |
+
|
634 |
+
}
|
635 |
+
// .globl __nv_rsqrtf
|
636 |
+
.visible .func (.param .b32 func_retval0) __nv_rsqrtf(
|
637 |
+
.param .b32 __nv_rsqrtf_param_0
|
638 |
+
)
|
639 |
+
{
|
640 |
+
.reg .f32 %f<3>;
|
641 |
+
$L__func_begin1:
|
642 |
+
|
643 |
+
ld.param.f32 %f1, [__nv_rsqrtf_param_0];
|
644 |
+
rsqrt.approx.ftz.f32 %f2, %f1;
|
645 |
+
st.param.f32 [func_retval0+0], %f2;
|
646 |
+
ret;
|
647 |
+
$L__func_end1:
|
648 |
+
|
649 |
+
}
|
650 |
+
.file 1 "/tmp/torchinductor_root/gx/cgx5lxpuexpindj4dsmjz5x42uhyy7iskevq7ovzpwagb3t5powj.py"
|
651 |
+
.file 2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
|
652 |
+
.section .debug_abbrev
|
653 |
+
{
|
654 |
+
.b8 1
|
655 |
+
.b8 17
|
656 |
+
.b8 1
|
657 |
+
.b8 37
|
658 |
+
.b8 8
|
659 |
+
.b8 19
|
660 |
+
.b8 5
|
661 |
+
.b8 3
|
662 |
+
.b8 8
|
663 |
+
.b8 16
|
664 |
+
.b8 6
|
665 |
+
.b8 27
|
666 |
+
.b8 8
|
667 |
+
.b8 180
|
668 |
+
.b8 66
|
669 |
+
.b8 12
|
670 |
+
.b8 17
|
671 |
+
.b8 1
|
672 |
+
.b8 18
|
673 |
+
.b8 1
|
674 |
+
.b8 0
|
675 |
+
.b8 0
|
676 |
+
.b8 2
|
677 |
+
.b8 46
|
678 |
+
.b8 0
|
679 |
+
.b8 135
|
680 |
+
.b8 64
|
681 |
+
.b8 8
|
682 |
+
.b8 3
|
683 |
+
.b8 8
|
684 |
+
.b8 58
|
685 |
+
.b8 11
|
686 |
+
.b8 59
|
687 |
+
.b8 11
|
688 |
+
.b8 63
|
689 |
+
.b8 12
|
690 |
+
.b8 32
|
691 |
+
.b8 11
|
692 |
+
.b8 0
|
693 |
+
.b8 0
|
694 |
+
.b8 3
|
695 |
+
.b8 46
|
696 |
+
.b8 1
|
697 |
+
.b8 17
|
698 |
+
.b8 1
|
699 |
+
.b8 18
|
700 |
+
.b8 1
|
701 |
+
.b8 64
|
702 |
+
.b8 10
|
703 |
+
.b8 49
|
704 |
+
.b8 19
|
705 |
+
.b8 0
|
706 |
+
.b8 0
|
707 |
+
.b8 4
|
708 |
+
.b8 29
|
709 |
+
.b8 0
|
710 |
+
.b8 49
|
711 |
+
.b8 19
|
712 |
+
.b8 17
|
713 |
+
.b8 1
|
714 |
+
.b8 18
|
715 |
+
.b8 1
|
716 |
+
.b8 88
|
717 |
+
.b8 11
|
718 |
+
.b8 89
|
719 |
+
.b8 11
|
720 |
+
.b8 87
|
721 |
+
.b8 11
|
722 |
+
.b8 0
|
723 |
+
.b8 0
|
724 |
+
.b8 5
|
725 |
+
.b8 29
|
726 |
+
.b8 1
|
727 |
+
.b8 49
|
728 |
+
.b8 19
|
729 |
+
.b8 17
|
730 |
+
.b8 1
|
731 |
+
.b8 18
|
732 |
+
.b8 1
|
733 |
+
.b8 88
|
734 |
+
.b8 11
|
735 |
+
.b8 89
|
736 |
+
.b8 11
|
737 |
+
.b8 87
|
738 |
+
.b8 11
|
739 |
+
.b8 0
|
740 |
+
.b8 0
|
741 |
+
.b8 0
|
742 |
+
}
|
743 |
+
.section .debug_info
|
744 |
+
{
|
745 |
+
.b32 298
|
746 |
+
.b8 2
|
747 |
+
.b8 0
|
748 |
+
.b32 .debug_abbrev
|
749 |
+
.b8 8
|
750 |
+
.b8 1
|
751 |
+
.b8 116
|
752 |
+
.b8 114
|
753 |
+
.b8 105
|
754 |
+
.b8 116
|
755 |
+
.b8 111
|
756 |
+
.b8 110
|
757 |
+
.b8 0
|
758 |
+
.b8 2
|
759 |
+
.b8 0
|
760 |
+
.b8 99
|
761 |
+
.b8 103
|
762 |
+
.b8 120
|
763 |
+
.b8 53
|
764 |
+
.b8 108
|
765 |
+
.b8 120
|
766 |
+
.b8 112
|
767 |
+
.b8 117
|
768 |
+
.b8 101
|
769 |
+
.b8 120
|
770 |
+
.b8 112
|
771 |
+
.b8 105
|
772 |
+
.b8 110
|
773 |
+
.b8 100
|
774 |
+
.b8 106
|
775 |
+
.b8 52
|
776 |
+
.b8 100
|
777 |
+
.b8 115
|
778 |
+
.b8 109
|
779 |
+
.b8 106
|
780 |
+
.b8 122
|
781 |
+
.b8 53
|
782 |
+
.b8 120
|
783 |
+
.b8 52
|
784 |
+
.b8 50
|
785 |
+
.b8 117
|
786 |
+
.b8 104
|
787 |
+
.b8 121
|
788 |
+
.b8 121
|
789 |
+
.b8 55
|
790 |
+
.b8 105
|
791 |
+
.b8 115
|
792 |
+
.b8 107
|
793 |
+
.b8 101
|
794 |
+
.b8 118
|
795 |
+
.b8 113
|
796 |
+
.b8 55
|
797 |
+
.b8 111
|
798 |
+
.b8 118
|
799 |
+
.b8 122
|
800 |
+
.b8 112
|
801 |
+
.b8 119
|
802 |
+
.b8 97
|
803 |
+
.b8 103
|
804 |
+
.b8 98
|
805 |
+
.b8 51
|
806 |
+
.b8 116
|
807 |
+
.b8 53
|
808 |
+
.b8 112
|
809 |
+
.b8 111
|
810 |
+
.b8 119
|
811 |
+
.b8 106
|
812 |
+
.b8 46
|
813 |
+
.b8 112
|
814 |
+
.b8 121
|
815 |
+
.b8 0
|
816 |
+
.b32 .debug_line
|
817 |
+
.b8 47
|
818 |
+
.b8 116
|
819 |
+
.b8 109
|
820 |
+
.b8 112
|
821 |
+
.b8 47
|
822 |
+
.b8 116
|
823 |
+
.b8 111
|
824 |
+
.b8 114
|
825 |
+
.b8 99
|
826 |
+
.b8 104
|
827 |
+
.b8 105
|
828 |
+
.b8 110
|
829 |
+
.b8 100
|
830 |
+
.b8 117
|
831 |
+
.b8 99
|
832 |
+
.b8 116
|
833 |
+
.b8 111
|
834 |
+
.b8 114
|
835 |
+
.b8 95
|
836 |
+
.b8 114
|
837 |
+
.b8 111
|
838 |
+
.b8 111
|
839 |
+
.b8 116
|
840 |
+
.b8 47
|
841 |
+
.b8 103
|
842 |
+
.b8 120
|
843 |
+
.b8 0
|
844 |
+
.b8 1
|
845 |
+
.b64 $L__func_begin0
|
846 |
+
.b64 $L__func_end0
|
847 |
+
.b8 2
|
848 |
+
.b8 116
|
849 |
+
.b8 114
|
850 |
+
.b8 105
|
851 |
+
.b8 116
|
852 |
+
.b8 111
|
853 |
+
.b8 110
|
854 |
+
.b8 95
|
855 |
+
.b8 95
|
856 |
+
.b8 48
|
857 |
+
.b8 100
|
858 |
+
.b8 49
|
859 |
+
.b8 100
|
860 |
+
.b8 50
|
861 |
+
.b8 100
|
862 |
+
.b8 51
|
863 |
+
.b8 100
|
864 |
+
.b8 52
|
865 |
+
.b8 100
|
866 |
+
.b8 53
|
867 |
+
.b8 100
|
868 |
+
.b8 101
|
869 |
+
.b8 54
|
870 |
+
.b8 100
|
871 |
+
.b8 101
|
872 |
+
.b8 0
|
873 |
+
.b8 116
|
874 |
+
.b8 114
|
875 |
+
.b8 105
|
876 |
+
.b8 116
|
877 |
+
.b8 111
|
878 |
+
.b8 110
|
879 |
+
.b8 95
|
880 |
+
.b8 95
|
881 |
+
.b8 48
|
882 |
+
.b8 100
|
883 |
+
.b8 49
|
884 |
+
.b8 100
|
885 |
+
.b8 50
|
886 |
+
.b8 100
|
887 |
+
.b8 51
|
888 |
+
.b8 100
|
889 |
+
.b8 52
|
890 |
+
.b8 100
|
891 |
+
.b8 53
|
892 |
+
.b8 100
|
893 |
+
.b8 101
|
894 |
+
.b8 54
|
895 |
+
.b8 100
|
896 |
+
.b8 101
|
897 |
+
.b8 0
|
898 |
+
.b8 1
|
899 |
+
.b8 18
|
900 |
+
.b8 1
|
901 |
+
.b8 1
|
902 |
+
.b8 3
|
903 |
+
.b64 $L__func_begin0
|
904 |
+
.b64 $L__func_end0
|
905 |
+
.b8 1
|
906 |
+
.b8 156
|
907 |
+
.b32 125
|
908 |
+
.b8 4
|
909 |
+
.b32 125
|
910 |
+
.b64 $L__tmp1
|
911 |
+
.b64 $L__tmp2
|
912 |
+
.b8 2
|
913 |
+
.b8 44
|
914 |
+
.b8 38
|
915 |
+
.b8 5
|
916 |
+
.b32 125
|
917 |
+
.b64 $L__tmp2
|
918 |
+
.b64 $L__tmp15
|
919 |
+
.b8 2
|
920 |
+
.b8 50
|
921 |
+
.b8 41
|
922 |
+
.b8 4
|
923 |
+
.b32 125
|
924 |
+
.b64 $L__tmp2
|
925 |
+
.b64 $L__tmp15
|
926 |
+
.b8 2
|
927 |
+
.b8 120
|
928 |
+
.b8 46
|
929 |
+
.b8 0
|
930 |
+
.b8 4
|
931 |
+
.b32 125
|
932 |
+
.b64 $L__tmp3
|
933 |
+
.b64 $L__tmp16
|
934 |
+
.b8 2
|
935 |
+
.b8 50
|
936 |
+
.b8 41
|
937 |
+
.b8 0
|
938 |
+
.b8 0
|
939 |
+
}
|
940 |
+
.section .debug_pubnames
|
941 |
+
{
|
942 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
943 |
+
$L__pubNames_start0:
|
944 |
+
.b8 2
|
945 |
+
.b8 0
|
946 |
+
.b32 .debug_info
|
947 |
+
.b32 302
|
948 |
+
.b32 125
|
949 |
+
.b8 116
|
950 |
+
.b8 114
|
951 |
+
.b8 105
|
952 |
+
.b8 116
|
953 |
+
.b8 111
|
954 |
+
.b8 110
|
955 |
+
.b8 95
|
956 |
+
.b8 95
|
957 |
+
.b8 48
|
958 |
+
.b8 100
|
959 |
+
.b8 49
|
960 |
+
.b8 100
|
961 |
+
.b8 50
|
962 |
+
.b8 100
|
963 |
+
.b8 51
|
964 |
+
.b8 100
|
965 |
+
.b8 52
|
966 |
+
.b8 100
|
967 |
+
.b8 53
|
968 |
+
.b8 100
|
969 |
+
.b8 101
|
970 |
+
.b8 54
|
971 |
+
.b8 100
|
972 |
+
.b8 101
|
973 |
+
.b8 0
|
974 |
+
.b32 0
|
975 |
+
$L__pubNames_end0:
|
976 |
+
}
|
977 |
+
.section .debug_pubtypes
|
978 |
+
{
|
979 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
980 |
+
$L__pubTypes_start0:
|
981 |
+
.b8 2
|
982 |
+
.b8 0
|
983 |
+
.b32 .debug_info
|
984 |
+
.b32 302
|
985 |
+
.b32 0
|
986 |
+
$L__pubTypes_end0:
|
987 |
+
}
|
988 |
+
.section .debug_loc { }
|
.triton/dump/1ed98b0d136db679153ca6a42fff755c/triton_.ttir
ADDED
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
3 |
+
%cst = arith.constant dense<1.000000e+00> : tensor<1x256xf32>
|
4 |
+
%cst_0 = arith.constant dense<0.000000e+00> : tensor<1x256xf32>
|
5 |
+
%cst_1 = arith.constant 0.000000e+00 : f32
|
6 |
+
%cst_2 = arith.constant dense<256> : tensor<2x1xi64>
|
7 |
+
%cst_3 = arith.constant dense<50257> : tensor<2x1xi64>
|
8 |
+
%cst_4 = arith.constant dense<0> : tensor<2x1xi64>
|
9 |
+
%cst_5 = arith.constant dense<9.99999974E-6> : tensor<2x1xf32>
|
10 |
+
%cst_6 = arith.constant dense<2.560000e+02> : tensor<2x1xf32>
|
11 |
+
%cst_7 = arith.constant dense<0.000000e+00> : tensor<2x256xf32>
|
12 |
+
%cst_8 = arith.constant dense<256> : tensor<2x1xi32>
|
13 |
+
%cst_9 = arith.constant dense<256> : tensor<1x256xi32>
|
14 |
+
%cst_10 = arith.constant dense<512> : tensor<2x1xi32>
|
15 |
+
%c2_i32 = arith.constant 2 : i32
|
16 |
+
%0 = tt.get_program_id x : i32
|
17 |
+
%1 = arith.muli %0, %c2_i32 : i32
|
18 |
+
%2 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32>
|
19 |
+
%3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<2xi32>) -> tensor<2x1xi32>
|
20 |
+
%4 = tt.splat %1 : (i32) -> tensor<2x1xi32>
|
21 |
+
%5 = arith.addi %4, %3 : tensor<2x1xi32>
|
22 |
+
%6 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
|
23 |
+
%7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<256xi32>) -> tensor<1x256xi32>
|
24 |
+
%8 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<2x1x!tt.ptr<i64, 1>>
|
25 |
+
%9 = tt.addptr %8, %5 : tensor<2x1x!tt.ptr<i64, 1>>, tensor<2x1xi32>
|
26 |
+
%10 = tt.load %9 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x1xi64>
|
27 |
+
%11 = arith.remsi %5, %cst_10 : tensor<2x1xi32>
|
28 |
+
%12 = arith.cmpi slt, %7, %cst_9 : tensor<1x256xi32>
|
29 |
+
%13 = arith.muli %11, %cst_8 : tensor<2x1xi32>
|
30 |
+
%14 = tt.broadcast %7 : (tensor<1x256xi32>) -> tensor<2x256xi32>
|
31 |
+
%15 = tt.broadcast %13 : (tensor<2x1xi32>) -> tensor<2x256xi32>
|
32 |
+
%16 = arith.addi %14, %15 : tensor<2x256xi32>
|
33 |
+
%17 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<2x256x!tt.ptr<f32, 1>>
|
34 |
+
%18 = tt.addptr %17, %16 : tensor<2x256x!tt.ptr<f32, 1>>, tensor<2x256xi32>
|
35 |
+
%19 = tt.broadcast %12 : (tensor<1x256xi1>) -> tensor<2x256xi1>
|
36 |
+
%20 = tt.load %18, %19, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xf32>
|
37 |
+
%21 = arith.addi %10, %cst_3 : tensor<2x1xi64>
|
38 |
+
%22 = arith.cmpi slt, %10, %cst_4 : tensor<2x1xi64>
|
39 |
+
%23 = arith.select %22, %21, %10 : tensor<2x1xi1>, tensor<2x1xi64>
|
40 |
+
%24 = arith.cmpi sge, %23, %cst_4 : tensor<2x1xi64>
|
41 |
+
%25 = arith.cmpi slt, %23, %cst_3 : tensor<2x1xi64>
|
42 |
+
%26 = arith.andi %24, %25 : tensor<2x1xi1>
|
43 |
+
tt.assert %26, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<2x1xi1>
|
44 |
+
%27 = arith.muli %23, %cst_2 : tensor<2x1xi64>
|
45 |
+
%28 = tt.broadcast %27 : (tensor<2x1xi64>) -> tensor<2x256xi64>
|
46 |
+
%29 = arith.extsi %7 : tensor<1x256xi32> to tensor<1x256xi64>
|
47 |
+
%30 = tt.broadcast %29 : (tensor<1x256xi64>) -> tensor<2x256xi64>
|
48 |
+
%31 = arith.addi %30, %28 : tensor<2x256xi64>
|
49 |
+
%32 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<2x256x!tt.ptr<f32, 1>>
|
50 |
+
%33 = tt.addptr %32, %31 : tensor<2x256x!tt.ptr<f32, 1>>, tensor<2x256xi64>
|
51 |
+
%34 = tt.load %33, %19, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xf32>
|
52 |
+
%35 = arith.addf %34, %20 : tensor<2x256xf32>
|
53 |
+
%36 = arith.addf %35, %cst_7 : tensor<2x256xf32>
|
54 |
+
%37 = arith.subf %35, %36 : tensor<2x256xf32>
|
55 |
+
%38 = arith.mulf %35, %37 : tensor<2x256xf32>
|
56 |
+
%39 = arith.addf %38, %cst_7 : tensor<2x256xf32>
|
57 |
+
%40 = arith.select %19, %36, %cst_7 : tensor<2x256xi1>, tensor<2x256xf32>
|
58 |
+
%41 = arith.select %19, %39, %cst_7 : tensor<2x256xi1>, tensor<2x256xf32>
|
59 |
+
%42 = arith.select %12, %cst, %cst_0 : tensor<1x256xi1>, tensor<1x256xf32>
|
60 |
+
%43 = tt.broadcast %42 : (tensor<1x256xf32>) -> tensor<2x256xf32>
|
61 |
+
%44:3 = "tt.reduce"(%40, %41, %43) <{axis = 1 : i32}> ({
|
62 |
+
^bb0(%arg7: f32, %arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32):
|
63 |
+
%68 = arith.subf %arg10, %arg7 : f32
|
64 |
+
%69 = arith.addf %arg9, %arg12 : f32
|
65 |
+
%70 = arith.cmpf oeq, %69, %cst_1 : f32
|
66 |
+
%71 = arith.divf %arg12, %69 : f32
|
67 |
+
%72 = arith.select %70, %cst_1, %71 : f32
|
68 |
+
%73 = arith.mulf %68, %72 : f32
|
69 |
+
%74 = arith.addf %arg7, %73 : f32
|
70 |
+
%75 = arith.addf %arg8, %arg11 : f32
|
71 |
+
%76 = arith.mulf %68, %68 : f32
|
72 |
+
%77 = arith.mulf %76, %arg9 : f32
|
73 |
+
%78 = arith.mulf %77, %72 : f32
|
74 |
+
%79 = arith.addf %75, %78 : f32
|
75 |
+
tt.reduce.return %74, %79, %69 : f32, f32, f32
|
76 |
+
}) : (tensor<2x256xf32>, tensor<2x256xf32>, tensor<2x256xf32>) -> (tensor<2xf32>, tensor<2xf32>, tensor<2xf32>)
|
77 |
+
%45 = tt.expand_dims %44#0 {axis = 1 : i32} : (tensor<2xf32>) -> tensor<2x1xf32>
|
78 |
+
%46 = tt.expand_dims %44#1 {axis = 1 : i32} : (tensor<2xf32>) -> tensor<2x1xf32>
|
79 |
+
%47 = tt.load %18, %19, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xf32>
|
80 |
+
%48 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<1x256x!tt.ptr<f32, 1>>
|
81 |
+
%49 = tt.addptr %48, %7 : tensor<1x256x!tt.ptr<f32, 1>>, tensor<1x256xi32>
|
82 |
+
%50 = tt.load %49, %12, %cst_0 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32>
|
83 |
+
tt.assert %26, "index out of bounds: 0 <= tmp13 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<2x1xi1>
|
84 |
+
%51 = tt.load %33, %19, %cst_7 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<2x256xf32>
|
85 |
+
%52 = arith.addf %51, %47 : tensor<2x256xf32>
|
86 |
+
%53 = tt.broadcast %45 : (tensor<2x1xf32>) -> tensor<2x256xf32>
|
87 |
+
%54 = arith.subf %52, %53 : tensor<2x256xf32>
|
88 |
+
%55 = arith.divf %46, %cst_6 : tensor<2x1xf32>
|
89 |
+
%56 = arith.addf %55, %cst_5 : tensor<2x1xf32>
|
90 |
+
%57 = tt.extern_elementwise %56 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32>) -> tensor<2x1xf32>
|
91 |
+
%58 = tt.broadcast %57 : (tensor<2x1xf32>) -> tensor<2x256xf32>
|
92 |
+
%59 = arith.mulf %54, %58 : tensor<2x256xf32>
|
93 |
+
%60 = tt.broadcast %50 : (tensor<1x256xf32>) -> tensor<2x256xf32>
|
94 |
+
%61 = arith.mulf %59, %60 : tensor<2x256xf32>
|
95 |
+
%62 = arith.muli %5, %cst_8 : tensor<2x1xi32>
|
96 |
+
%63 = tt.broadcast %62 : (tensor<2x1xi32>) -> tensor<2x256xi32>
|
97 |
+
%64 = arith.addi %14, %63 : tensor<2x256xi32>
|
98 |
+
%65 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<2x256x!tt.ptr<bf16, 1>>
|
99 |
+
%66 = tt.addptr %65, %64 : tensor<2x256x!tt.ptr<bf16, 1>>, tensor<2x256xi32>
|
100 |
+
%67 = arith.truncf %61 : tensor<2x256xf32> to tensor<2x256xbf16>
|
101 |
+
tt.store %66, %67, %19 {cache = 1 : i32, evict = 1 : i32} : tensor<2x256xbf16>
|
102 |
+
tt.return
|
103 |
+
}
|
104 |
+
}
|
.triton/dump/305a9479aab997a3a16bfe46bb303a50/triton_.llir
ADDED
@@ -0,0 +1,524 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
@assertFunc_1 = internal constant [25 x i8] c"_call_with_frames_removed"
|
5 |
+
@assertFile_1 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
|
6 |
+
@assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp16 < 50257"
|
7 |
+
@assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed"
|
8 |
+
@assertFile_0 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
|
9 |
+
@assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257"
|
10 |
+
@global_smem = external addrspace(3) global [0 x i8]
|
11 |
+
@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
|
12 |
+
|
13 |
+
declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr
|
14 |
+
|
15 |
+
define void @triton__0d1d2d3d4d5d6de7de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, i32 %6, i32 %7) local_unnamed_addr !dbg !7 {
|
16 |
+
%9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
|
17 |
+
%10 = and i32 %9, 31, !dbg !10
|
18 |
+
%11 = lshr i32 %9, 5, !dbg !10
|
19 |
+
%12 = and i32 %11, 1, !dbg !10
|
20 |
+
%urem = shl i32 %9, 2, !dbg !10
|
21 |
+
%13 = and i32 %urem, 252, !dbg !10
|
22 |
+
%14 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !11
|
23 |
+
%15 = sext i32 %14 to i64, !dbg !12
|
24 |
+
%16 = getelementptr i64, ptr addrspace(1) %0, i64 %15, !dbg !12
|
25 |
+
%17 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %16, i1 true) #6, !dbg !13
|
26 |
+
%18 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %16, i1 true) #6, !dbg !13
|
27 |
+
%19 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %16, i1 true) #6, !dbg !13
|
28 |
+
%20 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %16, i1 true) #6, !dbg !13
|
29 |
+
%21 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %16, i1 true) #6, !dbg !13
|
30 |
+
%22 = srem i32 %14, 512, !dbg !14
|
31 |
+
%23 = shl nsw i32 %22, 8, !dbg !15
|
32 |
+
%24 = or i32 %23, %13, !dbg !16
|
33 |
+
%25 = sext i32 %24 to i64, !dbg !17
|
34 |
+
%26 = getelementptr float, ptr addrspace(1) %2, i64 %25, !dbg !17
|
35 |
+
%27 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %26, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !18
|
36 |
+
%28 = extractvalue { i32, i32, i32, i32 } %27, 0, !dbg !18
|
37 |
+
%29 = extractvalue { i32, i32, i32, i32 } %27, 1, !dbg !18
|
38 |
+
%30 = extractvalue { i32, i32, i32, i32 } %27, 2, !dbg !18
|
39 |
+
%31 = extractvalue { i32, i32, i32, i32 } %27, 3, !dbg !18
|
40 |
+
%32 = insertelement <2 x i32> poison, i32 %29, i64 0, !dbg !18
|
41 |
+
%33 = insertelement <2 x i32> %32, i32 %28, i64 1, !dbg !18
|
42 |
+
%34 = bitcast <2 x i32> %33 to <2 x float>, !dbg !18
|
43 |
+
%35 = bitcast i32 %30 to float, !dbg !18
|
44 |
+
%36 = bitcast i32 %31 to float, !dbg !18
|
45 |
+
%37 = shl i32 %14, 8, !dbg !19
|
46 |
+
%38 = or i32 %37, %13, !dbg !20
|
47 |
+
%39 = sext i32 %38 to i64, !dbg !21
|
48 |
+
%40 = getelementptr i16, ptr addrspace(1) %3, i64 %39, !dbg !21
|
49 |
+
%41 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_last.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %40, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !22
|
50 |
+
%42 = extractvalue { i32, i32 } %41, 0, !dbg !22
|
51 |
+
%43 = extractvalue { i32, i32 } %41, 1, !dbg !22
|
52 |
+
%44 = trunc i32 %42 to i16, !dbg !22
|
53 |
+
%extelt.offset = lshr i32 %42, 16, !dbg !22
|
54 |
+
%45 = trunc i32 %extelt.offset to i16, !dbg !22
|
55 |
+
%46 = trunc i32 %43 to i16, !dbg !22
|
56 |
+
%extelt.offset1 = lshr i32 %43, 16, !dbg !22
|
57 |
+
%47 = trunc i32 %extelt.offset1 to i16, !dbg !22
|
58 |
+
%48 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %44) #6, !dbg !23
|
59 |
+
%49 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %45) #6, !dbg !23
|
60 |
+
%50 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %46) #6, !dbg !23
|
61 |
+
%51 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %47) #6, !dbg !23
|
62 |
+
%52 = add i64 %21, 50257, !dbg !24
|
63 |
+
%53 = icmp slt i64 %17, 0, !dbg !25
|
64 |
+
%54 = icmp slt i64 %21, 0, !dbg !25
|
65 |
+
%55 = select i1 %54, i64 %52, i64 %21, !dbg !26
|
66 |
+
%56 = icmp ugt i64 %55, 50256, !dbg !27
|
67 |
+
br i1 %56, label %57, label %58, !dbg !28
|
68 |
+
|
69 |
+
57: ; preds = %8
|
70 |
+
tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !28
|
71 |
+
br label %58, !dbg !28
|
72 |
+
|
73 |
+
58: ; preds = %57, %8
|
74 |
+
%59 = shl i64 %17, 8, !dbg !29
|
75 |
+
%60 = add i64 %59, 12865792, !dbg !29
|
76 |
+
%61 = select i1 %53, i64 %60, i64 %59, !dbg !29
|
77 |
+
%62 = zext nneg i32 %13 to i64
|
78 |
+
%63 = or i64 %61, %62, !dbg !30
|
79 |
+
%64 = getelementptr float, ptr addrspace(1) %1, i64 %63, !dbg !31
|
80 |
+
%65 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %64, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !32
|
81 |
+
%66 = extractvalue { i32, i32, i32, i32 } %65, 0, !dbg !32
|
82 |
+
%67 = extractvalue { i32, i32, i32, i32 } %65, 1, !dbg !32
|
83 |
+
%68 = extractvalue { i32, i32, i32, i32 } %65, 2, !dbg !32
|
84 |
+
%69 = extractvalue { i32, i32, i32, i32 } %65, 3, !dbg !32
|
85 |
+
%70 = bitcast i32 %68 to float, !dbg !32
|
86 |
+
%71 = bitcast i32 %69 to float, !dbg !32
|
87 |
+
%72 = fadd float %35, %70, !dbg !33
|
88 |
+
%73 = fadd float %36, %71, !dbg !33
|
89 |
+
%74 = fadd float %50, %72, !dbg !34
|
90 |
+
%75 = fadd float %51, %73, !dbg !34
|
91 |
+
%76 = insertelement <2 x i32> poison, i32 %67, i64 0, !dbg !32
|
92 |
+
%77 = insertelement <2 x i32> %76, i32 %66, i64 1, !dbg !32
|
93 |
+
%78 = bitcast <2 x i32> %77 to <2 x float>, !dbg !32
|
94 |
+
%79 = fadd <2 x float> %34, %78, !dbg !33
|
95 |
+
%80 = insertelement <2 x float> poison, float %49, i64 0, !dbg !34
|
96 |
+
%81 = insertelement <2 x float> %80, float %48, i64 1, !dbg !34
|
97 |
+
%82 = fadd <2 x float> %81, %79, !dbg !34
|
98 |
+
%83 = fadd <2 x float> %82, zeroinitializer, !dbg !35
|
99 |
+
%84 = fadd float %74, 0.000000e+00, !dbg !35
|
100 |
+
%85 = fadd float %75, 0.000000e+00, !dbg !35
|
101 |
+
%86 = extractelement <2 x float> %83, i64 1, !dbg !39
|
102 |
+
%87 = extractelement <2 x float> %82, i64 1, !dbg !43
|
103 |
+
%88 = fsub float %87, %86, !dbg !44
|
104 |
+
%89 = extractelement <2 x float> %83, i64 0, !dbg !39
|
105 |
+
%90 = extractelement <2 x float> %82, i64 0, !dbg !43
|
106 |
+
%91 = fsub float %90, %89, !dbg !44
|
107 |
+
%92 = fsub float %74, %84, !dbg !44
|
108 |
+
%93 = fsub float %75, %85, !dbg !44
|
109 |
+
%94 = fmul float %87, %88, !dbg !43
|
110 |
+
%95 = fmul float %90, %91, !dbg !43
|
111 |
+
%96 = fmul float %74, %92, !dbg !43
|
112 |
+
%97 = fmul float %75, %93, !dbg !43
|
113 |
+
%98 = fadd float %94, 0.000000e+00, !dbg !45
|
114 |
+
%99 = fadd float %95, 0.000000e+00, !dbg !45
|
115 |
+
%100 = fadd float %96, 0.000000e+00, !dbg !45
|
116 |
+
%101 = fadd float %97, 0.000000e+00, !dbg !45
|
117 |
+
%102 = fsub float %89, %86, !dbg !39
|
118 |
+
%103 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 2.000000e+00) #6, !dbg !46
|
119 |
+
%104 = fmul float %103, %102, !dbg !47
|
120 |
+
%105 = fadd float %86, %104, !dbg !48
|
121 |
+
%106 = fadd float %98, %99, !dbg !49
|
122 |
+
%107 = fmul float %102, %102, !dbg !50
|
123 |
+
%108 = fmul float %103, %107, !dbg !51
|
124 |
+
%109 = fadd float %108, %106, !dbg !52
|
125 |
+
%110 = fsub float %84, %105, !dbg !39
|
126 |
+
%111 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 3.000000e+00) #6, !dbg !46
|
127 |
+
%112 = fmul float %111, %110, !dbg !47
|
128 |
+
%113 = fadd float %105, %112, !dbg !48
|
129 |
+
%114 = fadd float %100, %109, !dbg !49
|
130 |
+
%115 = fmul float %110, %110, !dbg !50
|
131 |
+
%116 = fmul float %115, 2.000000e+00, !dbg !53
|
132 |
+
%117 = fmul float %111, %116, !dbg !51
|
133 |
+
%118 = fadd float %114, %117, !dbg !52
|
134 |
+
%119 = fsub float %85, %113, !dbg !39
|
135 |
+
%120 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 4.000000e+00) #6, !dbg !46
|
136 |
+
%121 = fmul float %120, %119, !dbg !47
|
137 |
+
%122 = fadd float %113, %121, !dbg !48
|
138 |
+
%123 = fadd float %101, %118, !dbg !49
|
139 |
+
%124 = fmul float %119, %119, !dbg !50
|
140 |
+
%125 = fmul float %124, 3.000000e+00, !dbg !53
|
141 |
+
%126 = fmul float %120, %125, !dbg !51
|
142 |
+
%127 = fadd float %123, %126, !dbg !52
|
143 |
+
%128 = bitcast float %122 to i32, !dbg !54
|
144 |
+
%129 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %128, i32 16, i32 31), !dbg !54
|
145 |
+
%130 = bitcast i32 %129 to float, !dbg !54
|
146 |
+
%131 = bitcast float %127 to i32, !dbg !54
|
147 |
+
%132 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %131, i32 16, i32 31), !dbg !54
|
148 |
+
%133 = bitcast i32 %132 to float, !dbg !54
|
149 |
+
%134 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 1082130432, i32 16, i32 31), !dbg !54
|
150 |
+
%135 = bitcast i32 %134 to float, !dbg !54
|
151 |
+
%136 = fsub float %130, %122, !dbg !39
|
152 |
+
%137 = fadd float %135, 4.000000e+00, !dbg !56
|
153 |
+
%138 = fcmp oeq float %137, 0.000000e+00, !dbg !57
|
154 |
+
%139 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %135, float %137) #6, !dbg !46
|
155 |
+
%140 = select i1 %138, float 0.000000e+00, float %139, !dbg !58
|
156 |
+
%141 = fmul float %140, %136, !dbg !47
|
157 |
+
%142 = fadd float %122, %141, !dbg !48
|
158 |
+
%143 = fadd float %127, %133, !dbg !49
|
159 |
+
%144 = fmul float %136, %136, !dbg !50
|
160 |
+
%145 = fmul float %144, 4.000000e+00, !dbg !53
|
161 |
+
%146 = fmul float %140, %145, !dbg !51
|
162 |
+
%147 = fadd float %143, %146, !dbg !52
|
163 |
+
%148 = bitcast float %142 to i32, !dbg !54
|
164 |
+
%149 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %148, i32 8, i32 31), !dbg !54
|
165 |
+
%150 = bitcast i32 %149 to float, !dbg !54
|
166 |
+
%151 = bitcast float %147 to i32, !dbg !54
|
167 |
+
%152 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %151, i32 8, i32 31), !dbg !54
|
168 |
+
%153 = bitcast i32 %152 to float, !dbg !54
|
169 |
+
%154 = bitcast float %137 to i32, !dbg !54
|
170 |
+
%155 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %154, i32 8, i32 31), !dbg !54
|
171 |
+
%156 = bitcast i32 %155 to float, !dbg !54
|
172 |
+
%157 = fsub float %150, %142, !dbg !39
|
173 |
+
%158 = fadd float %137, %156, !dbg !56
|
174 |
+
%159 = fcmp oeq float %158, 0.000000e+00, !dbg !57
|
175 |
+
%160 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %156, float %158) #6, !dbg !46
|
176 |
+
%161 = select i1 %159, float 0.000000e+00, float %160, !dbg !58
|
177 |
+
%162 = fmul float %161, %157, !dbg !47
|
178 |
+
%163 = fadd float %142, %162, !dbg !48
|
179 |
+
%164 = fadd float %147, %153, !dbg !49
|
180 |
+
%165 = fmul float %157, %157, !dbg !50
|
181 |
+
%166 = fmul float %137, %165, !dbg !53
|
182 |
+
%167 = fmul float %161, %166, !dbg !51
|
183 |
+
%168 = fadd float %164, %167, !dbg !52
|
184 |
+
%169 = bitcast float %163 to i32, !dbg !54
|
185 |
+
%170 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %169, i32 4, i32 31), !dbg !54
|
186 |
+
%171 = bitcast i32 %170 to float, !dbg !54
|
187 |
+
%172 = bitcast float %168 to i32, !dbg !54
|
188 |
+
%173 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %172, i32 4, i32 31), !dbg !54
|
189 |
+
%174 = bitcast i32 %173 to float, !dbg !54
|
190 |
+
%175 = bitcast float %158 to i32, !dbg !54
|
191 |
+
%176 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %175, i32 4, i32 31), !dbg !54
|
192 |
+
%177 = bitcast i32 %176 to float, !dbg !54
|
193 |
+
%178 = fsub float %171, %163, !dbg !39
|
194 |
+
%179 = fadd float %158, %177, !dbg !56
|
195 |
+
%180 = fcmp oeq float %179, 0.000000e+00, !dbg !57
|
196 |
+
%181 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %177, float %179) #6, !dbg !46
|
197 |
+
%182 = select i1 %180, float 0.000000e+00, float %181, !dbg !58
|
198 |
+
%183 = fmul float %182, %178, !dbg !47
|
199 |
+
%184 = fadd float %163, %183, !dbg !48
|
200 |
+
%185 = fadd float %168, %174, !dbg !49
|
201 |
+
%186 = fmul float %178, %178, !dbg !50
|
202 |
+
%187 = fmul float %158, %186, !dbg !53
|
203 |
+
%188 = fmul float %182, %187, !dbg !51
|
204 |
+
%189 = fadd float %185, %188, !dbg !52
|
205 |
+
%190 = bitcast float %184 to i32, !dbg !54
|
206 |
+
%191 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %190, i32 2, i32 31), !dbg !54
|
207 |
+
%192 = bitcast i32 %191 to float, !dbg !54
|
208 |
+
%193 = bitcast float %189 to i32, !dbg !54
|
209 |
+
%194 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %193, i32 2, i32 31), !dbg !54
|
210 |
+
%195 = bitcast i32 %194 to float, !dbg !54
|
211 |
+
%196 = bitcast float %179 to i32, !dbg !54
|
212 |
+
%197 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %196, i32 2, i32 31), !dbg !54
|
213 |
+
%198 = bitcast i32 %197 to float, !dbg !54
|
214 |
+
%199 = fsub float %192, %184, !dbg !39
|
215 |
+
%200 = fadd float %179, %198, !dbg !56
|
216 |
+
%201 = fcmp oeq float %200, 0.000000e+00, !dbg !57
|
217 |
+
%202 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %198, float %200) #6, !dbg !46
|
218 |
+
%203 = select i1 %201, float 0.000000e+00, float %202, !dbg !58
|
219 |
+
%204 = fmul float %203, %199, !dbg !47
|
220 |
+
%205 = fadd float %184, %204, !dbg !48
|
221 |
+
%206 = fadd float %189, %195, !dbg !49
|
222 |
+
%207 = fmul float %199, %199, !dbg !50
|
223 |
+
%208 = fmul float %179, %207, !dbg !53
|
224 |
+
%209 = fmul float %203, %208, !dbg !51
|
225 |
+
%210 = fadd float %206, %209, !dbg !52
|
226 |
+
%211 = bitcast float %205 to i32, !dbg !54
|
227 |
+
%212 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %211, i32 1, i32 31), !dbg !54
|
228 |
+
%213 = bitcast i32 %212 to float, !dbg !54
|
229 |
+
%214 = bitcast float %210 to i32, !dbg !54
|
230 |
+
%215 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %214, i32 1, i32 31), !dbg !54
|
231 |
+
%216 = bitcast i32 %215 to float, !dbg !54
|
232 |
+
%217 = bitcast float %200 to i32, !dbg !54
|
233 |
+
%218 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %217, i32 1, i32 31), !dbg !54
|
234 |
+
%219 = bitcast i32 %218 to float, !dbg !54
|
235 |
+
%220 = fsub float %213, %205, !dbg !39
|
236 |
+
%221 = fadd float %200, %219, !dbg !56
|
237 |
+
%222 = fcmp oeq float %221, 0.000000e+00, !dbg !57
|
238 |
+
%223 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %219, float %221) #6, !dbg !46
|
239 |
+
%224 = select i1 %222, float 0.000000e+00, float %223, !dbg !58
|
240 |
+
%225 = fmul float %224, %220, !dbg !47
|
241 |
+
%226 = fadd float %205, %225, !dbg !48
|
242 |
+
%227 = fadd float %210, %216, !dbg !49
|
243 |
+
%228 = fmul float %220, %220, !dbg !50
|
244 |
+
%229 = fmul float %200, %228, !dbg !53
|
245 |
+
%230 = fmul float %224, %229, !dbg !51
|
246 |
+
%231 = fadd float %227, %230, !dbg !52
|
247 |
+
%232 = icmp eq i32 %10, 0, !dbg !54
|
248 |
+
%233 = zext nneg i32 %12 to i64, !dbg !54
|
249 |
+
%234 = getelementptr float, ptr addrspace(3) @global_smem, i64 %233, !dbg !54
|
250 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %234, float %226, i1 %232) #6, !dbg !54
|
251 |
+
%235 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 8), i64 %233, !dbg !54
|
252 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %235, float %231, i1 %232) #6, !dbg !54
|
253 |
+
%236 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), i64 %233, !dbg !54
|
254 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %236, float %221, i1 %232) #6, !dbg !54
|
255 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !54
|
256 |
+
%237 = icmp slt i32 %9, 2, !dbg !54
|
257 |
+
%238 = sext i32 %9 to i64, !dbg !54
|
258 |
+
%239 = getelementptr float, ptr addrspace(3) @global_smem, i64 %238, !dbg !54
|
259 |
+
%240 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %239, i1 %237) #6, !dbg !54
|
260 |
+
%241 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 8), i64 %238, !dbg !54
|
261 |
+
%242 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %241, i1 %237) #6, !dbg !54
|
262 |
+
%243 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), i64 %238, !dbg !54
|
263 |
+
%244 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %243, i1 %237) #6, !dbg !54
|
264 |
+
%245 = bitcast float %240 to i32, !dbg !54
|
265 |
+
%246 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %245, i32 1, i32 31), !dbg !54
|
266 |
+
%247 = bitcast i32 %246 to float, !dbg !54
|
267 |
+
%248 = bitcast float %242 to i32, !dbg !54
|
268 |
+
%249 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %248, i32 1, i32 31), !dbg !54
|
269 |
+
%250 = bitcast i32 %249 to float, !dbg !54
|
270 |
+
%251 = bitcast float %244 to i32, !dbg !54
|
271 |
+
%252 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %251, i32 1, i32 31), !dbg !54
|
272 |
+
%253 = bitcast i32 %252 to float, !dbg !54
|
273 |
+
%254 = fsub float %247, %240, !dbg !39
|
274 |
+
%255 = fadd float %244, %253, !dbg !56
|
275 |
+
%256 = fcmp oeq float %255, 0.000000e+00, !dbg !57
|
276 |
+
%257 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %253, float %255) #6, !dbg !46
|
277 |
+
%258 = select i1 %256, float 0.000000e+00, float %257, !dbg !58
|
278 |
+
%259 = fmul float %254, %258, !dbg !47
|
279 |
+
%260 = fadd float %240, %259, !dbg !48
|
280 |
+
%261 = fadd float %242, %250, !dbg !49
|
281 |
+
%262 = fmul float %254, %254, !dbg !50
|
282 |
+
%263 = fmul float %244, %262, !dbg !53
|
283 |
+
%264 = fmul float %263, %258, !dbg !51
|
284 |
+
%265 = fadd float %261, %264, !dbg !52
|
285 |
+
%266 = and i32 %9, 1, !dbg !54
|
286 |
+
%267 = icmp eq i32 %266, 0, !dbg !54
|
287 |
+
%268 = and i1 %237, %267, !dbg !54
|
288 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %239, float %260, i1 %268) #6, !dbg !54
|
289 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %241, float %265, i1 %268) #6, !dbg !54
|
290 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %243, float %255, i1 %268) #6, !dbg !54
|
291 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !54
|
292 |
+
%269 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !54
|
293 |
+
%270 = load float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 8), align 4, !dbg !54
|
294 |
+
%271 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %26, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !59
|
295 |
+
%272 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_first.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %40, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !60
|
296 |
+
%273 = extractvalue { i32, i32 } %272, 0, !dbg !60
|
297 |
+
%274 = extractvalue { i32, i32 } %272, 1, !dbg !60
|
298 |
+
%275 = trunc i32 %273 to i16, !dbg !60
|
299 |
+
%extelt.offset2 = lshr i32 %273, 16, !dbg !60
|
300 |
+
%276 = trunc i32 %extelt.offset2 to i16, !dbg !60
|
301 |
+
%277 = trunc i32 %274 to i16, !dbg !60
|
302 |
+
%extelt.offset3 = lshr i32 %274, 16, !dbg !60
|
303 |
+
%278 = trunc i32 %extelt.offset3 to i16, !dbg !60
|
304 |
+
%279 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %275) #6, !dbg !61
|
305 |
+
%280 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %276) #6, !dbg !61
|
306 |
+
%281 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %277) #6, !dbg !61
|
307 |
+
%282 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %278) #6, !dbg !61
|
308 |
+
%283 = getelementptr float, ptr addrspace(1) %4, i64 %62, !dbg !62
|
309 |
+
%284 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %283, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !63
|
310 |
+
br i1 %56, label %285, label %286, !dbg !64
|
311 |
+
|
312 |
+
285: ; preds = %58
|
313 |
+
tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 883, ptr nonnull @assertFunc_1, i64 1), !dbg !64
|
314 |
+
br label %286, !dbg !64
|
315 |
+
|
316 |
+
286: ; preds = %285, %58
|
317 |
+
%287 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %64, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !65
|
318 |
+
%288 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %270, float 2.560000e+02) #6, !dbg !66
|
319 |
+
%289 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %270, float 2.560000e+02) #6, !dbg !66
|
320 |
+
%290 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %270, float 2.560000e+02) #6, !dbg !66
|
321 |
+
%291 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %270, float 2.560000e+02) #6, !dbg !66
|
322 |
+
%292 = fadd float %288, 0x3EE4F8B580000000, !dbg !67
|
323 |
+
%293 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !68
|
324 |
+
%.not.i = icmp eq i32 %293, 0, !dbg !68
|
325 |
+
br i1 %.not.i, label %296, label %294, !dbg !68
|
326 |
+
|
327 |
+
294: ; preds = %286
|
328 |
+
%295 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %292), !dbg !68
|
329 |
+
br label %__nv_rsqrtf.exit, !dbg !68
|
330 |
+
|
331 |
+
296: ; preds = %286
|
332 |
+
%297 = tail call float @llvm.nvvm.rsqrt.approx.f(float %292), !dbg !68
|
333 |
+
br label %__nv_rsqrtf.exit, !dbg !68
|
334 |
+
|
335 |
+
__nv_rsqrtf.exit: ; preds = %294, %296
|
336 |
+
%.0.i = phi float [ %295, %294 ], [ %297, %296 ], !dbg !68
|
337 |
+
%298 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !68
|
338 |
+
%299 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !68
|
339 |
+
%300 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !68
|
340 |
+
%301 = extractvalue { i32, i32, i32, i32 } %287, 3, !dbg !65
|
341 |
+
%302 = bitcast i32 %301 to float, !dbg !65
|
342 |
+
%303 = extractvalue { i32, i32, i32, i32 } %271, 3, !dbg !59
|
343 |
+
%304 = bitcast i32 %303 to float, !dbg !59
|
344 |
+
%305 = fadd float %304, %302, !dbg !69
|
345 |
+
%306 = fadd float %282, %305, !dbg !70
|
346 |
+
%307 = fsub float %306, %269, !dbg !71
|
347 |
+
%308 = extractvalue { i32, i32, i32, i32 } %287, 2, !dbg !65
|
348 |
+
%309 = bitcast i32 %308 to float, !dbg !65
|
349 |
+
%310 = extractvalue { i32, i32, i32, i32 } %271, 2, !dbg !59
|
350 |
+
%311 = bitcast i32 %310 to float, !dbg !59
|
351 |
+
%312 = fadd float %311, %309, !dbg !69
|
352 |
+
%313 = fadd float %281, %312, !dbg !70
|
353 |
+
%314 = fsub float %313, %269, !dbg !71
|
354 |
+
%315 = extractvalue { i32, i32, i32, i32 } %287, 1, !dbg !65
|
355 |
+
%316 = bitcast i32 %315 to float, !dbg !65
|
356 |
+
%317 = extractvalue { i32, i32, i32, i32 } %271, 1, !dbg !59
|
357 |
+
%318 = bitcast i32 %317 to float, !dbg !59
|
358 |
+
%319 = fadd float %318, %316, !dbg !69
|
359 |
+
%320 = fadd float %280, %319, !dbg !70
|
360 |
+
%321 = fsub float %320, %269, !dbg !71
|
361 |
+
%322 = extractvalue { i32, i32, i32, i32 } %287, 0, !dbg !65
|
362 |
+
%323 = bitcast i32 %322 to float, !dbg !65
|
363 |
+
%324 = extractvalue { i32, i32, i32, i32 } %271, 0, !dbg !59
|
364 |
+
%325 = bitcast i32 %324 to float, !dbg !59
|
365 |
+
%326 = fadd float %325, %323, !dbg !69
|
366 |
+
%327 = fadd float %279, %326, !dbg !70
|
367 |
+
%328 = fsub float %327, %269, !dbg !71
|
368 |
+
%329 = extractvalue { i32, i32, i32, i32 } %284, 0, !dbg !63
|
369 |
+
%330 = bitcast i32 %329 to float, !dbg !63
|
370 |
+
%331 = extractvalue { i32, i32, i32, i32 } %284, 1, !dbg !63
|
371 |
+
%332 = bitcast i32 %331 to float, !dbg !63
|
372 |
+
%333 = extractvalue { i32, i32, i32, i32 } %284, 2, !dbg !63
|
373 |
+
%334 = bitcast i32 %333 to float, !dbg !63
|
374 |
+
%335 = extractvalue { i32, i32, i32, i32 } %284, 3, !dbg !63
|
375 |
+
%336 = bitcast i32 %335 to float, !dbg !63
|
376 |
+
%337 = fmul float %328, %.0.i, !dbg !72
|
377 |
+
%338 = fmul float %321, %.0.i, !dbg !72
|
378 |
+
%339 = fmul float %314, %.0.i, !dbg !72
|
379 |
+
%340 = fmul float %307, %.0.i, !dbg !72
|
380 |
+
%341 = fmul float %337, %330, !dbg !73
|
381 |
+
%342 = fmul float %338, %332, !dbg !73
|
382 |
+
%343 = fmul float %339, %334, !dbg !73
|
383 |
+
%344 = fmul float %340, %336, !dbg !73
|
384 |
+
%345 = getelementptr i16, ptr addrspace(1) %5, i64 %39, !dbg !74
|
385 |
+
%346 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %341) #6, !dbg !75
|
386 |
+
%347 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %342) #6, !dbg !75
|
387 |
+
%348 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %343) #6, !dbg !75
|
388 |
+
%349 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %344) #6, !dbg !75
|
389 |
+
%350 = insertelement <2 x i16> undef, i16 %346, i64 0, !dbg !75
|
390 |
+
%351 = insertelement <2 x i16> %350, i16 %347, i64 1, !dbg !75
|
391 |
+
%352 = bitcast <2 x i16> %351 to i32, !dbg !75
|
392 |
+
%353 = insertelement <2 x i16> undef, i16 %348, i64 0, !dbg !75
|
393 |
+
%354 = insertelement <2 x i16> %353, i16 %349, i64 1, !dbg !75
|
394 |
+
%355 = bitcast <2 x i16> %354 to i32, !dbg !75
|
395 |
+
tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %352, i32 %355, ptr addrspace(1) %345, i1 true) #6, !dbg !75
|
396 |
+
ret void, !dbg !76
|
397 |
+
}
|
398 |
+
|
399 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
400 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
401 |
+
|
402 |
+
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
|
403 |
+
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
|
404 |
+
|
405 |
+
; Function Attrs: convergent nocallback nounwind
|
406 |
+
declare void @llvm.nvvm.barrier0() #2
|
407 |
+
|
408 |
+
; Function Attrs: alwaysinline nounwind
|
409 |
+
define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
|
410 |
+
%1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
|
411 |
+
%.not = icmp eq i32 %1, 0
|
412 |
+
br i1 %.not, label %4, label %2
|
413 |
+
|
414 |
+
2: ; preds = %0
|
415 |
+
%3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
|
416 |
+
br label %6
|
417 |
+
|
418 |
+
4: ; preds = %0
|
419 |
+
%5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
|
420 |
+
br label %6
|
421 |
+
|
422 |
+
6: ; preds = %4, %2
|
423 |
+
%.0 = phi float [ %3, %2 ], [ %5, %4 ]
|
424 |
+
ret float %.0
|
425 |
+
}
|
426 |
+
|
427 |
+
declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
|
428 |
+
|
429 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
|
430 |
+
declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
|
431 |
+
|
432 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
|
433 |
+
declare float @llvm.nvvm.rsqrt.approx.f(float) #5
|
434 |
+
|
435 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
436 |
+
attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
|
437 |
+
attributes #2 = { convergent nocallback nounwind }
|
438 |
+
attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
439 |
+
attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
440 |
+
attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
|
441 |
+
attributes #6 = { nounwind }
|
442 |
+
|
443 |
+
!llvm.module.flags = !{!0, !1}
|
444 |
+
!llvm.dbg.cu = !{!2}
|
445 |
+
!nvvm.annotations = !{!4, !5, !5, !4}
|
446 |
+
!llvm.ident = !{!6}
|
447 |
+
|
448 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
449 |
+
!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
|
450 |
+
!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
451 |
+
!3 = !DIFile(filename: "cpn3lawg65lpi63gv6c6pn4oikhg6qva2h2qjdpxe6qj4lvttwez.py", directory: "/tmp/torchinductor_root/pn")
|
452 |
+
!4 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"kernel", i32 1}
|
453 |
+
!5 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"maxntidx", i32 64}
|
454 |
+
!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
|
455 |
+
!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6de7de", linkageName: "triton__0d1d2d3d4d5d6de7de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
|
456 |
+
!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
|
457 |
+
!9 = !{}
|
458 |
+
!10 = !DILocation(line: 24, column: 33, scope: !7)
|
459 |
+
!11 = !DILocation(line: 21, column: 28, scope: !7)
|
460 |
+
!12 = !DILocation(line: 26, column: 30, scope: !7)
|
461 |
+
!13 = !DILocation(line: 26, column: 35, scope: !7)
|
462 |
+
!14 = !DILocation(line: 27, column: 18, scope: !7)
|
463 |
+
!15 = !DILocation(line: 35, column: 44, scope: !7)
|
464 |
+
!16 = !DILocation(line: 35, column: 40, scope: !7)
|
465 |
+
!17 = !DILocation(line: 35, column: 34, scope: !7)
|
466 |
+
!18 = !DILocation(line: 35, column: 50, scope: !7)
|
467 |
+
!19 = !DILocation(line: 36, column: 44, scope: !7)
|
468 |
+
!20 = !DILocation(line: 36, column: 40, scope: !7)
|
469 |
+
!21 = !DILocation(line: 36, column: 34, scope: !7)
|
470 |
+
!22 = !DILocation(line: 36, column: 50, scope: !7)
|
471 |
+
!23 = !DILocation(line: 36, column: 101, scope: !7)
|
472 |
+
!24 = !DILocation(line: 37, column: 22, scope: !7)
|
473 |
+
!25 = !DILocation(line: 38, column: 22, scope: !7)
|
474 |
+
!26 = !DILocation(line: 39, column: 36, scope: !7)
|
475 |
+
!27 = !DILocation(line: 40, column: 40, scope: !7)
|
476 |
+
!28 = !DILocation(line: 40, column: 55, scope: !7)
|
477 |
+
!29 = !DILocation(line: 41, column: 44, scope: !7)
|
478 |
+
!30 = !DILocation(line: 41, column: 40, scope: !7)
|
479 |
+
!31 = !DILocation(line: 41, column: 34, scope: !7)
|
480 |
+
!32 = !DILocation(line: 41, column: 52, scope: !7)
|
481 |
+
!33 = !DILocation(line: 42, column: 22, scope: !7)
|
482 |
+
!34 = !DILocation(line: 44, column: 22, scope: !7)
|
483 |
+
!35 = !DILocation(line: 98, column: 22, scope: !36, inlinedAt: !38)
|
484 |
+
!36 = distinct !DILexicalBlockFile(scope: !7, file: !37, discriminator: 0)
|
485 |
+
!37 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
|
486 |
+
!38 = !DILocation(line: 47, column: 41, scope: !36)
|
487 |
+
!39 = !DILocation(line: 108, column: 21, scope: !40, inlinedAt: !41)
|
488 |
+
!40 = distinct !DILexicalBlockFile(scope: !36, file: !37, discriminator: 0)
|
489 |
+
!41 = !DILocation(line: 120, column: 46, scope: !40, inlinedAt: !42)
|
490 |
+
!42 = !DILocation(line: 53, column: 44, scope: !40)
|
491 |
+
!43 = !DILocation(line: 101, column: 22, scope: !36, inlinedAt: !38)
|
492 |
+
!44 = !DILocation(line: 101, column: 30, scope: !36, inlinedAt: !38)
|
493 |
+
!45 = !DILocation(line: 101, column: 13, scope: !36, inlinedAt: !38)
|
494 |
+
!46 = !DILocation(line: 110, column: 60, scope: !40, inlinedAt: !41)
|
495 |
+
!47 = !DILocation(line: 112, column: 25, scope: !40, inlinedAt: !41)
|
496 |
+
!48 = !DILocation(line: 112, column: 17, scope: !40, inlinedAt: !41)
|
497 |
+
!49 = !DILocation(line: 113, column: 15, scope: !40, inlinedAt: !41)
|
498 |
+
!50 = !DILocation(line: 113, column: 30, scope: !40, inlinedAt: !41)
|
499 |
+
!51 = !DILocation(line: 113, column: 49, scope: !40, inlinedAt: !41)
|
500 |
+
!52 = !DILocation(line: 113, column: 22, scope: !40, inlinedAt: !41)
|
501 |
+
!53 = !DILocation(line: 113, column: 38, scope: !40, inlinedAt: !41)
|
502 |
+
!54 = !DILocation(line: 120, column: 46, scope: !36, inlinedAt: !55)
|
503 |
+
!55 = !DILocation(line: 53, column: 44, scope: !36)
|
504 |
+
!56 = !DILocation(line: 109, column: 28, scope: !40, inlinedAt: !41)
|
505 |
+
!57 = !DILocation(line: 110, column: 39, scope: !40, inlinedAt: !41)
|
506 |
+
!58 = !DILocation(line: 110, column: 49, scope: !40, inlinedAt: !41)
|
507 |
+
!59 = !DILocation(line: 62, column: 51, scope: !7)
|
508 |
+
!60 = !DILocation(line: 63, column: 51, scope: !7)
|
509 |
+
!61 = !DILocation(line: 63, column: 103, scope: !7)
|
510 |
+
!62 = !DILocation(line: 64, column: 35, scope: !7)
|
511 |
+
!63 = !DILocation(line: 64, column: 40, scope: !7)
|
512 |
+
!64 = !DILocation(line: 68, column: 57, scope: !7)
|
513 |
+
!65 = !DILocation(line: 69, column: 54, scope: !7)
|
514 |
+
!66 = !DILocation(line: 75, column: 24, scope: !7)
|
515 |
+
!67 = !DILocation(line: 77, column: 24, scope: !7)
|
516 |
+
!68 = !DILocation(line: 78, column: 30, scope: !7)
|
517 |
+
!69 = !DILocation(line: 70, column: 24, scope: !7)
|
518 |
+
!70 = !DILocation(line: 72, column: 24, scope: !7)
|
519 |
+
!71 = !DILocation(line: 73, column: 24, scope: !7)
|
520 |
+
!72 = !DILocation(line: 79, column: 24, scope: !7)
|
521 |
+
!73 = !DILocation(line: 80, column: 24, scope: !7)
|
522 |
+
!74 = !DILocation(line: 82, column: 29, scope: !7)
|
523 |
+
!75 = !DILocation(line: 82, column: 52, scope: !7)
|
524 |
+
!76 = !DILocation(line: 58, column: 4, scope: !7)
|
.triton/dump/305a9479aab997a3a16bfe46bb303a50/triton_.ttgir
ADDED
@@ -0,0 +1,110 @@
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
2 |
+
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
3 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
4 |
+
tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
5 |
+
%cst = arith.constant dense<256> : tensor<1x256xi32, #blocked>
|
6 |
+
%cst_0 = arith.constant dense<0.000000e+00> : tensor<1x256xf32, #blocked>
|
7 |
+
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1x256xf32, #blocked>
|
8 |
+
%cst_2 = arith.constant dense<256> : tensor<1x1xi64, #blocked>
|
9 |
+
%cst_3 = arith.constant dense<50257> : tensor<1x1xi64, #blocked>
|
10 |
+
%cst_4 = arith.constant dense<0> : tensor<1x1xi64, #blocked>
|
11 |
+
%cst_5 = arith.constant dense<0> : tensor<1x1xi64, #blocked1>
|
12 |
+
%cst_6 = arith.constant dense<50257> : tensor<1x1xi64, #blocked1>
|
13 |
+
%cst_7 = arith.constant 0.000000e+00 : f32
|
14 |
+
%c256_i32 = arith.constant 256 : i32
|
15 |
+
%c512_i32 = arith.constant 512 : i32
|
16 |
+
%cst_8 = arith.constant dense<9.99999974E-6> : tensor<1x1xf32, #blocked>
|
17 |
+
%cst_9 = arith.constant dense<2.560000e+02> : tensor<1x1xf32, #blocked>
|
18 |
+
%cst_10 = arith.constant dense<0.000000e+00> : tensor<1x256xbf16, #blocked>
|
19 |
+
%0 = tt.get_program_id x : i32
|
20 |
+
%1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
|
21 |
+
%2 = tt.expand_dims %1 {axis = 0 : i32} : (tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x256xi32, #blocked>
|
22 |
+
%3 = tt.addptr %arg0, %0 : !tt.ptr<i64, 1>, i32
|
23 |
+
%4 = tt.splat %3 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>, #blocked>
|
24 |
+
%5 = tt.splat %3 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>, #blocked1>
|
25 |
+
%6 = tt.load %4 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x1xi64, #blocked>
|
26 |
+
%7 = tt.load %5 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x1xi64, #blocked1>
|
27 |
+
%8 = arith.remsi %0, %c512_i32 : i32
|
28 |
+
%9 = arith.cmpi slt, %2, %cst : tensor<1x256xi32, #blocked>
|
29 |
+
%10 = arith.muli %8, %c256_i32 : i32
|
30 |
+
%11 = tt.splat %10 : (i32) -> tensor<1x256xi32, #blocked>
|
31 |
+
%12 = arith.addi %2, %11 : tensor<1x256xi32, #blocked>
|
32 |
+
%13 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<1x256x!tt.ptr<f32, 1>, #blocked>
|
33 |
+
%14 = tt.addptr %13, %12 : tensor<1x256x!tt.ptr<f32, 1>, #blocked>, tensor<1x256xi32, #blocked>
|
34 |
+
%15 = tt.load %14, %9, %cst_0 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32, #blocked>
|
35 |
+
%16 = arith.muli %0, %c256_i32 : i32
|
36 |
+
%17 = tt.splat %16 : (i32) -> tensor<1x256xi32, #blocked>
|
37 |
+
%18 = arith.addi %2, %17 : tensor<1x256xi32, #blocked>
|
38 |
+
%19 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<1x256x!tt.ptr<bf16, 1>, #blocked>
|
39 |
+
%20 = tt.addptr %19, %18 : tensor<1x256x!tt.ptr<bf16, 1>, #blocked>, tensor<1x256xi32, #blocked>
|
40 |
+
%21 = tt.load %20, %9, %cst_10 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xbf16, #blocked>
|
41 |
+
%22 = arith.extf %21 : tensor<1x256xbf16, #blocked> to tensor<1x256xf32, #blocked>
|
42 |
+
%23 = arith.addi %6, %cst_3 : tensor<1x1xi64, #blocked>
|
43 |
+
%24 = arith.addi %7, %cst_6 : tensor<1x1xi64, #blocked1>
|
44 |
+
%25 = arith.cmpi slt, %6, %cst_4 : tensor<1x1xi64, #blocked>
|
45 |
+
%26 = arith.cmpi slt, %7, %cst_5 : tensor<1x1xi64, #blocked1>
|
46 |
+
%27 = arith.select %25, %23, %6 : tensor<1x1xi1, #blocked>, tensor<1x1xi64, #blocked>
|
47 |
+
%28 = arith.select %26, %24, %7 : tensor<1x1xi1, #blocked1>, tensor<1x1xi64, #blocked1>
|
48 |
+
%29 = arith.cmpi sge, %28, %cst_5 : tensor<1x1xi64, #blocked1>
|
49 |
+
%30 = arith.cmpi slt, %28, %cst_6 : tensor<1x1xi64, #blocked1>
|
50 |
+
%31 = arith.andi %29, %30 : tensor<1x1xi1, #blocked1>
|
51 |
+
tt.assert %31, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<1x1xi1, #blocked1>
|
52 |
+
%32 = arith.muli %27, %cst_2 : tensor<1x1xi64, #blocked>
|
53 |
+
%33 = tt.broadcast %32 : (tensor<1x1xi64, #blocked>) -> tensor<1x256xi64, #blocked>
|
54 |
+
%34 = arith.extsi %2 : tensor<1x256xi32, #blocked> to tensor<1x256xi64, #blocked>
|
55 |
+
%35 = arith.addi %34, %33 : tensor<1x256xi64, #blocked>
|
56 |
+
%36 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<1x256x!tt.ptr<f32, 1>, #blocked>
|
57 |
+
%37 = tt.addptr %36, %35 : tensor<1x256x!tt.ptr<f32, 1>, #blocked>, tensor<1x256xi64, #blocked>
|
58 |
+
%38 = tt.load %37, %9, %cst_0 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32, #blocked>
|
59 |
+
%39 = arith.addf %38, %15 : tensor<1x256xf32, #blocked>
|
60 |
+
%40 = arith.addf %39, %22 : tensor<1x256xf32, #blocked>
|
61 |
+
%41 = arith.addf %40, %cst_0 : tensor<1x256xf32, #blocked>
|
62 |
+
%42 = arith.subf %40, %41 : tensor<1x256xf32, #blocked>
|
63 |
+
%43 = arith.mulf %40, %42 : tensor<1x256xf32, #blocked>
|
64 |
+
%44 = arith.addf %43, %cst_0 : tensor<1x256xf32, #blocked>
|
65 |
+
%45 = arith.select %9, %41, %cst_0 : tensor<1x256xi1, #blocked>, tensor<1x256xf32, #blocked>
|
66 |
+
%46 = arith.select %9, %44, %cst_0 : tensor<1x256xi1, #blocked>, tensor<1x256xf32, #blocked>
|
67 |
+
%47 = arith.select %9, %cst_1, %cst_0 : tensor<1x256xi1, #blocked>, tensor<1x256xf32, #blocked>
|
68 |
+
%48:3 = "tt.reduce"(%45, %46, %47) <{axis = 1 : i32}> ({
|
69 |
+
^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32):
|
70 |
+
%71 = arith.subf %arg11, %arg8 : f32
|
71 |
+
%72 = arith.addf %arg10, %arg13 : f32
|
72 |
+
%73 = arith.cmpf oeq, %72, %cst_7 : f32
|
73 |
+
%74 = arith.divf %arg13, %72 : f32
|
74 |
+
%75 = arith.select %73, %cst_7, %74 : f32
|
75 |
+
%76 = arith.mulf %71, %75 : f32
|
76 |
+
%77 = arith.addf %arg8, %76 : f32
|
77 |
+
%78 = arith.addf %arg9, %arg12 : f32
|
78 |
+
%79 = arith.mulf %71, %71 : f32
|
79 |
+
%80 = arith.mulf %79, %arg10 : f32
|
80 |
+
%81 = arith.mulf %80, %75 : f32
|
81 |
+
%82 = arith.addf %78, %81 : f32
|
82 |
+
tt.reduce.return %77, %82, %72 : f32, f32, f32
|
83 |
+
}) : (tensor<1x256xf32, #blocked>, tensor<1x256xf32, #blocked>, tensor<1x256xf32, #blocked>) -> (tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>)
|
84 |
+
%49 = tt.expand_dims %48#0 {axis = 1 : i32} : (tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<1x1xf32, #blocked>
|
85 |
+
%50 = tt.expand_dims %48#1 {axis = 1 : i32} : (tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<1x1xf32, #blocked>
|
86 |
+
%51 = tt.load %14, %9, %cst_0 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32, #blocked>
|
87 |
+
%52 = tt.load %20, %9, %cst_10 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x256xbf16, #blocked>
|
88 |
+
%53 = arith.extf %52 : tensor<1x256xbf16, #blocked> to tensor<1x256xf32, #blocked>
|
89 |
+
%54 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<1x256x!tt.ptr<f32, 1>, #blocked>
|
90 |
+
%55 = tt.addptr %54, %2 : tensor<1x256x!tt.ptr<f32, 1>, #blocked>, tensor<1x256xi32, #blocked>
|
91 |
+
%56 = tt.load %55, %9, %cst_0 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32, #blocked>
|
92 |
+
tt.assert %31, "index out of bounds: 0 <= tmp16 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<1x1xi1, #blocked1>
|
93 |
+
%57 = tt.load %37, %9, %cst_0 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x256xf32, #blocked>
|
94 |
+
%58 = arith.addf %57, %51 : tensor<1x256xf32, #blocked>
|
95 |
+
%59 = arith.addf %58, %53 : tensor<1x256xf32, #blocked>
|
96 |
+
%60 = tt.broadcast %49 : (tensor<1x1xf32, #blocked>) -> tensor<1x256xf32, #blocked>
|
97 |
+
%61 = arith.subf %59, %60 : tensor<1x256xf32, #blocked>
|
98 |
+
%62 = arith.divf %50, %cst_9 : tensor<1x1xf32, #blocked>
|
99 |
+
%63 = arith.addf %62, %cst_8 : tensor<1x1xf32, #blocked>
|
100 |
+
%64 = tt.extern_elementwise %63 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32, #blocked>) -> tensor<1x1xf32, #blocked>
|
101 |
+
%65 = tt.broadcast %64 : (tensor<1x1xf32, #blocked>) -> tensor<1x256xf32, #blocked>
|
102 |
+
%66 = arith.mulf %61, %65 : tensor<1x256xf32, #blocked>
|
103 |
+
%67 = arith.mulf %66, %56 : tensor<1x256xf32, #blocked>
|
104 |
+
%68 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<1x256x!tt.ptr<bf16, 1>, #blocked>
|
105 |
+
%69 = tt.addptr %68, %18 : tensor<1x256x!tt.ptr<bf16, 1>, #blocked>, tensor<1x256xi32, #blocked>
|
106 |
+
%70 = arith.truncf %67 : tensor<1x256xf32, #blocked> to tensor<1x256xbf16, #blocked>
|
107 |
+
tt.store %69, %70, %9 {cache = 1 : i32, evict = 1 : i32} : tensor<1x256xbf16, #blocked>
|
108 |
+
tt.return
|
109 |
+
}
|
110 |
+
}
|
.triton/dump/305a9479aab997a3a16bfe46bb303a50/triton_.ttir
ADDED
@@ -0,0 +1,101 @@
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
3 |
+
%c512_i32 = arith.constant 512 : i32
|
4 |
+
%c256_i32 = arith.constant 256 : i32
|
5 |
+
%cst = arith.constant dense<0.000000e+00> : tensor<1x256xbf16>
|
6 |
+
%cst_0 = arith.constant dense<1.000000e+00> : tensor<1x256xf32>
|
7 |
+
%cst_1 = arith.constant 0.000000e+00 : f32
|
8 |
+
%cst_2 = arith.constant dense<256> : tensor<1x1xi64>
|
9 |
+
%cst_3 = arith.constant dense<50257> : tensor<1x1xi64>
|
10 |
+
%cst_4 = arith.constant dense<0> : tensor<1x1xi64>
|
11 |
+
%cst_5 = arith.constant dense<9.99999974E-6> : tensor<1x1xf32>
|
12 |
+
%cst_6 = arith.constant dense<2.560000e+02> : tensor<1x1xf32>
|
13 |
+
%cst_7 = arith.constant dense<0.000000e+00> : tensor<1x256xf32>
|
14 |
+
%cst_8 = arith.constant dense<256> : tensor<1x256xi32>
|
15 |
+
%0 = tt.get_program_id x : i32
|
16 |
+
%1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
|
17 |
+
%2 = tt.expand_dims %1 {axis = 0 : i32} : (tensor<256xi32>) -> tensor<1x256xi32>
|
18 |
+
%3 = tt.addptr %arg0, %0 : !tt.ptr<i64, 1>, i32
|
19 |
+
%4 = tt.splat %3 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>>
|
20 |
+
%5 = tt.load %4 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x1xi64>
|
21 |
+
%6 = arith.remsi %0, %c512_i32 : i32
|
22 |
+
%7 = arith.cmpi slt, %2, %cst_8 : tensor<1x256xi32>
|
23 |
+
%8 = arith.muli %6, %c256_i32 : i32
|
24 |
+
%9 = tt.splat %8 : (i32) -> tensor<1x256xi32>
|
25 |
+
%10 = arith.addi %2, %9 : tensor<1x256xi32>
|
26 |
+
%11 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<1x256x!tt.ptr<f32, 1>>
|
27 |
+
%12 = tt.addptr %11, %10 : tensor<1x256x!tt.ptr<f32, 1>>, tensor<1x256xi32>
|
28 |
+
%13 = tt.load %12, %7, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32>
|
29 |
+
%14 = arith.muli %0, %c256_i32 : i32
|
30 |
+
%15 = tt.splat %14 : (i32) -> tensor<1x256xi32>
|
31 |
+
%16 = arith.addi %2, %15 : tensor<1x256xi32>
|
32 |
+
%17 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<1x256x!tt.ptr<bf16, 1>>
|
33 |
+
%18 = tt.addptr %17, %16 : tensor<1x256x!tt.ptr<bf16, 1>>, tensor<1x256xi32>
|
34 |
+
%19 = tt.load %18, %7, %cst {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xbf16>
|
35 |
+
%20 = arith.extf %19 : tensor<1x256xbf16> to tensor<1x256xf32>
|
36 |
+
%21 = arith.addi %5, %cst_3 : tensor<1x1xi64>
|
37 |
+
%22 = arith.cmpi slt, %5, %cst_4 : tensor<1x1xi64>
|
38 |
+
%23 = arith.select %22, %21, %5 : tensor<1x1xi1>, tensor<1x1xi64>
|
39 |
+
%24 = arith.cmpi sge, %23, %cst_4 : tensor<1x1xi64>
|
40 |
+
%25 = arith.cmpi slt, %23, %cst_3 : tensor<1x1xi64>
|
41 |
+
%26 = arith.andi %24, %25 : tensor<1x1xi1>
|
42 |
+
tt.assert %26, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<1x1xi1>
|
43 |
+
%27 = arith.muli %23, %cst_2 : tensor<1x1xi64>
|
44 |
+
%28 = tt.broadcast %27 : (tensor<1x1xi64>) -> tensor<1x256xi64>
|
45 |
+
%29 = arith.extsi %2 : tensor<1x256xi32> to tensor<1x256xi64>
|
46 |
+
%30 = arith.addi %29, %28 : tensor<1x256xi64>
|
47 |
+
%31 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<1x256x!tt.ptr<f32, 1>>
|
48 |
+
%32 = tt.addptr %31, %30 : tensor<1x256x!tt.ptr<f32, 1>>, tensor<1x256xi64>
|
49 |
+
%33 = tt.load %32, %7, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32>
|
50 |
+
%34 = arith.addf %33, %13 : tensor<1x256xf32>
|
51 |
+
%35 = arith.addf %34, %20 : tensor<1x256xf32>
|
52 |
+
%36 = arith.addf %35, %cst_7 : tensor<1x256xf32>
|
53 |
+
%37 = arith.subf %35, %36 : tensor<1x256xf32>
|
54 |
+
%38 = arith.mulf %35, %37 : tensor<1x256xf32>
|
55 |
+
%39 = arith.addf %38, %cst_7 : tensor<1x256xf32>
|
56 |
+
%40 = arith.select %7, %36, %cst_7 : tensor<1x256xi1>, tensor<1x256xf32>
|
57 |
+
%41 = arith.select %7, %39, %cst_7 : tensor<1x256xi1>, tensor<1x256xf32>
|
58 |
+
%42 = arith.select %7, %cst_0, %cst_7 : tensor<1x256xi1>, tensor<1x256xf32>
|
59 |
+
%43:3 = "tt.reduce"(%40, %41, %42) <{axis = 1 : i32}> ({
|
60 |
+
^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32):
|
61 |
+
%66 = arith.subf %arg11, %arg8 : f32
|
62 |
+
%67 = arith.addf %arg10, %arg13 : f32
|
63 |
+
%68 = arith.cmpf oeq, %67, %cst_1 : f32
|
64 |
+
%69 = arith.divf %arg13, %67 : f32
|
65 |
+
%70 = arith.select %68, %cst_1, %69 : f32
|
66 |
+
%71 = arith.mulf %66, %70 : f32
|
67 |
+
%72 = arith.addf %arg8, %71 : f32
|
68 |
+
%73 = arith.addf %arg9, %arg12 : f32
|
69 |
+
%74 = arith.mulf %66, %66 : f32
|
70 |
+
%75 = arith.mulf %74, %arg10 : f32
|
71 |
+
%76 = arith.mulf %75, %70 : f32
|
72 |
+
%77 = arith.addf %73, %76 : f32
|
73 |
+
tt.reduce.return %72, %77, %67 : f32, f32, f32
|
74 |
+
}) : (tensor<1x256xf32>, tensor<1x256xf32>, tensor<1x256xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>)
|
75 |
+
%44 = tt.expand_dims %43#0 {axis = 1 : i32} : (tensor<1xf32>) -> tensor<1x1xf32>
|
76 |
+
%45 = tt.expand_dims %43#1 {axis = 1 : i32} : (tensor<1xf32>) -> tensor<1x1xf32>
|
77 |
+
%46 = tt.load %12, %7, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32>
|
78 |
+
%47 = tt.load %18, %7, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x256xbf16>
|
79 |
+
%48 = arith.extf %47 : tensor<1x256xbf16> to tensor<1x256xf32>
|
80 |
+
%49 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<1x256x!tt.ptr<f32, 1>>
|
81 |
+
%50 = tt.addptr %49, %2 : tensor<1x256x!tt.ptr<f32, 1>>, tensor<1x256xi32>
|
82 |
+
%51 = tt.load %50, %7, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32>
|
83 |
+
tt.assert %26, "index out of bounds: 0 <= tmp16 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<1x1xi1>
|
84 |
+
%52 = tt.load %32, %7, %cst_7 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x256xf32>
|
85 |
+
%53 = arith.addf %52, %46 : tensor<1x256xf32>
|
86 |
+
%54 = arith.addf %53, %48 : tensor<1x256xf32>
|
87 |
+
%55 = tt.broadcast %44 : (tensor<1x1xf32>) -> tensor<1x256xf32>
|
88 |
+
%56 = arith.subf %54, %55 : tensor<1x256xf32>
|
89 |
+
%57 = arith.divf %45, %cst_6 : tensor<1x1xf32>
|
90 |
+
%58 = arith.addf %57, %cst_5 : tensor<1x1xf32>
|
91 |
+
%59 = tt.extern_elementwise %58 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32>) -> tensor<1x1xf32>
|
92 |
+
%60 = tt.broadcast %59 : (tensor<1x1xf32>) -> tensor<1x256xf32>
|
93 |
+
%61 = arith.mulf %56, %60 : tensor<1x256xf32>
|
94 |
+
%62 = arith.mulf %61, %51 : tensor<1x256xf32>
|
95 |
+
%63 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<1x256x!tt.ptr<bf16, 1>>
|
96 |
+
%64 = tt.addptr %63, %16 : tensor<1x256x!tt.ptr<bf16, 1>>, tensor<1x256xi32>
|
97 |
+
%65 = arith.truncf %62 : tensor<1x256xf32> to tensor<1x256xbf16>
|
98 |
+
tt.store %64, %65, %7 {cache = 1 : i32, evict = 1 : i32} : tensor<1x256xbf16>
|
99 |
+
tt.return
|
100 |
+
}
|
101 |
+
}
|
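Note: the three-way `tt.reduce` combine region in the TTGIR/TTIR above (operating on running mean, M2, and weight) is the standard Welford/Chan parallel-variance merge. A minimal Python sketch of the same arithmetic, for reference only (function and argument names are hypothetical, not part of the dump):

```python
def welford_combine(mean_a, m2_a, w_a, mean_b, m2_b, w_b):
    # Mirrors the tt.reduce region: delta = mean_b - mean_a, new weight = w_a + w_b,
    # with the divide guarded against a zero combined weight (the arith.select).
    delta = mean_b - mean_a
    w = w_a + w_b
    frac = 0.0 if w == 0.0 else w_b / w
    mean = mean_a + delta * frac            # %72 in the TTIR region
    m2 = m2_a + m2_b + delta * delta * w_a * frac  # %77 in the TTIR region
    return mean, m2, w                      # values returned by tt.reduce.return
```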
.triton/dump/473cf6e25c3e63117cd59fc0ed04b89f/triton_.llir
ADDED
@@ -0,0 +1,550 @@
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
@assertFunc_1 = internal constant [25 x i8] c"_call_with_frames_removed"
|
5 |
+
@assertFile_1 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
|
6 |
+
@assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp16 < 50257"
|
7 |
+
@assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed"
|
8 |
+
@assertFile_0 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
|
9 |
+
@assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257"
|
10 |
+
@global_smem = external addrspace(3) global [0 x i8]
|
11 |
+
@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
|
12 |
+
|
13 |
+
declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr
|
14 |
+
|
15 |
+
define void @triton__0d1d2d3d4d5d6de7de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, i32 %6, i32 %7) local_unnamed_addr !dbg !7 {
|
16 |
+
%9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
|
17 |
+
%10 = and i32 %9, 31, !dbg !10
|
18 |
+
%11 = lshr i32 %9, 5, !dbg !10
|
19 |
+
%12 = lshr i32 %9, 6, !dbg !10
|
20 |
+
%13 = and i32 %12, 1, !dbg !10
|
21 |
+
%14 = and i32 %9, 1, !dbg !10
|
22 |
+
%15 = and i32 %11, 1, !dbg !11
|
23 |
+
%urem = shl i32 %9, 2, !dbg !11
|
24 |
+
%16 = and i32 %urem, 252, !dbg !11
|
25 |
+
%17 = shl i32 %9, 1, !dbg !11
|
26 |
+
%18 = and i32 %17, 254, !dbg !11
|
27 |
+
%19 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !12
|
28 |
+
%20 = shl i32 %19, 1, !dbg !13
|
29 |
+
%21 = or i32 %20, %13, !dbg !14
|
30 |
+
%22 = or i32 %20, %14, !dbg !14
|
31 |
+
%23 = sext i32 %21 to i64, !dbg !15
|
32 |
+
%24 = getelementptr i64, ptr addrspace(1) %0, i64 %23, !dbg !15
|
33 |
+
%25 = sext i32 %22 to i64, !dbg !15
|
34 |
+
%26 = getelementptr i64, ptr addrspace(1) %0, i64 %25, !dbg !15
|
35 |
+
%27 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16
|
36 |
+
%28 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16
|
37 |
+
%29 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16
|
38 |
+
%30 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16
|
39 |
+
%31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %26, i1 true) #6, !dbg !16
|
40 |
+
%32 = srem i32 %21, 512, !dbg !17
|
41 |
+
%33 = shl nsw i32 %32, 8, !dbg !18
|
42 |
+
%34 = or i32 %33, %16, !dbg !19
|
43 |
+
%35 = sext i32 %34 to i64, !dbg !20
|
44 |
+
%36 = getelementptr float, ptr addrspace(1) %2, i64 %35, !dbg !20
|
45 |
+
%37 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %36, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !21
|
46 |
+
%38 = extractvalue { i32, i32, i32, i32 } %37, 0, !dbg !21
|
47 |
+
%39 = extractvalue { i32, i32, i32, i32 } %37, 1, !dbg !21
|
48 |
+
%40 = extractvalue { i32, i32, i32, i32 } %37, 2, !dbg !21
|
49 |
+
%41 = extractvalue { i32, i32, i32, i32 } %37, 3, !dbg !21
|
50 |
+
%42 = insertelement <2 x i32> poison, i32 %39, i64 0, !dbg !21
|
51 |
+
%43 = insertelement <2 x i32> %42, i32 %38, i64 1, !dbg !21
|
52 |
+
%44 = bitcast <2 x i32> %43 to <2 x float>, !dbg !21
|
53 |
+
%45 = bitcast i32 %40 to float, !dbg !21
|
54 |
+
%46 = bitcast i32 %41 to float, !dbg !21
|
55 |
+
%47 = shl i32 %21, 8, !dbg !22
|
56 |
+
%48 = or i32 %47, %16, !dbg !23
|
57 |
+
%49 = sext i32 %48 to i64, !dbg !24
|
58 |
+
%50 = getelementptr i16, ptr addrspace(1) %3, i64 %49, !dbg !24
|
59 |
+
%51 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_last.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %50, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !25
|
60 |
+
%52 = extractvalue { i32, i32 } %51, 0, !dbg !25
|
61 |
+
%53 = extractvalue { i32, i32 } %51, 1, !dbg !25
|
62 |
+
%54 = trunc i32 %52 to i16, !dbg !25
|
63 |
+
%extelt.offset = lshr i32 %52, 16, !dbg !25
|
64 |
+
%55 = trunc i32 %extelt.offset to i16, !dbg !25
|
65 |
+
%56 = trunc i32 %53 to i16, !dbg !25
|
66 |
+
%extelt.offset1 = lshr i32 %53, 16, !dbg !25
|
67 |
+
%57 = trunc i32 %extelt.offset1 to i16, !dbg !25
|
68 |
+
%58 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %54) #6, !dbg !26
|
69 |
+
%59 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %55) #6, !dbg !26
|
70 |
+
%60 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %56) #6, !dbg !26
|
71 |
+
%61 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %57) #6, !dbg !26
|
72 |
+
%62 = add i64 %31, 50257, !dbg !27
|
73 |
+
%63 = icmp slt i64 %27, 0, !dbg !28
|
74 |
+
%64 = icmp slt i64 %31, 0, !dbg !28
|
75 |
+
%65 = select i1 %64, i64 %62, i64 %31, !dbg !29
|
76 |
+
%66 = icmp ugt i64 %65, 50256, !dbg !30
|
77 |
+
br i1 %66, label %67, label %68, !dbg !31
|
78 |
+
|
79 |
+
67: ; preds = %8
|
80 |
+
tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !31
|
81 |
+
br label %68, !dbg !31
|
82 |
+
|
83 |
+
68: ; preds = %67, %8
|
84 |
+
%69 = shl i64 %27, 8, !dbg !32
|
85 |
+
%70 = add i64 %69, 12865792, !dbg !32
|
86 |
+
%71 = select i1 %63, i64 %70, i64 %69, !dbg !32
|
87 |
+
%72 = zext nneg i32 %16 to i64
|
88 |
+
%73 = or i64 %71, %72, !dbg !33
|
89 |
+
%74 = getelementptr float, ptr addrspace(1) %1, i64 %73, !dbg !34
|
90 |
+
%75 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %74, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !35
|
91 |
+
%76 = extractvalue { i32, i32, i32, i32 } %75, 0, !dbg !35
|
92 |
+
%77 = extractvalue { i32, i32, i32, i32 } %75, 1, !dbg !35
|
93 |
+
%78 = extractvalue { i32, i32, i32, i32 } %75, 2, !dbg !35
|
94 |
+
%79 = extractvalue { i32, i32, i32, i32 } %75, 3, !dbg !35
|
95 |
+
%80 = bitcast i32 %78 to float, !dbg !35
|
96 |
+
%81 = bitcast i32 %79 to float, !dbg !35
|
97 |
+
%82 = fadd float %45, %80, !dbg !36
|
98 |
+
%83 = fadd float %46, %81, !dbg !36
|
99 |
+
%84 = fadd float %60, %82, !dbg !37
|
100 |
+
%85 = fadd float %61, %83, !dbg !37
|
101 |
+
%86 = insertelement <2 x i32> poison, i32 %77, i64 0, !dbg !35
|
102 |
+
%87 = insertelement <2 x i32> %86, i32 %76, i64 1, !dbg !35
|
103 |
+
%88 = bitcast <2 x i32> %87 to <2 x float>, !dbg !35
|
104 |
+
%89 = fadd <2 x float> %44, %88, !dbg !36
|
105 |
+
%90 = insertelement <2 x float> poison, float %59, i64 0, !dbg !37
|
106 |
+
%91 = insertelement <2 x float> %90, float %58, i64 1, !dbg !37
|
107 |
+
%92 = fadd <2 x float> %91, %89, !dbg !37
|
108 |
+
%93 = fadd <2 x float> %92, zeroinitializer, !dbg !38
|
109 |
+
%94 = fadd float %84, 0.000000e+00, !dbg !38
|
110 |
+
%95 = fadd float %85, 0.000000e+00, !dbg !38
|
111 |
+
%96 = extractelement <2 x float> %93, i64 1, !dbg !42
|
112 |
+
%97 = extractelement <2 x float> %92, i64 1, !dbg !46
|
113 |
+
%98 = fsub float %97, %96, !dbg !47
|
114 |
+
%99 = extractelement <2 x float> %93, i64 0, !dbg !42
|
115 |
+
%100 = extractelement <2 x float> %92, i64 0, !dbg !46
|
116 |
+
%101 = fsub float %100, %99, !dbg !47
|
117 |
+
%102 = fsub float %84, %94, !dbg !47
|
118 |
+
%103 = fsub float %85, %95, !dbg !47
|
119 |
+
%104 = fmul float %97, %98, !dbg !46
|
120 |
+
%105 = fmul float %100, %101, !dbg !46
|
121 |
+
%106 = fmul float %84, %102, !dbg !46
|
122 |
+
%107 = fmul float %85, %103, !dbg !46
|
123 |
+
%108 = fadd float %104, 0.000000e+00, !dbg !48
|
124 |
+
%109 = fadd float %105, 0.000000e+00, !dbg !48
|
125 |
+
%110 = fadd float %106, 0.000000e+00, !dbg !48
|
126 |
+
%111 = fadd float %107, 0.000000e+00, !dbg !48
|
127 |
+
%112 = fsub float %99, %96, !dbg !42
|
128 |
+
%113 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 2.000000e+00) #6, !dbg !49
|
129 |
+
%114 = fmul float %113, %112, !dbg !50
|
130 |
+
%115 = fadd float %96, %114, !dbg !51
|
131 |
+
%116 = fadd float %108, %109, !dbg !52
|
132 |
+
%117 = fmul float %112, %112, !dbg !53
|
133 |
+
%118 = fmul float %113, %117, !dbg !54
|
134 |
+
%119 = fadd float %118, %116, !dbg !55
|
135 |
+
%120 = fsub float %94, %115, !dbg !42
|
136 |
+
%121 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 3.000000e+00) #6, !dbg !49
|
137 |
+
%122 = fmul float %121, %120, !dbg !50
|
138 |
+
%123 = fadd float %115, %122, !dbg !51
|
139 |
+
%124 = fadd float %110, %119, !dbg !52
|
140 |
+
%125 = fmul float %120, %120, !dbg !53
|
141 |
+
%126 = fmul float %125, 2.000000e+00, !dbg !56
|
142 |
+
%127 = fmul float %121, %126, !dbg !54
|
143 |
+
%128 = fadd float %124, %127, !dbg !55
|
144 |
+
%129 = fsub float %95, %123, !dbg !42
|
145 |
+
%130 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 4.000000e+00) #6, !dbg !49
|
146 |
+
%131 = fmul float %130, %129, !dbg !50
|
147 |
+
%132 = fadd float %123, %131, !dbg !51
|
148 |
+
%133 = fadd float %111, %128, !dbg !52
|
149 |
+
%134 = fmul float %129, %129, !dbg !53
|
150 |
+
%135 = fmul float %134, 3.000000e+00, !dbg !56
|
151 |
+
%136 = fmul float %130, %135, !dbg !54
|
152 |
+
%137 = fadd float %133, %136, !dbg !55
|
153 |
+
%138 = bitcast float %132 to i32, !dbg !57
|
154 |
+
%139 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %138, i32 16, i32 31), !dbg !57
|
155 |
+
%140 = bitcast i32 %139 to float, !dbg !57
|
156 |
+
%141 = bitcast float %137 to i32, !dbg !57
|
157 |
+
%142 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %141, i32 16, i32 31), !dbg !57
|
158 |
+
%143 = bitcast i32 %142 to float, !dbg !57
|
159 |
+
%144 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 1082130432, i32 16, i32 31), !dbg !57
|
160 |
+
%145 = bitcast i32 %144 to float, !dbg !57
|
161 |
+
%146 = fsub float %140, %132, !dbg !42
|
162 |
+
%147 = fadd float %145, 4.000000e+00, !dbg !59
|
163 |
+
%148 = fcmp oeq float %147, 0.000000e+00, !dbg !60
|
164 |
+
%149 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %145, float %147) #6, !dbg !49
|
165 |
+
%150 = select i1 %148, float 0.000000e+00, float %149, !dbg !61
|
166 |
+
%151 = fmul float %150, %146, !dbg !50
|
167 |
+
%152 = fadd float %132, %151, !dbg !51
|
168 |
+
%153 = fadd float %137, %143, !dbg !52
|
169 |
+
%154 = fmul float %146, %146, !dbg !53
|
170 |
+
%155 = fmul float %154, 4.000000e+00, !dbg !56
|
171 |
+
%156 = fmul float %150, %155, !dbg !54
|
172 |
+
%157 = fadd float %153, %156, !dbg !55
|
173 |
+
%158 = bitcast float %152 to i32, !dbg !57
|
174 |
+
%159 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %158, i32 8, i32 31), !dbg !57
|
175 |
+
%160 = bitcast i32 %159 to float, !dbg !57
|
176 |
+
%161 = bitcast float %157 to i32, !dbg !57
|
177 |
+
%162 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %161, i32 8, i32 31), !dbg !57
|
178 |
+
%163 = bitcast i32 %162 to float, !dbg !57
|
179 |
+
%164 = bitcast float %147 to i32, !dbg !57
|
180 |
+
%165 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %164, i32 8, i32 31), !dbg !57
|
181 |
+
%166 = bitcast i32 %165 to float, !dbg !57
|
182 |
+
%167 = fsub float %160, %152, !dbg !42
|
183 |
+
%168 = fadd float %147, %166, !dbg !59
|
184 |
+
%169 = fcmp oeq float %168, 0.000000e+00, !dbg !60
|
185 |
+
%170 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %166, float %168) #6, !dbg !49
|
186 |
+
%171 = select i1 %169, float 0.000000e+00, float %170, !dbg !61
|
187 |
+
%172 = fmul float %171, %167, !dbg !50
|
188 |
+
%173 = fadd float %152, %172, !dbg !51
|
189 |
+
%174 = fadd float %157, %163, !dbg !52
|
190 |
+
%175 = fmul float %167, %167, !dbg !53
|
191 |
+
%176 = fmul float %147, %175, !dbg !56
|
192 |
+
%177 = fmul float %171, %176, !dbg !54
|
193 |
+
%178 = fadd float %174, %177, !dbg !55
|
194 |
+
%179 = bitcast float %173 to i32, !dbg !57
|
195 |
+
%180 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %179, i32 4, i32 31), !dbg !57
|
196 |
+
%181 = bitcast i32 %180 to float, !dbg !57
|
197 |
+
%182 = bitcast float %178 to i32, !dbg !57
|
198 |
+
%183 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %182, i32 4, i32 31), !dbg !57
|
199 |
+
%184 = bitcast i32 %183 to float, !dbg !57
|
200 |
+
%185 = bitcast float %168 to i32, !dbg !57
|
201 |
+
%186 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %185, i32 4, i32 31), !dbg !57
|
202 |
+
%187 = bitcast i32 %186 to float, !dbg !57
|
203 |
+
%188 = fsub float %181, %173, !dbg !42
|
204 |
+
%189 = fadd float %168, %187, !dbg !59
|
205 |
+
%190 = fcmp oeq float %189, 0.000000e+00, !dbg !60
|
206 |
+
%191 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %187, float %189) #6, !dbg !49
|
207 |
+
%192 = select i1 %190, float 0.000000e+00, float %191, !dbg !61
|
208 |
+
%193 = fmul float %192, %188, !dbg !50
|
209 |
+
%194 = fadd float %173, %193, !dbg !51
|
210 |
+
%195 = fadd float %178, %184, !dbg !52
|
211 |
+
%196 = fmul float %188, %188, !dbg !53
|
212 |
+
%197 = fmul float %168, %196, !dbg !56
|
213 |
+
%198 = fmul float %192, %197, !dbg !54
|
214 |
+
%199 = fadd float %195, %198, !dbg !55
|
215 |
+
%200 = bitcast float %194 to i32, !dbg !57
|
216 |
+
%201 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %200, i32 2, i32 31), !dbg !57
|
217 |
+
%202 = bitcast i32 %201 to float, !dbg !57
|
218 |
+
%203 = bitcast float %199 to i32, !dbg !57
|
219 |
+
%204 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %203, i32 2, i32 31), !dbg !57
|
220 |
+
%205 = bitcast i32 %204 to float, !dbg !57
|
221 |
+
%206 = bitcast float %189 to i32, !dbg !57
|
222 |
+
%207 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %206, i32 2, i32 31), !dbg !57
|
223 |
+
%208 = bitcast i32 %207 to float, !dbg !57
|
224 |
+
%209 = fsub float %202, %194, !dbg !42
|
225 |
+
%210 = fadd float %189, %208, !dbg !59
|
226 |
+
%211 = fcmp oeq float %210, 0.000000e+00, !dbg !60
|
227 |
+
%212 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %208, float %210) #6, !dbg !49
|
228 |
+
%213 = select i1 %211, float 0.000000e+00, float %212, !dbg !61
|
229 |
+
%214 = fmul float %213, %209, !dbg !50
|
230 |
+
%215 = fadd float %194, %214, !dbg !51
|
231 |
+
%216 = fadd float %199, %205, !dbg !52
|
232 |
+
%217 = fmul float %209, %209, !dbg !53
|
233 |
+
%218 = fmul float %189, %217, !dbg !56
|
234 |
+
%219 = fmul float %213, %218, !dbg !54
|
235 |
+
%220 = fadd float %216, %219, !dbg !55
|
236 |
+
%221 = bitcast float %215 to i32, !dbg !57
|
237 |
+
%222 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %221, i32 1, i32 31), !dbg !57
|
238 |
+
%223 = bitcast i32 %222 to float, !dbg !57
|
239 |
+
%224 = bitcast float %220 to i32, !dbg !57
|
240 |
+
%225 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %224, i32 1, i32 31), !dbg !57
|
241 |
+
%226 = bitcast i32 %225 to float, !dbg !57
|
242 |
+
%227 = bitcast float %210 to i32, !dbg !57
|
243 |
+
%228 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %227, i32 1, i32 31), !dbg !57
|
244 |
+
%229 = bitcast i32 %228 to float, !dbg !57
|
245 |
+
%230 = fsub float %223, %215, !dbg !42
|
246 |
+
%231 = fadd float %210, %229, !dbg !59
|
247 |
+
%232 = fcmp oeq float %231, 0.000000e+00, !dbg !60
|
248 |
+
%233 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %229, float %231) #6, !dbg !49
|
249 |
+
%234 = select i1 %232, float 0.000000e+00, float %233, !dbg !61
|
250 |
+
%235 = fmul float %234, %230, !dbg !50
|
251 |
+
%236 = fadd float %215, %235, !dbg !51
|
252 |
+
%237 = fadd float %220, %226, !dbg !52
|
253 |
+
%238 = fmul float %230, %230, !dbg !53
|
254 |
+
%239 = fmul float %210, %238, !dbg !56
|
255 |
+
%240 = fmul float %234, %239, !dbg !54
|
256 |
+
%241 = fadd float %237, %240, !dbg !55
|
257 |
+
%242 = icmp eq i32 %10, 0, !dbg !57
|
258 |
+
%243 = shl nuw nsw i32 %13, 1, !dbg !57
|
259 |
+
%244 = or i32 %243, %15, !dbg !57
|
260 |
+
%245 = zext nneg i32 %244 to i64, !dbg !57
|
261 |
+
%246 = getelementptr float, ptr addrspace(3) @global_smem, i64 %245, !dbg !57
|
262 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %246, float %236, i1 %242) #6, !dbg !57
|
263 |
+
%247 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), i64 %245, !dbg !57
|
264 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %247, float %241, i1 %242) #6, !dbg !57
|
265 |
+
%248 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 32), i64 %245, !dbg !57
|
266 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %248, float %231, i1 %242) #6, !dbg !57
|
267 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !57
|
268 |
+
%249 = icmp slt i32 %9, 4, !dbg !57
|
269 |
+
%250 = sext i32 %9 to i64, !dbg !57
|
270 |
+
%251 = getelementptr float, ptr addrspace(3) @global_smem, i64 %250, !dbg !57
|
271 |
+
%252 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %251, i1 %249) #6, !dbg !57
|
272 |
+
%253 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), i64 %250, !dbg !57
|
273 |
+
%254 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %253, i1 %249) #6, !dbg !57
|
274 |
+
%255 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 32), i64 %250, !dbg !57
|
275 |
+
%256 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %255, i1 %249) #6, !dbg !57
|
276 |
+
%257 = bitcast float %252 to i32, !dbg !57
|
277 |
+
%258 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %257, i32 1, i32 31), !dbg !57
|
278 |
+
%259 = bitcast i32 %258 to float, !dbg !57
|
279 |
+
%260 = bitcast float %254 to i32, !dbg !57
|
280 |
+
%261 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %260, i32 1, i32 31), !dbg !57
|
281 |
+
%262 = bitcast i32 %261 to float, !dbg !57
|
282 |
+
%263 = bitcast float %256 to i32, !dbg !57
|
283 |
+
%264 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %263, i32 1, i32 31), !dbg !57
|
284 |
+
%265 = bitcast i32 %264 to float, !dbg !57
|
285 |
+
%266 = fsub float %259, %252, !dbg !42
|
286 |
+
%267 = fadd float %256, %265, !dbg !59
|
287 |
+
%268 = fcmp oeq float %267, 0.000000e+00, !dbg !60
|
288 |
+
%269 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %265, float %267) #6, !dbg !49
|
289 |
+
%270 = select i1 %268, float 0.000000e+00, float %269, !dbg !61
|
290 |
+
%271 = fmul float %266, %270, !dbg !50
|
291 |
+
%272 = fadd float %252, %271, !dbg !51
|
292 |
+
%273 = fadd float %254, %262, !dbg !52
|
293 |
+
%274 = fmul float %266, %266, !dbg !53
|
294 |
+
%275 = fmul float %256, %274, !dbg !56
|
295 |
+
%276 = fmul float %275, %270, !dbg !54
|
296 |
+
%277 = fadd float %273, %276, !dbg !55
|
297 |
+
%278 = icmp eq i32 %14, 0, !dbg !57
|
298 |
+
%279 = and i1 %249, %278, !dbg !57
|
299 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %251, float %272, i1 %279) #6, !dbg !57
|
300 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %253, float %277, i1 %279) #6, !dbg !57
|
301 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %255, float %267, i1 %279) #6, !dbg !57
|
302 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !57
|
303 |
+
%280 = zext nneg i32 %243 to i64, !dbg !57
|
304 |
+
%281 = getelementptr float, ptr addrspace(3) @global_smem, i64 %280, !dbg !57
|
305 |
+
%282 = load float, ptr addrspace(3) %281, align 4, !dbg !57
|
306 |
+
%283 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), i64 %280, !dbg !57
|
307 |
+
%284 = load float, ptr addrspace(3) %283, align 4, !dbg !57
|
308 |
+
%285 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %36, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !62
|
309 |
+
%286 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_first.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %50, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !63
|
310 |
+
%287 = extractvalue { i32, i32 } %286, 0, !dbg !63
|
311 |
+
%288 = extractvalue { i32, i32 } %286, 1, !dbg !63
|
312 |
+
%289 = trunc i32 %287 to i16, !dbg !63
|
313 |
+
%extelt.offset2 = lshr i32 %287, 16, !dbg !63
|
314 |
+
%290 = trunc i32 %extelt.offset2 to i16, !dbg !63
|
315 |
+
%291 = trunc i32 %288 to i16, !dbg !63
|
316 |
+
%extelt.offset3 = lshr i32 %288, 16, !dbg !63
|
317 |
+
%292 = trunc i32 %extelt.offset3 to i16, !dbg !63
|
318 |
+
%293 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %289) #6, !dbg !64
|
319 |
+
%294 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %290) #6, !dbg !64
|
320 |
+
%295 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %291) #6, !dbg !64
|
321 |
+
%296 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %292) #6, !dbg !64
|
322 |
+
%297 = zext nneg i32 %18 to i64, !dbg !65
|
323 |
+
%298 = getelementptr float, ptr addrspace(1) %4, i64 %297, !dbg !65
|
324 |
+
%299 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_last.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %298, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !66
|
325 |
+
br i1 %66, label %300, label %301, !dbg !67
|
326 |
+
|
327 |
+
300: ; preds = %68
|
328 |
+
tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 883, ptr nonnull @assertFunc_1, i64 1), !dbg !67
|
329 |
+
br label %301, !dbg !67
|
330 |
+
|
331 |
+
301: ; preds = %300, %68
|
332 |
+
%302 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %74, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68
|
333 |
+
%303 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %284, float 2.560000e+02) #6, !dbg !69
|
334 |
+
%304 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %284, float 2.560000e+02) #6, !dbg !69
|
335 |
+
%305 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %284, float 2.560000e+02) #6, !dbg !69
|
336 |
+
%306 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %284, float 2.560000e+02) #6, !dbg !69
|
337 |
+
%307 = fadd float %303, 0x3EE4F8B580000000, !dbg !70
|
338 |
+
%308 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
|
339 |
+
%.not.i = icmp eq i32 %308, 0, !dbg !71
|
340 |
+
br i1 %.not.i, label %311, label %309, !dbg !71
|
341 |
+
|
342 |
+
309: ; preds = %301
|
343 |
+
%310 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %307), !dbg !71
|
344 |
+
br label %__nv_rsqrtf.exit, !dbg !71
|
345 |
+
|
346 |
+
311: ; preds = %301
|
347 |
+
%312 = tail call float @llvm.nvvm.rsqrt.approx.f(float %307), !dbg !71
|
348 |
+
br label %__nv_rsqrtf.exit, !dbg !71
|
349 |
+
|
350 |
+
__nv_rsqrtf.exit: ; preds = %309, %311
|
351 |
+
%.0.i = phi float [ %310, %309 ], [ %312, %311 ], !dbg !71
|
352 |
+
%313 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
|
353 |
+
%314 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
|
354 |
+
%315 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
|
355 |
+
%316 = extractvalue { i32, i32, i32, i32 } %302, 3, !dbg !68
|
356 |
+
%317 = bitcast i32 %316 to float, !dbg !68
|
357 |
+
%318 = extractvalue { i32, i32, i32, i32 } %285, 3, !dbg !62
|
358 |
+
%319 = bitcast i32 %318 to float, !dbg !62
|
359 |
+
%320 = fadd float %319, %317, !dbg !72
|
360 |
+
%321 = fadd float %296, %320, !dbg !73
|
361 |
+
%322 = fsub float %321, %282, !dbg !74
|
362 |
+
%323 = extractvalue { i32, i32, i32, i32 } %302, 2, !dbg !68
|
363 |
+
%324 = bitcast i32 %323 to float, !dbg !68
|
364 |
+
%325 = extractvalue { i32, i32, i32, i32 } %285, 2, !dbg !62
|
365 |
+
%326 = bitcast i32 %325 to float, !dbg !62
|
366 |
+
%327 = fadd float %326, %324, !dbg !72
|
367 |
+
%328 = fadd float %295, %327, !dbg !73
|
368 |
+
%329 = fsub float %328, %282, !dbg !74
|
369 |
+
%330 = extractvalue { i32, i32, i32, i32 } %302, 1, !dbg !68
|
370 |
+
%331 = bitcast i32 %330 to float, !dbg !68
|
371 |
+
%332 = extractvalue { i32, i32, i32, i32 } %285, 1, !dbg !62
|
372 |
+
%333 = bitcast i32 %332 to float, !dbg !62
|
373 |
+
%334 = fadd float %333, %331, !dbg !72
|
374 |
+
%335 = fadd float %294, %334, !dbg !73
|
375 |
+
%336 = fsub float %335, %282, !dbg !74
|
376 |
+
%337 = extractvalue { i32, i32, i32, i32 } %302, 0, !dbg !68
|
377 |
+
%338 = bitcast i32 %337 to float, !dbg !68
|
378 |
+
%339 = extractvalue { i32, i32, i32, i32 } %285, 0, !dbg !62
|
379 |
+
%340 = bitcast i32 %339 to float, !dbg !62
|
380 |
+
%341 = fadd float %340, %338, !dbg !72
|
381 |
+
%342 = fadd float %293, %341, !dbg !73
|
382 |
+
%343 = fsub float %342, %282, !dbg !74
|
383 |
+
%344 = extractvalue { i32, i32 } %299, 0, !dbg !66
|
384 |
+
%345 = extractvalue { i32, i32 } %299, 1, !dbg !66
|
385 |
+
%346 = fmul float %343, %.0.i, !dbg !75
|
386 |
+
%347 = fmul float %336, %.0.i, !dbg !75
|
387 |
+
%348 = fmul float %329, %.0.i, !dbg !75
|
388 |
+
%349 = fmul float %322, %.0.i, !dbg !75
|
389 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !76
|
390 |
+
%350 = getelementptr float, ptr addrspace(3) @global_smem, i64 %297, !dbg !76
|
391 |
+
%351 = insertelement <2 x i32> undef, i32 %344, i64 0, !dbg !76
|
392 |
+
%352 = insertelement <2 x i32> %351, i32 %345, i64 1, !dbg !76
|
393 |
+
store <2 x i32> %352, ptr addrspace(3) %350, align 8, !dbg !76
|
394 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !76
|
395 |
+
%353 = getelementptr float, ptr addrspace(3) @global_smem, i64 %72, !dbg !76
|
396 |
+
%354 = load float, ptr addrspace(3) %353, align 16, !dbg !76
|
397 |
+
%355 = getelementptr inbounds <4 x float>, ptr addrspace(3) %353, i64 0, i64 1, !dbg !76
|
398 |
+
%356 = load float, ptr addrspace(3) %355, align 4, !dbg !76
|
399 |
+
%357 = getelementptr inbounds <4 x float>, ptr addrspace(3) %353, i64 0, i64 2, !dbg !76
|
400 |
+
%358 = load float, ptr addrspace(3) %357, align 8, !dbg !76
|
401 |
+
%359 = getelementptr inbounds <4 x float>, ptr addrspace(3) %353, i64 0, i64 3, !dbg !76
|
402 |
+
%360 = load float, ptr addrspace(3) %359, align 4, !dbg !76
|
403 |
+
%361 = fmul float %346, %354, !dbg !76
|
404 |
+
%362 = fmul float %347, %356, !dbg !76
|
405 |
+
%363 = fmul float %348, %358, !dbg !76
|
406 |
+
%364 = fmul float %349, %360, !dbg !76
|
407 |
+
%365 = getelementptr i16, ptr addrspace(1) %5, i64 %49, !dbg !77
|
408 |
+
%366 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %361) #6, !dbg !78
|
409 |
+
%367 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %362) #6, !dbg !78
|
410 |
+
%368 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %363) #6, !dbg !78
|
411 |
+
%369 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %364) #6, !dbg !78
|
412 |
+
%370 = insertelement <2 x i16> undef, i16 %366, i64 0, !dbg !78
|
413 |
+
%371 = insertelement <2 x i16> %370, i16 %367, i64 1, !dbg !78
|
414 |
+
%372 = bitcast <2 x i16> %371 to i32, !dbg !78
|
415 |
+
%373 = insertelement <2 x i16> undef, i16 %368, i64 0, !dbg !78
|
416 |
+
%374 = insertelement <2 x i16> %373, i16 %369, i64 1, !dbg !78
|
417 |
+
%375 = bitcast <2 x i16> %374 to i32, !dbg !78
|
418 |
+
tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %372, i32 %375, ptr addrspace(1) %365, i1 true) #6, !dbg !78
|
419 |
+
ret void, !dbg !79
|
420 |
+
}
|
421 |
+
|
422 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
423 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
424 |
+
|
425 |
+
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
|
426 |
+
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
|
427 |
+
|
428 |
+
; Function Attrs: convergent nocallback nounwind
|
429 |
+
declare void @llvm.nvvm.barrier0() #2
|
430 |
+
|
431 |
+
; Function Attrs: alwaysinline nounwind
|
432 |
+
define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
|
433 |
+
%1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
|
434 |
+
%.not = icmp eq i32 %1, 0
|
435 |
+
br i1 %.not, label %4, label %2
|
436 |
+
|
437 |
+
2: ; preds = %0
|
438 |
+
%3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
|
439 |
+
br label %6
|
440 |
+
|
441 |
+
4: ; preds = %0
|
442 |
+
%5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
|
443 |
+
br label %6
|
444 |
+
|
445 |
+
6: ; preds = %4, %2
|
446 |
+
%.0 = phi float [ %3, %2 ], [ %5, %4 ]
|
447 |
+
ret float %.0
|
448 |
+
}
|
449 |
+
|
450 |
+
declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
|
451 |
+
|
452 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
|
453 |
+
declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
|
454 |
+
|
455 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
|
456 |
+
declare float @llvm.nvvm.rsqrt.approx.f(float) #5
|
457 |
+
|
458 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
459 |
+
attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
|
460 |
+
attributes #2 = { convergent nocallback nounwind }
|
461 |
+
attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
462 |
+
attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
463 |
+
attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
|
464 |
+
attributes #6 = { nounwind }
|
465 |
+
|
466 |
+
!llvm.module.flags = !{!0, !1}
|
467 |
+
!llvm.dbg.cu = !{!2}
|
468 |
+
!nvvm.annotations = !{!4, !5, !5, !4}
|
469 |
+
!llvm.ident = !{!6}
|
470 |
+
|
471 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
472 |
+
!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
|
473 |
+
!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
474 |
+
!3 = !DIFile(filename: "cpn3lawg65lpi63gv6c6pn4oikhg6qva2h2qjdpxe6qj4lvttwez.py", directory: "/tmp/torchinductor_root/pn")
|
475 |
+
!4 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"kernel", i32 1}
|
476 |
+
!5 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"maxntidx", i32 128}
|
477 |
+
!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
|
478 |
+
!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6de7de", linkageName: "triton__0d1d2d3d4d5d6de7de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
|
479 |
+
!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
|
480 |
+
!9 = !{}
|
481 |
+
!10 = !DILocation(line: 22, column: 44, scope: !7)
|
482 |
+
!11 = !DILocation(line: 24, column: 33, scope: !7)
|
483 |
+
!12 = !DILocation(line: 21, column: 28, scope: !7)
|
484 |
+
!13 = !DILocation(line: 21, column: 33, scope: !7)
|
485 |
+
!14 = !DILocation(line: 22, column: 23, scope: !7)
|
486 |
+
!15 = !DILocation(line: 26, column: 30, scope: !7)
|
487 |
+
!16 = !DILocation(line: 26, column: 35, scope: !7)
|
488 |
+
!17 = !DILocation(line: 27, column: 18, scope: !7)
|
489 |
+
!18 = !DILocation(line: 35, column: 44, scope: !7)
|
490 |
+
!19 = !DILocation(line: 35, column: 40, scope: !7)
|
491 |
+
!20 = !DILocation(line: 35, column: 34, scope: !7)
|
492 |
+
!21 = !DILocation(line: 35, column: 50, scope: !7)
|
493 |
+
!22 = !DILocation(line: 36, column: 44, scope: !7)
|
494 |
+
!23 = !DILocation(line: 36, column: 40, scope: !7)
|
495 |
+
!24 = !DILocation(line: 36, column: 34, scope: !7)
|
496 |
+
!25 = !DILocation(line: 36, column: 50, scope: !7)
|
497 |
+
!26 = !DILocation(line: 36, column: 101, scope: !7)
|
498 |
+
!27 = !DILocation(line: 37, column: 22, scope: !7)
|
499 |
+
!28 = !DILocation(line: 38, column: 22, scope: !7)
|
500 |
+
!29 = !DILocation(line: 39, column: 36, scope: !7)
|
501 |
+
!30 = !DILocation(line: 40, column: 40, scope: !7)
|
502 |
+
!31 = !DILocation(line: 40, column: 55, scope: !7)
|
503 |
+
!32 = !DILocation(line: 41, column: 44, scope: !7)
|
504 |
+
!33 = !DILocation(line: 41, column: 40, scope: !7)
|
505 |
+
!34 = !DILocation(line: 41, column: 34, scope: !7)
|
506 |
+
!35 = !DILocation(line: 41, column: 52, scope: !7)
|
507 |
+
!36 = !DILocation(line: 42, column: 22, scope: !7)
|
508 |
+
!37 = !DILocation(line: 44, column: 22, scope: !7)
|
509 |
+
!38 = !DILocation(line: 98, column: 22, scope: !39, inlinedAt: !41)
|
510 |
+
!39 = distinct !DILexicalBlockFile(scope: !7, file: !40, discriminator: 0)
|
511 |
+
!40 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
|
512 |
+
!41 = !DILocation(line: 47, column: 41, scope: !39)
|
513 |
+
!42 = !DILocation(line: 108, column: 21, scope: !43, inlinedAt: !44)
|
514 |
+
!43 = distinct !DILexicalBlockFile(scope: !39, file: !40, discriminator: 0)
|
515 |
+
!44 = !DILocation(line: 120, column: 46, scope: !43, inlinedAt: !45)
|
516 |
+
!45 = !DILocation(line: 53, column: 44, scope: !43)
|
517 |
+
!46 = !DILocation(line: 101, column: 22, scope: !39, inlinedAt: !41)
|
518 |
+
!47 = !DILocation(line: 101, column: 30, scope: !39, inlinedAt: !41)
|
519 |
+
!48 = !DILocation(line: 101, column: 13, scope: !39, inlinedAt: !41)
|
520 |
+
!49 = !DILocation(line: 110, column: 60, scope: !43, inlinedAt: !44)
|
521 |
+
!50 = !DILocation(line: 112, column: 25, scope: !43, inlinedAt: !44)
|
522 |
+
!51 = !DILocation(line: 112, column: 17, scope: !43, inlinedAt: !44)
|
523 |
+
!52 = !DILocation(line: 113, column: 15, scope: !43, inlinedAt: !44)
|
524 |
+
!53 = !DILocation(line: 113, column: 30, scope: !43, inlinedAt: !44)
|
525 |
+
!54 = !DILocation(line: 113, column: 49, scope: !43, inlinedAt: !44)
|
526 |
+
!55 = !DILocation(line: 113, column: 22, scope: !43, inlinedAt: !44)
|
527 |
+
!56 = !DILocation(line: 113, column: 38, scope: !43, inlinedAt: !44)
|
528 |
+
!57 = !DILocation(line: 120, column: 46, scope: !39, inlinedAt: !58)
|
529 |
+
!58 = !DILocation(line: 53, column: 44, scope: !39)
|
530 |
+
!59 = !DILocation(line: 109, column: 28, scope: !43, inlinedAt: !44)
|
531 |
+
!60 = !DILocation(line: 110, column: 39, scope: !43, inlinedAt: !44)
|
532 |
+
!61 = !DILocation(line: 110, column: 49, scope: !43, inlinedAt: !44)
|
533 |
+
!62 = !DILocation(line: 62, column: 51, scope: !7)
|
534 |
+
!63 = !DILocation(line: 63, column: 51, scope: !7)
|
535 |
+
!64 = !DILocation(line: 63, column: 103, scope: !7)
|
536 |
+
!65 = !DILocation(line: 64, column: 35, scope: !7)
|
537 |
+
!66 = !DILocation(line: 64, column: 40, scope: !7)
|
538 |
+
!67 = !DILocation(line: 68, column: 57, scope: !7)
|
539 |
+
!68 = !DILocation(line: 69, column: 54, scope: !7)
|
540 |
+
!69 = !DILocation(line: 75, column: 24, scope: !7)
|
541 |
+
!70 = !DILocation(line: 77, column: 24, scope: !7)
|
542 |
+
!71 = !DILocation(line: 78, column: 30, scope: !7)
|
543 |
+
!72 = !DILocation(line: 70, column: 24, scope: !7)
|
544 |
+
!73 = !DILocation(line: 72, column: 24, scope: !7)
|
545 |
+
!74 = !DILocation(line: 73, column: 24, scope: !7)
|
546 |
+
!75 = !DILocation(line: 79, column: 24, scope: !7)
|
547 |
+
!76 = !DILocation(line: 80, column: 24, scope: !7)
|
548 |
+
!77 = !DILocation(line: 82, column: 29, scope: !7)
|
549 |
+
!78 = !DILocation(line: 82, column: 52, scope: !7)
|
550 |
+
!79 = !DILocation(line: 58, column: 4, scope: !7)
|
.triton/dump/473cf6e25c3e63117cd59fc0ed04b89f/triton_.ttgir
ADDED
@@ -0,0 +1,134 @@
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [2, 2], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 2], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
  tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
    %cst = arith.constant dense<512> : tensor<2x1xi32, #blocked>
    %cst_0 = arith.constant dense<256> : tensor<1x256xi32, #blocked>
    %cst_1 = arith.constant dense<256> : tensor<1x256xi32, #blocked1>
    %cst_2 = arith.constant dense<256> : tensor<2x1xi32, #blocked>
    %cst_3 = arith.constant dense<1.000000e+00> : tensor<1x256xf32, #blocked>
    %cst_4 = arith.constant dense<0.000000e+00> : tensor<1x256xf32, #blocked>
    %cst_5 = arith.constant dense<256> : tensor<2x1xi64, #blocked>
    %cst_6 = arith.constant dense<50257> : tensor<2x1xi64, #blocked>
    %cst_7 = arith.constant dense<0> : tensor<2x1xi64, #blocked>
    %cst_8 = arith.constant dense<0> : tensor<2x1xi64, #blocked2>
    %cst_9 = arith.constant dense<50257> : tensor<2x1xi64, #blocked2>
    %cst_10 = arith.constant 0.000000e+00 : f32
    %cst_11 = arith.constant dense<9.99999974E-6> : tensor<2x1xf32, #blocked>
    %cst_12 = arith.constant dense<2.560000e+02> : tensor<2x1xf32, #blocked>
    %cst_13 = arith.constant dense<0.000000e+00> : tensor<2x256xf32, #blocked>
    %cst_14 = arith.constant dense<0.000000e+00> : tensor<2x256xbf16, #blocked>
    %cst_15 = arith.constant dense<0.000000e+00> : tensor<1x256xf32, #blocked1>
    %c2_i32 = arith.constant 2 : i32
    %0 = tt.get_program_id x : i32
    %1 = arith.muli %0, %c2_i32 : i32
    %2 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
    %3 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>
    %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<2xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<2x1xi32, #blocked>
    %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<2xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>) -> tensor<2x1xi32, #blocked2>
    %6 = tt.splat %1 : (i32) -> tensor<2x1xi32, #blocked>
    %7 = tt.splat %1 : (i32) -> tensor<2x1xi32, #blocked2>
    %8 = arith.addi %6, %4 : tensor<2x1xi32, #blocked>
    %9 = arith.addi %7, %5 : tensor<2x1xi32, #blocked2>
    %10 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
    %11 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>
    %12 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x256xi32, #blocked>
    %13 = tt.expand_dims %11 {axis = 0 : i32} : (tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>) -> tensor<1x256xi32, #blocked1>
    %14 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<2x1x!tt.ptr<i64, 1>, #blocked>
    %15 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<2x1x!tt.ptr<i64, 1>, #blocked2>
    %16 = tt.addptr %14, %8 : tensor<2x1x!tt.ptr<i64, 1>, #blocked>, tensor<2x1xi32, #blocked>
    %17 = tt.addptr %15, %9 : tensor<2x1x!tt.ptr<i64, 1>, #blocked2>, tensor<2x1xi32, #blocked2>
    %18 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x1xi64, #blocked>
    %19 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x1xi64, #blocked2>
    %20 = arith.remsi %8, %cst : tensor<2x1xi32, #blocked>
    %21 = arith.cmpi slt, %12, %cst_0 : tensor<1x256xi32, #blocked>
    %22 = arith.cmpi slt, %13, %cst_1 : tensor<1x256xi32, #blocked1>
    %23 = arith.muli %20, %cst_2 : tensor<2x1xi32, #blocked>
    %24 = tt.broadcast %12 : (tensor<1x256xi32, #blocked>) -> tensor<2x256xi32, #blocked>
    %25 = tt.broadcast %23 : (tensor<2x1xi32, #blocked>) -> tensor<2x256xi32, #blocked>
    %26 = arith.addi %24, %25 : tensor<2x256xi32, #blocked>
    %27 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<2x256x!tt.ptr<f32, 1>, #blocked>
    %28 = tt.addptr %27, %26 : tensor<2x256x!tt.ptr<f32, 1>, #blocked>, tensor<2x256xi32, #blocked>
    %29 = tt.broadcast %21 : (tensor<1x256xi1, #blocked>) -> tensor<2x256xi1, #blocked>
    %30 = tt.load %28, %29, %cst_13 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xf32, #blocked>
    %31 = arith.muli %8, %cst_2 : tensor<2x1xi32, #blocked>
    %32 = tt.broadcast %31 : (tensor<2x1xi32, #blocked>) -> tensor<2x256xi32, #blocked>
    %33 = arith.addi %24, %32 : tensor<2x256xi32, #blocked>
    %34 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<2x256x!tt.ptr<bf16, 1>, #blocked>
    %35 = tt.addptr %34, %33 : tensor<2x256x!tt.ptr<bf16, 1>, #blocked>, tensor<2x256xi32, #blocked>
    %36 = tt.load %35, %29, %cst_14 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xbf16, #blocked>
    %37 = arith.extf %36 : tensor<2x256xbf16, #blocked> to tensor<2x256xf32, #blocked>
    %38 = arith.addi %18, %cst_6 : tensor<2x1xi64, #blocked>
    %39 = arith.addi %19, %cst_9 : tensor<2x1xi64, #blocked2>
    %40 = arith.cmpi slt, %18, %cst_7 : tensor<2x1xi64, #blocked>
    %41 = arith.cmpi slt, %19, %cst_8 : tensor<2x1xi64, #blocked2>
    %42 = arith.select %40, %38, %18 : tensor<2x1xi1, #blocked>, tensor<2x1xi64, #blocked>
    %43 = arith.select %41, %39, %19 : tensor<2x1xi1, #blocked2>, tensor<2x1xi64, #blocked2>
    %44 = arith.cmpi sge, %43, %cst_8 : tensor<2x1xi64, #blocked2>
    %45 = arith.cmpi slt, %43, %cst_9 : tensor<2x1xi64, #blocked2>
    %46 = arith.andi %44, %45 : tensor<2x1xi1, #blocked2>
    tt.assert %46, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<2x1xi1, #blocked2>
    %47 = arith.muli %42, %cst_5 : tensor<2x1xi64, #blocked>
    %48 = tt.broadcast %47 : (tensor<2x1xi64, #blocked>) -> tensor<2x256xi64, #blocked>
    %49 = arith.extsi %12 : tensor<1x256xi32, #blocked> to tensor<1x256xi64, #blocked>
    %50 = tt.broadcast %49 : (tensor<1x256xi64, #blocked>) -> tensor<2x256xi64, #blocked>
    %51 = arith.addi %50, %48 : tensor<2x256xi64, #blocked>
    %52 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<2x256x!tt.ptr<f32, 1>, #blocked>
    %53 = tt.addptr %52, %51 : tensor<2x256x!tt.ptr<f32, 1>, #blocked>, tensor<2x256xi64, #blocked>
    %54 = tt.load %53, %29, %cst_13 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xf32, #blocked>
    %55 = arith.addf %54, %30 : tensor<2x256xf32, #blocked>
    %56 = arith.addf %55, %37 : tensor<2x256xf32, #blocked>
    %57 = arith.addf %56, %cst_13 : tensor<2x256xf32, #blocked>
    %58 = arith.subf %56, %57 : tensor<2x256xf32, #blocked>
    %59 = arith.mulf %56, %58 : tensor<2x256xf32, #blocked>
    %60 = arith.addf %59, %cst_13 : tensor<2x256xf32, #blocked>
    %61 = arith.select %29, %57, %cst_13 : tensor<2x256xi1, #blocked>, tensor<2x256xf32, #blocked>
    %62 = arith.select %29, %60, %cst_13 : tensor<2x256xi1, #blocked>, tensor<2x256xf32, #blocked>
    %63 = arith.select %21, %cst_3, %cst_4 : tensor<1x256xi1, #blocked>, tensor<1x256xf32, #blocked>
    %64 = tt.broadcast %63 : (tensor<1x256xf32, #blocked>) -> tensor<2x256xf32, #blocked>
    %65:3 = "tt.reduce"(%61, %62, %64) <{axis = 1 : i32}> ({
    ^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32):
      %90 = arith.subf %arg11, %arg8 : f32
      %91 = arith.addf %arg10, %arg13 : f32
      %92 = arith.cmpf oeq, %91, %cst_10 : f32
      %93 = arith.divf %arg13, %91 : f32
      %94 = arith.select %92, %cst_10, %93 : f32
      %95 = arith.mulf %90, %94 : f32
      %96 = arith.addf %arg8, %95 : f32
      %97 = arith.addf %arg9, %arg12 : f32
      %98 = arith.mulf %90, %90 : f32
      %99 = arith.mulf %98, %arg10 : f32
      %100 = arith.mulf %99, %94 : f32
      %101 = arith.addf %97, %100 : f32
      tt.reduce.return %96, %101, %91 : f32, f32, f32
    }) : (tensor<2x256xf32, #blocked>, tensor<2x256xf32, #blocked>, tensor<2x256xf32, #blocked>) -> (tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>)
    %66 = tt.expand_dims %65#0 {axis = 1 : i32} : (tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<2x1xf32, #blocked>
    %67 = tt.expand_dims %65#1 {axis = 1 : i32} : (tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<2x1xf32, #blocked>
    %68 = tt.load %28, %29, %cst_13 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xf32, #blocked>
    %69 = tt.load %35, %29, %cst_14 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<2x256xbf16, #blocked>
    %70 = arith.extf %69 : tensor<2x256xbf16, #blocked> to tensor<2x256xf32, #blocked>
    %71 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<1x256x!tt.ptr<f32, 1>, #blocked1>
    %72 = tt.addptr %71, %13 : tensor<1x256x!tt.ptr<f32, 1>, #blocked1>, tensor<1x256xi32, #blocked1>
    %73 = tt.load %72, %22, %cst_15 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32, #blocked1>
    tt.assert %46, "index out of bounds: 0 <= tmp16 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<2x1xi1, #blocked2>
    %74 = tt.load %53, %29, %cst_13 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<2x256xf32, #blocked>
    %75 = arith.addf %74, %68 : tensor<2x256xf32, #blocked>
    %76 = arith.addf %75, %70 : tensor<2x256xf32, #blocked>
    %77 = tt.broadcast %66 : (tensor<2x1xf32, #blocked>) -> tensor<2x256xf32, #blocked>
    %78 = arith.subf %76, %77 : tensor<2x256xf32, #blocked>
    %79 = arith.divf %67, %cst_12 : tensor<2x1xf32, #blocked>
    %80 = arith.addf %79, %cst_11 : tensor<2x1xf32, #blocked>
    %81 = tt.extern_elementwise %80 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32, #blocked>) -> tensor<2x1xf32, #blocked>
    %82 = tt.broadcast %81 : (tensor<2x1xf32, #blocked>) -> tensor<2x256xf32, #blocked>
    %83 = arith.mulf %78, %82 : tensor<2x256xf32, #blocked>
    %84 = triton_gpu.convert_layout %73 : (tensor<1x256xf32, #blocked1>) -> tensor<1x256xf32, #blocked>
    %85 = tt.broadcast %84 : (tensor<1x256xf32, #blocked>) -> tensor<2x256xf32, #blocked>
    %86 = arith.mulf %83, %85 : tensor<2x256xf32, #blocked>
    %87 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<2x256x!tt.ptr<bf16, 1>, #blocked>
    %88 = tt.addptr %87, %33 : tensor<2x256x!tt.ptr<bf16, 1>, #blocked>, tensor<2x256xi32, #blocked>
    %89 = arith.truncf %86 : tensor<2x256xf32, #blocked> to tensor<2x256xbf16, #blocked>
    tt.store %88, %89, %29 {cache = 1 : i32, evict = 1 : i32} : tensor<2x256xbf16, #blocked>
    tt.return
  }
}
.triton/dump/473cf6e25c3e63117cd59fc0ed04b89f/triton_.ttir
ADDED
@@ -0,0 +1,113 @@
module {
  tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
    %cst = arith.constant dense<0.000000e+00> : tensor<2x256xbf16>
    %cst_0 = arith.constant dense<1.000000e+00> : tensor<1x256xf32>
    %cst_1 = arith.constant dense<0.000000e+00> : tensor<1x256xf32>
    %cst_2 = arith.constant 0.000000e+00 : f32
    %cst_3 = arith.constant dense<256> : tensor<2x1xi64>
    %cst_4 = arith.constant dense<50257> : tensor<2x1xi64>
    %cst_5 = arith.constant dense<0> : tensor<2x1xi64>
    %cst_6 = arith.constant dense<9.99999974E-6> : tensor<2x1xf32>
    %cst_7 = arith.constant dense<2.560000e+02> : tensor<2x1xf32>
    %cst_8 = arith.constant dense<0.000000e+00> : tensor<2x256xf32>
    %cst_9 = arith.constant dense<256> : tensor<2x1xi32>
    %cst_10 = arith.constant dense<256> : tensor<1x256xi32>
    %cst_11 = arith.constant dense<512> : tensor<2x1xi32>
    %c2_i32 = arith.constant 2 : i32
    %0 = tt.get_program_id x : i32
    %1 = arith.muli %0, %c2_i32 : i32
    %2 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32>
    %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<2xi32>) -> tensor<2x1xi32>
    %4 = tt.splat %1 : (i32) -> tensor<2x1xi32>
    %5 = arith.addi %4, %3 : tensor<2x1xi32>
    %6 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
    %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<256xi32>) -> tensor<1x256xi32>
    %8 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<2x1x!tt.ptr<i64, 1>>
    %9 = tt.addptr %8, %5 : tensor<2x1x!tt.ptr<i64, 1>>, tensor<2x1xi32>
    %10 = tt.load %9 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x1xi64>
    %11 = arith.remsi %5, %cst_11 : tensor<2x1xi32>
    %12 = arith.cmpi slt, %7, %cst_10 : tensor<1x256xi32>
    %13 = arith.muli %11, %cst_9 : tensor<2x1xi32>
    %14 = tt.broadcast %7 : (tensor<1x256xi32>) -> tensor<2x256xi32>
    %15 = tt.broadcast %13 : (tensor<2x1xi32>) -> tensor<2x256xi32>
    %16 = arith.addi %14, %15 : tensor<2x256xi32>
    %17 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<2x256x!tt.ptr<f32, 1>>
    %18 = tt.addptr %17, %16 : tensor<2x256x!tt.ptr<f32, 1>>, tensor<2x256xi32>
    %19 = tt.broadcast %12 : (tensor<1x256xi1>) -> tensor<2x256xi1>
    %20 = tt.load %18, %19, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xf32>
    %21 = arith.muli %5, %cst_9 : tensor<2x1xi32>
    %22 = tt.broadcast %21 : (tensor<2x1xi32>) -> tensor<2x256xi32>
    %23 = arith.addi %14, %22 : tensor<2x256xi32>
    %24 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<2x256x!tt.ptr<bf16, 1>>
    %25 = tt.addptr %24, %23 : tensor<2x256x!tt.ptr<bf16, 1>>, tensor<2x256xi32>
    %26 = tt.load %25, %19, %cst {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xbf16>
    %27 = arith.extf %26 : tensor<2x256xbf16> to tensor<2x256xf32>
    %28 = arith.addi %10, %cst_4 : tensor<2x1xi64>
    %29 = arith.cmpi slt, %10, %cst_5 : tensor<2x1xi64>
    %30 = arith.select %29, %28, %10 : tensor<2x1xi1>, tensor<2x1xi64>
    %31 = arith.cmpi sge, %30, %cst_5 : tensor<2x1xi64>
    %32 = arith.cmpi slt, %30, %cst_4 : tensor<2x1xi64>
    %33 = arith.andi %31, %32 : tensor<2x1xi1>
    tt.assert %33, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<2x1xi1>
    %34 = arith.muli %30, %cst_3 : tensor<2x1xi64>
    %35 = tt.broadcast %34 : (tensor<2x1xi64>) -> tensor<2x256xi64>
    %36 = arith.extsi %7 : tensor<1x256xi32> to tensor<1x256xi64>
    %37 = tt.broadcast %36 : (tensor<1x256xi64>) -> tensor<2x256xi64>
    %38 = arith.addi %37, %35 : tensor<2x256xi64>
    %39 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<2x256x!tt.ptr<f32, 1>>
    %40 = tt.addptr %39, %38 : tensor<2x256x!tt.ptr<f32, 1>>, tensor<2x256xi64>
    %41 = tt.load %40, %19, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xf32>
    %42 = arith.addf %41, %20 : tensor<2x256xf32>
    %43 = arith.addf %42, %27 : tensor<2x256xf32>
    %44 = arith.addf %43, %cst_8 : tensor<2x256xf32>
    %45 = arith.subf %43, %44 : tensor<2x256xf32>
    %46 = arith.mulf %43, %45 : tensor<2x256xf32>
    %47 = arith.addf %46, %cst_8 : tensor<2x256xf32>
    %48 = arith.select %19, %44, %cst_8 : tensor<2x256xi1>, tensor<2x256xf32>
    %49 = arith.select %19, %47, %cst_8 : tensor<2x256xi1>, tensor<2x256xf32>
    %50 = arith.select %12, %cst_0, %cst_1 : tensor<1x256xi1>, tensor<1x256xf32>
    %51 = tt.broadcast %50 : (tensor<1x256xf32>) -> tensor<2x256xf32>
    %52:3 = "tt.reduce"(%48, %49, %51) <{axis = 1 : i32}> ({
    ^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32):
      %76 = arith.subf %arg11, %arg8 : f32
      %77 = arith.addf %arg10, %arg13 : f32
      %78 = arith.cmpf oeq, %77, %cst_2 : f32
      %79 = arith.divf %arg13, %77 : f32
      %80 = arith.select %78, %cst_2, %79 : f32
      %81 = arith.mulf %76, %80 : f32
      %82 = arith.addf %arg8, %81 : f32
      %83 = arith.addf %arg9, %arg12 : f32
      %84 = arith.mulf %76, %76 : f32
      %85 = arith.mulf %84, %arg10 : f32
      %86 = arith.mulf %85, %80 : f32
      %87 = arith.addf %83, %86 : f32
      tt.reduce.return %82, %87, %77 : f32, f32, f32
    }) : (tensor<2x256xf32>, tensor<2x256xf32>, tensor<2x256xf32>) -> (tensor<2xf32>, tensor<2xf32>, tensor<2xf32>)
    %53 = tt.expand_dims %52#0 {axis = 1 : i32} : (tensor<2xf32>) -> tensor<2x1xf32>
    %54 = tt.expand_dims %52#1 {axis = 1 : i32} : (tensor<2xf32>) -> tensor<2x1xf32>
    %55 = tt.load %18, %19, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xf32>
    %56 = tt.load %25, %19, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<2x256xbf16>
    %57 = arith.extf %56 : tensor<2x256xbf16> to tensor<2x256xf32>
    %58 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<1x256x!tt.ptr<f32, 1>>
    %59 = tt.addptr %58, %7 : tensor<1x256x!tt.ptr<f32, 1>>, tensor<1x256xi32>
    %60 = tt.load %59, %12, %cst_1 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32>
    tt.assert %33, "index out of bounds: 0 <= tmp16 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<2x1xi1>
    %61 = tt.load %40, %19, %cst_8 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<2x256xf32>
    %62 = arith.addf %61, %55 : tensor<2x256xf32>
    %63 = arith.addf %62, %57 : tensor<2x256xf32>
    %64 = tt.broadcast %53 : (tensor<2x1xf32>) -> tensor<2x256xf32>
    %65 = arith.subf %63, %64 : tensor<2x256xf32>
    %66 = arith.divf %54, %cst_7 : tensor<2x1xf32>
    %67 = arith.addf %66, %cst_6 : tensor<2x1xf32>
    %68 = tt.extern_elementwise %67 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32>) -> tensor<2x1xf32>
    %69 = tt.broadcast %68 : (tensor<2x1xf32>) -> tensor<2x256xf32>
    %70 = arith.mulf %65, %69 : tensor<2x256xf32>
    %71 = tt.broadcast %60 : (tensor<1x256xf32>) -> tensor<2x256xf32>
    %72 = arith.mulf %70, %71 : tensor<2x256xf32>
    %73 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<2x256x!tt.ptr<bf16, 1>>
    %74 = tt.addptr %73, %23 : tensor<2x256x!tt.ptr<bf16, 1>>, tensor<2x256xi32>
    %75 = arith.truncf %72 : tensor<2x256xf32> to tensor<2x256xbf16>
    tt.store %74, %75, %19 {cache = 1 : i32, evict = 1 : i32} : tensor<2x256xbf16>
    tt.return
  }
}
.triton/dump/89f8cc1079aa03024e56dc2aee42813a/triton_.cubin
ADDED
Binary file (58.1 kB)
.triton/dump/99f0a4c15ca0aab38ccdae6c765f7333/triton_.ptx
ADDED
@@ -0,0 +1,758 @@
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2d3d4d5d6de7de
|
10 |
+
.extern .shared .align 1 .b8 global_smem[];
|
11 |
+
.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
|
12 |
+
|
13 |
+
.visible .entry triton__0d1d2d3d4d5d6de7de(
|
14 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_0,
|
15 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_1,
|
16 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_2,
|
17 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_3,
|
18 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_4,
|
19 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_5,
|
20 |
+
.param .u32 triton__0d1d2d3d4d5d6de7de_param_6,
|
21 |
+
.param .u32 triton__0d1d2d3d4d5d6de7de_param_7
|
22 |
+
)
|
23 |
+
.maxntid 64, 1, 1
|
24 |
+
{
|
25 |
+
.reg .pred %p<29>;
|
26 |
+
.reg .b16 %rs<17>;
|
27 |
+
.reg .b32 %r<100>;
|
28 |
+
.reg .f32 %f<86>;
|
29 |
+
.reg .b64 %rd<16>;
|
30 |
+
.loc 1 18 0
|
31 |
+
$L__func_begin0:
|
32 |
+
.loc 1 18 0
|
33 |
+
|
34 |
+
ld.param.u64 %rd7, [triton__0d1d2d3d4d5d6de7de_param_0];
|
35 |
+
ld.param.u64 %rd8, [triton__0d1d2d3d4d5d6de7de_param_1];
|
36 |
+
$L__tmp0:
|
37 |
+
.loc 1 26 26
|
38 |
+
mov.u32 %r66, %tid.x;
|
39 |
+
and.b32 %r67, %r66, 31;
|
40 |
+
ld.param.u64 %rd9, [triton__0d1d2d3d4d5d6de7de_param_2];
|
41 |
+
ld.param.u64 %rd10, [triton__0d1d2d3d4d5d6de7de_param_3];
|
42 |
+
ld.param.u64 %rd11, [triton__0d1d2d3d4d5d6de7de_param_4];
|
43 |
+
shl.b32 %r68, %r66, 2;
|
44 |
+
ld.param.u64 %rd12, [triton__0d1d2d3d4d5d6de7de_param_5];
|
45 |
+
and.b32 %r69, %r68, 252;
|
46 |
+
.loc 1 23 28
|
47 |
+
mov.u32 %r1, %ctaid.x;
|
48 |
+
.loc 1 30 40
|
49 |
+
shl.b32 %r70, %r1, 8;
|
50 |
+
.loc 1 30 36
|
51 |
+
or.b32 %r71, %r70, %r69;
|
52 |
+
.loc 1 30 30
|
53 |
+
mul.wide.s32 %rd13, %r71, 4;
|
54 |
+
add.s64 %rd1, %rd7, %rd13;
|
55 |
+
mov.b32 %r6, 0;
|
56 |
+
mov.pred %p1, -1;
|
57 |
+
.loc 1 30 46
|
58 |
+
mov.u32 %r2, 0x0;
|
59 |
+
mov.u32 %r3, 0x0;
|
60 |
+
mov.u32 %r4, 0x0;
|
61 |
+
mov.u32 %r5, 0x0;
|
62 |
+
@%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ];
|
63 |
+
@!%p1 mov.u32 %r2, %r6;
|
64 |
+
@!%p1 mov.u32 %r3, %r6;
|
65 |
+
@!%p1 mov.u32 %r4, %r6;
|
66 |
+
@!%p1 mov.u32 %r5, %r6;
|
67 |
+
mov.b32 %f1, %r4;
|
68 |
+
mov.b32 %f2, %r5;
|
69 |
+
.loc 1 31 30
|
70 |
+
mul.wide.s32 %rd14, %r71, 2;
|
71 |
+
add.s64 %rd2, %rd8, %rd14;
|
72 |
+
.loc 1 31 46
|
73 |
+
mov.u32 %r10, 0x0;
|
74 |
+
mov.u32 %r11, 0x0;
|
75 |
+
@%p1 ld.global.v2.b32 { %r10, %r11 }, [ %rd2 + 0 ];
|
76 |
+
@!%p1 mov.u32 %r10, %r6;
|
77 |
+
@!%p1 mov.u32 %r11, %r6;
|
78 |
+
cvt.u16.u32 %rs1, %r10;
|
79 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r10; }
|
80 |
+
cvt.u16.u32 %rs3, %r11;
|
81 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r11; }
|
82 |
+
.loc 1 31 67
|
83 |
+
cvt.f32.bf16 %r14, %rs1;
|
84 |
+
mov.b32 %f3, %r14;
|
85 |
+
cvt.f32.bf16 %r15, %rs2;
|
86 |
+
mov.b32 %f4, %r15;
|
87 |
+
cvt.f32.bf16 %r16, %rs3;
|
88 |
+
mov.b32 %f5, %r16;
|
89 |
+
cvt.f32.bf16 %r17, %rs4;
|
90 |
+
mov.b32 %f6, %r17;
|
91 |
+
.loc 1 32 30
|
92 |
+
add.s64 %rd3, %rd9, %rd14;
|
93 |
+
.loc 1 32 46
|
94 |
+
mov.u32 %r18, 0x0;
|
95 |
+
mov.u32 %r19, 0x0;
|
96 |
+
@%p1 ld.global.v2.b32 { %r18, %r19 }, [ %rd3 + 0 ];
|
97 |
+
@!%p1 mov.u32 %r18, %r6;
|
98 |
+
@!%p1 mov.u32 %r19, %r6;
|
99 |
+
cvt.u16.u32 %rs5, %r18;
|
100 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r18; }
|
101 |
+
cvt.u16.u32 %rs7, %r19;
|
102 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r19; }
|
103 |
+
.loc 1 32 67
|
104 |
+
cvt.f32.bf16 %r22, %rs5;
|
105 |
+
mov.b32 %f7, %r22;
|
106 |
+
cvt.f32.bf16 %r23, %rs6;
|
107 |
+
mov.b32 %f8, %r23;
|
108 |
+
cvt.f32.bf16 %r24, %rs7;
|
109 |
+
mov.b32 %f9, %r24;
|
110 |
+
cvt.f32.bf16 %r25, %rs8;
|
111 |
+
mov.b32 %f10, %r25;
|
112 |
+
.loc 1 33 30
|
113 |
+
add.s64 %rd4, %rd10, %rd14;
|
114 |
+
.loc 1 33 46
|
115 |
+
mov.u32 %r26, 0x0;
|
116 |
+
mov.u32 %r27, 0x0;
|
117 |
+
@%p1 ld.global.v2.b32 { %r26, %r27 }, [ %rd4 + 0 ];
|
118 |
+
@!%p1 mov.u32 %r26, %r6;
|
119 |
+
@!%p1 mov.u32 %r27, %r6;
|
120 |
+
cvt.u16.u32 %rs9, %r26;
|
121 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs10}, %r26; }
|
122 |
+
cvt.u16.u32 %rs11, %r27;
|
123 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs12}, %r27; }
|
124 |
+
.loc 1 33 67
|
125 |
+
cvt.f32.bf16 %r30, %rs9;
|
126 |
+
mov.b32 %f11, %r30;
|
127 |
+
cvt.f32.bf16 %r31, %rs10;
|
128 |
+
mov.b32 %f12, %r31;
|
129 |
+
cvt.f32.bf16 %r32, %rs11;
|
130 |
+
mov.b32 %f13, %r32;
|
131 |
+
cvt.f32.bf16 %r33, %rs12;
|
132 |
+
mov.b32 %f14, %r33;
|
133 |
+
.loc 1 34 31
|
134 |
+
mul.wide.u32 %rd15, %r69, 4;
|
135 |
+
add.s64 %rd5, %rd11, %rd15;
|
136 |
+
.loc 1 34 36
|
137 |
+
mov.u32 %r34, 0x0;
|
138 |
+
mov.u32 %r35, 0x0;
|
139 |
+
mov.u32 %r36, 0x0;
|
140 |
+
mov.u32 %r37, 0x0;
|
141 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r34, %r35, %r36, %r37 }, [ %rd5 + 0 ];
|
142 |
+
@!%p1 mov.u32 %r34, %r6;
|
143 |
+
@!%p1 mov.u32 %r35, %r6;
|
144 |
+
@!%p1 mov.u32 %r36, %r6;
|
145 |
+
@!%p1 mov.u32 %r37, %r6;
|
146 |
+
.loc 1 36 18
|
147 |
+
add.f32 %f15, %f5, %f1;
|
148 |
+
add.f32 %f16, %f6, %f2;
|
149 |
+
.loc 1 38 18
|
150 |
+
add.f32 %f17, %f15, %f9;
|
151 |
+
add.f32 %f18, %f16, %f10;
|
152 |
+
.loc 1 30 46
|
153 |
+
mov.b32 %f19, %r2;
|
154 |
+
mov.b32 %f20, %r3;
|
155 |
+
.loc 1 36 18
|
156 |
+
add.f32 %f21, %f4, %f20;
|
157 |
+
add.f32 %f22, %f3, %f19;
|
158 |
+
.loc 1 38 18
|
159 |
+
add.f32 %f23, %f22, %f7;
|
160 |
+
add.f32 %f24, %f21, %f8;
|
161 |
+
.loc 1 40 18
|
162 |
+
add.f32 %f25, %f24, %f12;
|
163 |
+
add.f32 %f26, %f23, %f11;
|
164 |
+
add.f32 %f27, %f17, %f13;
|
165 |
+
add.f32 %f28, %f18, %f14;
|
166 |
+
$L__tmp1:
|
167 |
+
.loc 2 233 15
|
168 |
+
add.f32 %f29, %f26, %f25;
|
169 |
+
add.f32 %f30, %f29, %f27;
|
170 |
+
add.f32 %f31, %f30, %f28;
|
171 |
+
$L__tmp2:
|
172 |
+
.loc 2 243 36
|
173 |
+
mov.b32 %r72, %f31;
|
174 |
+
shfl.sync.bfly.b32 %r73, %r72, 16, 31, -1;
|
175 |
+
mov.b32 %f32, %r73;
|
176 |
+
$L__tmp3:
|
177 |
+
.loc 2 233 15
|
178 |
+
add.f32 %f33, %f31, %f32;
|
179 |
+
$L__tmp4:
|
180 |
+
.loc 2 243 36
|
181 |
+
mov.b32 %r74, %f33;
|
182 |
+
shfl.sync.bfly.b32 %r75, %r74, 8, 31, -1;
|
183 |
+
mov.b32 %f34, %r75;
|
184 |
+
$L__tmp5:
|
185 |
+
.loc 2 233 15
|
186 |
+
add.f32 %f35, %f33, %f34;
|
187 |
+
$L__tmp6:
|
188 |
+
.loc 2 243 36
|
189 |
+
mov.b32 %r76, %f35;
|
190 |
+
shfl.sync.bfly.b32 %r77, %r76, 4, 31, -1;
|
191 |
+
mov.b32 %f36, %r77;
|
192 |
+
$L__tmp7:
|
193 |
+
.loc 2 233 15
|
194 |
+
add.f32 %f37, %f35, %f36;
|
195 |
+
$L__tmp8:
|
196 |
+
.loc 2 243 36
|
197 |
+
mov.b32 %r78, %f37;
|
198 |
+
shfl.sync.bfly.b32 %r79, %r78, 2, 31, -1;
|
199 |
+
mov.b32 %f38, %r79;
|
200 |
+
$L__tmp9:
|
201 |
+
.loc 2 233 15
|
202 |
+
add.f32 %f39, %f37, %f38;
|
203 |
+
$L__tmp10:
|
204 |
+
.loc 2 243 36
|
205 |
+
mov.b32 %r80, %f39;
|
206 |
+
shfl.sync.bfly.b32 %r81, %r80, 1, 31, -1;
|
207 |
+
mov.b32 %f40, %r81;
|
208 |
+
$L__tmp11:
|
209 |
+
.loc 2 233 15
|
210 |
+
add.f32 %f41, %f39, %f40;
|
211 |
+
$L__tmp12:
|
212 |
+
.loc 2 243 36
|
213 |
+
setp.eq.s32 %p20, %r67, 0;
|
214 |
+
shr.u32 %r82, %r66, 3;
|
215 |
+
and.b32 %r83, %r82, 4;
|
216 |
+
mov.u32 %r84, global_smem;
|
217 |
+
add.s32 %r42, %r84, %r83;
|
218 |
+
mov.b32 %r43, %f41;
|
219 |
+
@%p20 st.shared.b32 [ %r42 + 0 ], %r43;
|
220 |
+
bar.sync 0;
|
221 |
+
setp.lt.s32 %p21, %r66, 2;
|
222 |
+
add.s32 %r45, %r84, %r68;
|
223 |
+
@%p21 ld.shared.b32 %r44, [ %r45 + 0 ];
|
224 |
+
mov.b32 %f42, %r44;
|
225 |
+
shfl.sync.bfly.b32 %r85, %r44, 1, 31, -1;
|
226 |
+
mov.b32 %f43, %r85;
|
227 |
+
$L__tmp13:
|
228 |
+
.loc 2 233 15
|
229 |
+
add.f32 %f44, %f42, %f43;
|
230 |
+
$L__tmp14:
|
231 |
+
.loc 2 243 36
|
232 |
+
and.b32 %r86, %r66, 1;
|
233 |
+
setp.eq.b32 %p27, %r86, 1;
|
234 |
+
not.pred %p28, %p27;
|
235 |
+
and.pred %p22, %p21, %p28;
|
236 |
+
mov.b32 %r47, %f44;
|
237 |
+
@%p22 st.shared.b32 [ %r45 + 0 ], %r47;
|
238 |
+
bar.sync 0;
|
239 |
+
ld.shared.f32 %f45, [global_smem];
|
240 |
+
$L__tmp15:
|
241 |
+
.loc 3 8 15
|
242 |
+
add.f32 %f46, %f45, 0f00000000;
|
243 |
+
$L__tmp16:
|
244 |
+
.loc 1 48 20
|
245 |
+
mov.b32 %r49, %f46;
|
246 |
+
mov.b32 %r50, 1132462080;
|
247 |
+
div.full.f32 %r48, %r49, %r50;
|
248 |
+
mov.b32 %f47, %r48;
|
249 |
+
.loc 1 49 20
|
250 |
+
sub.f32 %f48, %f26, %f47;
|
251 |
+
sub.f32 %f49, %f25, %f47;
|
252 |
+
sub.f32 %f50, %f27, %f47;
|
253 |
+
sub.f32 %f51, %f28, %f47;
|
254 |
+
.loc 1 50 20
|
255 |
+
mul.f32 %f52, %f49, %f49;
|
256 |
+
$L__tmp17:
|
257 |
+
.loc 2 243 36
|
258 |
+
bar.sync 0;
|
259 |
+
$L__tmp18:
|
260 |
+
.loc 2 233 15
|
261 |
+
fma.rn.f32 %f53, %f48, %f48, %f52;
|
262 |
+
fma.rn.f32 %f54, %f50, %f50, %f53;
|
263 |
+
fma.rn.f32 %f55, %f51, %f51, %f54;
|
264 |
+
$L__tmp19:
|
265 |
+
.loc 2 243 36
|
266 |
+
mov.b32 %r87, %f55;
|
267 |
+
shfl.sync.bfly.b32 %r88, %r87, 16, 31, -1;
|
268 |
+
mov.b32 %f56, %r88;
|
269 |
+
$L__tmp20:
|
270 |
+
.loc 2 233 15
|
271 |
+
add.f32 %f57, %f55, %f56;
|
272 |
+
$L__tmp21:
|
273 |
+
.loc 2 243 36
|
274 |
+
mov.b32 %r89, %f57;
|
275 |
+
shfl.sync.bfly.b32 %r90, %r89, 8, 31, -1;
|
276 |
+
mov.b32 %f58, %r90;
|
277 |
+
$L__tmp22:
|
278 |
+
.loc 2 233 15
|
279 |
+
add.f32 %f59, %f57, %f58;
|
280 |
+
$L__tmp23:
|
281 |
+
.loc 2 243 36
|
282 |
+
mov.b32 %r91, %f59;
|
283 |
+
shfl.sync.bfly.b32 %r92, %r91, 4, 31, -1;
|
284 |
+
mov.b32 %f60, %r92;
|
285 |
+
$L__tmp24:
|
286 |
+
.loc 2 233 15
|
287 |
+
add.f32 %f61, %f59, %f60;
|
288 |
+
$L__tmp25:
|
289 |
+
.loc 2 243 36
|
290 |
+
mov.b32 %r93, %f61;
|
291 |
+
shfl.sync.bfly.b32 %r94, %r93, 2, 31, -1;
|
292 |
+
mov.b32 %f62, %r94;
|
293 |
+
$L__tmp26:
|
294 |
+
.loc 2 233 15
|
295 |
+
add.f32 %f63, %f61, %f62;
|
296 |
+
$L__tmp27:
|
297 |
+
.loc 2 243 36
|
298 |
+
mov.b32 %r95, %f63;
|
299 |
+
shfl.sync.bfly.b32 %r96, %r95, 1, 31, -1;
|
300 |
+
mov.b32 %f64, %r96;
|
301 |
+
$L__tmp28:
|
302 |
+
.loc 2 233 15
|
303 |
+
add.f32 %f65, %f63, %f64;
|
304 |
+
$L__tmp29:
|
305 |
+
.loc 2 243 36
|
306 |
+
mov.b32 %r52, %f65;
|
307 |
+
@%p20 st.shared.b32 [ %r42 + 0 ], %r52;
|
308 |
+
bar.sync 0;
|
309 |
+
@%p21 ld.shared.b32 %r53, [ %r45 + 0 ];
|
310 |
+
mov.b32 %f66, %r53;
|
311 |
+
shfl.sync.bfly.b32 %r97, %r53, 1, 31, -1;
|
312 |
+
mov.b32 %f67, %r97;
|
313 |
+
$L__tmp30:
|
314 |
+
.loc 2 233 15
|
315 |
+
add.f32 %f68, %f66, %f67;
|
316 |
+
$L__tmp31:
|
317 |
+
.loc 2 243 36
|
318 |
+
mov.b32 %r56, %f68;
|
319 |
+
@%p22 st.shared.b32 [ %r45 + 0 ], %r56;
|
320 |
+
bar.sync 0;
|
321 |
+
ld.shared.f32 %f69, [global_smem];
|
322 |
+
$L__tmp32:
|
323 |
+
.loc 3 8 15
|
324 |
+
add.f32 %f70, %f69, 0f00000000;
|
325 |
+
$L__tmp33:
|
326 |
+
.loc 1 56 20
|
327 |
+
mov.b32 %r58, %f70;
|
328 |
+
div.full.f32 %r57, %r58, %r50;
|
329 |
+
mov.b32 %f71, %r57;
|
330 |
+
.loc 1 58 20
|
331 |
+
add.f32 %f72, %f71, 0f3727C5AC;
|
332 |
+
.loc 1 59 26
|
333 |
+
rsqrt.approx.ftz.f32 %f73, %f72;
|
334 |
+
.loc 1 34 36
|
335 |
+
mov.b32 %f74, %r37;
|
336 |
+
mov.b32 %f75, %r36;
|
337 |
+
mov.b32 %f76, %r35;
|
338 |
+
mov.b32 %f77, %r34;
|
339 |
+
.loc 1 60 20
|
340 |
+
mul.f32 %f78, %f48, %f73;
|
341 |
+
mul.f32 %f79, %f49, %f73;
|
342 |
+
mul.f32 %f80, %f50, %f73;
|
343 |
+
mul.f32 %f81, %f51, %f73;
|
344 |
+
.loc 1 61 20
|
345 |
+
mul.f32 %f82, %f78, %f77;
|
346 |
+
mul.f32 %f83, %f79, %f76;
|
347 |
+
mul.f32 %f84, %f80, %f75;
|
348 |
+
mul.f32 %f85, %f81, %f74;
|
349 |
+
.loc 1 63 25
|
350 |
+
add.s64 %rd6, %rd12, %rd14;
|
351 |
+
.loc 1 63 48
|
352 |
+
mov.b32 %r60, %f82;
|
353 |
+
cvt.rn.bf16.f32 %rs13, %r60;
|
354 |
+
mov.b32 %r61, %f83;
|
355 |
+
cvt.rn.bf16.f32 %rs14, %r61;
|
356 |
+
mov.b32 %r62, %f84;
|
357 |
+
cvt.rn.bf16.f32 %rs15, %r62;
|
358 |
+
mov.b32 %r63, %f85;
|
359 |
+
cvt.rn.bf16.f32 %rs16, %r63;
|
360 |
+
mov.b32 %r98, {%rs13, %rs14};
|
361 |
+
mov.b32 %r99, {%rs15, %rs16};
|
362 |
+
@%p1 st.global.v2.b32 [ %rd6 + 0 ], { %r98, %r99 };
|
363 |
+
.loc 1 63 4
|
364 |
+
ret;
|
365 |
+
$L__tmp34:
|
366 |
+
$L__func_end0:
|
367 |
+
|
368 |
+
}
|
369 |
+
// .globl __nv_rsqrtf
|
370 |
+
.visible .func (.param .b32 func_retval0) __nv_rsqrtf(
|
371 |
+
.param .b32 __nv_rsqrtf_param_0
|
372 |
+
)
|
373 |
+
{
|
374 |
+
.reg .f32 %f<3>;
|
375 |
+
$L__func_begin1:
|
376 |
+
|
377 |
+
ld.param.f32 %f1, [__nv_rsqrtf_param_0];
|
378 |
+
rsqrt.approx.ftz.f32 %f2, %f1;
|
379 |
+
st.param.f32 [func_retval0+0], %f2;
|
380 |
+
ret;
|
381 |
+
$L__func_end1:
|
382 |
+
|
383 |
+
}
|
384 |
+
.file 1 "/tmp/torchinductor_root/pw/cpwl4wgyi5spzbgbswrqxfrxlyk2m76a4bakbp6l5ltopjbkjadt.py"
|
385 |
+
.file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
|
386 |
+
.file 3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
|
387 |
+
.section .debug_abbrev
|
388 |
+
{
|
389 |
+
.b8 1
|
390 |
+
.b8 17
|
391 |
+
.b8 1
|
392 |
+
.b8 37
|
393 |
+
.b8 8
|
394 |
+
.b8 19
|
395 |
+
.b8 5
|
396 |
+
.b8 3
|
397 |
+
.b8 8
|
398 |
+
.b8 16
|
399 |
+
.b8 6
|
400 |
+
.b8 27
|
401 |
+
.b8 8
|
402 |
+
.b8 180
|
403 |
+
.b8 66
|
404 |
+
.b8 12
|
405 |
+
.b8 17
|
406 |
+
.b8 1
|
407 |
+
.b8 18
|
408 |
+
.b8 1
|
409 |
+
.b8 0
|
410 |
+
.b8 0
|
411 |
+
.b8 2
|
412 |
+
.b8 46
|
413 |
+
.b8 0
|
414 |
+
.b8 135
|
415 |
+
.b8 64
|
416 |
+
.b8 8
|
417 |
+
.b8 3
|
418 |
+
.b8 8
|
419 |
+
.b8 58
|
420 |
+
.b8 11
|
421 |
+
.b8 59
|
422 |
+
.b8 11
|
423 |
+
.b8 63
|
424 |
+
.b8 12
|
425 |
+
.b8 32
|
426 |
+
.b8 11
|
427 |
+
.b8 0
|
428 |
+
.b8 0
|
429 |
+
.b8 3
|
430 |
+
.b8 46
|
431 |
+
.b8 1
|
432 |
+
.b8 17
|
433 |
+
.b8 1
|
434 |
+
.b8 18
|
435 |
+
.b8 1
|
436 |
+
.b8 64
|
437 |
+
.b8 10
|
438 |
+
.b8 49
|
439 |
+
.b8 19
|
440 |
+
.b8 0
|
441 |
+
.b8 0
|
442 |
+
.b8 4
|
443 |
+
.b8 29
|
444 |
+
.b8 1
|
445 |
+
.b8 49
|
446 |
+
.b8 19
|
447 |
+
.b8 17
|
448 |
+
.b8 1
|
449 |
+
.b8 18
|
450 |
+
.b8 1
|
451 |
+
.b8 88
|
452 |
+
.b8 11
|
453 |
+
.b8 89
|
454 |
+
.b8 11
|
455 |
+
.b8 87
|
456 |
+
.b8 11
|
457 |
+
.b8 0
|
458 |
+
.b8 0
|
459 |
+
.b8 5
|
460 |
+
.b8 29
|
461 |
+
.b8 0
|
462 |
+
.b8 49
|
463 |
+
.b8 19
|
464 |
+
.b8 17
|
465 |
+
.b8 1
|
466 |
+
.b8 18
|
467 |
+
.b8 1
|
468 |
+
.b8 88
|
469 |
+
.b8 11
|
470 |
+
.b8 89
|
471 |
+
.b8 11
|
472 |
+
.b8 87
|
473 |
+
.b8 11
|
474 |
+
.b8 0
|
475 |
+
.b8 0
|
476 |
+
.b8 0
|
477 |
+
}
|
478 |
+
.section .debug_info
|
479 |
+
{
|
480 |
+
.b32 399
|
481 |
+
.b8 2
|
482 |
+
.b8 0
|
483 |
+
.b32 .debug_abbrev
|
484 |
+
.b8 8
|
485 |
+
.b8 1
|
486 |
+
.b8 116
|
487 |
+
.b8 114
|
488 |
+
.b8 105
|
489 |
+
.b8 116
|
490 |
+
.b8 111
|
491 |
+
.b8 110
|
492 |
+
.b8 0
|
493 |
+
.b8 2
|
494 |
+
.b8 0
|
495 |
+
.b8 99
|
496 |
+
.b8 112
|
497 |
+
.b8 119
|
498 |
+
.b8 108
|
499 |
+
.b8 52
|
500 |
+
.b8 119
|
501 |
+
.b8 103
|
502 |
+
.b8 121
|
503 |
+
.b8 105
|
504 |
+
.b8 53
|
505 |
+
.b8 115
|
506 |
+
.b8 112
|
507 |
+
.b8 122
|
508 |
+
.b8 98
|
509 |
+
.b8 103
|
510 |
+
.b8 98
|
511 |
+
.b8 115
|
512 |
+
.b8 119
|
513 |
+
.b8 114
|
514 |
+
.b8 113
|
515 |
+
.b8 120
|
516 |
+
.b8 102
|
517 |
+
.b8 114
|
518 |
+
.b8 120
|
519 |
+
.b8 108
|
520 |
+
.b8 121
|
521 |
+
.b8 107
|
522 |
+
.b8 50
|
523 |
+
.b8 109
|
524 |
+
.b8 55
|
525 |
+
.b8 54
|
526 |
+
.b8 97
|
527 |
+
.b8 52
|
528 |
+
.b8 98
|
529 |
+
.b8 97
|
530 |
+
.b8 107
|
531 |
+
.b8 98
|
532 |
+
.b8 112
|
533 |
+
.b8 54
|
534 |
+
.b8 108
|
535 |
+
.b8 53
|
536 |
+
.b8 108
|
537 |
+
.b8 116
|
538 |
+
.b8 111
|
539 |
+
.b8 112
|
540 |
+
.b8 106
|
541 |
+
.b8 98
|
542 |
+
.b8 107
|
543 |
+
.b8 106
|
544 |
+
.b8 97
|
545 |
+
.b8 100
|
546 |
+
.b8 116
|
547 |
+
.b8 46
|
548 |
+
.b8 112
|
549 |
+
.b8 121
|
550 |
+
.b8 0
|
551 |
+
.b32 .debug_line
|
552 |
+
.b8 47
|
553 |
+
.b8 116
|
554 |
+
.b8 109
|
555 |
+
.b8 112
|
556 |
+
.b8 47
|
557 |
+
.b8 116
|
558 |
+
.b8 111
|
559 |
+
.b8 114
|
560 |
+
.b8 99
|
561 |
+
.b8 104
|
562 |
+
.b8 105
|
563 |
+
.b8 110
|
564 |
+
.b8 100
|
565 |
+
.b8 117
|
566 |
+
.b8 99
|
567 |
+
.b8 116
|
568 |
+
.b8 111
|
569 |
+
.b8 114
|
570 |
+
.b8 95
|
571 |
+
.b8 114
|
572 |
+
.b8 111
|
573 |
+
.b8 111
|
574 |
+
.b8 116
|
575 |
+
.b8 47
|
576 |
+
.b8 112
|
577 |
+
.b8 119
|
578 |
+
.b8 0
|
579 |
+
.b8 1
|
580 |
+
.b64 $L__func_begin0
|
581 |
+
.b64 $L__func_end0
|
582 |
+
.b8 2
|
583 |
+
.b8 116
|
584 |
+
.b8 114
|
585 |
+
.b8 105
|
586 |
+
.b8 116
|
587 |
+
.b8 111
|
588 |
+
.b8 110
|
589 |
+
.b8 95
|
590 |
+
.b8 95
|
591 |
+
.b8 48
|
592 |
+
.b8 100
|
593 |
+
.b8 49
|
594 |
+
.b8 100
|
595 |
+
.b8 50
|
596 |
+
.b8 100
|
597 |
+
.b8 51
|
598 |
+
.b8 100
|
599 |
+
.b8 52
|
600 |
+
.b8 100
|
601 |
+
.b8 53
|
602 |
+
.b8 100
|
603 |
+
.b8 54
|
604 |
+
.b8 100
|
605 |
+
.b8 101
|
606 |
+
.b8 55
|
607 |
+
.b8 100
|
608 |
+
.b8 101
|
609 |
+
.b8 0
|
610 |
+
.b8 116
|
611 |
+
.b8 114
|
612 |
+
.b8 105
|
613 |
+
.b8 116
|
614 |
+
.b8 111
|
615 |
+
.b8 110
|
616 |
+
.b8 95
|
617 |
+
.b8 95
|
618 |
+
.b8 48
|
619 |
+
.b8 100
|
620 |
+
.b8 49
|
621 |
+
.b8 100
|
622 |
+
.b8 50
|
623 |
+
.b8 100
|
624 |
+
.b8 51
|
625 |
+
.b8 100
|
626 |
+
.b8 52
|
627 |
+
.b8 100
|
628 |
+
.b8 53
|
629 |
+
.b8 100
|
630 |
+
.b8 54
|
631 |
+
.b8 100
|
632 |
+
.b8 101
|
633 |
+
.b8 55
|
634 |
+
.b8 100
|
635 |
+
.b8 101
|
636 |
+
.b8 0
|
637 |
+
.b8 1
|
638 |
+
.b8 18
|
639 |
+
.b8 1
|
640 |
+
.b8 1
|
641 |
+
.b8 3
|
642 |
+
.b64 $L__func_begin0
|
643 |
+
.b64 $L__func_end0
|
644 |
+
.b8 1
|
645 |
+
.b8 156
|
646 |
+
.b32 125
|
647 |
+
.b8 4
|
648 |
+
.b32 125
|
649 |
+
.b64 $L__tmp1
|
650 |
+
.b64 $L__tmp14
|
651 |
+
.b8 2
|
652 |
+
.b8 45
|
653 |
+
.b8 59
|
654 |
+
.b8 5
|
655 |
+
.b32 125
|
656 |
+
.b64 $L__tmp1
|
657 |
+
.b64 $L__tmp14
|
658 |
+
.b8 2
|
659 |
+
.b8 243
|
660 |
+
.b8 36
|
661 |
+
.b8 0
|
662 |
+
.b8 5
|
663 |
+
.b32 125
|
664 |
+
.b64 $L__tmp2
|
665 |
+
.b64 $L__tmp15
|
666 |
+
.b8 2
|
667 |
+
.b8 45
|
668 |
+
.b8 59
|
669 |
+
.b8 5
|
670 |
+
.b32 125
|
671 |
+
.b64 $L__tmp15
|
672 |
+
.b64 $L__tmp16
|
673 |
+
.b8 3
|
674 |
+
.b8 45
|
675 |
+
.b8 45
|
676 |
+
.b8 5
|
677 |
+
.b32 125
|
678 |
+
.b64 $L__tmp17
|
679 |
+
.b64 $L__tmp32
|
680 |
+
.b8 2
|
681 |
+
.b8 53
|
682 |
+
.b8 59
|
683 |
+
.b8 4
|
684 |
+
.b32 125
|
685 |
+
.b64 $L__tmp18
|
686 |
+
.b64 $L__tmp31
|
687 |
+
.b8 2
|
688 |
+
.b8 53
|
689 |
+
.b8 59
|
690 |
+
.b8 5
|
691 |
+
.b32 125
|
692 |
+
.b64 $L__tmp18
|
693 |
+
.b64 $L__tmp31
|
694 |
+
.b8 2
|
695 |
+
.b8 243
|
696 |
+
.b8 36
|
697 |
+
.b8 0
|
698 |
+
.b8 5
|
699 |
+
.b32 125
|
700 |
+
.b64 $L__tmp32
|
701 |
+
.b64 $L__tmp33
|
702 |
+
.b8 3
|
703 |
+
.b8 53
|
704 |
+
.b8 45
|
705 |
+
.b8 0
|
706 |
+
.b8 0
|
707 |
+
}
|
708 |
+
.section .debug_pubnames
|
709 |
+
{
|
710 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
711 |
+
$L__pubNames_start0:
|
712 |
+
.b8 2
|
713 |
+
.b8 0
|
714 |
+
.b32 .debug_info
|
715 |
+
.b32 403
|
716 |
+
.b32 125
|
717 |
+
.b8 116
|
718 |
+
.b8 114
|
719 |
+
.b8 105
|
720 |
+
.b8 116
|
721 |
+
.b8 111
|
722 |
+
.b8 110
|
723 |
+
.b8 95
|
724 |
+
.b8 95
|
725 |
+
.b8 48
|
726 |
+
.b8 100
|
727 |
+
.b8 49
|
728 |
+
.b8 100
|
729 |
+
.b8 50
|
730 |
+
.b8 100
|
731 |
+
.b8 51
|
732 |
+
.b8 100
|
733 |
+
.b8 52
|
734 |
+
.b8 100
|
735 |
+
.b8 53
|
736 |
+
.b8 100
|
737 |
+
.b8 54
|
738 |
+
.b8 100
|
739 |
+
.b8 101
|
740 |
+
.b8 55
|
741 |
+
.b8 100
|
742 |
+
.b8 101
|
743 |
+
.b8 0
|
744 |
+
.b32 0
|
745 |
+
$L__pubNames_end0:
|
746 |
+
}
|
747 |
+
.section .debug_pubtypes
|
748 |
+
{
|
749 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
750 |
+
$L__pubTypes_start0:
|
751 |
+
.b8 2
|
752 |
+
.b8 0
|
753 |
+
.b32 .debug_info
|
754 |
+
.b32 403
|
755 |
+
.b32 0
|
756 |
+
$L__pubTypes_end0:
|
757 |
+
}
|
758 |
+
.section .debug_loc { }
|
.triton/dump/d7a12c0ba96f8920b8147157303ee99f/triton_.cubin
ADDED
Binary file (14.1 kB)
.triton/dump/d7a12c0ba96f8920b8147157303ee99f/triton_.llir
ADDED
@@ -0,0 +1,310 @@
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
@global_smem = external addrspace(3) global [0 x i8]
|
5 |
+
@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
|
6 |
+
|
7 |
+
define void @triton__0d1d2d3d4d5de6de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i32 %5, i32 %6) local_unnamed_addr !dbg !7 {
|
8 |
+
%8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
|
9 |
+
%9 = and i32 %8, 31, !dbg !10
|
10 |
+
%10 = lshr i32 %8, 5, !dbg !10
|
11 |
+
%11 = and i32 %10, 1, !dbg !10
|
12 |
+
%urem = shl i32 %8, 2, !dbg !10
|
13 |
+
%12 = and i32 %urem, 252, !dbg !10
|
14 |
+
%13 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !11
|
15 |
+
%14 = shl i32 %13, 8, !dbg !12
|
16 |
+
%15 = or i32 %14, %12, !dbg !13
|
17 |
+
%16 = sext i32 %15 to i64, !dbg !14
|
18 |
+
%17 = getelementptr float, ptr addrspace(1) %0, i64 %16, !dbg !14
|
19 |
+
%18 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %17, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !15
|
20 |
+
%19 = extractvalue { i32, i32, i32, i32 } %18, 0, !dbg !15
|
21 |
+
%20 = extractvalue { i32, i32, i32, i32 } %18, 1, !dbg !15
|
22 |
+
%21 = extractvalue { i32, i32, i32, i32 } %18, 2, !dbg !15
|
23 |
+
%22 = extractvalue { i32, i32, i32, i32 } %18, 3, !dbg !15
|
24 |
+
%23 = bitcast i32 %21 to float, !dbg !15
|
25 |
+
%24 = bitcast i32 %22 to float, !dbg !15
|
26 |
+
%25 = getelementptr i16, ptr addrspace(1) %1, i64 %16, !dbg !16
|
27 |
+
%26 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %25, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !17
|
28 |
+
%27 = extractvalue { i32, i32 } %26, 0, !dbg !17
|
29 |
+
%28 = extractvalue { i32, i32 } %26, 1, !dbg !17
|
30 |
+
%29 = trunc i32 %27 to i16, !dbg !17
|
31 |
+
%extelt.offset = lshr i32 %27, 16, !dbg !17
|
32 |
+
%30 = trunc i32 %extelt.offset to i16, !dbg !17
|
33 |
+
%31 = trunc i32 %28 to i16, !dbg !17
|
34 |
+
%extelt.offset1 = lshr i32 %28, 16, !dbg !17
|
35 |
+
%32 = trunc i32 %extelt.offset1 to i16, !dbg !17
|
36 |
+
%33 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %29) #6, !dbg !18
|
37 |
+
%34 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %30) #6, !dbg !18
|
38 |
+
%35 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %31) #6, !dbg !18
|
39 |
+
%36 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %32) #6, !dbg !18
|
40 |
+
%37 = getelementptr i16, ptr addrspace(1) %2, i64 %16, !dbg !19
|
41 |
+
%38 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %37, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !20
|
42 |
+
%39 = extractvalue { i32, i32 } %38, 0, !dbg !20
|
43 |
+
%40 = extractvalue { i32, i32 } %38, 1, !dbg !20
|
44 |
+
%41 = trunc i32 %39 to i16, !dbg !20
|
45 |
+
%extelt.offset2 = lshr i32 %39, 16, !dbg !20
|
46 |
+
%42 = trunc i32 %extelt.offset2 to i16, !dbg !20
|
47 |
+
%43 = trunc i32 %40 to i16, !dbg !20
|
48 |
+
%extelt.offset3 = lshr i32 %40, 16, !dbg !20
|
49 |
+
%44 = trunc i32 %extelt.offset3 to i16, !dbg !20
|
50 |
+
%45 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %41) #6, !dbg !21
|
51 |
+
%46 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %42) #6, !dbg !21
|
52 |
+
%47 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %43) #6, !dbg !21
|
53 |
+
%48 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %44) #6, !dbg !21
|
54 |
+
%49 = zext nneg i32 %12 to i64, !dbg !22
|
55 |
+
%50 = getelementptr float, ptr addrspace(1) %3, i64 %49, !dbg !22
|
56 |
+
%51 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %50, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !23
|
57 |
+
%52 = fadd float %35, %23, !dbg !24
|
58 |
+
%53 = fadd float %36, %24, !dbg !24
|
59 |
+
%54 = insertelement <2 x i32> poison, i32 %19, i64 0, !dbg !15
|
60 |
+
%55 = insertelement <2 x i32> %54, i32 %20, i64 1, !dbg !15
|
61 |
+
%56 = bitcast <2 x i32> %55 to <2 x float>, !dbg !15
|
62 |
+
%57 = insertelement <2 x float> poison, float %33, i64 0, !dbg !24
|
63 |
+
%58 = insertelement <2 x float> %57, float %34, i64 1, !dbg !24
|
64 |
+
%59 = fadd <2 x float> %58, %56, !dbg !24
|
65 |
+
%60 = insertelement <2 x float> poison, float %45, i64 0, !dbg !25
|
66 |
+
%61 = insertelement <2 x float> %60, float %46, i64 1, !dbg !25
|
67 |
+
%62 = fadd <2 x float> %59, %61, !dbg !25
|
68 |
+
%63 = fadd float %52, %47, !dbg !25
|
69 |
+
%64 = fadd float %53, %48, !dbg !25
|
70 |
+
%65 = extractelement <2 x float> %62, i64 0, !dbg !26
|
71 |
+
%66 = extractelement <2 x float> %62, i64 1, !dbg !26
|
72 |
+
%67 = fadd float %65, %66, !dbg !26
|
73 |
+
%68 = fadd float %67, %63, !dbg !26
|
74 |
+
%69 = fadd float %68, %64, !dbg !26
|
75 |
+
%70 = bitcast float %69 to i32, !dbg !32
|
76 |
+
%71 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %70, i32 16, i32 31), !dbg !32
|
77 |
+
%72 = bitcast i32 %71 to float, !dbg !32
|
78 |
+
%73 = fadd float %69, %72, !dbg !26
|
79 |
+
%74 = bitcast float %73 to i32, !dbg !32
|
80 |
+
%75 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %74, i32 8, i32 31), !dbg !32
|
81 |
+
%76 = bitcast i32 %75 to float, !dbg !32
|
82 |
+
%77 = fadd float %73, %76, !dbg !26
|
83 |
+
%78 = bitcast float %77 to i32, !dbg !32
|
84 |
+
%79 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %78, i32 4, i32 31), !dbg !32
|
85 |
+
%80 = bitcast i32 %79 to float, !dbg !32
|
86 |
+
%81 = fadd float %77, %80, !dbg !26
|
87 |
+
%82 = bitcast float %81 to i32, !dbg !32
|
88 |
+
%83 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %82, i32 2, i32 31), !dbg !32
|
89 |
+
%84 = bitcast i32 %83 to float, !dbg !32
|
90 |
+
%85 = fadd float %81, %84, !dbg !26
|
91 |
+
%86 = bitcast float %85 to i32, !dbg !32
|
92 |
+
%87 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %86, i32 1, i32 31), !dbg !32
|
93 |
+
%88 = bitcast i32 %87 to float, !dbg !32
|
94 |
+
%89 = fadd float %85, %88, !dbg !26
|
95 |
+
%90 = icmp eq i32 %9, 0, !dbg !32
|
96 |
+
%91 = zext nneg i32 %11 to i64, !dbg !32
|
97 |
+
%92 = getelementptr float, ptr addrspace(3) @global_smem, i64 %91, !dbg !32
|
98 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %92, float %89, i1 %90) #6, !dbg !32
|
99 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !32
|
100 |
+
%93 = icmp slt i32 %8, 2, !dbg !32
|
101 |
+
%94 = sext i32 %8 to i64, !dbg !32
|
102 |
+
%95 = getelementptr float, ptr addrspace(3) @global_smem, i64 %94, !dbg !32
|
103 |
+
%96 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %95, i1 %93) #6, !dbg !32
|
104 |
+
%97 = bitcast float %96 to i32, !dbg !32
|
105 |
+
%98 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %97, i32 1, i32 31), !dbg !32
|
106 |
+
%99 = bitcast i32 %98 to float, !dbg !32
|
107 |
+
%100 = fadd float %96, %99, !dbg !26
|
108 |
+
%101 = and i32 %8, 1, !dbg !32
|
109 |
+
%102 = icmp eq i32 %101, 0, !dbg !32
|
110 |
+
%103 = and i1 %93, %102, !dbg !32
|
111 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %95, float %100, i1 %103) #6, !dbg !32
|
112 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !32
|
113 |
+
%104 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !32
|
114 |
+
%105 = fadd float %104, 0.000000e+00, !dbg !34
|
115 |
+
%106 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %105, float 2.560000e+02) #6, !dbg !38
|
116 |
+
%107 = fsub float %65, %106, !dbg !39
|
117 |
+
%108 = fsub float %66, %106, !dbg !39
|
118 |
+
%109 = fsub float %63, %106, !dbg !39
|
119 |
+
%110 = fsub float %64, %106, !dbg !39
|
120 |
+
%111 = fmul float %107, %107, !dbg !40
|
121 |
+
%112 = fmul float %108, %108, !dbg !40
|
122 |
+
%113 = fmul float %109, %109, !dbg !40
|
123 |
+
%114 = fmul float %110, %110, !dbg !40
|
124 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !41
|
125 |
+
%115 = fadd float %111, %112, !dbg !43
|
126 |
+
%116 = fadd float %113, %115, !dbg !43
|
127 |
+
%117 = fadd float %114, %116, !dbg !43
|
128 |
+
%118 = bitcast float %117 to i32, !dbg !41
|
129 |
+
%119 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %118, i32 16, i32 31), !dbg !41
|
130 |
+
%120 = bitcast i32 %119 to float, !dbg !41
|
131 |
+
%121 = fadd float %117, %120, !dbg !43
|
132 |
+
%122 = bitcast float %121 to i32, !dbg !41
|
133 |
+
%123 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %122, i32 8, i32 31), !dbg !41
|
134 |
+
%124 = bitcast i32 %123 to float, !dbg !41
|
135 |
+
%125 = fadd float %121, %124, !dbg !43
|
136 |
+
%126 = bitcast float %125 to i32, !dbg !41
|
137 |
+
%127 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %126, i32 4, i32 31), !dbg !41
|
138 |
+
%128 = bitcast i32 %127 to float, !dbg !41
|
139 |
+
%129 = fadd float %125, %128, !dbg !43
|
140 |
+
%130 = bitcast float %129 to i32, !dbg !41
|
141 |
+
%131 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %130, i32 2, i32 31), !dbg !41
|
142 |
+
%132 = bitcast i32 %131 to float, !dbg !41
|
143 |
+
%133 = fadd float %129, %132, !dbg !43
|
144 |
+
%134 = bitcast float %133 to i32, !dbg !41
|
145 |
+
%135 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %134, i32 1, i32 31), !dbg !41
|
146 |
+
%136 = bitcast i32 %135 to float, !dbg !41
|
147 |
+
%137 = fadd float %133, %136, !dbg !43
|
148 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %92, float %137, i1 %90) #6, !dbg !41
|
149 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !41
|
150 |
+
%138 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %95, i1 %93) #6, !dbg !41
|
151 |
+
%139 = bitcast float %138 to i32, !dbg !41
|
152 |
+
%140 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %139, i32 1, i32 31), !dbg !41
|
153 |
+
%141 = bitcast i32 %140 to float, !dbg !41
|
154 |
+
%142 = fadd float %138, %141, !dbg !43
|
155 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %95, float %142, i1 %103) #6, !dbg !41
|
156 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !41
|
157 |
+
%143 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !41
|
158 |
+
%144 = fadd float %143, 0.000000e+00, !dbg !46
|
159 |
+
%145 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %144, float 2.560000e+02) #6, !dbg !48
|
160 |
+
%146 = fadd float %145, 0x3EE4F8B580000000, !dbg !49
|
161 |
+
%147 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !50
|
162 |
+
%.not.i = icmp eq i32 %147, 0, !dbg !50
|
163 |
+
br i1 %.not.i, label %150, label %148, !dbg !50
|
164 |
+
|
165 |
+
148: ; preds = %7
|
166 |
+
%149 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %146), !dbg !50
|
167 |
+
br label %__nv_rsqrtf.exit, !dbg !50
|
168 |
+
|
169 |
+
150: ; preds = %7
|
170 |
+
%151 = tail call float @llvm.nvvm.rsqrt.approx.f(float %146), !dbg !50
|
171 |
+
br label %__nv_rsqrtf.exit, !dbg !50
|
172 |
+
|
173 |
+
__nv_rsqrtf.exit: ; preds = %148, %150
|
174 |
+
%.0.i = phi float [ %149, %148 ], [ %151, %150 ], !dbg !50
|
175 |
+
%152 = extractvalue { i32, i32, i32, i32 } %51, 3, !dbg !23
|
176 |
+
%153 = bitcast i32 %152 to float, !dbg !23
|
177 |
+
%154 = extractvalue { i32, i32, i32, i32 } %51, 2, !dbg !23
|
178 |
+
%155 = bitcast i32 %154 to float, !dbg !23
|
179 |
+
%156 = extractvalue { i32, i32, i32, i32 } %51, 1, !dbg !23
|
180 |
+
%157 = bitcast i32 %156 to float, !dbg !23
|
181 |
+
%158 = extractvalue { i32, i32, i32, i32 } %51, 0, !dbg !23
|
182 |
+
%159 = bitcast i32 %158 to float, !dbg !23
|
183 |
+
%160 = fmul float %107, %.0.i, !dbg !51
|
184 |
+
%161 = fmul float %108, %.0.i, !dbg !51
|
185 |
+
%162 = fmul float %109, %.0.i, !dbg !51
|
186 |
+
%163 = fmul float %110, %.0.i, !dbg !51
|
187 |
+
%164 = fmul float %160, %159, !dbg !52
|
188 |
+
%165 = fmul float %161, %157, !dbg !52
|
189 |
+
%166 = fmul float %162, %155, !dbg !52
|
190 |
+
%167 = fmul float %163, %153, !dbg !52
|
191 |
+
%168 = getelementptr i16, ptr addrspace(1) %4, i64 %16, !dbg !53
|
192 |
+
%169 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %164) #6, !dbg !54
|
193 |
+
%170 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %165) #6, !dbg !54
|
194 |
+
%171 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %166) #6, !dbg !54
|
195 |
+
%172 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %167) #6, !dbg !54
|
196 |
+
%173 = insertelement <2 x i16> undef, i16 %169, i64 0, !dbg !54
|
197 |
+
%174 = insertelement <2 x i16> %173, i16 %170, i64 1, !dbg !54
|
198 |
+
%175 = bitcast <2 x i16> %174 to i32, !dbg !54
|
199 |
+
%176 = insertelement <2 x i16> undef, i16 %171, i64 0, !dbg !54
|
200 |
+
%177 = insertelement <2 x i16> %176, i16 %172, i64 1, !dbg !54
|
201 |
+
%178 = bitcast <2 x i16> %177 to i32, !dbg !54
|
202 |
+
tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %175, i32 %178, ptr addrspace(1) %168, i1 true) #6, !dbg !54
|
203 |
+
ret void, !dbg !55
|
204 |
+
}
|
205 |
+
|
206 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
207 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
208 |
+
|
209 |
+
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
|
210 |
+
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
|
211 |
+
|
212 |
+
; Function Attrs: convergent nocallback nounwind
|
213 |
+
declare void @llvm.nvvm.barrier0() #2
|
214 |
+
|
215 |
+
; Function Attrs: alwaysinline nounwind
|
216 |
+
define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
|
217 |
+
%1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
|
218 |
+
%.not = icmp eq i32 %1, 0
|
219 |
+
br i1 %.not, label %4, label %2
|
220 |
+
|
221 |
+
2: ; preds = %0
|
222 |
+
%3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
|
223 |
+
br label %6
|
224 |
+
|
225 |
+
4: ; preds = %0
|
226 |
+
%5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
|
227 |
+
br label %6
|
228 |
+
|
229 |
+
6: ; preds = %4, %2
|
230 |
+
%.0 = phi float [ %3, %2 ], [ %5, %4 ]
|
231 |
+
ret float %.0
|
232 |
+
}
|
233 |
+
|
234 |
+
declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
|
235 |
+
|
236 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
|
237 |
+
declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
|
238 |
+
|
239 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
|
240 |
+
declare float @llvm.nvvm.rsqrt.approx.f(float) #5
|
241 |
+
|
242 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
243 |
+
attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
|
244 |
+
attributes #2 = { convergent nocallback nounwind }
|
245 |
+
attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
246 |
+
attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
247 |
+
attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
|
248 |
+
attributes #6 = { nounwind }
|
249 |
+
|
250 |
+
!llvm.module.flags = !{!0, !1}
|
251 |
+
!llvm.dbg.cu = !{!2}
|
252 |
+
!nvvm.annotations = !{!4, !5, !5, !4}
|
253 |
+
!llvm.ident = !{!6}
|
254 |
+
|
255 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
256 |
+
!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
|
257 |
+
!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
258 |
+
!3 = !DIFile(filename: "cdohrmmhfsykzlva6pepxaa7gf7klw7w5jzorpspyaldhfg3acr2.py", directory: "/tmp/torchinductor_root/do")
|
259 |
+
!4 = !{ptr @triton__0d1d2d3d4d5de6de, !"kernel", i32 1}
|
260 |
+
!5 = !{ptr @triton__0d1d2d3d4d5de6de, !"maxntidx", i32 64}
|
261 |
+
!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
|
262 |
+
!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5de6de", linkageName: "triton__0d1d2d3d4d5de6de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
|
263 |
+
!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
|
264 |
+
!9 = !{}
|
265 |
+
!10 = !DILocation(line: 26, column: 26, scope: !7)
|
266 |
+
!11 = !DILocation(line: 23, column: 28, scope: !7)
|
267 |
+
!12 = !DILocation(line: 30, column: 40, scope: !7)
|
268 |
+
!13 = !DILocation(line: 30, column: 36, scope: !7)
|
269 |
+
!14 = !DILocation(line: 30, column: 30, scope: !7)
|
270 |
+
!15 = !DILocation(line: 30, column: 46, scope: !7)
|
271 |
+
!16 = !DILocation(line: 31, column: 30, scope: !7)
|
272 |
+
!17 = !DILocation(line: 31, column: 46, scope: !7)
|
273 |
+
!18 = !DILocation(line: 31, column: 67, scope: !7)
|
274 |
+
!19 = !DILocation(line: 32, column: 30, scope: !7)
|
275 |
+
!20 = !DILocation(line: 32, column: 46, scope: !7)
|
276 |
+
!21 = !DILocation(line: 32, column: 67, scope: !7)
|
277 |
+
!22 = !DILocation(line: 33, column: 31, scope: !7)
|
278 |
+
!23 = !DILocation(line: 33, column: 36, scope: !7)
|
279 |
+
!24 = !DILocation(line: 35, column: 18, scope: !7)
|
280 |
+
!25 = !DILocation(line: 37, column: 18, scope: !7)
|
281 |
+
!26 = !DILocation(line: 233, column: 15, scope: !27, inlinedAt: !30)
|
282 |
+
!27 = distinct !DILexicalBlockFile(scope: !29, file: !28, discriminator: 0)
|
283 |
+
!28 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
|
284 |
+
!29 = distinct !DILexicalBlockFile(scope: !7, file: !28, discriminator: 0)
|
285 |
+
!30 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !31)
|
286 |
+
!31 = !DILocation(line: 42, column: 59, scope: !27)
|
287 |
+
!32 = !DILocation(line: 243, column: 36, scope: !29, inlinedAt: !33)
|
288 |
+
!33 = !DILocation(line: 42, column: 59, scope: !29)
|
289 |
+
!34 = !DILocation(line: 8, column: 15, scope: !35, inlinedAt: !37)
|
290 |
+
!35 = distinct !DILexicalBlockFile(scope: !7, file: !36, discriminator: 0)
|
291 |
+
!36 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
|
292 |
+
!37 = !DILocation(line: 42, column: 45, scope: !35)
|
293 |
+
!38 = !DILocation(line: 45, column: 20, scope: !7)
|
294 |
+
!39 = !DILocation(line: 46, column: 19, scope: !7)
|
295 |
+
!40 = !DILocation(line: 47, column: 20, scope: !7)
|
296 |
+
!41 = !DILocation(line: 243, column: 36, scope: !29, inlinedAt: !42)
|
297 |
+
!42 = !DILocation(line: 50, column: 59, scope: !29)
|
298 |
+
!43 = !DILocation(line: 233, column: 15, scope: !27, inlinedAt: !44)
|
299 |
+
!44 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !45)
|
300 |
+
!45 = !DILocation(line: 50, column: 59, scope: !27)
|
301 |
+
!46 = !DILocation(line: 8, column: 15, scope: !35, inlinedAt: !47)
|
302 |
+
!47 = !DILocation(line: 50, column: 45, scope: !35)
|
303 |
+
!48 = !DILocation(line: 53, column: 20, scope: !7)
|
304 |
+
!49 = !DILocation(line: 55, column: 20, scope: !7)
|
305 |
+
!50 = !DILocation(line: 56, column: 26, scope: !7)
|
306 |
+
!51 = !DILocation(line: 57, column: 20, scope: !7)
|
307 |
+
!52 = !DILocation(line: 58, column: 20, scope: !7)
|
308 |
+
!53 = !DILocation(line: 60, column: 25, scope: !7)
|
309 |
+
!54 = !DILocation(line: 60, column: 48, scope: !7)
|
310 |
+
!55 = !DILocation(line: 60, column: 4, scope: !7)
|
.triton/dump/ea4af10b775ebbfe0bdcca18fd6288c8/triton_.ttir
ADDED
@@ -0,0 +1,153 @@
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
3 |
+
%cst = arith.constant dense<0.000000e+00> : tensor<16x128xbf16>
|
4 |
+
%cst_0 = arith.constant 0.000000e+00 : f32
|
5 |
+
%cst_1 = arith.constant dense<1.000000e+00> : tensor<16x128xf32>
|
6 |
+
%c256_i32 = arith.constant 256 : i32
|
7 |
+
%c128_i32 = arith.constant 128 : i32
|
8 |
+
%c0_i32 = arith.constant 0 : i32
|
9 |
+
%cst_2 = arith.constant dense<256> : tensor<16x1xi64>
|
10 |
+
%cst_3 = arith.constant dense<0> : tensor<16x1xi64>
|
11 |
+
%cst_4 = arith.constant dense<50257> : tensor<16x1xi64>
|
12 |
+
%cst_5 = arith.constant dense<9.99999974E-6> : tensor<16x1xf32>
|
13 |
+
%cst_6 = arith.constant dense<2.560000e+02> : tensor<16x1xf32>
|
14 |
+
%cst_7 = arith.constant dense<0.000000e+00> : tensor<1x128xf32>
|
15 |
+
%cst_8 = arith.constant dense<0.000000e+00> : tensor<16x128xf32>
|
16 |
+
%cst_9 = arith.constant dense<256> : tensor<16x1xi32>
|
17 |
+
%cst_10 = arith.constant dense<256> : tensor<1x128xi32>
|
18 |
+
%cst_11 = arith.constant dense<512> : tensor<16x1xi32>
|
19 |
+
%c16_i32 = arith.constant 16 : i32
|
20 |
+
%0 = tt.get_program_id x : i32
|
21 |
+
%1 = arith.muli %0, %c16_i32 : i32
|
22 |
+
%2 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32>
|
23 |
+
%3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<16xi32>) -> tensor<16x1xi32>
|
24 |
+
%4 = tt.splat %1 : (i32) -> tensor<16x1xi32>
|
25 |
+
%5 = arith.addi %4, %3 : tensor<16x1xi32>
|
26 |
+
%6 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32>
|
27 |
+
%7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<128xi32>) -> tensor<1x128xi32>
|
28 |
+
%8 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<16x1x!tt.ptr<i64, 1>>
|
29 |
+
%9 = tt.addptr %8, %5 : tensor<16x1x!tt.ptr<i64, 1>>, tensor<16x1xi32>
|
30 |
+
%10 = tt.load %9 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x1xi64>
|
31 |
+
%11 = arith.remsi %5, %cst_11 : tensor<16x1xi32>
|
32 |
+
%12 = arith.muli %11, %cst_9 : tensor<16x1xi32>
|
33 |
+
%13 = tt.broadcast %12 : (tensor<16x1xi32>) -> tensor<16x128xi32>
|
34 |
+
%14 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<16x128x!tt.ptr<f32, 1>>
|
35 |
+
%15 = arith.muli %5, %cst_9 : tensor<16x1xi32>
|
36 |
+
%16 = tt.broadcast %15 : (tensor<16x1xi32>) -> tensor<16x128xi32>
|
37 |
+
%17 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<16x128x!tt.ptr<bf16, 1>>
|
38 |
+
%18 = arith.addi %10, %cst_4 : tensor<16x1xi64>
|
39 |
+
%19 = arith.cmpi slt, %10, %cst_3 : tensor<16x1xi64>
|
40 |
+
%20 = arith.select %19, %18, %10 : tensor<16x1xi1>, tensor<16x1xi64>
|
41 |
+
%21 = arith.cmpi sge, %20, %cst_3 : tensor<16x1xi64>
|
42 |
+
%22 = arith.cmpi slt, %20, %cst_4 : tensor<16x1xi64>
|
43 |
+
%23 = arith.andi %21, %22 : tensor<16x1xi1>
|
44 |
+
%24 = arith.muli %20, %cst_2 : tensor<16x1xi64>
|
45 |
+
%25 = tt.broadcast %24 : (tensor<16x1xi64>) -> tensor<16x128xi64>
|
46 |
+
%26 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<16x128x!tt.ptr<f32, 1>>
|
47 |
+
%27:3 = scf.for %arg8 = %c0_i32 to %c256_i32 step %c128_i32 iter_args(%arg9 = %cst_8, %arg10 = %cst_8, %arg11 = %cst_8) -> (tensor<16x128xf32>, tensor<16x128xf32>, tensor<16x128xf32>) : i32 {
|
48 |
+
%51 = tt.splat %arg8 : (i32) -> tensor<1x128xi32>
|
49 |
+
%52 = arith.addi %51, %7 : tensor<1x128xi32>
|
50 |
+
%53 = arith.cmpi slt, %52, %cst_10 : tensor<1x128xi32>
|
51 |
+
%54 = tt.broadcast %52 : (tensor<1x128xi32>) -> tensor<16x128xi32>
|
52 |
+
%55 = arith.addi %54, %13 : tensor<16x128xi32>
|
53 |
+
%56 = tt.addptr %14, %55 : tensor<16x128x!tt.ptr<f32, 1>>, tensor<16x128xi32>
|
54 |
+
%57 = tt.broadcast %53 : (tensor<1x128xi1>) -> tensor<16x128xi1>
|
55 |
+
%58 = tt.load %56, %57, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x128xf32>
|
56 |
+
%59 = arith.addi %54, %16 : tensor<16x128xi32>
|
57 |
+
%60 = tt.addptr %17, %59 : tensor<16x128x!tt.ptr<bf16, 1>>, tensor<16x128xi32>
|
58 |
+
%61 = tt.load %60, %57, %cst {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x128xbf16>
|
59 |
+
%62 = arith.extf %61 : tensor<16x128xbf16> to tensor<16x128xf32>
|
60 |
+
tt.assert %23, "index out of bounds: 0 <= tmp3 < 50257", "/usr/local/lib/python3.10/dist-packages/torch/_inductor/codecache.py", "<module>", 1892 : tensor<16x1xi1>
|
61 |
+
%63 = arith.extsi %52 : tensor<1x128xi32> to tensor<1x128xi64>
|
62 |
+
%64 = tt.broadcast %63 : (tensor<1x128xi64>) -> tensor<16x128xi64>
|
63 |
+
%65 = arith.addi %64, %25 : tensor<16x128xi64>
|
64 |
+
%66 = tt.addptr %26, %65 : tensor<16x128x!tt.ptr<f32, 1>>, tensor<16x128xi64>
|
65 |
+
%67 = tt.load %66, %57, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x128xf32>
|
66 |
+
%68 = arith.addf %67, %58 : tensor<16x128xf32>
|
67 |
+
%69 = arith.addf %68, %62 : tensor<16x128xf32>
|
68 |
+
%70 = arith.subf %69, %arg9 : tensor<16x128xf32>
|
69 |
+
%71 = arith.addf %arg11, %cst_1 : tensor<16x128xf32>
|
70 |
+
%72 = arith.divf %70, %71 : tensor<16x128xf32>
|
71 |
+
%73 = arith.addf %arg9, %72 : tensor<16x128xf32>
|
72 |
+
%74 = arith.subf %69, %73 : tensor<16x128xf32>
|
73 |
+
%75 = arith.mulf %70, %74 : tensor<16x128xf32>
|
74 |
+
%76 = arith.addf %arg10, %75 : tensor<16x128xf32>
|
75 |
+
%77 = arith.select %57, %73, %arg9 : tensor<16x128xi1>, tensor<16x128xf32>
|
76 |
+
%78 = arith.select %57, %76, %arg10 : tensor<16x128xi1>, tensor<16x128xf32>
|
77 |
+
%79 = arith.select %57, %71, %arg11 : tensor<16x128xi1>, tensor<16x128xf32>
|
78 |
+
scf.yield %77, %78, %79 : tensor<16x128xf32>, tensor<16x128xf32>, tensor<16x128xf32>
|
79 |
+
}
|
80 |
+
%28:3 = "tt.reduce"(%27#0, %27#1, %27#2) <{axis = 1 : i32}> ({
|
81 |
+
^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32):
|
82 |
+
%51 = arith.subf %arg11, %arg8 : f32
|
83 |
+
%52 = arith.addf %arg10, %arg13 : f32
|
84 |
+
%53 = arith.cmpf oeq, %52, %cst_0 : f32
|
85 |
+
%54 = arith.divf %arg13, %52 : f32
|
86 |
+
%55 = arith.select %53, %cst_0, %54 : f32
|
87 |
+
%56 = arith.mulf %51, %55 : f32
|
88 |
+
%57 = arith.addf %arg8, %56 : f32
|
89 |
+
%58 = arith.addf %arg9, %arg12 : f32
|
90 |
+
%59 = arith.mulf %51, %51 : f32
|
91 |
+
%60 = arith.mulf %59, %arg10 : f32
|
92 |
+
%61 = arith.mulf %60, %55 : f32
|
93 |
+
%62 = arith.addf %58, %61 : f32
|
94 |
+
tt.reduce.return %57, %62, %52 : f32, f32, f32
|
95 |
+
}) : (tensor<16x128xf32>, tensor<16x128xf32>, tensor<16x128xf32>) -> (tensor<16xf32>, tensor<16xf32>, tensor<16xf32>)
|
96 |
+
%29 = tt.expand_dims %28#0 {axis = 1 : i32} : (tensor<16xf32>) -> tensor<16x1xf32>
|
97 |
+
%30 = tt.expand_dims %28#1 {axis = 1 : i32} : (tensor<16xf32>) -> tensor<16x1xf32>
|
98 |
+
%31 = arith.muli %11, %cst_9 : tensor<16x1xi32>
|
99 |
+
%32 = tt.broadcast %31 : (tensor<16x1xi32>) -> tensor<16x128xi32>
|
100 |
+
%33 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<16x128x!tt.ptr<f32, 1>>
|
101 |
+
%34 = arith.muli %5, %cst_9 : tensor<16x1xi32>
|
102 |
+
%35 = tt.broadcast %34 : (tensor<16x1xi32>) -> tensor<16x128xi32>
|
103 |
+
%36 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<16x128x!tt.ptr<bf16, 1>>
|
104 |
+
%37 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<1x128x!tt.ptr<f32, 1>>
|
105 |
+
%38 = arith.addi %10, %cst_4 : tensor<16x1xi64>
|
106 |
+
%39 = arith.cmpi slt, %10, %cst_3 : tensor<16x1xi64>
|
107 |
+
%40 = arith.select %39, %38, %10 : tensor<16x1xi1>, tensor<16x1xi64>
|
108 |
+
%41 = arith.cmpi sge, %40, %cst_3 : tensor<16x1xi64>
|
109 |
+
%42 = arith.cmpi slt, %40, %cst_4 : tensor<16x1xi64>
|
110 |
+
%43 = arith.andi %41, %42 : tensor<16x1xi1>
|
111 |
+
%44 = arith.muli %40, %cst_2 : tensor<16x1xi64>
|
112 |
+
%45 = tt.broadcast %44 : (tensor<16x1xi64>) -> tensor<16x128xi64>
|
113 |
+
%46 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<16x128x!tt.ptr<f32, 1>>
|
114 |
+
%47 = tt.broadcast %29 : (tensor<16x1xf32>) -> tensor<16x128xf32>
|
115 |
+
%48 = arith.divf %30, %cst_6 : tensor<16x1xf32>
|
116 |
+
%49 = arith.addf %48, %cst_5 : tensor<16x1xf32>
|
117 |
+
%50 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<16x128x!tt.ptr<bf16, 1>>
|
118 |
+
scf.for %arg8 = %c0_i32 to %c256_i32 step %c128_i32 : i32 {
|
119 |
+
%51 = tt.splat %arg8 : (i32) -> tensor<1x128xi32>
|
120 |
+
%52 = arith.addi %51, %7 : tensor<1x128xi32>
|
121 |
+
%53 = arith.cmpi slt, %52, %cst_10 : tensor<1x128xi32>
|
122 |
+
%54 = tt.broadcast %52 : (tensor<1x128xi32>) -> tensor<16x128xi32>
|
123 |
+
%55 = arith.addi %54, %32 : tensor<16x128xi32>
|
124 |
+
%56 = tt.addptr %33, %55 : tensor<16x128x!tt.ptr<f32, 1>>, tensor<16x128xi32>
|
125 |
+
%57 = tt.broadcast %53 : (tensor<1x128xi1>) -> tensor<16x128xi1>
|
126 |
+
%58 = tt.load %56, %57, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x128xf32>
|
127 |
+
%59 = arith.addi %54, %35 : tensor<16x128xi32>
|
128 |
+
%60 = tt.addptr %36, %59 : tensor<16x128x!tt.ptr<bf16, 1>>, tensor<16x128xi32>
|
129 |
+
%61 = tt.load %60, %57, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<16x128xbf16>
|
130 |
+
%62 = arith.extf %61 : tensor<16x128xbf16> to tensor<16x128xf32>
|
131 |
+
%63 = tt.addptr %37, %52 : tensor<1x128x!tt.ptr<f32, 1>>, tensor<1x128xi32>
|
132 |
+
%64 = tt.load %63, %53, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x128xf32>
|
133 |
+
tt.assert %43, "index out of bounds: 0 <= tmp16 < 50257", "/usr/local/lib/python3.10/dist-packages/torch/_inductor/codecache.py", "<module>", 1892 : tensor<16x1xi1>
|
134 |
+
%65 = arith.extsi %52 : tensor<1x128xi32> to tensor<1x128xi64>
|
135 |
+
%66 = tt.broadcast %65 : (tensor<1x128xi64>) -> tensor<16x128xi64>
|
136 |
+
%67 = arith.addi %66, %45 : tensor<16x128xi64>
|
137 |
+
%68 = tt.addptr %46, %67 : tensor<16x128x!tt.ptr<f32, 1>>, tensor<16x128xi64>
|
138 |
+
%69 = tt.load %68, %57, %cst_8 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<16x128xf32>
|
139 |
+
%70 = arith.addf %69, %58 : tensor<16x128xf32>
|
140 |
+
%71 = arith.addf %70, %62 : tensor<16x128xf32>
|
141 |
+
%72 = arith.subf %71, %47 : tensor<16x128xf32>
|
142 |
+
%73 = tt.extern_elementwise %49 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<16x1xf32>) -> tensor<16x1xf32>
|
143 |
+
%74 = tt.broadcast %73 : (tensor<16x1xf32>) -> tensor<16x128xf32>
|
144 |
+
%75 = arith.mulf %72, %74 : tensor<16x128xf32>
|
145 |
+
%76 = tt.broadcast %64 : (tensor<1x128xf32>) -> tensor<16x128xf32>
|
146 |
+
%77 = arith.mulf %75, %76 : tensor<16x128xf32>
|
147 |
+
%78 = tt.addptr %50, %59 : tensor<16x128x!tt.ptr<bf16, 1>>, tensor<16x128xi32>
|
148 |
+
%79 = arith.truncf %77 : tensor<16x128xf32> to tensor<16x128xbf16>
|
149 |
+
tt.store %78, %79, %57 {cache = 1 : i32, evict = 1 : i32} : tensor<16x128xbf16>
|
150 |
+
}
|
151 |
+
tt.return
|
152 |
+
}
|
153 |
+
}
|
.triton/dump/fac03406d1136fc802dac111a1efea36/triton_.ptx
ADDED
@@ -0,0 +1,971 @@
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2d3de4
|
10 |
+
.extern .shared .align 1 .b8 global_smem[];
|
11 |
+
|
12 |
+
.visible .entry triton__0d1d2d3de4(
|
13 |
+
.param .u64 triton__0d1d2d3de4_param_0,
|
14 |
+
.param .u64 triton__0d1d2d3de4_param_1,
|
15 |
+
.param .u64 triton__0d1d2d3de4_param_2,
|
16 |
+
.param .u64 triton__0d1d2d3de4_param_3,
|
17 |
+
.param .u64 triton__0d1d2d3de4_param_4
|
18 |
+
)
|
19 |
+
.maxntid 256, 1, 1
|
20 |
+
{
|
21 |
+
.reg .pred %p<91>;
|
22 |
+
.reg .b16 %rs<49>;
|
23 |
+
.reg .b32 %r<84>;
|
24 |
+
.reg .f32 %f<194>;
|
25 |
+
.reg .b64 %rd<75>;
|
26 |
+
.loc 1 18 0
|
27 |
+
$L__func_begin0:
|
28 |
+
.loc 1 18 0
|
29 |
+
|
30 |
+
ld.param.u64 %rd16, [triton__0d1d2d3de4_param_2];
|
31 |
+
ld.param.u64 %rd15, [triton__0d1d2d3de4_param_1];
|
32 |
+
ld.param.u64 %rd18, [triton__0d1d2d3de4_param_0];
|
33 |
+
$L__tmp0:
|
34 |
+
.loc 1 24 33
|
35 |
+
mov.u32 %r1, %tid.x;
|
36 |
+
shr.u32 %r2, %r1, 5;
|
37 |
+
and.b32 %r3, %r1, 255;
|
38 |
+
or.b32 %r8, %r3, 256;
|
39 |
+
or.b32 %r9, %r3, 512;
|
40 |
+
or.b32 %r10, %r3, 768;
|
41 |
+
or.b32 %r11, %r3, 1024;
|
42 |
+
or.b32 %r12, %r3, 1280;
|
43 |
+
or.b32 %r13, %r3, 1536;
|
44 |
+
or.b32 %r14, %r3, 1792;
|
45 |
+
.loc 1 21 28
|
46 |
+
mov.u32 %r7, %ctaid.x;
|
47 |
+
.loc 1 21 34
|
48 |
+
cvt.s64.s32 %rd1, %r7;
|
49 |
+
cvt.u64.u32 %rd2, %r3;
|
50 |
+
cvt.u64.u32 %rd9, %r14;
|
51 |
+
cvt.u64.u32 %rd8, %r13;
|
52 |
+
cvt.u64.u32 %rd7, %r12;
|
53 |
+
cvt.u64.u32 %rd6, %r11;
|
54 |
+
cvt.u64.u32 %rd5, %r10;
|
55 |
+
cvt.u64.u32 %rd4, %r9;
|
56 |
+
cvt.u64.u32 %rd3, %r8;
|
57 |
+
.loc 1 27 36
|
58 |
+
mul.wide.s32 %rd19, %r7, 100514;
|
59 |
+
add.s64 %rd10, %rd18, %rd19;
|
60 |
+
mov.f32 %f178, 0fFF800000;
|
61 |
+
mov.u64 %rd73, 0;
|
62 |
+
mov.u16 %rs2, 0;
|
63 |
+
mov.f32 %f179, %f178;
|
64 |
+
mov.f32 %f180, %f178;
|
65 |
+
mov.f32 %f181, %f178;
|
66 |
+
mov.f32 %f182, %f178;
|
67 |
+
mov.f32 %f183, %f178;
|
68 |
+
mov.f32 %f184, %f178;
|
69 |
+
mov.f32 %f185, %f178;
|
70 |
+
$L__BB0_1:
|
71 |
+
$L__tmp1:
|
72 |
+
.loc 2 38 21
|
73 |
+
setp.num.f32 %p18, %f178, %f178;
|
74 |
+
setp.num.f32 %p19, %f179, %f179;
|
75 |
+
setp.num.f32 %p20, %f180, %f180;
|
76 |
+
setp.num.f32 %p21, %f181, %f181;
|
77 |
+
setp.num.f32 %p22, %f182, %f182;
|
78 |
+
setp.num.f32 %p23, %f183, %f183;
|
79 |
+
setp.num.f32 %p24, %f184, %f184;
|
80 |
+
setp.num.f32 %p25, %f185, %f185;
|
81 |
+
$L__tmp2:
|
82 |
+
.loc 1 28 27
|
83 |
+
or.b64 %rd28, %rd73, %rd2;
|
84 |
+
or.b64 %rd29, %rd73, %rd3;
|
85 |
+
or.b64 %rd30, %rd73, %rd4;
|
86 |
+
or.b64 %rd31, %rd73, %rd5;
|
87 |
+
or.b64 %rd32, %rd73, %rd6;
|
88 |
+
or.b64 %rd33, %rd73, %rd7;
|
89 |
+
or.b64 %rd34, %rd73, %rd8;
|
90 |
+
or.b64 %rd35, %rd73, %rd9;
|
91 |
+
.loc 1 29 25
|
92 |
+
setp.lt.u64 %p17, %rd35, 50257;
|
93 |
+
setp.lt.u64 %p15, %rd34, 50257;
|
94 |
+
setp.lt.u64 %p13, %rd33, 50257;
|
95 |
+
setp.lt.u64 %p11, %rd32, 50257;
|
96 |
+
setp.lt.u64 %p9, %rd31, 50257;
|
97 |
+
setp.lt.u64 %p7, %rd30, 50257;
|
98 |
+
setp.lt.u64 %p5, %rd29, 50257;
|
99 |
+
setp.lt.u64 %p3, %rd28, 50257;
|
100 |
+
.loc 1 31 34
|
101 |
+
shl.b64 %rd36, %rd28, 1;
|
102 |
+
add.s64 %rd20, %rd10, %rd36;
|
103 |
+
shl.b64 %rd37, %rd29, 1;
|
104 |
+
add.s64 %rd21, %rd10, %rd37;
|
105 |
+
shl.b64 %rd38, %rd30, 1;
|
106 |
+
add.s64 %rd22, %rd10, %rd38;
|
107 |
+
shl.b64 %rd39, %rd31, 1;
|
108 |
+
add.s64 %rd23, %rd10, %rd39;
|
109 |
+
shl.b64 %rd40, %rd32, 1;
|
110 |
+
add.s64 %rd24, %rd10, %rd40;
|
111 |
+
shl.b64 %rd41, %rd33, 1;
|
112 |
+
add.s64 %rd25, %rd10, %rd41;
|
113 |
+
shl.b64 %rd42, %rd34, 1;
|
114 |
+
add.s64 %rd26, %rd10, %rd42;
|
115 |
+
shl.b64 %rd43, %rd35, 1;
|
116 |
+
add.s64 %rd27, %rd10, %rd43;
|
117 |
+
.loc 1 31 52
|
118 |
+
mov.u16 %rs1, 0x0;
|
119 |
+
@%p3 ld.global.L1::evict_last.b16 { %rs1 }, [ %rd20 + 0 ];
|
120 |
+
@!%p3 mov.u16 %rs1, %rs2;
|
121 |
+
mov.u16 %rs3, 0x0;
|
122 |
+
@%p5 ld.global.L1::evict_last.b16 { %rs3 }, [ %rd21 + 0 ];
|
123 |
+
@!%p5 mov.u16 %rs3, %rs2;
|
124 |
+
mov.u16 %rs5, 0x0;
|
125 |
+
@%p7 ld.global.L1::evict_last.b16 { %rs5 }, [ %rd22 + 0 ];
|
126 |
+
@!%p7 mov.u16 %rs5, %rs2;
|
127 |
+
mov.u16 %rs7, 0x0;
|
128 |
+
@%p9 ld.global.L1::evict_last.b16 { %rs7 }, [ %rd23 + 0 ];
|
129 |
+
@!%p9 mov.u16 %rs7, %rs2;
|
130 |
+
mov.u16 %rs9, 0x0;
|
131 |
+
@%p11 ld.global.L1::evict_last.b16 { %rs9 }, [ %rd24 + 0 ];
|
132 |
+
@!%p11 mov.u16 %rs9, %rs2;
|
133 |
+
mov.u16 %rs11, 0x0;
|
134 |
+
@%p13 ld.global.L1::evict_last.b16 { %rs11 }, [ %rd25 + 0 ];
|
135 |
+
@!%p13 mov.u16 %rs11, %rs2;
|
136 |
+
mov.u16 %rs13, 0x0;
|
137 |
+
@%p15 ld.global.L1::evict_last.b16 { %rs13 }, [ %rd26 + 0 ];
|
138 |
+
@!%p15 mov.u16 %rs13, %rs2;
|
139 |
+
mov.u16 %rs15, 0x0;
|
140 |
+
@%p17 ld.global.L1::evict_last.b16 { %rs15 }, [ %rd27 + 0 ];
|
141 |
+
@!%p17 mov.u16 %rs15, %rs2;
|
142 |
+
.loc 1 31 103
|
143 |
+
cvt.f32.bf16 %r15, %rs1;
|
144 |
+
mov.b32 %f42, %r15;
|
145 |
+
cvt.f32.bf16 %r16, %rs3;
|
146 |
+
mov.b32 %f43, %r16;
|
147 |
+
cvt.f32.bf16 %r17, %rs5;
|
148 |
+
mov.b32 %f44, %r17;
|
149 |
+
cvt.f32.bf16 %r18, %rs7;
|
150 |
+
mov.b32 %f45, %r18;
|
151 |
+
cvt.f32.bf16 %r19, %rs9;
|
152 |
+
mov.b32 %f46, %r19;
|
153 |
+
cvt.f32.bf16 %r20, %rs11;
|
154 |
+
mov.b32 %f47, %r20;
|
155 |
+
cvt.f32.bf16 %r21, %rs13;
|
156 |
+
mov.b32 %f48, %r21;
|
157 |
+
cvt.f32.bf16 %r22, %rs15;
|
158 |
+
mov.b32 %f49, %r22;
|
159 |
+
$L__tmp3:
|
160 |
+
.loc 2 36 15
|
161 |
+
setp.leu.f32 %p26, %f178, %f42;
|
162 |
+
setp.leu.f32 %p27, %f179, %f43;
|
163 |
+
setp.leu.f32 %p28, %f180, %f44;
|
164 |
+
setp.leu.f32 %p29, %f181, %f45;
|
165 |
+
setp.leu.f32 %p30, %f182, %f46;
|
166 |
+
setp.leu.f32 %p31, %f183, %f47;
|
167 |
+
setp.leu.f32 %p32, %f184, %f48;
|
168 |
+
setp.leu.f32 %p33, %f185, %f49;
|
169 |
+
$L__tmp4:
|
170 |
+
.loc 1 0 0
|
171 |
+
selp.f32 %f50, %f49, %f185, %p33;
|
172 |
+
selp.f32 %f51, %f50, %f185, %p25;
|
173 |
+
selp.f32 %f185, %f51, %f185, %p17;
|
174 |
+
selp.f32 %f52, %f48, %f184, %p32;
|
175 |
+
selp.f32 %f53, %f52, %f184, %p24;
|
176 |
+
selp.f32 %f184, %f53, %f184, %p15;
|
177 |
+
selp.f32 %f54, %f47, %f183, %p31;
|
178 |
+
selp.f32 %f55, %f54, %f183, %p23;
|
179 |
+
selp.f32 %f183, %f55, %f183, %p13;
|
180 |
+
selp.f32 %f56, %f46, %f182, %p30;
|
181 |
+
selp.f32 %f57, %f56, %f182, %p22;
|
182 |
+
selp.f32 %f182, %f57, %f182, %p11;
|
183 |
+
selp.f32 %f58, %f45, %f181, %p29;
|
184 |
+
selp.f32 %f59, %f58, %f181, %p21;
|
185 |
+
selp.f32 %f181, %f59, %f181, %p9;
|
186 |
+
selp.f32 %f60, %f44, %f180, %p28;
|
187 |
+
selp.f32 %f61, %f60, %f180, %p20;
|
188 |
+
selp.f32 %f180, %f61, %f180, %p7;
|
189 |
+
selp.f32 %f62, %f43, %f179, %p27;
|
190 |
+
selp.f32 %f63, %f62, %f179, %p19;
|
191 |
+
selp.f32 %f179, %f63, %f179, %p5;
|
192 |
+
selp.f32 %f64, %f42, %f178, %p26;
|
193 |
+
selp.f32 %f65, %f64, %f178, %p18;
|
194 |
+
selp.f32 %f178, %f65, %f178, %p3;
|
195 |
+
.loc 1 27 36
|
196 |
+
add.s64 %rd73, %rd73, 2048;
|
197 |
+
cvt.u32.u64 %r23, %rd73;
|
198 |
+
add.s32 %r24, %r23, -2048;
|
199 |
+
setp.lt.u32 %p34, %r24, 48209;
|
200 |
+
@%p34 bra $L__BB0_1;
|
201 |
+
.loc 1 24 33
|
202 |
+
and.b32 %r4, %r1, 31;
|
203 |
+
and.b32 %r32, %r2, 7;
|
204 |
+
$L__tmp5:
|
205 |
+
.loc 2 36 15
|
206 |
+
setp.gt.f32 %p39, %f178, %f179;
|
207 |
+
.loc 2 38 21
|
208 |
+
setp.nan.f32 %p40, %f178, %f178;
|
209 |
+
.loc 2 39 29
|
210 |
+
selp.f32 %f74, %f178, %f179, %p40;
|
211 |
+
selp.f32 %f75, %f178, %f74, %p39;
|
212 |
+
.loc 2 36 15
|
213 |
+
setp.gt.f32 %p41, %f75, %f180;
|
214 |
+
.loc 2 38 21
|
215 |
+
setp.nan.f32 %p42, %f75, %f75;
|
216 |
+
.loc 2 39 29
|
217 |
+
selp.f32 %f76, %f75, %f180, %p42;
|
218 |
+
selp.f32 %f77, %f75, %f76, %p41;
|
219 |
+
.loc 2 36 15
|
220 |
+
setp.gt.f32 %p43, %f77, %f181;
|
221 |
+
.loc 2 38 21
|
222 |
+
setp.nan.f32 %p44, %f77, %f77;
|
223 |
+
.loc 2 39 29
|
224 |
+
selp.f32 %f78, %f77, %f181, %p44;
|
225 |
+
selp.f32 %f79, %f77, %f78, %p43;
|
226 |
+
.loc 2 36 15
|
227 |
+
setp.gt.f32 %p45, %f79, %f182;
|
228 |
+
.loc 2 38 21
|
229 |
+
setp.nan.f32 %p46, %f79, %f79;
|
230 |
+
.loc 2 39 29
|
231 |
+
selp.f32 %f80, %f79, %f182, %p46;
|
232 |
+
selp.f32 %f81, %f79, %f80, %p45;
|
233 |
+
.loc 2 36 15
|
234 |
+
setp.gt.f32 %p47, %f81, %f183;
|
235 |
+
.loc 2 38 21
|
236 |
+
setp.nan.f32 %p48, %f81, %f81;
|
237 |
+
.loc 2 39 29
|
238 |
+
selp.f32 %f82, %f81, %f183, %p48;
|
239 |
+
selp.f32 %f83, %f81, %f82, %p47;
|
240 |
+
.loc 2 36 15
|
241 |
+
setp.gt.f32 %p49, %f83, %f184;
|
242 |
+
.loc 2 38 21
|
243 |
+
setp.nan.f32 %p50, %f83, %f83;
|
244 |
+
.loc 2 39 29
|
245 |
+
selp.f32 %f84, %f83, %f184, %p50;
|
246 |
+
selp.f32 %f85, %f83, %f84, %p49;
|
247 |
+
.loc 2 36 15
|
248 |
+
setp.gt.f32 %p51, %f85, %f185;
|
249 |
+
.loc 2 38 21
|
250 |
+
setp.nan.f32 %p52, %f85, %f85;
|
251 |
+
.loc 2 39 29
|
252 |
+
selp.f32 %f86, %f85, %f185, %p52;
|
253 |
+
selp.f32 %f87, %f85, %f86, %p51;
|
254 |
+
$L__tmp6:
|
255 |
+
.loc 2 49 29
|
256 |
+
mov.b32 %r33, %f87;
|
257 |
+
shfl.sync.bfly.b32 %r34, %r33, 16, 31, -1;
|
258 |
+
mov.b32 %f88, %r34;
|
259 |
+
$L__tmp7:
|
260 |
+
.loc 2 36 15
|
261 |
+
setp.gt.f32 %p53, %f87, %f88;
|
262 |
+
.loc 2 38 21
|
263 |
+
setp.nan.f32 %p54, %f87, %f87;
|
264 |
+
.loc 2 39 29
|
265 |
+
selp.f32 %f89, %f87, %f88, %p53;
|
266 |
+
selp.f32 %f90, %f87, %f89, %p54;
|
267 |
+
$L__tmp8:
|
268 |
+
.loc 2 49 29
|
269 |
+
mov.b32 %r35, %f90;
|
270 |
+
shfl.sync.bfly.b32 %r36, %r35, 8, 31, -1;
|
271 |
+
mov.b32 %f91, %r36;
|
272 |
+
$L__tmp9:
|
273 |
+
.loc 2 36 15
|
274 |
+
setp.gt.f32 %p55, %f90, %f91;
|
275 |
+
.loc 2 38 21
|
276 |
+
setp.nan.f32 %p56, %f90, %f90;
|
277 |
+
.loc 2 39 29
|
278 |
+
selp.f32 %f92, %f90, %f91, %p56;
|
279 |
+
selp.f32 %f93, %f90, %f92, %p55;
|
280 |
+
$L__tmp10:
|
281 |
+
.loc 2 49 29
|
282 |
+
mov.b32 %r37, %f93;
|
283 |
+
shfl.sync.bfly.b32 %r38, %r37, 4, 31, -1;
|
284 |
+
mov.b32 %f94, %r38;
|
285 |
+
$L__tmp11:
|
286 |
+
.loc 2 36 15
|
287 |
+
setp.gt.f32 %p57, %f93, %f94;
|
288 |
+
.loc 2 38 21
|
289 |
+
setp.nan.f32 %p58, %f93, %f93;
|
290 |
+
.loc 2 39 29
|
291 |
+
selp.f32 %f95, %f93, %f94, %p58;
|
292 |
+
selp.f32 %f96, %f93, %f95, %p57;
|
293 |
+
$L__tmp12:
|
294 |
+
.loc 2 49 29
|
295 |
+
mov.b32 %r39, %f96;
|
296 |
+
shfl.sync.bfly.b32 %r40, %r39, 2, 31, -1;
|
297 |
+
mov.b32 %f97, %r40;
|
298 |
+
$L__tmp13:
|
299 |
+
.loc 2 36 15
|
300 |
+
setp.gt.f32 %p59, %f96, %f97;
|
301 |
+
.loc 2 38 21
|
302 |
+
setp.nan.f32 %p60, %f96, %f96;
|
303 |
+
.loc 2 39 29
|
304 |
+
selp.f32 %f98, %f96, %f97, %p60;
|
305 |
+
selp.f32 %f99, %f96, %f98, %p59;
|
306 |
+
$L__tmp14:
|
307 |
+
.loc 2 49 29
|
308 |
+
mov.b32 %r41, %f99;
|
309 |
+
shfl.sync.bfly.b32 %r42, %r41, 1, 31, -1;
|
310 |
+
mov.b32 %f100, %r42;
|
311 |
+
$L__tmp15:
|
312 |
+
.loc 2 36 15
|
313 |
+
setp.gt.f32 %p61, %f99, %f100;
|
314 |
+
.loc 2 38 21
|
315 |
+
setp.nan.f32 %p62, %f99, %f99;
|
316 |
+
.loc 2 39 29
|
317 |
+
selp.f32 %f101, %f99, %f100, %p62;
|
318 |
+
selp.f32 %f102, %f99, %f101, %p61;
|
319 |
+
$L__tmp16:
|
320 |
+
.loc 2 49 29
|
321 |
+
setp.eq.s32 %p35, %r4, 0;
|
322 |
+
shl.b32 %r43, %r32, 2;
|
323 |
+
mov.u32 %r44, global_smem;
|
324 |
+
add.s32 %r62, %r44, %r43;
|
325 |
+
mov.b32 %r26, %f102;
|
326 |
+
@%p35 st.shared.b32 [ %r62 + 0 ], %r26;
|
327 |
+
bar.sync 0;
|
328 |
+
setp.lt.s32 %p36, %r1, 8;
|
329 |
+
shl.b32 %r45, %r1, 2;
|
330 |
+
add.s32 %r65, %r44, %r45;
|
331 |
+
@%p36 ld.shared.b32 %r27, [ %r65 + 0 ];
|
332 |
+
mov.b32 %f103, %r27;
|
333 |
+
shfl.sync.bfly.b32 %r46, %r27, 4, 31, -1;
|
334 |
+
mov.b32 %f104, %r46;
|
335 |
+
$L__tmp17:
|
336 |
+
.loc 2 36 15
|
337 |
+
setp.gt.f32 %p63, %f103, %f104;
|
338 |
+
.loc 2 38 21
|
339 |
+
setp.nan.f32 %p64, %f103, %f103;
|
340 |
+
.loc 2 39 29
|
341 |
+
selp.f32 %f105, %f103, %f104, %p63;
|
342 |
+
selp.f32 %f106, %f103, %f105, %p64;
|
343 |
+
$L__tmp18:
|
344 |
+
.loc 2 49 29
|
345 |
+
mov.b32 %r47, %f106;
|
346 |
+
shfl.sync.bfly.b32 %r48, %r47, 2, 31, -1;
|
347 |
+
mov.b32 %f107, %r48;
|
348 |
+
$L__tmp19:
|
349 |
+
.loc 2 36 15
|
350 |
+
setp.gt.f32 %p65, %f106, %f107;
|
351 |
+
.loc 2 38 21
|
352 |
+
setp.nan.f32 %p66, %f106, %f106;
|
353 |
+
.loc 2 39 29
|
354 |
+
selp.f32 %f108, %f106, %f107, %p66;
|
355 |
+
selp.f32 %f109, %f106, %f108, %p65;
|
356 |
+
$L__tmp20:
|
357 |
+
.loc 2 49 29
|
358 |
+
mov.b32 %r49, %f109;
|
359 |
+
shfl.sync.bfly.b32 %r50, %r49, 1, 31, -1;
|
360 |
+
mov.b32 %f110, %r50;
|
361 |
+
$L__tmp21:
|
362 |
+
.loc 2 36 15
|
363 |
+
setp.gt.f32 %p67, %f109, %f110;
|
364 |
+
.loc 2 38 21
|
365 |
+
setp.nan.f32 %p68, %f109, %f109;
|
366 |
+
.loc 2 39 29
|
367 |
+
selp.f32 %f111, %f109, %f110, %p68;
|
368 |
+
selp.f32 %f112, %f109, %f111, %p67;
|
369 |
+
$L__tmp22:
|
370 |
+
.loc 2 49 29
|
371 |
+
and.b32 %r51, %r1, 7;
|
372 |
+
setp.eq.s32 %p69, %r51, 0;
|
373 |
+
and.pred %p89, %p36, %p69;
|
374 |
+
mov.b32 %r30, %f112;
|
375 |
+
@%p89 st.shared.b32 [ %r65 + 0 ], %r30;
|
376 |
+
bar.sync 0;
|
377 |
+
ld.shared.f32 %f17, [global_smem];
|
378 |
+
$L__tmp23:
|
379 |
+
.loc 1 36 41
|
380 |
+
bar.sync 0;
|
381 |
+
st.shared.f32 [global_smem], %f17;
|
382 |
+
bar.sync 0;
|
383 |
+
ld.shared.u32 %r31, [global_smem];
|
384 |
+
.loc 1 37 25
|
385 |
+
shl.b64 %rd46, %rd1, 2;
|
386 |
+
add.s64 %rd44, %rd15, %rd46;
|
387 |
+
.loc 1 37 36
|
388 |
+
setp.eq.s32 %p38, %r3, 0;
|
389 |
+
@%p38 st.global.b32 [ %rd44 + 0 ], { %r31 };
|
390 |
+
mov.f32 %f186, 0f00000000;
|
391 |
+
mov.u64 %rd74, 0;
|
392 |
+
mov.f32 %f187, %f186;
|
393 |
+
mov.f32 %f188, %f186;
|
394 |
+
mov.f32 %f189, %f186;
|
395 |
+
mov.f32 %f190, %f186;
|
396 |
+
mov.f32 %f191, %f186;
|
397 |
+
mov.f32 %f192, %f186;
|
398 |
+
mov.f32 %f193, %f186;
|
399 |
+
$L__BB0_3:
|
400 |
+
.loc 1 40 27
|
401 |
+
or.b64 %rd55, %rd74, %rd2;
|
402 |
+
or.b64 %rd56, %rd74, %rd3;
|
403 |
+
or.b64 %rd57, %rd74, %rd4;
|
404 |
+
or.b64 %rd58, %rd74, %rd5;
|
405 |
+
or.b64 %rd59, %rd74, %rd6;
|
406 |
+
or.b64 %rd60, %rd74, %rd7;
|
407 |
+
or.b64 %rd61, %rd74, %rd8;
|
408 |
+
or.b64 %rd62, %rd74, %rd9;
|
409 |
+
.loc 1 41 25
|
410 |
+
setp.lt.u64 %p85, %rd62, 50257;
|
411 |
+
setp.lt.u64 %p83, %rd61, 50257;
|
412 |
+
setp.lt.u64 %p81, %rd60, 50257;
|
413 |
+
setp.lt.u64 %p79, %rd59, 50257;
|
414 |
+
setp.lt.u64 %p77, %rd58, 50257;
|
415 |
+
setp.lt.u64 %p75, %rd57, 50257;
|
416 |
+
setp.lt.u64 %p73, %rd56, 50257;
|
417 |
+
setp.lt.u64 %p71, %rd55, 50257;
|
418 |
+
.loc 1 43 34
|
419 |
+
shl.b64 %rd63, %rd55, 1;
|
420 |
+
add.s64 %rd47, %rd10, %rd63;
|
421 |
+
shl.b64 %rd64, %rd56, 1;
|
422 |
+
add.s64 %rd48, %rd10, %rd64;
|
423 |
+
shl.b64 %rd65, %rd57, 1;
|
424 |
+
add.s64 %rd49, %rd10, %rd65;
|
425 |
+
shl.b64 %rd66, %rd58, 1;
|
426 |
+
add.s64 %rd50, %rd10, %rd66;
|
427 |
+
shl.b64 %rd67, %rd59, 1;
|
428 |
+
add.s64 %rd51, %rd10, %rd67;
|
429 |
+
shl.b64 %rd68, %rd60, 1;
|
430 |
+
add.s64 %rd52, %rd10, %rd68;
|
431 |
+
shl.b64 %rd69, %rd61, 1;
|
432 |
+
add.s64 %rd53, %rd10, %rd69;
|
433 |
+
shl.b64 %rd70, %rd62, 1;
|
434 |
+
add.s64 %rd54, %rd10, %rd70;
|
435 |
+
.loc 1 43 52
|
436 |
+
mov.u16 %rs25, 0x0;
|
437 |
+
@%p71 ld.global.L1::evict_first.b16 { %rs25 }, [ %rd47 + 0 ];
|
438 |
+
@!%p71 mov.u16 %rs25, %rs2;
|
439 |
+
mov.u16 %rs27, 0x0;
|
440 |
+
@%p73 ld.global.L1::evict_first.b16 { %rs27 }, [ %rd48 + 0 ];
|
441 |
+
@!%p73 mov.u16 %rs27, %rs2;
|
442 |
+
mov.u16 %rs29, 0x0;
|
443 |
+
@%p75 ld.global.L1::evict_first.b16 { %rs29 }, [ %rd49 + 0 ];
|
444 |
+
@!%p75 mov.u16 %rs29, %rs2;
|
445 |
+
mov.u16 %rs31, 0x0;
|
446 |
+
@%p77 ld.global.L1::evict_first.b16 { %rs31 }, [ %rd50 + 0 ];
|
447 |
+
@!%p77 mov.u16 %rs31, %rs2;
|
448 |
+
mov.u16 %rs33, 0x0;
|
449 |
+
@%p79 ld.global.L1::evict_first.b16 { %rs33 }, [ %rd51 + 0 ];
|
450 |
+
@!%p79 mov.u16 %rs33, %rs2;
|
451 |
+
mov.u16 %rs35, 0x0;
|
452 |
+
@%p81 ld.global.L1::evict_first.b16 { %rs35 }, [ %rd52 + 0 ];
|
453 |
+
@!%p81 mov.u16 %rs35, %rs2;
|
454 |
+
mov.u16 %rs37, 0x0;
|
455 |
+
@%p83 ld.global.L1::evict_first.b16 { %rs37 }, [ %rd53 + 0 ];
|
456 |
+
@!%p83 mov.u16 %rs37, %rs2;
|
457 |
+
mov.u16 %rs39, 0x0;
|
458 |
+
@%p85 ld.global.L1::evict_first.b16 { %rs39 }, [ %rd54 + 0 ];
|
459 |
+
@!%p85 mov.u16 %rs39, %rs2;
|
460 |
+
.loc 1 43 104
|
461 |
+
cvt.f32.bf16 %r52, %rs25;
|
462 |
+
mov.b32 %f129, %r52;
|
463 |
+
cvt.f32.bf16 %r53, %rs27;
|
464 |
+
mov.b32 %f130, %r53;
|
465 |
+
cvt.f32.bf16 %r54, %rs29;
|
466 |
+
mov.b32 %f131, %r54;
|
467 |
+
cvt.f32.bf16 %r55, %rs31;
|
468 |
+
mov.b32 %f132, %r55;
|
469 |
+
cvt.f32.bf16 %r56, %rs33;
|
470 |
+
mov.b32 %f133, %r56;
|
471 |
+
cvt.f32.bf16 %r57, %rs35;
|
472 |
+
mov.b32 %f134, %r57;
|
473 |
+
cvt.f32.bf16 %r58, %rs37;
|
474 |
+
mov.b32 %f135, %r58;
|
475 |
+
cvt.f32.bf16 %r59, %rs39;
|
476 |
+
mov.b32 %f136, %r59;
|
477 |
+
.loc 1 45 22
|
478 |
+
sub.f32 %f137, %f129, %f17;
|
479 |
+
sub.f32 %f138, %f130, %f17;
|
480 |
+
sub.f32 %f139, %f131, %f17;
|
481 |
+
sub.f32 %f140, %f132, %f17;
|
482 |
+
sub.f32 %f141, %f133, %f17;
|
483 |
+
sub.f32 %f142, %f134, %f17;
|
484 |
+
sub.f32 %f143, %f135, %f17;
|
485 |
+
sub.f32 %f144, %f136, %f17;
|
486 |
+
.loc 1 46 22
|
487 |
+
mul.f32 %f114, %f137, 0f3FB8AA3B;
|
488 |
+
ex2.approx.f32 %f113, %f114;
|
489 |
+
mul.f32 %f116, %f138, 0f3FB8AA3B;
|
490 |
+
ex2.approx.f32 %f115, %f116;
|
491 |
+
mul.f32 %f118, %f139, 0f3FB8AA3B;
|
492 |
+
ex2.approx.f32 %f117, %f118;
|
493 |
+
mul.f32 %f120, %f140, 0f3FB8AA3B;
|
494 |
+
ex2.approx.f32 %f119, %f120;
|
495 |
+
mul.f32 %f122, %f141, 0f3FB8AA3B;
|
496 |
+
ex2.approx.f32 %f121, %f122;
|
497 |
+
mul.f32 %f124, %f142, 0f3FB8AA3B;
|
498 |
+
ex2.approx.f32 %f123, %f124;
|
499 |
+
mul.f32 %f126, %f143, 0f3FB8AA3B;
|
500 |
+
ex2.approx.f32 %f125, %f126;
|
501 |
+
mul.f32 %f128, %f144, 0f3FB8AA3B;
|
502 |
+
ex2.approx.f32 %f127, %f128;
|
503 |
+
.loc 1 49 40
|
504 |
+
selp.f32 %f145, %f113, 0f80000000, %p71;
|
505 |
+
selp.f32 %f146, %f115, 0f80000000, %p73;
|
506 |
+
selp.f32 %f147, %f117, 0f80000000, %p75;
|
507 |
+
selp.f32 %f148, %f119, 0f80000000, %p77;
|
508 |
+
selp.f32 %f149, %f121, 0f80000000, %p79;
|
509 |
+
selp.f32 %f150, %f123, 0f80000000, %p81;
|
510 |
+
selp.f32 %f151, %f125, 0f80000000, %p83;
|
511 |
+
selp.f32 %f152, %f127, 0f80000000, %p85;
|
512 |
+
add.f32 %f193, %f193, %f152;
|
513 |
+
add.f32 %f192, %f192, %f151;
|
514 |
+
add.f32 %f191, %f191, %f150;
|
515 |
+
add.f32 %f190, %f190, %f149;
|
516 |
+
add.f32 %f189, %f189, %f148;
|
517 |
+
add.f32 %f188, %f188, %f147;
|
518 |
+
add.f32 %f187, %f187, %f146;
|
519 |
+
add.f32 %f186, %f186, %f145;
|
520 |
+
.loc 1 39 36
|
521 |
+
add.s64 %rd74, %rd74, 2048;
|
522 |
+
cvt.u32.u64 %r60, %rd74;
|
523 |
+
add.s32 %r61, %r60, -2048;
|
524 |
+
setp.lt.u32 %p86, %r61, 48209;
|
525 |
+
@%p86 bra $L__BB0_3;
|
526 |
+
$L__tmp24:
|
527 |
+
.loc 3 243 36
|
528 |
+
bar.sync 0;
|
529 |
+
$L__tmp25:
|
530 |
+
.loc 3 233 15
|
531 |
+
add.f32 %f153, %f186, %f187;
|
532 |
+
add.f32 %f154, %f188, %f153;
|
533 |
+
add.f32 %f155, %f189, %f154;
|
534 |
+
add.f32 %f156, %f190, %f155;
|
535 |
+
add.f32 %f157, %f191, %f156;
|
536 |
+
add.f32 %f158, %f192, %f157;
|
537 |
+
add.f32 %f159, %f193, %f158;
|
538 |
+
$L__tmp26:
|
539 |
+
.loc 3 243 36
|
540 |
+
mov.b32 %r69, %f159;
|
541 |
+
shfl.sync.bfly.b32 %r70, %r69, 16, 31, -1;
|
542 |
+
mov.b32 %f160, %r70;
|
543 |
+
$L__tmp27:
|
544 |
+
.loc 3 233 15
|
545 |
+
add.f32 %f161, %f159, %f160;
|
546 |
+
$L__tmp28:
|
547 |
+
.loc 3 243 36
|
548 |
+
mov.b32 %r71, %f161;
|
549 |
+
shfl.sync.bfly.b32 %r72, %r71, 8, 31, -1;
|
550 |
+
mov.b32 %f162, %r72;
|
551 |
+
$L__tmp29:
|
552 |
+
.loc 3 233 15
|
553 |
+
add.f32 %f163, %f161, %f162;
|
554 |
+
$L__tmp30:
|
555 |
+
.loc 3 243 36
|
556 |
+
mov.b32 %r73, %f163;
|
557 |
+
shfl.sync.bfly.b32 %r74, %r73, 4, 31, -1;
|
558 |
+
mov.b32 %f164, %r74;
|
559 |
+
$L__tmp31:
|
560 |
+
.loc 3 233 15
|
561 |
+
add.f32 %f165, %f163, %f164;
|
562 |
+
$L__tmp32:
|
563 |
+
.loc 3 243 36
|
564 |
+
mov.b32 %r75, %f165;
|
565 |
+
shfl.sync.bfly.b32 %r76, %r75, 2, 31, -1;
|
566 |
+
mov.b32 %f166, %r76;
|
567 |
+
$L__tmp33:
|
568 |
+
.loc 3 233 15
|
569 |
+
add.f32 %f167, %f165, %f166;
|
570 |
+
$L__tmp34:
|
571 |
+
.loc 3 243 36
|
572 |
+
mov.b32 %r77, %f167;
|
573 |
+
shfl.sync.bfly.b32 %r78, %r77, 1, 31, -1;
|
574 |
+
mov.b32 %f168, %r78;
|
575 |
+
$L__tmp35:
|
576 |
+
.loc 3 233 15
|
577 |
+
add.f32 %f169, %f167, %f168;
|
578 |
+
$L__tmp36:
|
579 |
+
.loc 3 243 36
|
580 |
+
mov.b32 %r63, %f169;
|
581 |
+
@%p35 st.shared.b32 [ %r62 + 0 ], %r63;
|
582 |
+
bar.sync 0;
|
583 |
+
@%p36 ld.shared.b32 %r64, [ %r65 + 0 ];
|
584 |
+
mov.b32 %f170, %r64;
|
585 |
+
shfl.sync.bfly.b32 %r79, %r64, 4, 31, -1;
|
586 |
+
mov.b32 %f171, %r79;
|
587 |
+
$L__tmp37:
|
588 |
+
.loc 3 233 15
|
589 |
+
add.f32 %f172, %f170, %f171;
|
590 |
+
$L__tmp38:
|
591 |
+
.loc 3 243 36
|
592 |
+
mov.b32 %r80, %f172;
|
593 |
+
shfl.sync.bfly.b32 %r81, %r80, 2, 31, -1;
|
594 |
+
mov.b32 %f173, %r81;
|
595 |
+
$L__tmp39:
|
596 |
+
.loc 3 233 15
|
597 |
+
add.f32 %f174, %f172, %f173;
|
598 |
+
$L__tmp40:
|
599 |
+
.loc 3 243 36
|
600 |
+
mov.b32 %r82, %f174;
|
601 |
+
shfl.sync.bfly.b32 %r83, %r82, 1, 31, -1;
|
602 |
+
mov.b32 %f175, %r83;
|
603 |
+
$L__tmp41:
|
604 |
+
.loc 3 233 15
|
605 |
+
add.f32 %f176, %f174, %f175;
|
606 |
+
$L__tmp42:
|
607 |
+
.loc 3 243 36
|
608 |
+
mov.b32 %r67, %f176;
|
609 |
+
@%p89 st.shared.b32 [ %r65 + 0 ], %r67;
|
610 |
+
bar.sync 0;
|
611 |
+
ld.shared.f32 %f177, [global_smem];
|
612 |
+
$L__tmp43:
|
613 |
+
.loc 1 50 30
|
614 |
+
bar.sync 0;
|
615 |
+
st.shared.f32 [global_smem], %f177;
|
616 |
+
bar.sync 0;
|
617 |
+
ld.shared.u32 %r68, [global_smem];
|
618 |
+
.loc 1 51 25
|
619 |
+
add.s64 %rd71, %rd16, %rd46;
|
620 |
+
.loc 1 51 37
|
621 |
+
@%p38 st.global.b32 [ %rd71 + 0 ], { %r68 };
|
622 |
+
.loc 1 51 4
|
623 |
+
ret;
|
624 |
+
$L__tmp44:
|
625 |
+
$L__func_end0:
|
626 |
+
|
627 |
+
}
|
628 |
+
.file 1 "/tmp/torchinductor_root/cy/ccyhhqogjmaiuaq7b54att75rswph7r3hvxgfmkjyupj74n77r6i.py"
|
629 |
+
.file 2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
|
630 |
+
.file 3 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
|
631 |
+
.section .debug_abbrev
|
632 |
+
{
|
633 |
+
.b8 1
|
634 |
+
.b8 17
|
635 |
+
.b8 1
|
636 |
+
.b8 37
|
637 |
+
.b8 8
|
638 |
+
.b8 19
|
639 |
+
.b8 5
|
640 |
+
.b8 3
|
641 |
+
.b8 8
|
642 |
+
.b8 16
|
643 |
+
.b8 6
|
644 |
+
.b8 27
|
645 |
+
.b8 8
|
646 |
+
.b8 180
|
647 |
+
.b8 66
|
648 |
+
.b8 12
|
649 |
+
.b8 17
|
650 |
+
.b8 1
|
651 |
+
.b8 18
|
652 |
+
.b8 1
|
653 |
+
.b8 0
|
654 |
+
.b8 0
|
655 |
+
.b8 2
|
656 |
+
.b8 46
|
657 |
+
.b8 0
|
658 |
+
.b8 135
|
659 |
+
.b8 64
|
660 |
+
.b8 8
|
661 |
+
.b8 3
|
662 |
+
.b8 8
|
663 |
+
.b8 58
|
664 |
+
.b8 11
|
665 |
+
.b8 59
|
666 |
+
.b8 11
|
667 |
+
.b8 63
|
668 |
+
.b8 12
|
669 |
+
.b8 32
|
670 |
+
.b8 11
|
671 |
+
.b8 0
|
672 |
+
.b8 0
|
673 |
+
.b8 3
|
674 |
+
.b8 46
|
675 |
+
.b8 1
|
676 |
+
.b8 17
|
677 |
+
.b8 1
|
678 |
+
.b8 18
|
679 |
+
.b8 1
|
680 |
+
.b8 64
|
681 |
+
.b8 10
|
682 |
+
.b8 49
|
683 |
+
.b8 19
|
684 |
+
.b8 0
|
685 |
+
.b8 0
|
686 |
+
.b8 4
|
687 |
+
.b8 29
|
688 |
+
.b8 0
|
689 |
+
.b8 49
|
690 |
+
.b8 19
|
691 |
+
.b8 17
|
692 |
+
.b8 1
|
693 |
+
.b8 18
|
694 |
+
.b8 1
|
695 |
+
.b8 88
|
696 |
+
.b8 11
|
697 |
+
.b8 89
|
698 |
+
.b8 11
|
699 |
+
.b8 87
|
700 |
+
.b8 11
|
701 |
+
.b8 0
|
702 |
+
.b8 0
|
703 |
+
.b8 5
|
704 |
+
.b8 29
|
705 |
+
.b8 1
|
706 |
+
.b8 49
|
707 |
+
.b8 19
|
708 |
+
.b8 17
|
709 |
+
.b8 1
|
710 |
+
.b8 18
|
711 |
+
.b8 1
|
712 |
+
.b8 88
|
713 |
+
.b8 11
|
714 |
+
.b8 89
|
715 |
+
.b8 11
|
716 |
+
.b8 87
|
717 |
+
.b8 11
|
718 |
+
.b8 0
|
719 |
+
.b8 0
|
720 |
+
.b8 0
|
721 |
+
}
|
722 |
+
.section .debug_info
|
723 |
+
{
|
724 |
+
.b32 359
|
725 |
+
.b8 2
|
726 |
+
.b8 0
|
727 |
+
.b32 .debug_abbrev
|
728 |
+
.b8 8
|
729 |
+
.b8 1
|
730 |
+
.b8 116
|
731 |
+
.b8 114
|
732 |
+
.b8 105
|
733 |
+
.b8 116
|
734 |
+
.b8 111
|
735 |
+
.b8 110
|
736 |
+
.b8 0
|
737 |
+
.b8 2
|
738 |
+
.b8 0
|
739 |
+
.b8 99
|
740 |
+
.b8 99
|
741 |
+
.b8 121
|
742 |
+
.b8 104
|
743 |
+
.b8 104
|
744 |
+
.b8 113
|
745 |
+
.b8 111
|
746 |
+
.b8 103
|
747 |
+
.b8 106
|
748 |
+
.b8 109
|
749 |
+
.b8 97
|
750 |
+
.b8 105
|
751 |
+
.b8 117
|
752 |
+
.b8 97
|
753 |
+
.b8 113
|
754 |
+
.b8 55
|
755 |
+
.b8 98
|
756 |
+
.b8 53
|
757 |
+
.b8 52
|
758 |
+
.b8 97
|
759 |
+
.b8 116
|
760 |
+
.b8 116
|
761 |
+
.b8 55
|
762 |
+
.b8 53
|
763 |
+
.b8 114
|
764 |
+
.b8 115
|
765 |
+
.b8 119
|
766 |
+
.b8 112
|
767 |
+
.b8 104
|
768 |
+
.b8 55
|
769 |
+
.b8 114
|
770 |
+
.b8 51
|
771 |
+
.b8 104
|
772 |
+
.b8 118
|
773 |
+
.b8 120
|
774 |
+
.b8 103
|
775 |
+
.b8 102
|
776 |
+
.b8 109
|
777 |
+
.b8 107
|
778 |
+
.b8 106
|
779 |
+
.b8 121
|
780 |
+
.b8 117
|
781 |
+
.b8 112
|
782 |
+
.b8 106
|
783 |
+
.b8 55
|
784 |
+
.b8 52
|
785 |
+
.b8 110
|
786 |
+
.b8 55
|
787 |
+
.b8 55
|
788 |
+
.b8 114
|
789 |
+
.b8 54
|
790 |
+
.b8 105
|
791 |
+
.b8 46
|
792 |
+
.b8 112
|
793 |
+
.b8 121
|
794 |
+
.b8 0
|
795 |
+
.b32 .debug_line
|
796 |
+
.b8 47
|
797 |
+
.b8 116
|
798 |
+
.b8 109
|
799 |
+
.b8 112
|
800 |
+
.b8 47
|
801 |
+
.b8 116
|
802 |
+
.b8 111
|
803 |
+
.b8 114
|
804 |
+
.b8 99
|
805 |
+
.b8 104
|
806 |
+
.b8 105
|
807 |
+
.b8 110
|
808 |
+
.b8 100
|
809 |
+
.b8 117
|
810 |
+
.b8 99
|
811 |
+
.b8 116
|
812 |
+
.b8 111
|
813 |
+
.b8 114
|
814 |
+
.b8 95
|
815 |
+
.b8 114
|
816 |
+
.b8 111
|
817 |
+
.b8 111
|
818 |
+
.b8 116
|
819 |
+
.b8 47
|
820 |
+
.b8 99
|
821 |
+
.b8 121
|
822 |
+
.b8 0
|
823 |
+
.b8 1
|
824 |
+
.b64 $L__func_begin0
|
825 |
+
.b64 $L__func_end0
|
826 |
+
.b8 2
|
827 |
+
.b8 116
|
828 |
+
.b8 114
|
829 |
+
.b8 105
|
830 |
+
.b8 116
|
831 |
+
.b8 111
|
832 |
+
.b8 110
|
833 |
+
.b8 95
|
834 |
+
.b8 95
|
835 |
+
.b8 48
|
836 |
+
.b8 100
|
837 |
+
.b8 49
|
838 |
+
.b8 100
|
839 |
+
.b8 50
|
840 |
+
.b8 100
|
841 |
+
.b8 51
|
842 |
+
.b8 100
|
843 |
+
.b8 101
|
844 |
+
.b8 52
|
845 |
+
.b8 0
|
846 |
+
.b8 116
|
847 |
+
.b8 114
|
848 |
+
.b8 105
|
849 |
+
.b8 116
|
850 |
+
.b8 111
|
851 |
+
.b8 110
|
852 |
+
.b8 95
|
853 |
+
.b8 95
|
854 |
+
.b8 48
|
855 |
+
.b8 100
|
856 |
+
.b8 49
|
857 |
+
.b8 100
|
858 |
+
.b8 50
|
859 |
+
.b8 100
|
860 |
+
.b8 51
|
861 |
+
.b8 100
|
862 |
+
.b8 101
|
863 |
+
.b8 52
|
864 |
+
.b8 0
|
865 |
+
.b8 1
|
866 |
+
.b8 18
|
867 |
+
.b8 1
|
868 |
+
.b8 1
|
869 |
+
.b8 3
|
870 |
+
.b64 $L__func_begin0
|
871 |
+
.b64 $L__func_end0
|
872 |
+
.b8 1
|
873 |
+
.b8 156
|
874 |
+
.b32 125
|
875 |
+
.b8 4
|
876 |
+
.b32 125
|
877 |
+
.b64 $L__tmp1
|
878 |
+
.b64 $L__tmp4
|
879 |
+
.b8 2
|
880 |
+
.b8 34
|
881 |
+
.b8 45
|
882 |
+
.b8 5
|
883 |
+
.b32 125
|
884 |
+
.b64 $L__tmp5
|
885 |
+
.b64 $L__tmp22
|
886 |
+
.b8 2
|
887 |
+
.b8 36
|
888 |
+
.b8 38
|
889 |
+
.b8 4
|
890 |
+
.b32 125
|
891 |
+
.b64 $L__tmp5
|
892 |
+
.b64 $L__tmp22
|
893 |
+
.b8 2
|
894 |
+
.b8 49
|
895 |
+
.b8 29
|
896 |
+
.b8 0
|
897 |
+
.b8 4
|
898 |
+
.b32 125
|
899 |
+
.b64 $L__tmp6
|
900 |
+
.b64 $L__tmp23
|
901 |
+
.b8 2
|
902 |
+
.b8 36
|
903 |
+
.b8 38
|
904 |
+
.b8 4
|
905 |
+
.b32 125
|
906 |
+
.b64 $L__tmp24
|
907 |
+
.b64 $L__tmp43
|
908 |
+
.b8 3
|
909 |
+
.b8 50
|
910 |
+
.b8 27
|
911 |
+
.b8 5
|
912 |
+
.b32 125
|
913 |
+
.b64 $L__tmp25
|
914 |
+
.b64 $L__tmp42
|
915 |
+
.b8 3
|
916 |
+
.b8 50
|
917 |
+
.b8 27
|
918 |
+
.b8 4
|
919 |
+
.b32 125
|
920 |
+
.b64 $L__tmp25
|
921 |
+
.b64 $L__tmp42
|
922 |
+
.b8 3
|
923 |
+
.b8 243
|
924 |
+
.b8 36
|
925 |
+
.b8 0
|
926 |
+
.b8 0
|
927 |
+
.b8 0
|
928 |
+
}
|
929 |
+
.section .debug_pubnames
|
930 |
+
{
|
931 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
932 |
+
$L__pubNames_start0:
|
933 |
+
.b8 2
|
934 |
+
.b8 0
|
935 |
+
.b32 .debug_info
|
936 |
+
.b32 363
|
937 |
+
.b32 125
|
938 |
+
.b8 116
|
939 |
+
.b8 114
|
940 |
+
.b8 105
|
941 |
+
.b8 116
|
942 |
+
.b8 111
|
943 |
+
.b8 110
|
944 |
+
.b8 95
|
945 |
+
.b8 95
|
946 |
+
.b8 48
|
947 |
+
.b8 100
|
948 |
+
.b8 49
|
949 |
+
.b8 100
|
950 |
+
.b8 50
|
951 |
+
.b8 100
|
952 |
+
.b8 51
|
953 |
+
.b8 100
|
954 |
+
.b8 101
|
955 |
+
.b8 52
|
956 |
+
.b8 0
|
957 |
+
.b32 0
|
958 |
+
$L__pubNames_end0:
|
959 |
+
}
|
960 |
+
.section .debug_pubtypes
|
961 |
+
{
|
962 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
963 |
+
$L__pubTypes_start0:
|
964 |
+
.b8 2
|
965 |
+
.b8 0
|
966 |
+
.b32 .debug_info
|
967 |
+
.b32 363
|
968 |
+
.b32 0
|
969 |
+
$L__pubTypes_end0:
|
970 |
+
}
|
971 |
+
.section .debug_loc { }
|
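
Note on the PTX above: the block-wide max and sum reductions are implemented with `shfl.sync.bfly.b32` at offsets 16, 8, 4, 2, 1 inside each warp, followed by a shared-memory exchange across warps. Below is a minimal NumPy sketch of the intra-warp butterfly step only, for illustration; the lane count and offsets are taken from the PTX, and the NaN handling done there with `setp.nan`/`selp` is omitted.

```python
import numpy as np

def warp_butterfly_max(vals):
    # Each of the 32 lanes XORs its lane id with 16, 8, 4, 2, 1 and keeps the
    # larger of its own value and its partner's (the setp.gt/selp pair in the
    # PTX), so after five steps every lane holds the warp-wide maximum.
    lanes = np.asarray(vals, dtype=np.float32).copy()
    assert lanes.size == 32
    for offset in (16, 8, 4, 2, 1):              # same offsets as shfl.sync.bfly.b32
        partner = lanes[np.arange(32) ^ offset]  # value read from the paired lane
        lanes = np.maximum(lanes, partner)
    return lanes

x = np.random.randn(32).astype(np.float32)
assert np.all(warp_butterfly_max(x) == x.max())
```
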
.triton/dump/fac03406d1136fc802dac111a1efea36/triton_.ttir
ADDED
@@ -0,0 +1,79 @@
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1d2d3de4(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i64) attributes {noinline = false} {
|
3 |
+
%c50257_i64 = arith.constant 50257 : i64
|
4 |
+
%cst = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16>
|
5 |
+
%cst_0 = arith.constant dense<true> : tensor<1x2048xi1>
|
6 |
+
%c50257_i32 = arith.constant 50257 : i32
|
7 |
+
%c2048_i32 = arith.constant 2048 : i32
|
8 |
+
%c0_i32 = arith.constant 0 : i32
|
9 |
+
%cst_1 = arith.constant dense<50257> : tensor<1x2048xi64>
|
10 |
+
%cst_2 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32>
|
11 |
+
%cst_3 = arith.constant dense<0xFF800000> : tensor<1x2048xf32>
|
12 |
+
%0 = tt.get_program_id x : i32
|
13 |
+
%1 = arith.extsi %0 : i32 to i64
|
14 |
+
%2 = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32>
|
15 |
+
%3 = tt.expand_dims %2 {axis = 0 : i32} : (tensor<2048xi32>) -> tensor<1x2048xi32>
|
16 |
+
%4 = arith.extsi %3 : tensor<1x2048xi32> to tensor<1x2048xi64>
|
17 |
+
%5 = arith.muli %1, %c50257_i64 : i64
|
18 |
+
%6 = tt.splat %5 : (i64) -> tensor<1x2048xi64>
|
19 |
+
%7 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1x2048x!tt.ptr<bf16, 1>>
|
20 |
+
%8 = scf.for %arg5 = %c0_i32 to %c50257_i32 step %c2048_i32 iter_args(%arg6 = %cst_3) -> (tensor<1x2048xf32>) : i32 {
|
21 |
+
%22 = arith.extsi %arg5 : i32 to i64
|
22 |
+
%23 = tt.splat %22 : (i64) -> tensor<1x2048xi64>
|
23 |
+
%24 = arith.addi %23, %4 : tensor<1x2048xi64>
|
24 |
+
%25 = arith.cmpi slt, %24, %cst_1 : tensor<1x2048xi64>
|
25 |
+
%26 = arith.addi %24, %6 : tensor<1x2048xi64>
|
26 |
+
%27 = tt.addptr %7, %26 : tensor<1x2048x!tt.ptr<bf16, 1>>, tensor<1x2048xi64>
|
27 |
+
%28 = tt.load %27, %25, %cst {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x2048xbf16>
|
28 |
+
%29 = arith.extf %28 : tensor<1x2048xbf16> to tensor<1x2048xf32>
|
29 |
+
%30 = arith.cmpf ogt, %arg6, %29 : tensor<1x2048xf32>
|
30 |
+
%31 = arith.cmpf une, %arg6, %arg6 : tensor<1x2048xf32>
|
31 |
+
%32 = arith.ori %30, %31 : tensor<1x2048xi1>
|
32 |
+
%33 = arith.xori %32, %cst_0 : tensor<1x2048xi1>
|
33 |
+
%34 = arith.andi %25, %33 : tensor<1x2048xi1>
|
34 |
+
%35 = arith.select %34, %29, %arg6 : tensor<1x2048xi1>, tensor<1x2048xf32>
|
35 |
+
scf.yield %35 : tensor<1x2048xf32>
|
36 |
+
}
|
37 |
+
%9 = "tt.reduce"(%8) <{axis = 1 : i32}> ({
|
38 |
+
^bb0(%arg5: f32, %arg6: f32):
|
39 |
+
%22 = arith.cmpf ogt, %arg5, %arg6 : f32
|
40 |
+
%23 = arith.cmpf une, %arg5, %arg5 : f32
|
41 |
+
%24 = arith.ori %22, %23 : i1
|
42 |
+
%25 = arith.select %24, %arg5, %arg6 : f32
|
43 |
+
tt.reduce.return %25 : f32
|
44 |
+
}) : (tensor<1x2048xf32>) -> tensor<1xf32>
|
45 |
+
%10 = tt.expand_dims %9 {axis = 1 : i32} : (tensor<1xf32>) -> tensor<1x1xf32>
|
46 |
+
%11 = tt.addptr %arg1, %1 : !tt.ptr<f32, 1>, i64
|
47 |
+
%12 = tt.splat %11 : (!tt.ptr<f32, 1>) -> tensor<1x1x!tt.ptr<f32, 1>>
|
48 |
+
tt.store %12, %10 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xf32>
|
49 |
+
%13 = arith.muli %1, %c50257_i64 : i64
|
50 |
+
%14 = tt.splat %13 : (i64) -> tensor<1x2048xi64>
|
51 |
+
%15 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1x2048x!tt.ptr<bf16, 1>>
|
52 |
+
%16 = tt.broadcast %10 : (tensor<1x1xf32>) -> tensor<1x2048xf32>
|
53 |
+
%17 = scf.for %arg5 = %c0_i32 to %c50257_i32 step %c2048_i32 iter_args(%arg6 = %cst_2) -> (tensor<1x2048xf32>) : i32 {
|
54 |
+
%22 = arith.extsi %arg5 : i32 to i64
|
55 |
+
%23 = tt.splat %22 : (i64) -> tensor<1x2048xi64>
|
56 |
+
%24 = arith.addi %23, %4 : tensor<1x2048xi64>
|
57 |
+
%25 = arith.cmpi slt, %24, %cst_1 : tensor<1x2048xi64>
|
58 |
+
%26 = arith.addi %24, %14 : tensor<1x2048xi64>
|
59 |
+
%27 = tt.addptr %15, %26 : tensor<1x2048x!tt.ptr<bf16, 1>>, tensor<1x2048xi64>
|
60 |
+
%28 = tt.load %27, %25, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x2048xbf16>
|
61 |
+
%29 = arith.extf %28 : tensor<1x2048xbf16> to tensor<1x2048xf32>
|
62 |
+
%30 = arith.subf %29, %16 : tensor<1x2048xf32>
|
63 |
+
%31 = math.exp %30 : tensor<1x2048xf32>
|
64 |
+
%32 = arith.addf %arg6, %31 : tensor<1x2048xf32>
|
65 |
+
%33 = arith.select %25, %32, %arg6 : tensor<1x2048xi1>, tensor<1x2048xf32>
|
66 |
+
scf.yield %33 : tensor<1x2048xf32>
|
67 |
+
}
|
68 |
+
%18 = "tt.reduce"(%17) <{axis = 1 : i32}> ({
|
69 |
+
^bb0(%arg5: f32, %arg6: f32):
|
70 |
+
%22 = arith.addf %arg5, %arg6 : f32
|
71 |
+
tt.reduce.return %22 : f32
|
72 |
+
}) : (tensor<1x2048xf32>) -> tensor<1xf32>
|
73 |
+
%19 = tt.expand_dims %18 {axis = 1 : i32} : (tensor<1xf32>) -> tensor<1x1xf32>
|
74 |
+
%20 = tt.addptr %arg2, %1 : !tt.ptr<f32, 1>, i64
|
75 |
+
%21 = tt.splat %20 : (!tt.ptr<f32, 1>) -> tensor<1x1x!tt.ptr<f32, 1>>
|
76 |
+
tt.store %21, %19 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xf32>
|
77 |
+
tt.return
|
78 |
+
}
|
79 |
+
}
|
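
Note on the TTIR above: it runs two blocked passes over a 50257-wide row, 2048 columns per step. The first loop keeps a running maximum and reduces it to a scalar stored through `%arg1`; the second accumulates `exp(x - max)` under the same bounds mask and stores the sum through `%arg2` — the ingredients of a numerically stable softmax. The following is a hypothetical Triton source of the same shape (illustrative names, not the original torchinductor-generated kernel):

```python
import triton
import triton.language as tl

@triton.jit
def row_max_and_sumexp(in_ptr, max_ptr, sumexp_ptr, n_cols, BLOCK: tl.constexpr):
    # One program per row: pass 1 computes the row max, pass 2 the sum of
    # exp(x - max), mirroring the two scf.for loops and tt.reduce ops above.
    row = tl.program_id(0)
    cols = tl.arange(0, BLOCK)
    running_max = tl.full([BLOCK], float("-inf"), tl.float32)
    for start in range(0, n_cols, BLOCK):
        offs = start + cols
        mask = offs < n_cols
        x = tl.load(in_ptr + row * n_cols + offs, mask=mask, other=0.0).to(tl.float32)
        running_max = tl.where(mask & (x > running_max), x, running_max)
    m = tl.max(running_max, axis=0)
    tl.store(max_ptr + row, m)
    acc = tl.zeros([BLOCK], tl.float32)
    for start in range(0, n_cols, BLOCK):
        offs = start + cols
        mask = offs < n_cols
        x = tl.load(in_ptr + row * n_cols + offs, mask=mask, other=0.0).to(tl.float32)
        acc += tl.where(mask, tl.exp(x - m), 0.0)
    tl.store(sumexp_ptr + row, tl.sum(acc, axis=0))
```
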