0-hero commited on
Commit
0def249
·
verified ·
1 Parent(s): d572127

Add files using upload-large-folder tool

Browse files
.triton/dump/1e922bbbab749da355e4bad9c6b245e6/triton_.ptx ADDED
@@ -0,0 +1,446 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1de
10
+ .global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
11
+
12
+ .visible .entry triton__0d1de(
13
+ .param .u64 triton__0d1de_param_0,
14
+ .param .u32 triton__0d1de_param_1
15
+ )
16
+ .maxntid 256, 1, 1
17
+ {
18
+ .reg .pred %p<9>;
19
+ .reg .b16 %rs<5>;
20
+ .reg .b32 %r<22>;
21
+ .reg .f32 %f<113>;
22
+ .reg .b64 %rd<6>;
23
+ .loc 1 18 0
24
+ $L__func_begin0:
25
+ .loc 1 18 0
26
+
27
+ ld.param.u64 %rd3, [triton__0d1de_param_0];
28
+ $L__tmp0:
29
+ .loc 1 21 36
30
+ mov.u32 %r5, %tid.x;
31
+ shl.b32 %r6, %r5, 1;
32
+ and.b32 %r7, %r6, 510;
33
+ .loc 1 20 28
34
+ mov.u32 %r1, %ctaid.x;
35
+ .loc 1 20 33
36
+ shl.b32 %r8, %r1, 9;
37
+ .loc 1 21 23
38
+ or.b32 %r9, %r8, %r7;
39
+ .loc 1 24 34
40
+ mul.wide.s32 %rd4, %r9, 2;
41
+ add.s64 %rd5, %rd3, %rd4;
42
+ mov.pred %p1, -1;
43
+ .loc 1 24 39
44
+ mov.u32 %r2, 0x0;
45
+ @%p1 ld.global.b32 { %r2 }, [ %rd5 + 0 ];
46
+ cvt.u16.u32 %rs1, %r2;
47
+ { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; }
48
+ .loc 1 24 48
49
+ cvt.f32.bf16 %r3, %rs1;
50
+ mov.b32 %f1, %r3;
51
+ cvt.f32.bf16 %r4, %rs2;
52
+ mov.b32 %f2, %r4;
53
+ .loc 1 29 18
54
+ mul.f32 %f3, %f1, 0f3F3504F3;
55
+ .loc 1 30 23
56
+ abs.ftz.f32 %f5, %f3;
57
+ setp.ge.f32 %p2, %f5, 0f3F8060FE;
58
+ mov.f32 %f101, 0f3789CA3C;
59
+ mov.f32 %f100, 0fB9F560B9;
60
+ mov.f32 %f99, 0f3BAC840B;
61
+ mov.f32 %f98, 0fBD0C8162;
62
+ mov.f32 %f97, 0f3E1CF906;
63
+ mov.f32 %f96, 0f3F6A937E;
64
+ mov.f32 %f95, 0f3F20D842;
65
+ mov.f32 %f102, %f5;
66
+ @%p2 bra $L__BB0_2;
67
+ .loc 1 0 23
68
+ mov.f32 %f101, 0f38B1E96A;
69
+ mov.f32 %f100, 0fBA574D20;
70
+ mov.f32 %f99, 0f3BAAD5EA;
71
+ mov.f32 %f98, 0fBCDC1BE7;
72
+ mov.f32 %f97, 0f3DE718AF;
73
+ mov.f32 %f96, 0fBEC093AC;
74
+ mov.f32 %f95, 0f3E0375D3;
75
+ .loc 1 30 23
76
+ mul.f32 %f102, %f3, %f3;
77
+ $L__BB0_2:
78
+ .loc 1 0 0
79
+ mul.f32 %f4, %f2, 0f3F3504F3;
80
+ .loc 1 30 23
81
+ setp.ltu.f32 %p3, %f5, 0f3F8060FE;
82
+ fma.rn.ftz.f32 %f45, %f101, %f102, %f100;
83
+ fma.rn.ftz.f32 %f46, %f45, %f102, %f99;
84
+ fma.rn.ftz.f32 %f47, %f46, %f102, %f98;
85
+ fma.rn.ftz.f32 %f48, %f47, %f102, %f97;
86
+ fma.rn.ftz.f32 %f49, %f48, %f102, %f96;
87
+ fma.rn.ftz.f32 %f50, %f49, %f102, %f95;
88
+ neg.f32 %f51, %f102;
89
+ selp.f32 %f52, %f51, %f3, %p2;
90
+ fma.rn.ftz.f32 %f103, %f50, %f52, %f52;
91
+ mov.f32 %f94, 0f3F800000;
92
+ @%p3 bra $L__BB0_4;
93
+ ex2.approx.ftz.f32 %f53, %f103;
94
+ sub.f32 %f55, %f94, %f53;
95
+ mov.b32 %r10, %f55;
96
+ mov.b32 %r11, %f3;
97
+ and.b32 %r12, %r11, -2147483648;
98
+ or.b32 %r13, %r12, %r10;
99
+ mov.b32 %f103, %r13;
100
+ $L__BB0_4:
101
+ abs.ftz.f32 %f18, %f4;
102
+ setp.ge.f32 %p5, %f18, 0f3F8060FE;
103
+ mov.f32 %f110, 0f3789CA3C;
104
+ mov.f32 %f109, 0fB9F560B9;
105
+ mov.f32 %f108, 0f3BAC840B;
106
+ mov.f32 %f107, 0fBD0C8162;
107
+ mov.f32 %f106, 0f3E1CF906;
108
+ mov.f32 %f105, 0f3F6A937E;
109
+ mov.f32 %f104, 0f3F20D842;
110
+ mov.f32 %f111, %f18;
111
+ @%p5 bra $L__BB0_6;
112
+ mul.f32 %f111, %f4, %f4;
113
+ mov.f32 %f110, 0f38B1E96A;
114
+ mov.f32 %f109, 0fBA574D20;
115
+ mov.f32 %f108, 0f3BAAD5EA;
116
+ mov.f32 %f107, 0fBCDC1BE7;
117
+ mov.f32 %f106, 0f3DE718AF;
118
+ mov.f32 %f105, 0fBEC093AC;
119
+ mov.f32 %f104, 0f3E0375D3;
120
+ $L__BB0_6:
121
+ setp.ltu.f32 %p6, %f18, 0f3F8060FE;
122
+ fma.rn.ftz.f32 %f70, %f110, %f111, %f109;
123
+ fma.rn.ftz.f32 %f71, %f70, %f111, %f108;
124
+ fma.rn.ftz.f32 %f72, %f71, %f111, %f107;
125
+ fma.rn.ftz.f32 %f73, %f72, %f111, %f106;
126
+ fma.rn.ftz.f32 %f74, %f73, %f111, %f105;
127
+ fma.rn.ftz.f32 %f75, %f74, %f111, %f104;
128
+ neg.f32 %f76, %f111;
129
+ selp.f32 %f77, %f76, %f4, %p5;
130
+ fma.rn.ftz.f32 %f112, %f75, %f77, %f77;
131
+ @%p6 bra $L__BB0_8;
132
+ ex2.approx.ftz.f32 %f78, %f112;
133
+ sub.f32 %f80, %f94, %f78;
134
+ mov.b32 %r14, %f80;
135
+ mov.b32 %r15, %f4;
136
+ and.b32 %r16, %r15, -2147483648;
137
+ or.b32 %r17, %r16, %r14;
138
+ mov.b32 %f112, %r17;
139
+ $L__BB0_8:
140
+ .loc 1 27 18
141
+ mul.f32 %f81, %f2, 0f3F000000;
142
+ mul.f32 %f82, %f1, 0f3F000000;
143
+ .loc 1 32 18
144
+ add.f32 %f83, %f103, 0f3F800000;
145
+ add.f32 %f84, %f112, 0f3F800000;
146
+ .loc 1 33 18
147
+ mul.f32 %f85, %f82, %f83;
148
+ mul.f32 %f86, %f81, %f84;
149
+ .loc 1 35 40
150
+ mov.b32 %r18, %f85;
151
+ cvt.rn.bf16.f32 %rs3, %r18;
152
+ mov.b32 %r19, %f86;
153
+ cvt.rn.bf16.f32 %rs4, %r19;
154
+ mov.b32 %r21, {%rs3, %rs4};
155
+ @%p1 st.global.b32 [ %rd5 + 0 ], { %r21 };
156
+ .loc 1 35 4
157
+ ret;
158
+ $L__tmp1:
159
+ $L__func_end0:
160
+
161
+ }
162
+ // .globl __nv_erff
163
+ .visible .func (.param .b32 func_retval0) __nv_erff(
164
+ .param .b32 __nv_erff_param_0
165
+ )
166
+ {
167
+ .reg .pred %p<4>;
168
+ .reg .b32 %r<5>;
169
+ .reg .f32 %f<49>;
170
+ $L__func_begin1:
171
+
172
+ ld.param.f32 %f14, [__nv_erff_param_0];
173
+ abs.ftz.f32 %f1, %f14;
174
+ setp.ge.f32 %p1, %f1, 0f3F8060FE;
175
+ mov.f32 %f46, 0f3789CA3C;
176
+ mov.f32 %f45, 0fB9F560B9;
177
+ mov.f32 %f44, 0f3BAC840B;
178
+ mov.f32 %f43, 0fBD0C8162;
179
+ mov.f32 %f42, 0f3E1CF906;
180
+ mov.f32 %f41, 0f3F6A937E;
181
+ mov.f32 %f40, 0f3F20D842;
182
+ mov.f32 %f47, %f1;
183
+ @%p1 bra $L__BB1_2;
184
+ mul.f32 %f47, %f14, %f14;
185
+ mov.f32 %f46, 0f38B1E96A;
186
+ mov.f32 %f45, 0fBA574D20;
187
+ mov.f32 %f44, 0f3BAAD5EA;
188
+ mov.f32 %f43, 0fBCDC1BE7;
189
+ mov.f32 %f42, 0f3DE718AF;
190
+ mov.f32 %f41, 0fBEC093AC;
191
+ mov.f32 %f40, 0f3E0375D3;
192
+ $L__BB1_2:
193
+ setp.ltu.f32 %p2, %f1, 0f3F8060FE;
194
+ fma.rn.ftz.f32 %f29, %f46, %f47, %f45;
195
+ fma.rn.ftz.f32 %f30, %f29, %f47, %f44;
196
+ fma.rn.ftz.f32 %f31, %f30, %f47, %f43;
197
+ fma.rn.ftz.f32 %f32, %f31, %f47, %f42;
198
+ fma.rn.ftz.f32 %f33, %f32, %f47, %f41;
199
+ fma.rn.ftz.f32 %f34, %f33, %f47, %f40;
200
+ neg.f32 %f35, %f47;
201
+ selp.f32 %f36, %f35, %f14, %p1;
202
+ fma.rn.ftz.f32 %f48, %f34, %f36, %f36;
203
+ @%p2 bra $L__BB1_4;
204
+ ex2.approx.ftz.f32 %f37, %f48;
205
+ mov.f32 %f38, 0f3F800000;
206
+ sub.f32 %f39, %f38, %f37;
207
+ mov.b32 %r1, %f39;
208
+ mov.b32 %r2, %f14;
209
+ and.b32 %r3, %r2, -2147483648;
210
+ or.b32 %r4, %r3, %r1;
211
+ mov.b32 %f48, %r4;
212
+ $L__BB1_4:
213
+ st.param.f32 [func_retval0+0], %f48;
214
+ ret;
215
+ $L__func_end1:
216
+
217
+ }
218
+ .file 1 "/tmp/torchinductor_root/kp/ckphrtdpgsxl7sfarkkzylhv4st3uhmzvg3u6z5excfp6ydybq74.py"
219
+ .section .debug_abbrev
220
+ {
221
+ .b8 1
222
+ .b8 17
223
+ .b8 1
224
+ .b8 37
225
+ .b8 8
226
+ .b8 19
227
+ .b8 5
228
+ .b8 3
229
+ .b8 8
230
+ .b8 16
231
+ .b8 6
232
+ .b8 27
233
+ .b8 8
234
+ .b8 180
235
+ .b8 66
236
+ .b8 12
237
+ .b8 17
238
+ .b8 1
239
+ .b8 18
240
+ .b8 1
241
+ .b8 0
242
+ .b8 0
243
+ .b8 2
244
+ .b8 46
245
+ .b8 0
246
+ .b8 17
247
+ .b8 1
248
+ .b8 18
249
+ .b8 1
250
+ .b8 64
251
+ .b8 10
252
+ .b8 135
253
+ .b8 64
254
+ .b8 8
255
+ .b8 3
256
+ .b8 8
257
+ .b8 58
258
+ .b8 11
259
+ .b8 59
260
+ .b8 11
261
+ .b8 63
262
+ .b8 12
263
+ .b8 0
264
+ .b8 0
265
+ .b8 0
266
+ }
267
+ .section .debug_info
268
+ {
269
+ .b32 172
270
+ .b8 2
271
+ .b8 0
272
+ .b32 .debug_abbrev
273
+ .b8 8
274
+ .b8 1
275
+ .b8 116
276
+ .b8 114
277
+ .b8 105
278
+ .b8 116
279
+ .b8 111
280
+ .b8 110
281
+ .b8 0
282
+ .b8 2
283
+ .b8 0
284
+ .b8 99
285
+ .b8 107
286
+ .b8 112
287
+ .b8 104
288
+ .b8 114
289
+ .b8 116
290
+ .b8 100
291
+ .b8 112
292
+ .b8 103
293
+ .b8 115
294
+ .b8 120
295
+ .b8 108
296
+ .b8 55
297
+ .b8 115
298
+ .b8 102
299
+ .b8 97
300
+ .b8 114
301
+ .b8 107
302
+ .b8 107
303
+ .b8 122
304
+ .b8 121
305
+ .b8 108
306
+ .b8 104
307
+ .b8 118
308
+ .b8 52
309
+ .b8 115
310
+ .b8 116
311
+ .b8 51
312
+ .b8 117
313
+ .b8 104
314
+ .b8 109
315
+ .b8 122
316
+ .b8 118
317
+ .b8 103
318
+ .b8 51
319
+ .b8 117
320
+ .b8 54
321
+ .b8 122
322
+ .b8 53
323
+ .b8 101
324
+ .b8 120
325
+ .b8 99
326
+ .b8 102
327
+ .b8 112
328
+ .b8 54
329
+ .b8 121
330
+ .b8 100
331
+ .b8 121
332
+ .b8 98
333
+ .b8 113
334
+ .b8 55
335
+ .b8 52
336
+ .b8 46
337
+ .b8 112
338
+ .b8 121
339
+ .b8 0
340
+ .b32 .debug_line
341
+ .b8 47
342
+ .b8 116
343
+ .b8 109
344
+ .b8 112
345
+ .b8 47
346
+ .b8 116
347
+ .b8 111
348
+ .b8 114
349
+ .b8 99
350
+ .b8 104
351
+ .b8 105
352
+ .b8 110
353
+ .b8 100
354
+ .b8 117
355
+ .b8 99
356
+ .b8 116
357
+ .b8 111
358
+ .b8 114
359
+ .b8 95
360
+ .b8 114
361
+ .b8 111
362
+ .b8 111
363
+ .b8 116
364
+ .b8 47
365
+ .b8 107
366
+ .b8 112
367
+ .b8 0
368
+ .b8 1
369
+ .b64 $L__func_begin0
370
+ .b64 $L__func_end0
371
+ .b8 2
372
+ .b64 $L__func_begin0
373
+ .b64 $L__func_end0
374
+ .b8 1
375
+ .b8 156
376
+ .b8 116
377
+ .b8 114
378
+ .b8 105
379
+ .b8 116
380
+ .b8 111
381
+ .b8 110
382
+ .b8 95
383
+ .b8 95
384
+ .b8 48
385
+ .b8 100
386
+ .b8 49
387
+ .b8 100
388
+ .b8 101
389
+ .b8 0
390
+ .b8 116
391
+ .b8 114
392
+ .b8 105
393
+ .b8 116
394
+ .b8 111
395
+ .b8 110
396
+ .b8 95
397
+ .b8 95
398
+ .b8 48
399
+ .b8 100
400
+ .b8 49
401
+ .b8 100
402
+ .b8 101
403
+ .b8 0
404
+ .b8 1
405
+ .b8 18
406
+ .b8 1
407
+ .b8 0
408
+ }
409
+ .section .debug_pubnames
410
+ {
411
+ .b32 $L__pubNames_end0-$L__pubNames_start0
412
+ $L__pubNames_start0:
413
+ .b8 2
414
+ .b8 0
415
+ .b32 .debug_info
416
+ .b32 176
417
+ .b32 125
418
+ .b8 116
419
+ .b8 114
420
+ .b8 105
421
+ .b8 116
422
+ .b8 111
423
+ .b8 110
424
+ .b8 95
425
+ .b8 95
426
+ .b8 48
427
+ .b8 100
428
+ .b8 49
429
+ .b8 100
430
+ .b8 101
431
+ .b8 0
432
+ .b32 0
433
+ $L__pubNames_end0:
434
+ }
435
+ .section .debug_pubtypes
436
+ {
437
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
438
+ $L__pubTypes_start0:
439
+ .b8 2
440
+ .b8 0
441
+ .b32 .debug_info
442
+ .b32 176
443
+ .b32 0
444
+ $L__pubTypes_end0:
445
+ }
446
+ .section .debug_loc { }
.triton/dump/1e922bbbab749da355e4bad9c6b245e6/triton_.ttgir ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
2
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
3
+ tt.func public @triton__0d1de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
4
+ %cst = arith.constant dense<1.000000e+00> : tensor<512xf32, #blocked>
5
+ %cst_0 = arith.constant dense<0.707106769> : tensor<512xf32, #blocked>
6
+ %cst_1 = arith.constant dense<5.000000e-01> : tensor<512xf32, #blocked>
7
+ %c512_i32 = arith.constant 512 : i32
8
+ %0 = tt.get_program_id x : i32
9
+ %1 = arith.muli %0, %c512_i32 : i32
10
+ %2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked>
11
+ %3 = tt.splat %1 : (i32) -> tensor<512xi32, #blocked>
12
+ %4 = arith.addi %3, %2 : tensor<512xi32, #blocked>
13
+ %5 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<512x!tt.ptr<bf16, 1>, #blocked>
14
+ %6 = tt.addptr %5, %4 : tensor<512x!tt.ptr<bf16, 1>, #blocked>, tensor<512xi32, #blocked>
15
+ %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xbf16, #blocked>
16
+ %8 = arith.extf %7 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked>
17
+ %9 = arith.mulf %8, %cst_1 : tensor<512xf32, #blocked>
18
+ %10 = arith.mulf %8, %cst_0 : tensor<512xf32, #blocked>
19
+ %11 = tt.extern_elementwise %10 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_erff"} : (tensor<512xf32, #blocked>) -> tensor<512xf32, #blocked>
20
+ %12 = arith.addf %11, %cst : tensor<512xf32, #blocked>
21
+ %13 = arith.mulf %9, %12 : tensor<512xf32, #blocked>
22
+ %14 = arith.truncf %13 : tensor<512xf32, #blocked> to tensor<512xbf16, #blocked>
23
+ tt.store %6, %14 {cache = 1 : i32, evict = 1 : i32} : tensor<512xbf16, #blocked>
24
+ tt.return
25
+ }
26
+ }
.triton/dump/1ed98b0d136db679153ca6a42fff755c/triton_.llir ADDED
@@ -0,0 +1,503 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @assertFunc_1 = internal constant [25 x i8] c"_call_with_frames_removed"
5
+ @assertFile_1 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
6
+ @assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp13 < 50257"
7
+ @assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed"
8
+ @assertFile_0 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
9
+ @assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257"
10
+ @global_smem = external addrspace(3) global [0 x i8]
11
+ @.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
12
+
13
+ declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr
14
+
15
+ define void @triton__0d1d2d3d4d5de6de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i32 %5, i32 %6) local_unnamed_addr !dbg !7 {
16
+ %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
17
+ %9 = and i32 %8, 31, !dbg !10
18
+ %10 = lshr i32 %8, 5, !dbg !10
19
+ %11 = lshr i32 %8, 6, !dbg !10
20
+ %12 = and i32 %11, 1, !dbg !10
21
+ %13 = and i32 %8, 1, !dbg !10
22
+ %14 = and i32 %10, 1, !dbg !11
23
+ %urem = shl i32 %8, 2, !dbg !11
24
+ %15 = and i32 %urem, 252, !dbg !11
25
+ %16 = shl i32 %8, 1, !dbg !11
26
+ %17 = and i32 %16, 254, !dbg !11
27
+ %18 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !12
28
+ %19 = shl i32 %18, 1, !dbg !13
29
+ %20 = or i32 %19, %12, !dbg !14
30
+ %21 = or i32 %19, %13, !dbg !14
31
+ %22 = sext i32 %20 to i64, !dbg !15
32
+ %23 = getelementptr i64, ptr addrspace(1) %0, i64 %22, !dbg !15
33
+ %24 = sext i32 %21 to i64, !dbg !15
34
+ %25 = getelementptr i64, ptr addrspace(1) %0, i64 %24, !dbg !15
35
+ %26 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !16
36
+ %27 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !16
37
+ %28 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !16
38
+ %29 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !16
39
+ %30 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #6, !dbg !16
40
+ %31 = srem i32 %20, 512, !dbg !17
41
+ %32 = shl nsw i32 %31, 8, !dbg !18
42
+ %33 = or i32 %32, %15, !dbg !19
43
+ %34 = sext i32 %33 to i64, !dbg !20
44
+ %35 = getelementptr float, ptr addrspace(1) %2, i64 %34, !dbg !20
45
+ %36 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %35, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !21
46
+ %37 = extractvalue { i32, i32, i32, i32 } %36, 0, !dbg !21
47
+ %38 = extractvalue { i32, i32, i32, i32 } %36, 1, !dbg !21
48
+ %39 = extractvalue { i32, i32, i32, i32 } %36, 2, !dbg !21
49
+ %40 = extractvalue { i32, i32, i32, i32 } %36, 3, !dbg !21
50
+ %41 = bitcast i32 %37 to float, !dbg !21
51
+ %42 = bitcast i32 %38 to float, !dbg !21
52
+ %43 = bitcast i32 %39 to float, !dbg !21
53
+ %44 = bitcast i32 %40 to float, !dbg !21
54
+ %45 = add i64 %30, 50257, !dbg !22
55
+ %46 = icmp slt i64 %26, 0, !dbg !23
56
+ %47 = icmp slt i64 %30, 0, !dbg !23
57
+ %48 = select i1 %47, i64 %45, i64 %30, !dbg !24
58
+ %49 = icmp ugt i64 %48, 50256, !dbg !25
59
+ br i1 %49, label %50, label %51, !dbg !26
60
+
61
+ 50: ; preds = %7
62
+ tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !26
63
+ br label %51, !dbg !26
64
+
65
+ 51: ; preds = %50, %7
66
+ %52 = shl i64 %26, 8, !dbg !27
67
+ %53 = add i64 %52, 12865792, !dbg !27
68
+ %54 = select i1 %46, i64 %53, i64 %52, !dbg !27
69
+ %55 = zext nneg i32 %15 to i64
70
+ %56 = or i64 %54, %55, !dbg !28
71
+ %57 = getelementptr float, ptr addrspace(1) %1, i64 %56, !dbg !29
72
+ %58 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %57, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !30
73
+ %59 = extractvalue { i32, i32, i32, i32 } %58, 0, !dbg !30
74
+ %60 = extractvalue { i32, i32, i32, i32 } %58, 1, !dbg !30
75
+ %61 = extractvalue { i32, i32, i32, i32 } %58, 2, !dbg !30
76
+ %62 = extractvalue { i32, i32, i32, i32 } %58, 3, !dbg !30
77
+ %63 = bitcast i32 %59 to float, !dbg !30
78
+ %64 = bitcast i32 %60 to float, !dbg !30
79
+ %65 = bitcast i32 %61 to float, !dbg !30
80
+ %66 = bitcast i32 %62 to float, !dbg !30
81
+ %67 = fadd float %41, %63, !dbg !31
82
+ %68 = fadd float %42, %64, !dbg !31
83
+ %69 = fadd float %43, %65, !dbg !31
84
+ %70 = fadd float %44, %66, !dbg !31
85
+ %71 = fadd float %67, 0.000000e+00, !dbg !32
86
+ %72 = fadd float %68, 0.000000e+00, !dbg !32
87
+ %73 = fadd float %69, 0.000000e+00, !dbg !32
88
+ %74 = fadd float %70, 0.000000e+00, !dbg !32
89
+ %75 = fsub float %67, %71, !dbg !36
90
+ %76 = fsub float %68, %72, !dbg !36
91
+ %77 = fsub float %69, %73, !dbg !36
92
+ %78 = fsub float %70, %74, !dbg !36
93
+ %79 = fmul float %67, %75, !dbg !37
94
+ %80 = fmul float %68, %76, !dbg !37
95
+ %81 = fmul float %69, %77, !dbg !37
96
+ %82 = fmul float %70, %78, !dbg !37
97
+ %83 = fadd float %79, 0.000000e+00, !dbg !38
98
+ %84 = fadd float %80, 0.000000e+00, !dbg !38
99
+ %85 = fadd float %81, 0.000000e+00, !dbg !38
100
+ %86 = fadd float %82, 0.000000e+00, !dbg !38
101
+ %87 = fsub float %72, %71, !dbg !39
102
+ %88 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 2.000000e+00) #6, !dbg !43
103
+ %89 = fmul float %88, %87, !dbg !44
104
+ %90 = fadd float %71, %89, !dbg !45
105
+ %91 = fadd float %83, %84, !dbg !46
106
+ %92 = fmul float %87, %87, !dbg !47
107
+ %93 = fmul float %88, %92, !dbg !48
108
+ %94 = fadd float %93, %91, !dbg !49
109
+ %95 = fsub float %73, %90, !dbg !39
110
+ %96 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 3.000000e+00) #6, !dbg !43
111
+ %97 = fmul float %96, %95, !dbg !44
112
+ %98 = fadd float %90, %97, !dbg !45
113
+ %99 = fadd float %85, %94, !dbg !46
114
+ %100 = fmul float %95, %95, !dbg !47
115
+ %101 = fmul float %100, 2.000000e+00, !dbg !50
116
+ %102 = fmul float %96, %101, !dbg !48
117
+ %103 = fadd float %99, %102, !dbg !49
118
+ %104 = fsub float %74, %98, !dbg !39
119
+ %105 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 4.000000e+00) #6, !dbg !43
120
+ %106 = fmul float %105, %104, !dbg !44
121
+ %107 = fadd float %98, %106, !dbg !45
122
+ %108 = fadd float %86, %103, !dbg !46
123
+ %109 = fmul float %104, %104, !dbg !47
124
+ %110 = fmul float %109, 3.000000e+00, !dbg !50
125
+ %111 = fmul float %105, %110, !dbg !48
126
+ %112 = fadd float %108, %111, !dbg !49
127
+ %113 = bitcast float %107 to i32, !dbg !51
128
+ %114 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %113, i32 16, i32 31), !dbg !51
129
+ %115 = bitcast i32 %114 to float, !dbg !51
130
+ %116 = bitcast float %112 to i32, !dbg !51
131
+ %117 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %116, i32 16, i32 31), !dbg !51
132
+ %118 = bitcast i32 %117 to float, !dbg !51
133
+ %119 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 1082130432, i32 16, i32 31), !dbg !51
134
+ %120 = bitcast i32 %119 to float, !dbg !51
135
+ %121 = fsub float %115, %107, !dbg !39
136
+ %122 = fadd float %120, 4.000000e+00, !dbg !53
137
+ %123 = fcmp oeq float %122, 0.000000e+00, !dbg !54
138
+ %124 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %120, float %122) #6, !dbg !43
139
+ %125 = select i1 %123, float 0.000000e+00, float %124, !dbg !55
140
+ %126 = fmul float %125, %121, !dbg !44
141
+ %127 = fadd float %107, %126, !dbg !45
142
+ %128 = fadd float %112, %118, !dbg !46
143
+ %129 = fmul float %121, %121, !dbg !47
144
+ %130 = fmul float %129, 4.000000e+00, !dbg !50
145
+ %131 = fmul float %125, %130, !dbg !48
146
+ %132 = fadd float %128, %131, !dbg !49
147
+ %133 = bitcast float %127 to i32, !dbg !51
148
+ %134 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %133, i32 8, i32 31), !dbg !51
149
+ %135 = bitcast i32 %134 to float, !dbg !51
150
+ %136 = bitcast float %132 to i32, !dbg !51
151
+ %137 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %136, i32 8, i32 31), !dbg !51
152
+ %138 = bitcast i32 %137 to float, !dbg !51
153
+ %139 = bitcast float %122 to i32, !dbg !51
154
+ %140 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %139, i32 8, i32 31), !dbg !51
155
+ %141 = bitcast i32 %140 to float, !dbg !51
156
+ %142 = fsub float %135, %127, !dbg !39
157
+ %143 = fadd float %122, %141, !dbg !53
158
+ %144 = fcmp oeq float %143, 0.000000e+00, !dbg !54
159
+ %145 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %141, float %143) #6, !dbg !43
160
+ %146 = select i1 %144, float 0.000000e+00, float %145, !dbg !55
161
+ %147 = fmul float %146, %142, !dbg !44
162
+ %148 = fadd float %127, %147, !dbg !45
163
+ %149 = fadd float %132, %138, !dbg !46
164
+ %150 = fmul float %142, %142, !dbg !47
165
+ %151 = fmul float %122, %150, !dbg !50
166
+ %152 = fmul float %146, %151, !dbg !48
167
+ %153 = fadd float %149, %152, !dbg !49
168
+ %154 = bitcast float %148 to i32, !dbg !51
169
+ %155 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %154, i32 4, i32 31), !dbg !51
170
+ %156 = bitcast i32 %155 to float, !dbg !51
171
+ %157 = bitcast float %153 to i32, !dbg !51
172
+ %158 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %157, i32 4, i32 31), !dbg !51
173
+ %159 = bitcast i32 %158 to float, !dbg !51
174
+ %160 = bitcast float %143 to i32, !dbg !51
175
+ %161 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %160, i32 4, i32 31), !dbg !51
176
+ %162 = bitcast i32 %161 to float, !dbg !51
177
+ %163 = fsub float %156, %148, !dbg !39
178
+ %164 = fadd float %143, %162, !dbg !53
179
+ %165 = fcmp oeq float %164, 0.000000e+00, !dbg !54
180
+ %166 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %162, float %164) #6, !dbg !43
181
+ %167 = select i1 %165, float 0.000000e+00, float %166, !dbg !55
182
+ %168 = fmul float %167, %163, !dbg !44
183
+ %169 = fadd float %148, %168, !dbg !45
184
+ %170 = fadd float %153, %159, !dbg !46
185
+ %171 = fmul float %163, %163, !dbg !47
186
+ %172 = fmul float %143, %171, !dbg !50
187
+ %173 = fmul float %167, %172, !dbg !48
188
+ %174 = fadd float %170, %173, !dbg !49
189
+ %175 = bitcast float %169 to i32, !dbg !51
190
+ %176 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %175, i32 2, i32 31), !dbg !51
191
+ %177 = bitcast i32 %176 to float, !dbg !51
192
+ %178 = bitcast float %174 to i32, !dbg !51
193
+ %179 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %178, i32 2, i32 31), !dbg !51
194
+ %180 = bitcast i32 %179 to float, !dbg !51
195
+ %181 = bitcast float %164 to i32, !dbg !51
196
+ %182 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %181, i32 2, i32 31), !dbg !51
197
+ %183 = bitcast i32 %182 to float, !dbg !51
198
+ %184 = fsub float %177, %169, !dbg !39
199
+ %185 = fadd float %164, %183, !dbg !53
200
+ %186 = fcmp oeq float %185, 0.000000e+00, !dbg !54
201
+ %187 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %183, float %185) #6, !dbg !43
202
+ %188 = select i1 %186, float 0.000000e+00, float %187, !dbg !55
203
+ %189 = fmul float %188, %184, !dbg !44
204
+ %190 = fadd float %169, %189, !dbg !45
205
+ %191 = fadd float %174, %180, !dbg !46
206
+ %192 = fmul float %184, %184, !dbg !47
207
+ %193 = fmul float %164, %192, !dbg !50
208
+ %194 = fmul float %188, %193, !dbg !48
209
+ %195 = fadd float %191, %194, !dbg !49
210
+ %196 = bitcast float %190 to i32, !dbg !51
211
+ %197 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %196, i32 1, i32 31), !dbg !51
212
+ %198 = bitcast i32 %197 to float, !dbg !51
213
+ %199 = bitcast float %195 to i32, !dbg !51
214
+ %200 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %199, i32 1, i32 31), !dbg !51
215
+ %201 = bitcast i32 %200 to float, !dbg !51
216
+ %202 = bitcast float %185 to i32, !dbg !51
217
+ %203 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %202, i32 1, i32 31), !dbg !51
218
+ %204 = bitcast i32 %203 to float, !dbg !51
219
+ %205 = fsub float %198, %190, !dbg !39
220
+ %206 = fadd float %185, %204, !dbg !53
221
+ %207 = fcmp oeq float %206, 0.000000e+00, !dbg !54
222
+ %208 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %204, float %206) #6, !dbg !43
223
+ %209 = select i1 %207, float 0.000000e+00, float %208, !dbg !55
224
+ %210 = fmul float %205, %209, !dbg !44
225
+ %211 = fadd float %190, %210, !dbg !45
226
+ %212 = fadd float %195, %201, !dbg !46
227
+ %213 = fmul float %205, %205, !dbg !47
228
+ %214 = fmul float %185, %213, !dbg !50
229
+ %215 = fmul float %209, %214, !dbg !48
230
+ %216 = fadd float %212, %215, !dbg !49
231
+ %217 = icmp eq i32 %9, 0, !dbg !51
232
+ %218 = shl nuw nsw i32 %12, 1, !dbg !51
233
+ %219 = or i32 %218, %14, !dbg !51
234
+ %220 = zext nneg i32 %219 to i64, !dbg !51
235
+ %221 = getelementptr float, ptr addrspace(3) @global_smem, i64 %220, !dbg !51
236
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %221, float %211, i1 %217) #6, !dbg !51
237
+ %222 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), i64 %220, !dbg !51
238
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %222, float %216, i1 %217) #6, !dbg !51
239
+ %223 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 32), i64 %220, !dbg !51
240
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %223, float %206, i1 %217) #6, !dbg !51
241
+ tail call void @llvm.nvvm.barrier0(), !dbg !51
242
+ %224 = icmp slt i32 %8, 4, !dbg !51
243
+ %225 = sext i32 %8 to i64, !dbg !51
244
+ %226 = getelementptr float, ptr addrspace(3) @global_smem, i64 %225, !dbg !51
245
+ %227 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %226, i1 %224) #6, !dbg !51
246
+ %228 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), i64 %225, !dbg !51
247
+ %229 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %228, i1 %224) #6, !dbg !51
248
+ %230 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 32), i64 %225, !dbg !51
249
+ %231 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %230, i1 %224) #6, !dbg !51
250
+ %232 = bitcast float %227 to i32, !dbg !51
251
+ %233 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %232, i32 1, i32 31), !dbg !51
252
+ %234 = bitcast i32 %233 to float, !dbg !51
253
+ %235 = bitcast float %229 to i32, !dbg !51
254
+ %236 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %235, i32 1, i32 31), !dbg !51
255
+ %237 = bitcast i32 %236 to float, !dbg !51
256
+ %238 = bitcast float %231 to i32, !dbg !51
257
+ %239 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %238, i32 1, i32 31), !dbg !51
258
+ %240 = bitcast i32 %239 to float, !dbg !51
259
+ %241 = fsub float %234, %227, !dbg !39
260
+ %242 = fadd float %231, %240, !dbg !53
261
+ %243 = fcmp oeq float %242, 0.000000e+00, !dbg !54
262
+ %244 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %240, float %242) #6, !dbg !43
263
+ %245 = select i1 %243, float 0.000000e+00, float %244, !dbg !55
264
+ %246 = fmul float %241, %245, !dbg !44
265
+ %247 = fadd float %227, %246, !dbg !45
266
+ %248 = fadd float %229, %237, !dbg !46
267
+ %249 = fmul float %241, %241, !dbg !47
268
+ %250 = fmul float %231, %249, !dbg !50
269
+ %251 = fmul float %250, %245, !dbg !48
270
+ %252 = fadd float %248, %251, !dbg !49
271
+ %253 = icmp eq i32 %13, 0, !dbg !51
272
+ %254 = and i1 %224, %253, !dbg !51
273
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %226, float %247, i1 %254) #6, !dbg !51
274
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %228, float %252, i1 %254) #6, !dbg !51
275
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %230, float %242, i1 %254) #6, !dbg !51
276
+ tail call void @llvm.nvvm.barrier0(), !dbg !51
277
+ %255 = zext nneg i32 %218 to i64, !dbg !51
278
+ %256 = getelementptr float, ptr addrspace(3) @global_smem, i64 %255, !dbg !51
279
+ %257 = load float, ptr addrspace(3) %256, align 4, !dbg !51
280
+ %258 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), i64 %255, !dbg !51
281
+ %259 = load float, ptr addrspace(3) %258, align 4, !dbg !51
282
+ %260 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %35, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !56
283
+ %261 = zext nneg i32 %17 to i64, !dbg !57
284
+ %262 = getelementptr float, ptr addrspace(1) %3, i64 %261, !dbg !57
285
+ %263 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_last.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %262, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !58
286
+ br i1 %49, label %264, label %265, !dbg !59
287
+
288
+ 264: ; preds = %51
289
+ tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 883, ptr nonnull @assertFunc_1, i64 1), !dbg !59
290
+ br label %265, !dbg !59
291
+
292
+ 265: ; preds = %264, %51
293
+ %266 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %57, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !60
294
+ %267 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %259, float 2.560000e+02) #6, !dbg !61
295
+ %268 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %259, float 2.560000e+02) #6, !dbg !61
296
+ %269 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %259, float 2.560000e+02) #6, !dbg !61
297
+ %270 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %259, float 2.560000e+02) #6, !dbg !61
298
+ %271 = fadd float %267, 0x3EE4F8B580000000, !dbg !62
299
+ %272 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
300
+ %.not.i = icmp eq i32 %272, 0, !dbg !63
301
+ br i1 %.not.i, label %275, label %273, !dbg !63
302
+
303
+ 273: ; preds = %265
304
+ %274 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %271), !dbg !63
305
+ br label %__nv_rsqrtf.exit, !dbg !63
306
+
307
+ 275: ; preds = %265
308
+ %276 = tail call float @llvm.nvvm.rsqrt.approx.f(float %271), !dbg !63
309
+ br label %__nv_rsqrtf.exit, !dbg !63
310
+
311
+ __nv_rsqrtf.exit: ; preds = %273, %275
312
+ %.0.i = phi float [ %274, %273 ], [ %276, %275 ], !dbg !63
313
+ %277 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
314
+ %278 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
315
+ %279 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
316
+ %280 = extractvalue { i32, i32, i32, i32 } %266, 3, !dbg !60
317
+ %281 = bitcast i32 %280 to float, !dbg !60
318
+ %282 = extractvalue { i32, i32, i32, i32 } %260, 3, !dbg !56
319
+ %283 = bitcast i32 %282 to float, !dbg !56
320
+ %284 = fadd float %283, %281, !dbg !64
321
+ %285 = fsub float %284, %257, !dbg !65
322
+ %286 = extractvalue { i32, i32, i32, i32 } %266, 2, !dbg !60
323
+ %287 = bitcast i32 %286 to float, !dbg !60
324
+ %288 = extractvalue { i32, i32, i32, i32 } %260, 2, !dbg !56
325
+ %289 = bitcast i32 %288 to float, !dbg !56
326
+ %290 = fadd float %289, %287, !dbg !64
327
+ %291 = fsub float %290, %257, !dbg !65
328
+ %292 = extractvalue { i32, i32, i32, i32 } %266, 1, !dbg !60
329
+ %293 = bitcast i32 %292 to float, !dbg !60
330
+ %294 = extractvalue { i32, i32, i32, i32 } %260, 1, !dbg !56
331
+ %295 = bitcast i32 %294 to float, !dbg !56
332
+ %296 = fadd float %295, %293, !dbg !64
333
+ %297 = fsub float %296, %257, !dbg !65
334
+ %298 = extractvalue { i32, i32, i32, i32 } %266, 0, !dbg !60
335
+ %299 = bitcast i32 %298 to float, !dbg !60
336
+ %300 = extractvalue { i32, i32, i32, i32 } %260, 0, !dbg !56
337
+ %301 = bitcast i32 %300 to float, !dbg !56
338
+ %302 = fadd float %301, %299, !dbg !64
339
+ %303 = fsub float %302, %257, !dbg !65
340
+ %304 = extractvalue { i32, i32 } %263, 0, !dbg !58
341
+ %305 = extractvalue { i32, i32 } %263, 1, !dbg !58
342
+ %306 = fmul float %303, %.0.i, !dbg !66
343
+ %307 = fmul float %297, %.0.i, !dbg !66
344
+ %308 = fmul float %291, %.0.i, !dbg !66
345
+ %309 = fmul float %285, %.0.i, !dbg !66
346
+ tail call void @llvm.nvvm.barrier0(), !dbg !67
347
+ %310 = getelementptr float, ptr addrspace(3) @global_smem, i64 %261, !dbg !67
348
+ %311 = insertelement <2 x i32> undef, i32 %304, i64 0, !dbg !67
349
+ %312 = insertelement <2 x i32> %311, i32 %305, i64 1, !dbg !67
350
+ store <2 x i32> %312, ptr addrspace(3) %310, align 8, !dbg !67
351
+ tail call void @llvm.nvvm.barrier0(), !dbg !67
352
+ %313 = getelementptr float, ptr addrspace(3) @global_smem, i64 %55, !dbg !67
353
+ %314 = load float, ptr addrspace(3) %313, align 16, !dbg !67
354
+ %315 = getelementptr inbounds <4 x float>, ptr addrspace(3) %313, i64 0, i64 1, !dbg !67
355
+ %316 = load float, ptr addrspace(3) %315, align 4, !dbg !67
356
+ %317 = getelementptr inbounds <4 x float>, ptr addrspace(3) %313, i64 0, i64 2, !dbg !67
357
+ %318 = load float, ptr addrspace(3) %317, align 8, !dbg !67
358
+ %319 = getelementptr inbounds <4 x float>, ptr addrspace(3) %313, i64 0, i64 3, !dbg !67
359
+ %320 = load float, ptr addrspace(3) %319, align 4, !dbg !67
360
+ %321 = fmul float %306, %314, !dbg !67
361
+ %322 = fmul float %307, %316, !dbg !67
362
+ %323 = fmul float %308, %318, !dbg !67
363
+ %324 = fmul float %309, %320, !dbg !67
364
+ %325 = shl i32 %20, 8, !dbg !68
365
+ %326 = or i32 %325, %15, !dbg !69
366
+ %327 = sext i32 %326 to i64, !dbg !70
367
+ %328 = getelementptr i16, ptr addrspace(1) %4, i64 %327, !dbg !70
368
+ %329 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %321) #6, !dbg !71
369
+ %330 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %322) #6, !dbg !71
370
+ %331 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %323) #6, !dbg !71
371
+ %332 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %324) #6, !dbg !71
372
+ %333 = insertelement <2 x i16> undef, i16 %329, i64 0, !dbg !71
373
+ %334 = insertelement <2 x i16> %333, i16 %330, i64 1, !dbg !71
374
+ %335 = bitcast <2 x i16> %334 to i32, !dbg !71
375
+ %336 = insertelement <2 x i16> undef, i16 %331, i64 0, !dbg !71
376
+ %337 = insertelement <2 x i16> %336, i16 %332, i64 1, !dbg !71
377
+ %338 = bitcast <2 x i16> %337 to i32, !dbg !71
378
+ tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %335, i32 %338, ptr addrspace(1) %328, i1 true) #6, !dbg !71
379
+ ret void, !dbg !72
380
+ }
381
+
382
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
383
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
384
+
385
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
386
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
387
+
388
+ ; Function Attrs: convergent nocallback nounwind
389
+ declare void @llvm.nvvm.barrier0() #2
390
+
391
+ ; Function Attrs: alwaysinline nounwind
392
+ define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
393
+ %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
394
+ %.not = icmp eq i32 %1, 0
395
+ br i1 %.not, label %4, label %2
396
+
397
+ 2: ; preds = %0
398
+ %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
399
+ br label %6
400
+
401
+ 4: ; preds = %0
402
+ %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
403
+ br label %6
404
+
405
+ 6: ; preds = %4, %2
406
+ %.0 = phi float [ %3, %2 ], [ %5, %4 ]
407
+ ret float %.0
408
+ }
409
+
410
+ declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
411
+
412
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
413
+ declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
414
+
415
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
416
+ declare float @llvm.nvvm.rsqrt.approx.f(float) #5
417
+
418
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
419
+ attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
420
+ attributes #2 = { convergent nocallback nounwind }
421
+ attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
422
+ attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
423
+ attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
424
+ attributes #6 = { nounwind }
425
+
426
+ !llvm.module.flags = !{!0, !1}
427
+ !llvm.dbg.cu = !{!2}
428
+ !nvvm.annotations = !{!4, !5, !5, !4}
429
+ !llvm.ident = !{!6}
430
+
431
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
432
+ !1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
433
+ !2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
434
+ !3 = !DIFile(filename: "cgx5lxpuexpindj4dsmjz5x42uhyy7iskevq7ovzpwagb3t5powj.py", directory: "/tmp/torchinductor_root/gx")
435
+ !4 = !{ptr @triton__0d1d2d3d4d5de6de, !"kernel", i32 1}
436
+ !5 = !{ptr @triton__0d1d2d3d4d5de6de, !"maxntidx", i32 128}
437
+ !6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
438
+ !7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5de6de", linkageName: "triton__0d1d2d3d4d5de6de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
439
+ !8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
440
+ !9 = !{}
441
+ !10 = !DILocation(line: 22, column: 44, scope: !7)
442
+ !11 = !DILocation(line: 24, column: 33, scope: !7)
443
+ !12 = !DILocation(line: 21, column: 28, scope: !7)
444
+ !13 = !DILocation(line: 21, column: 33, scope: !7)
445
+ !14 = !DILocation(line: 22, column: 23, scope: !7)
446
+ !15 = !DILocation(line: 26, column: 30, scope: !7)
447
+ !16 = !DILocation(line: 26, column: 35, scope: !7)
448
+ !17 = !DILocation(line: 27, column: 18, scope: !7)
449
+ !18 = !DILocation(line: 35, column: 44, scope: !7)
450
+ !19 = !DILocation(line: 35, column: 40, scope: !7)
451
+ !20 = !DILocation(line: 35, column: 34, scope: !7)
452
+ !21 = !DILocation(line: 35, column: 50, scope: !7)
453
+ !22 = !DILocation(line: 36, column: 22, scope: !7)
454
+ !23 = !DILocation(line: 37, column: 22, scope: !7)
455
+ !24 = !DILocation(line: 38, column: 36, scope: !7)
456
+ !25 = !DILocation(line: 39, column: 40, scope: !7)
457
+ !26 = !DILocation(line: 39, column: 55, scope: !7)
458
+ !27 = !DILocation(line: 40, column: 44, scope: !7)
459
+ !28 = !DILocation(line: 40, column: 40, scope: !7)
460
+ !29 = !DILocation(line: 40, column: 34, scope: !7)
461
+ !30 = !DILocation(line: 40, column: 52, scope: !7)
462
+ !31 = !DILocation(line: 41, column: 22, scope: !7)
463
+ !32 = !DILocation(line: 98, column: 22, scope: !33, inlinedAt: !35)
464
+ !33 = distinct !DILexicalBlockFile(scope: !7, file: !34, discriminator: 0)
465
+ !34 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
466
+ !35 = !DILocation(line: 44, column: 38, scope: !33)
467
+ !36 = !DILocation(line: 101, column: 30, scope: !33, inlinedAt: !35)
468
+ !37 = !DILocation(line: 101, column: 22, scope: !33, inlinedAt: !35)
469
+ !38 = !DILocation(line: 101, column: 13, scope: !33, inlinedAt: !35)
470
+ !39 = !DILocation(line: 108, column: 21, scope: !40, inlinedAt: !41)
471
+ !40 = distinct !DILexicalBlockFile(scope: !33, file: !34, discriminator: 0)
472
+ !41 = !DILocation(line: 120, column: 46, scope: !40, inlinedAt: !42)
473
+ !42 = !DILocation(line: 50, column: 41, scope: !40)
474
+ !43 = !DILocation(line: 110, column: 60, scope: !40, inlinedAt: !41)
475
+ !44 = !DILocation(line: 112, column: 25, scope: !40, inlinedAt: !41)
476
+ !45 = !DILocation(line: 112, column: 17, scope: !40, inlinedAt: !41)
477
+ !46 = !DILocation(line: 113, column: 15, scope: !40, inlinedAt: !41)
478
+ !47 = !DILocation(line: 113, column: 30, scope: !40, inlinedAt: !41)
479
+ !48 = !DILocation(line: 113, column: 49, scope: !40, inlinedAt: !41)
480
+ !49 = !DILocation(line: 113, column: 22, scope: !40, inlinedAt: !41)
481
+ !50 = !DILocation(line: 113, column: 38, scope: !40, inlinedAt: !41)
482
+ !51 = !DILocation(line: 120, column: 46, scope: !33, inlinedAt: !52)
483
+ !52 = !DILocation(line: 50, column: 41, scope: !33)
484
+ !53 = !DILocation(line: 109, column: 28, scope: !40, inlinedAt: !41)
485
+ !54 = !DILocation(line: 110, column: 39, scope: !40, inlinedAt: !41)
486
+ !55 = !DILocation(line: 110, column: 49, scope: !40, inlinedAt: !41)
487
+ !56 = !DILocation(line: 59, column: 51, scope: !7)
488
+ !57 = !DILocation(line: 60, column: 35, scope: !7)
489
+ !58 = !DILocation(line: 60, column: 40, scope: !7)
490
+ !59 = !DILocation(line: 64, column: 57, scope: !7)
491
+ !60 = !DILocation(line: 65, column: 54, scope: !7)
492
+ !61 = !DILocation(line: 69, column: 23, scope: !7)
493
+ !62 = !DILocation(line: 71, column: 24, scope: !7)
494
+ !63 = !DILocation(line: 72, column: 30, scope: !7)
495
+ !64 = !DILocation(line: 66, column: 24, scope: !7)
496
+ !65 = !DILocation(line: 67, column: 24, scope: !7)
497
+ !66 = !DILocation(line: 73, column: 24, scope: !7)
498
+ !67 = !DILocation(line: 74, column: 24, scope: !7)
499
+ !68 = !DILocation(line: 76, column: 39, scope: !7)
500
+ !69 = !DILocation(line: 76, column: 35, scope: !7)
501
+ !70 = !DILocation(line: 76, column: 29, scope: !7)
502
+ !71 = !DILocation(line: 76, column: 52, scope: !7)
503
+ !72 = !DILocation(line: 55, column: 4, scope: !7)
.triton/dump/1ed98b0d136db679153ca6a42fff755c/triton_.ptx ADDED
@@ -0,0 +1,988 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3d4d5de6de
10
+ .extern .func __assertfail
11
+ (
12
+ .param .b64 __assertfail_param_0,
13
+ .param .b64 __assertfail_param_1,
14
+ .param .b32 __assertfail_param_2,
15
+ .param .b64 __assertfail_param_3,
16
+ .param .b64 __assertfail_param_4
17
+ )
18
+ ;
19
+ .global .align 1 .b8 assertFunc_1[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
20
+ .global .align 1 .b8 assertFile_1[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
21
+ .global .align 1 .b8 assertMessage_1[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 51, 32, 60, 32, 53, 48, 50, 53, 55};
22
+ .global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
23
+ .global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
24
+ .global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55};
25
+ .extern .shared .align 1 .b8 global_smem[];
26
+ .global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
27
+
28
+ .visible .entry triton__0d1d2d3d4d5de6de(
29
+ .param .u64 triton__0d1d2d3d4d5de6de_param_0,
30
+ .param .u64 triton__0d1d2d3d4d5de6de_param_1,
31
+ .param .u64 triton__0d1d2d3d4d5de6de_param_2,
32
+ .param .u64 triton__0d1d2d3d4d5de6de_param_3,
33
+ .param .u64 triton__0d1d2d3d4d5de6de_param_4,
34
+ .param .u32 triton__0d1d2d3d4d5de6de_param_5,
35
+ .param .u32 triton__0d1d2d3d4d5de6de_param_6
36
+ )
37
+ .maxntid 128, 1, 1
38
+ {
39
+ .reg .pred %p<50>;
40
+ .reg .b16 %rs<5>;
41
+ .reg .b32 %r<169>;
42
+ .reg .f32 %f<153>;
43
+ .reg .b64 %rd<53>;
44
+ .loc 1 18 0
45
+ $L__func_begin0:
46
+ .loc 1 18 0
47
+
48
+ ld.param.u64 %rd6, [triton__0d1d2d3d4d5de6de_param_3];
49
+ ld.param.u64 %rd5, [triton__0d1d2d3d4d5de6de_param_1];
50
+ ld.param.u64 %rd19, [triton__0d1d2d3d4d5de6de_param_0];
51
+ $L__tmp0:
52
+ .loc 1 22 44
53
+ mov.u32 %r1, %tid.x;
54
+ and.b32 %r2, %r1, 31;
55
+ ld.param.u64 %rd20, [triton__0d1d2d3d4d5de6de_param_2];
56
+ bfe.u32 %r3, %r1, 6, 1;
57
+ and.b32 %r4, %r1, 1;
58
+ .loc 1 24 33
59
+ bfe.u32 %r5, %r1, 5, 1;
60
+ shl.b32 %r24, %r1, 2;
61
+ and.b32 %r6, %r24, 252;
62
+ shl.b32 %r25, %r1, 1;
63
+ and.b32 %r7, %r25, 254;
64
+ .loc 1 21 28
65
+ mov.u32 %r15, %ctaid.x;
66
+ .loc 1 21 33
67
+ shl.b32 %r26, %r15, 1;
68
+ .loc 1 22 23
69
+ or.b32 %r8, %r26, %r3;
70
+ or.b32 %r27, %r26, %r4;
71
+ .loc 1 26 30
72
+ mul.wide.s32 %rd21, %r8, 8;
73
+ add.s64 %rd9, %rd19, %rd21;
74
+ mul.wide.s32 %rd22, %r27, 8;
75
+ add.s64 %rd17, %rd19, %rd22;
76
+ mov.pred %p44, -1;
77
+ .loc 1 26 35
78
+ mov.u64 %rd8, 0x0;
79
+ @%p44 ld.global.L1::evict_last.b64 { %rd8 }, [ %rd9 + 0 ];
80
+ mov.u64 %rd10, 0x0;
81
+ @%p44 ld.global.L1::evict_last.b64 { %rd10 }, [ %rd9 + 0 ];
82
+ mov.u64 %rd12, 0x0;
83
+ @%p44 ld.global.L1::evict_last.b64 { %rd12 }, [ %rd9 + 0 ];
84
+ mov.u64 %rd14, 0x0;
85
+ @%p44 ld.global.L1::evict_last.b64 { %rd14 }, [ %rd9 + 0 ];
86
+ mov.u64 %rd16, 0x0;
87
+ @%p44 ld.global.L1::evict_last.b64 { %rd16 }, [ %rd17 + 0 ];
88
+ .loc 1 27 18
89
+ bfe.s32 %r28, %r15, 30, 1;
90
+ shr.u32 %r29, %r28, 23;
91
+ add.s32 %r30, %r8, %r29;
92
+ and.b32 %r31, %r30, 16776704;
93
+ sub.s32 %r32, %r8, %r31;
94
+ .loc 1 35 44
95
+ shl.b32 %r33, %r32, 8;
96
+ .loc 1 35 40
97
+ or.b32 %r34, %r33, %r6;
98
+ .loc 1 35 34
99
+ mul.wide.s32 %rd23, %r34, 4;
100
+ add.s64 %rd33, %rd20, %rd23;
101
+ mov.b32 %r137, 0;
102
+ .loc 1 35 50
103
+ mov.u32 %r16, 0x0;
104
+ mov.u32 %r17, 0x0;
105
+ mov.u32 %r18, 0x0;
106
+ mov.u32 %r19, 0x0;
107
+ @%p44 ld.global.L1::evict_last.v4.b32 { %r16, %r17, %r18, %r19 }, [ %rd33 + 0 ];
108
+ @!%p44 mov.u32 %r16, %r137;
109
+ @!%p44 mov.u32 %r17, %r137;
110
+ @!%p44 mov.u32 %r18, %r137;
111
+ @!%p44 mov.u32 %r19, %r137;
112
+ mov.b32 %f1, %r16;
113
+ mov.b32 %f2, %r17;
114
+ mov.b32 %f3, %r18;
115
+ mov.b32 %f4, %r19;
116
+ .loc 1 36 22
117
+ add.s64 %rd24, %rd16, 50257;
118
+ .loc 1 37 22
119
+ setp.lt.s64 %p11, %rd16, 0;
120
+ .loc 1 38 36
121
+ selp.b64 %rd3, %rd24, %rd16, %p11;
122
+ .loc 1 39 40
123
+ setp.lt.u64 %p12, %rd3, 50257;
124
+ mov.b32 %r168, 883;
125
+ mov.u64 %rd52, 1;
126
+ .loc 1 39 55
127
+ @%p12 bra $L__BB0_2;
128
+ mov.u64 %rd25, assertMessage_0;
129
+ cvta.global.u64 %rd26, %rd25;
130
+ mov.u64 %rd27, assertFile_0;
131
+ cvta.global.u64 %rd28, %rd27;
132
+ mov.u64 %rd29, assertFunc_0;
133
+ cvta.global.u64 %rd30, %rd29;
134
+ { // callseq 4, 0
135
+ .reg .b32 temp_param_reg;
136
+ .param .b64 param0;
137
+ st.param.b64 [param0+0], %rd26;
138
+ .param .b64 param1;
139
+ st.param.b64 [param1+0], %rd28;
140
+ .param .b32 param2;
141
+ st.param.b32 [param2+0], %r168;
142
+ .param .b64 param3;
143
+ st.param.b64 [param3+0], %rd30;
144
+ .param .b64 param4;
145
+ st.param.b64 [param4+0], %rd52;
146
+ call.uni
147
+ __assertfail,
148
+ (
149
+ param0,
150
+ param1,
151
+ param2,
152
+ param3,
153
+ param4
154
+ );
155
+ } // callseq 4
156
+ $L__BB0_2:
157
+ .loc 1 0 55
158
+ ld.param.u64 %rd7, [triton__0d1d2d3d4d5de6de_param_4];
159
+ .loc 1 37 22
160
+ setp.lt.s64 %p36, %rd8, 0;
161
+ .loc 1 40 44
162
+ shl.b64 %rd35, %rd8, 8;
163
+ add.s64 %rd36, %rd35, 12865792;
164
+ selp.b64 %rd37, %rd36, %rd35, %p36;
165
+ cvt.u64.u32 %rd38, %r6;
166
+ .loc 1 40 40
167
+ or.b64 %rd39, %rd37, %rd38;
168
+ .loc 1 40 34
169
+ shl.b64 %rd40, %rd39, 2;
170
+ add.s64 %rd49, %rd5, %rd40;
171
+ .loc 1 40 52
172
+ mov.u32 %r36, 0x0;
173
+ mov.u32 %r37, 0x0;
174
+ mov.u32 %r38, 0x0;
175
+ mov.u32 %r39, 0x0;
176
+ @%p44 ld.global.L1::evict_last.v4.b32 { %r36, %r37, %r38, %r39 }, [ %rd49 + 0 ];
177
+ @!%p44 mov.u32 %r36, %r137;
178
+ @!%p44 mov.u32 %r37, %r137;
179
+ @!%p44 mov.u32 %r38, %r137;
180
+ @!%p44 mov.u32 %r39, %r137;
181
+ mov.b32 %f7, %r36;
182
+ mov.b32 %f8, %r37;
183
+ mov.b32 %f9, %r38;
184
+ mov.b32 %f10, %r39;
185
+ .loc 1 41 22
186
+ add.f32 %f11, %f1, %f7;
187
+ add.f32 %f12, %f2, %f8;
188
+ add.f32 %f13, %f3, %f9;
189
+ add.f32 %f14, %f4, %f10;
190
+ $L__tmp1:
191
+ .loc 2 98 22
192
+ add.f32 %f15, %f11, 0f00000000;
193
+ add.f32 %f16, %f12, 0f00000000;
194
+ add.f32 %f17, %f13, 0f00000000;
195
+ add.f32 %f18, %f14, 0f00000000;
196
+ .loc 2 101 30
197
+ sub.f32 %f19, %f11, %f15;
198
+ sub.f32 %f20, %f12, %f16;
199
+ sub.f32 %f21, %f13, %f17;
200
+ sub.f32 %f22, %f14, %f18;
201
+ .loc 2 101 13
202
+ fma.rn.f32 %f23, %f11, %f19, 0f00000000;
203
+ fma.rn.f32 %f24, %f12, %f20, 0f00000000;
204
+ fma.rn.f32 %f25, %f13, %f21, 0f00000000;
205
+ fma.rn.f32 %f26, %f14, %f22, 0f00000000;
206
+ $L__tmp2:
207
+ .loc 2 108 21
208
+ sub.f32 %f27, %f16, %f15;
209
+ mov.b32 %r45, 1065353216;
210
+ mov.b32 %r46, 1073741824;
211
+ .loc 2 110 60
212
+ div.full.f32 %r44, %r45, %r46;
213
+ mov.b32 %f28, %r44;
214
+ .loc 2 112 17
215
+ fma.rn.f32 %f29, %f28, %f27, %f15;
216
+ .loc 2 113 15
217
+ add.f32 %f30, %f23, %f24;
218
+ .loc 2 113 30
219
+ mul.f32 %f31, %f27, %f27;
220
+ .loc 2 113 22
221
+ fma.rn.f32 %f32, %f28, %f31, %f30;
222
+ .loc 2 108 21
223
+ sub.f32 %f33, %f17, %f29;
224
+ mov.b32 %r49, 1077936128;
225
+ .loc 2 110 60
226
+ div.full.f32 %r47, %r45, %r49;
227
+ mov.b32 %f34, %r47;
228
+ .loc 2 112 17
229
+ fma.rn.f32 %f35, %f34, %f33, %f29;
230
+ .loc 2 113 15
231
+ add.f32 %f36, %f25, %f32;
232
+ .loc 2 113 30
233
+ mul.f32 %f37, %f33, %f33;
234
+ .loc 2 113 38
235
+ fma.rn.f32 %f38, %f33, %f33, %f37;
236
+ .loc 2 113 22
237
+ fma.rn.f32 %f39, %f34, %f38, %f36;
238
+ .loc 2 108 21
239
+ sub.f32 %f40, %f18, %f35;
240
+ mov.b32 %r52, 1082130432;
241
+ .loc 2 110 60
242
+ div.full.f32 %r50, %r45, %r52;
243
+ mov.b32 %f41, %r50;
244
+ .loc 2 112 17
245
+ fma.rn.f32 %f42, %f41, %f40, %f35;
246
+ .loc 2 113 15
247
+ add.f32 %f43, %f26, %f39;
248
+ .loc 2 113 30
249
+ mul.f32 %f44, %f40, %f40;
250
+ .loc 2 113 38
251
+ mul.f32 %f45, %f44, 0f40400000;
252
+ .loc 2 113 22
253
+ fma.rn.f32 %f46, %f41, %f45, %f43;
254
+ $L__tmp3:
255
+ .loc 2 120 46
256
+ mov.b32 %r101, %f42;
257
+ shfl.sync.bfly.b32 %r102, %r101, 16, 31, -1;
258
+ mov.b32 %f47, %r102;
259
+ mov.b32 %r103, %f46;
260
+ shfl.sync.bfly.b32 %r104, %r103, 16, 31, -1;
261
+ mov.b32 %f48, %r104;
262
+ shfl.sync.bfly.b32 %r54, %r52, 16, 31, -1;
263
+ mov.b32 %f49, %r54;
264
+ $L__tmp4:
265
+ .loc 2 108 21
266
+ sub.f32 %f50, %f47, %f42;
267
+ .loc 2 109 28
268
+ add.f32 %f51, %f49, 0f40800000;
269
+ .loc 2 110 39
270
+ setp.eq.f32 %p37, %f51, 0f00000000;
271
+ .loc 2 110 60
272
+ mov.b32 %r55, %f51;
273
+ div.full.f32 %r53, %r54, %r55;
274
+ mov.b32 %f52, %r53;
275
+ .loc 2 110 49
276
+ selp.f32 %f53, 0f00000000, %f52, %p37;
277
+ .loc 2 112 17
278
+ fma.rn.f32 %f54, %f53, %f50, %f42;
279
+ .loc 2 113 15
280
+ add.f32 %f55, %f46, %f48;
281
+ .loc 2 113 30
282
+ mul.f32 %f56, %f50, %f50;
283
+ .loc 2 113 38
284
+ mul.f32 %f57, %f56, 0f40800000;
285
+ .loc 2 113 22
286
+ fma.rn.f32 %f58, %f53, %f57, %f55;
287
+ $L__tmp5:
288
+ .loc 2 120 46
289
+ mov.b32 %r105, %f54;
290
+ shfl.sync.bfly.b32 %r106, %r105, 8, 31, -1;
291
+ mov.b32 %f59, %r106;
292
+ mov.b32 %r107, %f58;
293
+ shfl.sync.bfly.b32 %r108, %r107, 8, 31, -1;
294
+ mov.b32 %f60, %r108;
295
+ shfl.sync.bfly.b32 %r57, %r55, 8, 31, -1;
296
+ mov.b32 %f61, %r57;
297
+ $L__tmp6:
298
+ .loc 2 108 21
299
+ sub.f32 %f62, %f59, %f54;
300
+ .loc 2 109 28
301
+ add.f32 %f63, %f51, %f61;
302
+ .loc 2 110 39
303
+ setp.eq.f32 %p38, %f63, 0f00000000;
304
+ .loc 2 110 60
305
+ mov.b32 %r58, %f63;
306
+ div.full.f32 %r56, %r57, %r58;
307
+ mov.b32 %f64, %r56;
308
+ .loc 2 110 49
309
+ selp.f32 %f65, 0f00000000, %f64, %p38;
310
+ .loc 2 112 17
311
+ fma.rn.f32 %f66, %f65, %f62, %f54;
312
+ .loc 2 113 15
313
+ add.f32 %f67, %f58, %f60;
314
+ .loc 2 113 30
315
+ mul.f32 %f68, %f62, %f62;
316
+ .loc 2 113 38
317
+ mul.f32 %f69, %f51, %f68;
318
+ .loc 2 113 22
319
+ fma.rn.f32 %f70, %f65, %f69, %f67;
320
+ $L__tmp7:
321
+ .loc 2 120 46
322
+ mov.b32 %r109, %f66;
323
+ shfl.sync.bfly.b32 %r110, %r109, 4, 31, -1;
324
+ mov.b32 %f71, %r110;
325
+ mov.b32 %r111, %f70;
326
+ shfl.sync.bfly.b32 %r112, %r111, 4, 31, -1;
327
+ mov.b32 %f72, %r112;
328
+ shfl.sync.bfly.b32 %r60, %r58, 4, 31, -1;
329
+ mov.b32 %f73, %r60;
330
+ $L__tmp8:
331
+ .loc 2 108 21
332
+ sub.f32 %f74, %f71, %f66;
333
+ .loc 2 109 28
334
+ add.f32 %f75, %f63, %f73;
335
+ .loc 2 110 39
336
+ setp.eq.f32 %p39, %f75, 0f00000000;
337
+ .loc 2 110 60
338
+ mov.b32 %r61, %f75;
339
+ div.full.f32 %r59, %r60, %r61;
340
+ mov.b32 %f76, %r59;
341
+ .loc 2 110 49
342
+ selp.f32 %f77, 0f00000000, %f76, %p39;
343
+ .loc 2 112 17
344
+ fma.rn.f32 %f78, %f77, %f74, %f66;
345
+ .loc 2 113 15
346
+ add.f32 %f79, %f70, %f72;
347
+ .loc 2 113 30
348
+ mul.f32 %f80, %f74, %f74;
349
+ .loc 2 113 38
350
+ mul.f32 %f81, %f63, %f80;
351
+ .loc 2 113 22
352
+ fma.rn.f32 %f82, %f77, %f81, %f79;
353
+ $L__tmp9:
354
+ .loc 2 120 46
355
+ mov.b32 %r113, %f78;
356
+ shfl.sync.bfly.b32 %r114, %r113, 2, 31, -1;
357
+ mov.b32 %f83, %r114;
358
+ mov.b32 %r115, %f82;
359
+ shfl.sync.bfly.b32 %r116, %r115, 2, 31, -1;
360
+ mov.b32 %f84, %r116;
361
+ shfl.sync.bfly.b32 %r63, %r61, 2, 31, -1;
362
+ mov.b32 %f85, %r63;
363
+ $L__tmp10:
364
+ .loc 2 108 21
365
+ sub.f32 %f86, %f83, %f78;
366
+ .loc 2 109 28
367
+ add.f32 %f87, %f75, %f85;
368
+ .loc 2 110 39
369
+ setp.eq.f32 %p40, %f87, 0f00000000;
370
+ .loc 2 110 60
371
+ mov.b32 %r64, %f87;
372
+ div.full.f32 %r62, %r63, %r64;
373
+ mov.b32 %f88, %r62;
374
+ .loc 2 110 49
375
+ selp.f32 %f89, 0f00000000, %f88, %p40;
376
+ .loc 2 112 17
377
+ fma.rn.f32 %f90, %f89, %f86, %f78;
378
+ .loc 2 113 15
379
+ add.f32 %f91, %f82, %f84;
380
+ .loc 2 113 30
381
+ mul.f32 %f92, %f86, %f86;
382
+ .loc 2 113 38
383
+ mul.f32 %f93, %f75, %f92;
384
+ .loc 2 113 22
385
+ fma.rn.f32 %f94, %f89, %f93, %f91;
386
+ $L__tmp11:
387
+ .loc 2 120 46
388
+ mov.b32 %r117, %f90;
389
+ shfl.sync.bfly.b32 %r118, %r117, 1, 31, -1;
390
+ mov.b32 %f95, %r118;
391
+ mov.b32 %r119, %f94;
392
+ shfl.sync.bfly.b32 %r120, %r119, 1, 31, -1;
393
+ mov.b32 %f96, %r120;
394
+ shfl.sync.bfly.b32 %r66, %r64, 1, 31, -1;
395
+ mov.b32 %f97, %r66;
396
+ $L__tmp12:
397
+ .loc 2 108 21
398
+ sub.f32 %f98, %f95, %f90;
399
+ .loc 2 109 28
400
+ add.f32 %f99, %f87, %f97;
401
+ .loc 2 110 39
402
+ setp.eq.f32 %p41, %f99, 0f00000000;
403
+ .loc 2 110 60
404
+ mov.b32 %r67, %f99;
405
+ div.full.f32 %r65, %r66, %r67;
406
+ mov.b32 %f100, %r65;
407
+ .loc 2 110 49
408
+ selp.f32 %f101, 0f00000000, %f100, %p41;
409
+ .loc 2 112 17
410
+ fma.rn.f32 %f102, %f98, %f101, %f90;
411
+ .loc 2 113 15
412
+ add.f32 %f103, %f94, %f96;
413
+ .loc 2 113 30
414
+ mul.f32 %f104, %f98, %f98;
415
+ .loc 2 113 38
416
+ mul.f32 %f105, %f87, %f104;
417
+ .loc 2 113 22
418
+ fma.rn.f32 %f106, %f101, %f105, %f103;
419
+ $L__tmp13:
420
+ .loc 2 120 46
421
+ setp.eq.s32 %p18, %r2, 0;
422
+ shl.b32 %r121, %r5, 2;
423
+ shl.b32 %r122, %r3, 3;
424
+ or.b32 %r123, %r122, %r121;
425
+ mov.u32 %r124, global_smem;
426
+ add.s32 %r68, %r124, %r123;
427
+ mov.b32 %r69, %f102;
428
+ @%p18 st.shared.b32 [ %r68 + 0 ], %r69;
429
+ add.s32 %r125, %r124, 16;
430
+ add.s32 %r70, %r125, %r123;
431
+ mov.b32 %r71, %f106;
432
+ @%p18 st.shared.b32 [ %r70 + 0 ], %r71;
433
+ add.s32 %r126, %r124, 32;
434
+ add.s32 %r72, %r126, %r123;
435
+ @%p18 st.shared.b32 [ %r72 + 0 ], %r67;
436
+ bar.sync 0;
437
+ setp.lt.s32 %p21, %r1, 4;
438
+ add.s32 %r75, %r124, %r24;
439
+ @%p21 ld.shared.b32 %r74, [ %r75 + 0 ];
440
+ mov.b32 %f107, %r74;
441
+ add.s32 %r77, %r125, %r24;
442
+ @%p21 ld.shared.b32 %r76, [ %r77 + 0 ];
443
+ mov.b32 %f108, %r76;
444
+ add.s32 %r79, %r126, %r24;
445
+ @%p21 ld.shared.b32 %r78, [ %r79 + 0 ];
446
+ mov.b32 %f109, %r78;
447
+ shfl.sync.bfly.b32 %r128, %r74, 1, 31, -1;
448
+ mov.b32 %f110, %r128;
449
+ shfl.sync.bfly.b32 %r129, %r76, 1, 31, -1;
450
+ mov.b32 %f111, %r129;
451
+ shfl.sync.bfly.b32 %r81, %r78, 1, 31, -1;
452
+ mov.b32 %f112, %r81;
453
+ $L__tmp14:
454
+ .loc 2 108 21
455
+ sub.f32 %f113, %f110, %f107;
456
+ .loc 2 109 28
457
+ add.f32 %f114, %f109, %f112;
458
+ .loc 2 110 39
459
+ setp.eq.f32 %p42, %f114, 0f00000000;
460
+ .loc 2 110 60
461
+ mov.b32 %r82, %f114;
462
+ div.full.f32 %r80, %r81, %r82;
463
+ mov.b32 %f115, %r80;
464
+ .loc 2 110 49
465
+ selp.f32 %f116, 0f00000000, %f115, %p42;
466
+ .loc 2 112 17
467
+ fma.rn.f32 %f117, %f113, %f116, %f107;
468
+ .loc 2 113 15
469
+ add.f32 %f118, %f108, %f111;
470
+ .loc 2 113 30
471
+ mul.f32 %f119, %f113, %f113;
472
+ .loc 2 113 38
473
+ mul.f32 %f120, %f109, %f119;
474
+ .loc 2 113 22
475
+ fma.rn.f32 %f121, %f120, %f116, %f118;
476
+ $L__tmp15:
477
+ .loc 2 120 46
478
+ setp.eq.s32 %p43, %r4, 0;
479
+ and.pred %p24, %p21, %p43;
480
+ mov.b32 %r84, %f117;
481
+ @%p24 st.shared.b32 [ %r75 + 0 ], %r84;
482
+ mov.b32 %r86, %f121;
483
+ @%p24 st.shared.b32 [ %r77 + 0 ], %r86;
484
+ @%p24 st.shared.b32 [ %r79 + 0 ], %r82;
485
+ bar.sync 0;
486
+ add.s32 %r130, %r124, %r122;
487
+ ld.shared.f32 %f5, [%r130];
488
+ add.s32 %r131, %r125, %r122;
489
+ ld.shared.f32 %f6, [%r131];
490
+ $L__tmp16:
491
+ .loc 1 59 51
492
+ mov.u32 %r89, 0x0;
493
+ mov.u32 %r90, 0x0;
494
+ mov.u32 %r91, 0x0;
495
+ mov.u32 %r92, 0x0;
496
+ @%p44 ld.global.L1::evict_last.v4.b32 { %r89, %r90, %r91, %r92 }, [ %rd33 + 0 ];
497
+ @!%p44 mov.u32 %r89, %r137;
498
+ @!%p44 mov.u32 %r90, %r137;
499
+ @!%p44 mov.u32 %r91, %r137;
500
+ @!%p44 mov.u32 %r92, %r137;
501
+ .loc 1 60 35
502
+ mul.wide.u32 %rd41, %r7, 4;
503
+ add.s64 %rd34, %rd6, %rd41;
504
+ .loc 1 60 40
505
+ mov.u32 %r97, 0x0;
506
+ mov.u32 %r98, 0x0;
507
+ @%p44 ld.global.L1::evict_last.v2.b32 { %r97, %r98 }, [ %rd34 + 0 ];
508
+ @!%p44 mov.u32 %r97, %r137;
509
+ @!%p44 mov.u32 %r98, %r137;
510
+ .loc 1 64 57
511
+ @%p12 bra $L__BB0_4;
512
+ mov.u64 %rd42, assertMessage_1;
513
+ cvta.global.u64 %rd43, %rd42;
514
+ mov.u64 %rd44, assertFile_1;
515
+ cvta.global.u64 %rd45, %rd44;
516
+ mov.u64 %rd46, assertFunc_1;
517
+ cvta.global.u64 %rd47, %rd46;
518
+ { // callseq 5, 0
519
+ .reg .b32 temp_param_reg;
520
+ .param .b64 param0;
521
+ st.param.b64 [param0+0], %rd43;
522
+ .param .b64 param1;
523
+ st.param.b64 [param1+0], %rd45;
524
+ .param .b32 param2;
525
+ st.param.b32 [param2+0], %r168;
526
+ .param .b64 param3;
527
+ st.param.b64 [param3+0], %rd47;
528
+ .param .b64 param4;
529
+ st.param.b64 [param4+0], %rd52;
530
+ call.uni
531
+ __assertfail,
532
+ (
533
+ param0,
534
+ param1,
535
+ param2,
536
+ param3,
537
+ param4
538
+ );
539
+ } // callseq 5
540
+ $L__BB0_4:
541
+ .loc 1 65 54
542
+ mov.u32 %r133, 0x0;
543
+ mov.u32 %r134, 0x0;
544
+ mov.u32 %r135, 0x0;
545
+ mov.u32 %r136, 0x0;
546
+ @%p44 ld.global.L1::evict_first.v4.b32 { %r133, %r134, %r135, %r136 }, [ %rd49 + 0 ];
547
+ @!%p44 mov.u32 %r133, %r137;
548
+ @!%p44 mov.u32 %r134, %r137;
549
+ @!%p44 mov.u32 %r135, %r137;
550
+ @!%p44 mov.u32 %r136, %r137;
551
+ .loc 1 69 23
552
+ mov.b32 %r142, %f6;
553
+ mov.b32 %r143, 1132462080;
554
+ div.full.f32 %r141, %r142, %r143;
555
+ mov.b32 %f122, %r141;
556
+ .loc 1 71 24
557
+ add.f32 %f123, %f122, 0f3727C5AC;
558
+ .loc 1 72 30
559
+ rsqrt.approx.ftz.f32 %f124, %f123;
560
+ .loc 1 65 54
561
+ mov.b32 %f125, %r136;
562
+ .loc 1 59 51
563
+ mov.b32 %f126, %r92;
564
+ .loc 1 66 24
565
+ add.f32 %f127, %f126, %f125;
566
+ .loc 1 67 24
567
+ sub.f32 %f128, %f127, %f5;
568
+ .loc 1 65 54
569
+ mov.b32 %f129, %r135;
570
+ .loc 1 59 51
571
+ mov.b32 %f130, %r91;
572
+ .loc 1 66 24
573
+ add.f32 %f131, %f130, %f129;
574
+ .loc 1 67 24
575
+ sub.f32 %f132, %f131, %f5;
576
+ .loc 1 65 54
577
+ mov.b32 %f133, %r134;
578
+ .loc 1 59 51
579
+ mov.b32 %f134, %r90;
580
+ .loc 1 66 24
581
+ add.f32 %f135, %f134, %f133;
582
+ .loc 1 67 24
583
+ sub.f32 %f136, %f135, %f5;
584
+ .loc 1 65 54
585
+ mov.b32 %f137, %r133;
586
+ .loc 1 59 51
587
+ mov.b32 %f138, %r89;
588
+ .loc 1 66 24
589
+ add.f32 %f139, %f138, %f137;
590
+ .loc 1 67 24
591
+ sub.f32 %f140, %f139, %f5;
592
+ .loc 1 73 24
593
+ mul.f32 %f141, %f140, %f124;
594
+ mul.f32 %f142, %f136, %f124;
595
+ mul.f32 %f143, %f132, %f124;
596
+ mul.f32 %f144, %f128, %f124;
597
+ .loc 1 74 24
598
+ bar.sync 0;
599
+ shl.b32 %r159, %r7, 2;
600
+ add.s32 %r161, %r124, %r159;
601
+ st.shared.v2.u32 [%r161], {%r97, %r98};
602
+ bar.sync 0;
603
+ shl.b32 %r162, %r6, 2;
604
+ add.s32 %r163, %r124, %r162;
605
+ ld.shared.v4.f32 {%f145, %f146, %f147, %f148}, [%r163];
606
+ mul.f32 %f149, %f141, %f145;
607
+ mul.f32 %f150, %f142, %f146;
608
+ mul.f32 %f151, %f143, %f147;
609
+ mul.f32 %f152, %f144, %f148;
610
+ .loc 1 76 39
611
+ shl.b32 %r164, %r8, 8;
612
+ .loc 1 76 35
613
+ or.b32 %r165, %r164, %r6;
614
+ .loc 1 76 29
615
+ mul.wide.s32 %rd51, %r165, 2;
616
+ add.s64 %rd50, %rd7, %rd51;
617
+ .loc 1 76 52
618
+ mov.b32 %r153, %f149;
619
+ cvt.rn.bf16.f32 %rs1, %r153;
620
+ mov.b32 %r154, %f150;
621
+ cvt.rn.bf16.f32 %rs2, %r154;
622
+ mov.b32 %r155, %f151;
623
+ cvt.rn.bf16.f32 %rs3, %r155;
624
+ mov.b32 %r156, %f152;
625
+ cvt.rn.bf16.f32 %rs4, %r156;
626
+ mov.b32 %r166, {%rs1, %rs2};
627
+ mov.b32 %r167, {%rs3, %rs4};
628
+ @%p44 st.global.v2.b32 [ %rd50 + 0 ], { %r166, %r167 };
629
+ .loc 1 55 4
630
+ ret;
631
+ $L__tmp17:
632
+ $L__func_end0:
633
+
634
+ }
635
+ // .globl __nv_rsqrtf
636
+ .visible .func (.param .b32 func_retval0) __nv_rsqrtf( // device function: one 32-bit return value (func_retval0), computes an approximate reciprocal square root of its argument
637
+ .param .b32 __nv_rsqrtf_param_0 // the single 32-bit input; read below as an f32
638
+ )
639
+ {
640
+ .reg .f32 %f<3>; // declares f32 registers; only %f1 (input) and %f2 (result) are used
641
+ $L__func_begin1:
642
+
643
+ ld.param.f32 %f1, [__nv_rsqrtf_param_0]; // %f1 = argument
644
+ rsqrt.approx.ftz.f32 %f2, %f1; // %f2 ~= 1/sqrt(%f1); .approx = approximate result, .ftz = subnormals flushed to zero
645
+ st.param.f32 [func_retval0+0], %f2; // write the result into the return slot
646
+ ret;
647
+ $L__func_end1:
648
+
649
+ }
650
+ .file 1 "/tmp/torchinductor_root/gx/cgx5lxpuexpindj4dsmjz5x42uhyy7iskevq7ovzpwagb3t5powj.py"
651
+ .file 2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
652
+ .section .debug_abbrev
653
+ {
654
+ .b8 1
655
+ .b8 17
656
+ .b8 1
657
+ .b8 37
658
+ .b8 8
659
+ .b8 19
660
+ .b8 5
661
+ .b8 3
662
+ .b8 8
663
+ .b8 16
664
+ .b8 6
665
+ .b8 27
666
+ .b8 8
667
+ .b8 180
668
+ .b8 66
669
+ .b8 12
670
+ .b8 17
671
+ .b8 1
672
+ .b8 18
673
+ .b8 1
674
+ .b8 0
675
+ .b8 0
676
+ .b8 2
677
+ .b8 46
678
+ .b8 0
679
+ .b8 135
680
+ .b8 64
681
+ .b8 8
682
+ .b8 3
683
+ .b8 8
684
+ .b8 58
685
+ .b8 11
686
+ .b8 59
687
+ .b8 11
688
+ .b8 63
689
+ .b8 12
690
+ .b8 32
691
+ .b8 11
692
+ .b8 0
693
+ .b8 0
694
+ .b8 3
695
+ .b8 46
696
+ .b8 1
697
+ .b8 17
698
+ .b8 1
699
+ .b8 18
700
+ .b8 1
701
+ .b8 64
702
+ .b8 10
703
+ .b8 49
704
+ .b8 19
705
+ .b8 0
706
+ .b8 0
707
+ .b8 4
708
+ .b8 29
709
+ .b8 0
710
+ .b8 49
711
+ .b8 19
712
+ .b8 17
713
+ .b8 1
714
+ .b8 18
715
+ .b8 1
716
+ .b8 88
717
+ .b8 11
718
+ .b8 89
719
+ .b8 11
720
+ .b8 87
721
+ .b8 11
722
+ .b8 0
723
+ .b8 0
724
+ .b8 5
725
+ .b8 29
726
+ .b8 1
727
+ .b8 49
728
+ .b8 19
729
+ .b8 17
730
+ .b8 1
731
+ .b8 18
732
+ .b8 1
733
+ .b8 88
734
+ .b8 11
735
+ .b8 89
736
+ .b8 11
737
+ .b8 87
738
+ .b8 11
739
+ .b8 0
740
+ .b8 0
741
+ .b8 0
742
+ }
743
+ .section .debug_info
744
+ {
745
+ .b32 298
746
+ .b8 2
747
+ .b8 0
748
+ .b32 .debug_abbrev
749
+ .b8 8
750
+ .b8 1
751
+ .b8 116
752
+ .b8 114
753
+ .b8 105
754
+ .b8 116
755
+ .b8 111
756
+ .b8 110
757
+ .b8 0
758
+ .b8 2
759
+ .b8 0
760
+ .b8 99
761
+ .b8 103
762
+ .b8 120
763
+ .b8 53
764
+ .b8 108
765
+ .b8 120
766
+ .b8 112
767
+ .b8 117
768
+ .b8 101
769
+ .b8 120
770
+ .b8 112
771
+ .b8 105
772
+ .b8 110
773
+ .b8 100
774
+ .b8 106
775
+ .b8 52
776
+ .b8 100
777
+ .b8 115
778
+ .b8 109
779
+ .b8 106
780
+ .b8 122
781
+ .b8 53
782
+ .b8 120
783
+ .b8 52
784
+ .b8 50
785
+ .b8 117
786
+ .b8 104
787
+ .b8 121
788
+ .b8 121
789
+ .b8 55
790
+ .b8 105
791
+ .b8 115
792
+ .b8 107
793
+ .b8 101
794
+ .b8 118
795
+ .b8 113
796
+ .b8 55
797
+ .b8 111
798
+ .b8 118
799
+ .b8 122
800
+ .b8 112
801
+ .b8 119
802
+ .b8 97
803
+ .b8 103
804
+ .b8 98
805
+ .b8 51
806
+ .b8 116
807
+ .b8 53
808
+ .b8 112
809
+ .b8 111
810
+ .b8 119
811
+ .b8 106
812
+ .b8 46
813
+ .b8 112
814
+ .b8 121
815
+ .b8 0
816
+ .b32 .debug_line
817
+ .b8 47
818
+ .b8 116
819
+ .b8 109
820
+ .b8 112
821
+ .b8 47
822
+ .b8 116
823
+ .b8 111
824
+ .b8 114
825
+ .b8 99
826
+ .b8 104
827
+ .b8 105
828
+ .b8 110
829
+ .b8 100
830
+ .b8 117
831
+ .b8 99
832
+ .b8 116
833
+ .b8 111
834
+ .b8 114
835
+ .b8 95
836
+ .b8 114
837
+ .b8 111
838
+ .b8 111
839
+ .b8 116
840
+ .b8 47
841
+ .b8 103
842
+ .b8 120
843
+ .b8 0
844
+ .b8 1
845
+ .b64 $L__func_begin0
846
+ .b64 $L__func_end0
847
+ .b8 2
848
+ .b8 116
849
+ .b8 114
850
+ .b8 105
851
+ .b8 116
852
+ .b8 111
853
+ .b8 110
854
+ .b8 95
855
+ .b8 95
856
+ .b8 48
857
+ .b8 100
858
+ .b8 49
859
+ .b8 100
860
+ .b8 50
861
+ .b8 100
862
+ .b8 51
863
+ .b8 100
864
+ .b8 52
865
+ .b8 100
866
+ .b8 53
867
+ .b8 100
868
+ .b8 101
869
+ .b8 54
870
+ .b8 100
871
+ .b8 101
872
+ .b8 0
873
+ .b8 116
874
+ .b8 114
875
+ .b8 105
876
+ .b8 116
877
+ .b8 111
878
+ .b8 110
879
+ .b8 95
880
+ .b8 95
881
+ .b8 48
882
+ .b8 100
883
+ .b8 49
884
+ .b8 100
885
+ .b8 50
886
+ .b8 100
887
+ .b8 51
888
+ .b8 100
889
+ .b8 52
890
+ .b8 100
891
+ .b8 53
892
+ .b8 100
893
+ .b8 101
894
+ .b8 54
895
+ .b8 100
896
+ .b8 101
897
+ .b8 0
898
+ .b8 1
899
+ .b8 18
900
+ .b8 1
901
+ .b8 1
902
+ .b8 3
903
+ .b64 $L__func_begin0
904
+ .b64 $L__func_end0
905
+ .b8 1
906
+ .b8 156
907
+ .b32 125
908
+ .b8 4
909
+ .b32 125
910
+ .b64 $L__tmp1
911
+ .b64 $L__tmp2
912
+ .b8 2
913
+ .b8 44
914
+ .b8 38
915
+ .b8 5
916
+ .b32 125
917
+ .b64 $L__tmp2
918
+ .b64 $L__tmp15
919
+ .b8 2
920
+ .b8 50
921
+ .b8 41
922
+ .b8 4
923
+ .b32 125
924
+ .b64 $L__tmp2
925
+ .b64 $L__tmp15
926
+ .b8 2
927
+ .b8 120
928
+ .b8 46
929
+ .b8 0
930
+ .b8 4
931
+ .b32 125
932
+ .b64 $L__tmp3
933
+ .b64 $L__tmp16
934
+ .b8 2
935
+ .b8 50
936
+ .b8 41
937
+ .b8 0
938
+ .b8 0
939
+ }
940
+ .section .debug_pubnames
941
+ {
942
+ .b32 $L__pubNames_end0-$L__pubNames_start0
943
+ $L__pubNames_start0:
944
+ .b8 2
945
+ .b8 0
946
+ .b32 .debug_info
947
+ .b32 302
948
+ .b32 125
949
+ .b8 116
950
+ .b8 114
951
+ .b8 105
952
+ .b8 116
953
+ .b8 111
954
+ .b8 110
955
+ .b8 95
956
+ .b8 95
957
+ .b8 48
958
+ .b8 100
959
+ .b8 49
960
+ .b8 100
961
+ .b8 50
962
+ .b8 100
963
+ .b8 51
964
+ .b8 100
965
+ .b8 52
966
+ .b8 100
967
+ .b8 53
968
+ .b8 100
969
+ .b8 101
970
+ .b8 54
971
+ .b8 100
972
+ .b8 101
973
+ .b8 0
974
+ .b32 0
975
+ $L__pubNames_end0:
976
+ }
977
+ .section .debug_pubtypes
978
+ {
979
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
980
+ $L__pubTypes_start0:
981
+ .b8 2
982
+ .b8 0
983
+ .b32 .debug_info
984
+ .b32 302
985
+ .b32 0
986
+ $L__pubTypes_end0:
987
+ }
988
+ .section .debug_loc { }
.triton/dump/1ed98b0d136db679153ca6a42fff755c/triton_.ttir ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module { // Triton TTIR dump containing a single kernel. NOTE(review): the arithmetic looks like a fused table lookup + add + mean/variance normalisation scaled by a per-column weight — confirm against the generating Python source
2
+ tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { // arg0: i64 indices, arg1/arg2/arg3: f32 inputs, arg4: bf16 output; arg5/arg6 are not referenced in this body
3
+ %cst = arith.constant dense<1.000000e+00> : tensor<1x256xf32>
4
+ %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x256xf32>
5
+ %cst_1 = arith.constant 0.000000e+00 : f32
6
+ %cst_2 = arith.constant dense<256> : tensor<2x1xi64> // row stride (in elements) used when indexing arg1
7
+ %cst_3 = arith.constant dense<50257> : tensor<2x1xi64> // exclusive upper bound for the indices loaded from arg0
8
+ %cst_4 = arith.constant dense<0> : tensor<2x1xi64>
9
+ %cst_5 = arith.constant dense<9.99999974E-6> : tensor<2x1xf32> // small constant added before the rsqrt below
10
+ %cst_6 = arith.constant dense<2.560000e+02> : tensor<2x1xf32> // divisor applied to the second reduction result (equals the column count, 256)
11
+ %cst_7 = arith.constant dense<0.000000e+00> : tensor<2x256xf32>
12
+ %cst_8 = arith.constant dense<256> : tensor<2x1xi32>
13
+ %cst_9 = arith.constant dense<256> : tensor<1x256xi32>
14
+ %cst_10 = arith.constant dense<512> : tensor<2x1xi32>
15
+ %c2_i32 = arith.constant 2 : i32
16
+ %0 = tt.get_program_id x : i32
17
+ %1 = arith.muli %0, %c2_i32 : i32 // first row handled by this program: program_id * 2
18
+ %2 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32>
19
+ %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<2xi32>) -> tensor<2x1xi32>
20
+ %4 = tt.splat %1 : (i32) -> tensor<2x1xi32>
21
+ %5 = arith.addi %4, %3 : tensor<2x1xi32> // %5 = the two row indices of this program, shape 2x1
22
+ %6 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
23
+ %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<256xi32>) -> tensor<1x256xi32> // %7 = column indices 0..255, shape 1x256
24
+ %8 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<2x1x!tt.ptr<i64, 1>>
25
+ %9 = tt.addptr %8, %5 : tensor<2x1x!tt.ptr<i64, 1>>, tensor<2x1xi32>
26
+ %10 = tt.load %9 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x1xi64> // %10 = arg0[row], one i64 index per row (unmasked load)
27
+ %11 = arith.remsi %5, %cst_10 : tensor<2x1xi32> // row % 512
28
+ %12 = arith.cmpi slt, %7, %cst_9 : tensor<1x256xi32> // column mask: col < 256
29
+ %13 = arith.muli %11, %cst_8 : tensor<2x1xi32>
30
+ %14 = tt.broadcast %7 : (tensor<1x256xi32>) -> tensor<2x256xi32>
31
+ %15 = tt.broadcast %13 : (tensor<2x1xi32>) -> tensor<2x256xi32>
32
+ %16 = arith.addi %14, %15 : tensor<2x256xi32> // offset into arg2: col + (row % 512) * 256
33
+ %17 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<2x256x!tt.ptr<f32, 1>>
34
+ %18 = tt.addptr %17, %16 : tensor<2x256x!tt.ptr<f32, 1>>, tensor<2x256xi32>
35
+ %19 = tt.broadcast %12 : (tensor<1x256xi1>) -> tensor<2x256xi1> // column mask broadcast to 2x256
36
+ %20 = tt.load %18, %19, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xf32> // masked load from arg2, 0.0 where masked off
37
+ %21 = arith.addi %10, %cst_3 : tensor<2x1xi64>
38
+ %22 = arith.cmpi slt, %10, %cst_4 : tensor<2x1xi64>
39
+ %23 = arith.select %22, %21, %10 : tensor<2x1xi1>, tensor<2x1xi64> // wrap negative indices: idx < 0 ? idx + 50257 : idx
40
+ %24 = arith.cmpi sge, %23, %cst_4 : tensor<2x1xi64>
41
+ %25 = arith.cmpi slt, %23, %cst_3 : tensor<2x1xi64>
42
+ %26 = arith.andi %24, %25 : tensor<2x1xi1> // bounds check: 0 <= wrapped idx < 50257
43
+ tt.assert %26, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<2x1xi1> // device-side assert on the bounds check before arg1 is indexed
44
+ %27 = arith.muli %23, %cst_2 : tensor<2x1xi64>
45
+ %28 = tt.broadcast %27 : (tensor<2x1xi64>) -> tensor<2x256xi64>
46
+ %29 = arith.extsi %7 : tensor<1x256xi32> to tensor<1x256xi64>
47
+ %30 = tt.broadcast %29 : (tensor<1x256xi64>) -> tensor<2x256xi64>
48
+ %31 = arith.addi %30, %28 : tensor<2x256xi64> // offset into arg1: col + wrapped idx * 256 (64-bit arithmetic)
49
+ %32 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<2x256x!tt.ptr<f32, 1>>
50
+ %33 = tt.addptr %32, %31 : tensor<2x256x!tt.ptr<f32, 1>>, tensor<2x256xi64>
51
+ %34 = tt.load %33, %19, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xf32> // masked load of the selected arg1 row
52
+ %35 = arith.addf %34, %20 : tensor<2x256xf32> // x = arg1 value + arg2 value
53
+ %36 = arith.addf %35, %cst_7 : tensor<2x256xf32> // x + 0.0; fed to the reduction as its first operand
54
+ %37 = arith.subf %35, %36 : tensor<2x256xf32>
55
+ %38 = arith.mulf %35, %37 : tensor<2x256xf32>
56
+ %39 = arith.addf %38, %cst_7 : tensor<2x256xf32> // x * (x - (x + 0)) + 0; presumably a running-statistics update against zero-initialised accumulators — verify against the generating source
57
+ %40 = arith.select %19, %36, %cst_7 : tensor<2x256xi1>, tensor<2x256xf32> // reduction operand #0, zeroed where masked off
58
+ %41 = arith.select %19, %39, %cst_7 : tensor<2x256xi1>, tensor<2x256xf32> // reduction operand #1, zeroed where masked off
59
+ %42 = arith.select %12, %cst, %cst_0 : tensor<1x256xi1>, tensor<1x256xf32> // per-column weight: 1.0 where the mask holds, else 0.0
60
+ %43 = tt.broadcast %42 : (tensor<1x256xf32>) -> tensor<2x256xf32> // reduction operand #2
61
+ %44:3 = "tt.reduce"(%40, %41, %43) <{axis = 1 : i32}> ({ // three-value reduction along the column axis; the combine has the shape of a pairwise (mean, M2, weight) merge
62
+ ^bb0(%arg7: f32, %arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32): // (arg7, arg8, arg9) = first triple, (arg10, arg11, arg12) = second triple
63
+ %68 = arith.subf %arg10, %arg7 : f32 // delta = second.#0 - first.#0
64
+ %69 = arith.addf %arg9, %arg12 : f32 // w = first.#2 + second.#2
65
+ %70 = arith.cmpf oeq, %69, %cst_1 : f32
66
+ %71 = arith.divf %arg12, %69 : f32
67
+ %72 = arith.select %70, %cst_1, %71 : f32 // ratio = (w == 0) ? 0 : second.#2 / w — the select discards the division result when w is zero
68
+ %73 = arith.mulf %68, %72 : f32
69
+ %74 = arith.addf %arg7, %73 : f32 // new #0 = first.#0 + delta * ratio
70
+ %75 = arith.addf %arg8, %arg11 : f32
71
+ %76 = arith.mulf %68, %68 : f32
72
+ %77 = arith.mulf %76, %arg9 : f32
73
+ %78 = arith.mulf %77, %72 : f32
74
+ %79 = arith.addf %75, %78 : f32 // new #1 = first.#1 + second.#1 + delta^2 * first.#2 * ratio
75
+ tt.reduce.return %74, %79, %69 : f32, f32, f32
76
+ }) : (tensor<2x256xf32>, tensor<2x256xf32>, tensor<2x256xf32>) -> (tensor<2xf32>, tensor<2xf32>, tensor<2xf32>)
77
+ %45 = tt.expand_dims %44#0 {axis = 1 : i32} : (tensor<2xf32>) -> tensor<2x1xf32> // per-row reduction result #0, shape 2x1
78
+ %46 = tt.expand_dims %44#1 {axis = 1 : i32} : (tensor<2xf32>) -> tensor<2x1xf32> // per-row reduction result #1, shape 2x1 (result #2 is unused)
79
+ %47 = tt.load %18, %19, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xf32> // second load of the same arg2 addresses as %20
80
+ %48 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<1x256x!tt.ptr<f32, 1>>
81
+ %49 = tt.addptr %48, %7 : tensor<1x256x!tt.ptr<f32, 1>>, tensor<1x256xi32>
82
+ %50 = tt.load %49, %12, %cst_0 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32> // arg3[col]: per-column multiplier applied at the end
83
+ tt.assert %26, "index out of bounds: 0 <= tmp13 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<2x1xi1> // same bounds predicate as the first assert, re-checked before the second arg1 load
84
+ %51 = tt.load %33, %19, %cst_7 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<2x256xf32> // second load of the same arg1 addresses as %34, with a different eviction hint (evict = 2)
85
+ %52 = arith.addf %51, %47 : tensor<2x256xf32> // x recomputed from the reloaded values
86
+ %53 = tt.broadcast %45 : (tensor<2x1xf32>) -> tensor<2x256xf32>
87
+ %54 = arith.subf %52, %53 : tensor<2x256xf32> // x - reduction result #0
88
+ %55 = arith.divf %46, %cst_6 : tensor<2x1xf32> // reduction result #1 / 256
89
+ %56 = arith.addf %55, %cst_5 : tensor<2x1xf32> // ... + 9.99999974E-6
90
+ %57 = tt.extern_elementwise %56 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32>) -> tensor<2x1xf32> // calls the external libdevice symbol __nv_rsqrtf elementwise
91
+ %58 = tt.broadcast %57 : (tensor<2x1xf32>) -> tensor<2x256xf32>
92
+ %59 = arith.mulf %54, %58 : tensor<2x256xf32> // (x - result #0) * __nv_rsqrtf(result #1 / 256 + 9.99999974E-6)
93
+ %60 = tt.broadcast %50 : (tensor<1x256xf32>) -> tensor<2x256xf32>
94
+ %61 = arith.mulf %59, %60 : tensor<2x256xf32> // scale by the per-column arg3 value
95
+ %62 = arith.muli %5, %cst_8 : tensor<2x1xi32>
96
+ %63 = tt.broadcast %62 : (tensor<2x1xi32>) -> tensor<2x256xi32>
97
+ %64 = arith.addi %14, %63 : tensor<2x256xi32> // output offset: col + row * 256
98
+ %65 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<2x256x!tt.ptr<bf16, 1>>
99
+ %66 = tt.addptr %65, %64 : tensor<2x256x!tt.ptr<bf16, 1>>, tensor<2x256xi32>
100
+ %67 = arith.truncf %61 : tensor<2x256xf32> to tensor<2x256xbf16> // narrow f32 -> bf16 for the store
101
+ tt.store %66, %67, %19 {cache = 1 : i32, evict = 1 : i32} : tensor<2x256xbf16> // masked store into arg4
102
+ tt.return
103
+ }
104
+ }
.triton/dump/305a9479aab997a3a16bfe46bb303a50/triton_.llir ADDED
@@ -0,0 +1,524 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @assertFunc_1 = internal constant [25 x i8] c"_call_with_frames_removed"
5
+ @assertFile_1 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
6
+ @assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp16 < 50257"
7
+ @assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed"
8
+ @assertFile_0 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
9
+ @assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257"
10
+ @global_smem = external addrspace(3) global [0 x i8]
11
+ @.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
12
+
13
+ declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr
14
+
15
+ define void @triton__0d1d2d3d4d5d6de7de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, i32 %6, i32 %7) local_unnamed_addr !dbg !7 {
16
+ %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
17
+ %10 = and i32 %9, 31, !dbg !10
18
+ %11 = lshr i32 %9, 5, !dbg !10
19
+ %12 = and i32 %11, 1, !dbg !10
20
+ %urem = shl i32 %9, 2, !dbg !10
21
+ %13 = and i32 %urem, 252, !dbg !10
22
+ %14 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !11
23
+ %15 = sext i32 %14 to i64, !dbg !12
24
+ %16 = getelementptr i64, ptr addrspace(1) %0, i64 %15, !dbg !12
25
+ %17 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %16, i1 true) #6, !dbg !13
26
+ %18 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %16, i1 true) #6, !dbg !13
27
+ %19 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %16, i1 true) #6, !dbg !13
28
+ %20 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %16, i1 true) #6, !dbg !13
29
+ %21 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %16, i1 true) #6, !dbg !13
30
+ %22 = srem i32 %14, 512, !dbg !14
31
+ %23 = shl nsw i32 %22, 8, !dbg !15
32
+ %24 = or i32 %23, %13, !dbg !16
33
+ %25 = sext i32 %24 to i64, !dbg !17
34
+ %26 = getelementptr float, ptr addrspace(1) %2, i64 %25, !dbg !17
35
+ %27 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %26, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !18
36
+ %28 = extractvalue { i32, i32, i32, i32 } %27, 0, !dbg !18
37
+ %29 = extractvalue { i32, i32, i32, i32 } %27, 1, !dbg !18
38
+ %30 = extractvalue { i32, i32, i32, i32 } %27, 2, !dbg !18
39
+ %31 = extractvalue { i32, i32, i32, i32 } %27, 3, !dbg !18
40
+ %32 = insertelement <2 x i32> poison, i32 %29, i64 0, !dbg !18
41
+ %33 = insertelement <2 x i32> %32, i32 %28, i64 1, !dbg !18
42
+ %34 = bitcast <2 x i32> %33 to <2 x float>, !dbg !18
43
+ %35 = bitcast i32 %30 to float, !dbg !18
44
+ %36 = bitcast i32 %31 to float, !dbg !18
45
+ %37 = shl i32 %14, 8, !dbg !19
46
+ %38 = or i32 %37, %13, !dbg !20
47
+ %39 = sext i32 %38 to i64, !dbg !21
48
+ %40 = getelementptr i16, ptr addrspace(1) %3, i64 %39, !dbg !21
49
+ %41 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_last.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %40, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !22
50
+ %42 = extractvalue { i32, i32 } %41, 0, !dbg !22
51
+ %43 = extractvalue { i32, i32 } %41, 1, !dbg !22
52
+ %44 = trunc i32 %42 to i16, !dbg !22
53
+ %extelt.offset = lshr i32 %42, 16, !dbg !22
54
+ %45 = trunc i32 %extelt.offset to i16, !dbg !22
55
+ %46 = trunc i32 %43 to i16, !dbg !22
56
+ %extelt.offset1 = lshr i32 %43, 16, !dbg !22
57
+ %47 = trunc i32 %extelt.offset1 to i16, !dbg !22
58
+ %48 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %44) #6, !dbg !23
59
+ %49 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %45) #6, !dbg !23
60
+ %50 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %46) #6, !dbg !23
61
+ %51 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %47) #6, !dbg !23
62
+ %52 = add i64 %21, 50257, !dbg !24
63
+ %53 = icmp slt i64 %17, 0, !dbg !25
64
+ %54 = icmp slt i64 %21, 0, !dbg !25
65
+ %55 = select i1 %54, i64 %52, i64 %21, !dbg !26
66
+ %56 = icmp ugt i64 %55, 50256, !dbg !27
67
+ br i1 %56, label %57, label %58, !dbg !28
68
+
69
+ 57: ; preds = %8
70
+ tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !28
71
+ br label %58, !dbg !28
72
+
73
+ 58: ; preds = %57, %8
74
+ %59 = shl i64 %17, 8, !dbg !29
75
+ %60 = add i64 %59, 12865792, !dbg !29
76
+ %61 = select i1 %53, i64 %60, i64 %59, !dbg !29
77
+ %62 = zext nneg i32 %13 to i64
78
+ %63 = or i64 %61, %62, !dbg !30
79
+ %64 = getelementptr float, ptr addrspace(1) %1, i64 %63, !dbg !31
80
+ %65 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %64, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !32
81
+ %66 = extractvalue { i32, i32, i32, i32 } %65, 0, !dbg !32
82
+ %67 = extractvalue { i32, i32, i32, i32 } %65, 1, !dbg !32
83
+ %68 = extractvalue { i32, i32, i32, i32 } %65, 2, !dbg !32
84
+ %69 = extractvalue { i32, i32, i32, i32 } %65, 3, !dbg !32
85
+ %70 = bitcast i32 %68 to float, !dbg !32
86
+ %71 = bitcast i32 %69 to float, !dbg !32
87
+ %72 = fadd float %35, %70, !dbg !33
88
+ %73 = fadd float %36, %71, !dbg !33
89
+ %74 = fadd float %50, %72, !dbg !34
90
+ %75 = fadd float %51, %73, !dbg !34
91
+ %76 = insertelement <2 x i32> poison, i32 %67, i64 0, !dbg !32
92
+ %77 = insertelement <2 x i32> %76, i32 %66, i64 1, !dbg !32
93
+ %78 = bitcast <2 x i32> %77 to <2 x float>, !dbg !32
94
+ %79 = fadd <2 x float> %34, %78, !dbg !33
95
+ %80 = insertelement <2 x float> poison, float %49, i64 0, !dbg !34
96
+ %81 = insertelement <2 x float> %80, float %48, i64 1, !dbg !34
97
+ %82 = fadd <2 x float> %81, %79, !dbg !34
98
+ %83 = fadd <2 x float> %82, zeroinitializer, !dbg !35
99
+ %84 = fadd float %74, 0.000000e+00, !dbg !35
100
+ %85 = fadd float %75, 0.000000e+00, !dbg !35
101
+ %86 = extractelement <2 x float> %83, i64 1, !dbg !39
102
+ %87 = extractelement <2 x float> %82, i64 1, !dbg !43
103
+ %88 = fsub float %87, %86, !dbg !44
104
+ %89 = extractelement <2 x float> %83, i64 0, !dbg !39
105
+ %90 = extractelement <2 x float> %82, i64 0, !dbg !43
106
+ %91 = fsub float %90, %89, !dbg !44
107
+ %92 = fsub float %74, %84, !dbg !44
108
+ %93 = fsub float %75, %85, !dbg !44
109
+ %94 = fmul float %87, %88, !dbg !43
110
+ %95 = fmul float %90, %91, !dbg !43
111
+ %96 = fmul float %74, %92, !dbg !43
112
+ %97 = fmul float %75, %93, !dbg !43
113
+ %98 = fadd float %94, 0.000000e+00, !dbg !45
114
+ %99 = fadd float %95, 0.000000e+00, !dbg !45
115
+ %100 = fadd float %96, 0.000000e+00, !dbg !45
116
+ %101 = fadd float %97, 0.000000e+00, !dbg !45
117
+ %102 = fsub float %89, %86, !dbg !39
118
+ %103 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 2.000000e+00) #6, !dbg !46
119
+ %104 = fmul float %103, %102, !dbg !47
120
+ %105 = fadd float %86, %104, !dbg !48
121
+ %106 = fadd float %98, %99, !dbg !49
122
+ %107 = fmul float %102, %102, !dbg !50
123
+ %108 = fmul float %103, %107, !dbg !51
124
+ %109 = fadd float %108, %106, !dbg !52
125
+ %110 = fsub float %84, %105, !dbg !39
126
+ %111 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 3.000000e+00) #6, !dbg !46
127
+ %112 = fmul float %111, %110, !dbg !47
128
+ %113 = fadd float %105, %112, !dbg !48
129
+ %114 = fadd float %100, %109, !dbg !49
130
+ %115 = fmul float %110, %110, !dbg !50
131
+ %116 = fmul float %115, 2.000000e+00, !dbg !53
132
+ %117 = fmul float %111, %116, !dbg !51
133
+ %118 = fadd float %114, %117, !dbg !52
134
+ %119 = fsub float %85, %113, !dbg !39
135
+ %120 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 4.000000e+00) #6, !dbg !46
136
+ %121 = fmul float %120, %119, !dbg !47
137
+ %122 = fadd float %113, %121, !dbg !48
138
+ %123 = fadd float %101, %118, !dbg !49
139
+ %124 = fmul float %119, %119, !dbg !50
140
+ %125 = fmul float %124, 3.000000e+00, !dbg !53
141
+ %126 = fmul float %120, %125, !dbg !51
142
+ %127 = fadd float %123, %126, !dbg !52
143
+ %128 = bitcast float %122 to i32, !dbg !54
144
+ %129 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %128, i32 16, i32 31), !dbg !54
145
+ %130 = bitcast i32 %129 to float, !dbg !54
146
+ %131 = bitcast float %127 to i32, !dbg !54
147
+ %132 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %131, i32 16, i32 31), !dbg !54
148
+ %133 = bitcast i32 %132 to float, !dbg !54
149
+ %134 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 1082130432, i32 16, i32 31), !dbg !54
150
+ %135 = bitcast i32 %134 to float, !dbg !54
151
+ %136 = fsub float %130, %122, !dbg !39
152
+ %137 = fadd float %135, 4.000000e+00, !dbg !56
153
+ %138 = fcmp oeq float %137, 0.000000e+00, !dbg !57
154
+ %139 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %135, float %137) #6, !dbg !46
155
+ %140 = select i1 %138, float 0.000000e+00, float %139, !dbg !58
156
+ %141 = fmul float %140, %136, !dbg !47
157
+ %142 = fadd float %122, %141, !dbg !48
158
+ %143 = fadd float %127, %133, !dbg !49
159
+ %144 = fmul float %136, %136, !dbg !50
160
+ %145 = fmul float %144, 4.000000e+00, !dbg !53
161
+ %146 = fmul float %140, %145, !dbg !51
162
+ %147 = fadd float %143, %146, !dbg !52
163
+ %148 = bitcast float %142 to i32, !dbg !54
164
+ %149 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %148, i32 8, i32 31), !dbg !54
165
+ %150 = bitcast i32 %149 to float, !dbg !54
166
+ %151 = bitcast float %147 to i32, !dbg !54
167
+ %152 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %151, i32 8, i32 31), !dbg !54
168
+ %153 = bitcast i32 %152 to float, !dbg !54
169
+ %154 = bitcast float %137 to i32, !dbg !54
170
+ %155 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %154, i32 8, i32 31), !dbg !54
171
+ %156 = bitcast i32 %155 to float, !dbg !54
172
+ %157 = fsub float %150, %142, !dbg !39
173
+ %158 = fadd float %137, %156, !dbg !56
174
+ %159 = fcmp oeq float %158, 0.000000e+00, !dbg !57
175
+ %160 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %156, float %158) #6, !dbg !46
176
+ %161 = select i1 %159, float 0.000000e+00, float %160, !dbg !58
177
+ %162 = fmul float %161, %157, !dbg !47
178
+ %163 = fadd float %142, %162, !dbg !48
179
+ %164 = fadd float %147, %153, !dbg !49
180
+ %165 = fmul float %157, %157, !dbg !50
181
+ %166 = fmul float %137, %165, !dbg !53
182
+ %167 = fmul float %161, %166, !dbg !51
183
+ %168 = fadd float %164, %167, !dbg !52
184
+ %169 = bitcast float %163 to i32, !dbg !54
185
+ %170 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %169, i32 4, i32 31), !dbg !54
186
+ %171 = bitcast i32 %170 to float, !dbg !54
187
+ %172 = bitcast float %168 to i32, !dbg !54
188
+ %173 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %172, i32 4, i32 31), !dbg !54
189
+ %174 = bitcast i32 %173 to float, !dbg !54
190
+ %175 = bitcast float %158 to i32, !dbg !54
191
+ %176 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %175, i32 4, i32 31), !dbg !54
192
+ %177 = bitcast i32 %176 to float, !dbg !54
193
+ %178 = fsub float %171, %163, !dbg !39
194
+ %179 = fadd float %158, %177, !dbg !56
195
+ %180 = fcmp oeq float %179, 0.000000e+00, !dbg !57
196
+ %181 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %177, float %179) #6, !dbg !46
197
+ %182 = select i1 %180, float 0.000000e+00, float %181, !dbg !58
198
+ %183 = fmul float %182, %178, !dbg !47
199
+ %184 = fadd float %163, %183, !dbg !48
200
+ %185 = fadd float %168, %174, !dbg !49
201
+ %186 = fmul float %178, %178, !dbg !50
202
+ %187 = fmul float %158, %186, !dbg !53
203
+ %188 = fmul float %182, %187, !dbg !51
204
+ %189 = fadd float %185, %188, !dbg !52
205
+ %190 = bitcast float %184 to i32, !dbg !54
206
+ %191 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %190, i32 2, i32 31), !dbg !54
207
+ %192 = bitcast i32 %191 to float, !dbg !54
208
+ %193 = bitcast float %189 to i32, !dbg !54
209
+ %194 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %193, i32 2, i32 31), !dbg !54
210
+ %195 = bitcast i32 %194 to float, !dbg !54
211
+ %196 = bitcast float %179 to i32, !dbg !54
212
+ %197 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %196, i32 2, i32 31), !dbg !54
213
+ %198 = bitcast i32 %197 to float, !dbg !54
214
+ %199 = fsub float %192, %184, !dbg !39
215
+ %200 = fadd float %179, %198, !dbg !56
216
+ %201 = fcmp oeq float %200, 0.000000e+00, !dbg !57
217
+ %202 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %198, float %200) #6, !dbg !46
218
+ %203 = select i1 %201, float 0.000000e+00, float %202, !dbg !58
219
+ %204 = fmul float %203, %199, !dbg !47
220
+ %205 = fadd float %184, %204, !dbg !48
221
+ %206 = fadd float %189, %195, !dbg !49
222
+ %207 = fmul float %199, %199, !dbg !50
223
+ %208 = fmul float %179, %207, !dbg !53
224
+ %209 = fmul float %203, %208, !dbg !51
225
+ %210 = fadd float %206, %209, !dbg !52
226
+ %211 = bitcast float %205 to i32, !dbg !54
227
+ %212 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %211, i32 1, i32 31), !dbg !54
228
+ %213 = bitcast i32 %212 to float, !dbg !54
229
+ %214 = bitcast float %210 to i32, !dbg !54
230
+ %215 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %214, i32 1, i32 31), !dbg !54
231
+ %216 = bitcast i32 %215 to float, !dbg !54
232
+ %217 = bitcast float %200 to i32, !dbg !54
233
+ %218 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %217, i32 1, i32 31), !dbg !54
234
+ %219 = bitcast i32 %218 to float, !dbg !54
235
+ %220 = fsub float %213, %205, !dbg !39
236
+ %221 = fadd float %200, %219, !dbg !56
237
+ %222 = fcmp oeq float %221, 0.000000e+00, !dbg !57
238
+ %223 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %219, float %221) #6, !dbg !46
239
+ %224 = select i1 %222, float 0.000000e+00, float %223, !dbg !58
240
+ %225 = fmul float %224, %220, !dbg !47
241
+ %226 = fadd float %205, %225, !dbg !48
242
+ %227 = fadd float %210, %216, !dbg !49
243
+ %228 = fmul float %220, %220, !dbg !50
244
+ %229 = fmul float %200, %228, !dbg !53
245
+ %230 = fmul float %224, %229, !dbg !51
246
+ %231 = fadd float %227, %230, !dbg !52
247
+ %232 = icmp eq i32 %10, 0, !dbg !54
248
+ %233 = zext nneg i32 %12 to i64, !dbg !54
249
+ %234 = getelementptr float, ptr addrspace(3) @global_smem, i64 %233, !dbg !54
250
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %234, float %226, i1 %232) #6, !dbg !54
251
+ %235 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 8), i64 %233, !dbg !54
252
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %235, float %231, i1 %232) #6, !dbg !54
253
+ %236 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), i64 %233, !dbg !54
254
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %236, float %221, i1 %232) #6, !dbg !54
255
+ tail call void @llvm.nvvm.barrier0(), !dbg !54
256
+ %237 = icmp slt i32 %9, 2, !dbg !54
257
+ %238 = sext i32 %9 to i64, !dbg !54
258
+ %239 = getelementptr float, ptr addrspace(3) @global_smem, i64 %238, !dbg !54
259
+ %240 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %239, i1 %237) #6, !dbg !54
260
+ %241 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 8), i64 %238, !dbg !54
261
+ %242 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %241, i1 %237) #6, !dbg !54
262
+ %243 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), i64 %238, !dbg !54
263
+ %244 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %243, i1 %237) #6, !dbg !54
264
+ %245 = bitcast float %240 to i32, !dbg !54
265
+ %246 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %245, i32 1, i32 31), !dbg !54
266
+ %247 = bitcast i32 %246 to float, !dbg !54
267
+ %248 = bitcast float %242 to i32, !dbg !54
268
+ %249 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %248, i32 1, i32 31), !dbg !54
269
+ %250 = bitcast i32 %249 to float, !dbg !54
270
+ %251 = bitcast float %244 to i32, !dbg !54
271
+ %252 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %251, i32 1, i32 31), !dbg !54
272
+ %253 = bitcast i32 %252 to float, !dbg !54
273
+ %254 = fsub float %247, %240, !dbg !39
274
+ %255 = fadd float %244, %253, !dbg !56
275
+ %256 = fcmp oeq float %255, 0.000000e+00, !dbg !57
276
+ %257 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %253, float %255) #6, !dbg !46
277
+ %258 = select i1 %256, float 0.000000e+00, float %257, !dbg !58
278
+ %259 = fmul float %254, %258, !dbg !47
279
+ %260 = fadd float %240, %259, !dbg !48
280
+ %261 = fadd float %242, %250, !dbg !49
281
+ %262 = fmul float %254, %254, !dbg !50
282
+ %263 = fmul float %244, %262, !dbg !53
283
+ %264 = fmul float %263, %258, !dbg !51
284
+ %265 = fadd float %261, %264, !dbg !52
285
+ %266 = and i32 %9, 1, !dbg !54
286
+ %267 = icmp eq i32 %266, 0, !dbg !54
287
+ %268 = and i1 %237, %267, !dbg !54
288
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %239, float %260, i1 %268) #6, !dbg !54
289
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %241, float %265, i1 %268) #6, !dbg !54
290
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %243, float %255, i1 %268) #6, !dbg !54
291
+ tail call void @llvm.nvvm.barrier0(), !dbg !54
292
+ %269 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !54
293
+ %270 = load float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 8), align 4, !dbg !54
294
+ %271 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %26, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !59
295
+ %272 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_first.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %40, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !60
296
+ %273 = extractvalue { i32, i32 } %272, 0, !dbg !60
297
+ %274 = extractvalue { i32, i32 } %272, 1, !dbg !60
298
+ %275 = trunc i32 %273 to i16, !dbg !60
299
+ %extelt.offset2 = lshr i32 %273, 16, !dbg !60
300
+ %276 = trunc i32 %extelt.offset2 to i16, !dbg !60
301
+ %277 = trunc i32 %274 to i16, !dbg !60
302
+ %extelt.offset3 = lshr i32 %274, 16, !dbg !60
303
+ %278 = trunc i32 %extelt.offset3 to i16, !dbg !60
304
+ %279 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %275) #6, !dbg !61
305
+ %280 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %276) #6, !dbg !61
306
+ %281 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %277) #6, !dbg !61
307
+ %282 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %278) #6, !dbg !61
308
+ %283 = getelementptr float, ptr addrspace(1) %4, i64 %62, !dbg !62
309
+ %284 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %283, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !63
310
+ br i1 %56, label %285, label %286, !dbg !64
311
+
312
+ 285: ; preds = %58
313
+ tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 883, ptr nonnull @assertFunc_1, i64 1), !dbg !64
314
+ br label %286, !dbg !64
315
+
316
+ 286: ; preds = %285, %58
317
+ %287 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %64, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !65
318
+ %288 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %270, float 2.560000e+02) #6, !dbg !66
319
+ %289 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %270, float 2.560000e+02) #6, !dbg !66
320
+ %290 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %270, float 2.560000e+02) #6, !dbg !66
321
+ %291 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %270, float 2.560000e+02) #6, !dbg !66
322
+ %292 = fadd float %288, 0x3EE4F8B580000000, !dbg !67
323
+ %293 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !68
324
+ %.not.i = icmp eq i32 %293, 0, !dbg !68
325
+ br i1 %.not.i, label %296, label %294, !dbg !68
326
+
327
+ 294: ; preds = %286
328
+ %295 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %292), !dbg !68
329
+ br label %__nv_rsqrtf.exit, !dbg !68
330
+
331
+ 296: ; preds = %286
332
+ %297 = tail call float @llvm.nvvm.rsqrt.approx.f(float %292), !dbg !68
333
+ br label %__nv_rsqrtf.exit, !dbg !68
334
+
335
+ __nv_rsqrtf.exit: ; preds = %294, %296
336
+ %.0.i = phi float [ %295, %294 ], [ %297, %296 ], !dbg !68
337
+ %298 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !68
338
+ %299 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !68
339
+ %300 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !68
340
+ %301 = extractvalue { i32, i32, i32, i32 } %287, 3, !dbg !65
341
+ %302 = bitcast i32 %301 to float, !dbg !65
342
+ %303 = extractvalue { i32, i32, i32, i32 } %271, 3, !dbg !59
343
+ %304 = bitcast i32 %303 to float, !dbg !59
344
+ %305 = fadd float %304, %302, !dbg !69
345
+ %306 = fadd float %282, %305, !dbg !70
346
+ %307 = fsub float %306, %269, !dbg !71
347
+ %308 = extractvalue { i32, i32, i32, i32 } %287, 2, !dbg !65
348
+ %309 = bitcast i32 %308 to float, !dbg !65
349
+ %310 = extractvalue { i32, i32, i32, i32 } %271, 2, !dbg !59
350
+ %311 = bitcast i32 %310 to float, !dbg !59
351
+ %312 = fadd float %311, %309, !dbg !69
352
+ %313 = fadd float %281, %312, !dbg !70
353
+ %314 = fsub float %313, %269, !dbg !71
354
+ %315 = extractvalue { i32, i32, i32, i32 } %287, 1, !dbg !65
355
+ %316 = bitcast i32 %315 to float, !dbg !65
356
+ %317 = extractvalue { i32, i32, i32, i32 } %271, 1, !dbg !59
357
+ %318 = bitcast i32 %317 to float, !dbg !59
358
+ %319 = fadd float %318, %316, !dbg !69
359
+ %320 = fadd float %280, %319, !dbg !70
360
+ %321 = fsub float %320, %269, !dbg !71
361
+ %322 = extractvalue { i32, i32, i32, i32 } %287, 0, !dbg !65
362
+ %323 = bitcast i32 %322 to float, !dbg !65
363
+ %324 = extractvalue { i32, i32, i32, i32 } %271, 0, !dbg !59
364
+ %325 = bitcast i32 %324 to float, !dbg !59
365
+ %326 = fadd float %325, %323, !dbg !69
366
+ %327 = fadd float %279, %326, !dbg !70
367
+ %328 = fsub float %327, %269, !dbg !71
368
+ %329 = extractvalue { i32, i32, i32, i32 } %284, 0, !dbg !63
369
+ %330 = bitcast i32 %329 to float, !dbg !63
370
+ %331 = extractvalue { i32, i32, i32, i32 } %284, 1, !dbg !63
371
+ %332 = bitcast i32 %331 to float, !dbg !63
372
+ %333 = extractvalue { i32, i32, i32, i32 } %284, 2, !dbg !63
373
+ %334 = bitcast i32 %333 to float, !dbg !63
374
+ %335 = extractvalue { i32, i32, i32, i32 } %284, 3, !dbg !63
375
+ %336 = bitcast i32 %335 to float, !dbg !63
376
+ %337 = fmul float %328, %.0.i, !dbg !72
377
+ %338 = fmul float %321, %.0.i, !dbg !72
378
+ %339 = fmul float %314, %.0.i, !dbg !72
379
+ %340 = fmul float %307, %.0.i, !dbg !72
380
+ %341 = fmul float %337, %330, !dbg !73
381
+ %342 = fmul float %338, %332, !dbg !73
382
+ %343 = fmul float %339, %334, !dbg !73
383
+ %344 = fmul float %340, %336, !dbg !73
384
+ %345 = getelementptr i16, ptr addrspace(1) %5, i64 %39, !dbg !74
385
+ %346 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %341) #6, !dbg !75
386
+ %347 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %342) #6, !dbg !75
387
+ %348 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %343) #6, !dbg !75
388
+ %349 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %344) #6, !dbg !75
389
+ %350 = insertelement <2 x i16> undef, i16 %346, i64 0, !dbg !75
390
+ %351 = insertelement <2 x i16> %350, i16 %347, i64 1, !dbg !75
391
+ %352 = bitcast <2 x i16> %351 to i32, !dbg !75
392
+ %353 = insertelement <2 x i16> undef, i16 %348, i64 0, !dbg !75
393
+ %354 = insertelement <2 x i16> %353, i16 %349, i64 1, !dbg !75
394
+ %355 = bitcast <2 x i16> %354 to i32, !dbg !75
395
+ tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %352, i32 %355, ptr addrspace(1) %345, i1 true) #6, !dbg !75
396
+ ret void, !dbg !76
397
+ }
398
+
399
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
400
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
401
+
402
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
403
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
404
+
405
+ ; Function Attrs: convergent nocallback nounwind
406
+ declare void @llvm.nvvm.barrier0() #2
407
+
408
+ ; Function Attrs: alwaysinline nounwind
409
+ define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 { ; returns the result of one of two NVVM rsqrt.approx intrinsics applied to %x, chosen by an __nvvm_reflect query
410
+ %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6 ; @.str is defined outside this view; presumably the flush-to-zero query string (TODO confirm)
411
+ %.not = icmp eq i32 %1, 0 ; true when the reflect query returned 0
412
+ br i1 %.not, label %4, label %2 ; 0 -> block 4 (plain variant), nonzero -> block 2 (.ftz variant)
413
+
414
+ 2: ; preds = %0
415
+ %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x) ; .ftz variant of the approximate rsqrt intrinsic
416
+ br label %6
417
+
418
+ 4: ; preds = %0
419
+ %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x) ; non-.ftz variant of the approximate rsqrt intrinsic
420
+ br label %6
421
+
422
+ 6: ; preds = %4, %2
423
+ %.0 = phi float [ %3, %2 ], [ %5, %4 ] ; merge the result from whichever branch ran
424
+ ret float %.0
425
+ }
426
+
427
+ declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
428
+
429
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
430
+ declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
431
+
432
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
433
+ declare float @llvm.nvvm.rsqrt.approx.f(float) #5
434
+
435
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
436
+ attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
437
+ attributes #2 = { convergent nocallback nounwind }
438
+ attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
439
+ attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
440
+ attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
441
+ attributes #6 = { nounwind }
442
+
443
+ !llvm.module.flags = !{!0, !1}
444
+ !llvm.dbg.cu = !{!2}
445
+ !nvvm.annotations = !{!4, !5, !5, !4}
446
+ !llvm.ident = !{!6}
447
+
448
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
449
+ !1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
450
+ !2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
451
+ !3 = !DIFile(filename: "cpn3lawg65lpi63gv6c6pn4oikhg6qva2h2qjdpxe6qj4lvttwez.py", directory: "/tmp/torchinductor_root/pn")
452
+ !4 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"kernel", i32 1}
453
+ !5 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"maxntidx", i32 64}
454
+ !6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
455
+ !7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6de7de", linkageName: "triton__0d1d2d3d4d5d6de7de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
456
+ !8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
457
+ !9 = !{}
458
+ !10 = !DILocation(line: 24, column: 33, scope: !7)
459
+ !11 = !DILocation(line: 21, column: 28, scope: !7)
460
+ !12 = !DILocation(line: 26, column: 30, scope: !7)
461
+ !13 = !DILocation(line: 26, column: 35, scope: !7)
462
+ !14 = !DILocation(line: 27, column: 18, scope: !7)
463
+ !15 = !DILocation(line: 35, column: 44, scope: !7)
464
+ !16 = !DILocation(line: 35, column: 40, scope: !7)
465
+ !17 = !DILocation(line: 35, column: 34, scope: !7)
466
+ !18 = !DILocation(line: 35, column: 50, scope: !7)
467
+ !19 = !DILocation(line: 36, column: 44, scope: !7)
468
+ !20 = !DILocation(line: 36, column: 40, scope: !7)
469
+ !21 = !DILocation(line: 36, column: 34, scope: !7)
470
+ !22 = !DILocation(line: 36, column: 50, scope: !7)
471
+ !23 = !DILocation(line: 36, column: 101, scope: !7)
472
+ !24 = !DILocation(line: 37, column: 22, scope: !7)
473
+ !25 = !DILocation(line: 38, column: 22, scope: !7)
474
+ !26 = !DILocation(line: 39, column: 36, scope: !7)
475
+ !27 = !DILocation(line: 40, column: 40, scope: !7)
476
+ !28 = !DILocation(line: 40, column: 55, scope: !7)
477
+ !29 = !DILocation(line: 41, column: 44, scope: !7)
478
+ !30 = !DILocation(line: 41, column: 40, scope: !7)
479
+ !31 = !DILocation(line: 41, column: 34, scope: !7)
480
+ !32 = !DILocation(line: 41, column: 52, scope: !7)
481
+ !33 = !DILocation(line: 42, column: 22, scope: !7)
482
+ !34 = !DILocation(line: 44, column: 22, scope: !7)
483
+ !35 = !DILocation(line: 98, column: 22, scope: !36, inlinedAt: !38)
484
+ !36 = distinct !DILexicalBlockFile(scope: !7, file: !37, discriminator: 0)
485
+ !37 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
486
+ !38 = !DILocation(line: 47, column: 41, scope: !36)
487
+ !39 = !DILocation(line: 108, column: 21, scope: !40, inlinedAt: !41)
488
+ !40 = distinct !DILexicalBlockFile(scope: !36, file: !37, discriminator: 0)
489
+ !41 = !DILocation(line: 120, column: 46, scope: !40, inlinedAt: !42)
490
+ !42 = !DILocation(line: 53, column: 44, scope: !40)
491
+ !43 = !DILocation(line: 101, column: 22, scope: !36, inlinedAt: !38)
492
+ !44 = !DILocation(line: 101, column: 30, scope: !36, inlinedAt: !38)
493
+ !45 = !DILocation(line: 101, column: 13, scope: !36, inlinedAt: !38)
494
+ !46 = !DILocation(line: 110, column: 60, scope: !40, inlinedAt: !41)
495
+ !47 = !DILocation(line: 112, column: 25, scope: !40, inlinedAt: !41)
496
+ !48 = !DILocation(line: 112, column: 17, scope: !40, inlinedAt: !41)
497
+ !49 = !DILocation(line: 113, column: 15, scope: !40, inlinedAt: !41)
498
+ !50 = !DILocation(line: 113, column: 30, scope: !40, inlinedAt: !41)
499
+ !51 = !DILocation(line: 113, column: 49, scope: !40, inlinedAt: !41)
500
+ !52 = !DILocation(line: 113, column: 22, scope: !40, inlinedAt: !41)
501
+ !53 = !DILocation(line: 113, column: 38, scope: !40, inlinedAt: !41)
502
+ !54 = !DILocation(line: 120, column: 46, scope: !36, inlinedAt: !55)
503
+ !55 = !DILocation(line: 53, column: 44, scope: !36)
504
+ !56 = !DILocation(line: 109, column: 28, scope: !40, inlinedAt: !41)
505
+ !57 = !DILocation(line: 110, column: 39, scope: !40, inlinedAt: !41)
506
+ !58 = !DILocation(line: 110, column: 49, scope: !40, inlinedAt: !41)
507
+ !59 = !DILocation(line: 62, column: 51, scope: !7)
508
+ !60 = !DILocation(line: 63, column: 51, scope: !7)
509
+ !61 = !DILocation(line: 63, column: 103, scope: !7)
510
+ !62 = !DILocation(line: 64, column: 35, scope: !7)
511
+ !63 = !DILocation(line: 64, column: 40, scope: !7)
512
+ !64 = !DILocation(line: 68, column: 57, scope: !7)
513
+ !65 = !DILocation(line: 69, column: 54, scope: !7)
514
+ !66 = !DILocation(line: 75, column: 24, scope: !7)
515
+ !67 = !DILocation(line: 77, column: 24, scope: !7)
516
+ !68 = !DILocation(line: 78, column: 30, scope: !7)
517
+ !69 = !DILocation(line: 70, column: 24, scope: !7)
518
+ !70 = !DILocation(line: 72, column: 24, scope: !7)
519
+ !71 = !DILocation(line: 73, column: 24, scope: !7)
520
+ !72 = !DILocation(line: 79, column: 24, scope: !7)
521
+ !73 = !DILocation(line: 80, column: 24, scope: !7)
522
+ !74 = !DILocation(line: 82, column: 29, scope: !7)
523
+ !75 = !DILocation(line: 82, column: 52, scope: !7)
524
+ !76 = !DILocation(line: 58, column: 4, scope: !7)
.triton/dump/305a9479aab997a3a16bfe46bb303a50/triton_.ttgir ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> // layout: 4 elements per thread along dim 1; 32 lanes x 2 warps along dim 1
2
+ #blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> // layout: 1 element per thread; lanes and warps placed along dim 0
3
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { // target: compute capability 89, 1 CTA, 2 warps of 32 threads
4
+ tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { // per program: load an i64 index from %arg0, gather 256 f32 from %arg1 at index*256, add values from %arg2 and %arg3, reduce, normalize, scale by %arg4, store bf16 to %arg5; %arg6/%arg7 are not referenced in the body
5
+ %cst = arith.constant dense<256> : tensor<1x256xi32, #blocked> // upper bound used to build the column mask %9
6
+ %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x256xf32, #blocked> // f32 zero: masked-load fallback and masked-out reduction input
7
+ %cst_1 = arith.constant dense<1.000000e+00> : tensor<1x256xf32, #blocked> // per-element value of the third reduction input where the mask is true
8
+ %cst_2 = arith.constant dense<256> : tensor<1x1xi64, #blocked> // multiplier applied to the loaded index (%32)
9
+ %cst_3 = arith.constant dense<50257> : tensor<1x1xi64, #blocked> // added to a negative index (%23)
10
+ %cst_4 = arith.constant dense<0> : tensor<1x1xi64, #blocked>
11
+ %cst_5 = arith.constant dense<0> : tensor<1x1xi64, #blocked1>
12
+ %cst_6 = arith.constant dense<50257> : tensor<1x1xi64, #blocked1> // same bound in the #blocked1 layout; also the assert's upper limit
13
+ %cst_7 = arith.constant 0.000000e+00 : f32 // scalar zero used inside the reduction combiner
14
+ %c256_i32 = arith.constant 256 : i32
15
+ %c512_i32 = arith.constant 512 : i32
16
+ %cst_8 = arith.constant dense<9.99999974E-6> : tensor<1x1xf32, #blocked> // added to %62 before the rsqrt call
17
+ %cst_9 = arith.constant dense<2.560000e+02> : tensor<1x1xf32, #blocked> // divisor applied to the second reduction result
18
+ %cst_10 = arith.constant dense<0.000000e+00> : tensor<1x256xbf16, #blocked> // bf16 zero: masked-load fallback for %arg3
19
+ %0 = tt.get_program_id x : i32 // program index along x
20
+ %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> // column indices
21
+ %2 = tt.expand_dims %1 {axis = 0 : i32} : (tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x256xi32, #blocked>
22
+ %3 = tt.addptr %arg0, %0 : !tt.ptr<i64, 1>, i32 // address of this program's index: %arg0 + program id
23
+ %4 = tt.splat %3 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>, #blocked>
24
+ %5 = tt.splat %3 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>, #blocked1>
25
+ %6 = tt.load %4 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x1xi64, #blocked> // index, #blocked layout (feeds the gather address)
26
+ %7 = tt.load %5 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x1xi64, #blocked1> // same address loaded again in #blocked1 layout (feeds the assert)
27
+ %8 = arith.remsi %0, %c512_i32 : i32 // program id mod 512 (signed remainder)
28
+ %9 = arith.cmpi slt, %2, %cst : tensor<1x256xi32, #blocked> // mask: column index < 256
29
+ %10 = arith.muli %8, %c256_i32 : i32
30
+ %11 = tt.splat %10 : (i32) -> tensor<1x256xi32, #blocked>
31
+ %12 = arith.addi %2, %11 : tensor<1x256xi32, #blocked> // offset = column + 256 * (program id mod 512)
32
+ %13 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<1x256x!tt.ptr<f32, 1>, #blocked>
33
+ %14 = tt.addptr %13, %12 : tensor<1x256x!tt.ptr<f32, 1>, #blocked>, tensor<1x256xi32, #blocked>
34
+ %15 = tt.load %14, %9, %cst_0 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32, #blocked> // masked load from %arg2
35
+ %16 = arith.muli %0, %c256_i32 : i32
36
+ %17 = tt.splat %16 : (i32) -> tensor<1x256xi32, #blocked>
37
+ %18 = arith.addi %2, %17 : tensor<1x256xi32, #blocked> // offset = column + 256 * program id (used for %arg3 loads and the %arg5 store)
38
+ %19 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<1x256x!tt.ptr<bf16, 1>, #blocked>
39
+ %20 = tt.addptr %19, %18 : tensor<1x256x!tt.ptr<bf16, 1>, #blocked>, tensor<1x256xi32, #blocked>
40
+ %21 = tt.load %20, %9, %cst_10 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xbf16, #blocked> // masked load from %arg3
41
+ %22 = arith.extf %21 : tensor<1x256xbf16, #blocked> to tensor<1x256xf32, #blocked> // widen bf16 to f32
42
+ %23 = arith.addi %6, %cst_3 : tensor<1x1xi64, #blocked> // index + 50257
43
+ %24 = arith.addi %7, %cst_6 : tensor<1x1xi64, #blocked1>
44
+ %25 = arith.cmpi slt, %6, %cst_4 : tensor<1x1xi64, #blocked> // index < 0 ?
45
+ %26 = arith.cmpi slt, %7, %cst_5 : tensor<1x1xi64, #blocked1>
46
+ %27 = arith.select %25, %23, %6 : tensor<1x1xi1, #blocked>, tensor<1x1xi64, #blocked> // negative index is shifted up by 50257, otherwise used as is
47
+ %28 = arith.select %26, %24, %7 : tensor<1x1xi1, #blocked1>, tensor<1x1xi64, #blocked1> // same adjustment in the #blocked1 layout
48
+ %29 = arith.cmpi sge, %28, %cst_5 : tensor<1x1xi64, #blocked1>
49
+ %30 = arith.cmpi slt, %28, %cst_6 : tensor<1x1xi64, #blocked1>
50
+ %31 = arith.andi %29, %30 : tensor<1x1xi1, #blocked1> // 0 <= adjusted index < 50257
51
+ tt.assert %31, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<1x1xi1, #blocked1>
52
+ %32 = arith.muli %27, %cst_2 : tensor<1x1xi64, #blocked> // adjusted index * 256
53
+ %33 = tt.broadcast %32 : (tensor<1x1xi64, #blocked>) -> tensor<1x256xi64, #blocked>
54
+ %34 = arith.extsi %2 : tensor<1x256xi32, #blocked> to tensor<1x256xi64, #blocked>
55
+ %35 = arith.addi %34, %33 : tensor<1x256xi64, #blocked> // 64-bit offset = column + adjusted index * 256
56
+ %36 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<1x256x!tt.ptr<f32, 1>, #blocked>
57
+ %37 = tt.addptr %36, %35 : tensor<1x256x!tt.ptr<f32, 1>, #blocked>, tensor<1x256xi64, #blocked>
58
+ %38 = tt.load %37, %9, %cst_0 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32, #blocked> // masked gather from %arg1
59
+ %39 = arith.addf %38, %15 : tensor<1x256xf32, #blocked>
60
+ %40 = arith.addf %39, %22 : tensor<1x256xf32, #blocked> // sum of the three loaded values
61
+ %41 = arith.addf %40, %cst_0 : tensor<1x256xf32, #blocked> // %40 + 0.0; NOTE(review): looks like an unfolded first accumulation step against zero-initialized state - confirm
62
+ %42 = arith.subf %40, %41 : tensor<1x256xf32, #blocked>
63
+ %43 = arith.mulf %40, %42 : tensor<1x256xf32, #blocked>
64
+ %44 = arith.addf %43, %cst_0 : tensor<1x256xf32, #blocked>
65
+ %45 = arith.select %9, %41, %cst_0 : tensor<1x256xi1, #blocked>, tensor<1x256xf32, #blocked> // first reduction input, zero where masked out
66
+ %46 = arith.select %9, %44, %cst_0 : tensor<1x256xi1, #blocked>, tensor<1x256xf32, #blocked> // second reduction input, zero where masked out
67
+ %47 = arith.select %9, %cst_1, %cst_0 : tensor<1x256xi1, #blocked>, tensor<1x256xf32, #blocked> // third reduction input: 1.0 where mask is true, else 0.0
68
+ %48:3 = "tt.reduce"(%45, %46, %47) <{axis = 1 : i32}> ({ // three-value reduction along axis 1
69
+ ^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32): // (%arg8, %arg9, %arg10) and (%arg11, %arg12, %arg13) are the two triples being combined; NOTE(review): arithmetic resembles a (mean, M2, weight) Welford-style merge - confirm against the generating helper
70
+ %71 = arith.subf %arg11, %arg8 : f32 // difference of the first components
71
+ %72 = arith.addf %arg10, %arg13 : f32 // sum of the third components
72
+ %73 = arith.cmpf oeq, %72, %cst_7 : f32
73
+ %74 = arith.divf %arg13, %72 : f32
74
+ %75 = arith.select %73, %cst_7, %74 : f32 // ratio %arg13 / %72, forced to 0 when %72 == 0
75
+ %76 = arith.mulf %71, %75 : f32
76
+ %77 = arith.addf %arg8, %76 : f32 // combined first component
77
+ %78 = arith.addf %arg9, %arg12 : f32
78
+ %79 = arith.mulf %71, %71 : f32
79
+ %80 = arith.mulf %79, %arg10 : f32
80
+ %81 = arith.mulf %80, %75 : f32
81
+ %82 = arith.addf %78, %81 : f32 // combined second component: %arg9 + %arg12 + %71^2 * %arg10 * %75
82
+ tt.reduce.return %77, %82, %72 : f32, f32, f32
83
+ }) : (tensor<1x256xf32, #blocked>, tensor<1x256xf32, #blocked>, tensor<1x256xf32, #blocked>) -> (tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>)
84
+ %49 = tt.expand_dims %48#0 {axis = 1 : i32} : (tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<1x1xf32, #blocked> // first reduction result as 1x1
85
+ %50 = tt.expand_dims %48#1 {axis = 1 : i32} : (tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<1x1xf32, #blocked> // second reduction result as 1x1
86
+ %51 = tt.load %14, %9, %cst_0 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32, #blocked> // second load from the same %arg2 addresses as %15
87
+ %52 = tt.load %20, %9, %cst_10 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x256xbf16, #blocked> // second load from the same %arg3 addresses as %21 (evict = 2 this time)
88
+ %53 = arith.extf %52 : tensor<1x256xbf16, #blocked> to tensor<1x256xf32, #blocked>
89
+ %54 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<1x256x!tt.ptr<f32, 1>, #blocked>
90
+ %55 = tt.addptr %54, %2 : tensor<1x256x!tt.ptr<f32, 1>, #blocked>, tensor<1x256xi32, #blocked> // %arg4 indexed by column only (same for every program)
91
+ %56 = tt.load %55, %9, %cst_0 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32, #blocked> // per-column scale factors
92
+ tt.assert %31, "index out of bounds: 0 <= tmp16 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<1x1xi1, #blocked1>
93
+ %57 = tt.load %37, %9, %cst_0 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x256xf32, #blocked> // second gather from the same %arg1 addresses as %38 (evict = 2 this time)
94
+ %58 = arith.addf %57, %51 : tensor<1x256xf32, #blocked>
95
+ %59 = arith.addf %58, %53 : tensor<1x256xf32, #blocked> // recomputed sum of the three loaded values
96
+ %60 = tt.broadcast %49 : (tensor<1x1xf32, #blocked>) -> tensor<1x256xf32, #blocked>
97
+ %61 = arith.subf %59, %60 : tensor<1x256xf32, #blocked> // subtract the first reduction result
98
+ %62 = arith.divf %50, %cst_9 : tensor<1x1xf32, #blocked> // second reduction result / 256
99
+ %63 = arith.addf %62, %cst_8 : tensor<1x1xf32, #blocked> // + 9.99999974E-6
100
+ %64 = tt.extern_elementwise %63 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32, #blocked>) -> tensor<1x1xf32, #blocked> // libdevice __nv_rsqrtf applied to %63
101
+ %65 = tt.broadcast %64 : (tensor<1x1xf32, #blocked>) -> tensor<1x256xf32, #blocked>
102
+ %66 = arith.mulf %61, %65 : tensor<1x256xf32, #blocked> // (value - first reduction result) * rsqrt term
103
+ %67 = arith.mulf %66, %56 : tensor<1x256xf32, #blocked> // multiply by the %arg4 values
104
+ %68 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<1x256x!tt.ptr<bf16, 1>, #blocked>
105
+ %69 = tt.addptr %68, %18 : tensor<1x256x!tt.ptr<bf16, 1>, #blocked>, tensor<1x256xi32, #blocked> // output address: %arg5 + column + 256 * program id
106
+ %70 = arith.truncf %67 : tensor<1x256xf32, #blocked> to tensor<1x256xbf16, #blocked> // narrow f32 to bf16
107
+ tt.store %69, %70, %9 {cache = 1 : i32, evict = 1 : i32} : tensor<1x256xbf16, #blocked> // masked store to %arg5
108
+ tt.return
109
+ }
110
+ }
.triton/dump/305a9479aab997a3a16bfe46bb303a50/triton_.ttir ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module { // TTIR for one kernel: each program reads one i64 index from %arg0, gathers a 256-wide f32 row of %arg1 at that index, adds a row of %arg2 and a bf16 row of %arg3, reduces (mean, M2, weight) over the 256 columns, then stores ((x - mean) * rsqrt(M2/256 + eps)) * %arg4 as bf16 into %arg5.
2
+ tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { // %arg6 and %arg7 are never referenced in the body below.
3
+ %c512_i32 = arith.constant 512 : i32
4
+ %c256_i32 = arith.constant 256 : i32
5
+ %cst = arith.constant dense<0.000000e+00> : tensor<1x256xbf16> // fallback value for the masked bf16 loads
6
+ %cst_0 = arith.constant dense<1.000000e+00> : tensor<1x256xf32> // per-lane weight fed to the reduction
7
+ %cst_1 = arith.constant 0.000000e+00 : f32
8
+ %cst_2 = arith.constant dense<256> : tensor<1x1xi64> // row stride (elements) used for the %arg1 gather
9
+ %cst_3 = arith.constant dense<50257> : tensor<1x1xi64> // bound used both to wrap negative indices and in the assert
10
+ %cst_4 = arith.constant dense<0> : tensor<1x1xi64>
11
+ %cst_5 = arith.constant dense<9.99999974E-6> : tensor<1x1xf32> // added to M2/256 before the rsqrt
12
+ %cst_6 = arith.constant dense<2.560000e+02> : tensor<1x1xf32> // divisor applied to the reduced M2
13
+ %cst_7 = arith.constant dense<0.000000e+00> : tensor<1x256xf32>
14
+ %cst_8 = arith.constant dense<256> : tensor<1x256xi32>
15
+ %0 = tt.get_program_id x : i32
16
+ %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
17
+ %2 = tt.expand_dims %1 {axis = 0 : i32} : (tensor<256xi32>) -> tensor<1x256xi32> // column indices, shape 1x256
18
+ %3 = tt.addptr %arg0, %0 : !tt.ptr<i64, 1>, i32
19
+ %4 = tt.splat %3 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>>
20
+ %5 = tt.load %4 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x1xi64> // idx = arg0[pid]
21
+ %6 = arith.remsi %0, %c512_i32 : i32 // pid % 512 (signed remainder): row selector for %arg2
22
+ %7 = arith.cmpi slt, %2, %cst_8 : tensor<1x256xi32> // mask: column < 256; used by every masked load/store and by the selects before the reduce
23
+ %8 = arith.muli %6, %c256_i32 : i32
24
+ %9 = tt.splat %8 : (i32) -> tensor<1x256xi32>
25
+ %10 = arith.addi %2, %9 : tensor<1x256xi32>
26
+ %11 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<1x256x!tt.ptr<f32, 1>>
27
+ %12 = tt.addptr %11, %10 : tensor<1x256x!tt.ptr<f32, 1>>, tensor<1x256xi32>
28
+ %13 = tt.load %12, %7, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32> // arg2[(pid % 512) * 256 + col]
29
+ %14 = arith.muli %0, %c256_i32 : i32
30
+ %15 = tt.splat %14 : (i32) -> tensor<1x256xi32>
31
+ %16 = arith.addi %2, %15 : tensor<1x256xi32> // pid * 256 + col; reused for the %arg5 store address
32
+ %17 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<1x256x!tt.ptr<bf16, 1>>
33
+ %18 = tt.addptr %17, %16 : tensor<1x256x!tt.ptr<bf16, 1>>, tensor<1x256xi32>
34
+ %19 = tt.load %18, %7, %cst {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xbf16> // arg3[pid * 256 + col]
35
+ %20 = arith.extf %19 : tensor<1x256xbf16> to tensor<1x256xf32>
36
+ %21 = arith.addi %5, %cst_3 : tensor<1x1xi64>
37
+ %22 = arith.cmpi slt, %5, %cst_4 : tensor<1x1xi64>
38
+ %23 = arith.select %22, %21, %5 : tensor<1x1xi1>, tensor<1x1xi64> // wrapped = idx < 0 ? idx + 50257 : idx
39
+ %24 = arith.cmpi sge, %23, %cst_4 : tensor<1x1xi64>
40
+ %25 = arith.cmpi slt, %23, %cst_3 : tensor<1x1xi64>
41
+ %26 = arith.andi %24, %25 : tensor<1x1xi1> // 0 <= wrapped < 50257
42
+ tt.assert %26, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<1x1xi1>
43
+ %27 = arith.muli %23, %cst_2 : tensor<1x1xi64> // wrapped * 256, kept in i64
44
+ %28 = tt.broadcast %27 : (tensor<1x1xi64>) -> tensor<1x256xi64>
45
+ %29 = arith.extsi %2 : tensor<1x256xi32> to tensor<1x256xi64>
46
+ %30 = arith.addi %29, %28 : tensor<1x256xi64>
47
+ %31 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<1x256x!tt.ptr<f32, 1>>
48
+ %32 = tt.addptr %31, %30 : tensor<1x256x!tt.ptr<f32, 1>>, tensor<1x256xi64>
49
+ %33 = tt.load %32, %7, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32> // arg1[wrapped * 256 + col]
50
+ %34 = arith.addf %33, %13 : tensor<1x256xf32>
51
+ %35 = arith.addf %34, %20 : tensor<1x256xf32> // x = arg1 row + arg2 row + float(arg3 row)
52
+ %36 = arith.addf %35, %cst_7 : tensor<1x256xf32> // per-lane mean seed: x + 0
53
+ %37 = arith.subf %35, %36 : tensor<1x256xf32>
54
+ %38 = arith.mulf %35, %37 : tensor<1x256xf32>
55
+ %39 = arith.addf %38, %cst_7 : tensor<1x256xf32> // per-lane M2 seed: x * (x - mean seed) + 0
56
+ %40 = arith.select %7, %36, %cst_7 : tensor<1x256xi1>, tensor<1x256xf32>
57
+ %41 = arith.select %7, %39, %cst_7 : tensor<1x256xi1>, tensor<1x256xf32>
58
+ %42 = arith.select %7, %cst_0, %cst_7 : tensor<1x256xi1>, tensor<1x256xf32> // weight: 1.0 where the mask holds, else 0.0
59
+ %43:3 = "tt.reduce"(%40, %41, %42) <{axis = 1 : i32}> ({ // reduce (mean, M2, weight) triples along the 256-column axis
60
+ ^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32): // as used below: (%arg8, %arg9, %arg10) and (%arg11, %arg12, %arg13) are the two (mean, M2, weight) triples being merged
61
+ %66 = arith.subf %arg11, %arg8 : f32 // delta between the two means
62
+ %67 = arith.addf %arg10, %arg13 : f32 // combined weight
63
+ %68 = arith.cmpf oeq, %67, %cst_1 : f32
64
+ %69 = arith.divf %arg13, %67 : f32
65
+ %70 = arith.select %68, %cst_1, %69 : f32 // ratio = weight_b / combined, forced to 0 when the combined weight is 0
66
+ %71 = arith.mulf %66, %70 : f32
67
+ %72 = arith.addf %arg8, %71 : f32 // merged mean = mean_a + delta * ratio
68
+ %73 = arith.addf %arg9, %arg12 : f32
69
+ %74 = arith.mulf %66, %66 : f32
70
+ %75 = arith.mulf %74, %arg10 : f32
71
+ %76 = arith.mulf %75, %70 : f32
72
+ %77 = arith.addf %73, %76 : f32 // merged M2 = M2_a + M2_b + delta^2 * weight_a * ratio
73
+ tt.reduce.return %72, %77, %67 : f32, f32, f32
74
+ }) : (tensor<1x256xf32>, tensor<1x256xf32>, tensor<1x256xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>)
75
+ %44 = tt.expand_dims %43#0 {axis = 1 : i32} : (tensor<1xf32>) -> tensor<1x1xf32> // reduced mean; the reduced weight (%43#2) is not used afterwards
76
+ %45 = tt.expand_dims %43#1 {axis = 1 : i32} : (tensor<1xf32>) -> tensor<1x1xf32> // reduced M2
77
+ %46 = tt.load %12, %7, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32> // second load of the same %arg2 addresses as %13
78
+ %47 = tt.load %18, %7, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x256xbf16> // second load of the same %arg3 addresses as %19 (evict attribute differs)
79
+ %48 = arith.extf %47 : tensor<1x256xbf16> to tensor<1x256xf32>
80
+ %49 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<1x256x!tt.ptr<f32, 1>>
81
+ %50 = tt.addptr %49, %2 : tensor<1x256x!tt.ptr<f32, 1>>, tensor<1x256xi32>
82
+ %51 = tt.load %50, %7, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32> // arg4[col]: per-column multiplier applied at %62
83
+ tt.assert %26, "index out of bounds: 0 <= tmp16 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<1x1xi1> // re-asserts the same predicate %26 under a different message
84
+ %52 = tt.load %32, %7, %cst_7 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x256xf32> // second load of the same %arg1 addresses as %33 (evict attribute differs)
85
+ %53 = arith.addf %52, %46 : tensor<1x256xf32>
86
+ %54 = arith.addf %53, %48 : tensor<1x256xf32> // x recomputed from the reloaded values
87
+ %55 = tt.broadcast %44 : (tensor<1x1xf32>) -> tensor<1x256xf32>
88
+ %56 = arith.subf %54, %55 : tensor<1x256xf32> // x - mean
89
+ %57 = arith.divf %45, %cst_6 : tensor<1x1xf32> // M2 / 256.0
90
+ %58 = arith.addf %57, %cst_5 : tensor<1x1xf32> // + 9.99999974E-6
91
+ %59 = tt.extern_elementwise %58 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32>) -> tensor<1x1xf32> // calls the libdevice symbol __nv_rsqrtf on the value above
92
+ %60 = tt.broadcast %59 : (tensor<1x1xf32>) -> tensor<1x256xf32>
93
+ %61 = arith.mulf %56, %60 : tensor<1x256xf32>
94
+ %62 = arith.mulf %61, %51 : tensor<1x256xf32> // (x - mean) * rsqrt(...) * arg4[col]
95
+ %63 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<1x256x!tt.ptr<bf16, 1>>
96
+ %64 = tt.addptr %63, %16 : tensor<1x256x!tt.ptr<bf16, 1>>, tensor<1x256xi32>
97
+ %65 = arith.truncf %62 : tensor<1x256xf32> to tensor<1x256xbf16>
98
+ tt.store %64, %65, %7 {cache = 1 : i32, evict = 1 : i32} : tensor<1x256xbf16> // arg5[pid * 256 + col] = bf16(result)
99
+ tt.return
100
+ }
101
+ }
.triton/dump/473cf6e25c3e63117cd59fc0ed04b89f/triton_.llir ADDED
@@ -0,0 +1,550 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @assertFunc_1 = internal constant [25 x i8] c"_call_with_frames_removed" ; function-name argument of the second @__assertfail call in the kernel
5
+ @assertFile_1 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>" ; file-name argument of the second @__assertfail call
6
+ @assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp16 < 50257" ; message argument of the second @__assertfail call
7
+ @assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed" ; function-name argument of the first @__assertfail call
8
+ @assertFile_0 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>" ; file-name argument of the first @__assertfail call
9
+ @assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257" ; message argument of the first @__assertfail call. NOTE(review): each assert string's array length equals its visible character count, so no trailing \00 is stored (unlike @.str below) - confirm the consumer does not rely on NUL termination
10
+ @global_smem = external addrspace(3) global [0 x i8] ; zero-length external array in addrspace(3); the kernel addresses it with st.shared/ld.shared inline asm as scratch for the cross-warp part of the reduction
11
+ @.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 ; passed to @__nvvm_reflect to choose between the ftz and non-ftz rsqrt.approx intrinsics
12
+
13
+ declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr ; the kernel's call sites pass (message, file, i32 883, function, i64 1)
14
+
15
+ define void @triton__0d1d2d3d4d5d6de7de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, i32 %6, i32 %7) local_unnamed_addr !dbg !7 {
16
+ %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
17
+ %10 = and i32 %9, 31, !dbg !10
18
+ %11 = lshr i32 %9, 5, !dbg !10
19
+ %12 = lshr i32 %9, 6, !dbg !10
20
+ %13 = and i32 %12, 1, !dbg !10
21
+ %14 = and i32 %9, 1, !dbg !10
22
+ %15 = and i32 %11, 1, !dbg !11
23
+ %urem = shl i32 %9, 2, !dbg !11
24
+ %16 = and i32 %urem, 252, !dbg !11
25
+ %17 = shl i32 %9, 1, !dbg !11
26
+ %18 = and i32 %17, 254, !dbg !11
27
+ %19 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !12
28
+ %20 = shl i32 %19, 1, !dbg !13
29
+ %21 = or i32 %20, %13, !dbg !14
30
+ %22 = or i32 %20, %14, !dbg !14
31
+ %23 = sext i32 %21 to i64, !dbg !15
32
+ %24 = getelementptr i64, ptr addrspace(1) %0, i64 %23, !dbg !15
33
+ %25 = sext i32 %22 to i64, !dbg !15
34
+ %26 = getelementptr i64, ptr addrspace(1) %0, i64 %25, !dbg !15
35
+ %27 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16
36
+ %28 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16
37
+ %29 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16
38
+ %30 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16
39
+ %31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %26, i1 true) #6, !dbg !16
40
+ %32 = srem i32 %21, 512, !dbg !17
41
+ %33 = shl nsw i32 %32, 8, !dbg !18
42
+ %34 = or i32 %33, %16, !dbg !19
43
+ %35 = sext i32 %34 to i64, !dbg !20
44
+ %36 = getelementptr float, ptr addrspace(1) %2, i64 %35, !dbg !20
45
+ %37 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %36, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !21
46
+ %38 = extractvalue { i32, i32, i32, i32 } %37, 0, !dbg !21
47
+ %39 = extractvalue { i32, i32, i32, i32 } %37, 1, !dbg !21
48
+ %40 = extractvalue { i32, i32, i32, i32 } %37, 2, !dbg !21
49
+ %41 = extractvalue { i32, i32, i32, i32 } %37, 3, !dbg !21
50
+ %42 = insertelement <2 x i32> poison, i32 %39, i64 0, !dbg !21
51
+ %43 = insertelement <2 x i32> %42, i32 %38, i64 1, !dbg !21
52
+ %44 = bitcast <2 x i32> %43 to <2 x float>, !dbg !21
53
+ %45 = bitcast i32 %40 to float, !dbg !21
54
+ %46 = bitcast i32 %41 to float, !dbg !21
55
+ %47 = shl i32 %21, 8, !dbg !22
56
+ %48 = or i32 %47, %16, !dbg !23
57
+ %49 = sext i32 %48 to i64, !dbg !24
58
+ %50 = getelementptr i16, ptr addrspace(1) %3, i64 %49, !dbg !24
59
+ %51 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_last.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %50, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !25
60
+ %52 = extractvalue { i32, i32 } %51, 0, !dbg !25
61
+ %53 = extractvalue { i32, i32 } %51, 1, !dbg !25
62
+ %54 = trunc i32 %52 to i16, !dbg !25
63
+ %extelt.offset = lshr i32 %52, 16, !dbg !25
64
+ %55 = trunc i32 %extelt.offset to i16, !dbg !25
65
+ %56 = trunc i32 %53 to i16, !dbg !25
66
+ %extelt.offset1 = lshr i32 %53, 16, !dbg !25
67
+ %57 = trunc i32 %extelt.offset1 to i16, !dbg !25
68
+ %58 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %54) #6, !dbg !26
69
+ %59 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %55) #6, !dbg !26
70
+ %60 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %56) #6, !dbg !26
71
+ %61 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %57) #6, !dbg !26
72
+ %62 = add i64 %31, 50257, !dbg !27
73
+ %63 = icmp slt i64 %27, 0, !dbg !28
74
+ %64 = icmp slt i64 %31, 0, !dbg !28
75
+ %65 = select i1 %64, i64 %62, i64 %31, !dbg !29
76
+ %66 = icmp ugt i64 %65, 50256, !dbg !30
77
+ br i1 %66, label %67, label %68, !dbg !31
78
+
79
+ 67: ; preds = %8
80
+ tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !31
81
+ br label %68, !dbg !31
82
+
83
+ 68: ; preds = %67, %8
84
+ %69 = shl i64 %27, 8, !dbg !32
85
+ %70 = add i64 %69, 12865792, !dbg !32
86
+ %71 = select i1 %63, i64 %70, i64 %69, !dbg !32
87
+ %72 = zext nneg i32 %16 to i64
88
+ %73 = or i64 %71, %72, !dbg !33
89
+ %74 = getelementptr float, ptr addrspace(1) %1, i64 %73, !dbg !34
90
+ %75 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %74, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !35
91
+ %76 = extractvalue { i32, i32, i32, i32 } %75, 0, !dbg !35
92
+ %77 = extractvalue { i32, i32, i32, i32 } %75, 1, !dbg !35
93
+ %78 = extractvalue { i32, i32, i32, i32 } %75, 2, !dbg !35
94
+ %79 = extractvalue { i32, i32, i32, i32 } %75, 3, !dbg !35
95
+ %80 = bitcast i32 %78 to float, !dbg !35
96
+ %81 = bitcast i32 %79 to float, !dbg !35
97
+ %82 = fadd float %45, %80, !dbg !36
98
+ %83 = fadd float %46, %81, !dbg !36
99
+ %84 = fadd float %60, %82, !dbg !37
100
+ %85 = fadd float %61, %83, !dbg !37
101
+ %86 = insertelement <2 x i32> poison, i32 %77, i64 0, !dbg !35
102
+ %87 = insertelement <2 x i32> %86, i32 %76, i64 1, !dbg !35
103
+ %88 = bitcast <2 x i32> %87 to <2 x float>, !dbg !35
104
+ %89 = fadd <2 x float> %44, %88, !dbg !36
105
+ %90 = insertelement <2 x float> poison, float %59, i64 0, !dbg !37
106
+ %91 = insertelement <2 x float> %90, float %58, i64 1, !dbg !37
107
+ %92 = fadd <2 x float> %91, %89, !dbg !37
108
+ %93 = fadd <2 x float> %92, zeroinitializer, !dbg !38
109
+ %94 = fadd float %84, 0.000000e+00, !dbg !38
110
+ %95 = fadd float %85, 0.000000e+00, !dbg !38
111
+ %96 = extractelement <2 x float> %93, i64 1, !dbg !42
112
+ %97 = extractelement <2 x float> %92, i64 1, !dbg !46
113
+ %98 = fsub float %97, %96, !dbg !47
114
+ %99 = extractelement <2 x float> %93, i64 0, !dbg !42
115
+ %100 = extractelement <2 x float> %92, i64 0, !dbg !46
116
+ %101 = fsub float %100, %99, !dbg !47
117
+ %102 = fsub float %84, %94, !dbg !47
118
+ %103 = fsub float %85, %95, !dbg !47
119
+ %104 = fmul float %97, %98, !dbg !46
120
+ %105 = fmul float %100, %101, !dbg !46
121
+ %106 = fmul float %84, %102, !dbg !46
122
+ %107 = fmul float %85, %103, !dbg !46
123
+ %108 = fadd float %104, 0.000000e+00, !dbg !48
124
+ %109 = fadd float %105, 0.000000e+00, !dbg !48
125
+ %110 = fadd float %106, 0.000000e+00, !dbg !48
126
+ %111 = fadd float %107, 0.000000e+00, !dbg !48
127
+ %112 = fsub float %99, %96, !dbg !42
128
+ %113 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 2.000000e+00) #6, !dbg !49
129
+ %114 = fmul float %113, %112, !dbg !50
130
+ %115 = fadd float %96, %114, !dbg !51
131
+ %116 = fadd float %108, %109, !dbg !52
132
+ %117 = fmul float %112, %112, !dbg !53
133
+ %118 = fmul float %113, %117, !dbg !54
134
+ %119 = fadd float %118, %116, !dbg !55
135
+ %120 = fsub float %94, %115, !dbg !42
136
+ %121 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 3.000000e+00) #6, !dbg !49
137
+ %122 = fmul float %121, %120, !dbg !50
138
+ %123 = fadd float %115, %122, !dbg !51
139
+ %124 = fadd float %110, %119, !dbg !52
140
+ %125 = fmul float %120, %120, !dbg !53
141
+ %126 = fmul float %125, 2.000000e+00, !dbg !56
142
+ %127 = fmul float %121, %126, !dbg !54
143
+ %128 = fadd float %124, %127, !dbg !55
144
+ %129 = fsub float %95, %123, !dbg !42
145
+ %130 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 4.000000e+00) #6, !dbg !49
146
+ %131 = fmul float %130, %129, !dbg !50
147
+ %132 = fadd float %123, %131, !dbg !51
148
+ %133 = fadd float %111, %128, !dbg !52
149
+ %134 = fmul float %129, %129, !dbg !53
150
+ %135 = fmul float %134, 3.000000e+00, !dbg !56
151
+ %136 = fmul float %130, %135, !dbg !54
152
+ %137 = fadd float %133, %136, !dbg !55
153
+ %138 = bitcast float %132 to i32, !dbg !57
154
+ %139 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %138, i32 16, i32 31), !dbg !57
155
+ %140 = bitcast i32 %139 to float, !dbg !57
156
+ %141 = bitcast float %137 to i32, !dbg !57
157
+ %142 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %141, i32 16, i32 31), !dbg !57
158
+ %143 = bitcast i32 %142 to float, !dbg !57
159
+ %144 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 1082130432, i32 16, i32 31), !dbg !57
160
+ %145 = bitcast i32 %144 to float, !dbg !57
161
+ %146 = fsub float %140, %132, !dbg !42
162
+ %147 = fadd float %145, 4.000000e+00, !dbg !59
163
+ %148 = fcmp oeq float %147, 0.000000e+00, !dbg !60
164
+ %149 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %145, float %147) #6, !dbg !49
165
+ %150 = select i1 %148, float 0.000000e+00, float %149, !dbg !61
166
+ %151 = fmul float %150, %146, !dbg !50
167
+ %152 = fadd float %132, %151, !dbg !51
168
+ %153 = fadd float %137, %143, !dbg !52
169
+ %154 = fmul float %146, %146, !dbg !53
170
+ %155 = fmul float %154, 4.000000e+00, !dbg !56
171
+ %156 = fmul float %150, %155, !dbg !54
172
+ %157 = fadd float %153, %156, !dbg !55
173
+ %158 = bitcast float %152 to i32, !dbg !57
174
+ %159 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %158, i32 8, i32 31), !dbg !57
175
+ %160 = bitcast i32 %159 to float, !dbg !57
176
+ %161 = bitcast float %157 to i32, !dbg !57
177
+ %162 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %161, i32 8, i32 31), !dbg !57
178
+ %163 = bitcast i32 %162 to float, !dbg !57
179
+ %164 = bitcast float %147 to i32, !dbg !57
180
+ %165 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %164, i32 8, i32 31), !dbg !57
181
+ %166 = bitcast i32 %165 to float, !dbg !57
182
+ %167 = fsub float %160, %152, !dbg !42
183
+ %168 = fadd float %147, %166, !dbg !59
184
+ %169 = fcmp oeq float %168, 0.000000e+00, !dbg !60
185
+ %170 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %166, float %168) #6, !dbg !49
186
+ %171 = select i1 %169, float 0.000000e+00, float %170, !dbg !61
187
+ %172 = fmul float %171, %167, !dbg !50
188
+ %173 = fadd float %152, %172, !dbg !51
189
+ %174 = fadd float %157, %163, !dbg !52
190
+ %175 = fmul float %167, %167, !dbg !53
191
+ %176 = fmul float %147, %175, !dbg !56
192
+ %177 = fmul float %171, %176, !dbg !54
193
+ %178 = fadd float %174, %177, !dbg !55
194
+ %179 = bitcast float %173 to i32, !dbg !57
195
+ %180 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %179, i32 4, i32 31), !dbg !57
196
+ %181 = bitcast i32 %180 to float, !dbg !57
197
+ %182 = bitcast float %178 to i32, !dbg !57
198
+ %183 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %182, i32 4, i32 31), !dbg !57
199
+ %184 = bitcast i32 %183 to float, !dbg !57
200
+ %185 = bitcast float %168 to i32, !dbg !57
201
+ %186 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %185, i32 4, i32 31), !dbg !57
202
+ %187 = bitcast i32 %186 to float, !dbg !57
203
+ %188 = fsub float %181, %173, !dbg !42
204
+ %189 = fadd float %168, %187, !dbg !59
205
+ %190 = fcmp oeq float %189, 0.000000e+00, !dbg !60
206
+ %191 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %187, float %189) #6, !dbg !49
207
+ %192 = select i1 %190, float 0.000000e+00, float %191, !dbg !61
208
+ %193 = fmul float %192, %188, !dbg !50
209
+ %194 = fadd float %173, %193, !dbg !51
210
+ %195 = fadd float %178, %184, !dbg !52
211
+ %196 = fmul float %188, %188, !dbg !53
212
+ %197 = fmul float %168, %196, !dbg !56
213
+ %198 = fmul float %192, %197, !dbg !54
214
+ %199 = fadd float %195, %198, !dbg !55
215
+ %200 = bitcast float %194 to i32, !dbg !57
216
+ %201 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %200, i32 2, i32 31), !dbg !57
217
+ %202 = bitcast i32 %201 to float, !dbg !57
218
+ %203 = bitcast float %199 to i32, !dbg !57
219
+ %204 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %203, i32 2, i32 31), !dbg !57
220
+ %205 = bitcast i32 %204 to float, !dbg !57
221
+ %206 = bitcast float %189 to i32, !dbg !57
222
+ %207 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %206, i32 2, i32 31), !dbg !57
223
+ %208 = bitcast i32 %207 to float, !dbg !57
224
+ %209 = fsub float %202, %194, !dbg !42
225
+ %210 = fadd float %189, %208, !dbg !59
226
+ %211 = fcmp oeq float %210, 0.000000e+00, !dbg !60
227
+ %212 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %208, float %210) #6, !dbg !49
228
+ %213 = select i1 %211, float 0.000000e+00, float %212, !dbg !61
229
+ %214 = fmul float %213, %209, !dbg !50
230
+ %215 = fadd float %194, %214, !dbg !51
231
+ %216 = fadd float %199, %205, !dbg !52
232
+ %217 = fmul float %209, %209, !dbg !53
233
+ %218 = fmul float %189, %217, !dbg !56
234
+ %219 = fmul float %213, %218, !dbg !54
235
+ %220 = fadd float %216, %219, !dbg !55
236
+ %221 = bitcast float %215 to i32, !dbg !57
237
+ %222 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %221, i32 1, i32 31), !dbg !57
238
+ %223 = bitcast i32 %222 to float, !dbg !57
239
+ %224 = bitcast float %220 to i32, !dbg !57
240
+ %225 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %224, i32 1, i32 31), !dbg !57
241
+ %226 = bitcast i32 %225 to float, !dbg !57
242
+ %227 = bitcast float %210 to i32, !dbg !57
243
+ %228 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %227, i32 1, i32 31), !dbg !57
244
+ %229 = bitcast i32 %228 to float, !dbg !57
245
+ %230 = fsub float %223, %215, !dbg !42
246
+ %231 = fadd float %210, %229, !dbg !59
247
+ %232 = fcmp oeq float %231, 0.000000e+00, !dbg !60
248
+ %233 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %229, float %231) #6, !dbg !49
249
+ %234 = select i1 %232, float 0.000000e+00, float %233, !dbg !61
250
+ %235 = fmul float %234, %230, !dbg !50
251
+ %236 = fadd float %215, %235, !dbg !51
252
+ %237 = fadd float %220, %226, !dbg !52
253
+ %238 = fmul float %230, %230, !dbg !53
254
+ %239 = fmul float %210, %238, !dbg !56
255
+ %240 = fmul float %234, %239, !dbg !54
256
+ %241 = fadd float %237, %240, !dbg !55
257
+ %242 = icmp eq i32 %10, 0, !dbg !57
258
+ %243 = shl nuw nsw i32 %13, 1, !dbg !57
259
+ %244 = or i32 %243, %15, !dbg !57
260
+ %245 = zext nneg i32 %244 to i64, !dbg !57
261
+ %246 = getelementptr float, ptr addrspace(3) @global_smem, i64 %245, !dbg !57
262
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %246, float %236, i1 %242) #6, !dbg !57
263
+ %247 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), i64 %245, !dbg !57
264
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %247, float %241, i1 %242) #6, !dbg !57
265
+ %248 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 32), i64 %245, !dbg !57
266
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %248, float %231, i1 %242) #6, !dbg !57
267
+ tail call void @llvm.nvvm.barrier0(), !dbg !57
268
+ %249 = icmp slt i32 %9, 4, !dbg !57
269
+ %250 = sext i32 %9 to i64, !dbg !57
270
+ %251 = getelementptr float, ptr addrspace(3) @global_smem, i64 %250, !dbg !57
271
+ %252 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %251, i1 %249) #6, !dbg !57
272
+ %253 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), i64 %250, !dbg !57
273
+ %254 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %253, i1 %249) #6, !dbg !57
274
+ %255 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 32), i64 %250, !dbg !57
275
+ %256 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %255, i1 %249) #6, !dbg !57
276
+ %257 = bitcast float %252 to i32, !dbg !57
277
+ %258 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %257, i32 1, i32 31), !dbg !57
278
+ %259 = bitcast i32 %258 to float, !dbg !57
279
+ %260 = bitcast float %254 to i32, !dbg !57
280
+ %261 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %260, i32 1, i32 31), !dbg !57
281
+ %262 = bitcast i32 %261 to float, !dbg !57
282
+ %263 = bitcast float %256 to i32, !dbg !57
283
+ %264 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %263, i32 1, i32 31), !dbg !57
284
+ %265 = bitcast i32 %264 to float, !dbg !57
285
+ %266 = fsub float %259, %252, !dbg !42
286
+ %267 = fadd float %256, %265, !dbg !59
287
+ %268 = fcmp oeq float %267, 0.000000e+00, !dbg !60
288
+ %269 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %265, float %267) #6, !dbg !49
289
+ %270 = select i1 %268, float 0.000000e+00, float %269, !dbg !61
290
+ %271 = fmul float %266, %270, !dbg !50
291
+ %272 = fadd float %252, %271, !dbg !51
292
+ %273 = fadd float %254, %262, !dbg !52
293
+ %274 = fmul float %266, %266, !dbg !53
294
+ %275 = fmul float %256, %274, !dbg !56
295
+ %276 = fmul float %275, %270, !dbg !54
296
+ %277 = fadd float %273, %276, !dbg !55
297
+ %278 = icmp eq i32 %14, 0, !dbg !57
298
+ %279 = and i1 %249, %278, !dbg !57
299
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %251, float %272, i1 %279) #6, !dbg !57
300
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %253, float %277, i1 %279) #6, !dbg !57
301
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %255, float %267, i1 %279) #6, !dbg !57
302
+ tail call void @llvm.nvvm.barrier0(), !dbg !57
303
+ %280 = zext nneg i32 %243 to i64, !dbg !57
304
+ %281 = getelementptr float, ptr addrspace(3) @global_smem, i64 %280, !dbg !57
305
+ %282 = load float, ptr addrspace(3) %281, align 4, !dbg !57
306
+ %283 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), i64 %280, !dbg !57
307
+ %284 = load float, ptr addrspace(3) %283, align 4, !dbg !57
308
+ %285 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %36, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !62
309
+ %286 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_first.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %50, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !63
310
+ %287 = extractvalue { i32, i32 } %286, 0, !dbg !63
311
+ %288 = extractvalue { i32, i32 } %286, 1, !dbg !63
312
+ %289 = trunc i32 %287 to i16, !dbg !63
313
+ %extelt.offset2 = lshr i32 %287, 16, !dbg !63
314
+ %290 = trunc i32 %extelt.offset2 to i16, !dbg !63
315
+ %291 = trunc i32 %288 to i16, !dbg !63
316
+ %extelt.offset3 = lshr i32 %288, 16, !dbg !63
317
+ %292 = trunc i32 %extelt.offset3 to i16, !dbg !63
318
+ %293 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %289) #6, !dbg !64
319
+ %294 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %290) #6, !dbg !64
320
+ %295 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %291) #6, !dbg !64
321
+ %296 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %292) #6, !dbg !64
322
+ %297 = zext nneg i32 %18 to i64, !dbg !65
323
+ %298 = getelementptr float, ptr addrspace(1) %4, i64 %297, !dbg !65
324
+ %299 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_last.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %298, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !66
325
+ br i1 %66, label %300, label %301, !dbg !67
326
+
327
+ 300: ; preds = %68
328
+ tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 883, ptr nonnull @assertFunc_1, i64 1), !dbg !67
329
+ br label %301, !dbg !67
330
+
331
+ 301: ; preds = %300, %68
332
+ %302 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %74, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68
333
+ %303 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %284, float 2.560000e+02) #6, !dbg !69
334
+ %304 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %284, float 2.560000e+02) #6, !dbg !69
335
+ %305 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %284, float 2.560000e+02) #6, !dbg !69
336
+ %306 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %284, float 2.560000e+02) #6, !dbg !69
337
+ %307 = fadd float %303, 0x3EE4F8B580000000, !dbg !70
338
+ %308 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
339
+ %.not.i = icmp eq i32 %308, 0, !dbg !71
340
+ br i1 %.not.i, label %311, label %309, !dbg !71
341
+
342
+ 309: ; preds = %301
343
+ %310 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %307), !dbg !71
344
+ br label %__nv_rsqrtf.exit, !dbg !71
345
+
346
+ 311: ; preds = %301
347
+ %312 = tail call float @llvm.nvvm.rsqrt.approx.f(float %307), !dbg !71
348
+ br label %__nv_rsqrtf.exit, !dbg !71
349
+
350
+ __nv_rsqrtf.exit: ; preds = %309, %311
351
+ %.0.i = phi float [ %310, %309 ], [ %312, %311 ], !dbg !71
352
+ %313 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
353
+ %314 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
354
+ %315 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
355
+ %316 = extractvalue { i32, i32, i32, i32 } %302, 3, !dbg !68
356
+ %317 = bitcast i32 %316 to float, !dbg !68
357
+ %318 = extractvalue { i32, i32, i32, i32 } %285, 3, !dbg !62
358
+ %319 = bitcast i32 %318 to float, !dbg !62
359
+ %320 = fadd float %319, %317, !dbg !72
360
+ %321 = fadd float %296, %320, !dbg !73
361
+ %322 = fsub float %321, %282, !dbg !74
362
+ %323 = extractvalue { i32, i32, i32, i32 } %302, 2, !dbg !68
363
+ %324 = bitcast i32 %323 to float, !dbg !68
364
+ %325 = extractvalue { i32, i32, i32, i32 } %285, 2, !dbg !62
365
+ %326 = bitcast i32 %325 to float, !dbg !62
366
+ %327 = fadd float %326, %324, !dbg !72
367
+ %328 = fadd float %295, %327, !dbg !73
368
+ %329 = fsub float %328, %282, !dbg !74
369
+ %330 = extractvalue { i32, i32, i32, i32 } %302, 1, !dbg !68
370
+ %331 = bitcast i32 %330 to float, !dbg !68
371
+ %332 = extractvalue { i32, i32, i32, i32 } %285, 1, !dbg !62
372
+ %333 = bitcast i32 %332 to float, !dbg !62
373
+ %334 = fadd float %333, %331, !dbg !72
374
+ %335 = fadd float %294, %334, !dbg !73
375
+ %336 = fsub float %335, %282, !dbg !74
376
+ %337 = extractvalue { i32, i32, i32, i32 } %302, 0, !dbg !68
377
+ %338 = bitcast i32 %337 to float, !dbg !68
378
+ %339 = extractvalue { i32, i32, i32, i32 } %285, 0, !dbg !62
379
+ %340 = bitcast i32 %339 to float, !dbg !62
380
+ %341 = fadd float %340, %338, !dbg !72
381
+ %342 = fadd float %293, %341, !dbg !73
382
+ %343 = fsub float %342, %282, !dbg !74
383
+ %344 = extractvalue { i32, i32 } %299, 0, !dbg !66
384
+ %345 = extractvalue { i32, i32 } %299, 1, !dbg !66
385
+ %346 = fmul float %343, %.0.i, !dbg !75
386
+ %347 = fmul float %336, %.0.i, !dbg !75
387
+ %348 = fmul float %329, %.0.i, !dbg !75
388
+ %349 = fmul float %322, %.0.i, !dbg !75
389
+ tail call void @llvm.nvvm.barrier0(), !dbg !76
390
+ %350 = getelementptr float, ptr addrspace(3) @global_smem, i64 %297, !dbg !76
391
+ %351 = insertelement <2 x i32> undef, i32 %344, i64 0, !dbg !76
392
+ %352 = insertelement <2 x i32> %351, i32 %345, i64 1, !dbg !76
393
+ store <2 x i32> %352, ptr addrspace(3) %350, align 8, !dbg !76
394
+ tail call void @llvm.nvvm.barrier0(), !dbg !76
395
+ %353 = getelementptr float, ptr addrspace(3) @global_smem, i64 %72, !dbg !76
396
+ %354 = load float, ptr addrspace(3) %353, align 16, !dbg !76
397
+ %355 = getelementptr inbounds <4 x float>, ptr addrspace(3) %353, i64 0, i64 1, !dbg !76
398
+ %356 = load float, ptr addrspace(3) %355, align 4, !dbg !76
399
+ %357 = getelementptr inbounds <4 x float>, ptr addrspace(3) %353, i64 0, i64 2, !dbg !76
400
+ %358 = load float, ptr addrspace(3) %357, align 8, !dbg !76
401
+ %359 = getelementptr inbounds <4 x float>, ptr addrspace(3) %353, i64 0, i64 3, !dbg !76
402
+ %360 = load float, ptr addrspace(3) %359, align 4, !dbg !76
403
+ %361 = fmul float %346, %354, !dbg !76
404
+ %362 = fmul float %347, %356, !dbg !76
405
+ %363 = fmul float %348, %358, !dbg !76
406
+ %364 = fmul float %349, %360, !dbg !76
407
+ %365 = getelementptr i16, ptr addrspace(1) %5, i64 %49, !dbg !77
408
+ %366 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %361) #6, !dbg !78
409
+ %367 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %362) #6, !dbg !78
410
+ %368 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %363) #6, !dbg !78
411
+ %369 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %364) #6, !dbg !78
412
+ %370 = insertelement <2 x i16> undef, i16 %366, i64 0, !dbg !78
413
+ %371 = insertelement <2 x i16> %370, i16 %367, i64 1, !dbg !78
414
+ %372 = bitcast <2 x i16> %371 to i32, !dbg !78
415
+ %373 = insertelement <2 x i16> undef, i16 %368, i64 0, !dbg !78
416
+ %374 = insertelement <2 x i16> %373, i16 %369, i64 1, !dbg !78
417
+ %375 = bitcast <2 x i16> %374 to i32, !dbg !78
418
+ tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %372, i32 %375, ptr addrspace(1) %365, i1 true) #6, !dbg !78
419
+ ret void, !dbg !79
420
+ }
421
+
422
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
423
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
424
+
425
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
426
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
427
+
428
+ ; Function Attrs: convergent nocallback nounwind
429
+ declare void @llvm.nvvm.barrier0() #2
430
+
431
+ ; Function Attrs: alwaysinline nounwind
432
+ define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
433
+ %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
434
+ %.not = icmp eq i32 %1, 0
435
+ br i1 %.not, label %4, label %2
436
+
437
+ 2: ; preds = %0
438
+ %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
439
+ br label %6
440
+
441
+ 4: ; preds = %0
442
+ %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
443
+ br label %6
444
+
445
+ 6: ; preds = %4, %2
446
+ %.0 = phi float [ %3, %2 ], [ %5, %4 ]
447
+ ret float %.0
448
+ }
449
+
450
+ declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
451
+
452
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
453
+ declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
454
+
455
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
456
+ declare float @llvm.nvvm.rsqrt.approx.f(float) #5
457
+
458
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
459
+ attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
460
+ attributes #2 = { convergent nocallback nounwind }
461
+ attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
462
+ attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
463
+ attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
464
+ attributes #6 = { nounwind }
465
+
466
+ !llvm.module.flags = !{!0, !1}
467
+ !llvm.dbg.cu = !{!2}
468
+ !nvvm.annotations = !{!4, !5, !5, !4}
469
+ !llvm.ident = !{!6}
470
+
471
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
472
+ !1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
473
+ !2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
474
+ !3 = !DIFile(filename: "cpn3lawg65lpi63gv6c6pn4oikhg6qva2h2qjdpxe6qj4lvttwez.py", directory: "/tmp/torchinductor_root/pn")
475
+ !4 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"kernel", i32 1}
476
+ !5 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"maxntidx", i32 128}
477
+ !6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
478
+ !7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6de7de", linkageName: "triton__0d1d2d3d4d5d6de7de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
479
+ !8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
480
+ !9 = !{}
481
+ !10 = !DILocation(line: 22, column: 44, scope: !7)
482
+ !11 = !DILocation(line: 24, column: 33, scope: !7)
483
+ !12 = !DILocation(line: 21, column: 28, scope: !7)
484
+ !13 = !DILocation(line: 21, column: 33, scope: !7)
485
+ !14 = !DILocation(line: 22, column: 23, scope: !7)
486
+ !15 = !DILocation(line: 26, column: 30, scope: !7)
487
+ !16 = !DILocation(line: 26, column: 35, scope: !7)
488
+ !17 = !DILocation(line: 27, column: 18, scope: !7)
489
+ !18 = !DILocation(line: 35, column: 44, scope: !7)
490
+ !19 = !DILocation(line: 35, column: 40, scope: !7)
491
+ !20 = !DILocation(line: 35, column: 34, scope: !7)
492
+ !21 = !DILocation(line: 35, column: 50, scope: !7)
493
+ !22 = !DILocation(line: 36, column: 44, scope: !7)
494
+ !23 = !DILocation(line: 36, column: 40, scope: !7)
495
+ !24 = !DILocation(line: 36, column: 34, scope: !7)
496
+ !25 = !DILocation(line: 36, column: 50, scope: !7)
497
+ !26 = !DILocation(line: 36, column: 101, scope: !7)
498
+ !27 = !DILocation(line: 37, column: 22, scope: !7)
499
+ !28 = !DILocation(line: 38, column: 22, scope: !7)
500
+ !29 = !DILocation(line: 39, column: 36, scope: !7)
501
+ !30 = !DILocation(line: 40, column: 40, scope: !7)
502
+ !31 = !DILocation(line: 40, column: 55, scope: !7)
503
+ !32 = !DILocation(line: 41, column: 44, scope: !7)
504
+ !33 = !DILocation(line: 41, column: 40, scope: !7)
505
+ !34 = !DILocation(line: 41, column: 34, scope: !7)
506
+ !35 = !DILocation(line: 41, column: 52, scope: !7)
507
+ !36 = !DILocation(line: 42, column: 22, scope: !7)
508
+ !37 = !DILocation(line: 44, column: 22, scope: !7)
509
+ !38 = !DILocation(line: 98, column: 22, scope: !39, inlinedAt: !41)
510
+ !39 = distinct !DILexicalBlockFile(scope: !7, file: !40, discriminator: 0)
511
+ !40 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
512
+ !41 = !DILocation(line: 47, column: 41, scope: !39)
513
+ !42 = !DILocation(line: 108, column: 21, scope: !43, inlinedAt: !44)
514
+ !43 = distinct !DILexicalBlockFile(scope: !39, file: !40, discriminator: 0)
515
+ !44 = !DILocation(line: 120, column: 46, scope: !43, inlinedAt: !45)
516
+ !45 = !DILocation(line: 53, column: 44, scope: !43)
517
+ !46 = !DILocation(line: 101, column: 22, scope: !39, inlinedAt: !41)
518
+ !47 = !DILocation(line: 101, column: 30, scope: !39, inlinedAt: !41)
519
+ !48 = !DILocation(line: 101, column: 13, scope: !39, inlinedAt: !41)
520
+ !49 = !DILocation(line: 110, column: 60, scope: !43, inlinedAt: !44)
521
+ !50 = !DILocation(line: 112, column: 25, scope: !43, inlinedAt: !44)
522
+ !51 = !DILocation(line: 112, column: 17, scope: !43, inlinedAt: !44)
523
+ !52 = !DILocation(line: 113, column: 15, scope: !43, inlinedAt: !44)
524
+ !53 = !DILocation(line: 113, column: 30, scope: !43, inlinedAt: !44)
525
+ !54 = !DILocation(line: 113, column: 49, scope: !43, inlinedAt: !44)
526
+ !55 = !DILocation(line: 113, column: 22, scope: !43, inlinedAt: !44)
527
+ !56 = !DILocation(line: 113, column: 38, scope: !43, inlinedAt: !44)
528
+ !57 = !DILocation(line: 120, column: 46, scope: !39, inlinedAt: !58)
529
+ !58 = !DILocation(line: 53, column: 44, scope: !39)
530
+ !59 = !DILocation(line: 109, column: 28, scope: !43, inlinedAt: !44)
531
+ !60 = !DILocation(line: 110, column: 39, scope: !43, inlinedAt: !44)
532
+ !61 = !DILocation(line: 110, column: 49, scope: !43, inlinedAt: !44)
533
+ !62 = !DILocation(line: 62, column: 51, scope: !7)
534
+ !63 = !DILocation(line: 63, column: 51, scope: !7)
535
+ !64 = !DILocation(line: 63, column: 103, scope: !7)
536
+ !65 = !DILocation(line: 64, column: 35, scope: !7)
537
+ !66 = !DILocation(line: 64, column: 40, scope: !7)
538
+ !67 = !DILocation(line: 68, column: 57, scope: !7)
539
+ !68 = !DILocation(line: 69, column: 54, scope: !7)
540
+ !69 = !DILocation(line: 75, column: 24, scope: !7)
541
+ !70 = !DILocation(line: 77, column: 24, scope: !7)
542
+ !71 = !DILocation(line: 78, column: 30, scope: !7)
543
+ !72 = !DILocation(line: 70, column: 24, scope: !7)
544
+ !73 = !DILocation(line: 72, column: 24, scope: !7)
545
+ !74 = !DILocation(line: 73, column: 24, scope: !7)
546
+ !75 = !DILocation(line: 79, column: 24, scope: !7)
547
+ !76 = !DILocation(line: 80, column: 24, scope: !7)
548
+ !77 = !DILocation(line: 82, column: 29, scope: !7)
549
+ !78 = !DILocation(line: 82, column: 52, scope: !7)
550
+ !79 = !DILocation(line: 58, column: 4, scope: !7)
.triton/dump/473cf6e25c3e63117cd59fc0ed04b89f/triton_.ttgir ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [2, 2], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
2
+ #blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 2], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
3
+ #blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
4
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
5
+ tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
6
+ %cst = arith.constant dense<512> : tensor<2x1xi32, #blocked>
7
+ %cst_0 = arith.constant dense<256> : tensor<1x256xi32, #blocked>
8
+ %cst_1 = arith.constant dense<256> : tensor<1x256xi32, #blocked1>
9
+ %cst_2 = arith.constant dense<256> : tensor<2x1xi32, #blocked>
10
+ %cst_3 = arith.constant dense<1.000000e+00> : tensor<1x256xf32, #blocked>
11
+ %cst_4 = arith.constant dense<0.000000e+00> : tensor<1x256xf32, #blocked>
12
+ %cst_5 = arith.constant dense<256> : tensor<2x1xi64, #blocked>
13
+ %cst_6 = arith.constant dense<50257> : tensor<2x1xi64, #blocked>
14
+ %cst_7 = arith.constant dense<0> : tensor<2x1xi64, #blocked>
15
+ %cst_8 = arith.constant dense<0> : tensor<2x1xi64, #blocked2>
16
+ %cst_9 = arith.constant dense<50257> : tensor<2x1xi64, #blocked2>
17
+ %cst_10 = arith.constant 0.000000e+00 : f32
18
+ %cst_11 = arith.constant dense<9.99999974E-6> : tensor<2x1xf32, #blocked>
19
+ %cst_12 = arith.constant dense<2.560000e+02> : tensor<2x1xf32, #blocked>
20
+ %cst_13 = arith.constant dense<0.000000e+00> : tensor<2x256xf32, #blocked>
21
+ %cst_14 = arith.constant dense<0.000000e+00> : tensor<2x256xbf16, #blocked>
22
+ %cst_15 = arith.constant dense<0.000000e+00> : tensor<1x256xf32, #blocked1>
23
+ %c2_i32 = arith.constant 2 : i32
24
+ %0 = tt.get_program_id x : i32
25
+ %1 = arith.muli %0, %c2_i32 : i32
26
+ %2 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
27
+ %3 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>
28
+ %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<2xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<2x1xi32, #blocked>
29
+ %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<2xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>) -> tensor<2x1xi32, #blocked2>
30
+ %6 = tt.splat %1 : (i32) -> tensor<2x1xi32, #blocked>
31
+ %7 = tt.splat %1 : (i32) -> tensor<2x1xi32, #blocked2>
32
+ %8 = arith.addi %6, %4 : tensor<2x1xi32, #blocked>
33
+ %9 = arith.addi %7, %5 : tensor<2x1xi32, #blocked2>
34
+ %10 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
35
+ %11 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>
36
+ %12 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x256xi32, #blocked>
37
+ %13 = tt.expand_dims %11 {axis = 0 : i32} : (tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>) -> tensor<1x256xi32, #blocked1>
38
+ %14 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<2x1x!tt.ptr<i64, 1>, #blocked>
39
+ %15 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<2x1x!tt.ptr<i64, 1>, #blocked2>
40
+ %16 = tt.addptr %14, %8 : tensor<2x1x!tt.ptr<i64, 1>, #blocked>, tensor<2x1xi32, #blocked>
41
+ %17 = tt.addptr %15, %9 : tensor<2x1x!tt.ptr<i64, 1>, #blocked2>, tensor<2x1xi32, #blocked2>
42
+ %18 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x1xi64, #blocked>
43
+ %19 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x1xi64, #blocked2>
44
+ %20 = arith.remsi %8, %cst : tensor<2x1xi32, #blocked>
45
+ %21 = arith.cmpi slt, %12, %cst_0 : tensor<1x256xi32, #blocked>
46
+ %22 = arith.cmpi slt, %13, %cst_1 : tensor<1x256xi32, #blocked1>
47
+ %23 = arith.muli %20, %cst_2 : tensor<2x1xi32, #blocked>
48
+ %24 = tt.broadcast %12 : (tensor<1x256xi32, #blocked>) -> tensor<2x256xi32, #blocked>
49
+ %25 = tt.broadcast %23 : (tensor<2x1xi32, #blocked>) -> tensor<2x256xi32, #blocked>
50
+ %26 = arith.addi %24, %25 : tensor<2x256xi32, #blocked>
51
+ %27 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<2x256x!tt.ptr<f32, 1>, #blocked>
52
+ %28 = tt.addptr %27, %26 : tensor<2x256x!tt.ptr<f32, 1>, #blocked>, tensor<2x256xi32, #blocked>
53
+ %29 = tt.broadcast %21 : (tensor<1x256xi1, #blocked>) -> tensor<2x256xi1, #blocked>
54
+ %30 = tt.load %28, %29, %cst_13 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xf32, #blocked>
55
+ %31 = arith.muli %8, %cst_2 : tensor<2x1xi32, #blocked>
56
+ %32 = tt.broadcast %31 : (tensor<2x1xi32, #blocked>) -> tensor<2x256xi32, #blocked>
57
+ %33 = arith.addi %24, %32 : tensor<2x256xi32, #blocked>
58
+ %34 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<2x256x!tt.ptr<bf16, 1>, #blocked>
59
+ %35 = tt.addptr %34, %33 : tensor<2x256x!tt.ptr<bf16, 1>, #blocked>, tensor<2x256xi32, #blocked>
60
+ %36 = tt.load %35, %29, %cst_14 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xbf16, #blocked>
61
+ %37 = arith.extf %36 : tensor<2x256xbf16, #blocked> to tensor<2x256xf32, #blocked>
62
+ %38 = arith.addi %18, %cst_6 : tensor<2x1xi64, #blocked>
63
+ %39 = arith.addi %19, %cst_9 : tensor<2x1xi64, #blocked2>
64
+ %40 = arith.cmpi slt, %18, %cst_7 : tensor<2x1xi64, #blocked>
65
+ %41 = arith.cmpi slt, %19, %cst_8 : tensor<2x1xi64, #blocked2>
66
+ %42 = arith.select %40, %38, %18 : tensor<2x1xi1, #blocked>, tensor<2x1xi64, #blocked>
67
+ %43 = arith.select %41, %39, %19 : tensor<2x1xi1, #blocked2>, tensor<2x1xi64, #blocked2>
68
+ %44 = arith.cmpi sge, %43, %cst_8 : tensor<2x1xi64, #blocked2>
69
+ %45 = arith.cmpi slt, %43, %cst_9 : tensor<2x1xi64, #blocked2>
70
+ %46 = arith.andi %44, %45 : tensor<2x1xi1, #blocked2>
71
+ tt.assert %46, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<2x1xi1, #blocked2>
72
+ %47 = arith.muli %42, %cst_5 : tensor<2x1xi64, #blocked>
73
+ %48 = tt.broadcast %47 : (tensor<2x1xi64, #blocked>) -> tensor<2x256xi64, #blocked>
74
+ %49 = arith.extsi %12 : tensor<1x256xi32, #blocked> to tensor<1x256xi64, #blocked>
75
+ %50 = tt.broadcast %49 : (tensor<1x256xi64, #blocked>) -> tensor<2x256xi64, #blocked>
76
+ %51 = arith.addi %50, %48 : tensor<2x256xi64, #blocked>
77
+ %52 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<2x256x!tt.ptr<f32, 1>, #blocked>
78
+ %53 = tt.addptr %52, %51 : tensor<2x256x!tt.ptr<f32, 1>, #blocked>, tensor<2x256xi64, #blocked>
79
+ %54 = tt.load %53, %29, %cst_13 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xf32, #blocked>
80
+ %55 = arith.addf %54, %30 : tensor<2x256xf32, #blocked>
81
+ %56 = arith.addf %55, %37 : tensor<2x256xf32, #blocked>
82
+ %57 = arith.addf %56, %cst_13 : tensor<2x256xf32, #blocked>
83
+ %58 = arith.subf %56, %57 : tensor<2x256xf32, #blocked>
84
+ %59 = arith.mulf %56, %58 : tensor<2x256xf32, #blocked>
85
+ %60 = arith.addf %59, %cst_13 : tensor<2x256xf32, #blocked>
86
+ %61 = arith.select %29, %57, %cst_13 : tensor<2x256xi1, #blocked>, tensor<2x256xf32, #blocked>
87
+ %62 = arith.select %29, %60, %cst_13 : tensor<2x256xi1, #blocked>, tensor<2x256xf32, #blocked>
88
+ %63 = arith.select %21, %cst_3, %cst_4 : tensor<1x256xi1, #blocked>, tensor<1x256xf32, #blocked>
89
+ %64 = tt.broadcast %63 : (tensor<1x256xf32, #blocked>) -> tensor<2x256xf32, #blocked>
90
+ %65:3 = "tt.reduce"(%61, %62, %64) <{axis = 1 : i32}> ({
91
+ ^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32):
92
+ %90 = arith.subf %arg11, %arg8 : f32
93
+ %91 = arith.addf %arg10, %arg13 : f32
94
+ %92 = arith.cmpf oeq, %91, %cst_10 : f32
95
+ %93 = arith.divf %arg13, %91 : f32
96
+ %94 = arith.select %92, %cst_10, %93 : f32
97
+ %95 = arith.mulf %90, %94 : f32
98
+ %96 = arith.addf %arg8, %95 : f32
99
+ %97 = arith.addf %arg9, %arg12 : f32
100
+ %98 = arith.mulf %90, %90 : f32
101
+ %99 = arith.mulf %98, %arg10 : f32
102
+ %100 = arith.mulf %99, %94 : f32
103
+ %101 = arith.addf %97, %100 : f32
104
+ tt.reduce.return %96, %101, %91 : f32, f32, f32
105
+ }) : (tensor<2x256xf32, #blocked>, tensor<2x256xf32, #blocked>, tensor<2x256xf32, #blocked>) -> (tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>)
106
+ %66 = tt.expand_dims %65#0 {axis = 1 : i32} : (tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<2x1xf32, #blocked>
107
+ %67 = tt.expand_dims %65#1 {axis = 1 : i32} : (tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<2x1xf32, #blocked>
108
+ %68 = tt.load %28, %29, %cst_13 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xf32, #blocked>
109
+ %69 = tt.load %35, %29, %cst_14 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<2x256xbf16, #blocked>
110
+ %70 = arith.extf %69 : tensor<2x256xbf16, #blocked> to tensor<2x256xf32, #blocked>
111
+ %71 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<1x256x!tt.ptr<f32, 1>, #blocked1>
112
+ %72 = tt.addptr %71, %13 : tensor<1x256x!tt.ptr<f32, 1>, #blocked1>, tensor<1x256xi32, #blocked1>
113
+ %73 = tt.load %72, %22, %cst_15 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32, #blocked1>
114
+ tt.assert %46, "index out of bounds: 0 <= tmp16 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<2x1xi1, #blocked2>
115
+ %74 = tt.load %53, %29, %cst_13 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<2x256xf32, #blocked>
116
+ %75 = arith.addf %74, %68 : tensor<2x256xf32, #blocked>
117
+ %76 = arith.addf %75, %70 : tensor<2x256xf32, #blocked>
118
+ %77 = tt.broadcast %66 : (tensor<2x1xf32, #blocked>) -> tensor<2x256xf32, #blocked>
119
+ %78 = arith.subf %76, %77 : tensor<2x256xf32, #blocked>
120
+ %79 = arith.divf %67, %cst_12 : tensor<2x1xf32, #blocked>
121
+ %80 = arith.addf %79, %cst_11 : tensor<2x1xf32, #blocked>
122
+ %81 = tt.extern_elementwise %80 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32, #blocked>) -> tensor<2x1xf32, #blocked>
123
+ %82 = tt.broadcast %81 : (tensor<2x1xf32, #blocked>) -> tensor<2x256xf32, #blocked>
124
+ %83 = arith.mulf %78, %82 : tensor<2x256xf32, #blocked>
125
+ %84 = triton_gpu.convert_layout %73 : (tensor<1x256xf32, #blocked1>) -> tensor<1x256xf32, #blocked>
126
+ %85 = tt.broadcast %84 : (tensor<1x256xf32, #blocked>) -> tensor<2x256xf32, #blocked>
127
+ %86 = arith.mulf %83, %85 : tensor<2x256xf32, #blocked>
128
+ %87 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<2x256x!tt.ptr<bf16, 1>, #blocked>
129
+ %88 = tt.addptr %87, %33 : tensor<2x256x!tt.ptr<bf16, 1>, #blocked>, tensor<2x256xi32, #blocked>
130
+ %89 = arith.truncf %86 : tensor<2x256xf32, #blocked> to tensor<2x256xbf16, #blocked>
131
+ tt.store %88, %89, %29 {cache = 1 : i32, evict = 1 : i32} : tensor<2x256xbf16, #blocked>
132
+ tt.return
133
+ }
134
+ }
.triton/dump/473cf6e25c3e63117cd59fc0ed04b89f/triton_.ttir ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %cst = arith.constant dense<0.000000e+00> : tensor<2x256xbf16>
4
+ %cst_0 = arith.constant dense<1.000000e+00> : tensor<1x256xf32>
5
+ %cst_1 = arith.constant dense<0.000000e+00> : tensor<1x256xf32>
6
+ %cst_2 = arith.constant 0.000000e+00 : f32
7
+ %cst_3 = arith.constant dense<256> : tensor<2x1xi64>
8
+ %cst_4 = arith.constant dense<50257> : tensor<2x1xi64>
9
+ %cst_5 = arith.constant dense<0> : tensor<2x1xi64>
10
+ %cst_6 = arith.constant dense<9.99999974E-6> : tensor<2x1xf32>
11
+ %cst_7 = arith.constant dense<2.560000e+02> : tensor<2x1xf32>
12
+ %cst_8 = arith.constant dense<0.000000e+00> : tensor<2x256xf32>
13
+ %cst_9 = arith.constant dense<256> : tensor<2x1xi32>
14
+ %cst_10 = arith.constant dense<256> : tensor<1x256xi32>
15
+ %cst_11 = arith.constant dense<512> : tensor<2x1xi32>
16
+ %c2_i32 = arith.constant 2 : i32
17
+ %0 = tt.get_program_id x : i32
18
+ %1 = arith.muli %0, %c2_i32 : i32
19
+ %2 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32>
20
+ %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<2xi32>) -> tensor<2x1xi32>
21
+ %4 = tt.splat %1 : (i32) -> tensor<2x1xi32>
22
+ %5 = arith.addi %4, %3 : tensor<2x1xi32>
23
+ %6 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
24
+ %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<256xi32>) -> tensor<1x256xi32>
25
+ %8 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<2x1x!tt.ptr<i64, 1>>
26
+ %9 = tt.addptr %8, %5 : tensor<2x1x!tt.ptr<i64, 1>>, tensor<2x1xi32>
27
+ %10 = tt.load %9 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x1xi64>
28
+ %11 = arith.remsi %5, %cst_11 : tensor<2x1xi32>
29
+ %12 = arith.cmpi slt, %7, %cst_10 : tensor<1x256xi32>
30
+ %13 = arith.muli %11, %cst_9 : tensor<2x1xi32>
31
+ %14 = tt.broadcast %7 : (tensor<1x256xi32>) -> tensor<2x256xi32>
32
+ %15 = tt.broadcast %13 : (tensor<2x1xi32>) -> tensor<2x256xi32>
33
+ %16 = arith.addi %14, %15 : tensor<2x256xi32>
34
+ %17 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<2x256x!tt.ptr<f32, 1>>
35
+ %18 = tt.addptr %17, %16 : tensor<2x256x!tt.ptr<f32, 1>>, tensor<2x256xi32>
36
+ %19 = tt.broadcast %12 : (tensor<1x256xi1>) -> tensor<2x256xi1>
37
+ %20 = tt.load %18, %19, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xf32>
38
+ %21 = arith.muli %5, %cst_9 : tensor<2x1xi32>
39
+ %22 = tt.broadcast %21 : (tensor<2x1xi32>) -> tensor<2x256xi32>
40
+ %23 = arith.addi %14, %22 : tensor<2x256xi32>
41
+ %24 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<2x256x!tt.ptr<bf16, 1>>
42
+ %25 = tt.addptr %24, %23 : tensor<2x256x!tt.ptr<bf16, 1>>, tensor<2x256xi32>
43
+ %26 = tt.load %25, %19, %cst {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xbf16>
44
+ %27 = arith.extf %26 : tensor<2x256xbf16> to tensor<2x256xf32>
45
+ %28 = arith.addi %10, %cst_4 : tensor<2x1xi64>
46
+ %29 = arith.cmpi slt, %10, %cst_5 : tensor<2x1xi64>
47
+ %30 = arith.select %29, %28, %10 : tensor<2x1xi1>, tensor<2x1xi64>
48
+ %31 = arith.cmpi sge, %30, %cst_5 : tensor<2x1xi64>
49
+ %32 = arith.cmpi slt, %30, %cst_4 : tensor<2x1xi64>
50
+ %33 = arith.andi %31, %32 : tensor<2x1xi1>
51
+ tt.assert %33, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<2x1xi1>
52
+ %34 = arith.muli %30, %cst_3 : tensor<2x1xi64>
53
+ %35 = tt.broadcast %34 : (tensor<2x1xi64>) -> tensor<2x256xi64>
54
+ %36 = arith.extsi %7 : tensor<1x256xi32> to tensor<1x256xi64>
55
+ %37 = tt.broadcast %36 : (tensor<1x256xi64>) -> tensor<2x256xi64>
56
+ %38 = arith.addi %37, %35 : tensor<2x256xi64>
57
+ %39 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<2x256x!tt.ptr<f32, 1>>
58
+ %40 = tt.addptr %39, %38 : tensor<2x256x!tt.ptr<f32, 1>>, tensor<2x256xi64>
59
+ %41 = tt.load %40, %19, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xf32>
60
+ %42 = arith.addf %41, %20 : tensor<2x256xf32>
61
+ %43 = arith.addf %42, %27 : tensor<2x256xf32>
62
+ %44 = arith.addf %43, %cst_8 : tensor<2x256xf32>
63
+ %45 = arith.subf %43, %44 : tensor<2x256xf32>
64
+ %46 = arith.mulf %43, %45 : tensor<2x256xf32>
65
+ %47 = arith.addf %46, %cst_8 : tensor<2x256xf32>
66
+ %48 = arith.select %19, %44, %cst_8 : tensor<2x256xi1>, tensor<2x256xf32>
67
+ %49 = arith.select %19, %47, %cst_8 : tensor<2x256xi1>, tensor<2x256xf32>
68
+ %50 = arith.select %12, %cst_0, %cst_1 : tensor<1x256xi1>, tensor<1x256xf32>
69
+ %51 = tt.broadcast %50 : (tensor<1x256xf32>) -> tensor<2x256xf32>
70
+ %52:3 = "tt.reduce"(%48, %49, %51) <{axis = 1 : i32}> ({
71
+ ^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32):
72
+ %76 = arith.subf %arg11, %arg8 : f32
73
+ %77 = arith.addf %arg10, %arg13 : f32
74
+ %78 = arith.cmpf oeq, %77, %cst_2 : f32
75
+ %79 = arith.divf %arg13, %77 : f32
76
+ %80 = arith.select %78, %cst_2, %79 : f32
77
+ %81 = arith.mulf %76, %80 : f32
78
+ %82 = arith.addf %arg8, %81 : f32
79
+ %83 = arith.addf %arg9, %arg12 : f32
80
+ %84 = arith.mulf %76, %76 : f32
81
+ %85 = arith.mulf %84, %arg10 : f32
82
+ %86 = arith.mulf %85, %80 : f32
83
+ %87 = arith.addf %83, %86 : f32
84
+ tt.reduce.return %82, %87, %77 : f32, f32, f32
85
+ }) : (tensor<2x256xf32>, tensor<2x256xf32>, tensor<2x256xf32>) -> (tensor<2xf32>, tensor<2xf32>, tensor<2xf32>)
86
+ %53 = tt.expand_dims %52#0 {axis = 1 : i32} : (tensor<2xf32>) -> tensor<2x1xf32>
87
+ %54 = tt.expand_dims %52#1 {axis = 1 : i32} : (tensor<2xf32>) -> tensor<2x1xf32>
88
+ %55 = tt.load %18, %19, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xf32>
89
+ %56 = tt.load %25, %19, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<2x256xbf16>
90
+ %57 = arith.extf %56 : tensor<2x256xbf16> to tensor<2x256xf32>
91
+ %58 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<1x256x!tt.ptr<f32, 1>>
92
+ %59 = tt.addptr %58, %7 : tensor<1x256x!tt.ptr<f32, 1>>, tensor<1x256xi32>
93
+ %60 = tt.load %59, %12, %cst_1 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32>
94
+ tt.assert %33, "index out of bounds: 0 <= tmp16 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<2x1xi1>
95
+ %61 = tt.load %40, %19, %cst_8 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<2x256xf32>
96
+ %62 = arith.addf %61, %55 : tensor<2x256xf32>
97
+ %63 = arith.addf %62, %57 : tensor<2x256xf32>
98
+ %64 = tt.broadcast %53 : (tensor<2x1xf32>) -> tensor<2x256xf32>
99
+ %65 = arith.subf %63, %64 : tensor<2x256xf32>
100
+ %66 = arith.divf %54, %cst_7 : tensor<2x1xf32>
101
+ %67 = arith.addf %66, %cst_6 : tensor<2x1xf32>
102
+ %68 = tt.extern_elementwise %67 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32>) -> tensor<2x1xf32>
103
+ %69 = tt.broadcast %68 : (tensor<2x1xf32>) -> tensor<2x256xf32>
104
+ %70 = arith.mulf %65, %69 : tensor<2x256xf32>
105
+ %71 = tt.broadcast %60 : (tensor<1x256xf32>) -> tensor<2x256xf32>
106
+ %72 = arith.mulf %70, %71 : tensor<2x256xf32>
107
+ %73 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<2x256x!tt.ptr<bf16, 1>>
108
+ %74 = tt.addptr %73, %23 : tensor<2x256x!tt.ptr<bf16, 1>>, tensor<2x256xi32>
109
+ %75 = arith.truncf %72 : tensor<2x256xf32> to tensor<2x256xbf16>
110
+ tt.store %74, %75, %19 {cache = 1 : i32, evict = 1 : i32} : tensor<2x256xbf16>
111
+ tt.return
112
+ }
113
+ }
.triton/dump/89f8cc1079aa03024e56dc2aee42813a/triton_.cubin ADDED
Binary file (58.1 kB). View file
 
.triton/dump/99f0a4c15ca0aab38ccdae6c765f7333/triton_.ptx ADDED
@@ -0,0 +1,758 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3d4d5d6de7de
10
+ .extern .shared .align 1 .b8 global_smem[];
11
+ .global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
12
+
13
+ .visible .entry triton__0d1d2d3d4d5d6de7de(
14
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_0,
15
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_1,
16
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_2,
17
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_3,
18
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_4,
19
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_5,
20
+ .param .u32 triton__0d1d2d3d4d5d6de7de_param_6,
21
+ .param .u32 triton__0d1d2d3d4d5d6de7de_param_7
22
+ )
23
+ .maxntid 64, 1, 1
24
+ {
25
+ .reg .pred %p<29>;
26
+ .reg .b16 %rs<17>;
27
+ .reg .b32 %r<100>;
28
+ .reg .f32 %f<86>;
29
+ .reg .b64 %rd<16>;
30
+ .loc 1 18 0
31
+ $L__func_begin0:
32
+ .loc 1 18 0
33
+
34
+ ld.param.u64 %rd7, [triton__0d1d2d3d4d5d6de7de_param_0];
35
+ ld.param.u64 %rd8, [triton__0d1d2d3d4d5d6de7de_param_1];
36
+ $L__tmp0:
37
+ .loc 1 26 26
38
+ mov.u32 %r66, %tid.x;
39
+ and.b32 %r67, %r66, 31;
40
+ ld.param.u64 %rd9, [triton__0d1d2d3d4d5d6de7de_param_2];
41
+ ld.param.u64 %rd10, [triton__0d1d2d3d4d5d6de7de_param_3];
42
+ ld.param.u64 %rd11, [triton__0d1d2d3d4d5d6de7de_param_4];
43
+ shl.b32 %r68, %r66, 2;
44
+ ld.param.u64 %rd12, [triton__0d1d2d3d4d5d6de7de_param_5];
45
+ and.b32 %r69, %r68, 252;
46
+ .loc 1 23 28
47
+ mov.u32 %r1, %ctaid.x;
48
+ .loc 1 30 40
49
+ shl.b32 %r70, %r1, 8;
50
+ .loc 1 30 36
51
+ or.b32 %r71, %r70, %r69;
52
+ .loc 1 30 30
53
+ mul.wide.s32 %rd13, %r71, 4;
54
+ add.s64 %rd1, %rd7, %rd13;
55
+ mov.b32 %r6, 0;
56
+ mov.pred %p1, -1;
57
+ .loc 1 30 46
58
+ mov.u32 %r2, 0x0;
59
+ mov.u32 %r3, 0x0;
60
+ mov.u32 %r4, 0x0;
61
+ mov.u32 %r5, 0x0;
62
+ @%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ];
63
+ @!%p1 mov.u32 %r2, %r6;
64
+ @!%p1 mov.u32 %r3, %r6;
65
+ @!%p1 mov.u32 %r4, %r6;
66
+ @!%p1 mov.u32 %r5, %r6;
67
+ mov.b32 %f1, %r4;
68
+ mov.b32 %f2, %r5;
69
+ .loc 1 31 30
70
+ mul.wide.s32 %rd14, %r71, 2;
71
+ add.s64 %rd2, %rd8, %rd14;
72
+ .loc 1 31 46
73
+ mov.u32 %r10, 0x0;
74
+ mov.u32 %r11, 0x0;
75
+ @%p1 ld.global.v2.b32 { %r10, %r11 }, [ %rd2 + 0 ];
76
+ @!%p1 mov.u32 %r10, %r6;
77
+ @!%p1 mov.u32 %r11, %r6;
78
+ cvt.u16.u32 %rs1, %r10;
79
+ { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r10; }
80
+ cvt.u16.u32 %rs3, %r11;
81
+ { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r11; }
82
+ .loc 1 31 67
83
+ cvt.f32.bf16 %r14, %rs1;
84
+ mov.b32 %f3, %r14;
85
+ cvt.f32.bf16 %r15, %rs2;
86
+ mov.b32 %f4, %r15;
87
+ cvt.f32.bf16 %r16, %rs3;
88
+ mov.b32 %f5, %r16;
89
+ cvt.f32.bf16 %r17, %rs4;
90
+ mov.b32 %f6, %r17;
91
+ .loc 1 32 30
92
+ add.s64 %rd3, %rd9, %rd14;
93
+ .loc 1 32 46
94
+ mov.u32 %r18, 0x0;
95
+ mov.u32 %r19, 0x0;
96
+ @%p1 ld.global.v2.b32 { %r18, %r19 }, [ %rd3 + 0 ];
97
+ @!%p1 mov.u32 %r18, %r6;
98
+ @!%p1 mov.u32 %r19, %r6;
99
+ cvt.u16.u32 %rs5, %r18;
100
+ { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r18; }
101
+ cvt.u16.u32 %rs7, %r19;
102
+ { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r19; }
103
+ .loc 1 32 67
104
+ cvt.f32.bf16 %r22, %rs5;
105
+ mov.b32 %f7, %r22;
106
+ cvt.f32.bf16 %r23, %rs6;
107
+ mov.b32 %f8, %r23;
108
+ cvt.f32.bf16 %r24, %rs7;
109
+ mov.b32 %f9, %r24;
110
+ cvt.f32.bf16 %r25, %rs8;
111
+ mov.b32 %f10, %r25;
112
+ .loc 1 33 30
113
+ add.s64 %rd4, %rd10, %rd14;
114
+ .loc 1 33 46
115
+ mov.u32 %r26, 0x0;
116
+ mov.u32 %r27, 0x0;
117
+ @%p1 ld.global.v2.b32 { %r26, %r27 }, [ %rd4 + 0 ];
118
+ @!%p1 mov.u32 %r26, %r6;
119
+ @!%p1 mov.u32 %r27, %r6;
120
+ cvt.u16.u32 %rs9, %r26;
121
+ { .reg .b16 tmp; mov.b32 {tmp, %rs10}, %r26; }
122
+ cvt.u16.u32 %rs11, %r27;
123
+ { .reg .b16 tmp; mov.b32 {tmp, %rs12}, %r27; }
124
+ .loc 1 33 67
125
+ cvt.f32.bf16 %r30, %rs9;
126
+ mov.b32 %f11, %r30;
127
+ cvt.f32.bf16 %r31, %rs10;
128
+ mov.b32 %f12, %r31;
129
+ cvt.f32.bf16 %r32, %rs11;
130
+ mov.b32 %f13, %r32;
131
+ cvt.f32.bf16 %r33, %rs12;
132
+ mov.b32 %f14, %r33;
133
+ .loc 1 34 31
134
+ mul.wide.u32 %rd15, %r69, 4;
135
+ add.s64 %rd5, %rd11, %rd15;
136
+ .loc 1 34 36
137
+ mov.u32 %r34, 0x0;
138
+ mov.u32 %r35, 0x0;
139
+ mov.u32 %r36, 0x0;
140
+ mov.u32 %r37, 0x0;
141
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r34, %r35, %r36, %r37 }, [ %rd5 + 0 ];
142
+ @!%p1 mov.u32 %r34, %r6;
143
+ @!%p1 mov.u32 %r35, %r6;
144
+ @!%p1 mov.u32 %r36, %r6;
145
+ @!%p1 mov.u32 %r37, %r6;
146
+ .loc 1 36 18
147
+ add.f32 %f15, %f5, %f1;
148
+ add.f32 %f16, %f6, %f2;
149
+ .loc 1 38 18
150
+ add.f32 %f17, %f15, %f9;
151
+ add.f32 %f18, %f16, %f10;
152
+ .loc 1 30 46
153
+ mov.b32 %f19, %r2;
154
+ mov.b32 %f20, %r3;
155
+ .loc 1 36 18
156
+ add.f32 %f21, %f4, %f20;
157
+ add.f32 %f22, %f3, %f19;
158
+ .loc 1 38 18
159
+ add.f32 %f23, %f22, %f7;
160
+ add.f32 %f24, %f21, %f8;
161
+ .loc 1 40 18
162
+ add.f32 %f25, %f24, %f12;
163
+ add.f32 %f26, %f23, %f11;
164
+ add.f32 %f27, %f17, %f13;
165
+ add.f32 %f28, %f18, %f14;
166
+ $L__tmp1:
167
+ .loc 2 233 15
168
+ add.f32 %f29, %f26, %f25;
169
+ add.f32 %f30, %f29, %f27;
170
+ add.f32 %f31, %f30, %f28;
171
+ $L__tmp2:
172
+ .loc 2 243 36
173
+ mov.b32 %r72, %f31;
174
+ shfl.sync.bfly.b32 %r73, %r72, 16, 31, -1;
175
+ mov.b32 %f32, %r73;
176
+ $L__tmp3:
177
+ .loc 2 233 15
178
+ add.f32 %f33, %f31, %f32;
179
+ $L__tmp4:
180
+ .loc 2 243 36
181
+ mov.b32 %r74, %f33;
182
+ shfl.sync.bfly.b32 %r75, %r74, 8, 31, -1;
183
+ mov.b32 %f34, %r75;
184
+ $L__tmp5:
185
+ .loc 2 233 15
186
+ add.f32 %f35, %f33, %f34;
187
+ $L__tmp6:
188
+ .loc 2 243 36
189
+ mov.b32 %r76, %f35;
190
+ shfl.sync.bfly.b32 %r77, %r76, 4, 31, -1;
191
+ mov.b32 %f36, %r77;
192
+ $L__tmp7:
193
+ .loc 2 233 15
194
+ add.f32 %f37, %f35, %f36;
195
+ $L__tmp8:
196
+ .loc 2 243 36
197
+ mov.b32 %r78, %f37;
198
+ shfl.sync.bfly.b32 %r79, %r78, 2, 31, -1;
199
+ mov.b32 %f38, %r79;
200
+ $L__tmp9:
201
+ .loc 2 233 15
202
+ add.f32 %f39, %f37, %f38;
203
+ $L__tmp10:
204
+ .loc 2 243 36
205
+ mov.b32 %r80, %f39;
206
+ shfl.sync.bfly.b32 %r81, %r80, 1, 31, -1;
207
+ mov.b32 %f40, %r81;
208
+ $L__tmp11:
209
+ .loc 2 233 15
210
+ add.f32 %f41, %f39, %f40;
211
+ $L__tmp12:
212
+ .loc 2 243 36
213
+ setp.eq.s32 %p20, %r67, 0;
214
+ shr.u32 %r82, %r66, 3;
215
+ and.b32 %r83, %r82, 4;
216
+ mov.u32 %r84, global_smem;
217
+ add.s32 %r42, %r84, %r83;
218
+ mov.b32 %r43, %f41;
219
+ @%p20 st.shared.b32 [ %r42 + 0 ], %r43;
220
+ bar.sync 0;
221
+ setp.lt.s32 %p21, %r66, 2;
222
+ add.s32 %r45, %r84, %r68;
223
+ @%p21 ld.shared.b32 %r44, [ %r45 + 0 ];
224
+ mov.b32 %f42, %r44;
225
+ shfl.sync.bfly.b32 %r85, %r44, 1, 31, -1;
226
+ mov.b32 %f43, %r85;
227
+ $L__tmp13:
228
+ .loc 2 233 15
229
+ add.f32 %f44, %f42, %f43;
230
+ $L__tmp14:
231
+ .loc 2 243 36
232
+ and.b32 %r86, %r66, 1;
233
+ setp.eq.b32 %p27, %r86, 1;
234
+ not.pred %p28, %p27;
235
+ and.pred %p22, %p21, %p28;
236
+ mov.b32 %r47, %f44;
237
+ @%p22 st.shared.b32 [ %r45 + 0 ], %r47;
238
+ bar.sync 0;
239
+ ld.shared.f32 %f45, [global_smem];
240
+ $L__tmp15:
241
+ .loc 3 8 15
242
+ add.f32 %f46, %f45, 0f00000000;
243
+ $L__tmp16:
244
+ .loc 1 48 20
245
+ mov.b32 %r49, %f46;
246
+ mov.b32 %r50, 1132462080;
247
+ div.full.f32 %r48, %r49, %r50;
248
+ mov.b32 %f47, %r48;
249
+ .loc 1 49 20
250
+ sub.f32 %f48, %f26, %f47;
251
+ sub.f32 %f49, %f25, %f47;
252
+ sub.f32 %f50, %f27, %f47;
253
+ sub.f32 %f51, %f28, %f47;
254
+ .loc 1 50 20
255
+ mul.f32 %f52, %f49, %f49;
256
+ $L__tmp17:
257
+ .loc 2 243 36
258
+ bar.sync 0;
259
+ $L__tmp18:
260
+ .loc 2 233 15
261
+ fma.rn.f32 %f53, %f48, %f48, %f52;
262
+ fma.rn.f32 %f54, %f50, %f50, %f53;
263
+ fma.rn.f32 %f55, %f51, %f51, %f54;
264
+ $L__tmp19:
265
+ .loc 2 243 36
266
+ mov.b32 %r87, %f55;
267
+ shfl.sync.bfly.b32 %r88, %r87, 16, 31, -1;
268
+ mov.b32 %f56, %r88;
269
+ $L__tmp20:
270
+ .loc 2 233 15
271
+ add.f32 %f57, %f55, %f56;
272
+ $L__tmp21:
273
+ .loc 2 243 36
274
+ mov.b32 %r89, %f57;
275
+ shfl.sync.bfly.b32 %r90, %r89, 8, 31, -1;
276
+ mov.b32 %f58, %r90;
277
+ $L__tmp22:
278
+ .loc 2 233 15
279
+ add.f32 %f59, %f57, %f58;
280
+ $L__tmp23:
281
+ .loc 2 243 36
282
+ mov.b32 %r91, %f59;
283
+ shfl.sync.bfly.b32 %r92, %r91, 4, 31, -1;
284
+ mov.b32 %f60, %r92;
285
+ $L__tmp24:
286
+ .loc 2 233 15
287
+ add.f32 %f61, %f59, %f60;
288
+ $L__tmp25:
289
+ .loc 2 243 36
290
+ mov.b32 %r93, %f61;
291
+ shfl.sync.bfly.b32 %r94, %r93, 2, 31, -1;
292
+ mov.b32 %f62, %r94;
293
+ $L__tmp26:
294
+ .loc 2 233 15
295
+ add.f32 %f63, %f61, %f62;
296
+ $L__tmp27:
297
+ .loc 2 243 36
298
+ mov.b32 %r95, %f63;
299
+ shfl.sync.bfly.b32 %r96, %r95, 1, 31, -1;
300
+ mov.b32 %f64, %r96;
301
+ $L__tmp28:
302
+ .loc 2 233 15
303
+ add.f32 %f65, %f63, %f64;
304
+ $L__tmp29:
305
+ .loc 2 243 36
306
+ mov.b32 %r52, %f65;
307
+ @%p20 st.shared.b32 [ %r42 + 0 ], %r52;
308
+ bar.sync 0;
309
+ @%p21 ld.shared.b32 %r53, [ %r45 + 0 ];
310
+ mov.b32 %f66, %r53;
311
+ shfl.sync.bfly.b32 %r97, %r53, 1, 31, -1;
312
+ mov.b32 %f67, %r97;
313
+ $L__tmp30:
314
+ .loc 2 233 15
315
+ add.f32 %f68, %f66, %f67;
316
+ $L__tmp31:
317
+ .loc 2 243 36
318
+ mov.b32 %r56, %f68;
319
+ @%p22 st.shared.b32 [ %r45 + 0 ], %r56;
320
+ bar.sync 0;
321
+ ld.shared.f32 %f69, [global_smem];
322
+ $L__tmp32:
323
+ .loc 3 8 15
324
+ add.f32 %f70, %f69, 0f00000000;
325
+ $L__tmp33:
326
+ .loc 1 56 20
327
+ mov.b32 %r58, %f70;
328
+ div.full.f32 %r57, %r58, %r50;
329
+ mov.b32 %f71, %r57;
330
+ .loc 1 58 20
331
+ add.f32 %f72, %f71, 0f3727C5AC;
332
+ .loc 1 59 26
333
+ rsqrt.approx.ftz.f32 %f73, %f72;
334
+ .loc 1 34 36
335
+ mov.b32 %f74, %r37;
336
+ mov.b32 %f75, %r36;
337
+ mov.b32 %f76, %r35;
338
+ mov.b32 %f77, %r34;
339
+ .loc 1 60 20
340
+ mul.f32 %f78, %f48, %f73;
341
+ mul.f32 %f79, %f49, %f73;
342
+ mul.f32 %f80, %f50, %f73;
343
+ mul.f32 %f81, %f51, %f73;
344
+ .loc 1 61 20
345
+ mul.f32 %f82, %f78, %f77;
346
+ mul.f32 %f83, %f79, %f76;
347
+ mul.f32 %f84, %f80, %f75;
348
+ mul.f32 %f85, %f81, %f74;
349
+ .loc 1 63 25
350
+ add.s64 %rd6, %rd12, %rd14;
351
+ .loc 1 63 48
352
+ mov.b32 %r60, %f82;
353
+ cvt.rn.bf16.f32 %rs13, %r60;
354
+ mov.b32 %r61, %f83;
355
+ cvt.rn.bf16.f32 %rs14, %r61;
356
+ mov.b32 %r62, %f84;
357
+ cvt.rn.bf16.f32 %rs15, %r62;
358
+ mov.b32 %r63, %f85;
359
+ cvt.rn.bf16.f32 %rs16, %r63;
360
+ mov.b32 %r98, {%rs13, %rs14};
361
+ mov.b32 %r99, {%rs15, %rs16};
362
+ @%p1 st.global.v2.b32 [ %rd6 + 0 ], { %r98, %r99 };
363
+ .loc 1 63 4
364
+ ret;
365
+ $L__tmp34:
366
+ $L__func_end0:
367
+
368
+ }
369
+ // .globl __nv_rsqrtf
370
+ .visible .func (.param .b32 func_retval0) __nv_rsqrtf(
371
+ .param .b32 __nv_rsqrtf_param_0
372
+ )
373
+ {
374
+ .reg .f32 %f<3>;
375
+ $L__func_begin1:
376
+
377
+ ld.param.f32 %f1, [__nv_rsqrtf_param_0];
378
+ rsqrt.approx.ftz.f32 %f2, %f1;
379
+ st.param.f32 [func_retval0+0], %f2;
380
+ ret;
381
+ $L__func_end1:
382
+
383
+ }
384
+ .file 1 "/tmp/torchinductor_root/pw/cpwl4wgyi5spzbgbswrqxfrxlyk2m76a4bakbp6l5ltopjbkjadt.py"
385
+ .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
386
+ .file 3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
387
+ .section .debug_abbrev
388
+ {
389
+ .b8 1
390
+ .b8 17
391
+ .b8 1
392
+ .b8 37
393
+ .b8 8
394
+ .b8 19
395
+ .b8 5
396
+ .b8 3
397
+ .b8 8
398
+ .b8 16
399
+ .b8 6
400
+ .b8 27
401
+ .b8 8
402
+ .b8 180
403
+ .b8 66
404
+ .b8 12
405
+ .b8 17
406
+ .b8 1
407
+ .b8 18
408
+ .b8 1
409
+ .b8 0
410
+ .b8 0
411
+ .b8 2
412
+ .b8 46
413
+ .b8 0
414
+ .b8 135
415
+ .b8 64
416
+ .b8 8
417
+ .b8 3
418
+ .b8 8
419
+ .b8 58
420
+ .b8 11
421
+ .b8 59
422
+ .b8 11
423
+ .b8 63
424
+ .b8 12
425
+ .b8 32
426
+ .b8 11
427
+ .b8 0
428
+ .b8 0
429
+ .b8 3
430
+ .b8 46
431
+ .b8 1
432
+ .b8 17
433
+ .b8 1
434
+ .b8 18
435
+ .b8 1
436
+ .b8 64
437
+ .b8 10
438
+ .b8 49
439
+ .b8 19
440
+ .b8 0
441
+ .b8 0
442
+ .b8 4
443
+ .b8 29
444
+ .b8 1
445
+ .b8 49
446
+ .b8 19
447
+ .b8 17
448
+ .b8 1
449
+ .b8 18
450
+ .b8 1
451
+ .b8 88
452
+ .b8 11
453
+ .b8 89
454
+ .b8 11
455
+ .b8 87
456
+ .b8 11
457
+ .b8 0
458
+ .b8 0
459
+ .b8 5
460
+ .b8 29
461
+ .b8 0
462
+ .b8 49
463
+ .b8 19
464
+ .b8 17
465
+ .b8 1
466
+ .b8 18
467
+ .b8 1
468
+ .b8 88
469
+ .b8 11
470
+ .b8 89
471
+ .b8 11
472
+ .b8 87
473
+ .b8 11
474
+ .b8 0
475
+ .b8 0
476
+ .b8 0
477
+ }
478
+ .section .debug_info
479
+ {
480
+ .b32 399
481
+ .b8 2
482
+ .b8 0
483
+ .b32 .debug_abbrev
484
+ .b8 8
485
+ .b8 1
486
+ .b8 116
487
+ .b8 114
488
+ .b8 105
489
+ .b8 116
490
+ .b8 111
491
+ .b8 110
492
+ .b8 0
493
+ .b8 2
494
+ .b8 0
495
+ .b8 99
496
+ .b8 112
497
+ .b8 119
498
+ .b8 108
499
+ .b8 52
500
+ .b8 119
501
+ .b8 103
502
+ .b8 121
503
+ .b8 105
504
+ .b8 53
505
+ .b8 115
506
+ .b8 112
507
+ .b8 122
508
+ .b8 98
509
+ .b8 103
510
+ .b8 98
511
+ .b8 115
512
+ .b8 119
513
+ .b8 114
514
+ .b8 113
515
+ .b8 120
516
+ .b8 102
517
+ .b8 114
518
+ .b8 120
519
+ .b8 108
520
+ .b8 121
521
+ .b8 107
522
+ .b8 50
523
+ .b8 109
524
+ .b8 55
525
+ .b8 54
526
+ .b8 97
527
+ .b8 52
528
+ .b8 98
529
+ .b8 97
530
+ .b8 107
531
+ .b8 98
532
+ .b8 112
533
+ .b8 54
534
+ .b8 108
535
+ .b8 53
536
+ .b8 108
537
+ .b8 116
538
+ .b8 111
539
+ .b8 112
540
+ .b8 106
541
+ .b8 98
542
+ .b8 107
543
+ .b8 106
544
+ .b8 97
545
+ .b8 100
546
+ .b8 116
547
+ .b8 46
548
+ .b8 112
549
+ .b8 121
550
+ .b8 0
551
+ .b32 .debug_line
552
+ .b8 47
553
+ .b8 116
554
+ .b8 109
555
+ .b8 112
556
+ .b8 47
557
+ .b8 116
558
+ .b8 111
559
+ .b8 114
560
+ .b8 99
561
+ .b8 104
562
+ .b8 105
563
+ .b8 110
564
+ .b8 100
565
+ .b8 117
566
+ .b8 99
567
+ .b8 116
568
+ .b8 111
569
+ .b8 114
570
+ .b8 95
571
+ .b8 114
572
+ .b8 111
573
+ .b8 111
574
+ .b8 116
575
+ .b8 47
576
+ .b8 112
577
+ .b8 119
578
+ .b8 0
579
+ .b8 1
580
+ .b64 $L__func_begin0
581
+ .b64 $L__func_end0
582
+ .b8 2
583
+ .b8 116
584
+ .b8 114
585
+ .b8 105
586
+ .b8 116
587
+ .b8 111
588
+ .b8 110
589
+ .b8 95
590
+ .b8 95
591
+ .b8 48
592
+ .b8 100
593
+ .b8 49
594
+ .b8 100
595
+ .b8 50
596
+ .b8 100
597
+ .b8 51
598
+ .b8 100
599
+ .b8 52
600
+ .b8 100
601
+ .b8 53
602
+ .b8 100
603
+ .b8 54
604
+ .b8 100
605
+ .b8 101
606
+ .b8 55
607
+ .b8 100
608
+ .b8 101
609
+ .b8 0
610
+ .b8 116
611
+ .b8 114
612
+ .b8 105
613
+ .b8 116
614
+ .b8 111
615
+ .b8 110
616
+ .b8 95
617
+ .b8 95
618
+ .b8 48
619
+ .b8 100
620
+ .b8 49
621
+ .b8 100
622
+ .b8 50
623
+ .b8 100
624
+ .b8 51
625
+ .b8 100
626
+ .b8 52
627
+ .b8 100
628
+ .b8 53
629
+ .b8 100
630
+ .b8 54
631
+ .b8 100
632
+ .b8 101
633
+ .b8 55
634
+ .b8 100
635
+ .b8 101
636
+ .b8 0
637
+ .b8 1
638
+ .b8 18
639
+ .b8 1
640
+ .b8 1
641
+ .b8 3
642
+ .b64 $L__func_begin0
643
+ .b64 $L__func_end0
644
+ .b8 1
645
+ .b8 156
646
+ .b32 125
647
+ .b8 4
648
+ .b32 125
649
+ .b64 $L__tmp1
650
+ .b64 $L__tmp14
651
+ .b8 2
652
+ .b8 45
653
+ .b8 59
654
+ .b8 5
655
+ .b32 125
656
+ .b64 $L__tmp1
657
+ .b64 $L__tmp14
658
+ .b8 2
659
+ .b8 243
660
+ .b8 36
661
+ .b8 0
662
+ .b8 5
663
+ .b32 125
664
+ .b64 $L__tmp2
665
+ .b64 $L__tmp15
666
+ .b8 2
667
+ .b8 45
668
+ .b8 59
669
+ .b8 5
670
+ .b32 125
671
+ .b64 $L__tmp15
672
+ .b64 $L__tmp16
673
+ .b8 3
674
+ .b8 45
675
+ .b8 45
676
+ .b8 5
677
+ .b32 125
678
+ .b64 $L__tmp17
679
+ .b64 $L__tmp32
680
+ .b8 2
681
+ .b8 53
682
+ .b8 59
683
+ .b8 4
684
+ .b32 125
685
+ .b64 $L__tmp18
686
+ .b64 $L__tmp31
687
+ .b8 2
688
+ .b8 53
689
+ .b8 59
690
+ .b8 5
691
+ .b32 125
692
+ .b64 $L__tmp18
693
+ .b64 $L__tmp31
694
+ .b8 2
695
+ .b8 243
696
+ .b8 36
697
+ .b8 0
698
+ .b8 5
699
+ .b32 125
700
+ .b64 $L__tmp32
701
+ .b64 $L__tmp33
702
+ .b8 3
703
+ .b8 53
704
+ .b8 45
705
+ .b8 0
706
+ .b8 0
707
+ }
708
+ .section .debug_pubnames
709
+ {
710
+ .b32 $L__pubNames_end0-$L__pubNames_start0
711
+ $L__pubNames_start0:
712
+ .b8 2
713
+ .b8 0
714
+ .b32 .debug_info
715
+ .b32 403
716
+ .b32 125
717
+ .b8 116
718
+ .b8 114
719
+ .b8 105
720
+ .b8 116
721
+ .b8 111
722
+ .b8 110
723
+ .b8 95
724
+ .b8 95
725
+ .b8 48
726
+ .b8 100
727
+ .b8 49
728
+ .b8 100
729
+ .b8 50
730
+ .b8 100
731
+ .b8 51
732
+ .b8 100
733
+ .b8 52
734
+ .b8 100
735
+ .b8 53
736
+ .b8 100
737
+ .b8 54
738
+ .b8 100
739
+ .b8 101
740
+ .b8 55
741
+ .b8 100
742
+ .b8 101
743
+ .b8 0
744
+ .b32 0
745
+ $L__pubNames_end0:
746
+ }
747
+ .section .debug_pubtypes
748
+ {
749
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
750
+ $L__pubTypes_start0:
751
+ .b8 2
752
+ .b8 0
753
+ .b32 .debug_info
754
+ .b32 403
755
+ .b32 0
756
+ $L__pubTypes_end0:
757
+ }
758
+ .section .debug_loc { }
.triton/dump/d7a12c0ba96f8920b8147157303ee99f/triton_.cubin ADDED
Binary file (14.1 kB). View file
 
.triton/dump/d7a12c0ba96f8920b8147157303ee99f/triton_.llir ADDED
@@ -0,0 +1,310 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @global_smem = external addrspace(3) global [0 x i8]
5
+ @.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
6
+
7
+ define void @triton__0d1d2d3d4d5de6de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i32 %5, i32 %6) local_unnamed_addr !dbg !7 {
8
+ %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
9
+ %9 = and i32 %8, 31, !dbg !10
10
+ %10 = lshr i32 %8, 5, !dbg !10
11
+ %11 = and i32 %10, 1, !dbg !10
12
+ %urem = shl i32 %8, 2, !dbg !10
13
+ %12 = and i32 %urem, 252, !dbg !10
14
+ %13 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !11
15
+ %14 = shl i32 %13, 8, !dbg !12
16
+ %15 = or i32 %14, %12, !dbg !13
17
+ %16 = sext i32 %15 to i64, !dbg !14
18
+ %17 = getelementptr float, ptr addrspace(1) %0, i64 %16, !dbg !14
19
+ %18 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %17, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !15
20
+ %19 = extractvalue { i32, i32, i32, i32 } %18, 0, !dbg !15
21
+ %20 = extractvalue { i32, i32, i32, i32 } %18, 1, !dbg !15
22
+ %21 = extractvalue { i32, i32, i32, i32 } %18, 2, !dbg !15
23
+ %22 = extractvalue { i32, i32, i32, i32 } %18, 3, !dbg !15
24
+ %23 = bitcast i32 %21 to float, !dbg !15
25
+ %24 = bitcast i32 %22 to float, !dbg !15
26
+ %25 = getelementptr i16, ptr addrspace(1) %1, i64 %16, !dbg !16
27
+ %26 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %25, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !17
28
+ %27 = extractvalue { i32, i32 } %26, 0, !dbg !17
29
+ %28 = extractvalue { i32, i32 } %26, 1, !dbg !17
30
+ %29 = trunc i32 %27 to i16, !dbg !17
31
+ %extelt.offset = lshr i32 %27, 16, !dbg !17
32
+ %30 = trunc i32 %extelt.offset to i16, !dbg !17
33
+ %31 = trunc i32 %28 to i16, !dbg !17
34
+ %extelt.offset1 = lshr i32 %28, 16, !dbg !17
35
+ %32 = trunc i32 %extelt.offset1 to i16, !dbg !17
36
+ %33 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %29) #6, !dbg !18
37
+ %34 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %30) #6, !dbg !18
38
+ %35 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %31) #6, !dbg !18
39
+ %36 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %32) #6, !dbg !18
40
+ %37 = getelementptr i16, ptr addrspace(1) %2, i64 %16, !dbg !19
41
+ %38 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %37, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !20
42
+ %39 = extractvalue { i32, i32 } %38, 0, !dbg !20
43
+ %40 = extractvalue { i32, i32 } %38, 1, !dbg !20
44
+ %41 = trunc i32 %39 to i16, !dbg !20
45
+ %extelt.offset2 = lshr i32 %39, 16, !dbg !20
46
+ %42 = trunc i32 %extelt.offset2 to i16, !dbg !20
47
+ %43 = trunc i32 %40 to i16, !dbg !20
48
+ %extelt.offset3 = lshr i32 %40, 16, !dbg !20
49
+ %44 = trunc i32 %extelt.offset3 to i16, !dbg !20
50
+ %45 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %41) #6, !dbg !21
51
+ %46 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %42) #6, !dbg !21
52
+ %47 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %43) #6, !dbg !21
53
+ %48 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %44) #6, !dbg !21
54
+ %49 = zext nneg i32 %12 to i64, !dbg !22
55
+ %50 = getelementptr float, ptr addrspace(1) %3, i64 %49, !dbg !22
56
+ %51 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %50, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !23
57
+ %52 = fadd float %35, %23, !dbg !24
58
+ %53 = fadd float %36, %24, !dbg !24
59
+ %54 = insertelement <2 x i32> poison, i32 %19, i64 0, !dbg !15
60
+ %55 = insertelement <2 x i32> %54, i32 %20, i64 1, !dbg !15
61
+ %56 = bitcast <2 x i32> %55 to <2 x float>, !dbg !15
62
+ %57 = insertelement <2 x float> poison, float %33, i64 0, !dbg !24
63
+ %58 = insertelement <2 x float> %57, float %34, i64 1, !dbg !24
64
+ %59 = fadd <2 x float> %58, %56, !dbg !24
65
+ %60 = insertelement <2 x float> poison, float %45, i64 0, !dbg !25
66
+ %61 = insertelement <2 x float> %60, float %46, i64 1, !dbg !25
67
+ %62 = fadd <2 x float> %59, %61, !dbg !25
68
+ %63 = fadd float %52, %47, !dbg !25
69
+ %64 = fadd float %53, %48, !dbg !25
70
+ %65 = extractelement <2 x float> %62, i64 0, !dbg !26
71
+ %66 = extractelement <2 x float> %62, i64 1, !dbg !26
72
+ %67 = fadd float %65, %66, !dbg !26
73
+ %68 = fadd float %67, %63, !dbg !26
74
+ %69 = fadd float %68, %64, !dbg !26
75
+ %70 = bitcast float %69 to i32, !dbg !32
76
+ %71 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %70, i32 16, i32 31), !dbg !32
77
+ %72 = bitcast i32 %71 to float, !dbg !32
78
+ %73 = fadd float %69, %72, !dbg !26
79
+ %74 = bitcast float %73 to i32, !dbg !32
80
+ %75 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %74, i32 8, i32 31), !dbg !32
81
+ %76 = bitcast i32 %75 to float, !dbg !32
82
+ %77 = fadd float %73, %76, !dbg !26
83
+ %78 = bitcast float %77 to i32, !dbg !32
84
+ %79 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %78, i32 4, i32 31), !dbg !32
85
+ %80 = bitcast i32 %79 to float, !dbg !32
86
+ %81 = fadd float %77, %80, !dbg !26
87
+ %82 = bitcast float %81 to i32, !dbg !32
88
+ %83 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %82, i32 2, i32 31), !dbg !32
89
+ %84 = bitcast i32 %83 to float, !dbg !32
90
+ %85 = fadd float %81, %84, !dbg !26
91
+ %86 = bitcast float %85 to i32, !dbg !32
92
+ %87 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %86, i32 1, i32 31), !dbg !32
93
+ %88 = bitcast i32 %87 to float, !dbg !32
94
+ %89 = fadd float %85, %88, !dbg !26
95
+ %90 = icmp eq i32 %9, 0, !dbg !32
96
+ %91 = zext nneg i32 %11 to i64, !dbg !32
97
+ %92 = getelementptr float, ptr addrspace(3) @global_smem, i64 %91, !dbg !32
98
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %92, float %89, i1 %90) #6, !dbg !32
99
+ tail call void @llvm.nvvm.barrier0(), !dbg !32
100
+ %93 = icmp slt i32 %8, 2, !dbg !32
101
+ %94 = sext i32 %8 to i64, !dbg !32
102
+ %95 = getelementptr float, ptr addrspace(3) @global_smem, i64 %94, !dbg !32
103
+ %96 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %95, i1 %93) #6, !dbg !32
104
+ %97 = bitcast float %96 to i32, !dbg !32
105
+ %98 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %97, i32 1, i32 31), !dbg !32
106
+ %99 = bitcast i32 %98 to float, !dbg !32
107
+ %100 = fadd float %96, %99, !dbg !26
108
+ %101 = and i32 %8, 1, !dbg !32
109
+ %102 = icmp eq i32 %101, 0, !dbg !32
110
+ %103 = and i1 %93, %102, !dbg !32
111
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %95, float %100, i1 %103) #6, !dbg !32
112
+ tail call void @llvm.nvvm.barrier0(), !dbg !32
113
+ %104 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !32
114
+ %105 = fadd float %104, 0.000000e+00, !dbg !34
115
+ %106 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %105, float 2.560000e+02) #6, !dbg !38
116
+ %107 = fsub float %65, %106, !dbg !39
117
+ %108 = fsub float %66, %106, !dbg !39
118
+ %109 = fsub float %63, %106, !dbg !39
119
+ %110 = fsub float %64, %106, !dbg !39
120
+ %111 = fmul float %107, %107, !dbg !40
121
+ %112 = fmul float %108, %108, !dbg !40
122
+ %113 = fmul float %109, %109, !dbg !40
123
+ %114 = fmul float %110, %110, !dbg !40
124
+ tail call void @llvm.nvvm.barrier0(), !dbg !41
125
+ %115 = fadd float %111, %112, !dbg !43
126
+ %116 = fadd float %113, %115, !dbg !43
127
+ %117 = fadd float %114, %116, !dbg !43
128
+ %118 = bitcast float %117 to i32, !dbg !41
129
+ %119 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %118, i32 16, i32 31), !dbg !41
130
+ %120 = bitcast i32 %119 to float, !dbg !41
131
+ %121 = fadd float %117, %120, !dbg !43
132
+ %122 = bitcast float %121 to i32, !dbg !41
133
+ %123 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %122, i32 8, i32 31), !dbg !41
134
+ %124 = bitcast i32 %123 to float, !dbg !41
135
+ %125 = fadd float %121, %124, !dbg !43
136
+ %126 = bitcast float %125 to i32, !dbg !41
137
+ %127 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %126, i32 4, i32 31), !dbg !41
138
+ %128 = bitcast i32 %127 to float, !dbg !41
139
+ %129 = fadd float %125, %128, !dbg !43
140
+ %130 = bitcast float %129 to i32, !dbg !41
141
+ %131 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %130, i32 2, i32 31), !dbg !41
142
+ %132 = bitcast i32 %131 to float, !dbg !41
143
+ %133 = fadd float %129, %132, !dbg !43
144
+ %134 = bitcast float %133 to i32, !dbg !41
145
+ %135 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %134, i32 1, i32 31), !dbg !41
146
+ %136 = bitcast i32 %135 to float, !dbg !41
147
+ %137 = fadd float %133, %136, !dbg !43
148
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %92, float %137, i1 %90) #6, !dbg !41
149
+ tail call void @llvm.nvvm.barrier0(), !dbg !41
150
+ %138 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %95, i1 %93) #6, !dbg !41
151
+ %139 = bitcast float %138 to i32, !dbg !41
152
+ %140 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %139, i32 1, i32 31), !dbg !41
153
+ %141 = bitcast i32 %140 to float, !dbg !41
154
+ %142 = fadd float %138, %141, !dbg !43
155
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %95, float %142, i1 %103) #6, !dbg !41
156
+ tail call void @llvm.nvvm.barrier0(), !dbg !41
157
+ %143 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !41
158
+ %144 = fadd float %143, 0.000000e+00, !dbg !46
159
+ %145 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %144, float 2.560000e+02) #6, !dbg !48
160
+ %146 = fadd float %145, 0x3EE4F8B580000000, !dbg !49
161
+ %147 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !50
162
+ %.not.i = icmp eq i32 %147, 0, !dbg !50
163
+ br i1 %.not.i, label %150, label %148, !dbg !50
164
+
165
+ 148: ; preds = %7
166
+ %149 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %146), !dbg !50
167
+ br label %__nv_rsqrtf.exit, !dbg !50
168
+
169
+ 150: ; preds = %7
170
+ %151 = tail call float @llvm.nvvm.rsqrt.approx.f(float %146), !dbg !50
171
+ br label %__nv_rsqrtf.exit, !dbg !50
172
+
173
+ __nv_rsqrtf.exit: ; preds = %148, %150
174
+ %.0.i = phi float [ %149, %148 ], [ %151, %150 ], !dbg !50
175
+ %152 = extractvalue { i32, i32, i32, i32 } %51, 3, !dbg !23
176
+ %153 = bitcast i32 %152 to float, !dbg !23
177
+ %154 = extractvalue { i32, i32, i32, i32 } %51, 2, !dbg !23
178
+ %155 = bitcast i32 %154 to float, !dbg !23
179
+ %156 = extractvalue { i32, i32, i32, i32 } %51, 1, !dbg !23
180
+ %157 = bitcast i32 %156 to float, !dbg !23
181
+ %158 = extractvalue { i32, i32, i32, i32 } %51, 0, !dbg !23
182
+ %159 = bitcast i32 %158 to float, !dbg !23
183
+ %160 = fmul float %107, %.0.i, !dbg !51
184
+ %161 = fmul float %108, %.0.i, !dbg !51
185
+ %162 = fmul float %109, %.0.i, !dbg !51
186
+ %163 = fmul float %110, %.0.i, !dbg !51
187
+ %164 = fmul float %160, %159, !dbg !52
188
+ %165 = fmul float %161, %157, !dbg !52
189
+ %166 = fmul float %162, %155, !dbg !52
190
+ %167 = fmul float %163, %153, !dbg !52
191
+ %168 = getelementptr i16, ptr addrspace(1) %4, i64 %16, !dbg !53
192
+ %169 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %164) #6, !dbg !54
193
+ %170 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %165) #6, !dbg !54
194
+ %171 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %166) #6, !dbg !54
195
+ %172 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %167) #6, !dbg !54
196
+ %173 = insertelement <2 x i16> undef, i16 %169, i64 0, !dbg !54
197
+ %174 = insertelement <2 x i16> %173, i16 %170, i64 1, !dbg !54
198
+ %175 = bitcast <2 x i16> %174 to i32, !dbg !54
199
+ %176 = insertelement <2 x i16> undef, i16 %171, i64 0, !dbg !54
200
+ %177 = insertelement <2 x i16> %176, i16 %172, i64 1, !dbg !54
201
+ %178 = bitcast <2 x i16> %177 to i32, !dbg !54
202
+ tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %175, i32 %178, ptr addrspace(1) %168, i1 true) #6, !dbg !54
203
+ ret void, !dbg !55
204
+ }
205
+
206
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
207
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
208
+
209
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
210
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
211
+
212
+ ; Function Attrs: convergent nocallback nounwind
213
+ declare void @llvm.nvvm.barrier0() #2
214
+
215
+ ; Function Attrs: alwaysinline nounwind
216
+ define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
217
+ %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
218
+ %.not = icmp eq i32 %1, 0
219
+ br i1 %.not, label %4, label %2
220
+
221
+ 2: ; preds = %0
222
+ %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
223
+ br label %6
224
+
225
+ 4: ; preds = %0
226
+ %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
227
+ br label %6
228
+
229
+ 6: ; preds = %4, %2
230
+ %.0 = phi float [ %3, %2 ], [ %5, %4 ]
231
+ ret float %.0
232
+ }
233
+
234
+ declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
235
+
236
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
237
+ declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
238
+
239
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
240
+ declare float @llvm.nvvm.rsqrt.approx.f(float) #5
241
+
242
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
243
+ attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
244
+ attributes #2 = { convergent nocallback nounwind }
245
+ attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
246
+ attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
247
+ attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
248
+ attributes #6 = { nounwind }
249
+
250
+ !llvm.module.flags = !{!0, !1}
251
+ !llvm.dbg.cu = !{!2}
252
+ !nvvm.annotations = !{!4, !5, !5, !4}
253
+ !llvm.ident = !{!6}
254
+
255
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
256
+ !1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
257
+ !2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
258
+ !3 = !DIFile(filename: "cdohrmmhfsykzlva6pepxaa7gf7klw7w5jzorpspyaldhfg3acr2.py", directory: "/tmp/torchinductor_root/do")
259
+ !4 = !{ptr @triton__0d1d2d3d4d5de6de, !"kernel", i32 1}
260
+ !5 = !{ptr @triton__0d1d2d3d4d5de6de, !"maxntidx", i32 64}
261
+ !6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
262
+ !7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5de6de", linkageName: "triton__0d1d2d3d4d5de6de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
263
+ !8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
264
+ !9 = !{}
265
+ !10 = !DILocation(line: 26, column: 26, scope: !7)
266
+ !11 = !DILocation(line: 23, column: 28, scope: !7)
267
+ !12 = !DILocation(line: 30, column: 40, scope: !7)
268
+ !13 = !DILocation(line: 30, column: 36, scope: !7)
269
+ !14 = !DILocation(line: 30, column: 30, scope: !7)
270
+ !15 = !DILocation(line: 30, column: 46, scope: !7)
271
+ !16 = !DILocation(line: 31, column: 30, scope: !7)
272
+ !17 = !DILocation(line: 31, column: 46, scope: !7)
273
+ !18 = !DILocation(line: 31, column: 67, scope: !7)
274
+ !19 = !DILocation(line: 32, column: 30, scope: !7)
275
+ !20 = !DILocation(line: 32, column: 46, scope: !7)
276
+ !21 = !DILocation(line: 32, column: 67, scope: !7)
277
+ !22 = !DILocation(line: 33, column: 31, scope: !7)
278
+ !23 = !DILocation(line: 33, column: 36, scope: !7)
279
+ !24 = !DILocation(line: 35, column: 18, scope: !7)
280
+ !25 = !DILocation(line: 37, column: 18, scope: !7)
281
+ !26 = !DILocation(line: 233, column: 15, scope: !27, inlinedAt: !30)
282
+ !27 = distinct !DILexicalBlockFile(scope: !29, file: !28, discriminator: 0)
283
+ !28 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
284
+ !29 = distinct !DILexicalBlockFile(scope: !7, file: !28, discriminator: 0)
285
+ !30 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !31)
286
+ !31 = !DILocation(line: 42, column: 59, scope: !27)
287
+ !32 = !DILocation(line: 243, column: 36, scope: !29, inlinedAt: !33)
288
+ !33 = !DILocation(line: 42, column: 59, scope: !29)
289
+ !34 = !DILocation(line: 8, column: 15, scope: !35, inlinedAt: !37)
290
+ !35 = distinct !DILexicalBlockFile(scope: !7, file: !36, discriminator: 0)
291
+ !36 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
292
+ !37 = !DILocation(line: 42, column: 45, scope: !35)
293
+ !38 = !DILocation(line: 45, column: 20, scope: !7)
294
+ !39 = !DILocation(line: 46, column: 19, scope: !7)
295
+ !40 = !DILocation(line: 47, column: 20, scope: !7)
296
+ !41 = !DILocation(line: 243, column: 36, scope: !29, inlinedAt: !42)
297
+ !42 = !DILocation(line: 50, column: 59, scope: !29)
298
+ !43 = !DILocation(line: 233, column: 15, scope: !27, inlinedAt: !44)
299
+ !44 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !45)
300
+ !45 = !DILocation(line: 50, column: 59, scope: !27)
301
+ !46 = !DILocation(line: 8, column: 15, scope: !35, inlinedAt: !47)
302
+ !47 = !DILocation(line: 50, column: 45, scope: !35)
303
+ !48 = !DILocation(line: 53, column: 20, scope: !7)
304
+ !49 = !DILocation(line: 55, column: 20, scope: !7)
305
+ !50 = !DILocation(line: 56, column: 26, scope: !7)
306
+ !51 = !DILocation(line: 57, column: 20, scope: !7)
307
+ !52 = !DILocation(line: 58, column: 20, scope: !7)
308
+ !53 = !DILocation(line: 60, column: 25, scope: !7)
309
+ !54 = !DILocation(line: 60, column: 48, scope: !7)
310
+ !55 = !DILocation(line: 60, column: 4, scope: !7)
.triton/dump/ea4af10b775ebbfe0bdcca18fd6288c8/triton_.ttir ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %cst = arith.constant dense<0.000000e+00> : tensor<16x128xbf16>
4
+ %cst_0 = arith.constant 0.000000e+00 : f32
5
+ %cst_1 = arith.constant dense<1.000000e+00> : tensor<16x128xf32>
6
+ %c256_i32 = arith.constant 256 : i32
7
+ %c128_i32 = arith.constant 128 : i32
8
+ %c0_i32 = arith.constant 0 : i32
9
+ %cst_2 = arith.constant dense<256> : tensor<16x1xi64>
10
+ %cst_3 = arith.constant dense<0> : tensor<16x1xi64>
11
+ %cst_4 = arith.constant dense<50257> : tensor<16x1xi64>
12
+ %cst_5 = arith.constant dense<9.99999974E-6> : tensor<16x1xf32>
13
+ %cst_6 = arith.constant dense<2.560000e+02> : tensor<16x1xf32>
14
+ %cst_7 = arith.constant dense<0.000000e+00> : tensor<1x128xf32>
15
+ %cst_8 = arith.constant dense<0.000000e+00> : tensor<16x128xf32>
16
+ %cst_9 = arith.constant dense<256> : tensor<16x1xi32>
17
+ %cst_10 = arith.constant dense<256> : tensor<1x128xi32>
18
+ %cst_11 = arith.constant dense<512> : tensor<16x1xi32>
19
+ %c16_i32 = arith.constant 16 : i32
20
+ %0 = tt.get_program_id x : i32
21
+ %1 = arith.muli %0, %c16_i32 : i32
22
+ %2 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32>
23
+ %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<16xi32>) -> tensor<16x1xi32>
24
+ %4 = tt.splat %1 : (i32) -> tensor<16x1xi32>
25
+ %5 = arith.addi %4, %3 : tensor<16x1xi32>
26
+ %6 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32>
27
+ %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<128xi32>) -> tensor<1x128xi32>
28
+ %8 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<16x1x!tt.ptr<i64, 1>>
29
+ %9 = tt.addptr %8, %5 : tensor<16x1x!tt.ptr<i64, 1>>, tensor<16x1xi32>
30
+ %10 = tt.load %9 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x1xi64>
31
+ %11 = arith.remsi %5, %cst_11 : tensor<16x1xi32>
32
+ %12 = arith.muli %11, %cst_9 : tensor<16x1xi32>
33
+ %13 = tt.broadcast %12 : (tensor<16x1xi32>) -> tensor<16x128xi32>
34
+ %14 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<16x128x!tt.ptr<f32, 1>>
35
+ %15 = arith.muli %5, %cst_9 : tensor<16x1xi32>
36
+ %16 = tt.broadcast %15 : (tensor<16x1xi32>) -> tensor<16x128xi32>
37
+ %17 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<16x128x!tt.ptr<bf16, 1>>
38
+ %18 = arith.addi %10, %cst_4 : tensor<16x1xi64>
39
+ %19 = arith.cmpi slt, %10, %cst_3 : tensor<16x1xi64>
40
+ %20 = arith.select %19, %18, %10 : tensor<16x1xi1>, tensor<16x1xi64>
41
+ %21 = arith.cmpi sge, %20, %cst_3 : tensor<16x1xi64>
42
+ %22 = arith.cmpi slt, %20, %cst_4 : tensor<16x1xi64>
43
+ %23 = arith.andi %21, %22 : tensor<16x1xi1>
44
+ %24 = arith.muli %20, %cst_2 : tensor<16x1xi64>
45
+ %25 = tt.broadcast %24 : (tensor<16x1xi64>) -> tensor<16x128xi64>
46
+ %26 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<16x128x!tt.ptr<f32, 1>>
47
+ %27:3 = scf.for %arg8 = %c0_i32 to %c256_i32 step %c128_i32 iter_args(%arg9 = %cst_8, %arg10 = %cst_8, %arg11 = %cst_8) -> (tensor<16x128xf32>, tensor<16x128xf32>, tensor<16x128xf32>) : i32 {
48
+ %51 = tt.splat %arg8 : (i32) -> tensor<1x128xi32>
49
+ %52 = arith.addi %51, %7 : tensor<1x128xi32>
50
+ %53 = arith.cmpi slt, %52, %cst_10 : tensor<1x128xi32>
51
+ %54 = tt.broadcast %52 : (tensor<1x128xi32>) -> tensor<16x128xi32>
52
+ %55 = arith.addi %54, %13 : tensor<16x128xi32>
53
+ %56 = tt.addptr %14, %55 : tensor<16x128x!tt.ptr<f32, 1>>, tensor<16x128xi32>
54
+ %57 = tt.broadcast %53 : (tensor<1x128xi1>) -> tensor<16x128xi1>
55
+ %58 = tt.load %56, %57, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x128xf32>
56
+ %59 = arith.addi %54, %16 : tensor<16x128xi32>
57
+ %60 = tt.addptr %17, %59 : tensor<16x128x!tt.ptr<bf16, 1>>, tensor<16x128xi32>
58
+ %61 = tt.load %60, %57, %cst {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x128xbf16>
59
+ %62 = arith.extf %61 : tensor<16x128xbf16> to tensor<16x128xf32>
60
+ tt.assert %23, "index out of bounds: 0 <= tmp3 < 50257", "/usr/local/lib/python3.10/dist-packages/torch/_inductor/codecache.py", "<module>", 1892 : tensor<16x1xi1>
61
+ %63 = arith.extsi %52 : tensor<1x128xi32> to tensor<1x128xi64>
62
+ %64 = tt.broadcast %63 : (tensor<1x128xi64>) -> tensor<16x128xi64>
63
+ %65 = arith.addi %64, %25 : tensor<16x128xi64>
64
+ %66 = tt.addptr %26, %65 : tensor<16x128x!tt.ptr<f32, 1>>, tensor<16x128xi64>
65
+ %67 = tt.load %66, %57, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x128xf32>
66
+ %68 = arith.addf %67, %58 : tensor<16x128xf32>
67
+ %69 = arith.addf %68, %62 : tensor<16x128xf32>
68
+ %70 = arith.subf %69, %arg9 : tensor<16x128xf32>
69
+ %71 = arith.addf %arg11, %cst_1 : tensor<16x128xf32>
70
+ %72 = arith.divf %70, %71 : tensor<16x128xf32>
71
+ %73 = arith.addf %arg9, %72 : tensor<16x128xf32>
72
+ %74 = arith.subf %69, %73 : tensor<16x128xf32>
73
+ %75 = arith.mulf %70, %74 : tensor<16x128xf32>
74
+ %76 = arith.addf %arg10, %75 : tensor<16x128xf32>
75
+ %77 = arith.select %57, %73, %arg9 : tensor<16x128xi1>, tensor<16x128xf32>
76
+ %78 = arith.select %57, %76, %arg10 : tensor<16x128xi1>, tensor<16x128xf32>
77
+ %79 = arith.select %57, %71, %arg11 : tensor<16x128xi1>, tensor<16x128xf32>
78
+ scf.yield %77, %78, %79 : tensor<16x128xf32>, tensor<16x128xf32>, tensor<16x128xf32>
79
+ }
80
+ %28:3 = "tt.reduce"(%27#0, %27#1, %27#2) <{axis = 1 : i32}> ({
81
+ ^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32):
82
+ %51 = arith.subf %arg11, %arg8 : f32
83
+ %52 = arith.addf %arg10, %arg13 : f32
84
+ %53 = arith.cmpf oeq, %52, %cst_0 : f32
85
+ %54 = arith.divf %arg13, %52 : f32
86
+ %55 = arith.select %53, %cst_0, %54 : f32
87
+ %56 = arith.mulf %51, %55 : f32
88
+ %57 = arith.addf %arg8, %56 : f32
89
+ %58 = arith.addf %arg9, %arg12 : f32
90
+ %59 = arith.mulf %51, %51 : f32
91
+ %60 = arith.mulf %59, %arg10 : f32
92
+ %61 = arith.mulf %60, %55 : f32
93
+ %62 = arith.addf %58, %61 : f32
94
+ tt.reduce.return %57, %62, %52 : f32, f32, f32
95
+ }) : (tensor<16x128xf32>, tensor<16x128xf32>, tensor<16x128xf32>) -> (tensor<16xf32>, tensor<16xf32>, tensor<16xf32>)
96
+ %29 = tt.expand_dims %28#0 {axis = 1 : i32} : (tensor<16xf32>) -> tensor<16x1xf32>
97
+ %30 = tt.expand_dims %28#1 {axis = 1 : i32} : (tensor<16xf32>) -> tensor<16x1xf32>
98
+ %31 = arith.muli %11, %cst_9 : tensor<16x1xi32>
99
+ %32 = tt.broadcast %31 : (tensor<16x1xi32>) -> tensor<16x128xi32>
100
+ %33 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<16x128x!tt.ptr<f32, 1>>
101
+ %34 = arith.muli %5, %cst_9 : tensor<16x1xi32>
102
+ %35 = tt.broadcast %34 : (tensor<16x1xi32>) -> tensor<16x128xi32>
103
+ %36 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<16x128x!tt.ptr<bf16, 1>>
104
+ %37 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<1x128x!tt.ptr<f32, 1>>
105
+ %38 = arith.addi %10, %cst_4 : tensor<16x1xi64>
106
+ %39 = arith.cmpi slt, %10, %cst_3 : tensor<16x1xi64>
107
+ %40 = arith.select %39, %38, %10 : tensor<16x1xi1>, tensor<16x1xi64>
108
+ %41 = arith.cmpi sge, %40, %cst_3 : tensor<16x1xi64>
109
+ %42 = arith.cmpi slt, %40, %cst_4 : tensor<16x1xi64>
110
+ %43 = arith.andi %41, %42 : tensor<16x1xi1>
111
+ %44 = arith.muli %40, %cst_2 : tensor<16x1xi64>
112
+ %45 = tt.broadcast %44 : (tensor<16x1xi64>) -> tensor<16x128xi64>
113
+ %46 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<16x128x!tt.ptr<f32, 1>>
114
+ %47 = tt.broadcast %29 : (tensor<16x1xf32>) -> tensor<16x128xf32>
115
+ %48 = arith.divf %30, %cst_6 : tensor<16x1xf32>
116
+ %49 = arith.addf %48, %cst_5 : tensor<16x1xf32>
117
+ %50 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<16x128x!tt.ptr<bf16, 1>>
118
+ scf.for %arg8 = %c0_i32 to %c256_i32 step %c128_i32 : i32 {
119
+ %51 = tt.splat %arg8 : (i32) -> tensor<1x128xi32>
120
+ %52 = arith.addi %51, %7 : tensor<1x128xi32>
121
+ %53 = arith.cmpi slt, %52, %cst_10 : tensor<1x128xi32>
122
+ %54 = tt.broadcast %52 : (tensor<1x128xi32>) -> tensor<16x128xi32>
123
+ %55 = arith.addi %54, %32 : tensor<16x128xi32>
124
+ %56 = tt.addptr %33, %55 : tensor<16x128x!tt.ptr<f32, 1>>, tensor<16x128xi32>
125
+ %57 = tt.broadcast %53 : (tensor<1x128xi1>) -> tensor<16x128xi1>
126
+ %58 = tt.load %56, %57, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x128xf32>
127
+ %59 = arith.addi %54, %35 : tensor<16x128xi32>
128
+ %60 = tt.addptr %36, %59 : tensor<16x128x!tt.ptr<bf16, 1>>, tensor<16x128xi32>
129
+ %61 = tt.load %60, %57, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<16x128xbf16>
130
+ %62 = arith.extf %61 : tensor<16x128xbf16> to tensor<16x128xf32>
131
+ %63 = tt.addptr %37, %52 : tensor<1x128x!tt.ptr<f32, 1>>, tensor<1x128xi32>
132
+ %64 = tt.load %63, %53, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x128xf32>
133
+ tt.assert %43, "index out of bounds: 0 <= tmp16 < 50257", "/usr/local/lib/python3.10/dist-packages/torch/_inductor/codecache.py", "<module>", 1892 : tensor<16x1xi1>
134
+ %65 = arith.extsi %52 : tensor<1x128xi32> to tensor<1x128xi64>
135
+ %66 = tt.broadcast %65 : (tensor<1x128xi64>) -> tensor<16x128xi64>
136
+ %67 = arith.addi %66, %45 : tensor<16x128xi64>
137
+ %68 = tt.addptr %46, %67 : tensor<16x128x!tt.ptr<f32, 1>>, tensor<16x128xi64>
138
+ %69 = tt.load %68, %57, %cst_8 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<16x128xf32>
139
+ %70 = arith.addf %69, %58 : tensor<16x128xf32>
140
+ %71 = arith.addf %70, %62 : tensor<16x128xf32>
141
+ %72 = arith.subf %71, %47 : tensor<16x128xf32>
142
+ %73 = tt.extern_elementwise %49 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<16x1xf32>) -> tensor<16x1xf32>
143
+ %74 = tt.broadcast %73 : (tensor<16x1xf32>) -> tensor<16x128xf32>
144
+ %75 = arith.mulf %72, %74 : tensor<16x128xf32>
145
+ %76 = tt.broadcast %64 : (tensor<1x128xf32>) -> tensor<16x128xf32>
146
+ %77 = arith.mulf %75, %76 : tensor<16x128xf32>
147
+ %78 = tt.addptr %50, %59 : tensor<16x128x!tt.ptr<bf16, 1>>, tensor<16x128xi32>
148
+ %79 = arith.truncf %77 : tensor<16x128xf32> to tensor<16x128xbf16>
149
+ tt.store %78, %79, %57 {cache = 1 : i32, evict = 1 : i32} : tensor<16x128xbf16>
150
+ }
151
+ tt.return
152
+ }
153
+ }
.triton/dump/fac03406d1136fc802dac111a1efea36/triton_.ptx ADDED
@@ -0,0 +1,971 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3de4
10
+ .extern .shared .align 1 .b8 global_smem[];
11
+
12
+ .visible .entry triton__0d1d2d3de4(
13
+ .param .u64 triton__0d1d2d3de4_param_0,
14
+ .param .u64 triton__0d1d2d3de4_param_1,
15
+ .param .u64 triton__0d1d2d3de4_param_2,
16
+ .param .u64 triton__0d1d2d3de4_param_3,
17
+ .param .u64 triton__0d1d2d3de4_param_4
18
+ )
19
+ .maxntid 256, 1, 1
20
+ {
21
+ .reg .pred %p<91>;
22
+ .reg .b16 %rs<49>;
23
+ .reg .b32 %r<84>;
24
+ .reg .f32 %f<194>;
25
+ .reg .b64 %rd<75>;
26
+ .loc 1 18 0
27
+ $L__func_begin0:
28
+ .loc 1 18 0
29
+
30
+ ld.param.u64 %rd16, [triton__0d1d2d3de4_param_2];
31
+ ld.param.u64 %rd15, [triton__0d1d2d3de4_param_1];
32
+ ld.param.u64 %rd18, [triton__0d1d2d3de4_param_0];
33
+ $L__tmp0:
34
+ .loc 1 24 33
35
+ mov.u32 %r1, %tid.x;
36
+ shr.u32 %r2, %r1, 5;
37
+ and.b32 %r3, %r1, 255;
38
+ or.b32 %r8, %r3, 256;
39
+ or.b32 %r9, %r3, 512;
40
+ or.b32 %r10, %r3, 768;
41
+ or.b32 %r11, %r3, 1024;
42
+ or.b32 %r12, %r3, 1280;
43
+ or.b32 %r13, %r3, 1536;
44
+ or.b32 %r14, %r3, 1792;
45
+ .loc 1 21 28
46
+ mov.u32 %r7, %ctaid.x;
47
+ .loc 1 21 34
48
+ cvt.s64.s32 %rd1, %r7;
49
+ cvt.u64.u32 %rd2, %r3;
50
+ cvt.u64.u32 %rd9, %r14;
51
+ cvt.u64.u32 %rd8, %r13;
52
+ cvt.u64.u32 %rd7, %r12;
53
+ cvt.u64.u32 %rd6, %r11;
54
+ cvt.u64.u32 %rd5, %r10;
55
+ cvt.u64.u32 %rd4, %r9;
56
+ cvt.u64.u32 %rd3, %r8;
57
+ .loc 1 27 36
58
+ mul.wide.s32 %rd19, %r7, 100514;
59
+ add.s64 %rd10, %rd18, %rd19;
60
+ mov.f32 %f178, 0fFF800000;
61
+ mov.u64 %rd73, 0;
62
+ mov.u16 %rs2, 0;
63
+ mov.f32 %f179, %f178;
64
+ mov.f32 %f180, %f178;
65
+ mov.f32 %f181, %f178;
66
+ mov.f32 %f182, %f178;
67
+ mov.f32 %f183, %f178;
68
+ mov.f32 %f184, %f178;
69
+ mov.f32 %f185, %f178;
70
+ $L__BB0_1:
71
+ $L__tmp1:
72
+ .loc 2 38 21
73
+ setp.num.f32 %p18, %f178, %f178;
74
+ setp.num.f32 %p19, %f179, %f179;
75
+ setp.num.f32 %p20, %f180, %f180;
76
+ setp.num.f32 %p21, %f181, %f181;
77
+ setp.num.f32 %p22, %f182, %f182;
78
+ setp.num.f32 %p23, %f183, %f183;
79
+ setp.num.f32 %p24, %f184, %f184;
80
+ setp.num.f32 %p25, %f185, %f185;
81
+ $L__tmp2:
82
+ .loc 1 28 27
83
+ or.b64 %rd28, %rd73, %rd2;
84
+ or.b64 %rd29, %rd73, %rd3;
85
+ or.b64 %rd30, %rd73, %rd4;
86
+ or.b64 %rd31, %rd73, %rd5;
87
+ or.b64 %rd32, %rd73, %rd6;
88
+ or.b64 %rd33, %rd73, %rd7;
89
+ or.b64 %rd34, %rd73, %rd8;
90
+ or.b64 %rd35, %rd73, %rd9;
91
+ .loc 1 29 25
92
+ setp.lt.u64 %p17, %rd35, 50257;
93
+ setp.lt.u64 %p15, %rd34, 50257;
94
+ setp.lt.u64 %p13, %rd33, 50257;
95
+ setp.lt.u64 %p11, %rd32, 50257;
96
+ setp.lt.u64 %p9, %rd31, 50257;
97
+ setp.lt.u64 %p7, %rd30, 50257;
98
+ setp.lt.u64 %p5, %rd29, 50257;
99
+ setp.lt.u64 %p3, %rd28, 50257;
100
+ .loc 1 31 34
101
+ shl.b64 %rd36, %rd28, 1;
102
+ add.s64 %rd20, %rd10, %rd36;
103
+ shl.b64 %rd37, %rd29, 1;
104
+ add.s64 %rd21, %rd10, %rd37;
105
+ shl.b64 %rd38, %rd30, 1;
106
+ add.s64 %rd22, %rd10, %rd38;
107
+ shl.b64 %rd39, %rd31, 1;
108
+ add.s64 %rd23, %rd10, %rd39;
109
+ shl.b64 %rd40, %rd32, 1;
110
+ add.s64 %rd24, %rd10, %rd40;
111
+ shl.b64 %rd41, %rd33, 1;
112
+ add.s64 %rd25, %rd10, %rd41;
113
+ shl.b64 %rd42, %rd34, 1;
114
+ add.s64 %rd26, %rd10, %rd42;
115
+ shl.b64 %rd43, %rd35, 1;
116
+ add.s64 %rd27, %rd10, %rd43;
117
+ .loc 1 31 52
118
+ mov.u16 %rs1, 0x0;
119
+ @%p3 ld.global.L1::evict_last.b16 { %rs1 }, [ %rd20 + 0 ];
120
+ @!%p3 mov.u16 %rs1, %rs2;
121
+ mov.u16 %rs3, 0x0;
122
+ @%p5 ld.global.L1::evict_last.b16 { %rs3 }, [ %rd21 + 0 ];
123
+ @!%p5 mov.u16 %rs3, %rs2;
124
+ mov.u16 %rs5, 0x0;
125
+ @%p7 ld.global.L1::evict_last.b16 { %rs5 }, [ %rd22 + 0 ];
126
+ @!%p7 mov.u16 %rs5, %rs2;
127
+ mov.u16 %rs7, 0x0;
128
+ @%p9 ld.global.L1::evict_last.b16 { %rs7 }, [ %rd23 + 0 ];
129
+ @!%p9 mov.u16 %rs7, %rs2;
130
+ mov.u16 %rs9, 0x0;
131
+ @%p11 ld.global.L1::evict_last.b16 { %rs9 }, [ %rd24 + 0 ];
132
+ @!%p11 mov.u16 %rs9, %rs2;
133
+ mov.u16 %rs11, 0x0;
134
+ @%p13 ld.global.L1::evict_last.b16 { %rs11 }, [ %rd25 + 0 ];
135
+ @!%p13 mov.u16 %rs11, %rs2;
136
+ mov.u16 %rs13, 0x0;
137
+ @%p15 ld.global.L1::evict_last.b16 { %rs13 }, [ %rd26 + 0 ];
138
+ @!%p15 mov.u16 %rs13, %rs2;
139
+ mov.u16 %rs15, 0x0;
140
+ @%p17 ld.global.L1::evict_last.b16 { %rs15 }, [ %rd27 + 0 ];
141
+ @!%p17 mov.u16 %rs15, %rs2;
142
+ .loc 1 31 103
143
+ cvt.f32.bf16 %r15, %rs1;
144
+ mov.b32 %f42, %r15;
145
+ cvt.f32.bf16 %r16, %rs3;
146
+ mov.b32 %f43, %r16;
147
+ cvt.f32.bf16 %r17, %rs5;
148
+ mov.b32 %f44, %r17;
149
+ cvt.f32.bf16 %r18, %rs7;
150
+ mov.b32 %f45, %r18;
151
+ cvt.f32.bf16 %r19, %rs9;
152
+ mov.b32 %f46, %r19;
153
+ cvt.f32.bf16 %r20, %rs11;
154
+ mov.b32 %f47, %r20;
155
+ cvt.f32.bf16 %r21, %rs13;
156
+ mov.b32 %f48, %r21;
157
+ cvt.f32.bf16 %r22, %rs15;
158
+ mov.b32 %f49, %r22;
159
+ $L__tmp3:
160
+ .loc 2 36 15
161
+ setp.leu.f32 %p26, %f178, %f42;
162
+ setp.leu.f32 %p27, %f179, %f43;
163
+ setp.leu.f32 %p28, %f180, %f44;
164
+ setp.leu.f32 %p29, %f181, %f45;
165
+ setp.leu.f32 %p30, %f182, %f46;
166
+ setp.leu.f32 %p31, %f183, %f47;
167
+ setp.leu.f32 %p32, %f184, %f48;
168
+ setp.leu.f32 %p33, %f185, %f49;
169
+ $L__tmp4:
170
+ .loc 1 0 0
171
+ selp.f32 %f50, %f49, %f185, %p33;
172
+ selp.f32 %f51, %f50, %f185, %p25;
173
+ selp.f32 %f185, %f51, %f185, %p17;
174
+ selp.f32 %f52, %f48, %f184, %p32;
175
+ selp.f32 %f53, %f52, %f184, %p24;
176
+ selp.f32 %f184, %f53, %f184, %p15;
177
+ selp.f32 %f54, %f47, %f183, %p31;
178
+ selp.f32 %f55, %f54, %f183, %p23;
179
+ selp.f32 %f183, %f55, %f183, %p13;
180
+ selp.f32 %f56, %f46, %f182, %p30;
181
+ selp.f32 %f57, %f56, %f182, %p22;
182
+ selp.f32 %f182, %f57, %f182, %p11;
183
+ selp.f32 %f58, %f45, %f181, %p29;
184
+ selp.f32 %f59, %f58, %f181, %p21;
185
+ selp.f32 %f181, %f59, %f181, %p9;
186
+ selp.f32 %f60, %f44, %f180, %p28;
187
+ selp.f32 %f61, %f60, %f180, %p20;
188
+ selp.f32 %f180, %f61, %f180, %p7;
189
+ selp.f32 %f62, %f43, %f179, %p27;
190
+ selp.f32 %f63, %f62, %f179, %p19;
191
+ selp.f32 %f179, %f63, %f179, %p5;
192
+ selp.f32 %f64, %f42, %f178, %p26;
193
+ selp.f32 %f65, %f64, %f178, %p18;
194
+ selp.f32 %f178, %f65, %f178, %p3;
195
+ .loc 1 27 36
196
+ add.s64 %rd73, %rd73, 2048;
197
+ cvt.u32.u64 %r23, %rd73;
198
+ add.s32 %r24, %r23, -2048;
199
+ setp.lt.u32 %p34, %r24, 48209;
200
+ @%p34 bra $L__BB0_1;
201
+ .loc 1 24 33
202
+ and.b32 %r4, %r1, 31;
203
+ and.b32 %r32, %r2, 7;
204
+ $L__tmp5:
205
+ .loc 2 36 15
206
+ setp.gt.f32 %p39, %f178, %f179;
207
+ .loc 2 38 21
208
+ setp.nan.f32 %p40, %f178, %f178;
209
+ .loc 2 39 29
210
+ selp.f32 %f74, %f178, %f179, %p40;
211
+ selp.f32 %f75, %f178, %f74, %p39;
212
+ .loc 2 36 15
213
+ setp.gt.f32 %p41, %f75, %f180;
214
+ .loc 2 38 21
215
+ setp.nan.f32 %p42, %f75, %f75;
216
+ .loc 2 39 29
217
+ selp.f32 %f76, %f75, %f180, %p42;
218
+ selp.f32 %f77, %f75, %f76, %p41;
219
+ .loc 2 36 15
220
+ setp.gt.f32 %p43, %f77, %f181;
221
+ .loc 2 38 21
222
+ setp.nan.f32 %p44, %f77, %f77;
223
+ .loc 2 39 29
224
+ selp.f32 %f78, %f77, %f181, %p44;
225
+ selp.f32 %f79, %f77, %f78, %p43;
226
+ .loc 2 36 15
227
+ setp.gt.f32 %p45, %f79, %f182;
228
+ .loc 2 38 21
229
+ setp.nan.f32 %p46, %f79, %f79;
230
+ .loc 2 39 29
231
+ selp.f32 %f80, %f79, %f182, %p46;
232
+ selp.f32 %f81, %f79, %f80, %p45;
233
+ .loc 2 36 15
234
+ setp.gt.f32 %p47, %f81, %f183;
235
+ .loc 2 38 21
236
+ setp.nan.f32 %p48, %f81, %f81;
237
+ .loc 2 39 29
238
+ selp.f32 %f82, %f81, %f183, %p48;
239
+ selp.f32 %f83, %f81, %f82, %p47;
240
+ .loc 2 36 15
241
+ setp.gt.f32 %p49, %f83, %f184;
242
+ .loc 2 38 21
243
+ setp.nan.f32 %p50, %f83, %f83;
244
+ .loc 2 39 29
245
+ selp.f32 %f84, %f83, %f184, %p50;
246
+ selp.f32 %f85, %f83, %f84, %p49;
247
+ .loc 2 36 15
248
+ setp.gt.f32 %p51, %f85, %f185;
249
+ .loc 2 38 21
250
+ setp.nan.f32 %p52, %f85, %f85;
251
+ .loc 2 39 29
252
+ selp.f32 %f86, %f85, %f185, %p52;
253
+ selp.f32 %f87, %f85, %f86, %p51;
254
+ $L__tmp6:
255
+ .loc 2 49 29
256
+ mov.b32 %r33, %f87;
257
+ shfl.sync.bfly.b32 %r34, %r33, 16, 31, -1;
258
+ mov.b32 %f88, %r34;
259
+ $L__tmp7:
260
+ .loc 2 36 15
261
+ setp.gt.f32 %p53, %f87, %f88;
262
+ .loc 2 38 21
263
+ setp.nan.f32 %p54, %f87, %f87;
264
+ .loc 2 39 29
265
+ selp.f32 %f89, %f87, %f88, %p53;
266
+ selp.f32 %f90, %f87, %f89, %p54;
267
+ $L__tmp8:
268
+ .loc 2 49 29
269
+ mov.b32 %r35, %f90;
270
+ shfl.sync.bfly.b32 %r36, %r35, 8, 31, -1;
271
+ mov.b32 %f91, %r36;
272
+ $L__tmp9:
273
+ .loc 2 36 15
274
+ setp.gt.f32 %p55, %f90, %f91;
275
+ .loc 2 38 21
276
+ setp.nan.f32 %p56, %f90, %f90;
277
+ .loc 2 39 29
278
+ selp.f32 %f92, %f90, %f91, %p56;
279
+ selp.f32 %f93, %f90, %f92, %p55;
280
+ $L__tmp10:
281
+ .loc 2 49 29
282
+ mov.b32 %r37, %f93;
283
+ shfl.sync.bfly.b32 %r38, %r37, 4, 31, -1;
284
+ mov.b32 %f94, %r38;
285
+ $L__tmp11:
286
+ .loc 2 36 15
287
+ setp.gt.f32 %p57, %f93, %f94;
288
+ .loc 2 38 21
289
+ setp.nan.f32 %p58, %f93, %f93;
290
+ .loc 2 39 29
291
+ selp.f32 %f95, %f93, %f94, %p58;
292
+ selp.f32 %f96, %f93, %f95, %p57;
293
+ $L__tmp12:
294
+ .loc 2 49 29
295
+ mov.b32 %r39, %f96;
296
+ shfl.sync.bfly.b32 %r40, %r39, 2, 31, -1;
297
+ mov.b32 %f97, %r40;
298
+ $L__tmp13:
299
+ .loc 2 36 15
300
+ setp.gt.f32 %p59, %f96, %f97;
301
+ .loc 2 38 21
302
+ setp.nan.f32 %p60, %f96, %f96;
303
+ .loc 2 39 29
304
+ selp.f32 %f98, %f96, %f97, %p60;
305
+ selp.f32 %f99, %f96, %f98, %p59;
306
+ $L__tmp14:
307
+ .loc 2 49 29
308
+ mov.b32 %r41, %f99;
309
+ shfl.sync.bfly.b32 %r42, %r41, 1, 31, -1;
310
+ mov.b32 %f100, %r42;
311
+ $L__tmp15:
312
+ .loc 2 36 15
313
+ setp.gt.f32 %p61, %f99, %f100;
314
+ .loc 2 38 21
315
+ setp.nan.f32 %p62, %f99, %f99;
316
+ .loc 2 39 29
317
+ selp.f32 %f101, %f99, %f100, %p62;
318
+ selp.f32 %f102, %f99, %f101, %p61;
319
+ $L__tmp16:
320
+ .loc 2 49 29
321
+ setp.eq.s32 %p35, %r4, 0;
322
+ shl.b32 %r43, %r32, 2;
323
+ mov.u32 %r44, global_smem;
324
+ add.s32 %r62, %r44, %r43;
325
+ mov.b32 %r26, %f102;
326
+ @%p35 st.shared.b32 [ %r62 + 0 ], %r26;
327
+ bar.sync 0;
328
+ setp.lt.s32 %p36, %r1, 8;
329
+ shl.b32 %r45, %r1, 2;
330
+ add.s32 %r65, %r44, %r45;
331
+ @%p36 ld.shared.b32 %r27, [ %r65 + 0 ];
332
+ mov.b32 %f103, %r27;
333
+ shfl.sync.bfly.b32 %r46, %r27, 4, 31, -1;
334
+ mov.b32 %f104, %r46;
335
+ $L__tmp17:
336
+ .loc 2 36 15
337
+ setp.gt.f32 %p63, %f103, %f104;
338
+ .loc 2 38 21
339
+ setp.nan.f32 %p64, %f103, %f103;
340
+ .loc 2 39 29
341
+ selp.f32 %f105, %f103, %f104, %p63;
342
+ selp.f32 %f106, %f103, %f105, %p64;
343
+ $L__tmp18:
344
+ .loc 2 49 29
345
+ mov.b32 %r47, %f106;
346
+ shfl.sync.bfly.b32 %r48, %r47, 2, 31, -1;
347
+ mov.b32 %f107, %r48;
348
+ $L__tmp19:
349
+ .loc 2 36 15
350
+ setp.gt.f32 %p65, %f106, %f107;
351
+ .loc 2 38 21
352
+ setp.nan.f32 %p66, %f106, %f106;
353
+ .loc 2 39 29
354
+ selp.f32 %f108, %f106, %f107, %p66;
355
+ selp.f32 %f109, %f106, %f108, %p65;
356
+ $L__tmp20:
357
+ .loc 2 49 29
358
+ mov.b32 %r49, %f109;
359
+ shfl.sync.bfly.b32 %r50, %r49, 1, 31, -1;
360
+ mov.b32 %f110, %r50;
361
+ $L__tmp21:
362
+ .loc 2 36 15
363
+ setp.gt.f32 %p67, %f109, %f110;
364
+ .loc 2 38 21
365
+ setp.nan.f32 %p68, %f109, %f109;
366
+ .loc 2 39 29
367
+ selp.f32 %f111, %f109, %f110, %p68;
368
+ selp.f32 %f112, %f109, %f111, %p67;
369
+ $L__tmp22:
370
+ .loc 2 49 29
371
+ and.b32 %r51, %r1, 7;
372
+ setp.eq.s32 %p69, %r51, 0;
373
+ and.pred %p89, %p36, %p69;
374
+ mov.b32 %r30, %f112;
375
+ @%p89 st.shared.b32 [ %r65 + 0 ], %r30;
376
+ bar.sync 0;
377
+ ld.shared.f32 %f17, [global_smem];
378
+ $L__tmp23:
379
+ .loc 1 36 41
380
+ bar.sync 0;
381
+ st.shared.f32 [global_smem], %f17;
382
+ bar.sync 0;
383
+ ld.shared.u32 %r31, [global_smem];
384
+ .loc 1 37 25
385
+ shl.b64 %rd46, %rd1, 2;
386
+ add.s64 %rd44, %rd15, %rd46;
387
+ .loc 1 37 36
388
+ setp.eq.s32 %p38, %r3, 0;
389
+ @%p38 st.global.b32 [ %rd44 + 0 ], { %r31 };
390
+ mov.f32 %f186, 0f00000000;
391
+ mov.u64 %rd74, 0;
392
+ mov.f32 %f187, %f186;
393
+ mov.f32 %f188, %f186;
394
+ mov.f32 %f189, %f186;
395
+ mov.f32 %f190, %f186;
396
+ mov.f32 %f191, %f186;
397
+ mov.f32 %f192, %f186;
398
+ mov.f32 %f193, %f186;
399
+ $L__BB0_3:
400
+ .loc 1 40 27
401
+ or.b64 %rd55, %rd74, %rd2;
402
+ or.b64 %rd56, %rd74, %rd3;
403
+ or.b64 %rd57, %rd74, %rd4;
404
+ or.b64 %rd58, %rd74, %rd5;
405
+ or.b64 %rd59, %rd74, %rd6;
406
+ or.b64 %rd60, %rd74, %rd7;
407
+ or.b64 %rd61, %rd74, %rd8;
408
+ or.b64 %rd62, %rd74, %rd9;
409
+ .loc 1 41 25
410
+ setp.lt.u64 %p85, %rd62, 50257;
411
+ setp.lt.u64 %p83, %rd61, 50257;
412
+ setp.lt.u64 %p81, %rd60, 50257;
413
+ setp.lt.u64 %p79, %rd59, 50257;
414
+ setp.lt.u64 %p77, %rd58, 50257;
415
+ setp.lt.u64 %p75, %rd57, 50257;
416
+ setp.lt.u64 %p73, %rd56, 50257;
417
+ setp.lt.u64 %p71, %rd55, 50257;
418
+ .loc 1 43 34
419
+ shl.b64 %rd63, %rd55, 1;
420
+ add.s64 %rd47, %rd10, %rd63;
421
+ shl.b64 %rd64, %rd56, 1;
422
+ add.s64 %rd48, %rd10, %rd64;
423
+ shl.b64 %rd65, %rd57, 1;
424
+ add.s64 %rd49, %rd10, %rd65;
425
+ shl.b64 %rd66, %rd58, 1;
426
+ add.s64 %rd50, %rd10, %rd66;
427
+ shl.b64 %rd67, %rd59, 1;
428
+ add.s64 %rd51, %rd10, %rd67;
429
+ shl.b64 %rd68, %rd60, 1;
430
+ add.s64 %rd52, %rd10, %rd68;
431
+ shl.b64 %rd69, %rd61, 1;
432
+ add.s64 %rd53, %rd10, %rd69;
433
+ shl.b64 %rd70, %rd62, 1;
434
+ add.s64 %rd54, %rd10, %rd70;
435
+ .loc 1 43 52
436
+ mov.u16 %rs25, 0x0;
437
+ @%p71 ld.global.L1::evict_first.b16 { %rs25 }, [ %rd47 + 0 ];
438
+ @!%p71 mov.u16 %rs25, %rs2;
439
+ mov.u16 %rs27, 0x0;
440
+ @%p73 ld.global.L1::evict_first.b16 { %rs27 }, [ %rd48 + 0 ];
441
+ @!%p73 mov.u16 %rs27, %rs2;
442
+ mov.u16 %rs29, 0x0;
443
+ @%p75 ld.global.L1::evict_first.b16 { %rs29 }, [ %rd49 + 0 ];
444
+ @!%p75 mov.u16 %rs29, %rs2;
445
+ mov.u16 %rs31, 0x0;
446
+ @%p77 ld.global.L1::evict_first.b16 { %rs31 }, [ %rd50 + 0 ];
447
+ @!%p77 mov.u16 %rs31, %rs2;
448
+ mov.u16 %rs33, 0x0;
449
+ @%p79 ld.global.L1::evict_first.b16 { %rs33 }, [ %rd51 + 0 ];
450
+ @!%p79 mov.u16 %rs33, %rs2;
451
+ mov.u16 %rs35, 0x0;
452
+ @%p81 ld.global.L1::evict_first.b16 { %rs35 }, [ %rd52 + 0 ];
453
+ @!%p81 mov.u16 %rs35, %rs2;
454
+ mov.u16 %rs37, 0x0;
455
+ @%p83 ld.global.L1::evict_first.b16 { %rs37 }, [ %rd53 + 0 ];
456
+ @!%p83 mov.u16 %rs37, %rs2;
457
+ mov.u16 %rs39, 0x0;
458
+ @%p85 ld.global.L1::evict_first.b16 { %rs39 }, [ %rd54 + 0 ];
459
+ @!%p85 mov.u16 %rs39, %rs2;
460
+ .loc 1 43 104
461
+ cvt.f32.bf16 %r52, %rs25;
462
+ mov.b32 %f129, %r52;
463
+ cvt.f32.bf16 %r53, %rs27;
464
+ mov.b32 %f130, %r53;
465
+ cvt.f32.bf16 %r54, %rs29;
466
+ mov.b32 %f131, %r54;
467
+ cvt.f32.bf16 %r55, %rs31;
468
+ mov.b32 %f132, %r55;
469
+ cvt.f32.bf16 %r56, %rs33;
470
+ mov.b32 %f133, %r56;
471
+ cvt.f32.bf16 %r57, %rs35;
472
+ mov.b32 %f134, %r57;
473
+ cvt.f32.bf16 %r58, %rs37;
474
+ mov.b32 %f135, %r58;
475
+ cvt.f32.bf16 %r59, %rs39;
476
+ mov.b32 %f136, %r59;
477
+ .loc 1 45 22
478
+ sub.f32 %f137, %f129, %f17;
479
+ sub.f32 %f138, %f130, %f17;
480
+ sub.f32 %f139, %f131, %f17;
481
+ sub.f32 %f140, %f132, %f17;
482
+ sub.f32 %f141, %f133, %f17;
483
+ sub.f32 %f142, %f134, %f17;
484
+ sub.f32 %f143, %f135, %f17;
485
+ sub.f32 %f144, %f136, %f17;
486
+ .loc 1 46 22
487
+ mul.f32 %f114, %f137, 0f3FB8AA3B;
488
+ ex2.approx.f32 %f113, %f114;
489
+ mul.f32 %f116, %f138, 0f3FB8AA3B;
490
+ ex2.approx.f32 %f115, %f116;
491
+ mul.f32 %f118, %f139, 0f3FB8AA3B;
492
+ ex2.approx.f32 %f117, %f118;
493
+ mul.f32 %f120, %f140, 0f3FB8AA3B;
494
+ ex2.approx.f32 %f119, %f120;
495
+ mul.f32 %f122, %f141, 0f3FB8AA3B;
496
+ ex2.approx.f32 %f121, %f122;
497
+ mul.f32 %f124, %f142, 0f3FB8AA3B;
498
+ ex2.approx.f32 %f123, %f124;
499
+ mul.f32 %f126, %f143, 0f3FB8AA3B;
500
+ ex2.approx.f32 %f125, %f126;
501
+ mul.f32 %f128, %f144, 0f3FB8AA3B;
502
+ ex2.approx.f32 %f127, %f128;
503
+ .loc 1 49 40
504
+ selp.f32 %f145, %f113, 0f80000000, %p71;
505
+ selp.f32 %f146, %f115, 0f80000000, %p73;
506
+ selp.f32 %f147, %f117, 0f80000000, %p75;
507
+ selp.f32 %f148, %f119, 0f80000000, %p77;
508
+ selp.f32 %f149, %f121, 0f80000000, %p79;
509
+ selp.f32 %f150, %f123, 0f80000000, %p81;
510
+ selp.f32 %f151, %f125, 0f80000000, %p83;
511
+ selp.f32 %f152, %f127, 0f80000000, %p85;
512
+ add.f32 %f193, %f193, %f152;
513
+ add.f32 %f192, %f192, %f151;
514
+ add.f32 %f191, %f191, %f150;
515
+ add.f32 %f190, %f190, %f149;
516
+ add.f32 %f189, %f189, %f148;
517
+ add.f32 %f188, %f188, %f147;
518
+ add.f32 %f187, %f187, %f146;
519
+ add.f32 %f186, %f186, %f145;
520
+ .loc 1 39 36
521
+ add.s64 %rd74, %rd74, 2048;
522
+ cvt.u32.u64 %r60, %rd74;
523
+ add.s32 %r61, %r60, -2048;
524
+ setp.lt.u32 %p86, %r61, 48209;
525
+ @%p86 bra $L__BB0_3;
526
+ $L__tmp24:
527
+ .loc 3 243 36
528
+ bar.sync 0;
529
+ $L__tmp25:
530
+ .loc 3 233 15
531
+ add.f32 %f153, %f186, %f187;
532
+ add.f32 %f154, %f188, %f153;
533
+ add.f32 %f155, %f189, %f154;
534
+ add.f32 %f156, %f190, %f155;
535
+ add.f32 %f157, %f191, %f156;
536
+ add.f32 %f158, %f192, %f157;
537
+ add.f32 %f159, %f193, %f158;
538
+ $L__tmp26:
539
+ .loc 3 243 36
540
+ mov.b32 %r69, %f159;
541
+ shfl.sync.bfly.b32 %r70, %r69, 16, 31, -1;
542
+ mov.b32 %f160, %r70;
543
+ $L__tmp27:
544
+ .loc 3 233 15
545
+ add.f32 %f161, %f159, %f160;
546
+ $L__tmp28:
547
+ .loc 3 243 36
548
+ mov.b32 %r71, %f161;
549
+ shfl.sync.bfly.b32 %r72, %r71, 8, 31, -1;
550
+ mov.b32 %f162, %r72;
551
+ $L__tmp29:
552
+ .loc 3 233 15
553
+ add.f32 %f163, %f161, %f162;
554
+ $L__tmp30:
555
+ .loc 3 243 36
556
+ mov.b32 %r73, %f163;
557
+ shfl.sync.bfly.b32 %r74, %r73, 4, 31, -1;
558
+ mov.b32 %f164, %r74;
559
+ $L__tmp31:
560
+ .loc 3 233 15
561
+ add.f32 %f165, %f163, %f164;
562
+ $L__tmp32:
563
+ .loc 3 243 36
564
+ mov.b32 %r75, %f165;
565
+ shfl.sync.bfly.b32 %r76, %r75, 2, 31, -1;
566
+ mov.b32 %f166, %r76;
567
+ $L__tmp33:
568
+ .loc 3 233 15
569
+ add.f32 %f167, %f165, %f166;
570
+ $L__tmp34:
571
+ .loc 3 243 36
572
+ mov.b32 %r77, %f167;
573
+ shfl.sync.bfly.b32 %r78, %r77, 1, 31, -1;
574
+ mov.b32 %f168, %r78;
575
+ $L__tmp35:
576
+ .loc 3 233 15
577
+ add.f32 %f169, %f167, %f168;
578
+ $L__tmp36:
579
+ .loc 3 243 36
580
+ mov.b32 %r63, %f169;
581
+ @%p35 st.shared.b32 [ %r62 + 0 ], %r63;
582
+ bar.sync 0;
583
+ @%p36 ld.shared.b32 %r64, [ %r65 + 0 ];
584
+ mov.b32 %f170, %r64;
585
+ shfl.sync.bfly.b32 %r79, %r64, 4, 31, -1;
586
+ mov.b32 %f171, %r79;
587
+ $L__tmp37:
588
+ .loc 3 233 15
589
+ add.f32 %f172, %f170, %f171;
590
+ $L__tmp38:
591
+ .loc 3 243 36
592
+ mov.b32 %r80, %f172;
593
+ shfl.sync.bfly.b32 %r81, %r80, 2, 31, -1;
594
+ mov.b32 %f173, %r81;
595
+ $L__tmp39:
596
+ .loc 3 233 15
597
+ add.f32 %f174, %f172, %f173;
598
+ $L__tmp40:
599
+ .loc 3 243 36
600
+ mov.b32 %r82, %f174;
601
+ shfl.sync.bfly.b32 %r83, %r82, 1, 31, -1;
602
+ mov.b32 %f175, %r83;
603
+ $L__tmp41:
604
+ .loc 3 233 15
605
+ add.f32 %f176, %f174, %f175;
606
+ $L__tmp42:
607
+ .loc 3 243 36
608
+ mov.b32 %r67, %f176;
609
+ @%p89 st.shared.b32 [ %r65 + 0 ], %r67;
610
+ bar.sync 0;
611
+ ld.shared.f32 %f177, [global_smem];
612
+ $L__tmp43:
613
+ .loc 1 50 30
614
+ bar.sync 0;
615
+ st.shared.f32 [global_smem], %f177;
616
+ bar.sync 0;
617
+ ld.shared.u32 %r68, [global_smem];
618
+ .loc 1 51 25
619
+ add.s64 %rd71, %rd16, %rd46;
620
+ .loc 1 51 37
621
+ @%p38 st.global.b32 [ %rd71 + 0 ], { %r68 };
622
+ .loc 1 51 4
623
+ ret;
624
+ $L__tmp44:
625
+ $L__func_end0:
626
+
627
+ }
628
+ .file 1 "/tmp/torchinductor_root/cy/ccyhhqogjmaiuaq7b54att75rswph7r3hvxgfmkjyupj74n77r6i.py"
629
+ .file 2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
630
+ .file 3 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
631
+ .section .debug_abbrev
632
+ {
633
+ .b8 1
634
+ .b8 17
635
+ .b8 1
636
+ .b8 37
637
+ .b8 8
638
+ .b8 19
639
+ .b8 5
640
+ .b8 3
641
+ .b8 8
642
+ .b8 16
643
+ .b8 6
644
+ .b8 27
645
+ .b8 8
646
+ .b8 180
647
+ .b8 66
648
+ .b8 12
649
+ .b8 17
650
+ .b8 1
651
+ .b8 18
652
+ .b8 1
653
+ .b8 0
654
+ .b8 0
655
+ .b8 2
656
+ .b8 46
657
+ .b8 0
658
+ .b8 135
659
+ .b8 64
660
+ .b8 8
661
+ .b8 3
662
+ .b8 8
663
+ .b8 58
664
+ .b8 11
665
+ .b8 59
666
+ .b8 11
667
+ .b8 63
668
+ .b8 12
669
+ .b8 32
670
+ .b8 11
671
+ .b8 0
672
+ .b8 0
673
+ .b8 3
674
+ .b8 46
675
+ .b8 1
676
+ .b8 17
677
+ .b8 1
678
+ .b8 18
679
+ .b8 1
680
+ .b8 64
681
+ .b8 10
682
+ .b8 49
683
+ .b8 19
684
+ .b8 0
685
+ .b8 0
686
+ .b8 4
687
+ .b8 29
688
+ .b8 0
689
+ .b8 49
690
+ .b8 19
691
+ .b8 17
692
+ .b8 1
693
+ .b8 18
694
+ .b8 1
695
+ .b8 88
696
+ .b8 11
697
+ .b8 89
698
+ .b8 11
699
+ .b8 87
700
+ .b8 11
701
+ .b8 0
702
+ .b8 0
703
+ .b8 5
704
+ .b8 29
705
+ .b8 1
706
+ .b8 49
707
+ .b8 19
708
+ .b8 17
709
+ .b8 1
710
+ .b8 18
711
+ .b8 1
712
+ .b8 88
713
+ .b8 11
714
+ .b8 89
715
+ .b8 11
716
+ .b8 87
717
+ .b8 11
718
+ .b8 0
719
+ .b8 0
720
+ .b8 0
721
+ }
722
+ .section .debug_info
723
+ {
724
+ .b32 359
725
+ .b8 2
726
+ .b8 0
727
+ .b32 .debug_abbrev
728
+ .b8 8
729
+ .b8 1
730
+ .b8 116
731
+ .b8 114
732
+ .b8 105
733
+ .b8 116
734
+ .b8 111
735
+ .b8 110
736
+ .b8 0
737
+ .b8 2
738
+ .b8 0
739
+ .b8 99
740
+ .b8 99
741
+ .b8 121
742
+ .b8 104
743
+ .b8 104
744
+ .b8 113
745
+ .b8 111
746
+ .b8 103
747
+ .b8 106
748
+ .b8 109
749
+ .b8 97
750
+ .b8 105
751
+ .b8 117
752
+ .b8 97
753
+ .b8 113
754
+ .b8 55
755
+ .b8 98
756
+ .b8 53
757
+ .b8 52
758
+ .b8 97
759
+ .b8 116
760
+ .b8 116
761
+ .b8 55
762
+ .b8 53
763
+ .b8 114
764
+ .b8 115
765
+ .b8 119
766
+ .b8 112
767
+ .b8 104
768
+ .b8 55
769
+ .b8 114
770
+ .b8 51
771
+ .b8 104
772
+ .b8 118
773
+ .b8 120
774
+ .b8 103
775
+ .b8 102
776
+ .b8 109
777
+ .b8 107
778
+ .b8 106
779
+ .b8 121
780
+ .b8 117
781
+ .b8 112
782
+ .b8 106
783
+ .b8 55
784
+ .b8 52
785
+ .b8 110
786
+ .b8 55
787
+ .b8 55
788
+ .b8 114
789
+ .b8 54
790
+ .b8 105
791
+ .b8 46
792
+ .b8 112
793
+ .b8 121
794
+ .b8 0
795
+ .b32 .debug_line
796
+ .b8 47
797
+ .b8 116
798
+ .b8 109
799
+ .b8 112
800
+ .b8 47
801
+ .b8 116
802
+ .b8 111
803
+ .b8 114
804
+ .b8 99
805
+ .b8 104
806
+ .b8 105
807
+ .b8 110
808
+ .b8 100
809
+ .b8 117
810
+ .b8 99
811
+ .b8 116
812
+ .b8 111
813
+ .b8 114
814
+ .b8 95
815
+ .b8 114
816
+ .b8 111
817
+ .b8 111
818
+ .b8 116
819
+ .b8 47
820
+ .b8 99
821
+ .b8 121
822
+ .b8 0
823
+ .b8 1
824
+ .b64 $L__func_begin0
825
+ .b64 $L__func_end0
826
+ .b8 2
827
+ .b8 116
828
+ .b8 114
829
+ .b8 105
830
+ .b8 116
831
+ .b8 111
832
+ .b8 110
833
+ .b8 95
834
+ .b8 95
835
+ .b8 48
836
+ .b8 100
837
+ .b8 49
838
+ .b8 100
839
+ .b8 50
840
+ .b8 100
841
+ .b8 51
842
+ .b8 100
843
+ .b8 101
844
+ .b8 52
845
+ .b8 0
846
+ .b8 116
847
+ .b8 114
848
+ .b8 105
849
+ .b8 116
850
+ .b8 111
851
+ .b8 110
852
+ .b8 95
853
+ .b8 95
854
+ .b8 48
855
+ .b8 100
856
+ .b8 49
857
+ .b8 100
858
+ .b8 50
859
+ .b8 100
860
+ .b8 51
861
+ .b8 100
862
+ .b8 101
863
+ .b8 52
864
+ .b8 0
865
+ .b8 1
866
+ .b8 18
867
+ .b8 1
868
+ .b8 1
869
+ .b8 3
870
+ .b64 $L__func_begin0
871
+ .b64 $L__func_end0
872
+ .b8 1
873
+ .b8 156
874
+ .b32 125
875
+ .b8 4
876
+ .b32 125
877
+ .b64 $L__tmp1
878
+ .b64 $L__tmp4
879
+ .b8 2
880
+ .b8 34
881
+ .b8 45
882
+ .b8 5
883
+ .b32 125
884
+ .b64 $L__tmp5
885
+ .b64 $L__tmp22
886
+ .b8 2
887
+ .b8 36
888
+ .b8 38
889
+ .b8 4
890
+ .b32 125
891
+ .b64 $L__tmp5
892
+ .b64 $L__tmp22
893
+ .b8 2
894
+ .b8 49
895
+ .b8 29
896
+ .b8 0
897
+ .b8 4
898
+ .b32 125
899
+ .b64 $L__tmp6
900
+ .b64 $L__tmp23
901
+ .b8 2
902
+ .b8 36
903
+ .b8 38
904
+ .b8 4
905
+ .b32 125
906
+ .b64 $L__tmp24
907
+ .b64 $L__tmp43
908
+ .b8 3
909
+ .b8 50
910
+ .b8 27
911
+ .b8 5
912
+ .b32 125
913
+ .b64 $L__tmp25
914
+ .b64 $L__tmp42
915
+ .b8 3
916
+ .b8 50
917
+ .b8 27
918
+ .b8 4
919
+ .b32 125
920
+ .b64 $L__tmp25
921
+ .b64 $L__tmp42
922
+ .b8 3
923
+ .b8 243
924
+ .b8 36
925
+ .b8 0
926
+ .b8 0
927
+ .b8 0
928
+ }
929
+ .section .debug_pubnames
930
+ {
931
+ .b32 $L__pubNames_end0-$L__pubNames_start0
932
+ $L__pubNames_start0:
933
+ .b8 2
934
+ .b8 0
935
+ .b32 .debug_info
936
+ .b32 363
937
+ .b32 125
938
+ .b8 116
939
+ .b8 114
940
+ .b8 105
941
+ .b8 116
942
+ .b8 111
943
+ .b8 110
944
+ .b8 95
945
+ .b8 95
946
+ .b8 48
947
+ .b8 100
948
+ .b8 49
949
+ .b8 100
950
+ .b8 50
951
+ .b8 100
952
+ .b8 51
953
+ .b8 100
954
+ .b8 101
955
+ .b8 52
956
+ .b8 0
957
+ .b32 0
958
+ $L__pubNames_end0:
959
+ }
960
+ .section .debug_pubtypes
961
+ {
962
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
963
+ $L__pubTypes_start0:
964
+ .b8 2
965
+ .b8 0
966
+ .b32 .debug_info
967
+ .b32 363
968
+ .b32 0
969
+ $L__pubTypes_end0:
970
+ }
971
+ .section .debug_loc { }
.triton/dump/fac03406d1136fc802dac111a1efea36/triton_.ttir ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {  // Triton IR dump holding one kernel: each program makes two masked passes over a 50257-element bf16 row and writes two f32 scalars.
2
+ tt.func public @triton__0d1d2d3de4(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i64) attributes {noinline = false} {  // %arg0: bf16 input; %arg1: f32 output receiving the row maximum; %arg2: f32 output receiving the sum of exp(x - max); %arg3 and %arg4 are never referenced in this body.
3
+ %c50257_i64 = arith.constant 50257 : i64  // row stride in elements; multiplied by the program id to get the row's base offset
4
+ %cst = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16>  // passed as the third operand of both tt.load ops (value used for masked-off lanes)
5
+ %cst_0 = arith.constant dense<true> : tensor<1x2048xi1>  // all-true tensor; xor-ing with it negates a predicate
6
+ %c50257_i32 = arith.constant 50257 : i32  // exclusive upper bound of both scf.for loops
7
+ %c2048_i32 = arith.constant 2048 : i32  // loop step, equal to the block width
8
+ %c0_i32 = arith.constant 0 : i32  // loop start
9
+ %cst_1 = arith.constant dense<50257> : tensor<1x2048xi64>  // column bound used to build the in-range mask
10
+ %cst_2 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32>  // initial value of the per-lane sum accumulator (second loop)
11
+ %cst_3 = arith.constant dense<0xFF800000> : tensor<1x2048xf32>  // 0xFF800000 is the f32 bit pattern of -inf; initial value of the per-lane max accumulator
12
+ %0 = tt.get_program_id x : i32  // program index along x; selects which row this program handles
13
+ %1 = arith.extsi %0 : i32 to i64  // widened so the row offset is computed in 64 bits
14
+ %2 = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32>  // lane indices 0..2047
15
+ %3 = tt.expand_dims %2 {axis = 0 : i32} : (tensor<2048xi32>) -> tensor<1x2048xi32>
16
+ %4 = arith.extsi %3 : tensor<1x2048xi32> to tensor<1x2048xi64>  // lane indices as 1x2048 i64, reused by both loops
17
+ %5 = arith.muli %1, %c50257_i64 : i64  // element offset of this program's row
18
+ %6 = tt.splat %5 : (i64) -> tensor<1x2048xi64>
19
+ %7 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1x2048x!tt.ptr<bf16, 1>>
20
+ %8 = scf.for %arg5 = %c0_i32 to %c50257_i32 step %c2048_i32 iter_args(%arg6 = %cst_3) -> (tensor<1x2048xf32>) : i32 {  // pass 1: per-lane running maximum; %arg5 = 0, 2048, ..., 49152 (25 iterations), %arg6 is the accumulator
21
+ %22 = arith.extsi %arg5 : i32 to i64
22
+ %23 = tt.splat %22 : (i64) -> tensor<1x2048xi64>
23
+ %24 = arith.addi %23, %4 : tensor<1x2048xi64>  // column index of each lane in this chunk
24
+ %25 = arith.cmpi slt, %24, %cst_1 : tensor<1x2048xi64>  // mask: true where the column index is below 50257
25
+ %26 = arith.addi %24, %6 : tensor<1x2048xi64>  // flat element offset = column + row offset
26
+ %27 = tt.addptr %7, %26 : tensor<1x2048x!tt.ptr<bf16, 1>>, tensor<1x2048xi64>
27
+ %28 = tt.load %27, %25, %cst {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x2048xbf16>  // masked load of up to 2048 bf16 values
28
+ %29 = arith.extf %28 : tensor<1x2048xbf16> to tensor<1x2048xf32>
29
+ %30 = arith.cmpf ogt, %arg6, %29 : tensor<1x2048xf32>  // accumulator already greater than the new value
30
+ %31 = arith.cmpf une, %arg6, %arg6 : tensor<1x2048xf32>  // true exactly where the accumulator is NaN
31
+ %32 = arith.ori %30, %31 : tensor<1x2048xi1>  // "keep accumulator" predicate
32
+ %33 = arith.xori %32, %cst_0 : tensor<1x2048xi1>  // negated: "take the new value"
33
+ %34 = arith.andi %25, %33 : tensor<1x2048xi1>  // only in-range lanes may be updated
34
+ %35 = arith.select %34, %29, %arg6 : tensor<1x2048xi1>, tensor<1x2048xf32>  // new accumulator; a NaN accumulator is never replaced
35
+ scf.yield %35 : tensor<1x2048xf32>
36
+ }
37
+ %9 = "tt.reduce"(%8) <{axis = 1 : i32}> ({  // fold the 2048 per-lane maxima into one value along axis 1
38
+ ^bb0(%arg5: f32, %arg6: f32):  // combiner: returns %arg5 if %arg5 > %arg6 or %arg5 is NaN, otherwise %arg6
39
+ %22 = arith.cmpf ogt, %arg5, %arg6 : f32
40
+ %23 = arith.cmpf une, %arg5, %arg5 : f32
41
+ %24 = arith.ori %22, %23 : i1
42
+ %25 = arith.select %24, %arg5, %arg6 : f32
43
+ tt.reduce.return %25 : f32
44
+ }) : (tensor<1x2048xf32>) -> tensor<1xf32>
45
+ %10 = tt.expand_dims %9 {axis = 1 : i32} : (tensor<1xf32>) -> tensor<1x1xf32>
46
+ %11 = tt.addptr %arg1, %1 : !tt.ptr<f32, 1>, i64  // output slot indexed by program id
47
+ %12 = tt.splat %11 : (!tt.ptr<f32, 1>) -> tensor<1x1x!tt.ptr<f32, 1>>
48
+ tt.store %12, %10 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xf32>  // write the row maximum to %arg1[program id]
49
+ %13 = arith.muli %1, %c50257_i64 : i64  // same row offset as %5, recomputed for the second pass
50
+ %14 = tt.splat %13 : (i64) -> tensor<1x2048xi64>
51
+ %15 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1x2048x!tt.ptr<bf16, 1>>
52
+ %16 = tt.broadcast %10 : (tensor<1x1xf32>) -> tensor<1x2048xf32>  // row maximum replicated across all lanes
53
+ %17 = scf.for %arg5 = %c0_i32 to %c50257_i32 step %c2048_i32 iter_args(%arg6 = %cst_2) -> (tensor<1x2048xf32>) : i32 {  // pass 2: per-lane running sum of exp(x - max) over the same 25 chunks
54
+ %22 = arith.extsi %arg5 : i32 to i64
55
+ %23 = tt.splat %22 : (i64) -> tensor<1x2048xi64>
56
+ %24 = arith.addi %23, %4 : tensor<1x2048xi64>  // column index of each lane in this chunk
57
+ %25 = arith.cmpi slt, %24, %cst_1 : tensor<1x2048xi64>  // mask: true where the column index is below 50257
58
+ %26 = arith.addi %24, %14 : tensor<1x2048xi64>
59
+ %27 = tt.addptr %15, %26 : tensor<1x2048x!tt.ptr<bf16, 1>>, tensor<1x2048xi64>
60
+ %28 = tt.load %27, %25, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x2048xbf16>  // masked reload of the same row data (differs from pass 1 only in the evict attribute)
61
+ %29 = arith.extf %28 : tensor<1x2048xbf16> to tensor<1x2048xf32>
62
+ %30 = arith.subf %29, %16 : tensor<1x2048xf32>  // x - row maximum
63
+ %31 = math.exp %30 : tensor<1x2048xf32>
64
+ %32 = arith.addf %arg6, %31 : tensor<1x2048xf32>
65
+ %33 = arith.select %25, %32, %arg6 : tensor<1x2048xi1>, tensor<1x2048xf32>  // out-of-range lanes keep their previous sum, so masked-off loads contribute nothing
66
+ scf.yield %33 : tensor<1x2048xf32>
67
+ }
68
+ %18 = "tt.reduce"(%17) <{axis = 1 : i32}> ({  // add the 2048 per-lane partial sums along axis 1
69
+ ^bb0(%arg5: f32, %arg6: f32):
70
+ %22 = arith.addf %arg5, %arg6 : f32
71
+ tt.reduce.return %22 : f32
72
+ }) : (tensor<1x2048xf32>) -> tensor<1xf32>
73
+ %19 = tt.expand_dims %18 {axis = 1 : i32} : (tensor<1xf32>) -> tensor<1x1xf32>
74
+ %20 = tt.addptr %arg2, %1 : !tt.ptr<f32, 1>, i64  // output slot indexed by program id
75
+ %21 = tt.splat %20 : (!tt.ptr<f32, 1>) -> tensor<1x1x!tt.ptr<f32, 1>>
76
+ tt.store %21, %19 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xf32>  // write the exp-sum to %arg2[program id]
77
+ tt.return  // NOTE(review): max followed by sum of exp(x - max) looks like the statistics stage of a softmax-style normalisation; the consumer is not visible in this dump, so confirm
78
+ }
79
+ }