0-hero committed on
Commit
934a9ba
·
verified ·
1 Parent(s): 71c6277

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .launchpadlib/api.launchpad.net/cache/api.launchpad.net,devel,-application,vnd.sun.wadl+xml,2f09acb494bdefdbf8ef0d1396a05e86 +0 -0
  2. .local/share/jupyter/nbextensions/printview/main.js +75 -0
  3. .local/share/jupyter/nbextensions/python-markdown/main.js +212 -0
  4. .local/share/jupyter/nbextensions/qtconsole/qtconsole.yaml +6 -0
  5. .local/share/jupyter/nbextensions/rubberband/main.css +12 -0
  6. .local/share/jupyter/nbextensions/rubberband/rubberband.yaml +7 -0
  7. .local/share/jupyter/nbextensions/ruler/ruler.yaml +32 -0
  8. .local/share/jupyter/nbextensions/runtools/main.js +745 -0
  9. .local/share/jupyter/nbextensions/runtools/runtools_lock.png +0 -0
  10. .local/share/jupyter/nbextensions/scratchpad/README.md +14 -0
  11. .local/share/jupyter/nbextensions/skill/README.md +15 -0
  12. .triton/cache/6e97c2a1f7a095255f6dd5de1807841d/cuda_utils.so +0 -0
  13. .triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.ptx +807 -0
  14. .triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.ttir +53 -0
  15. .triton/dump/305a9479aab997a3a16bfe46bb303a50/triton_.cubin +0 -0
  16. .triton/dump/305a9479aab997a3a16bfe46bb303a50/triton_.ptx +1041 -0
  17. .triton/dump/415aac87553b7d064f52694fa7254686/triton_.llir +860 -0
  18. .triton/dump/44b225411009956bfbae22f8bac7d703/triton_.cubin +0 -0
  19. .triton/dump/44b225411009956bfbae22f8bac7d703/triton_.llir +304 -0
  20. .triton/dump/44b225411009956bfbae22f8bac7d703/triton_.ttir +61 -0
  21. .triton/dump/4c6ad48573c74d55ed79384f6b432d50/triton_.ttir +18 -0
  22. .triton/dump/4d7b96448927b8146af43cb9f39e0544/triton_.llir +362 -0
  23. .triton/dump/4d7b96448927b8146af43cb9f39e0544/triton_.ptx +486 -0
  24. .triton/dump/4d7b96448927b8146af43cb9f39e0544/triton_.ttgir +38 -0
  25. .triton/dump/4d7b96448927b8146af43cb9f39e0544/triton_.ttir +37 -0
  26. .triton/dump/645565eaba0a18dd23ef200fe9abb0c0/triton_.cubin +0 -0
  27. .triton/dump/645565eaba0a18dd23ef200fe9abb0c0/triton_.ptx +834 -0
  28. .triton/dump/645565eaba0a18dd23ef200fe9abb0c0/triton_.ttgir +98 -0
  29. .triton/dump/7dc5bb3e5c2bb99527fff34c6fba7810/triton_.llir +42 -0
  30. .triton/dump/7dc5bb3e5c2bb99527fff34c6fba7810/triton_.ttir +17 -0
  31. .triton/dump/99f0a4c15ca0aab38ccdae6c765f7333/triton_.cubin +0 -0
  32. .triton/dump/99f0a4c15ca0aab38ccdae6c765f7333/triton_.llir +333 -0
  33. .triton/dump/99f0a4c15ca0aab38ccdae6c765f7333/triton_.ttgir +68 -0
  34. .triton/dump/9a2fb05196b13393bea452d08e9aaca8/triton_.llir +54 -0
  35. .triton/dump/9a2fb05196b13393bea452d08e9aaca8/triton_.ttgir +19 -0
  36. .triton/dump/a69784da01a97187168f22847465505f/triton_.cubin +0 -0
  37. .triton/dump/a69784da01a97187168f22847465505f/triton_.ttgir +73 -0
  38. .triton/dump/a75e14a8d2d1ec8471f1c7b615552f8c/triton_.cubin +0 -0
  39. .triton/dump/a75e14a8d2d1ec8471f1c7b615552f8c/triton_.ptx +295 -0
  40. .triton/dump/a75e14a8d2d1ec8471f1c7b615552f8c/triton_.ttir +18 -0
  41. .triton/dump/ab89f85e55d5ddd9676325b49df9419f/triton_.ptx +717 -0
  42. .triton/dump/ab89f85e55d5ddd9676325b49df9419f/triton_.ttgir +68 -0
  43. .triton/dump/be28ee3793470d9803546f03e49c3edf/triton_.cubin +0 -0
  44. .triton/dump/be28ee3793470d9803546f03e49c3edf/triton_.ttgir +85 -0
  45. .triton/dump/c0c34db8f5ff22b79fc971c63187477a/triton_.cubin +0 -0
  46. .triton/dump/c0c34db8f5ff22b79fc971c63187477a/triton_.ttir +84 -0
  47. .triton/dump/c0db4dd81e5aac83500e3ccf67d3896d/triton_.llir +53 -0
  48. .triton/dump/d7a12c0ba96f8920b8147157303ee99f/triton_.ttir +62 -0
  49. .triton/dump/f5088324dcdcf6814f6743553c1321c2/triton_.llir +63 -0
  50. .triton/dump/f5088324dcdcf6814f6743553c1321c2/triton_.ptx +300 -0
.launchpadlib/api.launchpad.net/cache/api.launchpad.net,devel,-application,vnd.sun.wadl+xml,2f09acb494bdefdbf8ef0d1396a05e86 ADDED
The diff for this file is too large to render. See raw diff
 
.local/share/jupyter/nbextensions/printview/main.js ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // call "jupyter nbconvert" and open generated html file in new tab
2
+
3
+ define([
4
+ 'base/js/namespace',
5
+ 'jquery',
6
+ 'base/js/events',
7
+ 'base/js/utils'
8
+ ], function(
9
+ IPython,
10
+ $,
11
+ events,
12
+ utils
13
+ ) {
14
+ "use strict";
15
+
16
+ var nbconvert_options = '--to html';
17
+ var extension = '.html';
18
+ var open_tab = true;
19
+
20
/**
 * Read the printview settings from the notebook config.
 *
 * Updates the module-level `nbconvert_options`, `extension` and `open_tab`
 * variables from `printview_nbconvert_options` / `printview_open_tab` when
 * those keys are present in the config data.
 */
var initialize = function () {
    var data = IPython.notebook.config.data;
    var has = function (key) {
        // Call through Object.prototype so a config key named
        // "hasOwnProperty" cannot shadow the check.
        return Object.prototype.hasOwnProperty.call(data, key);
    };

    if (has('printview_nbconvert_options')) {
        nbconvert_options = data.printview_nbconvert_options;
        // indexOf(...) !== -1 also accepts a match at position 0, which the
        // former `search(...) > 0` test silently ignored.
        if (nbconvert_options.indexOf('pdf') !== -1) extension = '.pdf';
        if (nbconvert_options.indexOf('slides') !== -1) extension = '.slides.html';
    }

    // Only a real boolean overrides the default; anything else is ignored.
    if (has('printview_open_tab') && typeof data.printview_open_tab === "boolean") {
        open_tab = data.printview_open_tab;
    }
};
36
+
37
/**
 * Run "jupyter nbconvert" on the current notebook through the kernel and,
 * if `open_tab` is true, open the generated file in a new browser tab once
 * the kernel replies.
 *
 * Registered as a 'notebook_saved.Notebook' handler; it unregisters itself
 * first so that one save triggers one conversion.
 */
var callNbconvert = function () {
    // Remove only this handler: a bare off(eventName) would also drop the
    // listeners other code attached to the same event.
    events.off('notebook_saved.Notebook', callNbconvert);
    var kernel = IPython.notebook.kernel;
    var name = IPython.notebook.notebook_name;
    // JSON.stringify yields a double-quoted, fully escaped literal that the
    // kernel-side Python parses as a string, and the argument list is handed
    // to subprocess without a shell, so quotes or shell metacharacters in the
    // notebook name can neither break the command nor inject another one.
    // NOTE(review): options are split with shlex and no longer go through a
    // shell, so shell expansions inside printview_nbconvert_options are not
    // performed any more - confirm nobody relies on that.
    var command = 'import shlex, subprocess; ' +
        'subprocess.call(["jupyter", "nbconvert"] + shlex.split(' +
        JSON.stringify(nbconvert_options) + ') + [' + JSON.stringify(name) + '])';
    function callback() {
        if (open_tab === true) {
            var url = utils.splitext(name)[0] + extension;
            window.open(url, '_blank');
        }
    }
    kernel.execute(command, { shell: { reply : callback } });
    $('#doPrintView').blur();
};
55
+
56
/**
 * Toolbar/action handler: attach callNbconvert to the notebook-saved event,
 * then ask the notebook to save itself.
 */
var nbconvertPrintView = function () {
    var saved_event = 'notebook_saved.Notebook';
    events.on(saved_event, callNbconvert);
    IPython.notebook.save_notebook(false);
};
60
+
61
/**
 * Extension entry point: register the print-view action, add its toolbar
 * button (tagged with id "doPrintView") and run `initialize` once the
 * notebook config has loaded.
 *
 * @returns the promise produced by chaining `initialize` onto config.loaded
 */
var load_ipython_extension = function() {
    var action = IPython.keyboard_manager.actions.register({
        help : 'Create static print view',
        icon : 'fa-print',
        handler: nbconvertPrintView
    }, 'create-static-printview', 'printview');

    var button_group = IPython.toolbar.add_buttons_group([action]);
    $(button_group).find('.btn').attr('id', 'doPrintView');

    return IPython.notebook.config.loaded.then(initialize);
};
71
+
72
+ return {
73
+ load_ipython_extension : load_ipython_extension
74
+ };
75
+ });
.local/share/jupyter/nbextensions/python-markdown/main.js ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Allow Python-code in markdown cells
2
+ // Encapsulate using {{...}}
3
+ // - You can also return html or markdown from your Python code
4
+ // - You can embed images, however they will be sanitized on reload.
5
+
6
+ // TODO: Markdown cells will only be reevaluated when a notebook is dirty
7
+ // (i.e. you have made changes). If you save it before reevaluating MD cells,
8
+ // they will show the old value.
9
+
10
+ define([
11
+ 'base/js/namespace',
12
+ 'jquery',
13
+ 'require',
14
+ 'notebook/js/cell',
15
+ 'base/js/security',
16
+ 'components/marked/lib/marked',
17
+ 'base/js/events',
18
+ 'notebook/js/textcell'
19
+ ], function(IPython, $, requirejs, cell, security, marked, events, textcell) {
20
+ "use strict";
21
+
22
/*
 * Remove the <p>...</p> wrapper that marked may add around a single
 * paragraph, then remove surrounding &#39; entities (quotes of a string repr).
 *
 * @param html {String} output of marked()
 * @return {String} unwrapped html
 */
var strip_marked_wrapping = function (html) {
    // [\s\S] is used to also catch newlines
    var paragraph = html.match(/^\s*<p>([\s\S]*?)<\/p>\s*$/);
    var inner = paragraph !== null ? paragraph[1] : html;
    var quoted = inner.match(/^&#39;([\s\S]*?)&#39;$/);
    return quoted !== null ? quoted[1] : inner;
};

/*
 * Turn the mime bundle of an execute_result/display_data message into html.
 * The first available representation wins, in this order: latex, svg, jpeg,
 * png, markdown, html, plain text.
 *
 * @param bundle {Object} content.data of the message (may be undefined)
 * @return {String|undefined} html, or undefined if nothing usable was sent
 */
var html_from_mime_bundle = function (bundle) {
    if (bundle == null) return undefined;
    if (bundle['text/latex'] != null) {
        return bundle['text/latex'];
    }
    if (bundle['image/svg+xml'] != null) {
        /* embed SVG in an <img> tag, still get eaten by sanitizer... */
        // btoa() throws on characters outside Latin-1, so encode the markup
        // as UTF-8 bytes first.
        var svg = btoa(unescape(encodeURIComponent(bundle['image/svg+xml'])));
        return '<img src="data:image/svg+xml;base64,' + svg + '"/>';
    }
    if (bundle['image/jpeg'] != null) {
        return '<img src="data:image/jpeg;base64,' + bundle['image/jpeg'] + '"/>';
    }
    if (bundle['image/png'] != null) {
        return '<img src="data:image/png;base64,' + bundle['image/png'] + '"/>';
    }
    if (bundle['text/markdown'] != null) {
        return marked(bundle['text/markdown']);
    }
    if (bundle['text/html'] != null) {
        return bundle['text/html'];
    }
    if (bundle['text/plain'] != null) {
        return strip_marked_wrapping(marked(bundle['text/plain']));
    }
    return undefined;
};

/*
 * Find Python expressions enclosed in {{ }}, execute them and add
 * <span> tags to the text. The actual content gets filled in later by a
 * callback. Already executed expressions are cached in cell metadata
 * (cell.metadata.variables, keyed by the expression text).
 *
 * @method execute_python
 * @param cell {Cell} notebook cell
 * @param text {String} text in cell
 * @return {String|undefined} text with the expressions replaced, or
 *         undefined if the notebook is untrusted or no {{ }} was found
 */
var execute_python = function(cell, text) {
    /* never execute code in untrusted notebooks */
    if (IPython.notebook.trusted === false) {
        return undefined;
    }
    /* always clear stored variables if notebook is dirty */
    if (IPython.notebook.dirty === true) delete cell.metadata.variables;

    // search for code in double curly braces: {{}}
    var found = false;
    var newtext = text.replace(/{{(.*?)}}/g, function(match, tag, cha) {
        found = true;
        // A replace callback's return value is stringified, so returning
        // undefined would print "undefined"; return the match to leave the
        // text untouched instead.
        if (tag === "") return match;
        var id = 'python_' + cell.cell_id + '_' + cha; /* create an individual ID */

        /* there are two possible options:
           a) notebook dirty or variable not stored in metadata: evaluate variable
           b) notebook clean and variable stored in metadata: display stored value
         */
        if (typeof cell.metadata.variables === "undefined") {
            cell.metadata.variables = {};
        }
        var val = cell.metadata.variables[tag];
        var needs_evaluation = IPython.notebook.dirty === true ||
            val === undefined || $.isEmptyObject(val);
        if (!needs_evaluation) {
            /* Notebook not dirty: replace tags with metadata */
            return "<span id='" + id + "'>" + val + "</span>";
        }

        cell.metadata.variables[tag] = {};
        var execute_callback = function (out_data) {
            var html;
            var msg_type = out_data.msg_type;
            if (msg_type === "error") {
                html = marked("**" + out_data.content.ename + "**: " + out_data.content.evalue);
            } else if (msg_type === "stream") {
                html = strip_marked_wrapping(marked(out_data.content.text));
            } else if (msg_type === "execute_result" || msg_type === "display_data") {
                html = html_from_mime_bundle(out_data.content.data);
            } else {
                return;
            }
            // Nothing usable arrived: keep the empty cache entry so the
            // expression is evaluated again next time.
            if (html === undefined) return;
            cell.metadata.variables[tag] = html;
            // The placeholder is gone if the cell was re-rendered before the
            // kernel answered.
            var el = document.getElementById(id);
            if (el !== null) {
                el.innerHTML = el.innerHTML + html; // output result
            }
        };
        var callbacks = { iopub : { output: execute_callback } };
        if (cell.notebook.kernel != null) {
            cell.notebook.kernel.execute(tag, callbacks, {silent: false, store_history : false, stop_on_error: false });
            return "<span id='" + id + "'></span>"; // add HTML tag with ID where output will be placed
        }
        // No kernel: leave the expression in place.
        return match;
    });
    return found ? newtext : undefined;
};
121
+
122
/*
 * Render markdown cell and replace {{...}} with python code
 *
 * Reads the html of the cell's rendered div, passes it through
 * execute_python and, if that returns new text, writes it back and queues a
 * MathJax typeset of the element. Cells without a rendered div are skipped.
 *
 * @param cell {Cell} notebook cell
 */
var render_cell = function(cell) {
    var element = cell.element.find('div.text_cell_render');
    // update_md_cells() passes every cell carrying "variables" metadata;
    // without a rendered div there is nothing to rewrite.
    if (element.length === 0) {
        return;
    }
    var target = element[0];
    var text = execute_python(cell, target.innerHTML);
    if (text !== undefined) {
        target.innerHTML = text;
        MathJax.Hub.Queue(["Typeset", MathJax.Hub, target]);
    }
};
134
+
135
/*
 * Wrap MarkdownCell.render: while the notebook is dirty, clear the cell's
 * `rendered` flag before delegating to the original render method.
 */
var original_render = textcell.MarkdownCell.prototype.render;
textcell.MarkdownCell.prototype.render = function() {
    var notebook_is_dirty = IPython.notebook.dirty === true;
    if (notebook_is_dirty) {
        this.rendered = false;
    }
    return original_render.apply(this);
};
143
+
144
/**
 * Update title and icon class of every `.notebook-trusted` element to
 * reflect whether the notebook is currently trusted.
 */
var set_trusted_indicator = function() {
    var trusted = IPython.notebook.trusted === true;
    $('.notebook-trusted')
        .attr('title', trusted ? 'Notebook is trusted' : 'Notebook is not trusted')
        .removeClass(trusted ? 'fa-question' : 'fa-check')
        .addClass(trusted ? 'fa-check' : 'fa-question');
};
156
+
157
+
158
/**
 * Append a stylesheet <link> for the given file to the document head.
 *
 * @param name filename, resolved to a URL with requirejs.toUrl
 */
var load_css = function (name) {
    var stylesheet = document.createElement("link");
    stylesheet.rel = "stylesheet";
    stylesheet.type = "text/css";
    stylesheet.href = requirejs.toUrl(name);

    var head = document.getElementsByTagName("head")[0];
    head.appendChild(stylesheet);
};
170
+
171
+
172
/**
 * Re-render every cell whose metadata has a "variables" entry, so that the
 * referenced variables are shown again.
 */
var update_md_cells = function () {
    var total = IPython.notebook.ncells();
    var all_cells = IPython.notebook.get_cells();
    var index = 0;
    while (index < total) {
        var current = all_cells[index];
        if (current.metadata.hasOwnProperty('variables')) {
            render_cell(current);
        }
        index += 1;
    }
};
186
+
187
/**
 * Extension entry point: load the stylesheet, hook markdown rendering and
 * trust changes, add the trust indicator icon to #save_widget, and refresh
 * stored values once kernel and notebook are ready.
 */
var load_ipython_extension = function() {
    load_css('./main.css');
    events.on("rendered.MarkdownCell", function (event, data) {
        render_cell(data.cell);
    });
    events.on("trust_changed.Notebook", set_trusted_indicator);

    $('#save_widget').append('<i id="notebook-trusted-indicator" class="fa fa-question notebook-trusted" />');
    set_trusted_indicator();

    /* Show values stored in metadata on reload */
    // kernel_ready can fire several times before the notebook has finished
    // loading; wait with a single one-shot handler instead of stacking a new
    // permanent one on every kernel_ready.
    var waiting_for_notebook = false;
    events.on("kernel_ready.Kernel", function () {
        // Use the imported namespace rather than relying on a global.
        if (IPython.notebook !== undefined && IPython.notebook._fully_loaded) {
            update_md_cells();
        } else if (!waiting_for_notebook) {
            waiting_for_notebook = true;
            events.one("notebook_loaded.Notebook", function () {
                waiting_for_notebook = false;
                update_md_cells();
            });
        }
    });
};
208
+
209
+ return {
210
+ load_ipython_extension : load_ipython_extension
211
+ };
212
+ });
.local/share/jupyter/nbextensions/qtconsole/qtconsole.yaml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ Type: IPython Notebook Extension
2
+ Name: Launch QTConsole
3
+ Link: README.md
4
+ Description: Launch a QTConsole attached to the running kernel
5
+ Main: qtconsole.js
6
+ Compatibility: 4.x
.local/share/jupyter/nbextensions/rubberband/main.css ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .highlight-drag
2
+ {
3
+ background-color: transparent;
4
+ border: dashed #ff3333 3px;
5
+ position: absolute;
6
+ display: none;
7
+ }
8
+
9
+ .cell.selected
10
+ {
11
+ background-color: #fcfcfc;
12
+ }
.local/share/jupyter/nbextensions/rubberband/rubberband.yaml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ Type: IPython Notebook Extension
2
+ Name: Rubberband
3
+ Description: The rubberband extension allows selecting multiple cells
4
+ Link: readme.md
5
+ Icon: icon.png
6
+ Main: main.js
7
+ Compatibility: 4.x, 5.x
.local/share/jupyter/nbextensions/ruler/ruler.yaml ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Type: IPython Notebook Extension
2
+ Name: Ruler
3
+ Description: This extension enables the Ruler CodeMirror feature
4
+ Link: readme.md
5
+ Icon: icon.png
6
+ Main: main.js
7
+ Compatibility: 4.x, 5.x
8
+ Parameters:
9
+
10
+ - name: ruler_column
11
+ input_type: list
12
+ list_element:
13
+ input_type: number
14
+ description: Column where ruler is displayed
15
+ default: [78]
16
+
17
+ - name: ruler_color
18
+ input_type: list
19
+ list_element:
20
+ input_type: color
21
+ description: Ruler color
22
+ default: ["#ff0000"]
23
+
24
+ - name: ruler_linestyle
25
+ description: 'Ruler style, e.g. solid, dashed'
26
+ input_type: list
27
+ default: ['dashed']
28
+
29
+ - name: ruler_do_css_patch
30
+ description: apply css patch for ruler padding bug in notebook >= 4.3
31
+ input_type: checkbox
32
+ default: true
.local/share/jupyter/nbextensions/runtools/main.js ADDED
@@ -0,0 +1,745 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Extended code execution commands and more
2
+
3
+ define([
4
+ 'base/js/namespace',
5
+ 'jquery',
6
+ 'require',
7
+ 'base/js/events',
8
+ 'services/config',
9
+ 'base/js/utils',
10
+ 'notebook/js/codecell'
11
+ ], function(Jupyter, $, requirejs, events, configmod, utils, codecell) {
12
+ "use strict";
13
+
14
+ var run_list = []; /* list of cells to be run */
15
+
16
+ // define default config parameter values
17
+ var params = {
18
+ run_cells_above: 'Alt-a',
19
+ run_cells_below: 'Alt-b',
20
+ toggle_marker: 'Alt-t',
21
+ mark_all_codecells: 'Alt-m',
22
+ unmark_all_codecells: 'Alt-u',
23
+ run_marked_cells: 'Alt-r',
24
+ run_all_cells: 'Alt-x',
25
+ run_all_cells_ignore_errors: 'Alt-f',
26
+ stop_execution: 'Ctrl-c',
27
+ marked_color: '#20f224',
28
+ scheduled_color: '#00def0',
29
+ run_color: '#f30a2d'
30
+ };
31
+
32
/**
 * Attach the gutter-click handler to every code cell and colour the gutter
 * of the cells that are already marked.
 */
function add_gutter_events() {
    var total = Jupyter.notebook.ncells();
    var all_cells = Jupyter.notebook.get_cells();
    for (var index = 0; index < total; index += 1) {
        var current = all_cells[index];
        if (current.cell_type !== "code") {
            continue;
        }
        current.code_mirror.on("gutterClick", changeEvent);
        if (is_marked(current)) {
            var gutter = current.code_mirror.getGutterElement();
            $(gutter).css({
                "background-color": params.marked_color
            });
        }
    }
}
52
+
53
/*
 * Set up the extension once the config is available: merge the stored
 * "runtools" settings into `params`, wire up the gutters, add the toolbar
 * toggle button, register the keyboard shortcuts in both command and edit
 * mode, and listen for finished cell executions.
 */
function initialize() {
    $.extend(true, params, Jupyter.notebook.config.data.runtools);

    add_gutter_events();

    /* Add run control buttons to toolbar */
    var toggle_action = Jupyter.keyboard_manager.actions.register({
        help: 'Toggle Runtools Toolbar',
        icon: 'fa-cogs',
        handler: toggle_toolbar
    }, 'toggle-runtools-toolbar', 'runtools');
    $(Jupyter.toolbar.add_buttons_group([toggle_action]))
        .find('.btn')
        .attr('id', 'toggle_runtools')
        .css({
            'outline': 'none'
        });

    /* Build one shortcut entry; the handler runs the action and returns false */
    var make_shortcut = function (help, help_index, action) {
        return {
            help: help,
            help_index: help_index,
            handler: function() {
                action();
                return false;
            }
        };
    };

    /* Run everything, then scroll back to where the notebook was */
    var run_all_and_restore_scroll = function () {
        var pos = Jupyter.notebook.element.scrollTop();
        execute_all_cells();
        Jupyter.notebook.element.animate({
            scrollTop: pos
        }, 100);
    };

    /* Add keyboard shortcuts */
    var add_command_shortcuts = {};
    add_command_shortcuts[params["run_cells_above"]] =
        make_shortcut('Run cells above', 'xa', function () { execute_cells_above(); });
    add_command_shortcuts[params["run_cells_below"]] =
        make_shortcut('Run cells below', 'aa', function () { execute_cells_below(); });
    add_command_shortcuts[params["toggle_marker"]] =
        make_shortcut('Toggle marker', 'mt', function () { toggle_marker(); });
    add_command_shortcuts[params["mark_all_codecells"]] =
        make_shortcut('Mark all codecells', 'ma', function () { mark_all(); });
    add_command_shortcuts[params["unmark_all_codecells"]] =
        make_shortcut('Unmark all codecells', 'mu', function () { mark_none(); });
    add_command_shortcuts[params["run_marked_cells"]] =
        make_shortcut('Run marked cells', 'rm', function () { run_marked_cells(); });
    add_command_shortcuts[params["run_all_cells"]] =
        make_shortcut('Run all cells', 'ra', run_all_and_restore_scroll);
    add_command_shortcuts[params["run_all_cells_ignore_errors"]] =
        make_shortcut('Run all cells - ignore errors', 'rf', function () { run_all_cells_ignore_errors(); });

    Jupyter.keyboard_manager.command_shortcuts.add_shortcuts(add_command_shortcuts);
    Jupyter.keyboard_manager.edit_shortcuts.add_shortcuts(add_command_shortcuts);

    events.on('finished_execute.CodeCell', finished_execute_event);
}
147
+
148
/**
 * Show or hide the input or output part of a cell and record the state in
 * the cell metadata (hide_input / hide_output).
 *
 * @param cell
 * @param io 'i' selects the cell input; any other value selects the output
 * @param showme {Boolean} only the value true shows the part, everything
 *        else hides it
 */
function showCell(cell, io, showme) {
    var is_input = io === 'i';
    var visible = showme === true;
    var part = cell.element.find(is_input ? "div.input" : 'div.output');

    if (visible) {
        part.show();
    } else {
        part.hide();
    }

    if (is_input) {
        cell.metadata.hide_input = !visible;
    } else {
        cell.metadata.hide_output = !visible;
    }
}
174
+
175
/**
 * Apply showCell to every marked cell of the notebook.
 *
 * @param show {Boolean} passed to showCell as the show/hide flag
 * @param char 'i' or 'o', passed to showCell as the part selector
 */
function _show_input_output_of_marked(show, char) {
    Jupyter.notebook.get_cells().forEach(function (candidate) {
        if (is_marked(candidate)) {
            showCell(candidate, char, show);
        }
    });
}
184
+
185
/**
 * Show or hide the input area of all marked code cells.
 *
 * @param show {Boolean} true shows, anything else hides
 */
function show_input(show) {
    var part = 'i';
    _show_input_output_of_marked(show, part);
}

/**
 * Show or hide the output area of all marked code cells.
 *
 * @param {Boolean} show true shows, anything else hides
 */
function show_output(show) {
    var part = 'o';
    _show_input_output_of_marked(show, part);
}
202
+
203
+
204
/**
 * Take cells off the front of run_list until one is found that is still in
 * the notebook and still marked; colour its gutter with the "running"
 * colour and execute it. Entries that no longer qualify are dropped.
 */
function execute_next_marked_cell() {
    var current_cells = Jupyter.notebook.get_cells();
    while (run_list.length > 0) {
        var candidate = run_list.shift();
        if (current_cells.indexOf(candidate) === -1) {
            continue;
        }
        var control = candidate.metadata.run_control;
        if (control === undefined || control.marked !== true) {
            continue;
        }
        var gutter = candidate.code_mirror.getGutterElement();
        $(gutter).css({
            "background-color": params.run_color
        });
        candidate.execute();
        return;
    }
}
227
+
228
/**
 * Execute the cells in the index range [idx_start, idx_end) by calling
 * execute() on each one directly.
 *
 * @param idx_start first index to run; undefined means 0
 * @param idx_end index after the last cell to run; undefined means all
 *        remaining cells
 * @param stop_on_error forwarded unchanged to each cell's execute()
 */
function _execute_without_selecting(idx_start, idx_end, stop_on_error) {
    // notebook.execute_cells alters selection, this doesn't
    var all_cells = Jupyter.notebook.get_cells();
    if (idx_start === undefined) {
        idx_start = 0;
    }
    if (idx_end === undefined) {
        idx_end = all_cells.length;
    }
    var position = idx_start;
    while (position < idx_end) {
        all_cells[position].execute(stop_on_error);
        position++;
    }
}
237
+
238
/** Run the cells from the top of the notebook up to (not including) the selected one. */
function execute_cells_above() {
    var selected = Jupyter.notebook.get_selected_index();
    _execute_without_selecting(0, selected);
}

/** Run the cells from the selected one to the end of the notebook. */
function execute_cells_below() {
    var selected = Jupyter.notebook.get_selected_index();
    _execute_without_selecting(selected, undefined);
}

/**
 * Run every cell of the notebook.
 *
 * @param stop_on_error forwarded unchanged to each cell's execute()
 */
function execute_all_cells(stop_on_error) {
    var first = 0;
    _execute_without_selecting(first, undefined, stop_on_error);
}
249
+
250
/**
 * Rebuild run_list from all non-empty code cells marked in metadata, colour
 * their gutters with the "scheduled" colour and start executing the list.
 */
function run_marked_cells() {
    var all_cells = Jupyter.notebook.get_cells();
    run_list = [];
    all_cells.forEach(function (candidate) {
        if (!(candidate instanceof codecell.CodeCell)) {
            return;
        }
        var editor = candidate.code_mirror;
        var final_line = editor.lastLine();
        // A cell counts as empty when its only line is the empty string.
        var is_empty = (final_line === 0 && editor.getLine(final_line) === "");
        var control = candidate.metadata.run_control;
        if (control === undefined || is_empty !== false) {
            return;
        }
        if (control.marked !== true) {
            return;
        }
        /* Show the cell as scheduled to be run with new gutter background color */
        $(editor.getGutterElement()).css({
            "background-color": params.scheduled_color
        });
        run_list.push(candidate);
    });
    execute_next_marked_cell();
}
277
+
278
/*
 * Handler for 'finished_execute.CodeCell': put the gutter of the finished
 * cell back to the "marked" colour (if it is marked) and start the next
 * cell in run_list.
 * @param evt Event
 * @param data event data; data.cell is the cell that has finished executing
 */
var finished_execute_event = function(evt, data) {
    var finished = data.cell;
    /* Reset gutter color to non-queued state */
    if (is_marked(finished)) {
        $(finished.code_mirror.getGutterElement()).css({
            "background-color": params.marked_color
        });
    }
    execute_next_marked_cell();
};
294
+
295
/**
 * Mark or unmark a code cell and colour its gutter accordingly. Cells that
 * are not code cells are ignored.
 *
 * @param cell
 * @param value false unmarks, undefined toggles the current state, any
 *        other value marks the cell
 */
function setCell(cell, value) {
    if (!(cell instanceof codecell.CodeCell)) return;

    var meta = cell.metadata;
    if (meta.run_control === undefined) meta.run_control = {};
    var control = meta.run_control;
    if (control.marked === undefined) control.marked = false;

    var mark = (value === undefined) ? !control.marked : value !== false;
    var gutter = cell.code_mirror.getGutterElement();
    control.marked = mark;
    $(gutter).css({
        "background-color": mark ? params.marked_color : ""
    });
}
318
+
319
/**
 * Apply setCell with the same value to each cell of a list.
 *
 * @param cells list of cells
 * @param value passed to setCell for every cell
 */
function setCellsMarked(cells, value) {
    var total = cells.length;
    var index = 0;
    while (index < total) {
        setCell(cells[index], value);
        index += 1;
    }
}
325
+
326
/**
 * Toggle the marker of the selected cells
 */
function toggle_marker() {
    var selected = Jupyter.notebook.get_selected_cells();
    setCellsMarked(selected, undefined);
}

/**
 * Mark all cells of the notebook
 */
function mark_all() {
    var everything = Jupyter.notebook.get_cells();
    setCellsMarked(everything, true);
}

/**
 * Unmark all cells of the notebook
 */
function mark_none() {
    var everything = Jupyter.notebook.get_cells();
    setCellsMarked(everything, false);
}
346
+
347
/**
 * Put a state marker into the "CodeMirror-cellstate" gutter on the first
 * line of a cell. Only 'locked' produces an icon; every other state sets an
 * empty marker.
 *
 * @param cell notebook cell instance
 * @param state {string} state to be displayed [ '', 'locked', 'executed', 'modified' ]
 */
function set_cell_state(cell, state) {
    var icon = "";
    if (state === 'locked') {
        // The closing tag used to be written as "/div>" inside the opening
        // tag, which left the element unclosed.
        icon = '<div class="fa fa-lock" style="font-size:70%;"></div>';
    }
    cell.code_mirror.setGutterMarker(0, "CodeMirror-cellstate", celltypeMarker(icon));
}
359
+
360
/**
 * Gutter click handler: toggle the marker of the cell that owns the clicked
 * editor. Clicks on the code folding gutter and on non-existing lines are
 * ignored.
 *
 * @param cm codemirror instance
 * @param line line number that was clicked
 * @param gutter class name of the clicked gutter
 */
function changeEvent(cm, line, gutter) {
    if (gutter === "CodeMirror-foldgutter") return; /* Don't collide with codefolding extension */

    // Ask CodeMirror through its public API whether the line exists; the
    // internal doc.children[0].lines array only covers the first chunk of
    // the document and is not guaranteed to be present.
    if (cm.getLineHandle(line) == null) {
        return;
    }
    var cell = $(cm.display.gutters).closest('.cell').data('cell');
    // No cell attached to the surrounding element: nothing to toggle.
    if (cell == null) {
        return;
    }
    if (cell.metadata.run_control === undefined)
        cell.metadata.run_control = {};
    setCell(cell, !cell.metadata.run_control.marked);
}
379
+
380
/**
 * Tell whether a cell is a marked code cell.
 *
 * @param cell cell to be tested
 * @returns false for non-code cells and cells without run_control
 *          metadata, otherwise the stored run_control.marked value
 */
var is_marked = function(cell) {
    if (!(cell instanceof codecell.CodeCell)) {
        return false;
    }
    var control = cell.metadata.run_control;
    if (control === undefined) {
        return false;
    }
    return control.marked;
};
390
+
391
/**
 * Build the div element that is placed in the cellstate gutter.
 *
 * @param val HTML string used as the content of the div
 * @returns {Element} div Element
 */
function celltypeMarker(val) {
    var holder = document.createElement("div");
    holder.innerHTML = val;
    holder.style.color = "#822";
    return holder;
}
403
+
404
/**
 * Lock or unlock all marked code cells: set metadata.editable and update
 * the lock icon in the cell state gutter.
 *
 * @param locked only the value true locks; any other value unlocks
 */
var lock_cell = function(locked) {
    // Fetch the cell list once instead of on every iteration.
    var cells = Jupyter.notebook.get_cells();
    var lock = locked === true;
    // Start at the last cell; the loop used to begin at ncells - 2 and so
    // never touched a marked cell at the end of the notebook.
    for (var i = cells.length - 1; i >= 0; i--) {
        var cell = cells[i];
        if (cell.cell_type !== "code" || !is_marked(cell)) {
            continue;
        }
        cell.metadata.editable = !lock;
        set_cell_state(cell, lock ? 'locked' : '');
    }
};
424
+
425
/**
 * Execute all cells, passing stop_on_error = false so that execution is not
 * stopped on errors.
 */
var run_all_cells_ignore_errors = function() {
    var stop_on_error = false;
    execute_all_cells(stop_on_error);
};
432
+
433
/**
 * Create floating toolbar
 *
 * Builds the button markup, wraps it in a draggable "#runtools-wrapper"
 * element that is appended to "#header", and then attaches a click handler
 * and a delayed tooltip to every button.
 */
var create_runtools_div = function() {
    /* Button title followed by the shortcut stored in params[param_name] */
    var with_shortcut = function(label, param_name) {
        return label + ' (' + params[param_name] + ')';
    };

    /*
     * One entry per button, grouped as shown in the toolbar.
     *   id     - element id, also used to look the button up again
     *   icon   - icon class added after "btn-primary fa"
     *   title  - tooltip text
     *   blur   - whether the button gives up focus after a click
     *   action - work performed on click (resolved at click time)
     */
    var button_groups = [
        [
            {id: 'run_c', icon: 'fa-step-forward', title: 'Run current cell', blur: true,
                action: function() {
                    var idx = Jupyter.notebook.get_selected_index();
                    _execute_without_selecting(idx, idx + 1);
                }},
            {id: 'run_ca', icon: 'icon-run-to', blur: true,
                title: with_shortcut('Run cells above', 'run_cells_above'),
                action: function() { execute_cells_above(); }},
            {id: 'run_cb', icon: 'icon-run-from', blur: true,
                title: with_shortcut('Run cells below', 'run_cells_below'),
                action: function() { execute_cells_below(); }},
            {id: 'run_a', icon: 'icon-run-all', blur: true,
                title: with_shortcut('Run all cells', 'run_all_cells'),
                action: function() { execute_all_cells(); }},
            {id: 'run_af', icon: 'icon-run-all-forced', blur: true,
                title: with_shortcut('Run all - ignore errors', 'run_all_cells_ignore_errors'),
                action: function() { run_all_cells_ignore_errors(); }},
            {id: 'run_m', icon: 'icon-run-marked', blur: true,
                title: with_shortcut('Run marked codecells', 'run_marked_cells'),
                action: function() { run_marked_cells(); }},
            {id: 'interrupt_b', icon: 'fa-stop', blur: true,
                title: with_shortcut('Stop execution', 'stop_execution'),
                action: function() { interrupt_execution(); }}
        ],
        [
            /* the marker buttons keep focus after a click */
            {id: 'mark_toggle', icon: 'icon-mark-toggle', title: 'Mark single code cell', blur: false,
                action: function() { toggle_marker(); }},
            {id: 'mark_all', icon: 'icon-mark-all', title: 'Mark all code cells', blur: false,
                action: function() { mark_all(); }},
            {id: 'mark_none', icon: 'icon-mark-none', title: 'Unmark all code cells', blur: false,
                action: function() { mark_none(); }}
        ],
        [
            {id: 'show_input', icon: 'icon-show-input', title: 'Show input of code cell', blur: true,
                action: function() { show_input(true); }},
            {id: 'hide_input', icon: 'icon-hide-input', title: 'Hide input of code cell', blur: true,
                action: function() { show_input(false); }},
            {id: 'show_output', icon: 'icon-show-output', title: 'Show output of code cell', blur: true,
                action: function() { show_output(true); }},
            {id: 'hide_output', icon: 'icon-hide-output', title: 'Hide output of code cell', blur: true,
                action: function() { show_output(false); }},
            {id: 'lock_marked', icon: 'fa-lock', title: 'Lock marked cells', blur: true,
                action: function() { lock_cell(true); }},
            {id: 'unlock_marked', icon: 'fa-unlock', title: 'Unlock marked cells', blur: true,
                action: function() { lock_cell(false); }}
        ]
    ];

    /* Every start tag is closed with '>' before the matching </button> */
    var button_html = function(spec) {
        return '<button type="button" id="' + spec.id +
            '" class="btn-primary fa ' + spec.icon +
            '" title="' + spec.title + '"></button>';
    };

    var group_html = function(group) {
        return '<div class="btn-group">\n' +
            group.map(button_html).join('\n') +
            '\n</div>';
    };

    var btn = '<div class="btn-toolbar">\n' +
        button_groups.map(group_html).join('\n') +
        '\n</div>';

    /* $('<div ...>') already yields a complete element, no closing tag needed */
    var runtools_wrapper = $('<div id="runtools-wrapper">')
        .text("Runtools")
        .append(btn)
        .draggable();

    $("#header").append(runtools_wrapper);
    $("#runtools-wrapper").css({
        'position': 'absolute'
    });

    /* Wire up the buttons in toolbar order */
    button_groups.forEach(function(group) {
        group.forEach(function(spec) {
            $('#' + spec.id).on('click', function() {
                spec.action();
                if (spec.blur) {
                    this.blur();
                }
            })
                .tooltip({
                    delay: {
                        show: 500,
                        hide: 100
                    }
                });
        });
    });
};
638
+
639
/**
 * Show/hide toolbar
 *
 * Flips the visibility of "#runtools-wrapper" and mirrors the new state in
 * the 'active' class of "#toggle_runtools". If the wrapper element is not
 * found at all, the toolbar is created.
 */
var toggle_toolbar = function() {
    var wrapper = $("#runtools-wrapper");
    var toggle_button = $('#toggle_runtools');
    var currently_shown = wrapper.is(':visible');

    if (!currently_shown) {
        toggle_button.addClass('active');
        wrapper.show();
    } else {
        toggle_button.removeClass('active').blur();
        wrapper.hide();
    }

    /* an empty selection means the toolbar has not been built yet */
    var missing = (wrapper.length === 0);
    if (missing) {
        create_runtools_div();
    }
};
658
+
659
+
660
/**
 * Add CSS file
 *
 * Creates a stylesheet <link> whose href is requirejs.toUrl(name) and
 * appends it to the first <head> element of the document.
 *
 * @param name filename
 */
var load_css = function(name) {
    var stylesheet = document.createElement("link");
    stylesheet.href = requirejs.toUrl(name);
    stylesheet.rel = "stylesheet";
    stylesheet.type = "text/css";

    var head = document.getElementsByTagName("head")[0];
    head.appendChild(stylesheet);
};
672
+
673
/**
 * Add gutter to a new cell
 *
 * Only code cells are touched. When the cell's editor does not yet list the
 * "CodeMirror-cellstate" gutter, the gutter is added and changeEvent is
 * registered for "gutterClick".
 *
 * @param event
 * @param nbcell object whose `cell` property is the newly created cell
 *
 */
var createCell = function(event, nbcell) {
    var new_cell = nbcell.cell;
    if (!(new_cell instanceof codecell.CodeCell)) {
        return;
    }

    var editor = new_cell.code_mirror;
    /* work on a copy so the editor's current option array is not modified */
    var gutter_list = editor.getOption('gutters').slice();
    if ($.inArray("CodeMirror-cellstate", gutter_list) >= 0) {
        return;
    }

    gutter_list.push('CodeMirror-cellstate');
    editor.setOption('gutters', gutter_list);
    editor.on("gutterClick", changeEvent);
};
692
+
693
+
694
/**
 * Initialize all cells with new gutter
 *
 * For every existing cell: add the "CodeMirror-cellstate" gutter to code
 * cells that lack it, restore the hidden input/output and locked state from
 * the cell metadata, and refresh the editor. Finally subscribe createCell to
 * 'create.Cell' events.
 */
var initGutter = function() {
    /* true only for an own metadata property that is exactly `true` */
    var flag_is_set = function(metadata, key) {
        return metadata.hasOwnProperty(key) && metadata[key] === true;
    };

    Jupyter.notebook.get_cells().forEach(function(cell) {
        if (cell instanceof codecell.CodeCell) {
            var editor = cell.code_mirror;
            var gutter_list = editor.getOption('gutters').slice();
            if ($.inArray("CodeMirror-cellstate", gutter_list) < 0) {
                gutter_list.push('CodeMirror-cellstate');
                editor.setOption('gutters', gutter_list);
            }
        }

        /* Restore hide/show status after reload */
        if (flag_is_set(cell.metadata, 'hide_input')) {
            showCell(cell, 'i', false);
        }
        if (flag_is_set(cell.metadata, 'hide_output')) {
            showCell(cell, 'o', false);
        }
        if (cell.is_editable() === false) {
            set_cell_state(cell, 'locked');
        }
        cell.code_mirror.refresh();
    });

    events.on('create.Cell', createCell);
};
723
+
724
/**
 * Called from notebook after extension was loaded
 *
 * Loads the stylesheets, requires './cellstate' and then runs initGutter
 * (immediately if the notebook is already fully loaded, otherwise once on
 * 'notebook_loaded.Notebook'), and schedules initialize for when the
 * notebook config has loaded.
 */
var load_extension = function() {
    /* runs after './cellstate' has been loaded */
    var start_gutter = function() {
        if (!Jupyter.notebook._fully_loaded) {
            events.one('notebook_loaded.Notebook', initGutter);
            return;
        }
        initGutter();
    };

    load_css('./main.css');
    load_css('./gutter.css'); /* set gutter width */
    requirejs(['./cellstate'], start_gutter);
    Jupyter.notebook.config.loaded.then(initialize);
};
740
+
741
+ return {
742
+ load_jupyter_extension: load_extension,
743
+ load_ipython_extension: load_extension
744
+ };
745
+ });
.local/share/jupyter/nbextensions/runtools/runtools_lock.png ADDED
.local/share/jupyter/nbextensions/scratchpad/README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Scratchpad notebook extension
2
+
3
+ Adds a scratchpad cell to Jupyter notebook.
4
+ This is a cell in which you can execute code against the current kernel without modifying the notebook document.
5
+
6
+ Scratchpad cells can be executed using `Shift-Enter` (other shortcuts are applied to the notebook document). The scratchpad can be toggled by clicking the icon in the bottom-right, or via the keyboard shortcut `Ctrl-B`.
7
+
8
+ ![demo](demo.gif)
9
+
10
+
11
+ ## Credits
12
+
13
+ This extension is a copy of the extension from MinRK here:
14
+ `git clone git://github.com/minrk/nbextension-scratchpad`.
.local/share/jupyter/nbextensions/skill/README.md ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SKILL for CodeMirror
2
+ This extension provides a *SKILL* mode for the CodeMirror editor.
3
+
4
+ The extension adds a MIME type `x-skill` and a mode `skill` that can be
5
+ used with CodeMirror.
6
+
7
+ ## About SKILL
8
+ From [Wikipedia](https://en.wikipedia.org/wiki/Cadence_SKILL):
9
+ SKILL is a Lisp dialect used as a scripting language and PCell (parameterized
10
+ cells) description language used in many EDA software suites by Cadence Design
11
+ Systems (e.g. Cadence Allegro and Cadence Virtuoso)
12
+
13
+ ## Notes
14
+ This extension was written to enhance the Virtuoso kernel for Jupyter
15
+ (https://github.com/benvarkey/JuVi).
.triton/cache/6e97c2a1f7a095255f6dd5de1807841d/cuda_utils.so ADDED
Binary file (28 kB). View file
 
.triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.ptx ADDED
@@ -0,0 +1,807 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3d4d5d6d7d8de9de
10
+ .extern .shared .align 1 .b8 global_smem[];
11
+ .global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
12
+
13
+ .visible .entry triton__0d1d2d3d4d5d6d7d8de9de(
14
+ .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_0,
15
+ .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_1,
16
+ .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_2,
17
+ .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_3,
18
+ .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_4,
19
+ .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_5,
20
+ .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_6,
21
+ .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_7,
22
+ .param .u32 triton__0d1d2d3d4d5d6d7d8de9de_param_8,
23
+ .param .u32 triton__0d1d2d3d4d5d6d7d8de9de_param_9
24
+ )
25
+ .maxntid 64, 1, 1
26
+ {
27
+ .reg .pred %p<33>;
28
+ .reg .b16 %rs<21>;
29
+ .reg .b32 %r<112>;
30
+ .reg .f32 %f<94>;
31
+ .reg .b64 %rd<20>;
32
+ .loc 1 18 0
33
+ $L__func_begin0:
34
+ .loc 1 18 0
35
+
36
+ ld.param.u64 %rd9, [triton__0d1d2d3d4d5d6d7d8de9de_param_0];
37
+ ld.param.u64 %rd10, [triton__0d1d2d3d4d5d6d7d8de9de_param_1];
38
+ $L__tmp0:
39
+ .loc 1 26 26
40
+ mov.u32 %r78, %tid.x;
41
+ and.b32 %r79, %r78, 31;
42
+ ld.param.u64 %rd11, [triton__0d1d2d3d4d5d6d7d8de9de_param_2];
43
+ ld.param.u64 %rd12, [triton__0d1d2d3d4d5d6d7d8de9de_param_3];
44
+ ld.param.u64 %rd13, [triton__0d1d2d3d4d5d6d7d8de9de_param_4];
45
+ shl.b32 %r80, %r78, 2;
46
+ ld.param.u64 %rd14, [triton__0d1d2d3d4d5d6d7d8de9de_param_5];
47
+ and.b32 %r81, %r80, 252;
48
+ ld.param.u64 %rd15, [triton__0d1d2d3d4d5d6d7d8de9de_param_6];
49
+ ld.param.u64 %rd16, [triton__0d1d2d3d4d5d6d7d8de9de_param_7];
50
+ .loc 1 23 28
51
+ mov.u32 %r1, %ctaid.x;
52
+ .loc 1 30 40
53
+ shl.b32 %r82, %r1, 8;
54
+ .loc 1 30 36
55
+ or.b32 %r83, %r82, %r81;
56
+ .loc 1 30 30
57
+ mul.wide.s32 %rd17, %r83, 4;
58
+ add.s64 %rd1, %rd9, %rd17;
59
+ mov.b32 %r6, 0;
60
+ mov.pred %p1, -1;
61
+ .loc 1 30 46
62
+ mov.u32 %r2, 0x0;
63
+ mov.u32 %r3, 0x0;
64
+ mov.u32 %r4, 0x0;
65
+ mov.u32 %r5, 0x0;
66
+ @%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ];
67
+ @!%p1 mov.u32 %r2, %r6;
68
+ @!%p1 mov.u32 %r3, %r6;
69
+ @!%p1 mov.u32 %r4, %r6;
70
+ @!%p1 mov.u32 %r5, %r6;
71
+ mov.b32 %f1, %r2;
72
+ mov.b32 %f2, %r3;
73
+ mov.b32 %f3, %r4;
74
+ mov.b32 %f4, %r5;
75
+ .loc 1 31 30
76
+ mul.wide.s32 %rd18, %r83, 2;
77
+ add.s64 %rd2, %rd10, %rd18;
78
+ .loc 1 31 46
79
+ mov.u32 %r10, 0x0;
80
+ mov.u32 %r11, 0x0;
81
+ @%p1 ld.global.v2.b32 { %r10, %r11 }, [ %rd2 + 0 ];
82
+ @!%p1 mov.u32 %r10, %r6;
83
+ @!%p1 mov.u32 %r11, %r6;
84
+ cvt.u16.u32 %rs1, %r10;
85
+ { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r10; }
86
+ cvt.u16.u32 %rs3, %r11;
87
+ { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r11; }
88
+ .loc 1 31 67
89
+ cvt.f32.bf16 %r14, %rs1;
90
+ mov.b32 %f5, %r14;
91
+ cvt.f32.bf16 %r15, %rs2;
92
+ mov.b32 %f6, %r15;
93
+ cvt.f32.bf16 %r16, %rs3;
94
+ mov.b32 %f7, %r16;
95
+ cvt.f32.bf16 %r17, %rs4;
96
+ mov.b32 %f8, %r17;
97
+ .loc 1 32 30
98
+ add.s64 %rd3, %rd11, %rd18;
99
+ .loc 1 32 46
100
+ mov.u32 %r18, 0x0;
101
+ mov.u32 %r19, 0x0;
102
+ @%p1 ld.global.v2.b32 { %r18, %r19 }, [ %rd3 + 0 ];
103
+ @!%p1 mov.u32 %r18, %r6;
104
+ @!%p1 mov.u32 %r19, %r6;
105
+ cvt.u16.u32 %rs5, %r18;
106
+ { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r18; }
107
+ cvt.u16.u32 %rs7, %r19;
108
+ { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r19; }
109
+ .loc 1 32 67
110
+ cvt.f32.bf16 %r22, %rs5;
111
+ mov.b32 %f9, %r22;
112
+ cvt.f32.bf16 %r23, %rs6;
113
+ mov.b32 %f10, %r23;
114
+ cvt.f32.bf16 %r24, %rs7;
115
+ mov.b32 %f11, %r24;
116
+ cvt.f32.bf16 %r25, %rs8;
117
+ mov.b32 %f12, %r25;
118
+ .loc 1 33 30
119
+ add.s64 %rd4, %rd12, %rd18;
120
+ .loc 1 33 46
121
+ mov.u32 %r26, 0x0;
122
+ mov.u32 %r27, 0x0;
123
+ @%p1 ld.global.v2.b32 { %r26, %r27 }, [ %rd4 + 0 ];
124
+ @!%p1 mov.u32 %r26, %r6;
125
+ @!%p1 mov.u32 %r27, %r6;
126
+ cvt.u16.u32 %rs9, %r26;
127
+ { .reg .b16 tmp; mov.b32 {tmp, %rs10}, %r26; }
128
+ cvt.u16.u32 %rs11, %r27;
129
+ { .reg .b16 tmp; mov.b32 {tmp, %rs12}, %r27; }
130
+ .loc 1 33 67
131
+ cvt.f32.bf16 %r30, %rs9;
132
+ mov.b32 %f13, %r30;
133
+ cvt.f32.bf16 %r31, %rs10;
134
+ mov.b32 %f14, %r31;
135
+ cvt.f32.bf16 %r32, %rs11;
136
+ mov.b32 %f15, %r32;
137
+ cvt.f32.bf16 %r33, %rs12;
138
+ mov.b32 %f16, %r33;
139
+ .loc 1 34 31
140
+ add.s64 %rd5, %rd13, %rd18;
141
+ .loc 1 34 47
142
+ mov.u32 %r34, 0x0;
143
+ mov.u32 %r35, 0x0;
144
+ @%p1 ld.global.v2.b32 { %r34, %r35 }, [ %rd5 + 0 ];
145
+ @!%p1 mov.u32 %r34, %r6;
146
+ @!%p1 mov.u32 %r35, %r6;
147
+ cvt.u16.u32 %rs13, %r34;
148
+ { .reg .b16 tmp; mov.b32 {tmp, %rs14}, %r34; }
149
+ cvt.u16.u32 %rs15, %r35;
150
+ { .reg .b16 tmp; mov.b32 {tmp, %rs16}, %r35; }
151
+ .loc 1 34 68
152
+ cvt.f32.bf16 %r38, %rs13;
153
+ mov.b32 %f17, %r38;
154
+ cvt.f32.bf16 %r39, %rs14;
155
+ mov.b32 %f18, %r39;
156
+ cvt.f32.bf16 %r40, %rs15;
157
+ mov.b32 %f19, %r40;
158
+ cvt.f32.bf16 %r41, %rs16;
159
+ mov.b32 %f20, %r41;
160
+ .loc 1 35 31
161
+ mul.wide.u32 %rd19, %r81, 4;
162
+ add.s64 %rd6, %rd14, %rd19;
163
+ .loc 1 35 36
164
+ mov.u32 %r42, 0x0;
165
+ mov.u32 %r43, 0x0;
166
+ mov.u32 %r44, 0x0;
167
+ mov.u32 %r45, 0x0;
168
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r42, %r43, %r44, %r45 }, [ %rd6 + 0 ];
169
+ @!%p1 mov.u32 %r42, %r6;
170
+ @!%p1 mov.u32 %r43, %r6;
171
+ @!%p1 mov.u32 %r44, %r6;
172
+ @!%p1 mov.u32 %r45, %r6;
173
+ .loc 1 37 18
174
+ add.f32 %f21, %f5, %f1;
175
+ add.f32 %f22, %f6, %f2;
176
+ add.f32 %f23, %f7, %f3;
177
+ .loc 1 39 18
178
+ add.f32 %f24, %f21, %f9;
179
+ add.f32 %f25, %f22, %f10;
180
+ add.f32 %f26, %f23, %f11;
181
+ .loc 1 41 18
182
+ add.f32 %f27, %f25, %f14;
183
+ add.f32 %f28, %f26, %f15;
184
+ .loc 1 43 19
185
+ add.f32 %f29, %f27, %f18;
186
+ add.f32 %f30, %f28, %f19;
187
+ .loc 1 41 18
188
+ add.f32 %f31, %f24, %f13;
189
+ add.f32 %f32, %f8, %f4;
190
+ .loc 1 43 19
191
+ add.f32 %f33, %f32, %f12;
192
+ add.f32 %f34, %f31, %f17;
193
+ $L__tmp1:
194
+ .loc 2 233 15
195
+ add.f32 %f35, %f34, %f29;
196
+ add.f32 %f36, %f33, %f16;
197
+ add.f32 %f37, %f35, %f30;
198
+ add.f32 %f38, %f36, %f20;
199
+ mov.b32 %r71, %f38;
200
+ add.f32 %f39, %f37, %f38;
201
+ $L__tmp2:
202
+ .loc 2 243 36
203
+ mov.b32 %r84, %f39;
204
+ shfl.sync.bfly.b32 %r85, %r84, 16, 31, -1;
205
+ mov.b32 %f40, %r85;
206
+ $L__tmp3:
207
+ .loc 2 233 15
208
+ add.f32 %f41, %f39, %f40;
209
+ $L__tmp4:
210
+ .loc 2 243 36
211
+ mov.b32 %r86, %f41;
212
+ shfl.sync.bfly.b32 %r87, %r86, 8, 31, -1;
213
+ mov.b32 %f42, %r87;
214
+ $L__tmp5:
215
+ .loc 2 233 15
216
+ add.f32 %f43, %f41, %f42;
217
+ $L__tmp6:
218
+ .loc 2 243 36
219
+ mov.b32 %r88, %f43;
220
+ shfl.sync.bfly.b32 %r89, %r88, 4, 31, -1;
221
+ mov.b32 %f44, %r89;
222
+ $L__tmp7:
223
+ .loc 2 233 15
224
+ add.f32 %f45, %f43, %f44;
225
+ $L__tmp8:
226
+ .loc 2 243 36
227
+ mov.b32 %r90, %f45;
228
+ shfl.sync.bfly.b32 %r91, %r90, 2, 31, -1;
229
+ mov.b32 %f46, %r91;
230
+ $L__tmp9:
231
+ .loc 2 233 15
232
+ add.f32 %f47, %f45, %f46;
233
+ $L__tmp10:
234
+ .loc 2 243 36
235
+ mov.b32 %r92, %f47;
236
+ shfl.sync.bfly.b32 %r93, %r92, 1, 31, -1;
237
+ mov.b32 %f48, %r93;
238
+ $L__tmp11:
239
+ .loc 2 233 15
240
+ add.f32 %f49, %f47, %f48;
241
+ $L__tmp12:
242
+ .loc 2 243 36
243
+ setp.eq.s32 %p23, %r79, 0;
244
+ shr.u32 %r94, %r78, 3;
245
+ and.b32 %r95, %r94, 4;
246
+ mov.u32 %r96, global_smem;
247
+ add.s32 %r50, %r96, %r95;
248
+ mov.b32 %r51, %f49;
249
+ @%p23 st.shared.b32 [ %r50 + 0 ], %r51;
250
+ bar.sync 0;
251
+ setp.lt.s32 %p24, %r78, 2;
252
+ add.s32 %r53, %r96, %r80;
253
+ @%p24 ld.shared.b32 %r52, [ %r53 + 0 ];
254
+ mov.b32 %f50, %r52;
255
+ shfl.sync.bfly.b32 %r97, %r52, 1, 31, -1;
256
+ mov.b32 %f51, %r97;
257
+ $L__tmp13:
258
+ .loc 2 233 15
259
+ add.f32 %f52, %f50, %f51;
260
+ $L__tmp14:
261
+ .loc 2 243 36
262
+ and.b32 %r98, %r78, 1;
263
+ setp.eq.b32 %p31, %r98, 1;
264
+ not.pred %p32, %p31;
265
+ and.pred %p25, %p24, %p32;
266
+ mov.b32 %r55, %f52;
267
+ @%p25 st.shared.b32 [ %r53 + 0 ], %r55;
268
+ bar.sync 0;
269
+ ld.shared.f32 %f53, [global_smem];
270
+ $L__tmp15:
271
+ .loc 3 8 15
272
+ add.f32 %f54, %f53, 0f00000000;
273
+ $L__tmp16:
274
+ .loc 1 51 20
275
+ mov.b32 %r57, %f54;
276
+ mov.b32 %r58, 1132462080;
277
+ div.full.f32 %r56, %r57, %r58;
278
+ mov.b32 %f55, %r56;
279
+ .loc 1 52 20
280
+ sub.f32 %f56, %f34, %f55;
281
+ sub.f32 %f57, %f29, %f55;
282
+ sub.f32 %f58, %f30, %f55;
283
+ sub.f32 %f59, %f38, %f55;
284
+ .loc 1 53 20
285
+ mul.f32 %f60, %f57, %f57;
286
+ $L__tmp17:
287
+ .loc 2 243 36
288
+ bar.sync 0;
289
+ $L__tmp18:
290
+ .loc 2 233 15
291
+ fma.rn.f32 %f61, %f56, %f56, %f60;
292
+ fma.rn.f32 %f62, %f58, %f58, %f61;
293
+ fma.rn.f32 %f63, %f59, %f59, %f62;
294
+ $L__tmp19:
295
+ .loc 2 243 36
296
+ mov.b32 %r99, %f63;
297
+ shfl.sync.bfly.b32 %r100, %r99, 16, 31, -1;
298
+ mov.b32 %f64, %r100;
299
+ $L__tmp20:
300
+ .loc 2 233 15
301
+ add.f32 %f65, %f63, %f64;
302
+ $L__tmp21:
303
+ .loc 2 243 36
304
+ mov.b32 %r101, %f65;
305
+ shfl.sync.bfly.b32 %r102, %r101, 8, 31, -1;
306
+ mov.b32 %f66, %r102;
307
+ $L__tmp22:
308
+ .loc 2 233 15
309
+ add.f32 %f67, %f65, %f66;
310
+ $L__tmp23:
311
+ .loc 2 243 36
312
+ mov.b32 %r103, %f67;
313
+ shfl.sync.bfly.b32 %r104, %r103, 4, 31, -1;
314
+ mov.b32 %f68, %r104;
315
+ $L__tmp24:
316
+ .loc 2 233 15
317
+ add.f32 %f69, %f67, %f68;
318
+ $L__tmp25:
319
+ .loc 2 243 36
320
+ mov.b32 %r105, %f69;
321
+ shfl.sync.bfly.b32 %r106, %r105, 2, 31, -1;
322
+ mov.b32 %f70, %r106;
323
+ $L__tmp26:
324
+ .loc 2 233 15
325
+ add.f32 %f71, %f69, %f70;
326
+ $L__tmp27:
327
+ .loc 2 243 36
328
+ mov.b32 %r107, %f71;
329
+ shfl.sync.bfly.b32 %r108, %r107, 1, 31, -1;
330
+ mov.b32 %f72, %r108;
331
+ $L__tmp28:
332
+ .loc 2 233 15
333
+ add.f32 %f73, %f71, %f72;
334
+ $L__tmp29:
335
+ .loc 2 243 36
336
+ mov.b32 %r60, %f73;
337
+ @%p23 st.shared.b32 [ %r50 + 0 ], %r60;
338
+ bar.sync 0;
339
+ @%p24 ld.shared.b32 %r61, [ %r53 + 0 ];
340
+ mov.b32 %f74, %r61;
341
+ shfl.sync.bfly.b32 %r109, %r61, 1, 31, -1;
342
+ mov.b32 %f75, %r109;
343
+ $L__tmp30:
344
+ .loc 2 233 15
345
+ add.f32 %f76, %f74, %f75;
346
+ $L__tmp31:
347
+ .loc 2 243 36
348
+ mov.b32 %r64, %f76;
349
+ @%p25 st.shared.b32 [ %r53 + 0 ], %r64;
350
+ bar.sync 0;
351
+ ld.shared.f32 %f77, [global_smem];
352
+ $L__tmp32:
353
+ .loc 3 8 15
354
+ add.f32 %f78, %f77, 0f00000000;
355
+ $L__tmp33:
356
+ .loc 1 59 20
357
+ mov.b32 %r66, %f78;
358
+ div.full.f32 %r65, %r66, %r58;
359
+ mov.b32 %f79, %r65;
360
+ .loc 1 61 20
361
+ add.f32 %f80, %f79, 0f3727C5AC;
362
+ .loc 1 62 26
363
+ rsqrt.approx.ftz.f32 %f81, %f80;
364
+ .loc 1 35 36
365
+ mov.b32 %f82, %r45;
366
+ mov.b32 %f83, %r44;
367
+ mov.b32 %f84, %r43;
368
+ mov.b32 %f85, %r42;
369
+ .loc 1 63 20
370
+ mul.f32 %f86, %f56, %f81;
371
+ mul.f32 %f87, %f57, %f81;
372
+ mul.f32 %f88, %f58, %f81;
373
+ mul.f32 %f89, %f59, %f81;
374
+ .loc 1 64 20
375
+ mul.f32 %f90, %f86, %f85;
376
+ mul.f32 %f91, %f87, %f84;
377
+ mul.f32 %f92, %f88, %f83;
378
+ mul.f32 %f93, %f89, %f82;
379
+ .loc 1 66 25
380
+ add.s64 %rd7, %rd15, %rd17;
381
+ .loc 1 66 48
382
+ mov.b32 %r68, %f34;
383
+ mov.b32 %r69, %f29;
384
+ mov.b32 %r70, %f30;
385
+ @%p1 st.global.v4.b32 [ %rd7 + 0 ], { %r68, %r69, %r70, %r71 };
386
+ .loc 1 67 25
387
+ add.s64 %rd8, %rd16, %rd18;
388
+ .loc 1 67 48
389
+ mov.b32 %r72, %f90;
390
+ cvt.rn.bf16.f32 %rs17, %r72;
391
+ mov.b32 %r73, %f91;
392
+ cvt.rn.bf16.f32 %rs18, %r73;
393
+ mov.b32 %r74, %f92;
394
+ cvt.rn.bf16.f32 %rs19, %r74;
395
+ mov.b32 %r75, %f93;
396
+ cvt.rn.bf16.f32 %rs20, %r75;
397
+ mov.b32 %r110, {%rs17, %rs18};
398
+ mov.b32 %r111, {%rs19, %rs20};
399
+ @%p1 st.global.v2.b32 [ %rd8 + 0 ], { %r110, %r111 };
400
+ .loc 1 67 4
401
+ ret;
402
+ $L__tmp34:
403
+ $L__func_end0:
404
+
405
+ }
406
+ // .globl __nv_rsqrtf
407
+ .visible .func (.param .b32 func_retval0) __nv_rsqrtf(
408
+ .param .b32 __nv_rsqrtf_param_0
409
+ )
410
+ {
411
+ .reg .f32 %f<3>;
412
+ $L__func_begin1:
413
+
414
+ ld.param.f32 %f1, [__nv_rsqrtf_param_0];
415
+ rsqrt.approx.ftz.f32 %f2, %f1;
416
+ st.param.f32 [func_retval0+0], %f2;
417
+ ret;
418
+ $L__func_end1:
419
+
420
+ }
421
+ .file 1 "/tmp/torchinductor_root/jb/cjbnqg5u4sj7a4xstjer3a6tdgnnigb2iymd27gcs6o7oduhxy2v.py"
422
+ .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
423
+ .file 3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
424
+ .section .debug_abbrev
425
+ {
426
+ .b8 1
427
+ .b8 17
428
+ .b8 1
429
+ .b8 37
430
+ .b8 8
431
+ .b8 19
432
+ .b8 5
433
+ .b8 3
434
+ .b8 8
435
+ .b8 16
436
+ .b8 6
437
+ .b8 27
438
+ .b8 8
439
+ .b8 180
440
+ .b8 66
441
+ .b8 12
442
+ .b8 17
443
+ .b8 1
444
+ .b8 18
445
+ .b8 1
446
+ .b8 0
447
+ .b8 0
448
+ .b8 2
449
+ .b8 46
450
+ .b8 0
451
+ .b8 135
452
+ .b8 64
453
+ .b8 8
454
+ .b8 3
455
+ .b8 8
456
+ .b8 58
457
+ .b8 11
458
+ .b8 59
459
+ .b8 11
460
+ .b8 63
461
+ .b8 12
462
+ .b8 32
463
+ .b8 11
464
+ .b8 0
465
+ .b8 0
466
+ .b8 3
467
+ .b8 46
468
+ .b8 1
469
+ .b8 17
470
+ .b8 1
471
+ .b8 18
472
+ .b8 1
473
+ .b8 64
474
+ .b8 10
475
+ .b8 49
476
+ .b8 19
477
+ .b8 0
478
+ .b8 0
479
+ .b8 4
480
+ .b8 29
481
+ .b8 1
482
+ .b8 49
483
+ .b8 19
484
+ .b8 17
485
+ .b8 1
486
+ .b8 18
487
+ .b8 1
488
+ .b8 88
489
+ .b8 11
490
+ .b8 89
491
+ .b8 11
492
+ .b8 87
493
+ .b8 11
494
+ .b8 0
495
+ .b8 0
496
+ .b8 5
497
+ .b8 29
498
+ .b8 0
499
+ .b8 49
500
+ .b8 19
501
+ .b8 17
502
+ .b8 1
503
+ .b8 18
504
+ .b8 1
505
+ .b8 88
506
+ .b8 11
507
+ .b8 89
508
+ .b8 11
509
+ .b8 87
510
+ .b8 11
511
+ .b8 0
512
+ .b8 0
513
+ .b8 0
514
+ }
515
+ .section .debug_info
516
+ {
517
+ .b32 407
518
+ .b8 2
519
+ .b8 0
520
+ .b32 .debug_abbrev
521
+ .b8 8
522
+ .b8 1
523
+ .b8 116
524
+ .b8 114
525
+ .b8 105
526
+ .b8 116
527
+ .b8 111
528
+ .b8 110
529
+ .b8 0
530
+ .b8 2
531
+ .b8 0
532
+ .b8 99
533
+ .b8 106
534
+ .b8 98
535
+ .b8 110
536
+ .b8 113
537
+ .b8 103
538
+ .b8 53
539
+ .b8 117
540
+ .b8 52
541
+ .b8 115
542
+ .b8 106
543
+ .b8 55
544
+ .b8 97
545
+ .b8 52
546
+ .b8 120
547
+ .b8 115
548
+ .b8 116
549
+ .b8 106
550
+ .b8 101
551
+ .b8 114
552
+ .b8 51
553
+ .b8 97
554
+ .b8 54
555
+ .b8 116
556
+ .b8 100
557
+ .b8 103
558
+ .b8 110
559
+ .b8 110
560
+ .b8 105
561
+ .b8 103
562
+ .b8 98
563
+ .b8 50
564
+ .b8 105
565
+ .b8 121
566
+ .b8 109
567
+ .b8 100
568
+ .b8 50
569
+ .b8 55
570
+ .b8 103
571
+ .b8 99
572
+ .b8 115
573
+ .b8 54
574
+ .b8 111
575
+ .b8 55
576
+ .b8 111
577
+ .b8 100
578
+ .b8 117
579
+ .b8 104
580
+ .b8 120
581
+ .b8 121
582
+ .b8 50
583
+ .b8 118
584
+ .b8 46
585
+ .b8 112
586
+ .b8 121
587
+ .b8 0
588
+ .b32 .debug_line
589
+ .b8 47
590
+ .b8 116
591
+ .b8 109
592
+ .b8 112
593
+ .b8 47
594
+ .b8 116
595
+ .b8 111
596
+ .b8 114
597
+ .b8 99
598
+ .b8 104
599
+ .b8 105
600
+ .b8 110
601
+ .b8 100
602
+ .b8 117
603
+ .b8 99
604
+ .b8 116
605
+ .b8 111
606
+ .b8 114
607
+ .b8 95
608
+ .b8 114
609
+ .b8 111
610
+ .b8 111
611
+ .b8 116
612
+ .b8 47
613
+ .b8 106
614
+ .b8 98
615
+ .b8 0
616
+ .b8 1
617
+ .b64 $L__func_begin0
618
+ .b64 $L__func_end0
619
+ .b8 2
620
+ .b8 116
621
+ .b8 114
622
+ .b8 105
623
+ .b8 116
624
+ .b8 111
625
+ .b8 110
626
+ .b8 95
627
+ .b8 95
628
+ .b8 48
629
+ .b8 100
630
+ .b8 49
631
+ .b8 100
632
+ .b8 50
633
+ .b8 100
634
+ .b8 51
635
+ .b8 100
636
+ .b8 52
637
+ .b8 100
638
+ .b8 53
639
+ .b8 100
640
+ .b8 54
641
+ .b8 100
642
+ .b8 55
643
+ .b8 100
644
+ .b8 56
645
+ .b8 100
646
+ .b8 101
647
+ .b8 57
648
+ .b8 100
649
+ .b8 101
650
+ .b8 0
651
+ .b8 116
652
+ .b8 114
653
+ .b8 105
654
+ .b8 116
655
+ .b8 111
656
+ .b8 110
657
+ .b8 95
658
+ .b8 95
659
+ .b8 48
660
+ .b8 100
661
+ .b8 49
662
+ .b8 100
663
+ .b8 50
664
+ .b8 100
665
+ .b8 51
666
+ .b8 100
667
+ .b8 52
668
+ .b8 100
669
+ .b8 53
670
+ .b8 100
671
+ .b8 54
672
+ .b8 100
673
+ .b8 55
674
+ .b8 100
675
+ .b8 56
676
+ .b8 100
677
+ .b8 101
678
+ .b8 57
679
+ .b8 100
680
+ .b8 101
681
+ .b8 0
682
+ .b8 1
683
+ .b8 18
684
+ .b8 1
685
+ .b8 1
686
+ .b8 3
687
+ .b64 $L__func_begin0
688
+ .b64 $L__func_end0
689
+ .b8 1
690
+ .b8 156
691
+ .b32 125
692
+ .b8 4
693
+ .b32 125
694
+ .b64 $L__tmp1
695
+ .b64 $L__tmp14
696
+ .b8 2
697
+ .b8 48
698
+ .b8 59
699
+ .b8 5
700
+ .b32 125
701
+ .b64 $L__tmp1
702
+ .b64 $L__tmp14
703
+ .b8 2
704
+ .b8 243
705
+ .b8 36
706
+ .b8 0
707
+ .b8 5
708
+ .b32 125
709
+ .b64 $L__tmp2
710
+ .b64 $L__tmp15
711
+ .b8 2
712
+ .b8 48
713
+ .b8 59
714
+ .b8 5
715
+ .b32 125
716
+ .b64 $L__tmp15
717
+ .b64 $L__tmp16
718
+ .b8 3
719
+ .b8 48
720
+ .b8 45
721
+ .b8 5
722
+ .b32 125
723
+ .b64 $L__tmp17
724
+ .b64 $L__tmp32
725
+ .b8 2
726
+ .b8 56
727
+ .b8 59
728
+ .b8 4
729
+ .b32 125
730
+ .b64 $L__tmp18
731
+ .b64 $L__tmp31
732
+ .b8 2
733
+ .b8 56
734
+ .b8 59
735
+ .b8 5
736
+ .b32 125
737
+ .b64 $L__tmp18
738
+ .b64 $L__tmp31
739
+ .b8 2
740
+ .b8 243
741
+ .b8 36
742
+ .b8 0
743
+ .b8 5
744
+ .b32 125
745
+ .b64 $L__tmp32
746
+ .b64 $L__tmp33
747
+ .b8 3
748
+ .b8 56
749
+ .b8 45
750
+ .b8 0
751
+ .b8 0
752
+ }
753
+ .section .debug_pubnames
754
+ {
755
+ .b32 $L__pubNames_end0-$L__pubNames_start0
756
+ $L__pubNames_start0:
757
+ .b8 2
758
+ .b8 0
759
+ .b32 .debug_info
760
+ .b32 411
761
+ .b32 125
762
+ .b8 116
763
+ .b8 114
764
+ .b8 105
765
+ .b8 116
766
+ .b8 111
767
+ .b8 110
768
+ .b8 95
769
+ .b8 95
770
+ .b8 48
771
+ .b8 100
772
+ .b8 49
773
+ .b8 100
774
+ .b8 50
775
+ .b8 100
776
+ .b8 51
777
+ .b8 100
778
+ .b8 52
779
+ .b8 100
780
+ .b8 53
781
+ .b8 100
782
+ .b8 54
783
+ .b8 100
784
+ .b8 55
785
+ .b8 100
786
+ .b8 56
787
+ .b8 100
788
+ .b8 101
789
+ .b8 57
790
+ .b8 100
791
+ .b8 101
792
+ .b8 0
793
+ .b32 0
794
+ $L__pubNames_end0:
795
+ }
796
+ .section .debug_pubtypes
797
+ {
798
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
799
+ $L__pubTypes_start0:
800
+ .b8 2
801
+ .b8 0
802
+ .b32 .debug_info
803
+ .b32 411
804
+ .b32 0
805
+ $L__pubTypes_end0:
806
+ }
807
+ .section .debug_loc { }
.triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.ttir ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2d3de4e(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} {
3
+ %cst = arith.constant dense<256> : tensor<32x1xi64>
4
+ %cst_0 = arith.constant dense<0> : tensor<32x1xi64>
5
+ %cst_1 = arith.constant dense<512> : tensor<32x1xi64>
6
+ %cst_2 = arith.constant dense<true> : tensor<32x1xi1>
7
+ %cst_3 = arith.constant dense<256> : tensor<32x1xi32>
8
+ %cst_4 = arith.constant dense<131072> : tensor<1x128xi32>
9
+ %cst_5 = arith.constant dense<120> : tensor<1x128xi32>
10
+ %cst_6 = arith.constant dense<0.000000e+00> : tensor<32x128xf32>
11
+ %c32_i32 = arith.constant 32 : i32
12
+ %0 = tt.get_program_id x : i32
13
+ %1 = arith.muli %0, %c32_i32 : i32
14
+ %2 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32>
15
+ %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<32xi32>) -> tensor<32x1xi32>
16
+ %4 = tt.splat %1 : (i32) -> tensor<32x1xi32>
17
+ %5 = arith.addi %4, %3 : tensor<32x1xi32>
18
+ %6 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32>
19
+ %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<128xi32>) -> tensor<1x128xi32>
20
+ %8 = arith.cmpi slt, %7, %cst_5 : tensor<1x128xi32>
21
+ %9 = arith.muli %7, %cst_4 : tensor<1x128xi32>
22
+ %10 = tt.broadcast %5 : (tensor<32x1xi32>) -> tensor<32x128xi32>
23
+ %11 = tt.broadcast %9 : (tensor<1x128xi32>) -> tensor<32x128xi32>
24
+ %12 = arith.addi %10, %11 : tensor<32x128xi32>
25
+ %13 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<32x128x!tt.ptr<f32, 1>>
26
+ %14 = tt.addptr %13, %12 : tensor<32x128x!tt.ptr<f32, 1>>, tensor<32x128xi32>
27
+ %15 = tt.broadcast %8 : (tensor<1x128xi1>) -> tensor<32x128xi1>
28
+ %16 = tt.load %14, %15, %cst_6 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<32x128xf32>
29
+ %17 = arith.addf %16, %cst_6 : tensor<32x128xf32>
30
+ %18 = arith.select %15, %17, %cst_6 : tensor<32x128xi1>, tensor<32x128xf32>
31
+ %19 = "tt.reduce"(%18) <{axis = 1 : i32}> ({
32
+ ^bb0(%arg5: f32, %arg6: f32):
33
+ %35 = arith.addf %arg5, %arg6 : f32
34
+ tt.reduce.return %35 : f32
35
+ }) : (tensor<32x128xf32>) -> tensor<32xf32>
36
+ %20 = tt.expand_dims %19 {axis = 1 : i32} : (tensor<32xf32>) -> tensor<32x1xf32>
37
+ %21 = arith.divsi %5, %cst_3 : tensor<32x1xi32>
38
+ %22 = arith.remsi %5, %cst_3 : tensor<32x1xi32>
39
+ %23 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<32x1x!tt.ptr<i64, 1>>
40
+ %24 = tt.addptr %23, %21 : tensor<32x1x!tt.ptr<i64, 1>>, tensor<32x1xi32>
41
+ %25 = tt.load %24 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<32x1xi64>
42
+ %26 = arith.addi %25, %cst_1 : tensor<32x1xi64>
43
+ %27 = arith.cmpi slt, %25, %cst_0 : tensor<32x1xi64>
44
+ %28 = arith.select %27, %26, %25 : tensor<32x1xi1>, tensor<32x1xi64>
45
+ %29 = arith.muli %28, %cst : tensor<32x1xi64>
46
+ %30 = arith.extsi %22 : tensor<32x1xi32> to tensor<32x1xi64>
47
+ %31 = arith.addi %30, %29 : tensor<32x1xi64>
48
+ %32 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<32x1x!tt.ptr<f32, 1>>
49
+ %33 = tt.addptr %32, %31 : tensor<32x1x!tt.ptr<f32, 1>>, tensor<32x1xi64>
50
+ %34 = "tt.atomic_rmw"(%33, %20, %cst_2) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<32x1x!tt.ptr<f32, 1>>, tensor<32x1xf32>, tensor<32x1xi1>) -> tensor<32x1xf32>
51
+ tt.return
52
+ }
53
+ }
.triton/dump/305a9479aab997a3a16bfe46bb303a50/triton_.cubin ADDED
Binary file (30.4 kB). View file
 
.triton/dump/305a9479aab997a3a16bfe46bb303a50/triton_.ptx ADDED
@@ -0,0 +1,1041 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3d4d5d6de7de
10
+ .extern .func __assertfail
11
+ (
12
+ .param .b64 __assertfail_param_0,
13
+ .param .b64 __assertfail_param_1,
14
+ .param .b32 __assertfail_param_2,
15
+ .param .b64 __assertfail_param_3,
16
+ .param .b64 __assertfail_param_4
17
+ )
18
+ ;
19
+ .global .align 1 .b8 assertFunc_1[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
20
+ .global .align 1 .b8 assertFile_1[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
21
+ .global .align 1 .b8 assertMessage_1[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 54, 32, 60, 32, 53, 48, 50, 53, 55};
22
+ .global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
23
+ .global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
24
+ .global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55};
25
+ .extern .shared .align 1 .b8 global_smem[];
26
+ .global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
27
+
28
+ .visible .entry triton__0d1d2d3d4d5d6de7de(
29
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_0,
30
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_1,
31
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_2,
32
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_3,
33
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_4,
34
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_5,
35
+ .param .u32 triton__0d1d2d3d4d5d6de7de_param_6,
36
+ .param .u32 triton__0d1d2d3d4d5d6de7de_param_7
37
+ )
38
+ .maxntid 64, 1, 1
39
+ {
40
+ .reg .pred %p<59>;
41
+ .reg .b16 %rs<13>;
42
+ .reg .b32 %r<176>;
43
+ .reg .f32 %f<169>;
44
+ .reg .b64 %rd<58>;
45
+ .loc 1 18 0
46
+ $L__func_begin0:
47
+ .loc 1 18 0
48
+
49
+ ld.param.u64 %rd8, [triton__0d1d2d3d4d5d6de7de_param_4];
50
+ ld.param.u64 %rd7, [triton__0d1d2d3d4d5d6de7de_param_1];
51
+ ld.param.u64 %rd22, [triton__0d1d2d3d4d5d6de7de_param_0];
52
+ $L__tmp0:
53
+ .loc 1 24 33
54
+ mov.u32 %r1, %tid.x;
55
+ and.b32 %r2, %r1, 31;
56
+ ld.param.u64 %rd23, [triton__0d1d2d3d4d5d6de7de_param_2];
57
+ ld.param.u64 %rd24, [triton__0d1d2d3d4d5d6de7de_param_3];
58
+ bfe.u32 %r3, %r1, 5, 1;
59
+ shl.b32 %r30, %r1, 2;
60
+ and.b32 %r4, %r30, 252;
61
+ .loc 1 21 28
62
+ mov.u32 %r13, %ctaid.x;
63
+ .loc 1 26 30
64
+ mul.wide.s32 %rd25, %r13, 8;
65
+ add.s64 %rd11, %rd22, %rd25;
66
+ mov.pred %p53, -1;
67
+ .loc 1 26 35
68
+ mov.u64 %rd10, 0x0;
69
+ @%p53 ld.global.L1::evict_last.b64 { %rd10 }, [ %rd11 + 0 ];
70
+ mov.u64 %rd12, 0x0;
71
+ @%p53 ld.global.L1::evict_last.b64 { %rd12 }, [ %rd11 + 0 ];
72
+ mov.u64 %rd14, 0x0;
73
+ @%p53 ld.global.L1::evict_last.b64 { %rd14 }, [ %rd11 + 0 ];
74
+ mov.u64 %rd16, 0x0;
75
+ @%p53 ld.global.L1::evict_last.b64 { %rd16 }, [ %rd11 + 0 ];
76
+ mov.u64 %rd18, 0x0;
77
+ @%p53 ld.global.L1::evict_last.b64 { %rd18 }, [ %rd11 + 0 ];
78
+ .loc 1 27 18
79
+ shr.s32 %r31, %r13, 31;
80
+ shr.u32 %r32, %r31, 23;
81
+ add.s32 %r33, %r13, %r32;
82
+ and.b32 %r34, %r33, 16776704;
83
+ sub.s32 %r35, %r13, %r34;
84
+ .loc 1 35 44
85
+ shl.b32 %r36, %r35, 8;
86
+ .loc 1 35 40
87
+ or.b32 %r37, %r36, %r4;
88
+ .loc 1 35 34
89
+ mul.wide.s32 %rd26, %r37, 4;
90
+ add.s64 %rd37, %rd23, %rd26;
91
+ mov.b32 %r151, 0;
92
+ .loc 1 35 50
93
+ mov.u32 %r14, 0x0;
94
+ mov.u32 %r15, 0x0;
95
+ mov.u32 %r16, 0x0;
96
+ mov.u32 %r17, 0x0;
97
+ @%p53 ld.global.L1::evict_last.v4.b32 { %r14, %r15, %r16, %r17 }, [ %rd37 + 0 ];
98
+ @!%p53 mov.u32 %r14, %r151;
99
+ @!%p53 mov.u32 %r15, %r151;
100
+ @!%p53 mov.u32 %r16, %r151;
101
+ @!%p53 mov.u32 %r17, %r151;
102
+ mov.b32 %f2, %r14;
103
+ mov.b32 %f1, %r15;
104
+ mov.b32 %f3, %r16;
105
+ mov.b32 %f4, %r17;
106
+ .loc 1 36 44
107
+ shl.b32 %r38, %r13, 8;
108
+ .loc 1 36 40
109
+ or.b32 %r39, %r38, %r4;
110
+ .loc 1 36 34
111
+ mul.wide.s32 %rd27, %r39, 2;
112
+ add.s64 %rd38, %rd24, %rd27;
113
+ .loc 1 36 50
114
+ mov.u32 %r22, 0x0;
115
+ mov.u32 %r23, 0x0;
116
+ @%p53 ld.global.L1::evict_last.v2.b32 { %r22, %r23 }, [ %rd38 + 0 ];
117
+ @!%p53 mov.u32 %r22, %r151;
118
+ @!%p53 mov.u32 %r23, %r151;
119
+ cvt.u16.u32 %rs1, %r22;
120
+ { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r22; }
121
+ cvt.u16.u32 %rs3, %r23;
122
+ { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r23; }
123
+ .loc 1 36 101
124
+ cvt.f32.bf16 %r26, %rs1;
125
+ mov.b32 %f5, %r26;
126
+ cvt.f32.bf16 %r27, %rs2;
127
+ mov.b32 %f6, %r27;
128
+ cvt.f32.bf16 %r28, %rs3;
129
+ mov.b32 %f7, %r28;
130
+ cvt.f32.bf16 %r29, %rs4;
131
+ mov.b32 %f8, %r29;
132
+ .loc 1 37 22
133
+ add.s64 %rd28, %rd18, 50257;
134
+ .loc 1 38 22
135
+ setp.lt.s64 %p14, %rd18, 0;
136
+ .loc 1 39 36
137
+ selp.b64 %rd5, %rd28, %rd18, %p14;
138
+ .loc 1 40 40
139
+ setp.lt.u64 %p15, %rd5, 50257;
140
+ mov.b32 %r175, 883;
141
+ mov.u64 %rd57, 1;
142
+ .loc 1 40 55
143
+ @%p15 bra $L__BB0_2;
144
+ mov.u64 %rd29, assertMessage_0;
145
+ cvta.global.u64 %rd30, %rd29;
146
+ mov.u64 %rd31, assertFile_0;
147
+ cvta.global.u64 %rd32, %rd31;
148
+ mov.u64 %rd33, assertFunc_0;
149
+ cvta.global.u64 %rd34, %rd33;
150
+ { // callseq 0, 0
151
+ .reg .b32 temp_param_reg;
152
+ .param .b64 param0;
153
+ st.param.b64 [param0+0], %rd30;
154
+ .param .b64 param1;
155
+ st.param.b64 [param1+0], %rd32;
156
+ .param .b32 param2;
157
+ st.param.b32 [param2+0], %r175;
158
+ .param .b64 param3;
159
+ st.param.b64 [param3+0], %rd34;
160
+ .param .b64 param4;
161
+ st.param.b64 [param4+0], %rd57;
162
+ call.uni
163
+ __assertfail,
164
+ (
165
+ param0,
166
+ param1,
167
+ param2,
168
+ param3,
169
+ param4
170
+ );
171
+ } // callseq 0
172
+ $L__BB0_2:
173
+ .loc 1 0 55
174
+ ld.param.u64 %rd9, [triton__0d1d2d3d4d5d6de7de_param_5];
175
+ cvt.s64.s32 %rd3, %r39;
176
+ .loc 1 38 22
177
+ setp.lt.s64 %p44, %rd10, 0;
178
+ .loc 1 41 44
179
+ shl.b64 %rd40, %rd10, 8;
180
+ add.s64 %rd41, %rd40, 12865792;
181
+ selp.b64 %rd42, %rd41, %rd40, %p44;
182
+ cvt.u64.u32 %rd43, %r4;
183
+ .loc 1 41 40
184
+ or.b64 %rd44, %rd42, %rd43;
185
+ .loc 1 41 34
186
+ shl.b64 %rd45, %rd44, 2;
187
+ add.s64 %rd54, %rd7, %rd45;
188
+ .loc 1 41 52
189
+ mov.u32 %r41, 0x0;
190
+ mov.u32 %r42, 0x0;
191
+ mov.u32 %r43, 0x0;
192
+ mov.u32 %r44, 0x0;
193
+ @%p53 ld.global.L1::evict_last.v4.b32 { %r41, %r42, %r43, %r44 }, [ %rd54 + 0 ];
194
+ @!%p53 mov.u32 %r41, %r151;
195
+ @!%p53 mov.u32 %r42, %r151;
196
+ @!%p53 mov.u32 %r43, %r151;
197
+ @!%p53 mov.u32 %r44, %r151;
198
+ mov.b32 %f15, %r43;
199
+ mov.b32 %f16, %r44;
200
+ .loc 1 42 22
201
+ add.f32 %f17, %f3, %f15;
202
+ add.f32 %f18, %f4, %f16;
203
+ .loc 1 44 22
204
+ add.f32 %f19, %f7, %f17;
205
+ add.f32 %f20, %f8, %f18;
206
+ .loc 1 41 52
207
+ mov.b32 %f21, %r41;
208
+ mov.b32 %f22, %r42;
209
+ .loc 1 42 22
210
+ add.f32 %f23, %f1, %f22;
211
+ add.f32 %f24, %f2, %f21;
212
+ .loc 1 44 22
213
+ add.f32 %f25, %f5, %f24;
214
+ add.f32 %f26, %f6, %f23;
215
+ $L__tmp1:
216
+ .loc 2 98 22
217
+ add.f32 %f27, %f26, 0f00000000;
218
+ add.f32 %f28, %f25, 0f00000000;
219
+ add.f32 %f29, %f19, 0f00000000;
220
+ add.f32 %f30, %f20, 0f00000000;
221
+ .loc 2 101 30
222
+ sub.f32 %f31, %f25, %f28;
223
+ sub.f32 %f32, %f26, %f27;
224
+ sub.f32 %f33, %f19, %f29;
225
+ sub.f32 %f34, %f20, %f30;
226
+ .loc 2 101 13
227
+ fma.rn.f32 %f35, %f25, %f31, 0f00000000;
228
+ fma.rn.f32 %f36, %f26, %f32, 0f00000000;
229
+ fma.rn.f32 %f37, %f19, %f33, 0f00000000;
230
+ fma.rn.f32 %f38, %f20, %f34, 0f00000000;
231
+ $L__tmp2:
232
+ .loc 2 108 21
233
+ sub.f32 %f39, %f27, %f28;
234
+ mov.b32 %r50, 1065353216;
235
+ mov.b32 %r51, 1073741824;
236
+ .loc 2 110 60
237
+ div.full.f32 %r49, %r50, %r51;
238
+ mov.b32 %f40, %r49;
239
+ .loc 2 112 17
240
+ fma.rn.f32 %f41, %f40, %f39, %f28;
241
+ .loc 2 113 15
242
+ add.f32 %f42, %f35, %f36;
243
+ .loc 2 113 30
244
+ mul.f32 %f43, %f39, %f39;
245
+ .loc 2 113 22
246
+ fma.rn.f32 %f44, %f40, %f43, %f42;
247
+ .loc 2 108 21
248
+ sub.f32 %f45, %f29, %f41;
249
+ mov.b32 %r54, 1077936128;
250
+ .loc 2 110 60
251
+ div.full.f32 %r52, %r50, %r54;
252
+ mov.b32 %f46, %r52;
253
+ .loc 2 112 17
254
+ fma.rn.f32 %f47, %f46, %f45, %f41;
255
+ .loc 2 113 15
256
+ add.f32 %f48, %f37, %f44;
257
+ .loc 2 113 30
258
+ mul.f32 %f49, %f45, %f45;
259
+ .loc 2 113 38
260
+ fma.rn.f32 %f50, %f45, %f45, %f49;
261
+ .loc 2 113 22
262
+ fma.rn.f32 %f51, %f46, %f50, %f48;
263
+ .loc 2 108 21
264
+ sub.f32 %f52, %f30, %f47;
265
+ mov.b32 %r57, 1082130432;
266
+ .loc 2 110 60
267
+ div.full.f32 %r55, %r50, %r57;
268
+ mov.b32 %f53, %r55;
269
+ .loc 2 112 17
270
+ fma.rn.f32 %f54, %f53, %f52, %f47;
271
+ .loc 2 113 15
272
+ add.f32 %f55, %f38, %f51;
273
+ .loc 2 113 30
274
+ mul.f32 %f56, %f52, %f52;
275
+ .loc 2 113 38
276
+ mul.f32 %f57, %f56, 0f40400000;
277
+ .loc 2 113 22
278
+ fma.rn.f32 %f58, %f53, %f57, %f55;
279
+ $L__tmp3:
280
+ .loc 2 120 46
281
+ mov.b32 %r118, %f54;
282
+ shfl.sync.bfly.b32 %r119, %r118, 16, 31, -1;
283
+ mov.b32 %f59, %r119;
284
+ mov.b32 %r120, %f58;
285
+ shfl.sync.bfly.b32 %r121, %r120, 16, 31, -1;
286
+ mov.b32 %f60, %r121;
287
+ shfl.sync.bfly.b32 %r59, %r57, 16, 31, -1;
288
+ mov.b32 %f61, %r59;
289
+ $L__tmp4:
290
+ .loc 2 108 21
291
+ sub.f32 %f62, %f59, %f54;
292
+ .loc 2 109 28
293
+ add.f32 %f63, %f61, 0f40800000;
294
+ .loc 2 110 39
295
+ setp.eq.f32 %p45, %f63, 0f00000000;
296
+ .loc 2 110 60
297
+ mov.b32 %r60, %f63;
298
+ div.full.f32 %r58, %r59, %r60;
299
+ mov.b32 %f64, %r58;
300
+ .loc 2 110 49
301
+ selp.f32 %f65, 0f00000000, %f64, %p45;
302
+ .loc 2 112 17
303
+ fma.rn.f32 %f66, %f65, %f62, %f54;
304
+ .loc 2 113 15
305
+ add.f32 %f67, %f58, %f60;
306
+ .loc 2 113 30
307
+ mul.f32 %f68, %f62, %f62;
308
+ .loc 2 113 38
309
+ mul.f32 %f69, %f68, 0f40800000;
310
+ .loc 2 113 22
311
+ fma.rn.f32 %f70, %f65, %f69, %f67;
312
+ $L__tmp5:
313
+ .loc 2 120 46
314
+ mov.b32 %r122, %f66;
315
+ shfl.sync.bfly.b32 %r123, %r122, 8, 31, -1;
316
+ mov.b32 %f71, %r123;
317
+ mov.b32 %r124, %f70;
318
+ shfl.sync.bfly.b32 %r125, %r124, 8, 31, -1;
319
+ mov.b32 %f72, %r125;
320
+ shfl.sync.bfly.b32 %r62, %r60, 8, 31, -1;
321
+ mov.b32 %f73, %r62;
322
+ $L__tmp6:
323
+ .loc 2 108 21
324
+ sub.f32 %f74, %f71, %f66;
325
+ .loc 2 109 28
326
+ add.f32 %f75, %f63, %f73;
327
+ .loc 2 110 39
328
+ setp.eq.f32 %p46, %f75, 0f00000000;
329
+ .loc 2 110 60
330
+ mov.b32 %r63, %f75;
331
+ div.full.f32 %r61, %r62, %r63;
332
+ mov.b32 %f76, %r61;
333
+ .loc 2 110 49
334
+ selp.f32 %f77, 0f00000000, %f76, %p46;
335
+ .loc 2 112 17
336
+ fma.rn.f32 %f78, %f77, %f74, %f66;
337
+ .loc 2 113 15
338
+ add.f32 %f79, %f70, %f72;
339
+ .loc 2 113 30
340
+ mul.f32 %f80, %f74, %f74;
341
+ .loc 2 113 38
342
+ mul.f32 %f81, %f63, %f80;
343
+ .loc 2 113 22
344
+ fma.rn.f32 %f82, %f77, %f81, %f79;
345
+ $L__tmp7:
346
+ .loc 2 120 46
347
+ mov.b32 %r126, %f78;
348
+ shfl.sync.bfly.b32 %r127, %r126, 4, 31, -1;
349
+ mov.b32 %f83, %r127;
350
+ mov.b32 %r128, %f82;
351
+ shfl.sync.bfly.b32 %r129, %r128, 4, 31, -1;
352
+ mov.b32 %f84, %r129;
353
+ shfl.sync.bfly.b32 %r65, %r63, 4, 31, -1;
354
+ mov.b32 %f85, %r65;
355
+ $L__tmp8:
356
+ .loc 2 108 21
357
+ sub.f32 %f86, %f83, %f78;
358
+ .loc 2 109 28
359
+ add.f32 %f87, %f75, %f85;
360
+ .loc 2 110 39
361
+ setp.eq.f32 %p47, %f87, 0f00000000;
362
+ .loc 2 110 60
363
+ mov.b32 %r66, %f87;
364
+ div.full.f32 %r64, %r65, %r66;
365
+ mov.b32 %f88, %r64;
366
+ .loc 2 110 49
367
+ selp.f32 %f89, 0f00000000, %f88, %p47;
368
+ .loc 2 112 17
369
+ fma.rn.f32 %f90, %f89, %f86, %f78;
370
+ .loc 2 113 15
371
+ add.f32 %f91, %f82, %f84;
372
+ .loc 2 113 30
373
+ mul.f32 %f92, %f86, %f86;
374
+ .loc 2 113 38
375
+ mul.f32 %f93, %f75, %f92;
376
+ .loc 2 113 22
377
+ fma.rn.f32 %f94, %f89, %f93, %f91;
378
+ $L__tmp9:
379
+ .loc 2 120 46
380
+ mov.b32 %r130, %f90;
381
+ shfl.sync.bfly.b32 %r131, %r130, 2, 31, -1;
382
+ mov.b32 %f95, %r131;
383
+ mov.b32 %r132, %f94;
384
+ shfl.sync.bfly.b32 %r133, %r132, 2, 31, -1;
385
+ mov.b32 %f96, %r133;
386
+ shfl.sync.bfly.b32 %r68, %r66, 2, 31, -1;
387
+ mov.b32 %f97, %r68;
388
+ $L__tmp10:
389
+ .loc 2 108 21
390
+ sub.f32 %f98, %f95, %f90;
391
+ .loc 2 109 28
392
+ add.f32 %f99, %f87, %f97;
393
+ .loc 2 110 39
394
+ setp.eq.f32 %p48, %f99, 0f00000000;
395
+ .loc 2 110 60
396
+ mov.b32 %r69, %f99;
397
+ div.full.f32 %r67, %r68, %r69;
398
+ mov.b32 %f100, %r67;
399
+ .loc 2 110 49
400
+ selp.f32 %f101, 0f00000000, %f100, %p48;
401
+ .loc 2 112 17
402
+ fma.rn.f32 %f102, %f101, %f98, %f90;
403
+ .loc 2 113 15
404
+ add.f32 %f103, %f94, %f96;
405
+ .loc 2 113 30
406
+ mul.f32 %f104, %f98, %f98;
407
+ .loc 2 113 38
408
+ mul.f32 %f105, %f87, %f104;
409
+ .loc 2 113 22
410
+ fma.rn.f32 %f106, %f101, %f105, %f103;
411
+ $L__tmp11:
412
+ .loc 2 120 46
413
+ mov.b32 %r134, %f102;
414
+ shfl.sync.bfly.b32 %r135, %r134, 1, 31, -1;
415
+ mov.b32 %f107, %r135;
416
+ mov.b32 %r136, %f106;
417
+ shfl.sync.bfly.b32 %r137, %r136, 1, 31, -1;
418
+ mov.b32 %f108, %r137;
419
+ shfl.sync.bfly.b32 %r71, %r69, 1, 31, -1;
420
+ mov.b32 %f109, %r71;
421
+ $L__tmp12:
422
+ .loc 2 108 21
423
+ sub.f32 %f110, %f107, %f102;
424
+ .loc 2 109 28
425
+ add.f32 %f111, %f99, %f109;
426
+ .loc 2 110 39
427
+ setp.eq.f32 %p49, %f111, 0f00000000;
428
+ .loc 2 110 60
429
+ mov.b32 %r72, %f111;
430
+ div.full.f32 %r70, %r71, %r72;
431
+ mov.b32 %f112, %r70;
432
+ .loc 2 110 49
433
+ selp.f32 %f113, 0f00000000, %f112, %p49;
434
+ .loc 2 112 17
435
+ fma.rn.f32 %f114, %f113, %f110, %f102;
436
+ .loc 2 113 15
437
+ add.f32 %f115, %f106, %f108;
438
+ .loc 2 113 30
439
+ mul.f32 %f116, %f110, %f110;
440
+ .loc 2 113 38
441
+ mul.f32 %f117, %f99, %f116;
442
+ .loc 2 113 22
443
+ fma.rn.f32 %f118, %f113, %f117, %f115;
444
+ $L__tmp13:
445
+ .loc 2 120 46
446
+ setp.eq.s32 %p21, %r2, 0;
447
+ shl.b32 %r138, %r3, 2;
448
+ mov.u32 %r139, global_smem;
449
+ add.s32 %r73, %r139, %r138;
450
+ mov.b32 %r74, %f114;
451
+ @%p21 st.shared.b32 [ %r73 + 0 ], %r74;
452
+ add.s32 %r140, %r139, 8;
453
+ add.s32 %r75, %r140, %r138;
454
+ mov.b32 %r76, %f118;
455
+ @%p21 st.shared.b32 [ %r75 + 0 ], %r76;
456
+ add.s32 %r141, %r139, 16;
457
+ add.s32 %r77, %r141, %r138;
458
+ @%p21 st.shared.b32 [ %r77 + 0 ], %r72;
459
+ bar.sync 0;
460
+ setp.lt.s32 %p24, %r1, 2;
461
+ add.s32 %r80, %r139, %r30;
462
+ @%p24 ld.shared.b32 %r79, [ %r80 + 0 ];
463
+ mov.b32 %f119, %r79;
464
+ add.s32 %r82, %r140, %r30;
465
+ @%p24 ld.shared.b32 %r81, [ %r82 + 0 ];
466
+ mov.b32 %f120, %r81;
467
+ add.s32 %r84, %r141, %r30;
468
+ @%p24 ld.shared.b32 %r83, [ %r84 + 0 ];
469
+ mov.b32 %f121, %r83;
470
+ shfl.sync.bfly.b32 %r143, %r79, 1, 31, -1;
471
+ mov.b32 %f122, %r143;
472
+ shfl.sync.bfly.b32 %r144, %r81, 1, 31, -1;
473
+ mov.b32 %f123, %r144;
474
+ shfl.sync.bfly.b32 %r86, %r83, 1, 31, -1;
475
+ mov.b32 %f124, %r86;
476
+ $L__tmp14:
477
+ .loc 2 108 21
478
+ sub.f32 %f125, %f122, %f119;
479
+ .loc 2 109 28
480
+ add.f32 %f126, %f121, %f124;
481
+ .loc 2 110 39
482
+ setp.eq.f32 %p50, %f126, 0f00000000;
483
+ .loc 2 110 60
484
+ mov.b32 %r87, %f126;
485
+ div.full.f32 %r85, %r86, %r87;
486
+ mov.b32 %f127, %r85;
487
+ .loc 2 110 49
488
+ selp.f32 %f128, 0f00000000, %f127, %p50;
489
+ .loc 2 112 17
490
+ fma.rn.f32 %f129, %f125, %f128, %f119;
491
+ .loc 2 113 15
492
+ add.f32 %f130, %f120, %f123;
493
+ .loc 2 113 30
494
+ mul.f32 %f131, %f125, %f125;
495
+ .loc 2 113 38
496
+ mul.f32 %f132, %f121, %f131;
497
+ .loc 2 113 22
498
+ fma.rn.f32 %f133, %f132, %f128, %f130;
499
+ $L__tmp15:
500
+ .loc 2 120 46
501
+ and.b32 %r145, %r1, 1;
502
+ setp.eq.b32 %p51, %r145, 1;
503
+ not.pred %p52, %p51;
504
+ and.pred %p27, %p24, %p52;
505
+ mov.b32 %r89, %f129;
506
+ @%p27 st.shared.b32 [ %r80 + 0 ], %r89;
507
+ mov.b32 %r91, %f133;
508
+ @%p27 st.shared.b32 [ %r82 + 0 ], %r91;
509
+ @%p27 st.shared.b32 [ %r84 + 0 ], %r87;
510
+ bar.sync 0;
511
+ ld.shared.f32 %f9, [global_smem];
512
+ ld.shared.f32 %f10, [global_smem+8];
513
+ $L__tmp16:
514
+ .loc 1 62 51
515
+ mov.u32 %r94, 0x0;
516
+ mov.u32 %r95, 0x0;
517
+ mov.u32 %r96, 0x0;
518
+ mov.u32 %r97, 0x0;
519
+ @%p53 ld.global.L1::evict_last.v4.b32 { %r94, %r95, %r96, %r97 }, [ %rd37 + 0 ];
520
+ @!%p53 mov.u32 %r94, %r151;
521
+ @!%p53 mov.u32 %r95, %r151;
522
+ @!%p53 mov.u32 %r96, %r151;
523
+ @!%p53 mov.u32 %r97, %r151;
524
+ .loc 1 63 51
525
+ mov.u32 %r102, 0x0;
526
+ mov.u32 %r103, 0x0;
527
+ @%p53 ld.global.L1::evict_first.v2.b32 { %r102, %r103 }, [ %rd38 + 0 ];
528
+ @!%p53 mov.u32 %r102, %r151;
529
+ @!%p53 mov.u32 %r103, %r151;
530
+ cvt.u16.u32 %rs5, %r102;
531
+ { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r102; }
532
+ cvt.u16.u32 %rs7, %r103;
533
+ { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r103; }
534
+ .loc 1 63 103
535
+ cvt.f32.bf16 %r106, %rs5;
536
+ mov.b32 %f11, %r106;
537
+ cvt.f32.bf16 %r107, %rs6;
538
+ mov.b32 %f12, %r107;
539
+ cvt.f32.bf16 %r108, %rs7;
540
+ mov.b32 %f13, %r108;
541
+ cvt.f32.bf16 %r109, %rs8;
542
+ mov.b32 %f14, %r109;
543
+ .loc 1 64 35
544
+ mul.wide.u32 %rd46, %r4, 4;
545
+ add.s64 %rd39, %rd8, %rd46;
546
+ .loc 1 64 40
547
+ mov.u32 %r110, 0x0;
548
+ mov.u32 %r111, 0x0;
549
+ mov.u32 %r112, 0x0;
550
+ mov.u32 %r113, 0x0;
551
+ @%p53 ld.global.L1::evict_last.v4.b32 { %r110, %r111, %r112, %r113 }, [ %rd39 + 0 ];
552
+ @!%p53 mov.u32 %r110, %r151;
553
+ @!%p53 mov.u32 %r111, %r151;
554
+ @!%p53 mov.u32 %r112, %r151;
555
+ @!%p53 mov.u32 %r113, %r151;
556
+ .loc 1 68 57
557
+ @%p15 bra $L__BB0_4;
558
+ mov.u64 %rd47, assertMessage_1;
559
+ cvta.global.u64 %rd48, %rd47;
560
+ mov.u64 %rd49, assertFile_1;
561
+ cvta.global.u64 %rd50, %rd49;
562
+ mov.u64 %rd51, assertFunc_1;
563
+ cvta.global.u64 %rd52, %rd51;
564
+ { // callseq 1, 0
565
+ .reg .b32 temp_param_reg;
566
+ .param .b64 param0;
567
+ st.param.b64 [param0+0], %rd48;
568
+ .param .b64 param1;
569
+ st.param.b64 [param1+0], %rd50;
570
+ .param .b32 param2;
571
+ st.param.b32 [param2+0], %r175;
572
+ .param .b64 param3;
573
+ st.param.b64 [param3+0], %rd52;
574
+ .param .b64 param4;
575
+ st.param.b64 [param4+0], %rd57;
576
+ call.uni
577
+ __assertfail,
578
+ (
579
+ param0,
580
+ param1,
581
+ param2,
582
+ param3,
583
+ param4
584
+ );
585
+ } // callseq 1
586
+ $L__BB0_4:
587
+ .loc 1 69 54
588
+ mov.u32 %r147, 0x0;
589
+ mov.u32 %r148, 0x0;
590
+ mov.u32 %r149, 0x0;
591
+ mov.u32 %r150, 0x0;
592
+ @%p53 ld.global.L1::evict_first.v4.b32 { %r147, %r148, %r149, %r150 }, [ %rd54 + 0 ];
593
+ @!%p53 mov.u32 %r147, %r151;
594
+ @!%p53 mov.u32 %r148, %r151;
595
+ @!%p53 mov.u32 %r149, %r151;
596
+ @!%p53 mov.u32 %r150, %r151;
597
+ .loc 1 75 24
598
+ mov.b32 %r156, %f10;
599
+ mov.b32 %r157, 1132462080;
600
+ div.full.f32 %r155, %r156, %r157;
601
+ mov.b32 %f134, %r155;
602
+ .loc 1 77 24
603
+ add.f32 %f135, %f134, 0f3727C5AC;
604
+ .loc 1 78 30
605
+ rsqrt.approx.ftz.f32 %f136, %f135;
606
+ .loc 1 69 54
607
+ mov.b32 %f137, %r150;
608
+ .loc 1 62 51
609
+ mov.b32 %f138, %r97;
610
+ .loc 1 70 24
611
+ add.f32 %f139, %f138, %f137;
612
+ .loc 1 72 24
613
+ add.f32 %f140, %f14, %f139;
614
+ .loc 1 73 24
615
+ sub.f32 %f141, %f140, %f9;
616
+ .loc 1 69 54
617
+ mov.b32 %f142, %r149;
618
+ .loc 1 62 51
619
+ mov.b32 %f143, %r96;
620
+ .loc 1 70 24
621
+ add.f32 %f144, %f143, %f142;
622
+ .loc 1 72 24
623
+ add.f32 %f145, %f13, %f144;
624
+ .loc 1 73 24
625
+ sub.f32 %f146, %f145, %f9;
626
+ .loc 1 69 54
627
+ mov.b32 %f147, %r148;
628
+ .loc 1 62 51
629
+ mov.b32 %f148, %r95;
630
+ .loc 1 70 24
631
+ add.f32 %f149, %f148, %f147;
632
+ .loc 1 72 24
633
+ add.f32 %f150, %f12, %f149;
634
+ .loc 1 73 24
635
+ sub.f32 %f151, %f150, %f9;
636
+ .loc 1 69 54
637
+ mov.b32 %f152, %r147;
638
+ .loc 1 62 51
639
+ mov.b32 %f153, %r94;
640
+ .loc 1 70 24
641
+ add.f32 %f154, %f153, %f152;
642
+ .loc 1 72 24
643
+ add.f32 %f155, %f11, %f154;
644
+ .loc 1 73 24
645
+ sub.f32 %f156, %f155, %f9;
646
+ .loc 1 64 40
647
+ mov.b32 %f157, %r110;
648
+ mov.b32 %f158, %r111;
649
+ mov.b32 %f159, %r112;
650
+ mov.b32 %f160, %r113;
651
+ .loc 1 79 24
652
+ mul.f32 %f161, %f156, %f136;
653
+ mul.f32 %f162, %f151, %f136;
654
+ mul.f32 %f163, %f146, %f136;
655
+ mul.f32 %f164, %f141, %f136;
656
+ .loc 1 80 24
657
+ mul.f32 %f165, %f161, %f157;
658
+ mul.f32 %f166, %f162, %f158;
659
+ mul.f32 %f167, %f163, %f159;
660
+ mul.f32 %f168, %f164, %f160;
661
+ .loc 1 82 29
662
+ shl.b64 %rd56, %rd3, 1;
663
+ add.s64 %rd55, %rd9, %rd56;
664
+ .loc 1 82 52
665
+ mov.b32 %r167, %f165;
666
+ cvt.rn.bf16.f32 %rs9, %r167;
667
+ mov.b32 %r168, %f166;
668
+ cvt.rn.bf16.f32 %rs10, %r168;
669
+ mov.b32 %r169, %f167;
670
+ cvt.rn.bf16.f32 %rs11, %r169;
671
+ mov.b32 %r170, %f168;
672
+ cvt.rn.bf16.f32 %rs12, %r170;
673
+ mov.b32 %r173, {%rs9, %rs10};
674
+ mov.b32 %r174, {%rs11, %rs12};
675
+ @%p53 st.global.v2.b32 [ %rd55 + 0 ], { %r173, %r174 };
676
+ .loc 1 58 4
677
+ ret;
678
+ $L__tmp17:
679
+ $L__func_end0:
680
+
681
+ }
682
+ // .globl __nv_rsqrtf
683
+ .visible .func (.param .b32 func_retval0) __nv_rsqrtf(
684
+ .param .b32 __nv_rsqrtf_param_0
685
+ )
686
+ {
687
+ .reg .f32 %f<3>;
688
+ $L__func_begin1:
689
+
690
+ ld.param.f32 %f1, [__nv_rsqrtf_param_0];
691
+ rsqrt.approx.ftz.f32 %f2, %f1;
692
+ st.param.f32 [func_retval0+0], %f2;
693
+ ret;
694
+ $L__func_end1:
695
+
696
+ }
697
+ .file 1 "/tmp/torchinductor_root/pn/cpn3lawg65lpi63gv6c6pn4oikhg6qva2h2qjdpxe6qj4lvttwez.py"
698
+ .file 2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
699
+ .section .debug_abbrev
700
+ {
701
+ .b8 1
702
+ .b8 17
703
+ .b8 1
704
+ .b8 37
705
+ .b8 8
706
+ .b8 19
707
+ .b8 5
708
+ .b8 3
709
+ .b8 8
710
+ .b8 16
711
+ .b8 6
712
+ .b8 27
713
+ .b8 8
714
+ .b8 180
715
+ .b8 66
716
+ .b8 12
717
+ .b8 17
718
+ .b8 1
719
+ .b8 18
720
+ .b8 1
721
+ .b8 0
722
+ .b8 0
723
+ .b8 2
724
+ .b8 46
725
+ .b8 0
726
+ .b8 135
727
+ .b8 64
728
+ .b8 8
729
+ .b8 3
730
+ .b8 8
731
+ .b8 58
732
+ .b8 11
733
+ .b8 59
734
+ .b8 11
735
+ .b8 63
736
+ .b8 12
737
+ .b8 32
738
+ .b8 11
739
+ .b8 0
740
+ .b8 0
741
+ .b8 3
742
+ .b8 46
743
+ .b8 1
744
+ .b8 17
745
+ .b8 1
746
+ .b8 18
747
+ .b8 1
748
+ .b8 64
749
+ .b8 10
750
+ .b8 49
751
+ .b8 19
752
+ .b8 0
753
+ .b8 0
754
+ .b8 4
755
+ .b8 29
756
+ .b8 0
757
+ .b8 49
758
+ .b8 19
759
+ .b8 17
760
+ .b8 1
761
+ .b8 18
762
+ .b8 1
763
+ .b8 88
764
+ .b8 11
765
+ .b8 89
766
+ .b8 11
767
+ .b8 87
768
+ .b8 11
769
+ .b8 0
770
+ .b8 0
771
+ .b8 5
772
+ .b8 29
773
+ .b8 1
774
+ .b8 49
775
+ .b8 19
776
+ .b8 17
777
+ .b8 1
778
+ .b8 18
779
+ .b8 1
780
+ .b8 88
781
+ .b8 11
782
+ .b8 89
783
+ .b8 11
784
+ .b8 87
785
+ .b8 11
786
+ .b8 0
787
+ .b8 0
788
+ .b8 0
789
+ }
790
+ .section .debug_info
791
+ {
792
+ .b32 302
793
+ .b8 2
794
+ .b8 0
795
+ .b32 .debug_abbrev
796
+ .b8 8
797
+ .b8 1
798
+ .b8 116
799
+ .b8 114
800
+ .b8 105
801
+ .b8 116
802
+ .b8 111
803
+ .b8 110
804
+ .b8 0
805
+ .b8 2
806
+ .b8 0
807
+ .b8 99
808
+ .b8 112
809
+ .b8 110
810
+ .b8 51
811
+ .b8 108
812
+ .b8 97
813
+ .b8 119
814
+ .b8 103
815
+ .b8 54
816
+ .b8 53
817
+ .b8 108
818
+ .b8 112
819
+ .b8 105
820
+ .b8 54
821
+ .b8 51
822
+ .b8 103
823
+ .b8 118
824
+ .b8 54
825
+ .b8 99
826
+ .b8 54
827
+ .b8 112
828
+ .b8 110
829
+ .b8 52
830
+ .b8 111
831
+ .b8 105
832
+ .b8 107
833
+ .b8 104
834
+ .b8 103
835
+ .b8 54
836
+ .b8 113
837
+ .b8 118
838
+ .b8 97
839
+ .b8 50
840
+ .b8 104
841
+ .b8 50
842
+ .b8 113
843
+ .b8 106
844
+ .b8 100
845
+ .b8 112
846
+ .b8 120
847
+ .b8 101
848
+ .b8 54
849
+ .b8 113
850
+ .b8 106
851
+ .b8 52
852
+ .b8 108
853
+ .b8 118
854
+ .b8 116
855
+ .b8 116
856
+ .b8 119
857
+ .b8 101
858
+ .b8 122
859
+ .b8 46
860
+ .b8 112
861
+ .b8 121
862
+ .b8 0
863
+ .b32 .debug_line
864
+ .b8 47
865
+ .b8 116
866
+ .b8 109
867
+ .b8 112
868
+ .b8 47
869
+ .b8 116
870
+ .b8 111
871
+ .b8 114
872
+ .b8 99
873
+ .b8 104
874
+ .b8 105
875
+ .b8 110
876
+ .b8 100
877
+ .b8 117
878
+ .b8 99
879
+ .b8 116
880
+ .b8 111
881
+ .b8 114
882
+ .b8 95
883
+ .b8 114
884
+ .b8 111
885
+ .b8 111
886
+ .b8 116
887
+ .b8 47
888
+ .b8 112
889
+ .b8 110
890
+ .b8 0
891
+ .b8 1
892
+ .b64 $L__func_begin0
893
+ .b64 $L__func_end0
894
+ .b8 2
895
+ .b8 116
896
+ .b8 114
897
+ .b8 105
898
+ .b8 116
899
+ .b8 111
900
+ .b8 110
901
+ .b8 95
902
+ .b8 95
903
+ .b8 48
904
+ .b8 100
905
+ .b8 49
906
+ .b8 100
907
+ .b8 50
908
+ .b8 100
909
+ .b8 51
910
+ .b8 100
911
+ .b8 52
912
+ .b8 100
913
+ .b8 53
914
+ .b8 100
915
+ .b8 54
916
+ .b8 100
917
+ .b8 101
918
+ .b8 55
919
+ .b8 100
920
+ .b8 101
921
+ .b8 0
922
+ .b8 116
923
+ .b8 114
924
+ .b8 105
925
+ .b8 116
926
+ .b8 111
927
+ .b8 110
928
+ .b8 95
929
+ .b8 95
930
+ .b8 48
931
+ .b8 100
932
+ .b8 49
933
+ .b8 100
934
+ .b8 50
935
+ .b8 100
936
+ .b8 51
937
+ .b8 100
938
+ .b8 52
939
+ .b8 100
940
+ .b8 53
941
+ .b8 100
942
+ .b8 54
943
+ .b8 100
944
+ .b8 101
945
+ .b8 55
946
+ .b8 100
947
+ .b8 101
948
+ .b8 0
949
+ .b8 1
950
+ .b8 18
951
+ .b8 1
952
+ .b8 1
953
+ .b8 3
954
+ .b64 $L__func_begin0
955
+ .b64 $L__func_end0
956
+ .b8 1
957
+ .b8 156
958
+ .b32 125
959
+ .b8 4
960
+ .b32 125
961
+ .b64 $L__tmp1
962
+ .b64 $L__tmp2
963
+ .b8 2
964
+ .b8 47
965
+ .b8 41
966
+ .b8 5
967
+ .b32 125
968
+ .b64 $L__tmp2
969
+ .b64 $L__tmp15
970
+ .b8 2
971
+ .b8 53
972
+ .b8 44
973
+ .b8 4
974
+ .b32 125
975
+ .b64 $L__tmp2
976
+ .b64 $L__tmp15
977
+ .b8 2
978
+ .b8 120
979
+ .b8 46
980
+ .b8 0
981
+ .b8 4
982
+ .b32 125
983
+ .b64 $L__tmp3
984
+ .b64 $L__tmp16
985
+ .b8 2
986
+ .b8 53
987
+ .b8 44
988
+ .b8 0
989
+ .b8 0
990
+ }
991
+ .section .debug_pubnames
992
+ {
993
+ .b32 $L__pubNames_end0-$L__pubNames_start0
994
+ $L__pubNames_start0:
995
+ .b8 2
996
+ .b8 0
997
+ .b32 .debug_info
998
+ .b32 306
999
+ .b32 125
1000
+ .b8 116
1001
+ .b8 114
1002
+ .b8 105
1003
+ .b8 116
1004
+ .b8 111
1005
+ .b8 110
1006
+ .b8 95
1007
+ .b8 95
1008
+ .b8 48
1009
+ .b8 100
1010
+ .b8 49
1011
+ .b8 100
1012
+ .b8 50
1013
+ .b8 100
1014
+ .b8 51
1015
+ .b8 100
1016
+ .b8 52
1017
+ .b8 100
1018
+ .b8 53
1019
+ .b8 100
1020
+ .b8 54
1021
+ .b8 100
1022
+ .b8 101
1023
+ .b8 55
1024
+ .b8 100
1025
+ .b8 101
1026
+ .b8 0
1027
+ .b32 0
1028
+ $L__pubNames_end0:
1029
+ }
1030
+ .section .debug_pubtypes
1031
+ {
1032
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
1033
+ $L__pubTypes_start0:
1034
+ .b8 2
1035
+ .b8 0
1036
+ .b32 .debug_info
1037
+ .b32 306
1038
+ .b32 0
1039
+ $L__pubTypes_end0:
1040
+ }
1041
+ .section .debug_loc { }
.triton/dump/415aac87553b7d064f52694fa7254686/triton_.llir ADDED
@@ -0,0 +1,860 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
5
+
6
+ define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !7 {
7
+ %4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
8
+ %5 = shl i32 %4, 3, !dbg !10
9
+ %6 = and i32 %5, 1016, !dbg !10
10
+ %7 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #4, !dbg !11
11
+ %8 = shl i32 %7, 10, !dbg !12
12
+ %9 = or i32 %8, %6, !dbg !13
13
+ %10 = sext i32 %9 to i64, !dbg !14
14
+ %11 = getelementptr i16, ptr addrspace(1) %0, i64 %10, !dbg !14
15
+ %12 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l,b"(ptr addrspace(1) %11, i1 true) #4, !dbg !15
16
+ %13 = extractvalue { i32, i32, i32, i32 } %12, 0, !dbg !15
17
+ %14 = extractvalue { i32, i32, i32, i32 } %12, 1, !dbg !15
18
+ %15 = extractvalue { i32, i32, i32, i32 } %12, 2, !dbg !15
19
+ %16 = extractvalue { i32, i32, i32, i32 } %12, 3, !dbg !15
20
+ %17 = trunc i32 %13 to i16, !dbg !15
21
+ %extelt.offset = lshr i32 %13, 16, !dbg !15
22
+ %18 = trunc i32 %extelt.offset to i16, !dbg !15
23
+ %19 = trunc i32 %14 to i16, !dbg !15
24
+ %extelt.offset1 = lshr i32 %14, 16, !dbg !15
25
+ %20 = trunc i32 %extelt.offset1 to i16, !dbg !15
26
+ %21 = trunc i32 %15 to i16, !dbg !15
27
+ %extelt.offset2 = lshr i32 %15, 16, !dbg !15
28
+ %22 = trunc i32 %extelt.offset2 to i16, !dbg !15
29
+ %23 = trunc i32 %16 to i16, !dbg !15
30
+ %extelt.offset3 = lshr i32 %16, 16, !dbg !15
31
+ %24 = trunc i32 %extelt.offset3 to i16, !dbg !15
32
+ %25 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %17) #4, !dbg !16
33
+ %26 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %18) #4, !dbg !16
34
+ %27 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %19) #4, !dbg !16
35
+ %28 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %20) #4, !dbg !16
36
+ %29 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %21) #4, !dbg !16
37
+ %30 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %22) #4, !dbg !16
38
+ %31 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %23) #4, !dbg !16
39
+ %32 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %24) #4, !dbg !16
40
+ %33 = fmul float %25, 0x3FE6A09E60000000, !dbg !17
41
+ %34 = fmul float %26, 0x3FE6A09E60000000, !dbg !17
42
+ %35 = fmul float %27, 0x3FE6A09E60000000, !dbg !17
43
+ %36 = fmul float %28, 0x3FE6A09E60000000, !dbg !17
44
+ %37 = fmul float %29, 0x3FE6A09E60000000, !dbg !17
45
+ %38 = fmul float %30, 0x3FE6A09E60000000, !dbg !17
46
+ %39 = fmul float %31, 0x3FE6A09E60000000, !dbg !17
47
+ %40 = fmul float %32, 0x3FE6A09E60000000, !dbg !17
48
+ %41 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
49
+ %.not.i = icmp eq i32 %41, 0, !dbg !18
50
+ %42 = tail call float @llvm.nvvm.fabs.ftz.f(float %33) #4, !dbg !18
51
+ %43 = tail call float @llvm.nvvm.fabs.f(float %33) #4, !dbg !18
52
+ %.0.i = select i1 %.not.i, float %43, float %42, !dbg !18
53
+ %44 = fcmp oge float %.0.i, 0x3FF00C1FC0000000, !dbg !18
54
+ br i1 %44, label %__nv_fabsf.exit1.i, label %46, !dbg !18
55
+
56
+ __nv_fabsf.exit1.i: ; preds = %3
57
+ %45 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
58
+ %.not1.i = icmp eq i32 %45, 0, !dbg !18
59
+ %.01.i = select i1 %.not1.i, float %43, float %42, !dbg !18
60
+ br label %__internal_fmad.exit.i, !dbg !18
61
+
62
+ 46: ; preds = %3
63
+ %47 = fmul float %33, %33, !dbg !18
64
+ br label %__internal_fmad.exit.i, !dbg !18
65
+
66
+ __internal_fmad.exit.i: ; preds = %46, %__nv_fabsf.exit1.i
67
+ %48 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i ], [ 0x3FC06EBA60000000, %46 ], !dbg !18
68
+ %49 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i ], [ 0xBFD8127580000000, %46 ], !dbg !18
69
+ %50 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i ], [ 0x3FBCE315E0000000, %46 ], !dbg !18
70
+ %51 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i ], [ 0xBF9B837CE0000000, %46 ], !dbg !18
71
+ %52 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i ], [ 0x3F755ABD40000000, %46 ], !dbg !18
72
+ %53 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i ], [ 0xBF4AE9A400000000, %46 ], !dbg !18
73
+ %54 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i ], [ 0x3F163D2D40000000, %46 ], !dbg !18
74
+ %55 = phi float [ %.01.i, %__nv_fabsf.exit1.i ], [ %47, %46 ], !dbg !18
75
+ %56 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
76
+ %.not2.i = icmp eq i32 %56, 0, !dbg !18
77
+ %57 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %54, float %55, float %53) #4, !dbg !18
78
+ %58 = tail call float @llvm.nvvm.fma.rn.f(float %54, float %55, float %53) #4, !dbg !18
79
+ %.02.i = select i1 %.not2.i, float %58, float %57, !dbg !18
80
+ %59 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
81
+ %.not3.i = icmp eq i32 %59, 0, !dbg !18
82
+ %60 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i, float %55, float %52) #4, !dbg !18
83
+ %61 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i, float %55, float %52) #4, !dbg !18
84
+ %.03.i = select i1 %.not3.i, float %61, float %60, !dbg !18
85
+ %62 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
86
+ %.not4.i = icmp eq i32 %62, 0, !dbg !18
87
+ %63 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i, float %55, float %51) #4, !dbg !18
88
+ %64 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i, float %55, float %51) #4, !dbg !18
89
+ %.04.i = select i1 %.not4.i, float %64, float %63, !dbg !18
90
+ %65 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
91
+ %.not5.i = icmp eq i32 %65, 0, !dbg !18
92
+ %66 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i, float %55, float %50) #4, !dbg !18
93
+ %67 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i, float %55, float %50) #4, !dbg !18
94
+ %.05.i = select i1 %.not5.i, float %67, float %66, !dbg !18
95
+ %68 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
96
+ %.not6.i = icmp eq i32 %68, 0, !dbg !18
97
+ %69 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i, float %55, float %49) #4, !dbg !18
98
+ %70 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i, float %55, float %49) #4, !dbg !18
99
+ %.06.i = select i1 %.not6.i, float %70, float %69, !dbg !18
100
+ %71 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
101
+ %.not7.i = icmp eq i32 %71, 0, !dbg !18
102
+ %72 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i, float %55, float %48) #4, !dbg !18
103
+ %73 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i, float %55, float %48) #4, !dbg !18
104
+ %.07.i = select i1 %.not7.i, float %73, float %72, !dbg !18
105
+ %74 = fneg float %55, !dbg !18
106
+ %75 = select i1 %44, float %74, float %33, !dbg !18
107
+ %76 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
108
+ %.not8.i = icmp eq i32 %76, 0, !dbg !18
109
+ %77 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i, float %75, float %75) #4, !dbg !18
110
+ %78 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i, float %75, float %75) #4, !dbg !18
111
+ %.08.i = select i1 %.not8.i, float %78, float %77, !dbg !18
112
+ br i1 %44, label %79, label %__nv_erff.exit, !dbg !18
113
+
114
+ 79: ; preds = %__internal_fmad.exit.i
115
+ %80 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i) #4, !dbg !18
116
+ %81 = fsub float 1.000000e+00, %80, !dbg !18
117
+ %82 = bitcast float %81 to i32, !dbg !18
118
+ %83 = bitcast float %33 to i32, !dbg !18
119
+ %84 = and i32 %83, -2147483648, !dbg !18
120
+ %85 = or i32 %84, %82, !dbg !18
121
+ %86 = bitcast i32 %85 to float, !dbg !18
122
+ br label %__nv_erff.exit, !dbg !18
123
+
124
+ __nv_erff.exit: ; preds = %__internal_fmad.exit.i, %79
125
+ %r.0.i = phi float [ %86, %79 ], [ %.08.i, %__internal_fmad.exit.i ], !dbg !18
126
+ %87 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
127
+ %.not.i4 = icmp eq i32 %87, 0, !dbg !18
128
+ %88 = tail call float @llvm.nvvm.fabs.ftz.f(float %34) #4, !dbg !18
129
+ %89 = tail call float @llvm.nvvm.fabs.f(float %34) #4, !dbg !18
130
+ %.0.i5 = select i1 %.not.i4, float %89, float %88, !dbg !18
131
+ %90 = fcmp oge float %.0.i5, 0x3FF00C1FC0000000, !dbg !18
132
+ br i1 %90, label %__nv_fabsf.exit1.i22, label %92, !dbg !18
133
+
134
+ __nv_fabsf.exit1.i22: ; preds = %__nv_erff.exit
135
+ %91 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
136
+ %.not1.i23 = icmp eq i32 %91, 0, !dbg !18
137
+ %.01.i24 = select i1 %.not1.i23, float %89, float %88, !dbg !18
138
+ br label %__internal_fmad.exit.i6, !dbg !18
139
+
140
+ 92: ; preds = %__nv_erff.exit
141
+ %93 = fmul float %34, %34, !dbg !18
142
+ br label %__internal_fmad.exit.i6, !dbg !18
143
+
144
+ __internal_fmad.exit.i6: ; preds = %92, %__nv_fabsf.exit1.i22
145
+ %94 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i22 ], [ 0x3FC06EBA60000000, %92 ], !dbg !18
146
+ %95 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i22 ], [ 0xBFD8127580000000, %92 ], !dbg !18
147
+ %96 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i22 ], [ 0x3FBCE315E0000000, %92 ], !dbg !18
148
+ %97 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i22 ], [ 0xBF9B837CE0000000, %92 ], !dbg !18
149
+ %98 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i22 ], [ 0x3F755ABD40000000, %92 ], !dbg !18
150
+ %99 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i22 ], [ 0xBF4AE9A400000000, %92 ], !dbg !18
151
+ %100 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i22 ], [ 0x3F163D2D40000000, %92 ], !dbg !18
152
+ %101 = phi float [ %.01.i24, %__nv_fabsf.exit1.i22 ], [ %93, %92 ], !dbg !18
153
+ %102 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
154
+ %.not2.i7 = icmp eq i32 %102, 0, !dbg !18
155
+ %103 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %100, float %101, float %99) #4, !dbg !18
156
+ %104 = tail call float @llvm.nvvm.fma.rn.f(float %100, float %101, float %99) #4, !dbg !18
157
+ %.02.i8 = select i1 %.not2.i7, float %104, float %103, !dbg !18
158
+ %105 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
159
+ %.not3.i9 = icmp eq i32 %105, 0, !dbg !18
160
+ %106 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i8, float %101, float %98) #4, !dbg !18
161
+ %107 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i8, float %101, float %98) #4, !dbg !18
162
+ %.03.i10 = select i1 %.not3.i9, float %107, float %106, !dbg !18
163
+ %108 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
164
+ %.not4.i11 = icmp eq i32 %108, 0, !dbg !18
165
+ %109 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i10, float %101, float %97) #4, !dbg !18
166
+ %110 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i10, float %101, float %97) #4, !dbg !18
167
+ %.04.i12 = select i1 %.not4.i11, float %110, float %109, !dbg !18
168
+ %111 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
169
+ %.not5.i13 = icmp eq i32 %111, 0, !dbg !18
170
+ %112 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i12, float %101, float %96) #4, !dbg !18
171
+ %113 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i12, float %101, float %96) #4, !dbg !18
172
+ %.05.i14 = select i1 %.not5.i13, float %113, float %112, !dbg !18
173
+ %114 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
174
+ %.not6.i15 = icmp eq i32 %114, 0, !dbg !18
175
+ %115 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i14, float %101, float %95) #4, !dbg !18
176
+ %116 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i14, float %101, float %95) #4, !dbg !18
177
+ %.06.i16 = select i1 %.not6.i15, float %116, float %115, !dbg !18
178
+ %117 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
179
+ %.not7.i17 = icmp eq i32 %117, 0, !dbg !18
180
+ %118 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i16, float %101, float %94) #4, !dbg !18
181
+ %119 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i16, float %101, float %94) #4, !dbg !18
182
+ %.07.i18 = select i1 %.not7.i17, float %119, float %118, !dbg !18
183
+ %120 = fneg float %101, !dbg !18
184
+ %121 = select i1 %90, float %120, float %34, !dbg !18
185
+ %122 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
186
+ %.not8.i19 = icmp eq i32 %122, 0, !dbg !18
187
+ %123 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i18, float %121, float %121) #4, !dbg !18
188
+ %124 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i18, float %121, float %121) #4, !dbg !18
189
+ %.08.i20 = select i1 %.not8.i19, float %124, float %123, !dbg !18
190
+ br i1 %90, label %125, label %__nv_erff.exit25, !dbg !18
191
+
192
+ 125: ; preds = %__internal_fmad.exit.i6
193
+ %126 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i20) #4, !dbg !18
194
+ %127 = fsub float 1.000000e+00, %126, !dbg !18
195
+ %128 = bitcast float %127 to i32, !dbg !18
196
+ %129 = bitcast float %34 to i32, !dbg !18
197
+ %130 = and i32 %129, -2147483648, !dbg !18
198
+ %131 = or i32 %130, %128, !dbg !18
199
+ %132 = bitcast i32 %131 to float, !dbg !18
200
+ br label %__nv_erff.exit25, !dbg !18
201
+
202
+ __nv_erff.exit25: ; preds = %__internal_fmad.exit.i6, %125
203
+ %r.0.i21 = phi float [ %132, %125 ], [ %.08.i20, %__internal_fmad.exit.i6 ], !dbg !18
204
+ %133 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
205
+ %.not.i26 = icmp eq i32 %133, 0, !dbg !18
206
+ %134 = tail call float @llvm.nvvm.fabs.ftz.f(float %35) #4, !dbg !18
207
+ %135 = tail call float @llvm.nvvm.fabs.f(float %35) #4, !dbg !18
208
+ %.0.i27 = select i1 %.not.i26, float %135, float %134, !dbg !18
209
+ %136 = fcmp oge float %.0.i27, 0x3FF00C1FC0000000, !dbg !18
210
+ br i1 %136, label %__nv_fabsf.exit1.i44, label %138, !dbg !18
211
+
212
+ __nv_fabsf.exit1.i44: ; preds = %__nv_erff.exit25
213
+ %137 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
214
+ %.not1.i45 = icmp eq i32 %137, 0, !dbg !18
215
+ %.01.i46 = select i1 %.not1.i45, float %135, float %134, !dbg !18
216
+ br label %__internal_fmad.exit.i28, !dbg !18
217
+
218
+ 138: ; preds = %__nv_erff.exit25
219
+ %139 = fmul float %35, %35, !dbg !18
220
+ br label %__internal_fmad.exit.i28, !dbg !18
221
+
222
+ __internal_fmad.exit.i28: ; preds = %138, %__nv_fabsf.exit1.i44
223
+ %140 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i44 ], [ 0x3FC06EBA60000000, %138 ], !dbg !18
224
+ %141 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i44 ], [ 0xBFD8127580000000, %138 ], !dbg !18
225
+ %142 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i44 ], [ 0x3FBCE315E0000000, %138 ], !dbg !18
226
+ %143 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i44 ], [ 0xBF9B837CE0000000, %138 ], !dbg !18
227
+ %144 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i44 ], [ 0x3F755ABD40000000, %138 ], !dbg !18
228
+ %145 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i44 ], [ 0xBF4AE9A400000000, %138 ], !dbg !18
229
+ %146 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i44 ], [ 0x3F163D2D40000000, %138 ], !dbg !18
230
+ %147 = phi float [ %.01.i46, %__nv_fabsf.exit1.i44 ], [ %139, %138 ], !dbg !18
231
+ %148 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
232
+ %.not2.i29 = icmp eq i32 %148, 0, !dbg !18
233
+ %149 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %146, float %147, float %145) #4, !dbg !18
234
+ %150 = tail call float @llvm.nvvm.fma.rn.f(float %146, float %147, float %145) #4, !dbg !18
235
+ %.02.i30 = select i1 %.not2.i29, float %150, float %149, !dbg !18
236
+ %151 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
237
+ %.not3.i31 = icmp eq i32 %151, 0, !dbg !18
238
+ %152 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i30, float %147, float %144) #4, !dbg !18
239
+ %153 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i30, float %147, float %144) #4, !dbg !18
240
+ %.03.i32 = select i1 %.not3.i31, float %153, float %152, !dbg !18
241
+ %154 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
242
+ %.not4.i33 = icmp eq i32 %154, 0, !dbg !18
243
+ %155 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i32, float %147, float %143) #4, !dbg !18
244
+ %156 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i32, float %147, float %143) #4, !dbg !18
245
+ %.04.i34 = select i1 %.not4.i33, float %156, float %155, !dbg !18
246
+ %157 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
247
+ %.not5.i35 = icmp eq i32 %157, 0, !dbg !18
248
+ %158 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i34, float %147, float %142) #4, !dbg !18
249
+ %159 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i34, float %147, float %142) #4, !dbg !18
250
+ %.05.i36 = select i1 %.not5.i35, float %159, float %158, !dbg !18
251
+ %160 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
252
+ %.not6.i37 = icmp eq i32 %160, 0, !dbg !18
253
+ %161 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i36, float %147, float %141) #4, !dbg !18
254
+ %162 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i36, float %147, float %141) #4, !dbg !18
255
+ %.06.i38 = select i1 %.not6.i37, float %162, float %161, !dbg !18
256
+ %163 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
257
+ %.not7.i39 = icmp eq i32 %163, 0, !dbg !18
258
+ %164 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i38, float %147, float %140) #4, !dbg !18
259
+ %165 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i38, float %147, float %140) #4, !dbg !18
260
+ %.07.i40 = select i1 %.not7.i39, float %165, float %164, !dbg !18
261
+ %166 = fneg float %147, !dbg !18
262
+ %167 = select i1 %136, float %166, float %35, !dbg !18
263
+ %168 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
264
+ %.not8.i41 = icmp eq i32 %168, 0, !dbg !18
265
+ %169 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i40, float %167, float %167) #4, !dbg !18
266
+ %170 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i40, float %167, float %167) #4, !dbg !18
267
+ %.08.i42 = select i1 %.not8.i41, float %170, float %169, !dbg !18
268
+ br i1 %136, label %171, label %__nv_erff.exit47, !dbg !18
269
+
270
+ 171: ; preds = %__internal_fmad.exit.i28
271
+ %172 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i42) #4, !dbg !18
272
+ %173 = fsub float 1.000000e+00, %172, !dbg !18
273
+ %174 = bitcast float %173 to i32, !dbg !18
274
+ %175 = bitcast float %35 to i32, !dbg !18
275
+ %176 = and i32 %175, -2147483648, !dbg !18
276
+ %177 = or i32 %176, %174, !dbg !18
277
+ %178 = bitcast i32 %177 to float, !dbg !18
278
+ br label %__nv_erff.exit47, !dbg !18
279
+
280
+ __nv_erff.exit47: ; preds = %__internal_fmad.exit.i28, %171
281
+ %r.0.i43 = phi float [ %178, %171 ], [ %.08.i42, %__internal_fmad.exit.i28 ], !dbg !18
282
+ %179 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
283
+ %.not.i48 = icmp eq i32 %179, 0, !dbg !18
284
+ %180 = tail call float @llvm.nvvm.fabs.ftz.f(float %36) #4, !dbg !18
285
+ %181 = tail call float @llvm.nvvm.fabs.f(float %36) #4, !dbg !18
286
+ %.0.i49 = select i1 %.not.i48, float %181, float %180, !dbg !18
287
+ %182 = fcmp oge float %.0.i49, 0x3FF00C1FC0000000, !dbg !18
288
+ br i1 %182, label %__nv_fabsf.exit1.i66, label %184, !dbg !18
289
+
290
+ __nv_fabsf.exit1.i66: ; preds = %__nv_erff.exit47
291
+ %183 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
292
+ %.not1.i67 = icmp eq i32 %183, 0, !dbg !18
293
+ %.01.i68 = select i1 %.not1.i67, float %181, float %180, !dbg !18
294
+ br label %__internal_fmad.exit.i50, !dbg !18
295
+
296
+ 184: ; preds = %__nv_erff.exit47
297
+ %185 = fmul float %36, %36, !dbg !18
298
+ br label %__internal_fmad.exit.i50, !dbg !18
299
+
300
+ __internal_fmad.exit.i50: ; preds = %184, %__nv_fabsf.exit1.i66
301
+ %186 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i66 ], [ 0x3FC06EBA60000000, %184 ], !dbg !18
302
+ %187 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i66 ], [ 0xBFD8127580000000, %184 ], !dbg !18
303
+ %188 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i66 ], [ 0x3FBCE315E0000000, %184 ], !dbg !18
304
+ %189 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i66 ], [ 0xBF9B837CE0000000, %184 ], !dbg !18
305
+ %190 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i66 ], [ 0x3F755ABD40000000, %184 ], !dbg !18
306
+ %191 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i66 ], [ 0xBF4AE9A400000000, %184 ], !dbg !18
307
+ %192 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i66 ], [ 0x3F163D2D40000000, %184 ], !dbg !18
308
+ %193 = phi float [ %.01.i68, %__nv_fabsf.exit1.i66 ], [ %185, %184 ], !dbg !18
309
+ %194 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
310
+ %.not2.i51 = icmp eq i32 %194, 0, !dbg !18
311
+ %195 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %192, float %193, float %191) #4, !dbg !18
312
+ %196 = tail call float @llvm.nvvm.fma.rn.f(float %192, float %193, float %191) #4, !dbg !18
313
+ %.02.i52 = select i1 %.not2.i51, float %196, float %195, !dbg !18
314
+ %197 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
315
+ %.not3.i53 = icmp eq i32 %197, 0, !dbg !18
316
+ %198 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i52, float %193, float %190) #4, !dbg !18
317
+ %199 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i52, float %193, float %190) #4, !dbg !18
318
+ %.03.i54 = select i1 %.not3.i53, float %199, float %198, !dbg !18
319
+ %200 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
320
+ %.not4.i55 = icmp eq i32 %200, 0, !dbg !18
321
+ %201 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i54, float %193, float %189) #4, !dbg !18
322
+ %202 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i54, float %193, float %189) #4, !dbg !18
323
+ %.04.i56 = select i1 %.not4.i55, float %202, float %201, !dbg !18
324
+ %203 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
325
+ %.not5.i57 = icmp eq i32 %203, 0, !dbg !18
326
+ %204 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i56, float %193, float %188) #4, !dbg !18
327
+ %205 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i56, float %193, float %188) #4, !dbg !18
328
+ %.05.i58 = select i1 %.not5.i57, float %205, float %204, !dbg !18
329
+ %206 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
330
+ %.not6.i59 = icmp eq i32 %206, 0, !dbg !18
331
+ %207 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i58, float %193, float %187) #4, !dbg !18
332
+ %208 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i58, float %193, float %187) #4, !dbg !18
333
+ %.06.i60 = select i1 %.not6.i59, float %208, float %207, !dbg !18
334
+ %209 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
335
+ %.not7.i61 = icmp eq i32 %209, 0, !dbg !18
336
+ %210 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i60, float %193, float %186) #4, !dbg !18
337
+ %211 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i60, float %193, float %186) #4, !dbg !18
338
+ %.07.i62 = select i1 %.not7.i61, float %211, float %210, !dbg !18
339
+ %212 = fneg float %193, !dbg !18
340
+ %213 = select i1 %182, float %212, float %36, !dbg !18
341
+ %214 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
342
+ %.not8.i63 = icmp eq i32 %214, 0, !dbg !18
343
+ %215 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i62, float %213, float %213) #4, !dbg !18
344
+ %216 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i62, float %213, float %213) #4, !dbg !18
345
+ %.08.i64 = select i1 %.not8.i63, float %216, float %215, !dbg !18
346
+ br i1 %182, label %217, label %__nv_erff.exit69, !dbg !18
347
+
348
+ 217: ; preds = %__internal_fmad.exit.i50
349
+ %218 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i64) #4, !dbg !18
350
+ %219 = fsub float 1.000000e+00, %218, !dbg !18
351
+ %220 = bitcast float %219 to i32, !dbg !18
352
+ %221 = bitcast float %36 to i32, !dbg !18
353
+ %222 = and i32 %221, -2147483648, !dbg !18
354
+ %223 = or i32 %222, %220, !dbg !18
355
+ %224 = bitcast i32 %223 to float, !dbg !18
356
+ br label %__nv_erff.exit69, !dbg !18
357
+
358
+ __nv_erff.exit69: ; preds = %__internal_fmad.exit.i50, %217
359
+ %r.0.i65 = phi float [ %224, %217 ], [ %.08.i64, %__internal_fmad.exit.i50 ], !dbg !18
360
+ %225 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
361
+ %.not.i70 = icmp eq i32 %225, 0, !dbg !18
362
+ %226 = tail call float @llvm.nvvm.fabs.ftz.f(float %37) #4, !dbg !18
363
+ %227 = tail call float @llvm.nvvm.fabs.f(float %37) #4, !dbg !18
364
+ %.0.i71 = select i1 %.not.i70, float %227, float %226, !dbg !18
365
+ %228 = fcmp oge float %.0.i71, 0x3FF00C1FC0000000, !dbg !18
366
+ br i1 %228, label %__nv_fabsf.exit1.i88, label %230, !dbg !18
367
+
368
+ __nv_fabsf.exit1.i88: ; preds = %__nv_erff.exit69
369
+ %229 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
370
+ %.not1.i89 = icmp eq i32 %229, 0, !dbg !18
371
+ %.01.i90 = select i1 %.not1.i89, float %227, float %226, !dbg !18
372
+ br label %__internal_fmad.exit.i72, !dbg !18
373
+
374
+ 230: ; preds = %__nv_erff.exit69
375
+ %231 = fmul float %37, %37, !dbg !18
376
+ br label %__internal_fmad.exit.i72, !dbg !18
377
+
378
+ __internal_fmad.exit.i72: ; preds = %230, %__nv_fabsf.exit1.i88
379
+ %232 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i88 ], [ 0x3FC06EBA60000000, %230 ], !dbg !18
380
+ %233 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i88 ], [ 0xBFD8127580000000, %230 ], !dbg !18
381
+ %234 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i88 ], [ 0x3FBCE315E0000000, %230 ], !dbg !18
382
+ %235 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i88 ], [ 0xBF9B837CE0000000, %230 ], !dbg !18
383
+ %236 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i88 ], [ 0x3F755ABD40000000, %230 ], !dbg !18
384
+ %237 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i88 ], [ 0xBF4AE9A400000000, %230 ], !dbg !18
385
+ %238 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i88 ], [ 0x3F163D2D40000000, %230 ], !dbg !18
386
+ %239 = phi float [ %.01.i90, %__nv_fabsf.exit1.i88 ], [ %231, %230 ], !dbg !18
387
+ %240 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
388
+ %.not2.i73 = icmp eq i32 %240, 0, !dbg !18
389
+ %241 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %238, float %239, float %237) #4, !dbg !18
390
+ %242 = tail call float @llvm.nvvm.fma.rn.f(float %238, float %239, float %237) #4, !dbg !18
391
+ %.02.i74 = select i1 %.not2.i73, float %242, float %241, !dbg !18
392
+ %243 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
393
+ %.not3.i75 = icmp eq i32 %243, 0, !dbg !18
394
+ %244 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i74, float %239, float %236) #4, !dbg !18
395
+ %245 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i74, float %239, float %236) #4, !dbg !18
396
+ %.03.i76 = select i1 %.not3.i75, float %245, float %244, !dbg !18
397
+ %246 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
398
+ %.not4.i77 = icmp eq i32 %246, 0, !dbg !18
399
+ %247 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i76, float %239, float %235) #4, !dbg !18
400
+ %248 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i76, float %239, float %235) #4, !dbg !18
401
+ %.04.i78 = select i1 %.not4.i77, float %248, float %247, !dbg !18
402
+ %249 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
403
+ %.not5.i79 = icmp eq i32 %249, 0, !dbg !18
404
+ %250 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i78, float %239, float %234) #4, !dbg !18
405
+ %251 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i78, float %239, float %234) #4, !dbg !18
406
+ %.05.i80 = select i1 %.not5.i79, float %251, float %250, !dbg !18
407
+ %252 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
408
+ %.not6.i81 = icmp eq i32 %252, 0, !dbg !18
409
+ %253 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i80, float %239, float %233) #4, !dbg !18
410
+ %254 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i80, float %239, float %233) #4, !dbg !18
411
+ %.06.i82 = select i1 %.not6.i81, float %254, float %253, !dbg !18
412
+ %255 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
413
+ %.not7.i83 = icmp eq i32 %255, 0, !dbg !18
414
+ %256 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i82, float %239, float %232) #4, !dbg !18
415
+ %257 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i82, float %239, float %232) #4, !dbg !18
416
+ %.07.i84 = select i1 %.not7.i83, float %257, float %256, !dbg !18
417
+ %258 = fneg float %239, !dbg !18
418
+ %259 = select i1 %228, float %258, float %37, !dbg !18
419
+ %260 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
420
+ %.not8.i85 = icmp eq i32 %260, 0, !dbg !18
421
+ %261 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i84, float %259, float %259) #4, !dbg !18
422
+ %262 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i84, float %259, float %259) #4, !dbg !18
423
+ %.08.i86 = select i1 %.not8.i85, float %262, float %261, !dbg !18
424
+ br i1 %228, label %263, label %__nv_erff.exit91, !dbg !18
425
+
426
+ 263: ; preds = %__internal_fmad.exit.i72
427
+ %264 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i86) #4, !dbg !18
428
+ %265 = fsub float 1.000000e+00, %264, !dbg !18
429
+ %266 = bitcast float %265 to i32, !dbg !18
430
+ %267 = bitcast float %37 to i32, !dbg !18
431
+ %268 = and i32 %267, -2147483648, !dbg !18
432
+ %269 = or i32 %268, %266, !dbg !18
433
+ %270 = bitcast i32 %269 to float, !dbg !18
434
+ br label %__nv_erff.exit91, !dbg !18
435
+
436
+ __nv_erff.exit91: ; preds = %__internal_fmad.exit.i72, %263
437
+ %r.0.i87 = phi float [ %270, %263 ], [ %.08.i86, %__internal_fmad.exit.i72 ], !dbg !18
438
+ %271 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
439
+ %.not.i92 = icmp eq i32 %271, 0, !dbg !18
440
+ %272 = tail call float @llvm.nvvm.fabs.ftz.f(float %38) #4, !dbg !18
441
+ %273 = tail call float @llvm.nvvm.fabs.f(float %38) #4, !dbg !18
442
+ %.0.i93 = select i1 %.not.i92, float %273, float %272, !dbg !18
443
+ %274 = fcmp oge float %.0.i93, 0x3FF00C1FC0000000, !dbg !18
444
+ br i1 %274, label %__nv_fabsf.exit1.i110, label %276, !dbg !18
445
+
446
+ __nv_fabsf.exit1.i110: ; preds = %__nv_erff.exit91
447
+ %275 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
448
+ %.not1.i111 = icmp eq i32 %275, 0, !dbg !18
449
+ %.01.i112 = select i1 %.not1.i111, float %273, float %272, !dbg !18
450
+ br label %__internal_fmad.exit.i94, !dbg !18
451
+
452
+ 276: ; preds = %__nv_erff.exit91
453
+ %277 = fmul float %38, %38, !dbg !18
454
+ br label %__internal_fmad.exit.i94, !dbg !18
455
+
456
+ __internal_fmad.exit.i94: ; preds = %276, %__nv_fabsf.exit1.i110
457
+ %278 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i110 ], [ 0x3FC06EBA60000000, %276 ], !dbg !18
458
+ %279 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i110 ], [ 0xBFD8127580000000, %276 ], !dbg !18
459
+ %280 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i110 ], [ 0x3FBCE315E0000000, %276 ], !dbg !18
460
+ %281 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i110 ], [ 0xBF9B837CE0000000, %276 ], !dbg !18
461
+ %282 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i110 ], [ 0x3F755ABD40000000, %276 ], !dbg !18
462
+ %283 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i110 ], [ 0xBF4AE9A400000000, %276 ], !dbg !18
463
+ %284 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i110 ], [ 0x3F163D2D40000000, %276 ], !dbg !18
464
+ %285 = phi float [ %.01.i112, %__nv_fabsf.exit1.i110 ], [ %277, %276 ], !dbg !18
465
+ %286 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
466
+ %.not2.i95 = icmp eq i32 %286, 0, !dbg !18
467
+ %287 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %284, float %285, float %283) #4, !dbg !18
468
+ %288 = tail call float @llvm.nvvm.fma.rn.f(float %284, float %285, float %283) #4, !dbg !18
469
+ %.02.i96 = select i1 %.not2.i95, float %288, float %287, !dbg !18
470
+ %289 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
471
+ %.not3.i97 = icmp eq i32 %289, 0, !dbg !18
472
+ %290 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i96, float %285, float %282) #4, !dbg !18
473
+ %291 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i96, float %285, float %282) #4, !dbg !18
474
+ %.03.i98 = select i1 %.not3.i97, float %291, float %290, !dbg !18
475
+ %292 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
476
+ %.not4.i99 = icmp eq i32 %292, 0, !dbg !18
477
+ %293 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i98, float %285, float %281) #4, !dbg !18
478
+ %294 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i98, float %285, float %281) #4, !dbg !18
479
+ %.04.i100 = select i1 %.not4.i99, float %294, float %293, !dbg !18
480
+ %295 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
481
+ %.not5.i101 = icmp eq i32 %295, 0, !dbg !18
482
+ %296 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i100, float %285, float %280) #4, !dbg !18
483
+ %297 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i100, float %285, float %280) #4, !dbg !18
484
+ %.05.i102 = select i1 %.not5.i101, float %297, float %296, !dbg !18
485
+ %298 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
486
+ %.not6.i103 = icmp eq i32 %298, 0, !dbg !18
487
+ %299 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i102, float %285, float %279) #4, !dbg !18
488
+ %300 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i102, float %285, float %279) #4, !dbg !18
489
+ %.06.i104 = select i1 %.not6.i103, float %300, float %299, !dbg !18
490
+ %301 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
491
+ %.not7.i105 = icmp eq i32 %301, 0, !dbg !18
492
+ %302 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i104, float %285, float %278) #4, !dbg !18
493
+ %303 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i104, float %285, float %278) #4, !dbg !18
494
+ %.07.i106 = select i1 %.not7.i105, float %303, float %302, !dbg !18
495
+ %304 = fneg float %285, !dbg !18
496
+ %305 = select i1 %274, float %304, float %38, !dbg !18
497
+ %306 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
498
+ %.not8.i107 = icmp eq i32 %306, 0, !dbg !18
499
+ %307 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i106, float %305, float %305) #4, !dbg !18
500
+ %308 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i106, float %305, float %305) #4, !dbg !18
501
+ %.08.i108 = select i1 %.not8.i107, float %308, float %307, !dbg !18
502
+ br i1 %274, label %309, label %__nv_erff.exit113, !dbg !18
503
+
504
+ 309: ; preds = %__internal_fmad.exit.i94
505
+ %310 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i108) #4, !dbg !18
506
+ %311 = fsub float 1.000000e+00, %310, !dbg !18
507
+ %312 = bitcast float %311 to i32, !dbg !18
508
+ %313 = bitcast float %38 to i32, !dbg !18
509
+ %314 = and i32 %313, -2147483648, !dbg !18
510
+ %315 = or i32 %314, %312, !dbg !18
511
+ %316 = bitcast i32 %315 to float, !dbg !18
512
+ br label %__nv_erff.exit113, !dbg !18
513
+
514
+ __nv_erff.exit113: ; preds = %__internal_fmad.exit.i94, %309
515
+ %r.0.i109 = phi float [ %316, %309 ], [ %.08.i108, %__internal_fmad.exit.i94 ], !dbg !18
516
+ %317 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
517
+ %.not.i114 = icmp eq i32 %317, 0, !dbg !18
518
+ %318 = tail call float @llvm.nvvm.fabs.ftz.f(float %39) #4, !dbg !18
519
+ %319 = tail call float @llvm.nvvm.fabs.f(float %39) #4, !dbg !18
520
+ %.0.i115 = select i1 %.not.i114, float %319, float %318, !dbg !18
521
+ %320 = fcmp oge float %.0.i115, 0x3FF00C1FC0000000, !dbg !18
522
+ br i1 %320, label %__nv_fabsf.exit1.i132, label %322, !dbg !18
523
+
524
+ __nv_fabsf.exit1.i132: ; preds = %__nv_erff.exit113
525
+ %321 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
526
+ %.not1.i133 = icmp eq i32 %321, 0, !dbg !18
527
+ %.01.i134 = select i1 %.not1.i133, float %319, float %318, !dbg !18
528
+ br label %__internal_fmad.exit.i116, !dbg !18
529
+
530
+ 322: ; preds = %__nv_erff.exit113
531
+ %323 = fmul float %39, %39, !dbg !18
532
+ br label %__internal_fmad.exit.i116, !dbg !18
533
+
534
+ __internal_fmad.exit.i116: ; preds = %322, %__nv_fabsf.exit1.i132
535
+ %324 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i132 ], [ 0x3FC06EBA60000000, %322 ], !dbg !18
536
+ %325 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i132 ], [ 0xBFD8127580000000, %322 ], !dbg !18
537
+ %326 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i132 ], [ 0x3FBCE315E0000000, %322 ], !dbg !18
538
+ %327 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i132 ], [ 0xBF9B837CE0000000, %322 ], !dbg !18
539
+ %328 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i132 ], [ 0x3F755ABD40000000, %322 ], !dbg !18
540
+ %329 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i132 ], [ 0xBF4AE9A400000000, %322 ], !dbg !18
541
+ %330 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i132 ], [ 0x3F163D2D40000000, %322 ], !dbg !18
542
+ %331 = phi float [ %.01.i134, %__nv_fabsf.exit1.i132 ], [ %323, %322 ], !dbg !18
543
+ %332 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
544
+ %.not2.i117 = icmp eq i32 %332, 0, !dbg !18
545
+ %333 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %330, float %331, float %329) #4, !dbg !18
546
+ %334 = tail call float @llvm.nvvm.fma.rn.f(float %330, float %331, float %329) #4, !dbg !18
547
+ %.02.i118 = select i1 %.not2.i117, float %334, float %333, !dbg !18
548
+ %335 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
549
+ %.not3.i119 = icmp eq i32 %335, 0, !dbg !18
550
+ %336 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i118, float %331, float %328) #4, !dbg !18
551
+ %337 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i118, float %331, float %328) #4, !dbg !18
552
+ %.03.i120 = select i1 %.not3.i119, float %337, float %336, !dbg !18
553
+ %338 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
554
+ %.not4.i121 = icmp eq i32 %338, 0, !dbg !18
555
+ %339 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i120, float %331, float %327) #4, !dbg !18
556
+ %340 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i120, float %331, float %327) #4, !dbg !18
557
+ %.04.i122 = select i1 %.not4.i121, float %340, float %339, !dbg !18
558
+ %341 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
559
+ %.not5.i123 = icmp eq i32 %341, 0, !dbg !18
560
+ %342 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i122, float %331, float %326) #4, !dbg !18
561
+ %343 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i122, float %331, float %326) #4, !dbg !18
562
+ %.05.i124 = select i1 %.not5.i123, float %343, float %342, !dbg !18
563
+ %344 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
564
+ %.not6.i125 = icmp eq i32 %344, 0, !dbg !18
565
+ %345 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i124, float %331, float %325) #4, !dbg !18
566
+ %346 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i124, float %331, float %325) #4, !dbg !18
567
+ %.06.i126 = select i1 %.not6.i125, float %346, float %345, !dbg !18
568
+ %347 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
569
+ %.not7.i127 = icmp eq i32 %347, 0, !dbg !18
570
+ %348 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i126, float %331, float %324) #4, !dbg !18
571
+ %349 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i126, float %331, float %324) #4, !dbg !18
572
+ %.07.i128 = select i1 %.not7.i127, float %349, float %348, !dbg !18
573
+ %350 = fneg float %331, !dbg !18
574
+ %351 = select i1 %320, float %350, float %39, !dbg !18
575
+ %352 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
576
+ %.not8.i129 = icmp eq i32 %352, 0, !dbg !18
577
+ %353 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i128, float %351, float %351) #4, !dbg !18
578
+ %354 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i128, float %351, float %351) #4, !dbg !18
579
+ %.08.i130 = select i1 %.not8.i129, float %354, float %353, !dbg !18
580
+ br i1 %320, label %355, label %__nv_erff.exit135, !dbg !18
581
+
582
+ 355: ; preds = %__internal_fmad.exit.i116
583
+ %356 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i130) #4, !dbg !18
584
+ %357 = fsub float 1.000000e+00, %356, !dbg !18
585
+ %358 = bitcast float %357 to i32, !dbg !18
586
+ %359 = bitcast float %39 to i32, !dbg !18
587
+ %360 = and i32 %359, -2147483648, !dbg !18
588
+ %361 = or i32 %360, %358, !dbg !18
589
+ %362 = bitcast i32 %361 to float, !dbg !18
590
+ br label %__nv_erff.exit135, !dbg !18
591
+
592
+ __nv_erff.exit135: ; preds = %__internal_fmad.exit.i116, %355
593
+ %r.0.i131 = phi float [ %362, %355 ], [ %.08.i130, %__internal_fmad.exit.i116 ], !dbg !18
594
+ %363 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
595
+ %.not.i136 = icmp eq i32 %363, 0, !dbg !18
596
+ %364 = tail call float @llvm.nvvm.fabs.ftz.f(float %40) #4, !dbg !18
597
+ %365 = tail call float @llvm.nvvm.fabs.f(float %40) #4, !dbg !18
598
+ %.0.i137 = select i1 %.not.i136, float %365, float %364, !dbg !18
599
+ %366 = fcmp oge float %.0.i137, 0x3FF00C1FC0000000, !dbg !18
600
+ br i1 %366, label %__nv_fabsf.exit1.i154, label %368, !dbg !18
601
+
602
+ __nv_fabsf.exit1.i154: ; preds = %__nv_erff.exit135
603
+ %367 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
604
+ %.not1.i155 = icmp eq i32 %367, 0, !dbg !18
605
+ %.01.i156 = select i1 %.not1.i155, float %365, float %364, !dbg !18
606
+ br label %__internal_fmad.exit.i138, !dbg !18
607
+
608
+ 368: ; preds = %__nv_erff.exit135
609
+ %369 = fmul float %40, %40, !dbg !18
610
+ br label %__internal_fmad.exit.i138, !dbg !18
611
+
612
+ __internal_fmad.exit.i138: ; preds = %368, %__nv_fabsf.exit1.i154
613
+ %370 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i154 ], [ 0x3FC06EBA60000000, %368 ], !dbg !18
614
+ %371 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i154 ], [ 0xBFD8127580000000, %368 ], !dbg !18
615
+ %372 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i154 ], [ 0x3FBCE315E0000000, %368 ], !dbg !18
616
+ %373 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i154 ], [ 0xBF9B837CE0000000, %368 ], !dbg !18
617
+ %374 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i154 ], [ 0x3F755ABD40000000, %368 ], !dbg !18
618
+ %375 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i154 ], [ 0xBF4AE9A400000000, %368 ], !dbg !18
619
+ %376 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i154 ], [ 0x3F163D2D40000000, %368 ], !dbg !18
620
+ %377 = phi float [ %.01.i156, %__nv_fabsf.exit1.i154 ], [ %369, %368 ], !dbg !18
621
+ %378 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
622
+ %.not2.i139 = icmp eq i32 %378, 0, !dbg !18
623
+ %379 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %376, float %377, float %375) #4, !dbg !18
624
+ %380 = tail call float @llvm.nvvm.fma.rn.f(float %376, float %377, float %375) #4, !dbg !18
625
+ %.02.i140 = select i1 %.not2.i139, float %380, float %379, !dbg !18
626
+ %381 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
627
+ %.not3.i141 = icmp eq i32 %381, 0, !dbg !18
628
+ %382 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i140, float %377, float %374) #4, !dbg !18
629
+ %383 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i140, float %377, float %374) #4, !dbg !18
630
+ %.03.i142 = select i1 %.not3.i141, float %383, float %382, !dbg !18
631
+ %384 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
632
+ %.not4.i143 = icmp eq i32 %384, 0, !dbg !18
633
+ %385 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i142, float %377, float %373) #4, !dbg !18
634
+ %386 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i142, float %377, float %373) #4, !dbg !18
635
+ %.04.i144 = select i1 %.not4.i143, float %386, float %385, !dbg !18
636
+ %387 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
637
+ %.not5.i145 = icmp eq i32 %387, 0, !dbg !18
638
+ %388 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i144, float %377, float %372) #4, !dbg !18
639
+ %389 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i144, float %377, float %372) #4, !dbg !18
640
+ %.05.i146 = select i1 %.not5.i145, float %389, float %388, !dbg !18
641
+ %390 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
642
+ %.not6.i147 = icmp eq i32 %390, 0, !dbg !18
643
+ %391 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i146, float %377, float %371) #4, !dbg !18
644
+ %392 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i146, float %377, float %371) #4, !dbg !18
645
+ %.06.i148 = select i1 %.not6.i147, float %392, float %391, !dbg !18
646
+ %393 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
647
+ %.not7.i149 = icmp eq i32 %393, 0, !dbg !18
648
+ %394 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i148, float %377, float %370) #4, !dbg !18
649
+ %395 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i148, float %377, float %370) #4, !dbg !18
650
+ %.07.i150 = select i1 %.not7.i149, float %395, float %394, !dbg !18
651
+ %396 = fneg float %377, !dbg !18
652
+ %397 = select i1 %366, float %396, float %40, !dbg !18
653
+ %398 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
654
+ %.not8.i151 = icmp eq i32 %398, 0, !dbg !18
655
+ %399 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i150, float %397, float %397) #4, !dbg !18
656
+ %400 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i150, float %397, float %397) #4, !dbg !18
657
+ %.08.i152 = select i1 %.not8.i151, float %400, float %399, !dbg !18
658
+ br i1 %366, label %401, label %__nv_erff.exit157, !dbg !18
659
+
660
+ 401: ; preds = %__internal_fmad.exit.i138
661
+ %402 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i152) #4, !dbg !18
662
+ %403 = fsub float 1.000000e+00, %402, !dbg !18
663
+ %404 = bitcast float %403 to i32, !dbg !18
664
+ %405 = bitcast float %40 to i32, !dbg !18
665
+ %406 = and i32 %405, -2147483648, !dbg !18
666
+ %407 = or i32 %406, %404, !dbg !18
667
+ %408 = bitcast i32 %407 to float, !dbg !18
668
+ br label %__nv_erff.exit157, !dbg !18
669
+
670
+ __nv_erff.exit157: ; preds = %__internal_fmad.exit.i138, %401
671
+ %r.0.i153 = phi float [ %408, %401 ], [ %.08.i152, %__internal_fmad.exit.i138 ], !dbg !18
672
+ %409 = fmul float %32, 5.000000e-01, !dbg !19
673
+ %410 = fmul float %31, 5.000000e-01, !dbg !19
674
+ %411 = fmul float %30, 5.000000e-01, !dbg !19
675
+ %412 = fmul float %29, 5.000000e-01, !dbg !19
676
+ %413 = fmul float %28, 5.000000e-01, !dbg !19
677
+ %414 = fmul float %27, 5.000000e-01, !dbg !19
678
+ %415 = fmul float %26, 5.000000e-01, !dbg !19
679
+ %416 = fmul float %25, 5.000000e-01, !dbg !19
680
+ %417 = fadd float %r.0.i, 1.000000e+00, !dbg !20
681
+ %418 = fadd float %r.0.i21, 1.000000e+00, !dbg !20
682
+ %419 = fadd float %r.0.i43, 1.000000e+00, !dbg !20
683
+ %420 = fadd float %r.0.i65, 1.000000e+00, !dbg !20
684
+ %421 = fadd float %r.0.i87, 1.000000e+00, !dbg !20
685
+ %422 = fadd float %r.0.i109, 1.000000e+00, !dbg !20
686
+ %423 = fadd float %r.0.i131, 1.000000e+00, !dbg !20
687
+ %424 = fadd float %r.0.i153, 1.000000e+00, !dbg !20
688
+ %425 = fmul float %416, %417, !dbg !21
689
+ %426 = fmul float %415, %418, !dbg !21
690
+ %427 = fmul float %414, %419, !dbg !21
691
+ %428 = fmul float %413, %420, !dbg !21
692
+ %429 = fmul float %412, %421, !dbg !21
693
+ %430 = fmul float %411, %422, !dbg !21
694
+ %431 = fmul float %410, %423, !dbg !21
695
+ %432 = fmul float %409, %424, !dbg !21
696
+ %433 = getelementptr i16, ptr addrspace(1) %1, i64 %10, !dbg !22
697
+ %434 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %425) #4, !dbg !23
698
+ %435 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %426) #4, !dbg !23
699
+ %436 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %427) #4, !dbg !23
700
+ %437 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %428) #4, !dbg !23
701
+ %438 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %429) #4, !dbg !23
702
+ %439 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %430) #4, !dbg !23
703
+ %440 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %431) #4, !dbg !23
704
+ %441 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %432) #4, !dbg !23
705
+ %442 = insertelement <2 x i16> undef, i16 %434, i64 0, !dbg !23
706
+ %443 = insertelement <2 x i16> %442, i16 %435, i64 1, !dbg !23
707
+ %444 = bitcast <2 x i16> %443 to i32, !dbg !23
708
+ %445 = insertelement <2 x i16> undef, i16 %436, i64 0, !dbg !23
709
+ %446 = insertelement <2 x i16> %445, i16 %437, i64 1, !dbg !23
710
+ %447 = bitcast <2 x i16> %446 to i32, !dbg !23
711
+ %448 = insertelement <2 x i16> undef, i16 %438, i64 0, !dbg !23
712
+ %449 = insertelement <2 x i16> %448, i16 %439, i64 1, !dbg !23
713
+ %450 = bitcast <2 x i16> %449 to i32, !dbg !23
714
+ %451 = insertelement <2 x i16> undef, i16 %440, i64 0, !dbg !23
715
+ %452 = insertelement <2 x i16> %451, i16 %441, i64 1, !dbg !23
716
+ %453 = bitcast <2 x i16> %452 to i32, !dbg !23
717
+ tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %444, i32 %447, i32 %450, i32 %453, ptr addrspace(1) %433, i1 true) #4, !dbg !23
718
+ ret void, !dbg !24
719
+ }
720
+
721
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
722
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
723
+
724
+ ; Function Attrs: alwaysinline nounwind
725
+ define float @__nv_erff(float %a) local_unnamed_addr #1 {
726
+ __nv_fabsf.exit:
727
+ %0 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
728
+ %.not = icmp eq i32 %0, 0
729
+ %1 = tail call float @llvm.nvvm.fabs.ftz.f(float %a) #4
730
+ %2 = tail call float @llvm.nvvm.fabs.f(float %a) #4
731
+ %.0 = select i1 %.not, float %2, float %1
732
+ %3 = fcmp oge float %.0, 0x3FF00C1FC0000000
733
+ br i1 %3, label %__nv_fabsf.exit1, label %5
734
+
735
+ __nv_fabsf.exit1: ; preds = %__nv_fabsf.exit
736
+ %4 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
737
+ %.not1 = icmp eq i32 %4, 0
738
+ %.01 = select i1 %.not1, float %2, float %1
739
+ br label %__internal_fmad.exit
740
+
741
+ 5: ; preds = %__nv_fabsf.exit
742
+ %6 = fmul float %a, %a
743
+ br label %__internal_fmad.exit
744
+
745
+ __internal_fmad.exit: ; preds = %5, %__nv_fabsf.exit1
746
+ %7 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1 ], [ 0x3FC06EBA60000000, %5 ]
747
+ %8 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1 ], [ 0xBFD8127580000000, %5 ]
748
+ %9 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1 ], [ 0x3FBCE315E0000000, %5 ]
749
+ %10 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1 ], [ 0xBF9B837CE0000000, %5 ]
750
+ %11 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1 ], [ 0x3F755ABD40000000, %5 ]
751
+ %12 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1 ], [ 0xBF4AE9A400000000, %5 ]
752
+ %13 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1 ], [ 0x3F163D2D40000000, %5 ]
753
+ %14 = phi float [ %.01, %__nv_fabsf.exit1 ], [ %6, %5 ]
754
+ %15 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
755
+ %.not2 = icmp eq i32 %15, 0
756
+ %16 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %13, float %14, float %12) #4
757
+ %17 = tail call float @llvm.nvvm.fma.rn.f(float %13, float %14, float %12) #4
758
+ %.02 = select i1 %.not2, float %17, float %16
759
+ %18 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
760
+ %.not3 = icmp eq i32 %18, 0
761
+ %19 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02, float %14, float %11) #4
762
+ %20 = tail call float @llvm.nvvm.fma.rn.f(float %.02, float %14, float %11) #4
763
+ %.03 = select i1 %.not3, float %20, float %19
764
+ %21 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
765
+ %.not4 = icmp eq i32 %21, 0
766
+ %22 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03, float %14, float %10) #4
767
+ %23 = tail call float @llvm.nvvm.fma.rn.f(float %.03, float %14, float %10) #4
768
+ %.04 = select i1 %.not4, float %23, float %22
769
+ %24 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
770
+ %.not5 = icmp eq i32 %24, 0
771
+ %25 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04, float %14, float %9) #4
772
+ %26 = tail call float @llvm.nvvm.fma.rn.f(float %.04, float %14, float %9) #4
773
+ %.05 = select i1 %.not5, float %26, float %25
774
+ %27 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
775
+ %.not6 = icmp eq i32 %27, 0
776
+ %28 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05, float %14, float %8) #4
777
+ %29 = tail call float @llvm.nvvm.fma.rn.f(float %.05, float %14, float %8) #4
778
+ %.06 = select i1 %.not6, float %29, float %28
779
+ %30 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
780
+ %.not7 = icmp eq i32 %30, 0
781
+ %31 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06, float %14, float %7) #4
782
+ %32 = tail call float @llvm.nvvm.fma.rn.f(float %.06, float %14, float %7) #4
783
+ %.07 = select i1 %.not7, float %32, float %31
784
+ %33 = fneg float %14
785
+ %34 = select i1 %3, float %33, float %a
786
+ %35 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
787
+ %.not8 = icmp eq i32 %35, 0
788
+ %36 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07, float %34, float %34) #4
789
+ %37 = tail call float @llvm.nvvm.fma.rn.f(float %.07, float %34, float %34) #4
790
+ %.08 = select i1 %.not8, float %37, float %36
791
+ br i1 %3, label %38, label %46
792
+
793
+ 38: ; preds = %__internal_fmad.exit
794
+ %39 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08) #4
795
+ %40 = fsub float 1.000000e+00, %39
796
+ %41 = bitcast float %40 to i32
797
+ %42 = bitcast float %a to i32
798
+ %43 = and i32 %42, -2147483648
799
+ %44 = or i32 %43, %41
800
+ %45 = bitcast i32 %44 to float
801
+ br label %46
802
+
803
+ 46: ; preds = %38, %__internal_fmad.exit
804
+ %r.0 = phi float [ %45, %38 ], [ %.08, %__internal_fmad.exit ]
805
+ ret float %r.0
806
+ }
807
+
808
+ declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #2
809
+
810
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
811
+ declare float @llvm.nvvm.fabs.ftz.f(float) #0
812
+
813
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
814
+ declare float @llvm.nvvm.fabs.f(float) #0
815
+
816
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
817
+ declare float @llvm.nvvm.fma.rn.ftz.f(float, float, float) #0
818
+
819
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
820
+ declare float @llvm.nvvm.fma.rn.f(float, float, float) #0
821
+
822
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
823
+ declare float @llvm.nvvm.ex2.approx.ftz.f(float) #3
824
+
825
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
826
+ attributes #1 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
827
+ attributes #2 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
828
+ attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
829
+ attributes #4 = { nounwind }
830
+
831
+ !llvm.module.flags = !{!0, !1}
832
+ !llvm.dbg.cu = !{!2}
833
+ !nvvm.annotations = !{!4, !5, !5, !4}
834
+ !llvm.ident = !{!6}
835
+
836
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
837
+ !1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
838
+ !2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
839
+ !3 = !DIFile(filename: "cjfoqo3nutni5cmtw4brla34cz45fusadehkxfkr2fie2qgo7vwt.py", directory: "/tmp/torchinductor_root/jf")
840
+ !4 = !{ptr @triton__0d1d2de, !"kernel", i32 1}
841
+ !5 = !{ptr @triton__0d1d2de, !"maxntidx", i32 128}
842
+ !6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
843
+ !7 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
844
+ !8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
845
+ !9 = !{}
846
+ !10 = !DILocation(line: 21, column: 36, scope: !7)
847
+ !11 = !DILocation(line: 20, column: 28, scope: !7)
848
+ !12 = !DILocation(line: 20, column: 33, scope: !7)
849
+ !13 = !DILocation(line: 21, column: 23, scope: !7)
850
+ !14 = !DILocation(line: 24, column: 30, scope: !7)
851
+ !15 = !DILocation(line: 24, column: 35, scope: !7)
852
+ !16 = !DILocation(line: 24, column: 44, scope: !7)
853
+ !17 = !DILocation(line: 29, column: 18, scope: !7)
854
+ !18 = !DILocation(line: 30, column: 23, scope: !7)
855
+ !19 = !DILocation(line: 27, column: 18, scope: !7)
856
+ !20 = !DILocation(line: 32, column: 18, scope: !7)
857
+ !21 = !DILocation(line: 33, column: 18, scope: !7)
858
+ !22 = !DILocation(line: 35, column: 25, scope: !7)
859
+ !23 = !DILocation(line: 35, column: 37, scope: !7)
860
+ !24 = !DILocation(line: 35, column: 4, scope: !7)
.triton/dump/44b225411009956bfbae22f8bac7d703/triton_.cubin ADDED
Binary file (13.9 kB). View file
 
.triton/dump/44b225411009956bfbae22f8bac7d703/triton_.llir ADDED
@@ -0,0 +1,304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @global_smem = external addrspace(3) global [0 x i8]
5
+ @.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
6
+
7
+ define void @triton__0d1d2d3d4d5de6de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i32 %5, i32 %6) local_unnamed_addr !dbg !7 {
8
+ %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
9
+ %9 = and i32 %8, 31, !dbg !10
10
+ %10 = lshr i32 %8, 5, !dbg !10
11
+ %11 = and i32 %10, 1, !dbg !10
12
+ %urem = shl i32 %8, 2, !dbg !10
13
+ %12 = and i32 %urem, 252, !dbg !10
14
+ %13 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !11
15
+ %14 = shl i32 %13, 8, !dbg !12
16
+ %15 = or i32 %14, %12, !dbg !13
17
+ %16 = sext i32 %15 to i64, !dbg !14
18
+ %17 = getelementptr float, ptr addrspace(1) %0, i64 %16, !dbg !14
19
+ %18 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %17, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !15
20
+ %19 = extractvalue { i32, i32, i32, i32 } %18, 0, !dbg !15
21
+ %20 = extractvalue { i32, i32, i32, i32 } %18, 1, !dbg !15
22
+ %21 = extractvalue { i32, i32, i32, i32 } %18, 2, !dbg !15
23
+ %22 = extractvalue { i32, i32, i32, i32 } %18, 3, !dbg !15
24
+ %23 = bitcast i32 %21 to float, !dbg !15
25
+ %24 = bitcast i32 %22 to float, !dbg !15
26
+ %25 = getelementptr i16, ptr addrspace(1) %1, i64 %16, !dbg !16
27
+ %26 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %25, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !17
28
+ %27 = extractvalue { i32, i32 } %26, 0, !dbg !17
29
+ %28 = extractvalue { i32, i32 } %26, 1, !dbg !17
30
+ %29 = trunc i32 %27 to i16, !dbg !17
31
+ %extelt.offset = lshr i32 %27, 16, !dbg !17
32
+ %30 = trunc i32 %extelt.offset to i16, !dbg !17
33
+ %31 = trunc i32 %28 to i16, !dbg !17
34
+ %extelt.offset1 = lshr i32 %28, 16, !dbg !17
35
+ %32 = trunc i32 %extelt.offset1 to i16, !dbg !17
36
+ %33 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %29) #6, !dbg !18
37
+ %34 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %30) #6, !dbg !18
38
+ %35 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %31) #6, !dbg !18
39
+ %36 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %32) #6, !dbg !18
40
+ %37 = getelementptr i16, ptr addrspace(1) %2, i64 %16, !dbg !19
41
+ %38 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %37, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !20
42
+ %39 = extractvalue { i32, i32 } %38, 0, !dbg !20
43
+ %40 = extractvalue { i32, i32 } %38, 1, !dbg !20
44
+ %41 = trunc i32 %39 to i16, !dbg !20
45
+ %extelt.offset2 = lshr i32 %39, 16, !dbg !20
46
+ %42 = trunc i32 %extelt.offset2 to i16, !dbg !20
47
+ %43 = trunc i32 %40 to i16, !dbg !20
48
+ %extelt.offset3 = lshr i32 %40, 16, !dbg !20
49
+ %44 = trunc i32 %extelt.offset3 to i16, !dbg !20
50
+ %45 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %41) #6, !dbg !21
51
+ %46 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %42) #6, !dbg !21
52
+ %47 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %43) #6, !dbg !21
53
+ %48 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %44) #6, !dbg !21
54
+ %49 = zext nneg i32 %12 to i64, !dbg !22
55
+ %50 = getelementptr float, ptr addrspace(1) %3, i64 %49, !dbg !22
56
+ %51 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %50, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !23
57
+ %52 = fadd float %35, %23, !dbg !24
58
+ %53 = fadd float %36, %24, !dbg !24
59
+ %54 = insertelement <2 x i32> poison, i32 %19, i64 0, !dbg !15
60
+ %55 = insertelement <2 x i32> %54, i32 %20, i64 1, !dbg !15
61
+ %56 = bitcast <2 x i32> %55 to <2 x float>, !dbg !15
62
+ %57 = insertelement <2 x float> poison, float %33, i64 0, !dbg !24
63
+ %58 = insertelement <2 x float> %57, float %34, i64 1, !dbg !24
64
+ %59 = fadd <2 x float> %58, %56, !dbg !24
65
+ %60 = insertelement <2 x float> poison, float %45, i64 0, !dbg !25
66
+ %61 = insertelement <2 x float> %60, float %46, i64 1, !dbg !25
67
+ %62 = fadd <2 x float> %59, %61, !dbg !25
68
+ %63 = fadd float %52, %47, !dbg !25
69
+ %64 = fadd float %53, %48, !dbg !25
70
+ %65 = extractelement <2 x float> %62, i64 0, !dbg !26
71
+ %66 = extractelement <2 x float> %62, i64 1, !dbg !26
72
+ %67 = fadd float %65, %66, !dbg !26
73
+ %68 = fadd float %67, %63, !dbg !26
74
+ %69 = fadd float %68, %64, !dbg !26
75
+ %70 = bitcast float %69 to i32, !dbg !32
76
+ %71 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %70, i32 16, i32 31), !dbg !32
77
+ %72 = bitcast i32 %71 to float, !dbg !32
78
+ %73 = fadd float %69, %72, !dbg !26
79
+ %74 = bitcast float %73 to i32, !dbg !32
80
+ %75 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %74, i32 8, i32 31), !dbg !32
81
+ %76 = bitcast i32 %75 to float, !dbg !32
82
+ %77 = fadd float %73, %76, !dbg !26
83
+ %78 = bitcast float %77 to i32, !dbg !32
84
+ %79 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %78, i32 4, i32 31), !dbg !32
85
+ %80 = bitcast i32 %79 to float, !dbg !32
86
+ %81 = fadd float %77, %80, !dbg !26
87
+ %82 = bitcast float %81 to i32, !dbg !32
88
+ %83 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %82, i32 2, i32 31), !dbg !32
89
+ %84 = bitcast i32 %83 to float, !dbg !32
90
+ %85 = fadd float %81, %84, !dbg !26
91
+ %86 = bitcast float %85 to i32, !dbg !32
92
+ %87 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %86, i32 1, i32 31), !dbg !32
93
+ %88 = bitcast i32 %87 to float, !dbg !32
94
+ %89 = fadd float %85, %88, !dbg !26
95
+ %90 = icmp eq i32 %9, 0, !dbg !32
96
+ %91 = zext nneg i32 %11 to i64, !dbg !32
97
+ %92 = getelementptr float, ptr addrspace(3) @global_smem, i64 %91, !dbg !32
98
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %92, float %89, i1 %90) #6, !dbg !32
99
+ tail call void @llvm.nvvm.barrier0(), !dbg !32
100
+ %93 = icmp slt i32 %8, 2, !dbg !32
101
+ %94 = sext i32 %8 to i64, !dbg !32
102
+ %95 = getelementptr float, ptr addrspace(3) @global_smem, i64 %94, !dbg !32
103
+ %96 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %95, i1 %93) #6, !dbg !32
104
+ %97 = bitcast float %96 to i32, !dbg !32
105
+ %98 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %97, i32 1, i32 31), !dbg !32
106
+ %99 = bitcast i32 %98 to float, !dbg !32
107
+ %100 = fadd float %96, %99, !dbg !26
108
+ %101 = and i32 %8, 1, !dbg !32
109
+ %102 = icmp eq i32 %101, 0, !dbg !32
110
+ %103 = and i1 %93, %102, !dbg !32
111
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %95, float %100, i1 %103) #6, !dbg !32
112
+ tail call void @llvm.nvvm.barrier0(), !dbg !32
113
+ %104 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !32
114
+ %105 = fadd float %104, 0.000000e+00, !dbg !34
115
+ %106 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %105, float 2.560000e+02) #6, !dbg !38
116
+ %107 = fsub float %65, %106, !dbg !39
117
+ %108 = fsub float %66, %106, !dbg !39
118
+ %109 = fsub float %63, %106, !dbg !39
119
+ %110 = fsub float %64, %106, !dbg !39
120
+ %111 = fmul float %107, %107, !dbg !40
121
+ %112 = fmul float %108, %108, !dbg !40
122
+ %113 = fmul float %109, %109, !dbg !40
123
+ %114 = fmul float %110, %110, !dbg !40
124
+ tail call void @llvm.nvvm.barrier0(), !dbg !41
125
+ %115 = fadd float %111, %112, !dbg !43
126
+ %116 = fadd float %113, %115, !dbg !43
127
+ %117 = fadd float %114, %116, !dbg !43
128
+ %118 = bitcast float %117 to i32, !dbg !41
129
+ %119 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %118, i32 16, i32 31), !dbg !41
130
+ %120 = bitcast i32 %119 to float, !dbg !41
131
+ %121 = fadd float %117, %120, !dbg !43
132
+ %122 = bitcast float %121 to i32, !dbg !41
133
+ %123 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %122, i32 8, i32 31), !dbg !41
134
+ %124 = bitcast i32 %123 to float, !dbg !41
135
+ %125 = fadd float %121, %124, !dbg !43
136
+ %126 = bitcast float %125 to i32, !dbg !41
137
+ %127 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %126, i32 4, i32 31), !dbg !41
138
+ %128 = bitcast i32 %127 to float, !dbg !41
139
+ %129 = fadd float %125, %128, !dbg !43
140
+ %130 = bitcast float %129 to i32, !dbg !41
141
+ %131 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %130, i32 2, i32 31), !dbg !41
142
+ %132 = bitcast i32 %131 to float, !dbg !41
143
+ %133 = fadd float %129, %132, !dbg !43
144
+ %134 = bitcast float %133 to i32, !dbg !41
145
+ %135 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %134, i32 1, i32 31), !dbg !41
146
+ %136 = bitcast i32 %135 to float, !dbg !41
147
+ %137 = fadd float %133, %136, !dbg !43
148
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %92, float %137, i1 %90) #6, !dbg !41
149
+ tail call void @llvm.nvvm.barrier0(), !dbg !41
150
+ %138 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %95, i1 %93) #6, !dbg !41
151
+ %139 = bitcast float %138 to i32, !dbg !41
152
+ %140 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %139, i32 1, i32 31), !dbg !41
153
+ %141 = bitcast i32 %140 to float, !dbg !41
154
+ %142 = fadd float %138, %141, !dbg !43
155
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %95, float %142, i1 %103) #6, !dbg !41
156
+ tail call void @llvm.nvvm.barrier0(), !dbg !41
157
+ %143 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !41
158
+ %144 = fadd float %143, 0.000000e+00, !dbg !46
159
+ %145 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %144, float 2.560000e+02) #6, !dbg !48
160
+ %146 = fadd float %145, 0x3EE4F8B580000000, !dbg !49
161
+ %147 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !50
162
+ %.not.i = icmp eq i32 %147, 0, !dbg !50
163
+ br i1 %.not.i, label %150, label %148, !dbg !50
164
+
165
+ 148: ; preds = %7
166
+ %149 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %146), !dbg !50
167
+ br label %__nv_rsqrtf.exit, !dbg !50
168
+
169
+ 150: ; preds = %7
170
+ %151 = tail call float @llvm.nvvm.rsqrt.approx.f(float %146), !dbg !50
171
+ br label %__nv_rsqrtf.exit, !dbg !50
172
+
173
+ __nv_rsqrtf.exit: ; preds = %148, %150
174
+ %.0.i = phi float [ %149, %148 ], [ %151, %150 ], !dbg !50
175
+ %152 = extractvalue { i32, i32, i32, i32 } %51, 3, !dbg !23
176
+ %153 = bitcast i32 %152 to float, !dbg !23
177
+ %154 = extractvalue { i32, i32, i32, i32 } %51, 2, !dbg !23
178
+ %155 = bitcast i32 %154 to float, !dbg !23
179
+ %156 = extractvalue { i32, i32, i32, i32 } %51, 1, !dbg !23
180
+ %157 = bitcast i32 %156 to float, !dbg !23
181
+ %158 = extractvalue { i32, i32, i32, i32 } %51, 0, !dbg !23
182
+ %159 = bitcast i32 %158 to float, !dbg !23
183
+ %160 = fmul float %107, %.0.i, !dbg !51
184
+ %161 = fmul float %108, %.0.i, !dbg !51
185
+ %162 = fmul float %109, %.0.i, !dbg !51
186
+ %163 = fmul float %110, %.0.i, !dbg !51
187
+ %164 = fmul float %160, %159, !dbg !52
188
+ %165 = fmul float %161, %157, !dbg !52
189
+ %166 = fmul float %162, %155, !dbg !52
190
+ %167 = fmul float %163, %153, !dbg !52
191
+ %168 = getelementptr float, ptr addrspace(1) %4, i64 %16, !dbg !53
192
+ %169 = bitcast float %164 to i32, !dbg !54
193
+ %170 = bitcast float %165 to i32, !dbg !54
194
+ %171 = bitcast float %166 to i32, !dbg !54
195
+ %172 = bitcast float %167 to i32, !dbg !54
196
+ tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %169, i32 %170, i32 %171, i32 %172, ptr addrspace(1) %168, i1 true) #6, !dbg !54
197
+ ret void, !dbg !55
198
+ }
199
+
200
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
201
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
202
+
203
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
204
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
205
+
206
+ ; Function Attrs: convergent nocallback nounwind
207
+ declare void @llvm.nvvm.barrier0() #2
208
+
209
+ ; Function Attrs: alwaysinline nounwind
210
+ define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
211
+ %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
212
+ %.not = icmp eq i32 %1, 0
213
+ br i1 %.not, label %4, label %2
214
+
215
+ 2: ; preds = %0
216
+ %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
217
+ br label %6
218
+
219
+ 4: ; preds = %0
220
+ %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
221
+ br label %6
222
+
223
+ 6: ; preds = %4, %2
224
+ %.0 = phi float [ %3, %2 ], [ %5, %4 ]
225
+ ret float %.0
226
+ }
227
+
228
+ declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
229
+
230
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
231
+ declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
232
+
233
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
234
+ declare float @llvm.nvvm.rsqrt.approx.f(float) #5
235
+
236
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
237
+ attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
238
+ attributes #2 = { convergent nocallback nounwind }
239
+ attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
240
+ attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
241
+ attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
242
+ attributes #6 = { nounwind }
243
+
244
+ !llvm.module.flags = !{!0, !1}
245
+ !llvm.dbg.cu = !{!2}
246
+ !nvvm.annotations = !{!4, !5, !5, !4}
247
+ !llvm.ident = !{!6}
248
+
249
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
250
+ !1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
251
+ !2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
252
+ !3 = !DIFile(filename: "ctvr3xs46luhhbr7xomihgyropjaatss7yata4igaw6kvgwas7g2.py", directory: "/tmp/torchinductor_root/tv")
253
+ !4 = !{ptr @triton__0d1d2d3d4d5de6de, !"kernel", i32 1}
254
+ !5 = !{ptr @triton__0d1d2d3d4d5de6de, !"maxntidx", i32 64}
255
+ !6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
256
+ !7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5de6de", linkageName: "triton__0d1d2d3d4d5de6de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
257
+ !8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
258
+ !9 = !{}
259
+ !10 = !DILocation(line: 26, column: 26, scope: !7)
260
+ !11 = !DILocation(line: 23, column: 28, scope: !7)
261
+ !12 = !DILocation(line: 30, column: 40, scope: !7)
262
+ !13 = !DILocation(line: 30, column: 36, scope: !7)
263
+ !14 = !DILocation(line: 30, column: 30, scope: !7)
264
+ !15 = !DILocation(line: 30, column: 46, scope: !7)
265
+ !16 = !DILocation(line: 31, column: 30, scope: !7)
266
+ !17 = !DILocation(line: 31, column: 46, scope: !7)
267
+ !18 = !DILocation(line: 31, column: 67, scope: !7)
268
+ !19 = !DILocation(line: 32, column: 30, scope: !7)
269
+ !20 = !DILocation(line: 32, column: 46, scope: !7)
270
+ !21 = !DILocation(line: 32, column: 67, scope: !7)
271
+ !22 = !DILocation(line: 33, column: 31, scope: !7)
272
+ !23 = !DILocation(line: 33, column: 36, scope: !7)
273
+ !24 = !DILocation(line: 35, column: 18, scope: !7)
274
+ !25 = !DILocation(line: 37, column: 18, scope: !7)
275
+ !26 = !DILocation(line: 233, column: 15, scope: !27, inlinedAt: !30)
276
+ !27 = distinct !DILexicalBlockFile(scope: !29, file: !28, discriminator: 0)
277
+ !28 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
278
+ !29 = distinct !DILexicalBlockFile(scope: !7, file: !28, discriminator: 0)
279
+ !30 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !31)
280
+ !31 = !DILocation(line: 42, column: 59, scope: !27)
281
+ !32 = !DILocation(line: 243, column: 36, scope: !29, inlinedAt: !33)
282
+ !33 = !DILocation(line: 42, column: 59, scope: !29)
283
+ !34 = !DILocation(line: 8, column: 15, scope: !35, inlinedAt: !37)
284
+ !35 = distinct !DILexicalBlockFile(scope: !7, file: !36, discriminator: 0)
285
+ !36 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
286
+ !37 = !DILocation(line: 42, column: 45, scope: !35)
287
+ !38 = !DILocation(line: 45, column: 20, scope: !7)
288
+ !39 = !DILocation(line: 46, column: 19, scope: !7)
289
+ !40 = !DILocation(line: 47, column: 20, scope: !7)
290
+ !41 = !DILocation(line: 243, column: 36, scope: !29, inlinedAt: !42)
291
+ !42 = !DILocation(line: 50, column: 59, scope: !29)
292
+ !43 = !DILocation(line: 233, column: 15, scope: !27, inlinedAt: !44)
293
+ !44 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !45)
294
+ !45 = !DILocation(line: 50, column: 59, scope: !27)
295
+ !46 = !DILocation(line: 8, column: 15, scope: !35, inlinedAt: !47)
296
+ !47 = !DILocation(line: 50, column: 45, scope: !35)
297
+ !48 = !DILocation(line: 53, column: 20, scope: !7)
298
+ !49 = !DILocation(line: 55, column: 20, scope: !7)
299
+ !50 = !DILocation(line: 56, column: 26, scope: !7)
300
+ !51 = !DILocation(line: 57, column: 20, scope: !7)
301
+ !52 = !DILocation(line: 58, column: 20, scope: !7)
302
+ !53 = !DILocation(line: 59, column: 25, scope: !7)
303
+ !54 = !DILocation(line: 59, column: 48, scope: !7)
304
+ !55 = !DILocation(line: 59, column: 4, scope: !7)
.triton/dump/44b225411009956bfbae22f8bac7d703/triton_.ttir ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %c256_i32 = arith.constant 256 : i32
4
+ %cst = arith.constant dense<0.000000e+00> : tensor<256xbf16>
5
+ %cst_0 = arith.constant 0.000000e+00 : f32
6
+ %cst_1 = arith.constant 2.560000e+02 : f32
7
+ %cst_2 = arith.constant 9.99999974E-6 : f32
8
+ %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32>
9
+ %cst_4 = arith.constant dense<256> : tensor<256xi32>
10
+ %0 = tt.get_program_id x : i32
11
+ %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
12
+ %2 = arith.cmpi slt, %1, %cst_4 : tensor<256xi32>
13
+ %3 = arith.muli %0, %c256_i32 : i32
14
+ %4 = tt.splat %3 : (i32) -> tensor<256xi32>
15
+ %5 = arith.addi %1, %4 : tensor<256xi32>
16
+ %6 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
17
+ %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
18
+ %8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32>
19
+ %9 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
20
+ %10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
21
+ %11 = tt.load %10, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
22
+ %12 = arith.extf %11 : tensor<256xbf16> to tensor<256xf32>
23
+ %13 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
24
+ %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
25
+ %15 = tt.load %14, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
26
+ %16 = arith.extf %15 : tensor<256xbf16> to tensor<256xf32>
27
+ %17 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
28
+ %18 = tt.addptr %17, %1 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
29
+ %19 = tt.load %18, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32>
30
+ %20 = arith.addf %8, %12 : tensor<256xf32>
31
+ %21 = arith.addf %20, %16 : tensor<256xf32>
32
+ %22 = arith.select %2, %21, %cst_3 : tensor<256xi1>, tensor<256xf32>
33
+ %23 = "tt.reduce"(%22) <{axis = 0 : i32}> ({
34
+ ^bb0(%arg7: f32, %arg8: f32):
35
+ %40 = arith.addf %arg7, %arg8 : f32
36
+ tt.reduce.return %40 : f32
37
+ }) : (tensor<256xf32>) -> f32
38
+ %24 = arith.addf %23, %cst_0 : f32
39
+ %25 = arith.divf %24, %cst_1 : f32
40
+ %26 = tt.splat %25 : (f32) -> tensor<256xf32>
41
+ %27 = arith.subf %21, %26 : tensor<256xf32>
42
+ %28 = arith.mulf %27, %27 : tensor<256xf32>
43
+ %29 = arith.select %2, %28, %cst_3 : tensor<256xi1>, tensor<256xf32>
44
+ %30 = "tt.reduce"(%29) <{axis = 0 : i32}> ({
45
+ ^bb0(%arg7: f32, %arg8: f32):
46
+ %40 = arith.addf %arg7, %arg8 : f32
47
+ tt.reduce.return %40 : f32
48
+ }) : (tensor<256xf32>) -> f32
49
+ %31 = arith.addf %30, %cst_0 : f32
50
+ %32 = arith.divf %31, %cst_1 : f32
51
+ %33 = arith.addf %32, %cst_2 : f32
52
+ %34 = tt.extern_elementwise %33 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
53
+ %35 = tt.splat %34 : (f32) -> tensor<256xf32>
54
+ %36 = arith.mulf %27, %35 : tensor<256xf32>
55
+ %37 = arith.mulf %36, %19 : tensor<256xf32>
56
+ %38 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
57
+ %39 = tt.addptr %38, %5 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
58
+ tt.store %39, %37, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32>
59
+ tt.return
60
+ }
61
+ }
.triton/dump/4c6ad48573c74d55ed79384f6b432d50/triton_.ttir ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %c1024_i32 = arith.constant 1024 : i32
4
+ %0 = tt.get_program_id x : i32
5
+ %1 = arith.muli %0, %c1024_i32 : i32
6
+ %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32>
7
+ %3 = tt.splat %1 : (i32) -> tensor<1024xi32>
8
+ %4 = arith.addi %3, %2 : tensor<1024xi32>
9
+ %5 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>>
10
+ %6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr<f32, 1>>, tensor<1024xi32>
11
+ %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xf32>
12
+ %8 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>>
13
+ %9 = tt.addptr %8, %4 : tensor<1024x!tt.ptr<bf16, 1>>, tensor<1024xi32>
14
+ %10 = arith.truncf %7 : tensor<1024xf32> to tensor<1024xbf16>
15
+ tt.store %9, %10 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xbf16>
16
+ tt.return
17
+ }
18
+ }
.triton/dump/4d7b96448927b8146af43cb9f39e0544/triton_.llir ADDED
@@ -0,0 +1,362 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
5
+
6
+ define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !7 {
7
+ %4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
8
+ %5 = shl i32 %4, 1, !dbg !10
9
+ %6 = and i32 %5, 510, !dbg !10
10
+ %7 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #4, !dbg !11
11
+ %8 = shl i32 %7, 9, !dbg !12
12
+ %9 = or i32 %8, %6, !dbg !13
13
+ %10 = sext i32 %9 to i64, !dbg !14
14
+ %11 = getelementptr i16, ptr addrspace(1) %0, i64 %10, !dbg !14
15
+ %12 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %11, i1 true) #4, !dbg !15
16
+ %13 = trunc i32 %12 to i16, !dbg !15
17
+ %extelt.offset = lshr i32 %12, 16, !dbg !15
18
+ %14 = trunc i32 %extelt.offset to i16, !dbg !15
19
+ %15 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %13) #4, !dbg !16
20
+ %16 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %14) #4, !dbg !16
21
+ %17 = getelementptr i16, ptr addrspace(1) %1, i64 %10, !dbg !17
22
+ %18 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %17, i1 true) #4, !dbg !18
23
+ %19 = trunc i32 %18 to i16, !dbg !18
24
+ %extelt.offset1 = lshr i32 %18, 16, !dbg !18
25
+ %20 = trunc i32 %extelt.offset1 to i16, !dbg !18
26
+ %21 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %19) #4, !dbg !19
27
+ %22 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %20) #4, !dbg !19
28
+ %23 = fmul float %21, 0x3FE6A09E60000000, !dbg !20
29
+ %24 = fmul float %22, 0x3FE6A09E60000000, !dbg !20
30
+ %25 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
31
+ %.not.i = icmp eq i32 %25, 0, !dbg !21
32
+ %26 = tail call float @llvm.nvvm.fabs.ftz.f(float %23) #4, !dbg !21
33
+ %27 = tail call float @llvm.nvvm.fabs.f(float %23) #4, !dbg !21
34
+ %.0.i = select i1 %.not.i, float %27, float %26, !dbg !21
35
+ %28 = fcmp oge float %.0.i, 0x3FF00C1FC0000000, !dbg !21
36
+ br i1 %28, label %__nv_fabsf.exit1.i, label %30, !dbg !21
37
+
38
+ __nv_fabsf.exit1.i: ; preds = %3
39
+ %29 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
40
+ %.not1.i = icmp eq i32 %29, 0, !dbg !21
41
+ %.01.i = select i1 %.not1.i, float %27, float %26, !dbg !21
42
+ br label %__internal_fmad.exit.i, !dbg !21
43
+
44
+ 30: ; preds = %3
45
+ %31 = fmul float %23, %23, !dbg !21
46
+ br label %__internal_fmad.exit.i, !dbg !21
47
+
48
+ __internal_fmad.exit.i: ; preds = %30, %__nv_fabsf.exit1.i
49
+ %32 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i ], [ 0x3FC06EBA60000000, %30 ], !dbg !21
50
+ %33 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i ], [ 0xBFD8127580000000, %30 ], !dbg !21
51
+ %34 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i ], [ 0x3FBCE315E0000000, %30 ], !dbg !21
52
+ %35 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i ], [ 0xBF9B837CE0000000, %30 ], !dbg !21
53
+ %36 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i ], [ 0x3F755ABD40000000, %30 ], !dbg !21
54
+ %37 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i ], [ 0xBF4AE9A400000000, %30 ], !dbg !21
55
+ %38 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i ], [ 0x3F163D2D40000000, %30 ], !dbg !21
56
+ %39 = phi float [ %.01.i, %__nv_fabsf.exit1.i ], [ %31, %30 ], !dbg !21
57
+ %40 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
58
+ %.not2.i = icmp eq i32 %40, 0, !dbg !21
59
+ %41 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %38, float %39, float %37) #4, !dbg !21
60
+ %42 = tail call float @llvm.nvvm.fma.rn.f(float %38, float %39, float %37) #4, !dbg !21
61
+ %.02.i = select i1 %.not2.i, float %42, float %41, !dbg !21
62
+ %43 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
63
+ %.not3.i = icmp eq i32 %43, 0, !dbg !21
64
+ %44 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i, float %39, float %36) #4, !dbg !21
65
+ %45 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i, float %39, float %36) #4, !dbg !21
66
+ %.03.i = select i1 %.not3.i, float %45, float %44, !dbg !21
67
+ %46 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
68
+ %.not4.i = icmp eq i32 %46, 0, !dbg !21
69
+ %47 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i, float %39, float %35) #4, !dbg !21
70
+ %48 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i, float %39, float %35) #4, !dbg !21
71
+ %.04.i = select i1 %.not4.i, float %48, float %47, !dbg !21
72
+ %49 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
73
+ %.not5.i = icmp eq i32 %49, 0, !dbg !21
74
+ %50 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i, float %39, float %34) #4, !dbg !21
75
+ %51 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i, float %39, float %34) #4, !dbg !21
76
+ %.05.i = select i1 %.not5.i, float %51, float %50, !dbg !21
77
+ %52 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
78
+ %.not6.i = icmp eq i32 %52, 0, !dbg !21
79
+ %53 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i, float %39, float %33) #4, !dbg !21
80
+ %54 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i, float %39, float %33) #4, !dbg !21
81
+ %.06.i = select i1 %.not6.i, float %54, float %53, !dbg !21
82
+ %55 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
83
+ %.not7.i = icmp eq i32 %55, 0, !dbg !21
84
+ %56 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i, float %39, float %32) #4, !dbg !21
85
+ %57 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i, float %39, float %32) #4, !dbg !21
86
+ %.07.i = select i1 %.not7.i, float %57, float %56, !dbg !21
87
+ %58 = fneg float %39, !dbg !21
88
+ %59 = select i1 %28, float %58, float %23, !dbg !21
89
+ %60 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
90
+ %.not8.i = icmp eq i32 %60, 0, !dbg !21
91
+ %61 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i, float %59, float %59) #4, !dbg !21
92
+ %62 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i, float %59, float %59) #4, !dbg !21
93
+ %.08.i = select i1 %.not8.i, float %62, float %61, !dbg !21
94
+ br i1 %28, label %63, label %__nv_erff.exit, !dbg !21
95
+
96
+ 63: ; preds = %__internal_fmad.exit.i
97
+ %64 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i) #4, !dbg !21
98
+ %65 = fsub float 1.000000e+00, %64, !dbg !21
99
+ %66 = bitcast float %65 to i32, !dbg !21
100
+ %67 = bitcast float %23 to i32, !dbg !21
101
+ %68 = and i32 %67, -2147483648, !dbg !21
102
+ %69 = or i32 %68, %66, !dbg !21
103
+ %70 = bitcast i32 %69 to float, !dbg !21
104
+ br label %__nv_erff.exit, !dbg !21
105
+
106
+ __nv_erff.exit: ; preds = %__internal_fmad.exit.i, %63
107
+ %r.0.i = phi float [ %70, %63 ], [ %.08.i, %__internal_fmad.exit.i ], !dbg !21
108
+ %71 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
109
+ %.not.i2 = icmp eq i32 %71, 0, !dbg !21
110
+ %72 = tail call float @llvm.nvvm.fabs.ftz.f(float %24) #4, !dbg !21
111
+ %73 = tail call float @llvm.nvvm.fabs.f(float %24) #4, !dbg !21
112
+ %.0.i3 = select i1 %.not.i2, float %73, float %72, !dbg !21
113
+ %74 = fcmp oge float %.0.i3, 0x3FF00C1FC0000000, !dbg !21
114
+ br i1 %74, label %__nv_fabsf.exit1.i20, label %76, !dbg !21
115
+
116
+ __nv_fabsf.exit1.i20: ; preds = %__nv_erff.exit
117
+ %75 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
118
+ %.not1.i21 = icmp eq i32 %75, 0, !dbg !21
119
+ %.01.i22 = select i1 %.not1.i21, float %73, float %72, !dbg !21
120
+ br label %__internal_fmad.exit.i4, !dbg !21
121
+
122
+ 76: ; preds = %__nv_erff.exit
123
+ %77 = fmul float %24, %24, !dbg !21
124
+ br label %__internal_fmad.exit.i4, !dbg !21
125
+
126
+ __internal_fmad.exit.i4: ; preds = %76, %__nv_fabsf.exit1.i20
127
+ %78 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i20 ], [ 0x3FC06EBA60000000, %76 ], !dbg !21
128
+ %79 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i20 ], [ 0xBFD8127580000000, %76 ], !dbg !21
129
+ %80 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i20 ], [ 0x3FBCE315E0000000, %76 ], !dbg !21
130
+ %81 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i20 ], [ 0xBF9B837CE0000000, %76 ], !dbg !21
131
+ %82 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i20 ], [ 0x3F755ABD40000000, %76 ], !dbg !21
132
+ %83 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i20 ], [ 0xBF4AE9A400000000, %76 ], !dbg !21
133
+ %84 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i20 ], [ 0x3F163D2D40000000, %76 ], !dbg !21
134
+ %85 = phi float [ %.01.i22, %__nv_fabsf.exit1.i20 ], [ %77, %76 ], !dbg !21
135
+ %86 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
136
+ %.not2.i5 = icmp eq i32 %86, 0, !dbg !21
137
+ %87 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %84, float %85, float %83) #4, !dbg !21
138
+ %88 = tail call float @llvm.nvvm.fma.rn.f(float %84, float %85, float %83) #4, !dbg !21
139
+ %.02.i6 = select i1 %.not2.i5, float %88, float %87, !dbg !21
140
+ %89 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
141
+ %.not3.i7 = icmp eq i32 %89, 0, !dbg !21
142
+ %90 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i6, float %85, float %82) #4, !dbg !21
143
+ %91 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i6, float %85, float %82) #4, !dbg !21
144
+ %.03.i8 = select i1 %.not3.i7, float %91, float %90, !dbg !21
145
+ %92 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
146
+ %.not4.i9 = icmp eq i32 %92, 0, !dbg !21
147
+ %93 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i8, float %85, float %81) #4, !dbg !21
148
+ %94 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i8, float %85, float %81) #4, !dbg !21
149
+ %.04.i10 = select i1 %.not4.i9, float %94, float %93, !dbg !21
150
+ %95 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
151
+ %.not5.i11 = icmp eq i32 %95, 0, !dbg !21
152
+ %96 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i10, float %85, float %80) #4, !dbg !21
153
+ %97 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i10, float %85, float %80) #4, !dbg !21
154
+ %.05.i12 = select i1 %.not5.i11, float %97, float %96, !dbg !21
155
+ %98 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
156
+ %.not6.i13 = icmp eq i32 %98, 0, !dbg !21
157
+ %99 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i12, float %85, float %79) #4, !dbg !21
158
+ %100 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i12, float %85, float %79) #4, !dbg !21
159
+ %.06.i14 = select i1 %.not6.i13, float %100, float %99, !dbg !21
160
+ %101 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
161
+ %.not7.i15 = icmp eq i32 %101, 0, !dbg !21
162
+ %102 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i14, float %85, float %78) #4, !dbg !21
163
+ %103 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i14, float %85, float %78) #4, !dbg !21
164
+ %.07.i16 = select i1 %.not7.i15, float %103, float %102, !dbg !21
165
+ %104 = fneg float %85, !dbg !21
166
+ %105 = select i1 %74, float %104, float %24, !dbg !21
167
+ %106 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
168
+ %.not8.i17 = icmp eq i32 %106, 0, !dbg !21
169
+ %107 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i16, float %105, float %105) #4, !dbg !21
170
+ %108 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i16, float %105, float %105) #4, !dbg !21
171
+ %.08.i18 = select i1 %.not8.i17, float %108, float %107, !dbg !21
172
+ br i1 %74, label %109, label %__nv_erff.exit23, !dbg !21
173
+
174
+ 109: ; preds = %__internal_fmad.exit.i4
175
+ %110 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i18) #4, !dbg !21
176
+ %111 = fsub float 1.000000e+00, %110, !dbg !21
177
+ %112 = bitcast float %111 to i32, !dbg !21
178
+ %113 = bitcast float %24 to i32, !dbg !21
179
+ %114 = and i32 %113, -2147483648, !dbg !21
180
+ %115 = or i32 %114, %112, !dbg !21
181
+ %116 = bitcast i32 %115 to float, !dbg !21
182
+ br label %__nv_erff.exit23, !dbg !21
183
+
184
+ __nv_erff.exit23: ; preds = %__internal_fmad.exit.i4, %109
185
+ %r.0.i19 = phi float [ %116, %109 ], [ %.08.i18, %__internal_fmad.exit.i4 ], !dbg !21
186
+ %117 = fadd float %r.0.i, 1.000000e+00, !dbg !22
187
+ %118 = fadd float %r.0.i19, 1.000000e+00, !dbg !22
188
+ %119 = fmul float %117, 5.000000e-01, !dbg !23
189
+ %120 = fmul float %118, 5.000000e-01, !dbg !23
190
+ %121 = fmul float %21, %21, !dbg !24
191
+ %122 = fmul float %22, %22, !dbg !24
192
+ %123 = fmul float %121, -5.000000e-01, !dbg !25
193
+ %124 = fmul float %122, -5.000000e-01, !dbg !25
194
+ %125 = fmul float %123, 0x3FF7154760000000, !dbg !26
195
+ %126 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %125) #4, !dbg !26
196
+ %127 = fmul float %124, 0x3FF7154760000000, !dbg !26
197
+ %128 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %127) #4, !dbg !26
198
+ %129 = fmul float %126, 0x3FD9884540000000, !dbg !27
199
+ %130 = fmul float %128, 0x3FD9884540000000, !dbg !27
200
+ %131 = fmul float %21, %129, !dbg !28
201
+ %132 = fmul float %22, %130, !dbg !28
202
+ %133 = fadd float %119, %131, !dbg !29
203
+ %134 = fadd float %120, %132, !dbg !29
204
+ %135 = fmul float %15, %133, !dbg !30
205
+ %136 = fmul float %16, %134, !dbg !30
206
+ %137 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %135) #4, !dbg !31
207
+ %138 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %136) #4, !dbg !31
208
+ %139 = insertelement <2 x i16> undef, i16 %137, i64 0, !dbg !31
209
+ %140 = insertelement <2 x i16> %139, i16 %138, i64 1, !dbg !31
210
+ %141 = bitcast <2 x i16> %140 to i32, !dbg !31
211
+ tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %141, ptr addrspace(1) %11, i1 true) #4, !dbg !31
212
+ ret void, !dbg !32
213
+ }
214
+
215
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
216
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
217
+
218
+ ; Function Attrs: alwaysinline nounwind
219
+ define float @__nv_erff(float %a) local_unnamed_addr #1 {
220
+ __nv_fabsf.exit:
221
+ %0 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
222
+ %.not = icmp eq i32 %0, 0
223
+ %1 = tail call float @llvm.nvvm.fabs.ftz.f(float %a) #4
224
+ %2 = tail call float @llvm.nvvm.fabs.f(float %a) #4
225
+ %.0 = select i1 %.not, float %2, float %1
226
+ %3 = fcmp oge float %.0, 0x3FF00C1FC0000000
227
+ br i1 %3, label %__nv_fabsf.exit1, label %5
228
+
229
+ __nv_fabsf.exit1: ; preds = %__nv_fabsf.exit
230
+ %4 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
231
+ %.not1 = icmp eq i32 %4, 0
232
+ %.01 = select i1 %.not1, float %2, float %1
233
+ br label %__internal_fmad.exit
234
+
235
+ 5: ; preds = %__nv_fabsf.exit
236
+ %6 = fmul float %a, %a
237
+ br label %__internal_fmad.exit
238
+
239
+ __internal_fmad.exit: ; preds = %5, %__nv_fabsf.exit1
240
+ %7 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1 ], [ 0x3FC06EBA60000000, %5 ]
241
+ %8 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1 ], [ 0xBFD8127580000000, %5 ]
242
+ %9 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1 ], [ 0x3FBCE315E0000000, %5 ]
243
+ %10 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1 ], [ 0xBF9B837CE0000000, %5 ]
244
+ %11 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1 ], [ 0x3F755ABD40000000, %5 ]
245
+ %12 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1 ], [ 0xBF4AE9A400000000, %5 ]
246
+ %13 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1 ], [ 0x3F163D2D40000000, %5 ]
247
+ %14 = phi float [ %.01, %__nv_fabsf.exit1 ], [ %6, %5 ]
248
+ %15 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
249
+ %.not2 = icmp eq i32 %15, 0
250
+ %16 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %13, float %14, float %12) #4
251
+ %17 = tail call float @llvm.nvvm.fma.rn.f(float %13, float %14, float %12) #4
252
+ %.02 = select i1 %.not2, float %17, float %16
253
+ %18 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
254
+ %.not3 = icmp eq i32 %18, 0
255
+ %19 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02, float %14, float %11) #4
256
+ %20 = tail call float @llvm.nvvm.fma.rn.f(float %.02, float %14, float %11) #4
257
+ %.03 = select i1 %.not3, float %20, float %19
258
+ %21 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
259
+ %.not4 = icmp eq i32 %21, 0
260
+ %22 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03, float %14, float %10) #4
261
+ %23 = tail call float @llvm.nvvm.fma.rn.f(float %.03, float %14, float %10) #4
262
+ %.04 = select i1 %.not4, float %23, float %22
263
+ %24 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
264
+ %.not5 = icmp eq i32 %24, 0
265
+ %25 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04, float %14, float %9) #4
266
+ %26 = tail call float @llvm.nvvm.fma.rn.f(float %.04, float %14, float %9) #4
267
+ %.05 = select i1 %.not5, float %26, float %25
268
+ %27 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
269
+ %.not6 = icmp eq i32 %27, 0
270
+ %28 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05, float %14, float %8) #4
271
+ %29 = tail call float @llvm.nvvm.fma.rn.f(float %.05, float %14, float %8) #4
272
+ %.06 = select i1 %.not6, float %29, float %28
273
+ %30 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
274
+ %.not7 = icmp eq i32 %30, 0
275
+ %31 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06, float %14, float %7) #4
276
+ %32 = tail call float @llvm.nvvm.fma.rn.f(float %.06, float %14, float %7) #4
277
+ %.07 = select i1 %.not7, float %32, float %31
278
+ %33 = fneg float %14
279
+ %34 = select i1 %3, float %33, float %a
280
+ %35 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
281
+ %.not8 = icmp eq i32 %35, 0
282
+ %36 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07, float %34, float %34) #4
283
+ %37 = tail call float @llvm.nvvm.fma.rn.f(float %.07, float %34, float %34) #4
284
+ %.08 = select i1 %.not8, float %37, float %36
285
+ br i1 %3, label %38, label %46
286
+
287
+ 38: ; preds = %__internal_fmad.exit
288
+ %39 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08) #4
289
+ %40 = fsub float 1.000000e+00, %39
290
+ %41 = bitcast float %40 to i32
291
+ %42 = bitcast float %a to i32
292
+ %43 = and i32 %42, -2147483648
293
+ %44 = or i32 %43, %41
294
+ %45 = bitcast i32 %44 to float
295
+ br label %46
296
+
297
+ 46: ; preds = %38, %__internal_fmad.exit
298
+ %r.0 = phi float [ %45, %38 ], [ %.08, %__internal_fmad.exit ]
299
+ ret float %r.0
300
+ }
301
+
302
+ declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #2
303
+
304
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
305
+ declare float @llvm.nvvm.fabs.ftz.f(float) #0
306
+
307
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
308
+ declare float @llvm.nvvm.fabs.f(float) #0
309
+
310
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
311
+ declare float @llvm.nvvm.fma.rn.ftz.f(float, float, float) #0
312
+
313
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
314
+ declare float @llvm.nvvm.fma.rn.f(float, float, float) #0
315
+
316
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
317
+ declare float @llvm.nvvm.ex2.approx.ftz.f(float) #3
318
+
319
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
320
+ attributes #1 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
321
+ attributes #2 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
322
+ attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
323
+ attributes #4 = { nounwind }
324
+
325
+ !llvm.module.flags = !{!0, !1}
326
+ !llvm.dbg.cu = !{!2}
327
+ !nvvm.annotations = !{!4, !5, !5, !4}
328
+ !llvm.ident = !{!6}
329
+
330
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
331
+ !1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
332
+ !2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
333
+ !3 = !DIFile(filename: "c5jxaguxho3nhrlt5vcinnz5fevodumlpwn4wyb2vx3xrveicerl.py", directory: "/tmp/torchinductor_root/5j")
334
+ !4 = !{ptr @triton__0d1d2de, !"kernel", i32 1}
335
+ !5 = !{ptr @triton__0d1d2de, !"maxntidx", i32 256}
336
+ !6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
337
+ !7 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
338
+ !8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
339
+ !9 = !{}
340
+ !10 = !DILocation(line: 21, column: 36, scope: !7)
341
+ !11 = !DILocation(line: 20, column: 28, scope: !7)
342
+ !12 = !DILocation(line: 20, column: 33, scope: !7)
343
+ !13 = !DILocation(line: 21, column: 23, scope: !7)
344
+ !14 = !DILocation(line: 24, column: 34, scope: !7)
345
+ !15 = !DILocation(line: 24, column: 39, scope: !7)
346
+ !16 = !DILocation(line: 24, column: 48, scope: !7)
347
+ !17 = !DILocation(line: 25, column: 30, scope: !7)
348
+ !18 = !DILocation(line: 25, column: 35, scope: !7)
349
+ !19 = !DILocation(line: 25, column: 44, scope: !7)
350
+ !20 = !DILocation(line: 29, column: 18, scope: !7)
351
+ !21 = !DILocation(line: 30, column: 23, scope: !7)
352
+ !22 = !DILocation(line: 32, column: 18, scope: !7)
353
+ !23 = !DILocation(line: 34, column: 19, scope: !7)
354
+ !24 = !DILocation(line: 35, column: 19, scope: !7)
355
+ !25 = !DILocation(line: 37, column: 20, scope: !7)
356
+ !26 = !DILocation(line: 38, column: 19, scope: !7)
357
+ !27 = !DILocation(line: 40, column: 20, scope: !7)
358
+ !28 = !DILocation(line: 41, column: 19, scope: !7)
359
+ !29 = !DILocation(line: 42, column: 20, scope: !7)
360
+ !30 = !DILocation(line: 43, column: 19, scope: !7)
361
+ !31 = !DILocation(line: 45, column: 40, scope: !7)
362
+ !32 = !DILocation(line: 45, column: 4, scope: !7)
.triton/dump/4d7b96448927b8146af43cb9f39e0544/triton_.ptx ADDED
@@ -0,0 +1,486 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2de
10
+ .global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
11
+
12
+ .visible .entry triton__0d1d2de(
13
+ .param .u64 triton__0d1d2de_param_0,
14
+ .param .u64 triton__0d1d2de_param_1,
15
+ .param .u32 triton__0d1d2de_param_2
16
+ )
17
+ .maxntid 256, 1, 1
18
+ {
19
+ .reg .pred %p<10>;
20
+ .reg .b16 %rs<7>;
21
+ .reg .b32 %r<25>;
22
+ .reg .f32 %f<127>;
23
+ .reg .b64 %rd<8>;
24
+ .loc 1 18 0
25
+ $L__func_begin0:
26
+ .loc 1 18 0
27
+
28
+ ld.param.u64 %rd4, [triton__0d1d2de_param_0];
29
+ ld.param.u64 %rd5, [triton__0d1d2de_param_1];
30
+ $L__tmp0:
31
+ .loc 1 21 36
32
+ mov.u32 %r8, %tid.x;
33
+ shl.b32 %r9, %r8, 1;
34
+ and.b32 %r10, %r9, 510;
35
+ .loc 1 20 28
36
+ mov.u32 %r1, %ctaid.x;
37
+ .loc 1 20 33
38
+ shl.b32 %r11, %r1, 9;
39
+ .loc 1 21 23
40
+ or.b32 %r12, %r11, %r10;
41
+ .loc 1 24 34
42
+ mul.wide.s32 %rd6, %r12, 2;
43
+ add.s64 %rd7, %rd4, %rd6;
44
+ mov.pred %p1, -1;
45
+ .loc 1 24 39
46
+ mov.u32 %r2, 0x0;
47
+ @%p1 ld.global.b32 { %r2 }, [ %rd7 + 0 ];
48
+ .loc 1 25 30
49
+ add.s64 %rd3, %rd5, %rd6;
50
+ .loc 1 25 35
51
+ mov.u32 %r5, 0x0;
52
+ @%p1 ld.global.b32 { %r5 }, [ %rd3 + 0 ];
53
+ cvt.u16.u32 %rs3, %r5;
54
+ { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r5; }
55
+ .loc 1 25 44
56
+ cvt.f32.bf16 %r6, %rs3;
57
+ mov.b32 %f3, %r6;
58
+ cvt.f32.bf16 %r7, %rs4;
59
+ mov.b32 %f4, %r7;
60
+ .loc 1 29 18
61
+ mul.f32 %f5, %f3, 0f3F3504F3;
62
+ .loc 1 30 23
63
+ abs.ftz.f32 %f7, %f5;
64
+ setp.ge.f32 %p3, %f7, 0f3F8060FE;
65
+ mov.f32 %f115, 0f3789CA3C;
66
+ mov.f32 %f114, 0fB9F560B9;
67
+ mov.f32 %f113, 0f3BAC840B;
68
+ mov.f32 %f112, 0fBD0C8162;
69
+ mov.f32 %f111, 0f3E1CF906;
70
+ mov.f32 %f110, 0f3F6A937E;
71
+ mov.f32 %f109, 0f3F20D842;
72
+ mov.f32 %f116, %f7;
73
+ @%p3 bra $L__BB0_2;
74
+ .loc 1 0 23
75
+ mov.f32 %f115, 0f38B1E96A;
76
+ mov.f32 %f114, 0fBA574D20;
77
+ mov.f32 %f113, 0f3BAAD5EA;
78
+ mov.f32 %f112, 0fBCDC1BE7;
79
+ mov.f32 %f111, 0f3DE718AF;
80
+ mov.f32 %f110, 0fBEC093AC;
81
+ mov.f32 %f109, 0f3E0375D3;
82
+ .loc 1 30 23
83
+ mul.f32 %f116, %f5, %f5;
84
+ $L__BB0_2:
85
+ .loc 1 0 0
86
+ cvt.u16.u32 %rs1, %r2;
87
+ { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; }
88
+ mul.f32 %f6, %f4, 0f3F3504F3;
89
+ .loc 1 30 23
90
+ setp.ltu.f32 %p4, %f7, 0f3F8060FE;
91
+ fma.rn.ftz.f32 %f47, %f115, %f116, %f114;
92
+ fma.rn.ftz.f32 %f48, %f47, %f116, %f113;
93
+ fma.rn.ftz.f32 %f49, %f48, %f116, %f112;
94
+ fma.rn.ftz.f32 %f50, %f49, %f116, %f111;
95
+ fma.rn.ftz.f32 %f51, %f50, %f116, %f110;
96
+ fma.rn.ftz.f32 %f52, %f51, %f116, %f109;
97
+ neg.f32 %f53, %f116;
98
+ selp.f32 %f54, %f53, %f5, %p3;
99
+ fma.rn.ftz.f32 %f117, %f52, %f54, %f54;
100
+ mov.f32 %f108, 0f3F800000;
101
+ @%p4 bra $L__BB0_4;
102
+ ex2.approx.ftz.f32 %f55, %f117;
103
+ sub.f32 %f57, %f108, %f55;
104
+ mov.b32 %r13, %f57;
105
+ mov.b32 %r14, %f5;
106
+ and.b32 %r15, %r14, -2147483648;
107
+ or.b32 %r16, %r15, %r13;
108
+ mov.b32 %f117, %r16;
109
+ $L__BB0_4:
110
+ .loc 1 0 0
111
+ cvt.f32.bf16 %r3, %rs1;
112
+ cvt.f32.bf16 %r4, %rs2;
113
+ .loc 1 30 23
114
+ abs.ftz.f32 %f20, %f6;
115
+ setp.ge.f32 %p6, %f20, 0f3F8060FE;
116
+ mov.f32 %f124, 0f3789CA3C;
117
+ mov.f32 %f123, 0fB9F560B9;
118
+ mov.f32 %f122, 0f3BAC840B;
119
+ mov.f32 %f121, 0fBD0C8162;
120
+ mov.f32 %f120, 0f3E1CF906;
121
+ mov.f32 %f119, 0f3F6A937E;
122
+ mov.f32 %f118, 0f3F20D842;
123
+ mov.f32 %f125, %f20;
124
+ @%p6 bra $L__BB0_6;
125
+ mul.f32 %f125, %f6, %f6;
126
+ mov.f32 %f124, 0f38B1E96A;
127
+ mov.f32 %f123, 0fBA574D20;
128
+ mov.f32 %f122, 0f3BAAD5EA;
129
+ mov.f32 %f121, 0fBCDC1BE7;
130
+ mov.f32 %f120, 0f3DE718AF;
131
+ mov.f32 %f119, 0fBEC093AC;
132
+ mov.f32 %f118, 0f3E0375D3;
133
+ $L__BB0_6:
134
+ .loc 1 0 0
135
+ mov.b32 %f1, %r3;
136
+ mov.b32 %f2, %r4;
137
+ .loc 1 30 23
138
+ setp.ltu.f32 %p7, %f20, 0f3F8060FE;
139
+ fma.rn.ftz.f32 %f72, %f124, %f125, %f123;
140
+ fma.rn.ftz.f32 %f73, %f72, %f125, %f122;
141
+ fma.rn.ftz.f32 %f74, %f73, %f125, %f121;
142
+ fma.rn.ftz.f32 %f75, %f74, %f125, %f120;
143
+ fma.rn.ftz.f32 %f76, %f75, %f125, %f119;
144
+ fma.rn.ftz.f32 %f77, %f76, %f125, %f118;
145
+ neg.f32 %f78, %f125;
146
+ selp.f32 %f79, %f78, %f6, %p6;
147
+ fma.rn.ftz.f32 %f126, %f77, %f79, %f79;
148
+ @%p7 bra $L__BB0_8;
149
+ ex2.approx.ftz.f32 %f80, %f126;
150
+ sub.f32 %f82, %f108, %f80;
151
+ mov.b32 %r17, %f82;
152
+ mov.b32 %r18, %f6;
153
+ and.b32 %r19, %r18, -2147483648;
154
+ or.b32 %r20, %r19, %r17;
155
+ mov.b32 %f126, %r20;
156
+ $L__BB0_8:
157
+ .loc 1 32 18
158
+ add.f32 %f87, %f117, 0f3F800000;
159
+ add.f32 %f88, %f126, 0f3F800000;
160
+ .loc 1 35 19
161
+ mul.f32 %f89, %f3, %f3;
162
+ mul.f32 %f90, %f4, %f4;
163
+ .loc 1 37 20
164
+ mul.f32 %f91, %f89, 0fBF000000;
165
+ mul.f32 %f92, %f90, 0fBF000000;
166
+ .loc 1 38 19
167
+ mul.f32 %f84, %f91, 0f3FB8AA3B;
168
+ ex2.approx.f32 %f83, %f84;
169
+ mul.f32 %f86, %f92, 0f3FB8AA3B;
170
+ ex2.approx.f32 %f85, %f86;
171
+ .loc 1 40 20
172
+ mul.f32 %f93, %f83, 0f3ECC422A;
173
+ mul.f32 %f94, %f85, 0f3ECC422A;
174
+ .loc 1 41 19
175
+ mul.f32 %f95, %f3, %f93;
176
+ mul.f32 %f96, %f4, %f94;
177
+ .loc 1 42 20
178
+ fma.rn.f32 %f97, %f87, 0f3F000000, %f95;
179
+ fma.rn.f32 %f98, %f88, 0f3F000000, %f96;
180
+ .loc 1 43 19
181
+ mul.f32 %f99, %f1, %f97;
182
+ mul.f32 %f100, %f2, %f98;
183
+ .loc 1 45 40
184
+ mov.b32 %r21, %f99;
185
+ cvt.rn.bf16.f32 %rs5, %r21;
186
+ mov.b32 %r22, %f100;
187
+ cvt.rn.bf16.f32 %rs6, %r22;
188
+ mov.b32 %r24, {%rs5, %rs6};
189
+ @%p1 st.global.b32 [ %rd7 + 0 ], { %r24 };
190
+ .loc 1 45 4
191
+ ret;
192
+ $L__tmp1:
193
+ $L__func_end0:
194
+
195
+ }
196
+ // .globl __nv_erff
197
+ .visible .func (.param .b32 func_retval0) __nv_erff(
198
+ .param .b32 __nv_erff_param_0
199
+ )
200
+ {
201
+ .reg .pred %p<4>;
202
+ .reg .b32 %r<5>;
203
+ .reg .f32 %f<49>;
204
+ $L__func_begin1:
205
+
206
+ ld.param.f32 %f14, [__nv_erff_param_0];
207
+ abs.ftz.f32 %f1, %f14;
208
+ setp.ge.f32 %p1, %f1, 0f3F8060FE;
209
+ mov.f32 %f46, 0f3789CA3C;
210
+ mov.f32 %f45, 0fB9F560B9;
211
+ mov.f32 %f44, 0f3BAC840B;
212
+ mov.f32 %f43, 0fBD0C8162;
213
+ mov.f32 %f42, 0f3E1CF906;
214
+ mov.f32 %f41, 0f3F6A937E;
215
+ mov.f32 %f40, 0f3F20D842;
216
+ mov.f32 %f47, %f1;
217
+ @%p1 bra $L__BB1_2;
218
+ mul.f32 %f47, %f14, %f14;
219
+ mov.f32 %f46, 0f38B1E96A;
220
+ mov.f32 %f45, 0fBA574D20;
221
+ mov.f32 %f44, 0f3BAAD5EA;
222
+ mov.f32 %f43, 0fBCDC1BE7;
223
+ mov.f32 %f42, 0f3DE718AF;
224
+ mov.f32 %f41, 0fBEC093AC;
225
+ mov.f32 %f40, 0f3E0375D3;
226
+ $L__BB1_2:
227
+ setp.ltu.f32 %p2, %f1, 0f3F8060FE;
228
+ fma.rn.ftz.f32 %f29, %f46, %f47, %f45;
229
+ fma.rn.ftz.f32 %f30, %f29, %f47, %f44;
230
+ fma.rn.ftz.f32 %f31, %f30, %f47, %f43;
231
+ fma.rn.ftz.f32 %f32, %f31, %f47, %f42;
232
+ fma.rn.ftz.f32 %f33, %f32, %f47, %f41;
233
+ fma.rn.ftz.f32 %f34, %f33, %f47, %f40;
234
+ neg.f32 %f35, %f47;
235
+ selp.f32 %f36, %f35, %f14, %p1;
236
+ fma.rn.ftz.f32 %f48, %f34, %f36, %f36;
237
+ @%p2 bra $L__BB1_4;
238
+ ex2.approx.ftz.f32 %f37, %f48;
239
+ mov.f32 %f38, 0f3F800000;
240
+ sub.f32 %f39, %f38, %f37;
241
+ mov.b32 %r1, %f39;
242
+ mov.b32 %r2, %f14;
243
+ and.b32 %r3, %r2, -2147483648;
244
+ or.b32 %r4, %r3, %r1;
245
+ mov.b32 %f48, %r4;
246
+ $L__BB1_4:
247
+ st.param.f32 [func_retval0+0], %f48;
248
+ ret;
249
+ $L__func_end1:
250
+
251
+ }
252
+ .file 1 "/tmp/torchinductor_root/5j/c5jxaguxho3nhrlt5vcinnz5fevodumlpwn4wyb2vx3xrveicerl.py"
253
+ .section .debug_abbrev
254
+ {
255
+ .b8 1
256
+ .b8 17
257
+ .b8 1
258
+ .b8 37
259
+ .b8 8
260
+ .b8 19
261
+ .b8 5
262
+ .b8 3
263
+ .b8 8
264
+ .b8 16
265
+ .b8 6
266
+ .b8 27
267
+ .b8 8
268
+ .b8 180
269
+ .b8 66
270
+ .b8 12
271
+ .b8 17
272
+ .b8 1
273
+ .b8 18
274
+ .b8 1
275
+ .b8 0
276
+ .b8 0
277
+ .b8 2
278
+ .b8 46
279
+ .b8 0
280
+ .b8 17
281
+ .b8 1
282
+ .b8 18
283
+ .b8 1
284
+ .b8 64
285
+ .b8 10
286
+ .b8 135
287
+ .b8 64
288
+ .b8 8
289
+ .b8 3
290
+ .b8 8
291
+ .b8 58
292
+ .b8 11
293
+ .b8 59
294
+ .b8 11
295
+ .b8 63
296
+ .b8 12
297
+ .b8 0
298
+ .b8 0
299
+ .b8 0
300
+ }
301
+ .section .debug_info
302
+ {
303
+ .b32 176
304
+ .b8 2
305
+ .b8 0
306
+ .b32 .debug_abbrev
307
+ .b8 8
308
+ .b8 1
309
+ .b8 116
310
+ .b8 114
311
+ .b8 105
312
+ .b8 116
313
+ .b8 111
314
+ .b8 110
315
+ .b8 0
316
+ .b8 2
317
+ .b8 0
318
+ .b8 99
319
+ .b8 53
320
+ .b8 106
321
+ .b8 120
322
+ .b8 97
323
+ .b8 103
324
+ .b8 117
325
+ .b8 120
326
+ .b8 104
327
+ .b8 111
328
+ .b8 51
329
+ .b8 110
330
+ .b8 104
331
+ .b8 114
332
+ .b8 108
333
+ .b8 116
334
+ .b8 53
335
+ .b8 118
336
+ .b8 99
337
+ .b8 105
338
+ .b8 110
339
+ .b8 110
340
+ .b8 122
341
+ .b8 53
342
+ .b8 102
343
+ .b8 101
344
+ .b8 118
345
+ .b8 111
346
+ .b8 100
347
+ .b8 117
348
+ .b8 109
349
+ .b8 108
350
+ .b8 112
351
+ .b8 119
352
+ .b8 110
353
+ .b8 52
354
+ .b8 119
355
+ .b8 121
356
+ .b8 98
357
+ .b8 50
358
+ .b8 118
359
+ .b8 120
360
+ .b8 51
361
+ .b8 120
362
+ .b8 114
363
+ .b8 118
364
+ .b8 101
365
+ .b8 105
366
+ .b8 99
367
+ .b8 101
368
+ .b8 114
369
+ .b8 108
370
+ .b8 46
371
+ .b8 112
372
+ .b8 121
373
+ .b8 0
374
+ .b32 .debug_line
375
+ .b8 47
376
+ .b8 116
377
+ .b8 109
378
+ .b8 112
379
+ .b8 47
380
+ .b8 116
381
+ .b8 111
382
+ .b8 114
383
+ .b8 99
384
+ .b8 104
385
+ .b8 105
386
+ .b8 110
387
+ .b8 100
388
+ .b8 117
389
+ .b8 99
390
+ .b8 116
391
+ .b8 111
392
+ .b8 114
393
+ .b8 95
394
+ .b8 114
395
+ .b8 111
396
+ .b8 111
397
+ .b8 116
398
+ .b8 47
399
+ .b8 53
400
+ .b8 106
401
+ .b8 0
402
+ .b8 1
403
+ .b64 $L__func_begin0
404
+ .b64 $L__func_end0
405
+ .b8 2
406
+ .b64 $L__func_begin0
407
+ .b64 $L__func_end0
408
+ .b8 1
409
+ .b8 156
410
+ .b8 116
411
+ .b8 114
412
+ .b8 105
413
+ .b8 116
414
+ .b8 111
415
+ .b8 110
416
+ .b8 95
417
+ .b8 95
418
+ .b8 48
419
+ .b8 100
420
+ .b8 49
421
+ .b8 100
422
+ .b8 50
423
+ .b8 100
424
+ .b8 101
425
+ .b8 0
426
+ .b8 116
427
+ .b8 114
428
+ .b8 105
429
+ .b8 116
430
+ .b8 111
431
+ .b8 110
432
+ .b8 95
433
+ .b8 95
434
+ .b8 48
435
+ .b8 100
436
+ .b8 49
437
+ .b8 100
438
+ .b8 50
439
+ .b8 100
440
+ .b8 101
441
+ .b8 0
442
+ .b8 1
443
+ .b8 18
444
+ .b8 1
445
+ .b8 0
446
+ }
447
+ .section .debug_pubnames
448
+ {
449
+ .b32 $L__pubNames_end0-$L__pubNames_start0
450
+ $L__pubNames_start0:
451
+ .b8 2
452
+ .b8 0
453
+ .b32 .debug_info
454
+ .b32 180
455
+ .b32 125
456
+ .b8 116
457
+ .b8 114
458
+ .b8 105
459
+ .b8 116
460
+ .b8 111
461
+ .b8 110
462
+ .b8 95
463
+ .b8 95
464
+ .b8 48
465
+ .b8 100
466
+ .b8 49
467
+ .b8 100
468
+ .b8 50
469
+ .b8 100
470
+ .b8 101
471
+ .b8 0
472
+ .b32 0
473
+ $L__pubNames_end0:
474
+ }
475
+ .section .debug_pubtypes
476
+ {
477
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
478
+ $L__pubTypes_start0:
479
+ .b8 2
480
+ .b8 0
481
+ .b32 .debug_info
482
+ .b32 180
483
+ .b32 0
484
+ $L__pubTypes_end0:
485
+ }
486
+ .section .debug_loc { }
.triton/dump/4d7b96448927b8146af43cb9f39e0544/triton_.ttgir ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
2
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
3
+ tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
4
+ %cst = arith.constant dense<0.398942292> : tensor<512xf32, #blocked>
5
+ %cst_0 = arith.constant dense<-5.000000e-01> : tensor<512xf32, #blocked>
6
+ %cst_1 = arith.constant dense<5.000000e-01> : tensor<512xf32, #blocked>
7
+ %cst_2 = arith.constant dense<1.000000e+00> : tensor<512xf32, #blocked>
8
+ %cst_3 = arith.constant dense<0.707106769> : tensor<512xf32, #blocked>
9
+ %c512_i32 = arith.constant 512 : i32
10
+ %0 = tt.get_program_id x : i32
11
+ %1 = arith.muli %0, %c512_i32 : i32
12
+ %2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked>
13
+ %3 = tt.splat %1 : (i32) -> tensor<512xi32, #blocked>
14
+ %4 = arith.addi %3, %2 : tensor<512xi32, #blocked>
15
+ %5 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<512x!tt.ptr<bf16, 1>, #blocked>
16
+ %6 = tt.addptr %5, %4 : tensor<512x!tt.ptr<bf16, 1>, #blocked>, tensor<512xi32, #blocked>
17
+ %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xbf16, #blocked>
18
+ %8 = arith.extf %7 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked>
19
+ %9 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<512x!tt.ptr<bf16, 1>, #blocked>
20
+ %10 = tt.addptr %9, %4 : tensor<512x!tt.ptr<bf16, 1>, #blocked>, tensor<512xi32, #blocked>
21
+ %11 = tt.load %10 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xbf16, #blocked>
22
+ %12 = arith.extf %11 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked>
23
+ %13 = arith.mulf %12, %cst_3 : tensor<512xf32, #blocked>
24
+ %14 = tt.extern_elementwise %13 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_erff"} : (tensor<512xf32, #blocked>) -> tensor<512xf32, #blocked>
25
+ %15 = arith.addf %14, %cst_2 : tensor<512xf32, #blocked>
26
+ %16 = arith.mulf %15, %cst_1 : tensor<512xf32, #blocked>
27
+ %17 = arith.mulf %12, %12 : tensor<512xf32, #blocked>
28
+ %18 = arith.mulf %17, %cst_0 : tensor<512xf32, #blocked>
29
+ %19 = math.exp %18 : tensor<512xf32, #blocked>
30
+ %20 = arith.mulf %19, %cst : tensor<512xf32, #blocked>
31
+ %21 = arith.mulf %12, %20 : tensor<512xf32, #blocked>
32
+ %22 = arith.addf %16, %21 : tensor<512xf32, #blocked>
33
+ %23 = arith.mulf %8, %22 : tensor<512xf32, #blocked>
34
+ %24 = arith.truncf %23 : tensor<512xf32, #blocked> to tensor<512xbf16, #blocked>
35
+ tt.store %6, %24 {cache = 1 : i32, evict = 1 : i32} : tensor<512xbf16, #blocked>
36
+ tt.return
37
+ }
38
+ }
.triton/dump/4d7b96448927b8146af43cb9f39e0544/triton_.ttir ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %cst = arith.constant dense<0.398942292> : tensor<512xf32>
4
+ %cst_0 = arith.constant dense<-5.000000e-01> : tensor<512xf32>
5
+ %cst_1 = arith.constant dense<5.000000e-01> : tensor<512xf32>
6
+ %cst_2 = arith.constant dense<1.000000e+00> : tensor<512xf32>
7
+ %cst_3 = arith.constant dense<0.707106769> : tensor<512xf32>
8
+ %c512_i32 = arith.constant 512 : i32
9
+ %0 = tt.get_program_id x : i32
10
+ %1 = arith.muli %0, %c512_i32 : i32
11
+ %2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32>
12
+ %3 = tt.splat %1 : (i32) -> tensor<512xi32>
13
+ %4 = arith.addi %3, %2 : tensor<512xi32>
14
+ %5 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<512x!tt.ptr<bf16, 1>>
15
+ %6 = tt.addptr %5, %4 : tensor<512x!tt.ptr<bf16, 1>>, tensor<512xi32>
16
+ %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xbf16>
17
+ %8 = arith.extf %7 : tensor<512xbf16> to tensor<512xf32>
18
+ %9 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<512x!tt.ptr<bf16, 1>>
19
+ %10 = tt.addptr %9, %4 : tensor<512x!tt.ptr<bf16, 1>>, tensor<512xi32>
20
+ %11 = tt.load %10 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xbf16>
21
+ %12 = arith.extf %11 : tensor<512xbf16> to tensor<512xf32>
22
+ %13 = arith.mulf %12, %cst_3 : tensor<512xf32>
23
+ %14 = tt.extern_elementwise %13 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_erff"} : (tensor<512xf32>) -> tensor<512xf32>
24
+ %15 = arith.addf %14, %cst_2 : tensor<512xf32>
25
+ %16 = arith.mulf %15, %cst_1 : tensor<512xf32>
26
+ %17 = arith.mulf %12, %12 : tensor<512xf32>
27
+ %18 = arith.mulf %17, %cst_0 : tensor<512xf32>
28
+ %19 = math.exp %18 : tensor<512xf32>
29
+ %20 = arith.mulf %19, %cst : tensor<512xf32>
30
+ %21 = arith.mulf %12, %20 : tensor<512xf32>
31
+ %22 = arith.addf %16, %21 : tensor<512xf32>
32
+ %23 = arith.mulf %8, %22 : tensor<512xf32>
33
+ %24 = arith.truncf %23 : tensor<512xf32> to tensor<512xbf16>
34
+ tt.store %6, %24 {cache = 1 : i32, evict = 1 : i32} : tensor<512xbf16>
35
+ tt.return
36
+ }
37
+ }
.triton/dump/645565eaba0a18dd23ef200fe9abb0c0/triton_.cubin ADDED
Binary file (19.5 kB). View file
 
.triton/dump/645565eaba0a18dd23ef200fe9abb0c0/triton_.ptx ADDED
@@ -0,0 +1,834 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3d4d5d6d7d8de9de
10
+ .extern .func __assertfail
11
+ (
12
+ .param .b64 __assertfail_param_0,
13
+ .param .b64 __assertfail_param_1,
14
+ .param .b32 __assertfail_param_2,
15
+ .param .b64 __assertfail_param_3,
16
+ .param .b64 __assertfail_param_4
17
+ )
18
+ ;
19
+ .global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
20
+ .global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
21
+ .global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55};
22
+ .extern .shared .align 1 .b8 global_smem[];
23
+ .global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
24
+
25
+ .visible .entry triton__0d1d2d3d4d5d6d7d8de9de(
26
+ .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_0,
27
+ .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_1,
28
+ .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_2,
29
+ .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_3,
30
+ .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_4,
31
+ .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_5,
32
+ .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_6,
33
+ .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_7,
34
+ .param .u32 triton__0d1d2d3d4d5d6d7d8de9de_param_8,
35
+ .param .u32 triton__0d1d2d3d4d5d6d7d8de9de_param_9
36
+ )
37
+ .maxntid 64, 1, 1
38
+ {
39
+ .reg .pred %p<36>;
40
+ .reg .b16 %rs<5>;
41
+ .reg .b32 %r<109>;
42
+ .reg .f32 %f<70>;
43
+ .reg .b64 %rd<49>;
44
+ .loc 1 18 0
45
+ $L__func_begin0:
46
+ .loc 1 18 0
47
+
48
+ ld.param.u64 %rd8, [triton__0d1d2d3d4d5d6d7d8de9de_param_7];
49
+ ld.param.u64 %rd7, [triton__0d1d2d3d4d5d6d7d8de9de_param_6];
50
+ ld.param.u64 %rd6, [triton__0d1d2d3d4d5d6d7d8de9de_param_5];
51
+ ld.param.u64 %rd5, [triton__0d1d2d3d4d5d6d7d8de9de_param_2];
52
+ ld.param.u64 %rd4, [triton__0d1d2d3d4d5d6d7d8de9de_param_0];
53
+ $L__tmp0:
54
+ .loc 1 26 26
55
+ mov.u32 %r1, %tid.x;
56
+ ld.param.u64 %rd21, [triton__0d1d2d3d4d5d6d7d8de9de_param_1];
57
+ and.b32 %r2, %r1, 63;
58
+ shl.b32 %r28, %r2, 2;
59
+ ld.param.u64 %rd22, [triton__0d1d2d3d4d5d6d7d8de9de_param_3];
60
+ ld.param.u64 %rd23, [triton__0d1d2d3d4d5d6d7d8de9de_param_4];
61
+ .loc 1 23 28
62
+ mov.u32 %r11, %ctaid.x;
63
+ .loc 1 30 18
64
+ shr.s32 %r29, %r11, 31;
65
+ shr.u32 %r30, %r29, 23;
66
+ add.s32 %r31, %r11, %r30;
67
+ and.b32 %r32, %r31, 16776704;
68
+ sub.s32 %r33, %r11, %r32;
69
+ .loc 1 31 30
70
+ cvt.s64.s32 %rd1, %r11;
71
+ mul.wide.s32 %rd24, %r11, 8;
72
+ add.s64 %rd10, %rd21, %rd24;
73
+ mov.pred %p18, -1;
74
+ .loc 1 31 35
75
+ mov.u64 %rd9, 0x0;
76
+ @%p18 ld.global.L1::evict_last.b64 { %rd9 }, [ %rd10 + 0 ];
77
+ mov.u64 %rd11, 0x0;
78
+ @%p18 ld.global.L1::evict_last.b64 { %rd11 }, [ %rd10 + 0 ];
79
+ mov.u64 %rd13, 0x0;
80
+ @%p18 ld.global.L1::evict_last.b64 { %rd13 }, [ %rd10 + 0 ];
81
+ mov.u64 %rd15, 0x0;
82
+ @%p18 ld.global.L1::evict_last.b64 { %rd15 }, [ %rd10 + 0 ];
83
+ mov.u64 %rd17, 0x0;
84
+ @%p18 ld.global.L1::evict_last.b64 { %rd17 }, [ %rd10 + 0 ];
85
+ .loc 1 32 40
86
+ shl.b32 %r34, %r33, 8;
87
+ .loc 1 32 36
88
+ or.b32 %r35, %r34, %r28;
89
+ .loc 1 32 30
90
+ mul.wide.s32 %rd25, %r35, 4;
91
+ add.s64 %rd19, %rd22, %rd25;
92
+ mov.b32 %r41, 0;
93
+ .loc 1 32 46
94
+ mov.u32 %r12, 0x0;
95
+ mov.u32 %r13, 0x0;
96
+ mov.u32 %r14, 0x0;
97
+ mov.u32 %r15, 0x0;
98
+ @%p18 ld.global.L1::evict_last.v4.b32 { %r12, %r13, %r14, %r15 }, [ %rd19 + 0 ];
99
+ @!%p18 mov.u32 %r12, %r41;
100
+ @!%p18 mov.u32 %r13, %r41;
101
+ @!%p18 mov.u32 %r14, %r41;
102
+ @!%p18 mov.u32 %r15, %r41;
103
+ .loc 1 33 31
104
+ cvt.u64.u32 %rd3, %r28;
105
+ mul.wide.u32 %rd26, %r28, 4;
106
+ add.s64 %rd20, %rd23, %rd26;
107
+ .loc 1 33 36
108
+ mov.u32 %r20, 0x0;
109
+ mov.u32 %r21, 0x0;
110
+ mov.u32 %r22, 0x0;
111
+ mov.u32 %r23, 0x0;
112
+ @%p18 ld.global.L1::evict_last.v4.b32 { %r20, %r21, %r22, %r23 }, [ %rd20 + 0 ];
113
+ @!%p18 mov.u32 %r20, %r41;
114
+ @!%p18 mov.u32 %r21, %r41;
115
+ @!%p18 mov.u32 %r22, %r41;
116
+ @!%p18 mov.u32 %r23, %r41;
117
+ .loc 1 34 18
118
+ add.s64 %rd27, %rd17, 50257;
119
+ .loc 1 35 18
120
+ setp.lt.s64 %p16, %rd17, 0;
121
+ .loc 1 36 32
122
+ selp.b64 %rd28, %rd27, %rd17, %p16;
123
+ .loc 1 37 36
124
+ setp.lt.u64 %p17, %rd28, 50257;
125
+ .loc 1 37 51
126
+ @%p17 bra $L__BB0_2;
127
+ mov.u64 %rd29, assertMessage_0;
128
+ cvta.global.u64 %rd30, %rd29;
129
+ mov.u64 %rd31, assertFile_0;
130
+ cvta.global.u64 %rd32, %rd31;
131
+ mov.u64 %rd33, assertFunc_0;
132
+ cvta.global.u64 %rd34, %rd33;
133
+ mov.b32 %r36, 883;
134
+ mov.u64 %rd35, 1;
135
+ { // callseq 0, 0
136
+ .reg .b32 temp_param_reg;
137
+ .param .b64 param0;
138
+ st.param.b64 [param0+0], %rd30;
139
+ .param .b64 param1;
140
+ st.param.b64 [param1+0], %rd32;
141
+ .param .b32 param2;
142
+ st.param.b32 [param2+0], %r36;
143
+ .param .b64 param3;
144
+ st.param.b64 [param3+0], %rd34;
145
+ .param .b64 param4;
146
+ st.param.b64 [param4+0], %rd35;
147
+ call.uni
148
+ __assertfail,
149
+ (
150
+ param0,
151
+ param1,
152
+ param2,
153
+ param3,
154
+ param4
155
+ );
156
+ } // callseq 0
157
+ $L__BB0_2:
158
+ .loc 1 35 18
159
+ setp.lt.s64 %p33, %rd9, 0;
160
+ .loc 1 26 26
161
+ and.b32 %r75, %r1, 31;
162
+ .loc 1 38 40
163
+ shl.b64 %rd41, %rd9, 8;
164
+ add.s64 %rd42, %rd41, 12865792;
165
+ selp.b64 %rd43, %rd42, %rd41, %p33;
166
+ .loc 1 38 36
167
+ or.b64 %rd44, %rd43, %rd3;
168
+ .loc 1 38 30
169
+ shl.b64 %rd45, %rd44, 2;
170
+ add.s64 %rd36, %rd5, %rd45;
171
+ .loc 1 38 48
172
+ mov.u32 %r37, 0x0;
173
+ mov.u32 %r38, 0x0;
174
+ mov.u32 %r39, 0x0;
175
+ mov.u32 %r40, 0x0;
176
+ @%p18 ld.global.v4.b32 { %r37, %r38, %r39, %r40 }, [ %rd36 + 0 ];
177
+ @!%p18 mov.u32 %r37, %r41;
178
+ @!%p18 mov.u32 %r38, %r41;
179
+ @!%p18 mov.u32 %r39, %r41;
180
+ @!%p18 mov.u32 %r40, %r41;
181
+ .loc 1 32 46
182
+ mov.b32 %f1, %r12;
183
+ mov.b32 %f2, %r13;
184
+ .loc 1 38 48
185
+ mov.b32 %f3, %r37;
186
+ mov.b32 %f4, %r38;
187
+ .loc 1 39 18
188
+ add.f32 %f5, %f2, %f4;
189
+ mov.b32 %r64, %f5;
190
+ add.f32 %f6, %f1, %f3;
191
+ .loc 1 32 46
192
+ mov.b32 %f7, %r15;
193
+ mov.b32 %f8, %r14;
194
+ .loc 1 38 48
195
+ mov.b32 %f9, %r40;
196
+ mov.b32 %f10, %r39;
197
+ .loc 1 39 18
198
+ add.f32 %f11, %f8, %f10;
199
+ mov.b32 %r65, %f11;
200
+ add.f32 %f12, %f7, %f9;
201
+ $L__tmp1:
202
+ .loc 2 233 15
203
+ add.f32 %f13, %f6, %f5;
204
+ add.f32 %f14, %f11, %f13;
205
+ add.f32 %f15, %f12, %f14;
206
+ $L__tmp2:
207
+ .loc 2 243 36
208
+ mov.b32 %r76, %f15;
209
+ shfl.sync.bfly.b32 %r77, %r76, 16, 31, -1;
210
+ mov.b32 %f16, %r77;
211
+ $L__tmp3:
212
+ .loc 2 233 15
213
+ add.f32 %f17, %f15, %f16;
214
+ $L__tmp4:
215
+ .loc 2 243 36
216
+ mov.b32 %r78, %f17;
217
+ shfl.sync.bfly.b32 %r79, %r78, 8, 31, -1;
218
+ mov.b32 %f18, %r79;
219
+ $L__tmp5:
220
+ .loc 2 233 15
221
+ add.f32 %f19, %f17, %f18;
222
+ $L__tmp6:
223
+ .loc 2 243 36
224
+ mov.b32 %r80, %f19;
225
+ shfl.sync.bfly.b32 %r81, %r80, 4, 31, -1;
226
+ mov.b32 %f20, %r81;
227
+ $L__tmp7:
228
+ .loc 2 233 15
229
+ add.f32 %f21, %f19, %f20;
230
+ $L__tmp8:
231
+ .loc 2 243 36
232
+ mov.b32 %r82, %f21;
233
+ shfl.sync.bfly.b32 %r83, %r82, 2, 31, -1;
234
+ mov.b32 %f22, %r83;
235
+ $L__tmp9:
236
+ .loc 2 233 15
237
+ add.f32 %f23, %f21, %f22;
238
+ $L__tmp10:
239
+ .loc 2 243 36
240
+ mov.b32 %r84, %f23;
241
+ shfl.sync.bfly.b32 %r85, %r84, 1, 31, -1;
242
+ mov.b32 %f24, %r85;
243
+ $L__tmp11:
244
+ .loc 2 233 15
245
+ add.f32 %f25, %f23, %f24;
246
+ $L__tmp12:
247
+ .loc 2 243 36
248
+ setp.eq.s32 %p23, %r75, 0;
249
+ shr.u32 %r86, %r1, 3;
250
+ and.b32 %r87, %r86, 4;
251
+ mov.u32 %r88, global_smem;
252
+ add.s32 %r45, %r88, %r87;
253
+ mov.b32 %r46, %f25;
254
+ @%p23 st.shared.b32 [ %r45 + 0 ], %r46;
255
+ bar.sync 0;
256
+ setp.lt.s32 %p24, %r1, 2;
257
+ shl.b32 %r89, %r1, 2;
258
+ add.s32 %r48, %r88, %r89;
259
+ @%p24 ld.shared.b32 %r47, [ %r48 + 0 ];
260
+ mov.b32 %f26, %r47;
261
+ shfl.sync.bfly.b32 %r90, %r47, 1, 31, -1;
262
+ mov.b32 %f27, %r90;
263
+ $L__tmp13:
264
+ .loc 2 233 15
265
+ add.f32 %f28, %f26, %f27;
266
+ $L__tmp14:
267
+ .loc 2 243 36
268
+ and.b32 %r91, %r1, 1;
269
+ setp.eq.b32 %p34, %r91, 1;
270
+ not.pred %p35, %p34;
271
+ and.pred %p25, %p24, %p35;
272
+ mov.b32 %r50, %f28;
273
+ @%p25 st.shared.b32 [ %r48 + 0 ], %r50;
274
+ bar.sync 0;
275
+ ld.shared.f32 %f29, [global_smem];
276
+ $L__tmp15:
277
+ .loc 3 8 15
278
+ add.f32 %f30, %f29, 0f00000000;
279
+ $L__tmp16:
280
+ .loc 1 47 20
281
+ mov.b32 %r52, %f30;
282
+ mov.b32 %r53, 1132462080;
283
+ div.full.f32 %r74, %r52, %r53;
284
+ mov.b32 %f31, %r74;
285
+ .loc 1 48 19
286
+ sub.f32 %f32, %f6, %f31;
287
+ sub.f32 %f33, %f5, %f31;
288
+ sub.f32 %f34, %f11, %f31;
289
+ sub.f32 %f35, %f12, %f31;
290
+ .loc 1 49 20
291
+ mul.f32 %f36, %f33, %f33;
292
+ $L__tmp17:
293
+ .loc 2 243 36
294
+ bar.sync 0;
295
+ $L__tmp18:
296
+ .loc 2 233 15
297
+ fma.rn.f32 %f37, %f32, %f32, %f36;
298
+ fma.rn.f32 %f38, %f34, %f34, %f37;
299
+ fma.rn.f32 %f39, %f35, %f35, %f38;
300
+ $L__tmp19:
301
+ .loc 2 243 36
302
+ mov.b32 %r92, %f39;
303
+ shfl.sync.bfly.b32 %r93, %r92, 16, 31, -1;
304
+ mov.b32 %f40, %r93;
305
+ $L__tmp20:
306
+ .loc 2 233 15
307
+ add.f32 %f41, %f39, %f40;
308
+ $L__tmp21:
309
+ .loc 2 243 36
310
+ mov.b32 %r94, %f41;
311
+ shfl.sync.bfly.b32 %r95, %r94, 8, 31, -1;
312
+ mov.b32 %f42, %r95;
313
+ $L__tmp22:
314
+ .loc 2 233 15
315
+ add.f32 %f43, %f41, %f42;
316
+ $L__tmp23:
317
+ .loc 2 243 36
318
+ mov.b32 %r96, %f43;
319
+ shfl.sync.bfly.b32 %r97, %r96, 4, 31, -1;
320
+ mov.b32 %f44, %r97;
321
+ $L__tmp24:
322
+ .loc 2 233 15
323
+ add.f32 %f45, %f43, %f44;
324
+ $L__tmp25:
325
+ .loc 2 243 36
326
+ mov.b32 %r98, %f45;
327
+ shfl.sync.bfly.b32 %r99, %r98, 2, 31, -1;
328
+ mov.b32 %f46, %r99;
329
+ $L__tmp26:
330
+ .loc 2 233 15
331
+ add.f32 %f47, %f45, %f46;
332
+ $L__tmp27:
333
+ .loc 2 243 36
334
+ mov.b32 %r100, %f47;
335
+ shfl.sync.bfly.b32 %r101, %r100, 1, 31, -1;
336
+ mov.b32 %f48, %r101;
337
+ $L__tmp28:
338
+ .loc 2 233 15
339
+ add.f32 %f49, %f47, %f48;
340
+ $L__tmp29:
341
+ .loc 2 243 36
342
+ mov.b32 %r55, %f49;
343
+ @%p23 st.shared.b32 [ %r45 + 0 ], %r55;
344
+ bar.sync 0;
345
+ @%p24 ld.shared.b32 %r56, [ %r48 + 0 ];
346
+ mov.b32 %f50, %r56;
347
+ shfl.sync.bfly.b32 %r102, %r56, 1, 31, -1;
348
+ mov.b32 %f51, %r102;
349
+ $L__tmp30:
350
+ .loc 2 233 15
351
+ add.f32 %f52, %f50, %f51;
352
+ $L__tmp31:
353
+ .loc 2 243 36
354
+ mov.b32 %r59, %f52;
355
+ @%p25 st.shared.b32 [ %r48 + 0 ], %r59;
356
+ bar.sync 0;
357
+ ld.shared.f32 %f53, [global_smem];
358
+ $L__tmp32:
359
+ .loc 3 8 15
360
+ add.f32 %f54, %f53, 0f00000000;
361
+ $L__tmp33:
362
+ .loc 1 54 20
363
+ mov.b32 %r61, %f54;
364
+ div.full.f32 %r60, %r61, %r53;
365
+ mov.b32 %f55, %r60;
366
+ .loc 1 56 20
367
+ add.f32 %f56, %f55, 0f3727C5AC;
368
+ .loc 1 57 26
369
+ rsqrt.approx.ftz.f32 %f57, %f56;
370
+ cvt.u32.u64 %r103, %rd3;
371
+ cvt.u32.u64 %r104, %rd1;
372
+ .loc 1 33 36
373
+ mov.b32 %f58, %r20;
374
+ mov.b32 %f59, %r21;
375
+ mov.b32 %f60, %r22;
376
+ mov.b32 %f61, %r23;
377
+ .loc 1 59 20
378
+ mul.f32 %f62, %f32, %f57;
379
+ mul.f32 %f63, %f33, %f57;
380
+ mul.f32 %f64, %f34, %f57;
381
+ mul.f32 %f65, %f35, %f57;
382
+ .loc 1 60 20
383
+ mul.f32 %f66, %f62, %f58;
384
+ mul.f32 %f67, %f63, %f59;
385
+ mul.f32 %f68, %f64, %f60;
386
+ mul.f32 %f69, %f65, %f61;
387
+ .loc 1 62 35
388
+ shl.b32 %r105, %r104, 8;
389
+ .loc 1 62 31
390
+ or.b32 %r106, %r105, %r103;
391
+ .loc 1 62 25
392
+ mul.wide.s32 %rd46, %r106, 4;
393
+ add.s64 %rd37, %rd6, %rd46;
394
+ .loc 1 39 18
395
+ mov.b32 %r63, %f6;
396
+ mov.b32 %r66, %f12;
397
+ .loc 1 62 47
398
+ @%p18 st.global.v4.b32 [ %rd37 + 0 ], { %r63, %r64, %r65, %r66 };
399
+ .loc 1 63 4
400
+ bar.sync 0;
401
+ .loc 1 64 28
402
+ shl.b64 %rd47, %rd1, 2;
403
+ add.s64 %rd38, %rd4, %rd47;
404
+ .loc 1 64 40
405
+ setp.eq.s32 %p30, %r2, 0;
406
+ mov.b32 %r67, %f57;
407
+ @%p30 st.global.b32 [ %rd38 + 0 ], { %r67 };
408
+ .loc 1 65 25
409
+ mul.wide.s32 %rd48, %r106, 2;
410
+ add.s64 %rd39, %rd8, %rd48;
411
+ .loc 1 65 48
412
+ mov.b32 %r68, %f66;
413
+ cvt.rn.bf16.f32 %rs1, %r68;
414
+ mov.b32 %r69, %f67;
415
+ cvt.rn.bf16.f32 %rs2, %r69;
416
+ mov.b32 %r70, %f68;
417
+ cvt.rn.bf16.f32 %rs3, %r70;
418
+ mov.b32 %r71, %f69;
419
+ cvt.rn.bf16.f32 %rs4, %r71;
420
+ mov.b32 %r107, {%rs1, %rs2};
421
+ mov.b32 %r108, {%rs3, %rs4};
422
+ @%p18 st.global.v2.b32 [ %rd39 + 0 ], { %r107, %r108 };
423
+ .loc 1 66 25
424
+ add.s64 %rd40, %rd7, %rd47;
425
+ .loc 1 66 37
426
+ @%p30 st.global.b32 [ %rd40 + 0 ], { %r74 };
427
+ .loc 1 66 4
428
+ ret;
429
+ $L__tmp34:
430
+ $L__func_end0:
431
+
432
+ }
433
+ // .globl __nv_rsqrtf
434
+ .visible .func (.param .b32 func_retval0) __nv_rsqrtf(
435
+ .param .b32 __nv_rsqrtf_param_0
436
+ )
437
+ {
438
+ .reg .f32 %f<3>;
439
+ $L__func_begin1:
440
+
441
+ ld.param.f32 %f1, [__nv_rsqrtf_param_0];
442
+ rsqrt.approx.ftz.f32 %f2, %f1;
443
+ st.param.f32 [func_retval0+0], %f2;
444
+ ret;
445
+ $L__func_end1:
446
+
447
+ }
448
+ .file 1 "/tmp/torchinductor_root/pd/cpdqiwgwgnzx7tsvbieui7kffx5dt43uhgvg7z7egekxcsybpv34.py"
449
+ .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
450
+ .file 3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
451
+ .section .debug_abbrev
452
+ {
453
+ .b8 1
454
+ .b8 17
455
+ .b8 1
456
+ .b8 37
457
+ .b8 8
458
+ .b8 19
459
+ .b8 5
460
+ .b8 3
461
+ .b8 8
462
+ .b8 16
463
+ .b8 6
464
+ .b8 27
465
+ .b8 8
466
+ .b8 180
467
+ .b8 66
468
+ .b8 12
469
+ .b8 17
470
+ .b8 1
471
+ .b8 18
472
+ .b8 1
473
+ .b8 0
474
+ .b8 0
475
+ .b8 2
476
+ .b8 46
477
+ .b8 0
478
+ .b8 135
479
+ .b8 64
480
+ .b8 8
481
+ .b8 3
482
+ .b8 8
483
+ .b8 58
484
+ .b8 11
485
+ .b8 59
486
+ .b8 11
487
+ .b8 63
488
+ .b8 12
489
+ .b8 32
490
+ .b8 11
491
+ .b8 0
492
+ .b8 0
493
+ .b8 3
494
+ .b8 46
495
+ .b8 1
496
+ .b8 17
497
+ .b8 1
498
+ .b8 18
499
+ .b8 1
500
+ .b8 64
501
+ .b8 10
502
+ .b8 49
503
+ .b8 19
504
+ .b8 0
505
+ .b8 0
506
+ .b8 4
507
+ .b8 29
508
+ .b8 1
509
+ .b8 49
510
+ .b8 19
511
+ .b8 17
512
+ .b8 1
513
+ .b8 18
514
+ .b8 1
515
+ .b8 88
516
+ .b8 11
517
+ .b8 89
518
+ .b8 11
519
+ .b8 87
520
+ .b8 11
521
+ .b8 0
522
+ .b8 0
523
+ .b8 5
524
+ .b8 29
525
+ .b8 0
526
+ .b8 49
527
+ .b8 19
528
+ .b8 17
529
+ .b8 1
530
+ .b8 18
531
+ .b8 1
532
+ .b8 88
533
+ .b8 11
534
+ .b8 89
535
+ .b8 11
536
+ .b8 87
537
+ .b8 11
538
+ .b8 0
539
+ .b8 0
540
+ .b8 0
541
+ }
542
+ .section .debug_info
543
+ {
544
+ .b32 407
545
+ .b8 2
546
+ .b8 0
547
+ .b32 .debug_abbrev
548
+ .b8 8
549
+ .b8 1
550
+ .b8 116
551
+ .b8 114
552
+ .b8 105
553
+ .b8 116
554
+ .b8 111
555
+ .b8 110
556
+ .b8 0
557
+ .b8 2
558
+ .b8 0
559
+ .b8 99
560
+ .b8 112
561
+ .b8 100
562
+ .b8 113
563
+ .b8 105
564
+ .b8 119
565
+ .b8 103
566
+ .b8 119
567
+ .b8 103
568
+ .b8 110
569
+ .b8 122
570
+ .b8 120
571
+ .b8 55
572
+ .b8 116
573
+ .b8 115
574
+ .b8 118
575
+ .b8 98
576
+ .b8 105
577
+ .b8 101
578
+ .b8 117
579
+ .b8 105
580
+ .b8 55
581
+ .b8 107
582
+ .b8 102
583
+ .b8 102
584
+ .b8 120
585
+ .b8 53
586
+ .b8 100
587
+ .b8 116
588
+ .b8 52
589
+ .b8 51
590
+ .b8 117
591
+ .b8 104
592
+ .b8 103
593
+ .b8 118
594
+ .b8 103
595
+ .b8 55
596
+ .b8 122
597
+ .b8 55
598
+ .b8 101
599
+ .b8 103
600
+ .b8 101
601
+ .b8 107
602
+ .b8 120
603
+ .b8 99
604
+ .b8 115
605
+ .b8 121
606
+ .b8 98
607
+ .b8 112
608
+ .b8 118
609
+ .b8 51
610
+ .b8 52
611
+ .b8 46
612
+ .b8 112
613
+ .b8 121
614
+ .b8 0
615
+ .b32 .debug_line
616
+ .b8 47
617
+ .b8 116
618
+ .b8 109
619
+ .b8 112
620
+ .b8 47
621
+ .b8 116
622
+ .b8 111
623
+ .b8 114
624
+ .b8 99
625
+ .b8 104
626
+ .b8 105
627
+ .b8 110
628
+ .b8 100
629
+ .b8 117
630
+ .b8 99
631
+ .b8 116
632
+ .b8 111
633
+ .b8 114
634
+ .b8 95
635
+ .b8 114
636
+ .b8 111
637
+ .b8 111
638
+ .b8 116
639
+ .b8 47
640
+ .b8 112
641
+ .b8 100
642
+ .b8 0
643
+ .b8 1
644
+ .b64 $L__func_begin0
645
+ .b64 $L__func_end0
646
+ .b8 2
647
+ .b8 116
648
+ .b8 114
649
+ .b8 105
650
+ .b8 116
651
+ .b8 111
652
+ .b8 110
653
+ .b8 95
654
+ .b8 95
655
+ .b8 48
656
+ .b8 100
657
+ .b8 49
658
+ .b8 100
659
+ .b8 50
660
+ .b8 100
661
+ .b8 51
662
+ .b8 100
663
+ .b8 52
664
+ .b8 100
665
+ .b8 53
666
+ .b8 100
667
+ .b8 54
668
+ .b8 100
669
+ .b8 55
670
+ .b8 100
671
+ .b8 56
672
+ .b8 100
673
+ .b8 101
674
+ .b8 57
675
+ .b8 100
676
+ .b8 101
677
+ .b8 0
678
+ .b8 116
679
+ .b8 114
680
+ .b8 105
681
+ .b8 116
682
+ .b8 111
683
+ .b8 110
684
+ .b8 95
685
+ .b8 95
686
+ .b8 48
687
+ .b8 100
688
+ .b8 49
689
+ .b8 100
690
+ .b8 50
691
+ .b8 100
692
+ .b8 51
693
+ .b8 100
694
+ .b8 52
695
+ .b8 100
696
+ .b8 53
697
+ .b8 100
698
+ .b8 54
699
+ .b8 100
700
+ .b8 55
701
+ .b8 100
702
+ .b8 56
703
+ .b8 100
704
+ .b8 101
705
+ .b8 57
706
+ .b8 100
707
+ .b8 101
708
+ .b8 0
709
+ .b8 1
710
+ .b8 18
711
+ .b8 1
712
+ .b8 1
713
+ .b8 3
714
+ .b64 $L__func_begin0
715
+ .b64 $L__func_end0
716
+ .b8 1
717
+ .b8 156
718
+ .b32 125
719
+ .b8 4
720
+ .b32 125
721
+ .b64 $L__tmp1
722
+ .b64 $L__tmp14
723
+ .b8 2
724
+ .b8 44
725
+ .b8 59
726
+ .b8 5
727
+ .b32 125
728
+ .b64 $L__tmp1
729
+ .b64 $L__tmp14
730
+ .b8 2
731
+ .b8 243
732
+ .b8 36
733
+ .b8 0
734
+ .b8 5
735
+ .b32 125
736
+ .b64 $L__tmp2
737
+ .b64 $L__tmp15
738
+ .b8 2
739
+ .b8 44
740
+ .b8 59
741
+ .b8 5
742
+ .b32 125
743
+ .b64 $L__tmp15
744
+ .b64 $L__tmp16
745
+ .b8 3
746
+ .b8 44
747
+ .b8 45
748
+ .b8 5
749
+ .b32 125
750
+ .b64 $L__tmp17
751
+ .b64 $L__tmp32
752
+ .b8 2
753
+ .b8 52
754
+ .b8 59
755
+ .b8 4
756
+ .b32 125
757
+ .b64 $L__tmp18
758
+ .b64 $L__tmp31
759
+ .b8 2
760
+ .b8 52
761
+ .b8 59
762
+ .b8 5
763
+ .b32 125
764
+ .b64 $L__tmp18
765
+ .b64 $L__tmp31
766
+ .b8 2
767
+ .b8 243
768
+ .b8 36
769
+ .b8 0
770
+ .b8 5
771
+ .b32 125
772
+ .b64 $L__tmp32
773
+ .b64 $L__tmp33
774
+ .b8 3
775
+ .b8 52
776
+ .b8 45
777
+ .b8 0
778
+ .b8 0
779
+ }
780
+ .section .debug_pubnames
781
+ {
782
+ .b32 $L__pubNames_end0-$L__pubNames_start0
783
+ $L__pubNames_start0:
784
+ .b8 2
785
+ .b8 0
786
+ .b32 .debug_info
787
+ .b32 411
788
+ .b32 125
789
+ .b8 116
790
+ .b8 114
791
+ .b8 105
792
+ .b8 116
793
+ .b8 111
794
+ .b8 110
795
+ .b8 95
796
+ .b8 95
797
+ .b8 48
798
+ .b8 100
799
+ .b8 49
800
+ .b8 100
801
+ .b8 50
802
+ .b8 100
803
+ .b8 51
804
+ .b8 100
805
+ .b8 52
806
+ .b8 100
807
+ .b8 53
808
+ .b8 100
809
+ .b8 54
810
+ .b8 100
811
+ .b8 55
812
+ .b8 100
813
+ .b8 56
814
+ .b8 100
815
+ .b8 101
816
+ .b8 57
817
+ .b8 100
818
+ .b8 101
819
+ .b8 0
820
+ .b32 0
821
+ $L__pubNames_end0:
822
+ }
823
+ .section .debug_pubtypes
824
+ {
825
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
826
+ $L__pubTypes_start0:
827
+ .b8 2
828
+ .b8 0
829
+ .b32 .debug_info
830
+ .b32 411
831
+ .b32 0
832
+ $L__pubTypes_end0:
833
+ }
834
+ .section .debug_loc { }
.triton/dump/645565eaba0a18dd23ef200fe9abb0c0/triton_.ttgir ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
2
+ #blocked1 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
3
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
4
+ tt.func public @triton__0d1d2d3d4d5d6d7d8de9de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg7: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg9: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
5
+ %cst = arith.constant dense<256> : tensor<256xi32, #blocked>
6
+ %cst_0 = arith.constant dense<0> : tensor<1xi64, #blocked>
7
+ %cst_1 = arith.constant dense<50257> : tensor<1xi64, #blocked>
8
+ %cst_2 = arith.constant dense<256> : tensor<1xi64, #blocked>
9
+ %cst_3 = arith.constant 9.99999974E-6 : f32
10
+ %cst_4 = arith.constant 2.560000e+02 : f32
11
+ %cst_5 = arith.constant 0.000000e+00 : f32
12
+ %c256_i32 = arith.constant 256 : i32
13
+ %c512_i32 = arith.constant 512 : i32
14
+ %cst_6 = arith.constant dense<50257> : tensor<1xi64, #blocked1>
15
+ %cst_7 = arith.constant dense<0> : tensor<1xi64, #blocked1>
16
+ %cst_8 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked>
17
+ %0 = tt.get_program_id x : i32
18
+ %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked>
19
+ %2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked>
20
+ %3 = arith.remsi %0, %c512_i32 : i32
21
+ %4 = tt.addptr %arg1, %0 : !tt.ptr<i64, 1>, i32
22
+ %5 = tt.splat %4 : (!tt.ptr<i64, 1>) -> tensor<1x!tt.ptr<i64, 1>, #blocked>
23
+ %6 = tt.splat %4 : (!tt.ptr<i64, 1>) -> tensor<1x!tt.ptr<i64, 1>, #blocked1>
24
+ %7 = tt.load %5 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xi64, #blocked>
25
+ %8 = tt.load %6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xi64, #blocked1>
26
+ %9 = arith.muli %3, %c256_i32 : i32
27
+ %10 = tt.splat %9 : (i32) -> tensor<256xi32, #blocked>
28
+ %11 = arith.addi %1, %10 : tensor<256xi32, #blocked>
29
+ %12 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
30
+ %13 = tt.addptr %12, %11 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
31
+ %14 = tt.load %13, %2, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked>
32
+ %15 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
33
+ %16 = tt.addptr %15, %1 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
34
+ %17 = tt.load %16, %2, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked>
35
+ %18 = arith.addi %7, %cst_1 : tensor<1xi64, #blocked>
36
+ %19 = arith.addi %8, %cst_6 : tensor<1xi64, #blocked1>
37
+ %20 = arith.cmpi slt, %7, %cst_0 : tensor<1xi64, #blocked>
38
+ %21 = arith.cmpi slt, %8, %cst_7 : tensor<1xi64, #blocked1>
39
+ %22 = arith.select %20, %18, %7 : tensor<1xi1, #blocked>, tensor<1xi64, #blocked>
40
+ %23 = arith.select %21, %19, %8 : tensor<1xi1, #blocked1>, tensor<1xi64, #blocked1>
41
+ %24 = arith.cmpi sge, %23, %cst_7 : tensor<1xi64, #blocked1>
42
+ %25 = arith.cmpi slt, %23, %cst_6 : tensor<1xi64, #blocked1>
43
+ %26 = arith.andi %24, %25 : tensor<1xi1, #blocked1>
44
+ tt.assert %26, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<1xi1, #blocked1>
45
+ %27 = arith.muli %22, %cst_2 : tensor<1xi64, #blocked>
46
+ %28 = tt.broadcast %27 : (tensor<1xi64, #blocked>) -> tensor<256xi64, #blocked>
47
+ %29 = arith.extsi %1 : tensor<256xi32, #blocked> to tensor<256xi64, #blocked>
48
+ %30 = arith.addi %29, %28 : tensor<256xi64, #blocked>
49
+ %31 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
50
+ %32 = tt.addptr %31, %30 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi64, #blocked>
51
+ %33 = tt.load %32, %2, %cst_8 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
52
+ %34 = arith.addf %33, %14 : tensor<256xf32, #blocked>
53
+ %35 = arith.select %2, %34, %cst_8 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
54
+ %36 = "tt.reduce"(%35) <{axis = 0 : i32}> ({
55
+ ^bb0(%arg10: f32, %arg11: f32):
56
+ %65 = arith.addf %arg10, %arg11 : f32
57
+ tt.reduce.return %65 : f32
58
+ }) : (tensor<256xf32, #blocked>) -> f32
59
+ %37 = arith.addf %36, %cst_5 : f32
60
+ %38 = arith.divf %37, %cst_4 : f32
61
+ %39 = tt.splat %38 : (f32) -> tensor<1xf32, #blocked1>
62
+ %40 = tt.splat %38 : (f32) -> tensor<256xf32, #blocked>
63
+ %41 = arith.subf %34, %40 : tensor<256xf32, #blocked>
64
+ %42 = arith.mulf %41, %41 : tensor<256xf32, #blocked>
65
+ %43 = arith.select %2, %42, %cst_8 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
66
+ %44 = "tt.reduce"(%43) <{axis = 0 : i32}> ({
67
+ ^bb0(%arg10: f32, %arg11: f32):
68
+ %65 = arith.addf %arg10, %arg11 : f32
69
+ tt.reduce.return %65 : f32
70
+ }) : (tensor<256xf32, #blocked>) -> f32
71
+ %45 = arith.addf %44, %cst_5 : f32
72
+ %46 = arith.divf %45, %cst_4 : f32
73
+ %47 = arith.addf %46, %cst_3 : f32
74
+ %48 = tt.extern_elementwise %47 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
75
+ %49 = tt.splat %48 : (f32) -> tensor<1xf32, #blocked1>
76
+ %50 = tt.splat %48 : (f32) -> tensor<256xf32, #blocked>
77
+ %51 = arith.mulf %41, %50 : tensor<256xf32, #blocked>
78
+ %52 = arith.mulf %51, %17 : tensor<256xf32, #blocked>
79
+ %53 = arith.muli %0, %c256_i32 : i32
80
+ %54 = tt.splat %53 : (i32) -> tensor<256xi32, #blocked>
81
+ %55 = arith.addi %1, %54 : tensor<256xi32, #blocked>
82
+ %56 = tt.splat %arg5 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
83
+ %57 = tt.addptr %56, %55 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
84
+ tt.store %57, %34, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked>
85
+ gpu.barrier
86
+ %58 = tt.addptr %arg0, %0 : !tt.ptr<f32, 1>, i32
87
+ %59 = tt.splat %58 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked1>
88
+ tt.store %59, %49 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32, #blocked1>
89
+ %60 = tt.splat %arg7 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
90
+ %61 = tt.addptr %60, %55 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
91
+ %62 = arith.truncf %52 : tensor<256xf32, #blocked> to tensor<256xbf16, #blocked>
92
+ tt.store %61, %62, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16, #blocked>
93
+ %63 = tt.addptr %arg6, %0 : !tt.ptr<f32, 1>, i32
94
+ %64 = tt.splat %63 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked1>
95
+ tt.store %64, %39 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32, #blocked1>
96
+ tt.return
97
+ }
98
+ }
.triton/dump/7dc5bb3e5c2bb99527fff34c6fba7810/triton_.llir ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ define void @triton__0d1de(ptr addrspace(1) %0, i32 %1) local_unnamed_addr !dbg !5 {
5
+ %3 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
6
+ %4 = and i32 %3, 127, !dbg !8
7
+ %5 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9
8
+ %6 = shl i32 %5, 7, !dbg !10
9
+ %7 = or i32 %6, %4, !dbg !11
10
+ %8 = icmp slt i32 %7, 512, !dbg !12
11
+ %9 = sext i32 %7 to i64, !dbg !13
12
+ %10 = getelementptr i64, ptr addrspace(1) %0, i64 %9, !dbg !13
13
+ tail call void asm sideeffect "@$2 st.global.b64 [ $1 + 0 ], { $0 };", "l,l,b"(i64 %9, ptr addrspace(1) %10, i1 %8) #1, !dbg !14
14
+ ret void, !dbg !15
15
+ }
16
+
17
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
18
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
19
+
20
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
21
+ attributes #1 = { nounwind }
22
+
23
+ !llvm.module.flags = !{!0}
24
+ !llvm.dbg.cu = !{!1}
25
+ !nvvm.annotations = !{!3, !4, !4, !3}
26
+
27
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
28
+ !1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
29
+ !2 = !DIFile(filename: "cwxxgxdevnyc453z7hh4nxzgmvlhh6suwokktps3dw62btskgxt4.py", directory: "/tmp/torchinductor_root/wx")
30
+ !3 = !{ptr @triton__0d1de, !"kernel", i32 1}
31
+ !4 = !{ptr @triton__0d1de, !"maxntidx", i32 128}
32
+ !5 = distinct !DISubprogram(name: "triton__0d1de", linkageName: "triton__0d1de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
33
+ !6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
34
+ !7 = !{}
35
+ !8 = !DILocation(line: 21, column: 36, scope: !5)
36
+ !9 = !DILocation(line: 20, column: 28, scope: !5)
37
+ !10 = !DILocation(line: 20, column: 33, scope: !5)
38
+ !11 = !DILocation(line: 21, column: 23, scope: !5)
39
+ !12 = !DILocation(line: 22, column: 21, scope: !5)
40
+ !13 = !DILocation(line: 25, column: 25, scope: !5)
41
+ !14 = !DILocation(line: 25, column: 36, scope: !5)
42
+ !15 = !DILocation(line: 25, column: 4, scope: !5)
.triton/dump/7dc5bb3e5c2bb99527fff34c6fba7810/triton_.ttir ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %cst = arith.constant dense<512> : tensor<128xi32>
4
+ %c128_i32 = arith.constant 128 : i32
5
+ %0 = tt.get_program_id x : i32
6
+ %1 = arith.muli %0, %c128_i32 : i32
7
+ %2 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32>
8
+ %3 = tt.splat %1 : (i32) -> tensor<128xi32>
9
+ %4 = arith.addi %3, %2 : tensor<128xi32>
10
+ %5 = arith.cmpi slt, %4, %cst : tensor<128xi32>
11
+ %6 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<128x!tt.ptr<i64, 1>>
12
+ %7 = tt.addptr %6, %4 : tensor<128x!tt.ptr<i64, 1>>, tensor<128xi32>
13
+ %8 = arith.extsi %4 : tensor<128xi32> to tensor<128xi64>
14
+ tt.store %7, %8, %5 {cache = 1 : i32, evict = 1 : i32} : tensor<128xi64>
15
+ tt.return
16
+ }
17
+ }
.triton/dump/99f0a4c15ca0aab38ccdae6c765f7333/triton_.cubin ADDED
Binary file (15.2 kB). View file
 
.triton/dump/99f0a4c15ca0aab38ccdae6c765f7333/triton_.llir ADDED
@@ -0,0 +1,333 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @global_smem = external addrspace(3) global [0 x i8]
5
+ @.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
6
+
7
+ define void @triton__0d1d2d3d4d5d6de7de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, i32 %6, i32 %7) local_unnamed_addr !dbg !7 {
8
+ %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
9
+ %10 = and i32 %9, 31, !dbg !10
10
+ %11 = lshr i32 %9, 5, !dbg !10
11
+ %12 = and i32 %11, 1, !dbg !10
12
+ %urem = shl i32 %9, 2, !dbg !10
13
+ %13 = and i32 %urem, 252, !dbg !10
14
+ %14 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !11
15
+ %15 = shl i32 %14, 8, !dbg !12
16
+ %16 = or i32 %15, %13, !dbg !13
17
+ %17 = sext i32 %16 to i64, !dbg !14
18
+ %18 = getelementptr float, ptr addrspace(1) %0, i64 %17, !dbg !14
19
+ %19 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %18, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !15
20
+ %20 = extractvalue { i32, i32, i32, i32 } %19, 0, !dbg !15
21
+ %21 = extractvalue { i32, i32, i32, i32 } %19, 1, !dbg !15
22
+ %22 = extractvalue { i32, i32, i32, i32 } %19, 2, !dbg !15
23
+ %23 = extractvalue { i32, i32, i32, i32 } %19, 3, !dbg !15
24
+ %24 = bitcast i32 %22 to float, !dbg !15
25
+ %25 = bitcast i32 %23 to float, !dbg !15
26
+ %26 = getelementptr i16, ptr addrspace(1) %1, i64 %17, !dbg !16
27
+ %27 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %26, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !17
28
+ %28 = extractvalue { i32, i32 } %27, 0, !dbg !17
29
+ %29 = extractvalue { i32, i32 } %27, 1, !dbg !17
30
+ %30 = trunc i32 %28 to i16, !dbg !17
31
+ %extelt.offset = lshr i32 %28, 16, !dbg !17
32
+ %31 = trunc i32 %extelt.offset to i16, !dbg !17
33
+ %32 = trunc i32 %29 to i16, !dbg !17
34
+ %extelt.offset1 = lshr i32 %29, 16, !dbg !17
35
+ %33 = trunc i32 %extelt.offset1 to i16, !dbg !17
36
+ %34 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %30) #6, !dbg !18
37
+ %35 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %31) #6, !dbg !18
38
+ %36 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %32) #6, !dbg !18
39
+ %37 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %33) #6, !dbg !18
40
+ %38 = getelementptr i16, ptr addrspace(1) %2, i64 %17, !dbg !19
41
+ %39 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %38, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !20
42
+ %40 = extractvalue { i32, i32 } %39, 0, !dbg !20
43
+ %41 = extractvalue { i32, i32 } %39, 1, !dbg !20
44
+ %42 = trunc i32 %40 to i16, !dbg !20
45
+ %extelt.offset2 = lshr i32 %40, 16, !dbg !20
46
+ %43 = trunc i32 %extelt.offset2 to i16, !dbg !20
47
+ %44 = trunc i32 %41 to i16, !dbg !20
48
+ %extelt.offset3 = lshr i32 %41, 16, !dbg !20
49
+ %45 = trunc i32 %extelt.offset3 to i16, !dbg !20
50
+ %46 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %42) #6, !dbg !21
51
+ %47 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %43) #6, !dbg !21
52
+ %48 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %44) #6, !dbg !21
53
+ %49 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %45) #6, !dbg !21
54
+ %50 = getelementptr i16, ptr addrspace(1) %3, i64 %17, !dbg !22
55
+ %51 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %50, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !23
56
+ %52 = extractvalue { i32, i32 } %51, 0, !dbg !23
57
+ %53 = extractvalue { i32, i32 } %51, 1, !dbg !23
58
+ %54 = trunc i32 %52 to i16, !dbg !23
59
+ %extelt.offset4 = lshr i32 %52, 16, !dbg !23
60
+ %55 = trunc i32 %extelt.offset4 to i16, !dbg !23
61
+ %56 = trunc i32 %53 to i16, !dbg !23
62
+ %extelt.offset5 = lshr i32 %53, 16, !dbg !23
63
+ %57 = trunc i32 %extelt.offset5 to i16, !dbg !23
64
+ %58 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %54) #6, !dbg !24
65
+ %59 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %55) #6, !dbg !24
66
+ %60 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %56) #6, !dbg !24
67
+ %61 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %57) #6, !dbg !24
68
+ %62 = zext nneg i32 %13 to i64, !dbg !25
69
+ %63 = getelementptr float, ptr addrspace(1) %4, i64 %62, !dbg !25
70
+ %64 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %63, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !26
71
+ %65 = fadd float %36, %24, !dbg !27
72
+ %66 = fadd float %37, %25, !dbg !27
73
+ %67 = fadd float %65, %48, !dbg !28
74
+ %68 = fadd float %66, %49, !dbg !28
75
+ %69 = insertelement <2 x i32> poison, i32 %20, i64 0, !dbg !15
76
+ %70 = insertelement <2 x i32> %69, i32 %21, i64 1, !dbg !15
77
+ %71 = bitcast <2 x i32> %70 to <2 x float>, !dbg !15
78
+ %72 = insertelement <2 x float> poison, float %34, i64 0, !dbg !27
79
+ %73 = insertelement <2 x float> %72, float %35, i64 1, !dbg !27
80
+ %74 = fadd <2 x float> %73, %71, !dbg !27
81
+ %75 = insertelement <2 x float> poison, float %46, i64 0, !dbg !28
82
+ %76 = insertelement <2 x float> %75, float %47, i64 1, !dbg !28
83
+ %77 = fadd <2 x float> %74, %76, !dbg !28
84
+ %78 = insertelement <2 x float> poison, float %58, i64 0, !dbg !29
85
+ %79 = insertelement <2 x float> %78, float %59, i64 1, !dbg !29
86
+ %80 = fadd <2 x float> %77, %79, !dbg !29
87
+ %81 = fadd float %67, %60, !dbg !29
88
+ %82 = fadd float %68, %61, !dbg !29
89
+ %83 = extractelement <2 x float> %80, i64 0, !dbg !30
90
+ %84 = extractelement <2 x float> %80, i64 1, !dbg !30
91
+ %85 = fadd float %83, %84, !dbg !30
92
+ %86 = fadd float %85, %81, !dbg !30
93
+ %87 = fadd float %86, %82, !dbg !30
94
+ %88 = bitcast float %87 to i32, !dbg !36
95
+ %89 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %88, i32 16, i32 31), !dbg !36
96
+ %90 = bitcast i32 %89 to float, !dbg !36
97
+ %91 = fadd float %87, %90, !dbg !30
98
+ %92 = bitcast float %91 to i32, !dbg !36
99
+ %93 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %92, i32 8, i32 31), !dbg !36
100
+ %94 = bitcast i32 %93 to float, !dbg !36
101
+ %95 = fadd float %91, %94, !dbg !30
102
+ %96 = bitcast float %95 to i32, !dbg !36
103
+ %97 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %96, i32 4, i32 31), !dbg !36
104
+ %98 = bitcast i32 %97 to float, !dbg !36
105
+ %99 = fadd float %95, %98, !dbg !30
106
+ %100 = bitcast float %99 to i32, !dbg !36
107
+ %101 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %100, i32 2, i32 31), !dbg !36
108
+ %102 = bitcast i32 %101 to float, !dbg !36
109
+ %103 = fadd float %99, %102, !dbg !30
110
+ %104 = bitcast float %103 to i32, !dbg !36
111
+ %105 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %104, i32 1, i32 31), !dbg !36
112
+ %106 = bitcast i32 %105 to float, !dbg !36
113
+ %107 = fadd float %103, %106, !dbg !30
114
+ %108 = icmp eq i32 %10, 0, !dbg !36
115
+ %109 = zext nneg i32 %12 to i64, !dbg !36
116
+ %110 = getelementptr float, ptr addrspace(3) @global_smem, i64 %109, !dbg !36
117
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %110, float %107, i1 %108) #6, !dbg !36
118
+ tail call void @llvm.nvvm.barrier0(), !dbg !36
119
+ %111 = icmp slt i32 %9, 2, !dbg !36
120
+ %112 = sext i32 %9 to i64, !dbg !36
121
+ %113 = getelementptr float, ptr addrspace(3) @global_smem, i64 %112, !dbg !36
122
+ %114 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %113, i1 %111) #6, !dbg !36
123
+ %115 = bitcast float %114 to i32, !dbg !36
124
+ %116 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %115, i32 1, i32 31), !dbg !36
125
+ %117 = bitcast i32 %116 to float, !dbg !36
126
+ %118 = fadd float %114, %117, !dbg !30
127
+ %119 = and i32 %9, 1, !dbg !36
128
+ %120 = icmp eq i32 %119, 0, !dbg !36
129
+ %121 = and i1 %111, %120, !dbg !36
130
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %113, float %118, i1 %121) #6, !dbg !36
131
+ tail call void @llvm.nvvm.barrier0(), !dbg !36
132
+ %122 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !36
133
+ %123 = fadd float %122, 0.000000e+00, !dbg !38
134
+ %124 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %123, float 2.560000e+02) #6, !dbg !42
135
+ %125 = fsub float %83, %124, !dbg !43
136
+ %126 = fsub float %84, %124, !dbg !43
137
+ %127 = fsub float %81, %124, !dbg !43
138
+ %128 = fsub float %82, %124, !dbg !43
139
+ %129 = fmul float %125, %125, !dbg !44
140
+ %130 = fmul float %126, %126, !dbg !44
141
+ %131 = fmul float %127, %127, !dbg !44
142
+ %132 = fmul float %128, %128, !dbg !44
143
+ tail call void @llvm.nvvm.barrier0(), !dbg !45
144
+ %133 = fadd float %129, %130, !dbg !47
145
+ %134 = fadd float %131, %133, !dbg !47
146
+ %135 = fadd float %132, %134, !dbg !47
147
+ %136 = bitcast float %135 to i32, !dbg !45
148
+ %137 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %136, i32 16, i32 31), !dbg !45
149
+ %138 = bitcast i32 %137 to float, !dbg !45
150
+ %139 = fadd float %135, %138, !dbg !47
151
+ %140 = bitcast float %139 to i32, !dbg !45
152
+ %141 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %140, i32 8, i32 31), !dbg !45
153
+ %142 = bitcast i32 %141 to float, !dbg !45
154
+ %143 = fadd float %139, %142, !dbg !47
155
+ %144 = bitcast float %143 to i32, !dbg !45
156
+ %145 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %144, i32 4, i32 31), !dbg !45
157
+ %146 = bitcast i32 %145 to float, !dbg !45
158
+ %147 = fadd float %143, %146, !dbg !47
159
+ %148 = bitcast float %147 to i32, !dbg !45
160
+ %149 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %148, i32 2, i32 31), !dbg !45
161
+ %150 = bitcast i32 %149 to float, !dbg !45
162
+ %151 = fadd float %147, %150, !dbg !47
163
+ %152 = bitcast float %151 to i32, !dbg !45
164
+ %153 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %152, i32 1, i32 31), !dbg !45
165
+ %154 = bitcast i32 %153 to float, !dbg !45
166
+ %155 = fadd float %151, %154, !dbg !47
167
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %110, float %155, i1 %108) #6, !dbg !45
168
+ tail call void @llvm.nvvm.barrier0(), !dbg !45
169
+ %156 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %113, i1 %111) #6, !dbg !45
170
+ %157 = bitcast float %156 to i32, !dbg !45
171
+ %158 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %157, i32 1, i32 31), !dbg !45
172
+ %159 = bitcast i32 %158 to float, !dbg !45
173
+ %160 = fadd float %156, %159, !dbg !47
174
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %113, float %160, i1 %121) #6, !dbg !45
175
+ tail call void @llvm.nvvm.barrier0(), !dbg !45
176
+ %161 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !45
177
+ %162 = fadd float %161, 0.000000e+00, !dbg !50
178
+ %163 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %162, float 2.560000e+02) #6, !dbg !52
179
+ %164 = fadd float %163, 0x3EE4F8B580000000, !dbg !53
180
+ %165 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !54
181
+ %.not.i = icmp eq i32 %165, 0, !dbg !54
182
+ br i1 %.not.i, label %168, label %166, !dbg !54
183
+
184
+ 166: ; preds = %8
185
+ %167 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %164), !dbg !54
186
+ br label %__nv_rsqrtf.exit, !dbg !54
187
+
188
+ 168: ; preds = %8
189
+ %169 = tail call float @llvm.nvvm.rsqrt.approx.f(float %164), !dbg !54
190
+ br label %__nv_rsqrtf.exit, !dbg !54
191
+
192
+ __nv_rsqrtf.exit: ; preds = %166, %168
193
+ %.0.i = phi float [ %167, %166 ], [ %169, %168 ], !dbg !54
194
+ %170 = extractvalue { i32, i32, i32, i32 } %64, 3, !dbg !26
195
+ %171 = bitcast i32 %170 to float, !dbg !26
196
+ %172 = extractvalue { i32, i32, i32, i32 } %64, 2, !dbg !26
197
+ %173 = bitcast i32 %172 to float, !dbg !26
198
+ %174 = extractvalue { i32, i32, i32, i32 } %64, 1, !dbg !26
199
+ %175 = bitcast i32 %174 to float, !dbg !26
200
+ %176 = extractvalue { i32, i32, i32, i32 } %64, 0, !dbg !26
201
+ %177 = bitcast i32 %176 to float, !dbg !26
202
+ %178 = fmul float %125, %.0.i, !dbg !55
203
+ %179 = fmul float %126, %.0.i, !dbg !55
204
+ %180 = fmul float %127, %.0.i, !dbg !55
205
+ %181 = fmul float %128, %.0.i, !dbg !55
206
+ %182 = fmul float %178, %177, !dbg !56
207
+ %183 = fmul float %179, %175, !dbg !56
208
+ %184 = fmul float %180, %173, !dbg !56
209
+ %185 = fmul float %181, %171, !dbg !56
210
+ %186 = getelementptr i16, ptr addrspace(1) %5, i64 %17, !dbg !57
211
+ %187 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %182) #6, !dbg !58
212
+ %188 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %183) #6, !dbg !58
213
+ %189 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %184) #6, !dbg !58
214
+ %190 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %185) #6, !dbg !58
215
+ %191 = insertelement <2 x i16> undef, i16 %187, i64 0, !dbg !58
216
+ %192 = insertelement <2 x i16> %191, i16 %188, i64 1, !dbg !58
217
+ %193 = bitcast <2 x i16> %192 to i32, !dbg !58
218
+ %194 = insertelement <2 x i16> undef, i16 %189, i64 0, !dbg !58
219
+ %195 = insertelement <2 x i16> %194, i16 %190, i64 1, !dbg !58
220
+ %196 = bitcast <2 x i16> %195 to i32, !dbg !58
221
+ tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %193, i32 %196, ptr addrspace(1) %186, i1 true) #6, !dbg !58
222
+ ret void, !dbg !59
223
+ }
224
+
225
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
226
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
227
+
228
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
229
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
230
+
231
+ ; Function Attrs: convergent nocallback nounwind
232
+ declare void @llvm.nvvm.barrier0() #2
233
+
234
+ ; Function Attrs: alwaysinline nounwind
235
+ define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
236
+ %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
237
+ %.not = icmp eq i32 %1, 0
238
+ br i1 %.not, label %4, label %2
239
+
240
+ 2: ; preds = %0
241
+ %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
242
+ br label %6
243
+
244
+ 4: ; preds = %0
245
+ %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
246
+ br label %6
247
+
248
+ 6: ; preds = %4, %2
249
+ %.0 = phi float [ %3, %2 ], [ %5, %4 ]
250
+ ret float %.0
251
+ }
252
+
253
+ declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
254
+
255
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
256
+ declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
257
+
258
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
259
+ declare float @llvm.nvvm.rsqrt.approx.f(float) #5
260
+
261
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
262
+ attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
263
+ attributes #2 = { convergent nocallback nounwind }
264
+ attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
265
+ attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
266
+ attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
267
+ attributes #6 = { nounwind }
268
+
269
+ !llvm.module.flags = !{!0, !1}
270
+ !llvm.dbg.cu = !{!2}
271
+ !nvvm.annotations = !{!4, !5, !5, !4}
272
+ !llvm.ident = !{!6}
273
+
274
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
275
+ !1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
276
+ !2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
277
+ !3 = !DIFile(filename: "cpwl4wgyi5spzbgbswrqxfrxlyk2m76a4bakbp6l5ltopjbkjadt.py", directory: "/tmp/torchinductor_root/pw")
278
+ !4 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"kernel", i32 1}
279
+ !5 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"maxntidx", i32 64}
280
+ !6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
281
+ !7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6de7de", linkageName: "triton__0d1d2d3d4d5d6de7de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
282
+ !8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
283
+ !9 = !{}
284
+ !10 = !DILocation(line: 26, column: 26, scope: !7)
285
+ !11 = !DILocation(line: 23, column: 28, scope: !7)
286
+ !12 = !DILocation(line: 30, column: 40, scope: !7)
287
+ !13 = !DILocation(line: 30, column: 36, scope: !7)
288
+ !14 = !DILocation(line: 30, column: 30, scope: !7)
289
+ !15 = !DILocation(line: 30, column: 46, scope: !7)
290
+ !16 = !DILocation(line: 31, column: 30, scope: !7)
291
+ !17 = !DILocation(line: 31, column: 46, scope: !7)
292
+ !18 = !DILocation(line: 31, column: 67, scope: !7)
293
+ !19 = !DILocation(line: 32, column: 30, scope: !7)
294
+ !20 = !DILocation(line: 32, column: 46, scope: !7)
295
+ !21 = !DILocation(line: 32, column: 67, scope: !7)
296
+ !22 = !DILocation(line: 33, column: 30, scope: !7)
297
+ !23 = !DILocation(line: 33, column: 46, scope: !7)
298
+ !24 = !DILocation(line: 33, column: 67, scope: !7)
299
+ !25 = !DILocation(line: 34, column: 31, scope: !7)
300
+ !26 = !DILocation(line: 34, column: 36, scope: !7)
301
+ !27 = !DILocation(line: 36, column: 18, scope: !7)
302
+ !28 = !DILocation(line: 38, column: 18, scope: !7)
303
+ !29 = !DILocation(line: 40, column: 18, scope: !7)
304
+ !30 = !DILocation(line: 233, column: 15, scope: !31, inlinedAt: !34)
305
+ !31 = distinct !DILexicalBlockFile(scope: !33, file: !32, discriminator: 0)
306
+ !32 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
307
+ !33 = distinct !DILexicalBlockFile(scope: !7, file: !32, discriminator: 0)
308
+ !34 = !DILocation(line: 243, column: 36, scope: !31, inlinedAt: !35)
309
+ !35 = !DILocation(line: 45, column: 59, scope: !31)
310
+ !36 = !DILocation(line: 243, column: 36, scope: !33, inlinedAt: !37)
311
+ !37 = !DILocation(line: 45, column: 59, scope: !33)
312
+ !38 = !DILocation(line: 8, column: 15, scope: !39, inlinedAt: !41)
313
+ !39 = distinct !DILexicalBlockFile(scope: !7, file: !40, discriminator: 0)
314
+ !40 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
315
+ !41 = !DILocation(line: 45, column: 45, scope: !39)
316
+ !42 = !DILocation(line: 48, column: 20, scope: !7)
317
+ !43 = !DILocation(line: 49, column: 20, scope: !7)
318
+ !44 = !DILocation(line: 50, column: 20, scope: !7)
319
+ !45 = !DILocation(line: 243, column: 36, scope: !33, inlinedAt: !46)
320
+ !46 = !DILocation(line: 53, column: 59, scope: !33)
321
+ !47 = !DILocation(line: 233, column: 15, scope: !31, inlinedAt: !48)
322
+ !48 = !DILocation(line: 243, column: 36, scope: !31, inlinedAt: !49)
323
+ !49 = !DILocation(line: 53, column: 59, scope: !31)
324
+ !50 = !DILocation(line: 8, column: 15, scope: !39, inlinedAt: !51)
325
+ !51 = !DILocation(line: 53, column: 45, scope: !39)
326
+ !52 = !DILocation(line: 56, column: 20, scope: !7)
327
+ !53 = !DILocation(line: 58, column: 20, scope: !7)
328
+ !54 = !DILocation(line: 59, column: 26, scope: !7)
329
+ !55 = !DILocation(line: 60, column: 20, scope: !7)
330
+ !56 = !DILocation(line: 61, column: 20, scope: !7)
331
+ !57 = !DILocation(line: 63, column: 25, scope: !7)
332
+ !58 = !DILocation(line: 63, column: 48, scope: !7)
333
+ !59 = !DILocation(line: 63, column: 4, scope: !7)
.triton/dump/99f0a4c15ca0aab38ccdae6c765f7333/triton_.ttgir ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
2
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
3
+ tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
4
+ %cst = arith.constant dense<256> : tensor<256xi32, #blocked>
5
+ %cst_0 = arith.constant 9.99999974E-6 : f32
6
+ %cst_1 = arith.constant 2.560000e+02 : f32
7
+ %cst_2 = arith.constant 0.000000e+00 : f32
8
+ %c256_i32 = arith.constant 256 : i32
9
+ %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked>
10
+ %cst_4 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked>
11
+ %0 = tt.get_program_id x : i32
12
+ %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked>
13
+ %2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked>
14
+ %3 = arith.muli %0, %c256_i32 : i32
15
+ %4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked>
16
+ %5 = arith.addi %1, %4 : tensor<256xi32, #blocked>
17
+ %6 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
18
+ %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
19
+ %8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
20
+ %9 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
21
+ %10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
22
+ %11 = tt.load %10, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
23
+ %12 = arith.extf %11 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
24
+ %13 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
25
+ %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
26
+ %15 = tt.load %14, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
27
+ %16 = arith.extf %15 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
28
+ %17 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
29
+ %18 = tt.addptr %17, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
30
+ %19 = tt.load %18, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
31
+ %20 = arith.extf %19 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
32
+ %21 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
33
+ %22 = tt.addptr %21, %1 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
34
+ %23 = tt.load %22, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked>
35
+ %24 = arith.addf %8, %12 : tensor<256xf32, #blocked>
36
+ %25 = arith.addf %24, %16 : tensor<256xf32, #blocked>
37
+ %26 = arith.addf %25, %20 : tensor<256xf32, #blocked>
38
+ %27 = arith.select %2, %26, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
39
+ %28 = "tt.reduce"(%27) <{axis = 0 : i32}> ({
40
+ ^bb0(%arg8: f32, %arg9: f32):
41
+ %46 = arith.addf %arg8, %arg9 : f32
42
+ tt.reduce.return %46 : f32
43
+ }) : (tensor<256xf32, #blocked>) -> f32
44
+ %29 = arith.addf %28, %cst_2 : f32
45
+ %30 = arith.divf %29, %cst_1 : f32
46
+ %31 = tt.splat %30 : (f32) -> tensor<256xf32, #blocked>
47
+ %32 = arith.subf %26, %31 : tensor<256xf32, #blocked>
48
+ %33 = arith.mulf %32, %32 : tensor<256xf32, #blocked>
49
+ %34 = arith.select %2, %33, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
50
+ %35 = "tt.reduce"(%34) <{axis = 0 : i32}> ({
51
+ ^bb0(%arg8: f32, %arg9: f32):
52
+ %46 = arith.addf %arg8, %arg9 : f32
53
+ tt.reduce.return %46 : f32
54
+ }) : (tensor<256xf32, #blocked>) -> f32
55
+ %36 = arith.addf %35, %cst_2 : f32
56
+ %37 = arith.divf %36, %cst_1 : f32
57
+ %38 = arith.addf %37, %cst_0 : f32
58
+ %39 = tt.extern_elementwise %38 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
59
+ %40 = tt.splat %39 : (f32) -> tensor<256xf32, #blocked>
60
+ %41 = arith.mulf %32, %40 : tensor<256xf32, #blocked>
61
+ %42 = arith.mulf %41, %23 : tensor<256xf32, #blocked>
62
+ %43 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
63
+ %44 = tt.addptr %43, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
64
+ %45 = arith.truncf %42 : tensor<256xf32, #blocked> to tensor<256xbf16, #blocked>
65
+ tt.store %44, %45, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16, #blocked>
66
+ tt.return
67
+ }
68
+ }
.triton/dump/9a2fb05196b13393bea452d08e9aaca8/triton_.llir ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !5 {
5
+ %4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
6
+ %5 = shl i32 %4, 1, !dbg !8
7
+ %6 = and i32 %5, 510, !dbg !8
8
+ %7 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9
9
+ %8 = shl i32 %7, 9, !dbg !10
10
+ %9 = or i32 %8, %6, !dbg !11
11
+ %10 = sext i32 %9 to i64, !dbg !12
12
+ %11 = getelementptr float, ptr addrspace(1) %0, i64 %10, !dbg !12
13
+ %12 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];", "=r,=r,l,b"(ptr addrspace(1) %11, i1 true) #1, !dbg !13
14
+ %13 = extractvalue { i32, i32 } %12, 0, !dbg !13
15
+ %14 = extractvalue { i32, i32 } %12, 1, !dbg !13
16
+ %15 = bitcast i32 %13 to float, !dbg !13
17
+ %16 = bitcast i32 %14 to float, !dbg !13
18
+ %17 = getelementptr i16, ptr addrspace(1) %1, i64 %10, !dbg !14
19
+ %18 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %15) #1, !dbg !15
20
+ %19 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %16) #1, !dbg !15
21
+ %20 = insertelement <2 x i16> undef, i16 %18, i64 0, !dbg !15
22
+ %21 = insertelement <2 x i16> %20, i16 %19, i64 1, !dbg !15
23
+ %22 = bitcast <2 x i16> %21 to i32, !dbg !15
24
+ tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %22, ptr addrspace(1) %17, i1 true) #1, !dbg !15
25
+ ret void, !dbg !16
26
+ }
27
+
28
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
29
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
30
+
31
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
32
+ attributes #1 = { nounwind }
33
+
34
+ !llvm.module.flags = !{!0}
35
+ !llvm.dbg.cu = !{!1}
36
+ !nvvm.annotations = !{!3, !4, !4, !3}
37
+
38
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
39
+ !1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
40
+ !2 = !DIFile(filename: "c5tryp5qwkhreijk7s5x327wofz54lwj4kvctuqdzv2vrf2xyons.py", directory: "/tmp/torchinductor_root/5t")
41
+ !3 = !{ptr @triton__0d1d2de, !"kernel", i32 1}
42
+ !4 = !{ptr @triton__0d1d2de, !"maxntidx", i32 256}
43
+ !5 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
44
+ !6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
45
+ !7 = !{}
46
+ !8 = !DILocation(line: 21, column: 36, scope: !5)
47
+ !9 = !DILocation(line: 20, column: 28, scope: !5)
48
+ !10 = !DILocation(line: 20, column: 33, scope: !5)
49
+ !11 = !DILocation(line: 21, column: 23, scope: !5)
50
+ !12 = !DILocation(line: 24, column: 30, scope: !5)
51
+ !13 = !DILocation(line: 24, column: 35, scope: !5)
52
+ !14 = !DILocation(line: 26, column: 25, scope: !5)
53
+ !15 = !DILocation(line: 26, column: 36, scope: !5)
54
+ !16 = !DILocation(line: 26, column: 4, scope: !5)
.triton/dump/9a2fb05196b13393bea452d08e9aaca8/triton_.ttgir ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
2
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
3
+ tt.func public @triton__0d1d2de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
4
+ %c512_i32 = arith.constant 512 : i32
5
+ %0 = tt.get_program_id x : i32
6
+ %1 = arith.muli %0, %c512_i32 : i32
7
+ %2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked>
8
+ %3 = tt.splat %1 : (i32) -> tensor<512xi32, #blocked>
9
+ %4 = arith.addi %3, %2 : tensor<512xi32, #blocked>
10
+ %5 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<512x!tt.ptr<f32, 1>, #blocked>
11
+ %6 = tt.addptr %5, %4 : tensor<512x!tt.ptr<f32, 1>, #blocked>, tensor<512xi32, #blocked>
12
+ %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xf32, #blocked>
13
+ %8 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<512x!tt.ptr<bf16, 1>, #blocked>
14
+ %9 = tt.addptr %8, %4 : tensor<512x!tt.ptr<bf16, 1>, #blocked>, tensor<512xi32, #blocked>
15
+ %10 = arith.truncf %7 : tensor<512xf32, #blocked> to tensor<512xbf16, #blocked>
16
+ tt.store %9, %10 {cache = 1 : i32, evict = 1 : i32} : tensor<512xbf16, #blocked>
17
+ tt.return
18
+ }
19
+ }
.triton/dump/a69784da01a97187168f22847465505f/triton_.cubin ADDED
Binary file (15 kB). View file
 
.triton/dump/a69784da01a97187168f22847465505f/triton_.ttgir ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
2
+ #blocked1 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
3
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
4
+ tt.func public @triton__0d1d2d3d4d5d6d7de8de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
5
+ %cst = arith.constant dense<256> : tensor<256xi32, #blocked>
6
+ %cst_0 = arith.constant 9.99999974E-6 : f32
7
+ %cst_1 = arith.constant 2.560000e+02 : f32
8
+ %cst_2 = arith.constant 0.000000e+00 : f32
9
+ %c256_i32 = arith.constant 256 : i32
10
+ %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked>
11
+ %cst_4 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked>
12
+ %0 = tt.get_program_id x : i32
13
+ %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked>
14
+ %2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked>
15
+ %3 = arith.muli %0, %c256_i32 : i32
16
+ %4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked>
17
+ %5 = arith.addi %1, %4 : tensor<256xi32, #blocked>
18
+ %6 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
19
+ %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
20
+ %8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
21
+ %9 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
22
+ %10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
23
+ %11 = tt.load %10, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
24
+ %12 = arith.extf %11 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
25
+ %13 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
26
+ %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
27
+ %15 = tt.load %14, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
28
+ %16 = arith.extf %15 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
29
+ %17 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
30
+ %18 = tt.addptr %17, %1 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
31
+ %19 = tt.load %18, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked>
32
+ %20 = arith.addf %8, %12 : tensor<256xf32, #blocked>
33
+ %21 = arith.addf %20, %16 : tensor<256xf32, #blocked>
34
+ %22 = arith.select %2, %21, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
35
+ %23 = "tt.reduce"(%22) <{axis = 0 : i32}> ({
36
+ ^bb0(%arg9: f32, %arg10: f32):
37
+ %47 = arith.addf %arg9, %arg10 : f32
38
+ tt.reduce.return %47 : f32
39
+ }) : (tensor<256xf32, #blocked>) -> f32
40
+ %24 = arith.addf %23, %cst_2 : f32
41
+ %25 = arith.divf %24, %cst_1 : f32
42
+ %26 = tt.splat %25 : (f32) -> tensor<1xf32, #blocked1>
43
+ %27 = tt.splat %25 : (f32) -> tensor<256xf32, #blocked>
44
+ %28 = arith.subf %21, %27 : tensor<256xf32, #blocked>
45
+ %29 = arith.mulf %28, %28 : tensor<256xf32, #blocked>
46
+ %30 = arith.select %2, %29, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
47
+ %31 = "tt.reduce"(%30) <{axis = 0 : i32}> ({
48
+ ^bb0(%arg9: f32, %arg10: f32):
49
+ %47 = arith.addf %arg9, %arg10 : f32
50
+ tt.reduce.return %47 : f32
51
+ }) : (tensor<256xf32, #blocked>) -> f32
52
+ %32 = arith.addf %31, %cst_2 : f32
53
+ %33 = arith.divf %32, %cst_1 : f32
54
+ %34 = arith.addf %33, %cst_0 : f32
55
+ %35 = tt.extern_elementwise %34 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
56
+ %36 = tt.splat %35 : (f32) -> tensor<1xf32, #blocked1>
57
+ %37 = tt.splat %35 : (f32) -> tensor<256xf32, #blocked>
58
+ %38 = arith.mulf %28, %37 : tensor<256xf32, #blocked>
59
+ %39 = arith.mulf %38, %19 : tensor<256xf32, #blocked>
60
+ gpu.barrier
61
+ %40 = tt.addptr %arg0, %0 : !tt.ptr<f32, 1>, i32
62
+ %41 = tt.splat %40 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked1>
63
+ tt.store %41, %36 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32, #blocked1>
64
+ %42 = tt.splat %arg6 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
65
+ %43 = tt.addptr %42, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
66
+ %44 = arith.truncf %39 : tensor<256xf32, #blocked> to tensor<256xbf16, #blocked>
67
+ tt.store %43, %44, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16, #blocked>
68
+ %45 = tt.addptr %arg5, %0 : !tt.ptr<f32, 1>, i32
69
+ %46 = tt.splat %45 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked1>
70
+ tt.store %46, %26 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32, #blocked1>
71
+ tt.return
72
+ }
73
+ }
.triton/dump/a75e14a8d2d1ec8471f1c7b615552f8c/triton_.cubin ADDED
Binary file (4.9 kB). View file
 
.triton/dump/a75e14a8d2d1ec8471f1c7b615552f8c/triton_.ptx ADDED
@@ -0,0 +1,295 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2de
10
+
11
+ .visible .entry triton__0d1d2de(
12
+ .param .u64 triton__0d1d2de_param_0,
13
+ .param .u64 triton__0d1d2de_param_1,
14
+ .param .u32 triton__0d1d2de_param_2
15
+ )
16
+ .maxntid 128, 1, 1
17
+ {
18
+ .reg .pred %p<3>;
19
+ .reg .b16 %rs<3>;
20
+ .reg .b32 %r<13>;
21
+ .reg .b64 %rd<7>;
22
+ .loc 1 18 0
23
+ $L__func_begin0:
24
+ .loc 1 18 0
25
+
26
+ ld.param.u64 %rd3, [triton__0d1d2de_param_0];
27
+ ld.param.u64 %rd4, [triton__0d1d2de_param_1];
28
+ $L__tmp0:
29
+ .loc 1 21 36
30
+ mov.u32 %r7, %tid.x;
31
+ shl.b32 %r8, %r7, 1;
32
+ and.b32 %r9, %r8, 254;
33
+ .loc 1 20 28
34
+ mov.u32 %r1, %ctaid.x;
35
+ .loc 1 20 33
36
+ shl.b32 %r10, %r1, 8;
37
+ .loc 1 21 23
38
+ or.b32 %r11, %r10, %r9;
39
+ .loc 1 24 30
40
+ mul.wide.s32 %rd5, %r11, 4;
41
+ add.s64 %rd1, %rd3, %rd5;
42
+ mov.pred %p1, -1;
43
+ .loc 1 24 35
44
+ mov.u32 %r4, 0x0;
45
+ mov.u32 %r5, 0x0;
46
+ @%p1 ld.global.v2.b32 { %r4, %r5 }, [ %rd1 + 0 ];
47
+ .loc 1 26 25
48
+ mul.wide.s32 %rd6, %r11, 2;
49
+ add.s64 %rd2, %rd4, %rd6;
50
+ .loc 1 26 36
51
+ cvt.rn.bf16.f32 %rs1, %r4;
52
+ cvt.rn.bf16.f32 %rs2, %r5;
53
+ mov.b32 %r12, {%rs1, %rs2};
54
+ @%p1 st.global.b32 [ %rd2 + 0 ], { %r12 };
55
+ .loc 1 26 4
56
+ ret;
57
+ $L__tmp1:
58
+ $L__func_end0:
59
+
60
+ }
61
+ .file 1 "/tmp/torchinductor_root/pq/cpqhcwm5bfrhuwddh4c4qks6bh7sovfbpfnmqhnm4h4w23icqnu6.py"
62
+ .section .debug_abbrev
63
+ {
64
+ .b8 1
65
+ .b8 17
66
+ .b8 1
67
+ .b8 37
68
+ .b8 8
69
+ .b8 19
70
+ .b8 5
71
+ .b8 3
72
+ .b8 8
73
+ .b8 16
74
+ .b8 6
75
+ .b8 27
76
+ .b8 8
77
+ .b8 180
78
+ .b8 66
79
+ .b8 12
80
+ .b8 17
81
+ .b8 1
82
+ .b8 18
83
+ .b8 1
84
+ .b8 0
85
+ .b8 0
86
+ .b8 2
87
+ .b8 46
88
+ .b8 0
89
+ .b8 17
90
+ .b8 1
91
+ .b8 18
92
+ .b8 1
93
+ .b8 64
94
+ .b8 10
95
+ .b8 135
96
+ .b8 64
97
+ .b8 8
98
+ .b8 3
99
+ .b8 8
100
+ .b8 58
101
+ .b8 11
102
+ .b8 59
103
+ .b8 11
104
+ .b8 63
105
+ .b8 12
106
+ .b8 0
107
+ .b8 0
108
+ .b8 0
109
+ }
110
+ .section .debug_info
111
+ {
112
+ .b32 176
113
+ .b8 2
114
+ .b8 0
115
+ .b32 .debug_abbrev
116
+ .b8 8
117
+ .b8 1
118
+ .b8 116
119
+ .b8 114
120
+ .b8 105
121
+ .b8 116
122
+ .b8 111
123
+ .b8 110
124
+ .b8 0
125
+ .b8 2
126
+ .b8 0
127
+ .b8 99
128
+ .b8 112
129
+ .b8 113
130
+ .b8 104
131
+ .b8 99
132
+ .b8 119
133
+ .b8 109
134
+ .b8 53
135
+ .b8 98
136
+ .b8 102
137
+ .b8 114
138
+ .b8 104
139
+ .b8 117
140
+ .b8 119
141
+ .b8 100
142
+ .b8 100
143
+ .b8 104
144
+ .b8 52
145
+ .b8 99
146
+ .b8 52
147
+ .b8 113
148
+ .b8 107
149
+ .b8 115
150
+ .b8 54
151
+ .b8 98
152
+ .b8 104
153
+ .b8 55
154
+ .b8 115
155
+ .b8 111
156
+ .b8 118
157
+ .b8 102
158
+ .b8 98
159
+ .b8 112
160
+ .b8 102
161
+ .b8 110
162
+ .b8 109
163
+ .b8 113
164
+ .b8 104
165
+ .b8 110
166
+ .b8 109
167
+ .b8 52
168
+ .b8 104
169
+ .b8 52
170
+ .b8 119
171
+ .b8 50
172
+ .b8 51
173
+ .b8 105
174
+ .b8 99
175
+ .b8 113
176
+ .b8 110
177
+ .b8 117
178
+ .b8 54
179
+ .b8 46
180
+ .b8 112
181
+ .b8 121
182
+ .b8 0
183
+ .b32 .debug_line
184
+ .b8 47
185
+ .b8 116
186
+ .b8 109
187
+ .b8 112
188
+ .b8 47
189
+ .b8 116
190
+ .b8 111
191
+ .b8 114
192
+ .b8 99
193
+ .b8 104
194
+ .b8 105
195
+ .b8 110
196
+ .b8 100
197
+ .b8 117
198
+ .b8 99
199
+ .b8 116
200
+ .b8 111
201
+ .b8 114
202
+ .b8 95
203
+ .b8 114
204
+ .b8 111
205
+ .b8 111
206
+ .b8 116
207
+ .b8 47
208
+ .b8 112
209
+ .b8 113
210
+ .b8 0
211
+ .b8 1
212
+ .b64 $L__func_begin0
213
+ .b64 $L__func_end0
214
+ .b8 2
215
+ .b64 $L__func_begin0
216
+ .b64 $L__func_end0
217
+ .b8 1
218
+ .b8 156
219
+ .b8 116
220
+ .b8 114
221
+ .b8 105
222
+ .b8 116
223
+ .b8 111
224
+ .b8 110
225
+ .b8 95
226
+ .b8 95
227
+ .b8 48
228
+ .b8 100
229
+ .b8 49
230
+ .b8 100
231
+ .b8 50
232
+ .b8 100
233
+ .b8 101
234
+ .b8 0
235
+ .b8 116
236
+ .b8 114
237
+ .b8 105
238
+ .b8 116
239
+ .b8 111
240
+ .b8 110
241
+ .b8 95
242
+ .b8 95
243
+ .b8 48
244
+ .b8 100
245
+ .b8 49
246
+ .b8 100
247
+ .b8 50
248
+ .b8 100
249
+ .b8 101
250
+ .b8 0
251
+ .b8 1
252
+ .b8 18
253
+ .b8 1
254
+ .b8 0
255
+ }
256
+ .section .debug_pubnames
257
+ {
258
+ .b32 $L__pubNames_end0-$L__pubNames_start0
259
+ $L__pubNames_start0:
260
+ .b8 2
261
+ .b8 0
262
+ .b32 .debug_info
263
+ .b32 180
264
+ .b32 125
265
+ .b8 116
266
+ .b8 114
267
+ .b8 105
268
+ .b8 116
269
+ .b8 111
270
+ .b8 110
271
+ .b8 95
272
+ .b8 95
273
+ .b8 48
274
+ .b8 100
275
+ .b8 49
276
+ .b8 100
277
+ .b8 50
278
+ .b8 100
279
+ .b8 101
280
+ .b8 0
281
+ .b32 0
282
+ $L__pubNames_end0:
283
+ }
284
+ .section .debug_pubtypes
285
+ {
286
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
287
+ $L__pubTypes_start0:
288
+ .b8 2
289
+ .b8 0
290
+ .b32 .debug_info
291
+ .b32 180
292
+ .b32 0
293
+ $L__pubTypes_end0:
294
+ }
295
+ .section .debug_loc { }
.triton/dump/a75e14a8d2d1ec8471f1c7b615552f8c/triton_.ttir ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %c256_i32 = arith.constant 256 : i32
4
+ %0 = tt.get_program_id x : i32
5
+ %1 = arith.muli %0, %c256_i32 : i32
6
+ %2 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
7
+ %3 = tt.splat %1 : (i32) -> tensor<256xi32>
8
+ %4 = arith.addi %3, %2 : tensor<256xi32>
9
+ %5 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
10
+ %6 = tt.addptr %5, %4 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
11
+ %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32>
12
+ %8 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
13
+ %9 = tt.addptr %8, %4 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
14
+ %10 = arith.truncf %7 : tensor<256xf32> to tensor<256xbf16>
15
+ tt.store %9, %10 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16>
16
+ tt.return
17
+ }
18
+ }
.triton/dump/ab89f85e55d5ddd9676325b49df9419f/triton_.ptx ADDED
@@ -0,0 +1,717 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3d4d5d6de7de
10
+ .extern .shared .align 1 .b8 global_smem[];
11
+ .global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
12
+
13
+ .visible .entry triton__0d1d2d3d4d5d6de7de(
14
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_0,
15
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_1,
16
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_2,
17
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_3,
18
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_4,
19
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_5,
20
+ .param .u32 triton__0d1d2d3d4d5d6de7de_param_6,
21
+ .param .u32 triton__0d1d2d3d4d5d6de7de_param_7
22
+ )
23
+ .maxntid 64, 1, 1
24
+ {
25
+ .reg .pred %p<25>;
26
+ .reg .b16 %rs<9>;
27
+ .reg .b32 %r<87>;
28
+ .reg .f32 %f<70>;
29
+ .reg .b64 %rd<17>;
30
+ .loc 1 18 0
31
+ $L__func_begin0:
32
+ .loc 1 18 0
33
+
34
+ ld.param.u64 %rd7, [triton__0d1d2d3d4d5d6de7de_param_0];
35
+ ld.param.u64 %rd8, [triton__0d1d2d3d4d5d6de7de_param_1];
36
+ $L__tmp0:
37
+ .loc 1 26 26
38
+ mov.u32 %r52, %tid.x;
39
+ and.b32 %r53, %r52, 31;
40
+ ld.param.u64 %rd9, [triton__0d1d2d3d4d5d6de7de_param_2];
41
+ ld.param.u64 %rd10, [triton__0d1d2d3d4d5d6de7de_param_3];
42
+ ld.param.u64 %rd11, [triton__0d1d2d3d4d5d6de7de_param_4];
43
+ and.b32 %r54, %r52, 63;
44
+ ld.param.u64 %rd12, [triton__0d1d2d3d4d5d6de7de_param_5];
45
+ shl.b32 %r55, %r54, 2;
46
+ .loc 1 23 28
47
+ mov.u32 %r1, %ctaid.x;
48
+ .loc 1 30 40
49
+ shl.b32 %r56, %r1, 8;
50
+ .loc 1 30 36
51
+ or.b32 %r57, %r56, %r55;
52
+ .loc 1 30 30
53
+ mul.wide.s32 %rd13, %r57, 4;
54
+ add.s64 %rd1, %rd8, %rd13;
55
+ mov.b32 %r6, 0;
56
+ mov.pred %p1, -1;
57
+ .loc 1 30 46
58
+ mov.u32 %r2, 0x0;
59
+ mov.u32 %r3, 0x0;
60
+ mov.u32 %r4, 0x0;
61
+ mov.u32 %r5, 0x0;
62
+ @%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ];
63
+ @!%p1 mov.u32 %r2, %r6;
64
+ @!%p1 mov.u32 %r3, %r6;
65
+ @!%p1 mov.u32 %r4, %r6;
66
+ @!%p1 mov.u32 %r5, %r6;
67
+ mov.b32 %f1, %r2;
68
+ mov.b32 %f2, %r3;
69
+ mov.b32 %f3, %r4;
70
+ mov.b32 %f4, %r5;
71
+ .loc 1 31 30
72
+ mul.wide.s32 %rd14, %r57, 2;
73
+ add.s64 %rd2, %rd9, %rd14;
74
+ .loc 1 31 46
75
+ mov.u32 %r10, 0x0;
76
+ mov.u32 %r11, 0x0;
77
+ @%p1 ld.global.v2.b32 { %r10, %r11 }, [ %rd2 + 0 ];
78
+ @!%p1 mov.u32 %r10, %r6;
79
+ @!%p1 mov.u32 %r11, %r6;
80
+ cvt.u16.u32 %rs1, %r10;
81
+ { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r10; }
82
+ cvt.u16.u32 %rs3, %r11;
83
+ { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r11; }
84
+ .loc 1 31 67
85
+ cvt.f32.bf16 %r14, %rs1;
86
+ mov.b32 %f5, %r14;
87
+ cvt.f32.bf16 %r15, %rs2;
88
+ mov.b32 %f6, %r15;
89
+ cvt.f32.bf16 %r16, %rs3;
90
+ mov.b32 %f7, %r16;
91
+ cvt.f32.bf16 %r17, %rs4;
92
+ mov.b32 %f8, %r17;
93
+ .loc 1 32 31
94
+ mul.wide.u32 %rd15, %r55, 4;
95
+ add.s64 %rd3, %rd10, %rd15;
96
+ .loc 1 32 36
97
+ mov.u32 %r18, 0x0;
98
+ mov.u32 %r19, 0x0;
99
+ mov.u32 %r20, 0x0;
100
+ mov.u32 %r21, 0x0;
101
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r18, %r19, %r20, %r21 }, [ %rd3 + 0 ];
102
+ @!%p1 mov.u32 %r18, %r6;
103
+ @!%p1 mov.u32 %r19, %r6;
104
+ @!%p1 mov.u32 %r20, %r6;
105
+ @!%p1 mov.u32 %r21, %r6;
106
+ .loc 1 34 18
107
+ add.f32 %f9, %f5, %f1;
108
+ add.f32 %f10, %f6, %f2;
109
+ add.f32 %f11, %f7, %f3;
110
+ add.f32 %f12, %f8, %f4;
111
+ $L__tmp1:
112
+ .loc 2 233 15
113
+ add.f32 %f13, %f9, %f10;
114
+ add.f32 %f14, %f13, %f11;
115
+ add.f32 %f15, %f14, %f12;
116
+ $L__tmp2:
117
+ .loc 2 243 36
118
+ mov.b32 %r58, %f15;
119
+ shfl.sync.bfly.b32 %r59, %r58, 16, 31, -1;
120
+ mov.b32 %f16, %r59;
121
+ $L__tmp3:
122
+ .loc 2 233 15
123
+ add.f32 %f17, %f15, %f16;
124
+ $L__tmp4:
125
+ .loc 2 243 36
126
+ mov.b32 %r60, %f17;
127
+ shfl.sync.bfly.b32 %r61, %r60, 8, 31, -1;
128
+ mov.b32 %f18, %r61;
129
+ $L__tmp5:
130
+ .loc 2 233 15
131
+ add.f32 %f19, %f17, %f18;
132
+ $L__tmp6:
133
+ .loc 2 243 36
134
+ mov.b32 %r62, %f19;
135
+ shfl.sync.bfly.b32 %r63, %r62, 4, 31, -1;
136
+ mov.b32 %f20, %r63;
137
+ $L__tmp7:
138
+ .loc 2 233 15
139
+ add.f32 %f21, %f19, %f20;
140
+ $L__tmp8:
141
+ .loc 2 243 36
142
+ mov.b32 %r64, %f21;
143
+ shfl.sync.bfly.b32 %r65, %r64, 2, 31, -1;
144
+ mov.b32 %f22, %r65;
145
+ $L__tmp9:
146
+ .loc 2 233 15
147
+ add.f32 %f23, %f21, %f22;
148
+ $L__tmp10:
149
+ .loc 2 243 36
150
+ mov.b32 %r66, %f23;
151
+ shfl.sync.bfly.b32 %r67, %r66, 1, 31, -1;
152
+ mov.b32 %f24, %r67;
153
+ $L__tmp11:
154
+ .loc 2 233 15
155
+ add.f32 %f25, %f23, %f24;
156
+ $L__tmp12:
157
+ .loc 2 243 36
158
+ setp.eq.s32 %p14, %r53, 0;
159
+ shr.u32 %r68, %r52, 3;
160
+ and.b32 %r69, %r68, 4;
161
+ mov.u32 %r70, global_smem;
162
+ add.s32 %r26, %r70, %r69;
163
+ mov.b32 %r27, %f25;
164
+ @%p14 st.shared.b32 [ %r26 + 0 ], %r27;
165
+ bar.sync 0;
166
+ setp.lt.s32 %p15, %r52, 2;
167
+ shl.b32 %r71, %r52, 2;
168
+ add.s32 %r29, %r70, %r71;
169
+ @%p15 ld.shared.b32 %r28, [ %r29 + 0 ];
170
+ mov.b32 %f26, %r28;
171
+ shfl.sync.bfly.b32 %r72, %r28, 1, 31, -1;
172
+ mov.b32 %f27, %r72;
173
+ $L__tmp13:
174
+ .loc 2 233 15
175
+ add.f32 %f28, %f26, %f27;
176
+ $L__tmp14:
177
+ .loc 2 243 36
178
+ and.b32 %r73, %r52, 1;
179
+ setp.eq.b32 %p23, %r73, 1;
180
+ not.pred %p24, %p23;
181
+ and.pred %p16, %p15, %p24;
182
+ mov.b32 %r31, %f28;
183
+ @%p16 st.shared.b32 [ %r29 + 0 ], %r31;
184
+ bar.sync 0;
185
+ ld.shared.f32 %f29, [global_smem];
186
+ $L__tmp15:
187
+ .loc 3 8 15
188
+ add.f32 %f30, %f29, 0f00000000;
189
+ $L__tmp16:
190
+ .loc 1 42 20
191
+ mov.b32 %r33, %f30;
192
+ mov.b32 %r34, 1132462080;
193
+ div.full.f32 %r51, %r33, %r34;
194
+ mov.b32 %f31, %r51;
195
+ .loc 1 43 19
196
+ sub.f32 %f32, %f9, %f31;
197
+ sub.f32 %f33, %f10, %f31;
198
+ sub.f32 %f34, %f11, %f31;
199
+ sub.f32 %f35, %f12, %f31;
200
+ .loc 1 44 20
201
+ mul.f32 %f36, %f33, %f33;
202
+ $L__tmp17:
203
+ .loc 2 243 36
204
+ bar.sync 0;
205
+ $L__tmp18:
206
+ .loc 2 233 15
207
+ fma.rn.f32 %f37, %f32, %f32, %f36;
208
+ fma.rn.f32 %f38, %f34, %f34, %f37;
209
+ fma.rn.f32 %f39, %f35, %f35, %f38;
210
+ $L__tmp19:
211
+ .loc 2 243 36
212
+ mov.b32 %r74, %f39;
213
+ shfl.sync.bfly.b32 %r75, %r74, 16, 31, -1;
214
+ mov.b32 %f40, %r75;
215
+ $L__tmp20:
216
+ .loc 2 233 15
217
+ add.f32 %f41, %f39, %f40;
218
+ $L__tmp21:
219
+ .loc 2 243 36
220
+ mov.b32 %r76, %f41;
221
+ shfl.sync.bfly.b32 %r77, %r76, 8, 31, -1;
222
+ mov.b32 %f42, %r77;
223
+ $L__tmp22:
224
+ .loc 2 233 15
225
+ add.f32 %f43, %f41, %f42;
226
+ $L__tmp23:
227
+ .loc 2 243 36
228
+ mov.b32 %r78, %f43;
229
+ shfl.sync.bfly.b32 %r79, %r78, 4, 31, -1;
230
+ mov.b32 %f44, %r79;
231
+ $L__tmp24:
232
+ .loc 2 233 15
233
+ add.f32 %f45, %f43, %f44;
234
+ $L__tmp25:
235
+ .loc 2 243 36
236
+ mov.b32 %r80, %f45;
237
+ shfl.sync.bfly.b32 %r81, %r80, 2, 31, -1;
238
+ mov.b32 %f46, %r81;
239
+ $L__tmp26:
240
+ .loc 2 233 15
241
+ add.f32 %f47, %f45, %f46;
242
+ $L__tmp27:
243
+ .loc 2 243 36
244
+ mov.b32 %r82, %f47;
245
+ shfl.sync.bfly.b32 %r83, %r82, 1, 31, -1;
246
+ mov.b32 %f48, %r83;
247
+ $L__tmp28:
248
+ .loc 2 233 15
249
+ add.f32 %f49, %f47, %f48;
250
+ $L__tmp29:
251
+ .loc 2 243 36
252
+ mov.b32 %r36, %f49;
253
+ @%p14 st.shared.b32 [ %r26 + 0 ], %r36;
254
+ bar.sync 0;
255
+ @%p15 ld.shared.b32 %r37, [ %r29 + 0 ];
256
+ mov.b32 %f50, %r37;
257
+ shfl.sync.bfly.b32 %r84, %r37, 1, 31, -1;
258
+ mov.b32 %f51, %r84;
259
+ $L__tmp30:
260
+ .loc 2 233 15
261
+ add.f32 %f52, %f50, %f51;
262
+ $L__tmp31:
263
+ .loc 2 243 36
264
+ mov.b32 %r40, %f52;
265
+ @%p16 st.shared.b32 [ %r29 + 0 ], %r40;
266
+ bar.sync 0;
267
+ ld.shared.f32 %f53, [global_smem];
268
+ $L__tmp32:
269
+ .loc 3 8 15
270
+ add.f32 %f54, %f53, 0f00000000;
271
+ $L__tmp33:
272
+ .loc 1 49 20
273
+ mov.b32 %r42, %f54;
274
+ div.full.f32 %r41, %r42, %r34;
275
+ mov.b32 %f55, %r41;
276
+ .loc 1 51 20
277
+ add.f32 %f56, %f55, 0f3727C5AC;
278
+ .loc 1 52 26
279
+ rsqrt.approx.ftz.f32 %f57, %f56;
280
+ .loc 1 32 36
281
+ mov.b32 %f58, %r21;
282
+ mov.b32 %f59, %r20;
283
+ mov.b32 %f60, %r19;
284
+ mov.b32 %f61, %r18;
285
+ .loc 1 54 20
286
+ mul.f32 %f62, %f32, %f57;
287
+ mul.f32 %f63, %f33, %f57;
288
+ mul.f32 %f64, %f34, %f57;
289
+ mul.f32 %f65, %f35, %f57;
290
+ .loc 1 55 20
291
+ mul.f32 %f66, %f62, %f61;
292
+ mul.f32 %f67, %f63, %f60;
293
+ mul.f32 %f68, %f64, %f59;
294
+ mul.f32 %f69, %f65, %f58;
295
+ .loc 1 57 4
296
+ bar.sync 0;
297
+ .loc 1 58 28
298
+ mul.wide.s32 %rd16, %r1, 4;
299
+ add.s64 %rd4, %rd7, %rd16;
300
+ .loc 1 58 40
301
+ setp.eq.s32 %p20, %r54, 0;
302
+ mov.b32 %r44, %f57;
303
+ @%p20 st.global.b32 [ %rd4 + 0 ], { %r44 };
304
+ .loc 1 59 25
305
+ add.s64 %rd5, %rd12, %rd14;
306
+ .loc 1 59 48
307
+ mov.b32 %r45, %f66;
308
+ cvt.rn.bf16.f32 %rs5, %r45;
309
+ mov.b32 %r46, %f67;
310
+ cvt.rn.bf16.f32 %rs6, %r46;
311
+ mov.b32 %r47, %f68;
312
+ cvt.rn.bf16.f32 %rs7, %r47;
313
+ mov.b32 %r48, %f69;
314
+ cvt.rn.bf16.f32 %rs8, %r48;
315
+ mov.b32 %r85, {%rs5, %rs6};
316
+ mov.b32 %r86, {%rs7, %rs8};
317
+ @%p1 st.global.v2.b32 [ %rd5 + 0 ], { %r85, %r86 };
318
+ .loc 1 60 25
319
+ add.s64 %rd6, %rd11, %rd16;
320
+ .loc 1 60 37
321
+ @%p20 st.global.b32 [ %rd6 + 0 ], { %r51 };
322
+ .loc 1 60 4
323
+ ret;
324
+ $L__tmp34:
325
+ $L__func_end0:
326
+
327
+ }
328
+ // .globl __nv_rsqrtf
329
+ .visible .func (.param .b32 func_retval0) __nv_rsqrtf(
330
+ .param .b32 __nv_rsqrtf_param_0
331
+ )
332
+ {
333
+ .reg .f32 %f<3>;
334
+ $L__func_begin1:
335
+
336
+ ld.param.f32 %f1, [__nv_rsqrtf_param_0];
337
+ rsqrt.approx.ftz.f32 %f2, %f1;
338
+ st.param.f32 [func_retval0+0], %f2;
339
+ ret;
340
+ $L__func_end1:
341
+
342
+ }
343
+ .file 1 "/tmp/torchinductor_root/w3/cw35gljjtatzr2ztskwlxndj2nreiih7r3vg5rw4douyaxccqgij.py"
344
+ .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
345
+ .file 3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
346
+ .section .debug_abbrev
347
+ {
348
+ .b8 1
349
+ .b8 17
350
+ .b8 1
351
+ .b8 37
352
+ .b8 8
353
+ .b8 19
354
+ .b8 5
355
+ .b8 3
356
+ .b8 8
357
+ .b8 16
358
+ .b8 6
359
+ .b8 27
360
+ .b8 8
361
+ .b8 180
362
+ .b8 66
363
+ .b8 12
364
+ .b8 17
365
+ .b8 1
366
+ .b8 18
367
+ .b8 1
368
+ .b8 0
369
+ .b8 0
370
+ .b8 2
371
+ .b8 46
372
+ .b8 0
373
+ .b8 135
374
+ .b8 64
375
+ .b8 8
376
+ .b8 3
377
+ .b8 8
378
+ .b8 58
379
+ .b8 11
380
+ .b8 59
381
+ .b8 11
382
+ .b8 63
383
+ .b8 12
384
+ .b8 32
385
+ .b8 11
386
+ .b8 0
387
+ .b8 0
388
+ .b8 3
389
+ .b8 46
390
+ .b8 1
391
+ .b8 17
392
+ .b8 1
393
+ .b8 18
394
+ .b8 1
395
+ .b8 64
396
+ .b8 10
397
+ .b8 49
398
+ .b8 19
399
+ .b8 0
400
+ .b8 0
401
+ .b8 4
402
+ .b8 29
403
+ .b8 1
404
+ .b8 49
405
+ .b8 19
406
+ .b8 17
407
+ .b8 1
408
+ .b8 18
409
+ .b8 1
410
+ .b8 88
411
+ .b8 11
412
+ .b8 89
413
+ .b8 11
414
+ .b8 87
415
+ .b8 11
416
+ .b8 0
417
+ .b8 0
418
+ .b8 5
419
+ .b8 29
420
+ .b8 0
421
+ .b8 49
422
+ .b8 19
423
+ .b8 17
424
+ .b8 1
425
+ .b8 18
426
+ .b8 1
427
+ .b8 88
428
+ .b8 11
429
+ .b8 89
430
+ .b8 11
431
+ .b8 87
432
+ .b8 11
433
+ .b8 0
434
+ .b8 0
435
+ .b8 0
436
+ }
437
+ .section .debug_info
438
+ {
439
+ .b32 399
440
+ .b8 2
441
+ .b8 0
442
+ .b32 .debug_abbrev
443
+ .b8 8
444
+ .b8 1
445
+ .b8 116
446
+ .b8 114
447
+ .b8 105
448
+ .b8 116
449
+ .b8 111
450
+ .b8 110
451
+ .b8 0
452
+ .b8 2
453
+ .b8 0
454
+ .b8 99
455
+ .b8 119
456
+ .b8 51
457
+ .b8 53
458
+ .b8 103
459
+ .b8 108
460
+ .b8 106
461
+ .b8 106
462
+ .b8 116
463
+ .b8 97
464
+ .b8 116
465
+ .b8 122
466
+ .b8 114
467
+ .b8 50
468
+ .b8 122
469
+ .b8 116
470
+ .b8 115
471
+ .b8 107
472
+ .b8 119
473
+ .b8 108
474
+ .b8 120
475
+ .b8 110
476
+ .b8 100
477
+ .b8 106
478
+ .b8 50
479
+ .b8 110
480
+ .b8 114
481
+ .b8 101
482
+ .b8 105
483
+ .b8 105
484
+ .b8 104
485
+ .b8 55
486
+ .b8 114
487
+ .b8 51
488
+ .b8 118
489
+ .b8 103
490
+ .b8 53
491
+ .b8 114
492
+ .b8 119
493
+ .b8 52
494
+ .b8 100
495
+ .b8 111
496
+ .b8 117
497
+ .b8 121
498
+ .b8 97
499
+ .b8 120
500
+ .b8 99
501
+ .b8 99
502
+ .b8 113
503
+ .b8 103
504
+ .b8 105
505
+ .b8 106
506
+ .b8 46
507
+ .b8 112
508
+ .b8 121
509
+ .b8 0
510
+ .b32 .debug_line
511
+ .b8 47
512
+ .b8 116
513
+ .b8 109
514
+ .b8 112
515
+ .b8 47
516
+ .b8 116
517
+ .b8 111
518
+ .b8 114
519
+ .b8 99
520
+ .b8 104
521
+ .b8 105
522
+ .b8 110
523
+ .b8 100
524
+ .b8 117
525
+ .b8 99
526
+ .b8 116
527
+ .b8 111
528
+ .b8 114
529
+ .b8 95
530
+ .b8 114
531
+ .b8 111
532
+ .b8 111
533
+ .b8 116
534
+ .b8 47
535
+ .b8 119
536
+ .b8 51
537
+ .b8 0
538
+ .b8 1
539
+ .b64 $L__func_begin0
540
+ .b64 $L__func_end0
541
+ .b8 2
542
+ .b8 116
543
+ .b8 114
544
+ .b8 105
545
+ .b8 116
546
+ .b8 111
547
+ .b8 110
548
+ .b8 95
549
+ .b8 95
550
+ .b8 48
551
+ .b8 100
552
+ .b8 49
553
+ .b8 100
554
+ .b8 50
555
+ .b8 100
556
+ .b8 51
557
+ .b8 100
558
+ .b8 52
559
+ .b8 100
560
+ .b8 53
561
+ .b8 100
562
+ .b8 54
563
+ .b8 100
564
+ .b8 101
565
+ .b8 55
566
+ .b8 100
567
+ .b8 101
568
+ .b8 0
569
+ .b8 116
570
+ .b8 114
571
+ .b8 105
572
+ .b8 116
573
+ .b8 111
574
+ .b8 110
575
+ .b8 95
576
+ .b8 95
577
+ .b8 48
578
+ .b8 100
579
+ .b8 49
580
+ .b8 100
581
+ .b8 50
582
+ .b8 100
583
+ .b8 51
584
+ .b8 100
585
+ .b8 52
586
+ .b8 100
587
+ .b8 53
588
+ .b8 100
589
+ .b8 54
590
+ .b8 100
591
+ .b8 101
592
+ .b8 55
593
+ .b8 100
594
+ .b8 101
595
+ .b8 0
596
+ .b8 1
597
+ .b8 18
598
+ .b8 1
599
+ .b8 1
600
+ .b8 3
601
+ .b64 $L__func_begin0
602
+ .b64 $L__func_end0
603
+ .b8 1
604
+ .b8 156
605
+ .b32 125
606
+ .b8 4
607
+ .b32 125
608
+ .b64 $L__tmp1
609
+ .b64 $L__tmp14
610
+ .b8 2
611
+ .b8 39
612
+ .b8 58
613
+ .b8 5
614
+ .b32 125
615
+ .b64 $L__tmp1
616
+ .b64 $L__tmp14
617
+ .b8 2
618
+ .b8 243
619
+ .b8 36
620
+ .b8 0
621
+ .b8 5
622
+ .b32 125
623
+ .b64 $L__tmp2
624
+ .b64 $L__tmp15
625
+ .b8 2
626
+ .b8 39
627
+ .b8 58
628
+ .b8 5
629
+ .b32 125
630
+ .b64 $L__tmp15
631
+ .b64 $L__tmp16
632
+ .b8 3
633
+ .b8 39
634
+ .b8 45
635
+ .b8 5
636
+ .b32 125
637
+ .b64 $L__tmp17
638
+ .b64 $L__tmp32
639
+ .b8 2
640
+ .b8 47
641
+ .b8 59
642
+ .b8 4
643
+ .b32 125
644
+ .b64 $L__tmp18
645
+ .b64 $L__tmp31
646
+ .b8 2
647
+ .b8 47
648
+ .b8 59
649
+ .b8 5
650
+ .b32 125
651
+ .b64 $L__tmp18
652
+ .b64 $L__tmp31
653
+ .b8 2
654
+ .b8 243
655
+ .b8 36
656
+ .b8 0
657
+ .b8 5
658
+ .b32 125
659
+ .b64 $L__tmp32
660
+ .b64 $L__tmp33
661
+ .b8 3
662
+ .b8 47
663
+ .b8 45
664
+ .b8 0
665
+ .b8 0
666
+ }
667
+ .section .debug_pubnames
668
+ {
669
+ .b32 $L__pubNames_end0-$L__pubNames_start0
670
+ $L__pubNames_start0:
671
+ .b8 2
672
+ .b8 0
673
+ .b32 .debug_info
674
+ .b32 403
675
+ .b32 125
676
+ .b8 116
677
+ .b8 114
678
+ .b8 105
679
+ .b8 116
680
+ .b8 111
681
+ .b8 110
682
+ .b8 95
683
+ .b8 95
684
+ .b8 48
685
+ .b8 100
686
+ .b8 49
687
+ .b8 100
688
+ .b8 50
689
+ .b8 100
690
+ .b8 51
691
+ .b8 100
692
+ .b8 52
693
+ .b8 100
694
+ .b8 53
695
+ .b8 100
696
+ .b8 54
697
+ .b8 100
698
+ .b8 101
699
+ .b8 55
700
+ .b8 100
701
+ .b8 101
702
+ .b8 0
703
+ .b32 0
704
+ $L__pubNames_end0:
705
+ }
706
+ .section .debug_pubtypes
707
+ {
708
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
709
+ $L__pubTypes_start0:
710
+ .b8 2
711
+ .b8 0
712
+ .b32 .debug_info
713
+ .b32 403
714
+ .b32 0
715
+ $L__pubTypes_end0:
716
+ }
717
+ .section .debug_loc { }
.triton/dump/ab89f85e55d5ddd9676325b49df9419f/triton_.ttgir ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
2
+ #blocked1 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
3
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
4
+ tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
5
+ %cst = arith.constant dense<256> : tensor<256xi32, #blocked>
6
+ %cst_0 = arith.constant 9.99999974E-6 : f32
7
+ %cst_1 = arith.constant 2.560000e+02 : f32
8
+ %cst_2 = arith.constant 0.000000e+00 : f32
9
+ %c256_i32 = arith.constant 256 : i32
10
+ %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked>
11
+ %cst_4 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked>
12
+ %0 = tt.get_program_id x : i32
13
+ %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked>
14
+ %2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked>
15
+ %3 = arith.muli %0, %c256_i32 : i32
16
+ %4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked>
17
+ %5 = arith.addi %1, %4 : tensor<256xi32, #blocked>
18
+ %6 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
19
+ %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
20
+ %8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
21
+ %9 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
22
+ %10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
23
+ %11 = tt.load %10, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
24
+ %12 = arith.extf %11 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
25
+ %13 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
26
+ %14 = tt.addptr %13, %1 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
27
+ %15 = tt.load %14, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked>
28
+ %16 = arith.addf %8, %12 : tensor<256xf32, #blocked>
29
+ %17 = arith.select %2, %16, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
30
+ %18 = "tt.reduce"(%17) <{axis = 0 : i32}> ({
31
+ ^bb0(%arg8: f32, %arg9: f32):
32
+ %42 = arith.addf %arg8, %arg9 : f32
33
+ tt.reduce.return %42 : f32
34
+ }) : (tensor<256xf32, #blocked>) -> f32
35
+ %19 = arith.addf %18, %cst_2 : f32
36
+ %20 = arith.divf %19, %cst_1 : f32
37
+ %21 = tt.splat %20 : (f32) -> tensor<1xf32, #blocked1>
38
+ %22 = tt.splat %20 : (f32) -> tensor<256xf32, #blocked>
39
+ %23 = arith.subf %16, %22 : tensor<256xf32, #blocked>
40
+ %24 = arith.mulf %23, %23 : tensor<256xf32, #blocked>
41
+ %25 = arith.select %2, %24, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
42
+ %26 = "tt.reduce"(%25) <{axis = 0 : i32}> ({
43
+ ^bb0(%arg8: f32, %arg9: f32):
44
+ %42 = arith.addf %arg8, %arg9 : f32
45
+ tt.reduce.return %42 : f32
46
+ }) : (tensor<256xf32, #blocked>) -> f32
47
+ %27 = arith.addf %26, %cst_2 : f32
48
+ %28 = arith.divf %27, %cst_1 : f32
49
+ %29 = arith.addf %28, %cst_0 : f32
50
+ %30 = tt.extern_elementwise %29 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
51
+ %31 = tt.splat %30 : (f32) -> tensor<1xf32, #blocked1>
52
+ %32 = tt.splat %30 : (f32) -> tensor<256xf32, #blocked>
53
+ %33 = arith.mulf %23, %32 : tensor<256xf32, #blocked>
54
+ %34 = arith.mulf %33, %15 : tensor<256xf32, #blocked>
55
+ gpu.barrier
56
+ %35 = tt.addptr %arg0, %0 : !tt.ptr<f32, 1>, i32
57
+ %36 = tt.splat %35 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked1>
58
+ tt.store %36, %31 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32, #blocked1>
59
+ %37 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
60
+ %38 = tt.addptr %37, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
61
+ %39 = arith.truncf %34 : tensor<256xf32, #blocked> to tensor<256xbf16, #blocked>
62
+ tt.store %38, %39, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16, #blocked>
63
+ %40 = tt.addptr %arg4, %0 : !tt.ptr<f32, 1>, i32
64
+ %41 = tt.splat %40 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked1>
65
+ tt.store %41, %21 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32, #blocked1>
66
+ tt.return
67
+ }
68
+ }
.triton/dump/be28ee3793470d9803546f03e49c3edf/triton_.cubin ADDED
Binary file (17.5 kB). View file
 
.triton/dump/be28ee3793470d9803546f03e49c3edf/triton_.ttgir ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
2
+ #blocked1 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
3
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
4
+ tt.func public @triton__0d1d2d3d4d5d6d7d8d9d10de11de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg7: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg8: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg9: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg10: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg11: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
5
+ %cst = arith.constant dense<256> : tensor<256xi32, #blocked>
6
+ %cst_0 = arith.constant 9.99999974E-6 : f32
7
+ %cst_1 = arith.constant 2.560000e+02 : f32
8
+ %cst_2 = arith.constant 0.000000e+00 : f32
9
+ %c256_i32 = arith.constant 256 : i32
10
+ %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked>
11
+ %cst_4 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked>
12
+ %0 = tt.get_program_id x : i32
13
+ %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked>
14
+ %2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked>
15
+ %3 = arith.muli %0, %c256_i32 : i32
16
+ %4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked>
17
+ %5 = arith.addi %1, %4 : tensor<256xi32, #blocked>
18
+ %6 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
19
+ %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
20
+ %8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
21
+ %9 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
22
+ %10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
23
+ %11 = tt.load %10, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
24
+ %12 = arith.extf %11 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
25
+ %13 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
26
+ %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
27
+ %15 = tt.load %14, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
28
+ %16 = arith.extf %15 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
29
+ %17 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
30
+ %18 = tt.addptr %17, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
31
+ %19 = tt.load %18, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
32
+ %20 = arith.extf %19 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
33
+ %21 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
34
+ %22 = tt.addptr %21, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
35
+ %23 = tt.load %22, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
36
+ %24 = arith.extf %23 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
37
+ %25 = tt.splat %arg6 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
38
+ %26 = tt.addptr %25, %1 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
39
+ %27 = tt.load %26, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked>
40
+ %28 = arith.addf %8, %12 : tensor<256xf32, #blocked>
41
+ %29 = arith.addf %28, %16 : tensor<256xf32, #blocked>
42
+ %30 = arith.addf %29, %20 : tensor<256xf32, #blocked>
43
+ %31 = arith.addf %30, %24 : tensor<256xf32, #blocked>
44
+ %32 = arith.select %2, %31, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
45
+ %33 = "tt.reduce"(%32) <{axis = 0 : i32}> ({
46
+ ^bb0(%arg12: f32, %arg13: f32):
47
+ %58 = arith.addf %arg12, %arg13 : f32
48
+ tt.reduce.return %58 : f32
49
+ }) : (tensor<256xf32, #blocked>) -> f32
50
+ %34 = arith.addf %33, %cst_2 : f32
51
+ %35 = arith.divf %34, %cst_1 : f32
52
+ %36 = tt.splat %35 : (f32) -> tensor<1xf32, #blocked1>
53
+ %37 = tt.splat %35 : (f32) -> tensor<256xf32, #blocked>
54
+ %38 = arith.subf %31, %37 : tensor<256xf32, #blocked>
55
+ %39 = arith.mulf %38, %38 : tensor<256xf32, #blocked>
56
+ %40 = arith.select %2, %39, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
57
+ %41 = "tt.reduce"(%40) <{axis = 0 : i32}> ({
58
+ ^bb0(%arg12: f32, %arg13: f32):
59
+ %58 = arith.addf %arg12, %arg13 : f32
60
+ tt.reduce.return %58 : f32
61
+ }) : (tensor<256xf32, #blocked>) -> f32
62
+ %42 = arith.addf %41, %cst_2 : f32
63
+ %43 = arith.divf %42, %cst_1 : f32
64
+ %44 = arith.addf %43, %cst_0 : f32
65
+ %45 = tt.extern_elementwise %44 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
66
+ %46 = tt.splat %45 : (f32) -> tensor<1xf32, #blocked1>
67
+ %47 = tt.splat %45 : (f32) -> tensor<256xf32, #blocked>
68
+ %48 = arith.mulf %38, %47 : tensor<256xf32, #blocked>
69
+ %49 = arith.mulf %48, %27 : tensor<256xf32, #blocked>
70
+ %50 = tt.splat %arg7 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
71
+ %51 = tt.addptr %50, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
72
+ tt.store %51, %31, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked>
73
+ gpu.barrier
74
+ %52 = tt.addptr %arg0, %0 : !tt.ptr<f32, 1>, i32
75
+ %53 = tt.splat %52 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked1>
76
+ tt.store %53, %46 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32, #blocked1>
77
+ %54 = tt.splat %arg9 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
78
+ %55 = tt.addptr %54, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
79
+ tt.store %55, %49, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked>
80
+ %56 = tt.addptr %arg8, %0 : !tt.ptr<f32, 1>, i32
81
+ %57 = tt.splat %56 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked1>
82
+ tt.store %57, %36 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32, #blocked1>
83
+ tt.return
84
+ }
85
+ }
.triton/dump/c0c34db8f5ff22b79fc971c63187477a/triton_.cubin ADDED
Binary file (17.8 kB). View file
 
.triton/dump/c0c34db8f5ff22b79fc971c63187477a/triton_.ttir ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2d3d4d5d6d7d8d9d10de11de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg7: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg8: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg9: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg10: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg11: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %c256_i32 = arith.constant 256 : i32
4
+ %cst = arith.constant dense<0.000000e+00> : tensor<256xbf16>
5
+ %cst_0 = arith.constant 0.000000e+00 : f32
6
+ %cst_1 = arith.constant 2.560000e+02 : f32
7
+ %cst_2 = arith.constant 9.99999974E-6 : f32
8
+ %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32>
9
+ %cst_4 = arith.constant dense<256> : tensor<256xi32>
10
+ %0 = tt.get_program_id x : i32
11
+ %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
12
+ %2 = arith.cmpi slt, %1, %cst_4 : tensor<256xi32>
13
+ %3 = arith.muli %0, %c256_i32 : i32
14
+ %4 = tt.splat %3 : (i32) -> tensor<256xi32>
15
+ %5 = arith.addi %1, %4 : tensor<256xi32>
16
+ %6 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
17
+ %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
18
+ %8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32>
19
+ %9 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
20
+ %10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
21
+ %11 = tt.load %10, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
22
+ %12 = arith.extf %11 : tensor<256xbf16> to tensor<256xf32>
23
+ %13 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
24
+ %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
25
+ %15 = tt.load %14, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
26
+ %16 = arith.extf %15 : tensor<256xbf16> to tensor<256xf32>
27
+ %17 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
28
+ %18 = tt.addptr %17, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
29
+ %19 = tt.load %18, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
30
+ %20 = arith.extf %19 : tensor<256xbf16> to tensor<256xf32>
31
+ %21 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
32
+ %22 = tt.addptr %21, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
33
+ %23 = tt.load %22, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
34
+ %24 = arith.extf %23 : tensor<256xbf16> to tensor<256xf32>
35
+ %25 = tt.splat %arg6 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
36
+ %26 = tt.addptr %25, %1 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
37
+ %27 = tt.load %26, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32>
38
+ %28 = arith.addf %8, %12 : tensor<256xf32>
39
+ %29 = arith.addf %28, %16 : tensor<256xf32>
40
+ %30 = arith.addf %29, %20 : tensor<256xf32>
41
+ %31 = arith.addf %30, %24 : tensor<256xf32>
42
+ %32 = arith.select %2, %31, %cst_3 : tensor<256xi1>, tensor<256xf32>
43
+ %33 = "tt.reduce"(%32) <{axis = 0 : i32}> ({
44
+ ^bb0(%arg12: f32, %arg13: f32):
45
+ %59 = arith.addf %arg12, %arg13 : f32
46
+ tt.reduce.return %59 : f32
47
+ }) : (tensor<256xf32>) -> f32
48
+ %34 = arith.addf %33, %cst_0 : f32
49
+ %35 = arith.divf %34, %cst_1 : f32
50
+ %36 = tt.splat %35 : (f32) -> tensor<1xf32>
51
+ %37 = tt.splat %35 : (f32) -> tensor<256xf32>
52
+ %38 = arith.subf %31, %37 : tensor<256xf32>
53
+ %39 = arith.mulf %38, %38 : tensor<256xf32>
54
+ %40 = arith.select %2, %39, %cst_3 : tensor<256xi1>, tensor<256xf32>
55
+ %41 = "tt.reduce"(%40) <{axis = 0 : i32}> ({
56
+ ^bb0(%arg12: f32, %arg13: f32):
57
+ %59 = arith.addf %arg12, %arg13 : f32
58
+ tt.reduce.return %59 : f32
59
+ }) : (tensor<256xf32>) -> f32
60
+ %42 = arith.addf %41, %cst_0 : f32
61
+ %43 = arith.divf %42, %cst_1 : f32
62
+ %44 = arith.addf %43, %cst_2 : f32
63
+ %45 = tt.extern_elementwise %44 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
64
+ %46 = tt.splat %45 : (f32) -> tensor<1xf32>
65
+ %47 = tt.splat %45 : (f32) -> tensor<256xf32>
66
+ %48 = arith.mulf %38, %47 : tensor<256xf32>
67
+ %49 = arith.mulf %48, %27 : tensor<256xf32>
68
+ %50 = tt.splat %arg7 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
69
+ %51 = tt.addptr %50, %5 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
70
+ tt.store %51, %31, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32>
71
+ gpu.barrier
72
+ %52 = tt.addptr %arg0, %0 : !tt.ptr<f32, 1>, i32
73
+ %53 = tt.splat %52 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>>
74
+ tt.store %53, %46 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32>
75
+ %54 = tt.splat %arg9 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
76
+ %55 = tt.addptr %54, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
77
+ %56 = arith.truncf %49 : tensor<256xf32> to tensor<256xbf16>
78
+ tt.store %55, %56, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16>
79
+ %57 = tt.addptr %arg8, %0 : !tt.ptr<f32, 1>, i32
80
+ %58 = tt.splat %57 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>>
81
+ tt.store %58, %36 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32>
82
+ tt.return
83
+ }
84
+ }
.triton/dump/c0db4dd81e5aac83500e3ccf67d3896d/triton_.llir ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !5 {
5
+ %4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
6
+ %5 = shl i32 %4, 1, !dbg !8
7
+ %6 = and i32 %5, 510, !dbg !8
8
+ %7 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9
9
+ %8 = shl i32 %7, 9, !dbg !10
10
+ %9 = or i32 %8, %6, !dbg !11
11
+ %10 = sext i32 %9 to i64, !dbg !12
12
+ %11 = getelementptr i16, ptr addrspace(1) %0, i64 %10, !dbg !12
13
+ %12 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %11, i1 true) #1, !dbg !13
14
+ %13 = trunc i32 %12 to i16, !dbg !13
15
+ %extelt.offset = lshr i32 %12, 16, !dbg !13
16
+ %14 = trunc i32 %extelt.offset to i16, !dbg !13
17
+ %15 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %13) #1, !dbg !14
18
+ %16 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %14) #1, !dbg !14
19
+ %17 = getelementptr float, ptr addrspace(1) %1, i64 %10, !dbg !15
20
+ %18 = bitcast float %15 to i32, !dbg !16
21
+ %19 = bitcast float %16 to i32, !dbg !16
22
+ tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %18, i32 %19, ptr addrspace(1) %17, i1 true) #1, !dbg !16
23
+ ret void, !dbg !17
24
+ }
25
+
26
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
27
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
28
+
29
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
30
+ attributes #1 = { nounwind }
31
+
32
+ !llvm.module.flags = !{!0}
33
+ !llvm.dbg.cu = !{!1}
34
+ !nvvm.annotations = !{!3, !4, !4, !3}
35
+
36
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
37
+ !1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
38
+ !2 = !DIFile(filename: "cyamhdbxtmf4rgres6uo7orhfzw3ryhsvm5qzdvyqgggck2hqbyi.py", directory: "/tmp/torchinductor_root/ya")
39
+ !3 = !{ptr @triton__0d1d2de, !"kernel", i32 1}
40
+ !4 = !{ptr @triton__0d1d2de, !"maxntidx", i32 256}
41
+ !5 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
42
+ !6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
43
+ !7 = !{}
44
+ !8 = !DILocation(line: 21, column: 36, scope: !5)
45
+ !9 = !DILocation(line: 20, column: 28, scope: !5)
46
+ !10 = !DILocation(line: 20, column: 33, scope: !5)
47
+ !11 = !DILocation(line: 21, column: 23, scope: !5)
48
+ !12 = !DILocation(line: 24, column: 30, scope: !5)
49
+ !13 = !DILocation(line: 24, column: 35, scope: !5)
50
+ !14 = !DILocation(line: 24, column: 44, scope: !5)
51
+ !15 = !DILocation(line: 26, column: 25, scope: !5)
52
+ !16 = !DILocation(line: 26, column: 36, scope: !5)
53
+ !17 = !DILocation(line: 26, column: 4, scope: !5)
.triton/dump/d7a12c0ba96f8920b8147157303ee99f/triton_.ttir ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %c256_i32 = arith.constant 256 : i32
4
+ %cst = arith.constant dense<0.000000e+00> : tensor<256xbf16>
5
+ %cst_0 = arith.constant 0.000000e+00 : f32
6
+ %cst_1 = arith.constant 2.560000e+02 : f32
7
+ %cst_2 = arith.constant 9.99999974E-6 : f32
8
+ %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32>
9
+ %cst_4 = arith.constant dense<256> : tensor<256xi32>
10
+ %0 = tt.get_program_id x : i32
11
+ %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
12
+ %2 = arith.cmpi slt, %1, %cst_4 : tensor<256xi32>
13
+ %3 = arith.muli %0, %c256_i32 : i32
14
+ %4 = tt.splat %3 : (i32) -> tensor<256xi32>
15
+ %5 = arith.addi %1, %4 : tensor<256xi32>
16
+ %6 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
17
+ %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
18
+ %8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32>
19
+ %9 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
20
+ %10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
21
+ %11 = tt.load %10, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
22
+ %12 = arith.extf %11 : tensor<256xbf16> to tensor<256xf32>
23
+ %13 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
24
+ %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
25
+ %15 = tt.load %14, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
26
+ %16 = arith.extf %15 : tensor<256xbf16> to tensor<256xf32>
27
+ %17 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
28
+ %18 = tt.addptr %17, %1 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
29
+ %19 = tt.load %18, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32>
30
+ %20 = arith.addf %8, %12 : tensor<256xf32>
31
+ %21 = arith.addf %20, %16 : tensor<256xf32>
32
+ %22 = arith.select %2, %21, %cst_3 : tensor<256xi1>, tensor<256xf32>
33
+ %23 = "tt.reduce"(%22) <{axis = 0 : i32}> ({
34
+ ^bb0(%arg7: f32, %arg8: f32):
35
+ %41 = arith.addf %arg7, %arg8 : f32
36
+ tt.reduce.return %41 : f32
37
+ }) : (tensor<256xf32>) -> f32
38
+ %24 = arith.addf %23, %cst_0 : f32
39
+ %25 = arith.divf %24, %cst_1 : f32
40
+ %26 = tt.splat %25 : (f32) -> tensor<256xf32>
41
+ %27 = arith.subf %21, %26 : tensor<256xf32>
42
+ %28 = arith.mulf %27, %27 : tensor<256xf32>
43
+ %29 = arith.select %2, %28, %cst_3 : tensor<256xi1>, tensor<256xf32>
44
+ %30 = "tt.reduce"(%29) <{axis = 0 : i32}> ({
45
+ ^bb0(%arg7: f32, %arg8: f32):
46
+ %41 = arith.addf %arg7, %arg8 : f32
47
+ tt.reduce.return %41 : f32
48
+ }) : (tensor<256xf32>) -> f32
49
+ %31 = arith.addf %30, %cst_0 : f32
50
+ %32 = arith.divf %31, %cst_1 : f32
51
+ %33 = arith.addf %32, %cst_2 : f32
52
+ %34 = tt.extern_elementwise %33 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
53
+ %35 = tt.splat %34 : (f32) -> tensor<256xf32>
54
+ %36 = arith.mulf %27, %35 : tensor<256xf32>
55
+ %37 = arith.mulf %36, %19 : tensor<256xf32>
56
+ %38 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
57
+ %39 = tt.addptr %38, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
58
+ %40 = arith.truncf %37 : tensor<256xf32> to tensor<256xbf16>
59
+ tt.store %39, %40, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16>
60
+ tt.return
61
+ }
62
+ }
.triton/dump/f5088324dcdcf6814f6743553c1321c2/triton_.llir ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !5 {
5
+ %4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
6
+ %5 = shl i32 %4, 2, !dbg !8
7
+ %6 = and i32 %5, 508, !dbg !8
8
+ %7 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9
9
+ %8 = shl i32 %7, 9, !dbg !10
10
+ %9 = or i32 %8, %6, !dbg !11
11
+ %10 = sext i32 %9 to i64, !dbg !12
12
+ %11 = getelementptr float, ptr addrspace(1) %0, i64 %10, !dbg !12
13
+ %12 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l,b"(ptr addrspace(1) %11, i1 true) #1, !dbg !13
14
+ %13 = extractvalue { i32, i32, i32, i32 } %12, 0, !dbg !13
15
+ %14 = extractvalue { i32, i32, i32, i32 } %12, 1, !dbg !13
16
+ %15 = extractvalue { i32, i32, i32, i32 } %12, 2, !dbg !13
17
+ %16 = extractvalue { i32, i32, i32, i32 } %12, 3, !dbg !13
18
+ %17 = bitcast i32 %13 to float, !dbg !13
19
+ %18 = bitcast i32 %14 to float, !dbg !13
20
+ %19 = bitcast i32 %15 to float, !dbg !13
21
+ %20 = bitcast i32 %16 to float, !dbg !13
22
+ %21 = getelementptr i16, ptr addrspace(1) %1, i64 %10, !dbg !14
23
+ %22 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %17) #1, !dbg !15
24
+ %23 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %18) #1, !dbg !15
25
+ %24 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %19) #1, !dbg !15
26
+ %25 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %20) #1, !dbg !15
27
+ %26 = insertelement <2 x i16> undef, i16 %22, i64 0, !dbg !15
28
+ %27 = insertelement <2 x i16> %26, i16 %23, i64 1, !dbg !15
29
+ %28 = bitcast <2 x i16> %27 to i32, !dbg !15
30
+ %29 = insertelement <2 x i16> undef, i16 %24, i64 0, !dbg !15
31
+ %30 = insertelement <2 x i16> %29, i16 %25, i64 1, !dbg !15
32
+ %31 = bitcast <2 x i16> %30 to i32, !dbg !15
33
+ tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %28, i32 %31, ptr addrspace(1) %21, i1 true) #1, !dbg !15
34
+ ret void, !dbg !16
35
+ }
36
+
37
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
38
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
39
+
40
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
41
+ attributes #1 = { nounwind }
42
+
43
+ !llvm.module.flags = !{!0}
44
+ !llvm.dbg.cu = !{!1}
45
+ !nvvm.annotations = !{!3, !4, !4, !3}
46
+
47
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
48
+ !1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
49
+ !2 = !DIFile(filename: "cpqhcwm5bfrhuwddh4c4qks6bh7sovfbpfnmqhnm4h4w23icqnu6.py", directory: "/tmp/torchinductor_root/pq")
50
+ !3 = !{ptr @triton__0d1d2de, !"kernel", i32 1}
51
+ !4 = !{ptr @triton__0d1d2de, !"maxntidx", i32 128}
52
+ !5 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
53
+ !6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
54
+ !7 = !{}
55
+ !8 = !DILocation(line: 21, column: 36, scope: !5)
56
+ !9 = !DILocation(line: 20, column: 28, scope: !5)
57
+ !10 = !DILocation(line: 20, column: 33, scope: !5)
58
+ !11 = !DILocation(line: 21, column: 23, scope: !5)
59
+ !12 = !DILocation(line: 24, column: 30, scope: !5)
60
+ !13 = !DILocation(line: 24, column: 35, scope: !5)
61
+ !14 = !DILocation(line: 26, column: 25, scope: !5)
62
+ !15 = !DILocation(line: 26, column: 36, scope: !5)
63
+ !16 = !DILocation(line: 26, column: 4, scope: !5)
.triton/dump/f5088324dcdcf6814f6743553c1321c2/triton_.ptx ADDED
@@ -0,0 +1,300 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2de
10
+
11
+ .visible .entry triton__0d1d2de(
12
+ .param .u64 triton__0d1d2de_param_0,
13
+ .param .u64 triton__0d1d2de_param_1,
14
+ .param .u32 triton__0d1d2de_param_2
15
+ )
16
+ .maxntid 128, 1, 1
17
+ {
18
+ .reg .pred %p<3>;
19
+ .reg .b16 %rs<5>;
20
+ .reg .b32 %r<19>;
21
+ .reg .b64 %rd<7>;
22
+ .loc 1 18 0
23
+ $L__func_begin0:
24
+ .loc 1 18 0
25
+
26
+ ld.param.u64 %rd3, [triton__0d1d2de_param_0];
27
+ ld.param.u64 %rd4, [triton__0d1d2de_param_1];
28
+ $L__tmp0:
29
+ .loc 1 21 36
30
+ mov.u32 %r12, %tid.x;
31
+ shl.b32 %r13, %r12, 2;
32
+ and.b32 %r14, %r13, 508;
33
+ .loc 1 20 28
34
+ mov.u32 %r1, %ctaid.x;
35
+ .loc 1 20 33
36
+ shl.b32 %r15, %r1, 9;
37
+ .loc 1 21 23
38
+ or.b32 %r16, %r15, %r14;
39
+ .loc 1 24 30
40
+ mul.wide.s32 %rd5, %r16, 4;
41
+ add.s64 %rd1, %rd3, %rd5;
42
+ mov.pred %p1, -1;
43
+ .loc 1 24 35
44
+ mov.u32 %r6, 0x0;
45
+ mov.u32 %r7, 0x0;
46
+ mov.u32 %r8, 0x0;
47
+ mov.u32 %r9, 0x0;
48
+ @%p1 ld.global.v4.b32 { %r6, %r7, %r8, %r9 }, [ %rd1 + 0 ];
49
+ .loc 1 26 25
50
+ mul.wide.s32 %rd6, %r16, 2;
51
+ add.s64 %rd2, %rd4, %rd6;
52
+ .loc 1 26 36
53
+ cvt.rn.bf16.f32 %rs1, %r6;
54
+ cvt.rn.bf16.f32 %rs2, %r7;
55
+ cvt.rn.bf16.f32 %rs3, %r8;
56
+ cvt.rn.bf16.f32 %rs4, %r9;
57
+ mov.b32 %r17, {%rs1, %rs2};
58
+ mov.b32 %r18, {%rs3, %rs4};
59
+ @%p1 st.global.v2.b32 [ %rd2 + 0 ], { %r17, %r18 };
60
+ .loc 1 26 4
61
+ ret;
62
+ $L__tmp1:
63
+ $L__func_end0:
64
+
65
+ }
66
+ .file 1 "/tmp/torchinductor_root/pq/cpqhcwm5bfrhuwddh4c4qks6bh7sovfbpfnmqhnm4h4w23icqnu6.py"
67
+ .section .debug_abbrev
68
+ {
69
+ .b8 1
70
+ .b8 17
71
+ .b8 1
72
+ .b8 37
73
+ .b8 8
74
+ .b8 19
75
+ .b8 5
76
+ .b8 3
77
+ .b8 8
78
+ .b8 16
79
+ .b8 6
80
+ .b8 27
81
+ .b8 8
82
+ .b8 180
83
+ .b8 66
84
+ .b8 12
85
+ .b8 17
86
+ .b8 1
87
+ .b8 18
88
+ .b8 1
89
+ .b8 0
90
+ .b8 0
91
+ .b8 2
92
+ .b8 46
93
+ .b8 0
94
+ .b8 17
95
+ .b8 1
96
+ .b8 18
97
+ .b8 1
98
+ .b8 64
99
+ .b8 10
100
+ .b8 135
101
+ .b8 64
102
+ .b8 8
103
+ .b8 3
104
+ .b8 8
105
+ .b8 58
106
+ .b8 11
107
+ .b8 59
108
+ .b8 11
109
+ .b8 63
110
+ .b8 12
111
+ .b8 0
112
+ .b8 0
113
+ .b8 0
114
+ }
115
+ .section .debug_info
116
+ {
117
+ .b32 176
118
+ .b8 2
119
+ .b8 0
120
+ .b32 .debug_abbrev
121
+ .b8 8
122
+ .b8 1
123
+ .b8 116
124
+ .b8 114
125
+ .b8 105
126
+ .b8 116
127
+ .b8 111
128
+ .b8 110
129
+ .b8 0
130
+ .b8 2
131
+ .b8 0
132
+ .b8 99
133
+ .b8 112
134
+ .b8 113
135
+ .b8 104
136
+ .b8 99
137
+ .b8 119
138
+ .b8 109
139
+ .b8 53
140
+ .b8 98
141
+ .b8 102
142
+ .b8 114
143
+ .b8 104
144
+ .b8 117
145
+ .b8 119
146
+ .b8 100
147
+ .b8 100
148
+ .b8 104
149
+ .b8 52
150
+ .b8 99
151
+ .b8 52
152
+ .b8 113
153
+ .b8 107
154
+ .b8 115
155
+ .b8 54
156
+ .b8 98
157
+ .b8 104
158
+ .b8 55
159
+ .b8 115
160
+ .b8 111
161
+ .b8 118
162
+ .b8 102
163
+ .b8 98
164
+ .b8 112
165
+ .b8 102
166
+ .b8 110
167
+ .b8 109
168
+ .b8 113
169
+ .b8 104
170
+ .b8 110
171
+ .b8 109
172
+ .b8 52
173
+ .b8 104
174
+ .b8 52
175
+ .b8 119
176
+ .b8 50
177
+ .b8 51
178
+ .b8 105
179
+ .b8 99
180
+ .b8 113
181
+ .b8 110
182
+ .b8 117
183
+ .b8 54
184
+ .b8 46
185
+ .b8 112
186
+ .b8 121
187
+ .b8 0
188
+ .b32 .debug_line
189
+ .b8 47
190
+ .b8 116
191
+ .b8 109
192
+ .b8 112
193
+ .b8 47
194
+ .b8 116
195
+ .b8 111
196
+ .b8 114
197
+ .b8 99
198
+ .b8 104
199
+ .b8 105
200
+ .b8 110
201
+ .b8 100
202
+ .b8 117
203
+ .b8 99
204
+ .b8 116
205
+ .b8 111
206
+ .b8 114
207
+ .b8 95
208
+ .b8 114
209
+ .b8 111
210
+ .b8 111
211
+ .b8 116
212
+ .b8 47
213
+ .b8 112
214
+ .b8 113
215
+ .b8 0
216
+ .b8 1
217
+ .b64 $L__func_begin0
218
+ .b64 $L__func_end0
219
+ .b8 2
220
+ .b64 $L__func_begin0
221
+ .b64 $L__func_end0
222
+ .b8 1
223
+ .b8 156
224
+ .b8 116
225
+ .b8 114
226
+ .b8 105
227
+ .b8 116
228
+ .b8 111
229
+ .b8 110
230
+ .b8 95
231
+ .b8 95
232
+ .b8 48
233
+ .b8 100
234
+ .b8 49
235
+ .b8 100
236
+ .b8 50
237
+ .b8 100
238
+ .b8 101
239
+ .b8 0
240
+ .b8 116
241
+ .b8 114
242
+ .b8 105
243
+ .b8 116
244
+ .b8 111
245
+ .b8 110
246
+ .b8 95
247
+ .b8 95
248
+ .b8 48
249
+ .b8 100
250
+ .b8 49
251
+ .b8 100
252
+ .b8 50
253
+ .b8 100
254
+ .b8 101
255
+ .b8 0
256
+ .b8 1
257
+ .b8 18
258
+ .b8 1
259
+ .b8 0
260
+ }
261
+ .section .debug_pubnames
262
+ {
263
+ .b32 $L__pubNames_end0-$L__pubNames_start0
264
+ $L__pubNames_start0:
265
+ .b8 2
266
+ .b8 0
267
+ .b32 .debug_info
268
+ .b32 180
269
+ .b32 125
270
+ .b8 116
271
+ .b8 114
272
+ .b8 105
273
+ .b8 116
274
+ .b8 111
275
+ .b8 110
276
+ .b8 95
277
+ .b8 95
278
+ .b8 48
279
+ .b8 100
280
+ .b8 49
281
+ .b8 100
282
+ .b8 50
283
+ .b8 100
284
+ .b8 101
285
+ .b8 0
286
+ .b32 0
287
+ $L__pubNames_end0:
288
+ }
289
+ .section .debug_pubtypes
290
+ {
291
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
292
+ $L__pubTypes_start0:
293
+ .b8 2
294
+ .b8 0
295
+ .b32 .debug_info
296
+ .b32 180
297
+ .b32 0
298
+ $L__pubTypes_end0:
299
+ }
300
+ .section .debug_loc { }