Add files using upload-large-folder tool
Browse files
This view is limited to 50 files because it contains too many changes.
See raw diff
- .launchpadlib/api.launchpad.net/cache/api.launchpad.net,devel,-application,vnd.sun.wadl+xml,2f09acb494bdefdbf8ef0d1396a05e86 +0 -0
- .local/share/jupyter/nbextensions/printview/main.js +75 -0
- .local/share/jupyter/nbextensions/python-markdown/main.js +212 -0
- .local/share/jupyter/nbextensions/qtconsole/qtconsole.yaml +6 -0
- .local/share/jupyter/nbextensions/rubberband/main.css +12 -0
- .local/share/jupyter/nbextensions/rubberband/rubberband.yaml +7 -0
- .local/share/jupyter/nbextensions/ruler/ruler.yaml +32 -0
- .local/share/jupyter/nbextensions/runtools/main.js +745 -0
- .local/share/jupyter/nbextensions/runtools/runtools_lock.png +0 -0
- .local/share/jupyter/nbextensions/scratchpad/README.md +14 -0
- .local/share/jupyter/nbextensions/skill/README.md +15 -0
- .triton/cache/6e97c2a1f7a095255f6dd5de1807841d/cuda_utils.so +0 -0
- .triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.ptx +807 -0
- .triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.ttir +53 -0
- .triton/dump/305a9479aab997a3a16bfe46bb303a50/triton_.cubin +0 -0
- .triton/dump/305a9479aab997a3a16bfe46bb303a50/triton_.ptx +1041 -0
- .triton/dump/415aac87553b7d064f52694fa7254686/triton_.llir +860 -0
- .triton/dump/44b225411009956bfbae22f8bac7d703/triton_.cubin +0 -0
- .triton/dump/44b225411009956bfbae22f8bac7d703/triton_.llir +304 -0
- .triton/dump/44b225411009956bfbae22f8bac7d703/triton_.ttir +61 -0
- .triton/dump/4c6ad48573c74d55ed79384f6b432d50/triton_.ttir +18 -0
- .triton/dump/4d7b96448927b8146af43cb9f39e0544/triton_.llir +362 -0
- .triton/dump/4d7b96448927b8146af43cb9f39e0544/triton_.ptx +486 -0
- .triton/dump/4d7b96448927b8146af43cb9f39e0544/triton_.ttgir +38 -0
- .triton/dump/4d7b96448927b8146af43cb9f39e0544/triton_.ttir +37 -0
- .triton/dump/645565eaba0a18dd23ef200fe9abb0c0/triton_.cubin +0 -0
- .triton/dump/645565eaba0a18dd23ef200fe9abb0c0/triton_.ptx +834 -0
- .triton/dump/645565eaba0a18dd23ef200fe9abb0c0/triton_.ttgir +98 -0
- .triton/dump/7dc5bb3e5c2bb99527fff34c6fba7810/triton_.llir +42 -0
- .triton/dump/7dc5bb3e5c2bb99527fff34c6fba7810/triton_.ttir +17 -0
- .triton/dump/99f0a4c15ca0aab38ccdae6c765f7333/triton_.cubin +0 -0
- .triton/dump/99f0a4c15ca0aab38ccdae6c765f7333/triton_.llir +333 -0
- .triton/dump/99f0a4c15ca0aab38ccdae6c765f7333/triton_.ttgir +68 -0
- .triton/dump/9a2fb05196b13393bea452d08e9aaca8/triton_.llir +54 -0
- .triton/dump/9a2fb05196b13393bea452d08e9aaca8/triton_.ttgir +19 -0
- .triton/dump/a69784da01a97187168f22847465505f/triton_.cubin +0 -0
- .triton/dump/a69784da01a97187168f22847465505f/triton_.ttgir +73 -0
- .triton/dump/a75e14a8d2d1ec8471f1c7b615552f8c/triton_.cubin +0 -0
- .triton/dump/a75e14a8d2d1ec8471f1c7b615552f8c/triton_.ptx +295 -0
- .triton/dump/a75e14a8d2d1ec8471f1c7b615552f8c/triton_.ttir +18 -0
- .triton/dump/ab89f85e55d5ddd9676325b49df9419f/triton_.ptx +717 -0
- .triton/dump/ab89f85e55d5ddd9676325b49df9419f/triton_.ttgir +68 -0
- .triton/dump/be28ee3793470d9803546f03e49c3edf/triton_.cubin +0 -0
- .triton/dump/be28ee3793470d9803546f03e49c3edf/triton_.ttgir +85 -0
- .triton/dump/c0c34db8f5ff22b79fc971c63187477a/triton_.cubin +0 -0
- .triton/dump/c0c34db8f5ff22b79fc971c63187477a/triton_.ttir +84 -0
- .triton/dump/c0db4dd81e5aac83500e3ccf67d3896d/triton_.llir +53 -0
- .triton/dump/d7a12c0ba96f8920b8147157303ee99f/triton_.ttir +62 -0
- .triton/dump/f5088324dcdcf6814f6743553c1321c2/triton_.llir +63 -0
- .triton/dump/f5088324dcdcf6814f6743553c1321c2/triton_.ptx +300 -0
.launchpadlib/api.launchpad.net/cache/api.launchpad.net,devel,-application,vnd.sun.wadl+xml,2f09acb494bdefdbf8ef0d1396a05e86
ADDED
The diff for this file is too large to render.
See raw diff
|
|
.local/share/jupyter/nbextensions/printview/main.js
ADDED
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// call "jupyter nbconvert" and open generated html file in new tab
|
2 |
+
|
3 |
+
define([
|
4 |
+
'base/js/namespace',
|
5 |
+
'jquery',
|
6 |
+
'base/js/events',
|
7 |
+
'base/js/utils'
|
8 |
+
], function(
|
9 |
+
IPython,
|
10 |
+
$,
|
11 |
+
events,
|
12 |
+
utils
|
13 |
+
) {
|
14 |
+
"use strict";
|
15 |
+
|
16 |
+
var nbconvert_options = '--to html';
|
17 |
+
var extension = '.html';
|
18 |
+
var open_tab = true;
|
19 |
+
|
20 |
+
/**
 * Read the printview options from the notebook configuration.
 *
 * Overrides the module-level defaults `nbconvert_options`, `extension` and
 * `open_tab` when `printview_nbconvert_options` / `printview_open_tab` are
 * present in `IPython.notebook.config.data`. Values of the wrong type are
 * ignored so the defaults stay in effect.
 */
var initialize = function () {
    var data = IPython.notebook.config.data;
    var has = Object.prototype.hasOwnProperty;

    if (has.call(data, 'printview_nbconvert_options') &&
            typeof data.printview_nbconvert_options === 'string') {
        nbconvert_options = data.printview_nbconvert_options;
        // indexOf(...) !== -1 also accepts a match at position 0, which the
        // former `search(...) > 0` comparison silently skipped.
        if (nbconvert_options.indexOf('pdf') !== -1) extension = '.pdf';
        if (nbconvert_options.indexOf('slides') !== -1) extension = '.slides.html';
    }

    if (has.call(data, 'printview_open_tab') &&
            typeof data.printview_open_tab === 'boolean') {
        open_tab = data.printview_open_tab;
    }
};
|
36 |
+
|
37 |
+
/**
 * Run `jupyter nbconvert` on the current notebook through the kernel and,
 * if `open_tab` is set, open the generated file in a new browser tab once
 * the kernel has replied.
 *
 * Registered as a one-shot `notebook_saved.Notebook` handler by
 * nbconvertPrintView.
 */
var callNbconvert = function () {
    // Detach only this handler; calling off() without it would also remove
    // every other listener bound to the same event.
    events.off('notebook_saved.Notebook', callNbconvert);
    var kernel = IPython.notebook.kernel;
    var name = IPython.notebook.notebook_name;

    // Escape the characters that would terminate the double-quoted shell
    // argument. NOTE(review): `$` and backticks are still expanded by POSIX
    // shells; they are left alone because escaping them breaks cmd.exe.
    var quoted_name = '"' + name.replace(/(["\\])/g, '\\$1') + '"';
    var shell_command = 'jupyter nbconvert ' + nbconvert_options + ' ' + quoted_name;
    // Emit the command as a JSON string literal so quotes in the notebook
    // name cannot break out of the generated Python string.
    var command = 'import os; os.system(' + JSON.stringify(shell_command) + ')';

    function callback() {
        if (open_tab === true) {
            // Encode the base name so characters such as '#' or '?' stay
            // part of the path instead of starting a fragment or query.
            var url = encodeURIComponent(utils.splitext(name)[0]) + extension;
            window.open(url, '_blank');
        }
    }
    kernel.execute(command, { shell: { reply : callback } });
    $('#doPrintView').blur();
};
|
55 |
+
|
56 |
+
/**
 * Toolbar handler: save the notebook and run nbconvert once the save has
 * been reported through the `notebook_saved.Notebook` event.
 */
var nbconvertPrintView = function () {
    // Clear a registration left over from an earlier click whose save has
    // not completed yet, so the conversion is triggered only once per save.
    events.off('notebook_saved.Notebook', callNbconvert);
    events.on('notebook_saved.Notebook', callNbconvert);
    IPython.notebook.save_notebook(false);
};
|
60 |
+
|
61 |
+
/**
 * Extension entry point: register the print-view action, add its toolbar
 * button (id `doPrintView`) and read the options once the notebook config
 * has been loaded.
 *
 * @return the promise produced by chaining `initialize` onto config.loaded
 */
var load_ipython_extension = function() {
    var print_action = IPython.keyboard_manager.actions.register({
        help   : 'Create static print view',
        icon   : 'fa-print',
        handler: nbconvertPrintView
    }, 'create-static-printview', 'printview');

    var button_group = IPython.toolbar.add_buttons_group([print_action]);
    $(button_group).find('.btn').attr('id', 'doPrintView');

    return IPython.notebook.config.loaded.then(initialize);
};
|
71 |
+
|
72 |
+
return {
|
73 |
+
load_ipython_extension : load_ipython_extension
|
74 |
+
};
|
75 |
+
});
|
.local/share/jupyter/nbextensions/python-markdown/main.js
ADDED
@@ -0,0 +1,212 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Allow Python-code in markdown cells
|
2 |
+
// Encapsulate using {{...}}
|
3 |
+
// - You can also return html or markdown from your Python code
|
4 |
+
// - You can embed images, however they will be sanitized on reload.
|
5 |
+
|
6 |
+
// TODO: Markdown cells will only be reevaluated when a notebook is dirty
|
7 |
+
// (i.e. you have made changes). If you save it before reevaluating MD cells,
|
8 |
+
// they will show the old value.
|
9 |
+
|
10 |
+
define([
|
11 |
+
'base/js/namespace',
|
12 |
+
'jquery',
|
13 |
+
'require',
|
14 |
+
'notebook/js/cell',
|
15 |
+
'base/js/security',
|
16 |
+
'components/marked/lib/marked',
|
17 |
+
'base/js/events',
|
18 |
+
'notebook/js/textcell'
|
19 |
+
], function(IPython, $, requirejs, cell, security, marked, events, textcell) {
|
20 |
+
"use strict";
|
21 |
+
|
22 |
+
/*
 * Find Python expressions enclosed in {{ }}, execute them and add the
 * results to the text as <span> tags. The actual content gets filled in
 * later by a kernel callback. Already executed expressions are cached in
 * cell.metadata.variables, keyed by the expression text.
 *
 * Placeholders that cannot be evaluated (empty `{{}}`, or no kernel
 * attached) are left in the text unchanged.
 *
 * @method execute_python
 * @param cell {Cell} notebook cell
 * @param text {String} text in cell
 * @return {String|undefined} text with placeholders replaced, or undefined
 *         when the notebook is untrusted or the text has no {{...}} tag
 */
var execute_python = function(cell, text) {
    /* never execute code in untrusted notebooks */
    if (IPython.notebook.trusted === false) {
        return undefined;
    }
    /* always clear stored variables if notebook is dirty */
    if (IPython.notebook.dirty === true) delete cell.metadata.variables;

    // Strip the <p>...</p> wrapper that marked may add and the quotes
    // around string results. [\s\S] is used to also catch newlines.
    var unwrap = function(html) {
        var paragraph = html.match(/^\s*<p>([\s\S]*?)<\/p>\s*$/);
        if (paragraph !== null) html = paragraph[1];
        var quoted = html.match(/^'([\s\S]*?)'$/);
        return quoted !== null ? quoted[1] : html;
    };

    // Convert a mime bundle into HTML, or undefined when the bundle holds
    // none of the supported mime types.
    var bundle_to_html = function(data) {
        if (data['text/latex'] != null) {
            return data['text/latex'];
        }
        if (data['image/svg+xml'] != null) {
            /* embed SVG in an <img> tag, still get eaten by sanitizer... */
            // btoa only accepts Latin-1 input, so encode as UTF-8 bytes first
            var svg = btoa(unescape(encodeURIComponent(data['image/svg+xml'])));
            return '<img src="data:image/svg+xml;base64,' + svg + '"/>';
        }
        if (data['image/jpeg'] != null) {
            return '<img src="data:image/jpeg;base64,' + data['image/jpeg'] + '"/>';
        }
        if (data['image/png'] != null) {
            return '<img src="data:image/png;base64,' + data['image/png'] + '"/>';
        }
        if (data['text/markdown'] != null) {
            return marked(data['text/markdown']);
        }
        if (data['text/html'] != null) {
            return data['text/html'];
        }
        if (data['text/plain'] != null) {
            return unwrap(marked(data['text/plain']));
        }
        return undefined;
    };

    // search for code in double curly braces: {{}}
    var found = false;
    var newtext = text.replace(/{{(.*?)}}/g, function(match, tag, offset) {
        found = true;
        // A replace callback's return value is stringified, so returning
        // undefined would insert the text "undefined"; keep the tag instead.
        if (tag === "") return match;
        var code = tag;
        var id = 'python_' + cell.cell_id + '_' + offset; /* create an individual ID */
        var thiscell = cell;
        var thismatch = tag;

        /* there are two possible options:
           a) notebook dirty or variable not stored in metadata: evaluate variable
           b) notebook clean and variable stored in metadata: display stored value
         */
        if (typeof cell.metadata.variables === "undefined") {
            cell.metadata.variables = {};
        }
        var val = cell.metadata.variables[thismatch];
        if (IPython.notebook.dirty === true || val === undefined || $.isEmptyObject(val)) {
            cell.metadata.variables[thismatch] = {};
            var execute_callback = function (out_data) {
                var html;
                if (out_data.msg_type === "error") {
                    html = marked("**" + out_data.content.ename + "**: " + out_data.content.evalue);
                } else if (out_data.msg_type === "stream") {
                    html = unwrap(marked(out_data.content.text));
                } else if (out_data.msg_type === "execute_result" || out_data.msg_type === "display_data") {
                    var bundle = out_data.content.data;
                    if (bundle != null) html = bundle_to_html(bundle);
                } else {
                    return;
                }
                // nothing displayable: neither cache nor render "undefined"
                if (html === undefined) return;
                thiscell.metadata.variables[thismatch] = html;
                // the placeholder may be gone if the cell was rendered again
                // before this output arrived
                var el = document.getElementById(id);
                if (el !== null) {
                    el.innerHTML = el.innerHTML + html; // output result
                }
            };
            var callbacks = { iopub : { output: execute_callback } };
            if (cell.notebook.kernel != null) {
                cell.notebook.kernel.execute(code, callbacks, {silent: false, store_history : false, stop_on_error: false });
                return "<span id='" + id + "'></span>"; // add HTML tag with ID where output will be placed
            }
            // no kernel to ask: leave the expression visible as written
            return match;
        }
        /* Notebook not dirty: replace tags with metadata */
        return "<span id='" + id + "'>" + val + "</span>";
    });
    if (found === true) return newtext;
    return undefined;
};
|
121 |
+
|
122 |
+
/*
 * Post-process a rendered markdown cell: pass the rendered HTML through
 * execute_python and, when it returns replacement text, write that text
 * back and queue a MathJax typeset of the element.
 *
 * @param cell {Cell} markdown cell whose rendered output is processed
 */
var render_cell = function(cell) {
    var rendered = cell.element.find('div.text_cell_render')[0];
    var replaced = execute_python(cell, rendered.innerHTML);
    if (replaced === undefined) {
        return;
    }
    rendered.innerHTML = replaced;
    MathJax.Hub.Queue(["Typeset", MathJax.Hub, rendered]);
};
|
134 |
+
|
135 |
+
/* Force rendering of markdown cells while the notebook is dirty: the
   wrapper clears the cell's `rendered` flag before delegating to the
   original MarkdownCell render method. */
var original_render = textcell.MarkdownCell.prototype.render;
textcell.MarkdownCell.prototype.render = function() {
    var notebook_is_dirty = IPython.notebook.dirty === true;
    if (notebook_is_dirty) {
        this.rendered = false;
    }
    return original_render.call(this);
};
|
143 |
+
|
144 |
+
/**
 * Update the `.notebook-trusted` indicator (tooltip and icon class) to
 * reflect the current value of IPython.notebook.trusted.
 */
var set_trusted_indicator = function() {
    var indicator = $('.notebook-trusted');
    var trusted = IPython.notebook.trusted === true;

    indicator.attr('title', trusted ? 'Notebook is trusted' : 'Notebook is not trusted');
    indicator.removeClass(trusted ? 'fa-question' : 'fa-check');
    indicator.addClass(trusted ? 'fa-check' : 'fa-question');
};
|
156 |
+
|
157 |
+
|
158 |
+
/**
 * Append a stylesheet <link> to the document head.
 *
 * @param name filename, resolved to a URL through requirejs.toUrl
 */
var load_css = function (name) {
    var head = document.getElementsByTagName("head")[0];
    var stylesheet = document.createElement("link");
    stylesheet.href = requirejs.toUrl(name);
    stylesheet.rel = "stylesheet";
    stylesheet.type = "text/css";
    head.appendChild(stylesheet);
};
|
170 |
+
|
171 |
+
|
172 |
+
/**
 * Re-render every cell that carries a `variables` entry in its metadata,
 * in notebook order.
 */
var update_md_cells = function () {
    var total = IPython.notebook.ncells();
    var all_cells = IPython.notebook.get_cells();
    var idx = 0;
    while (idx < total) {
        var current = all_cells[idx];
        idx++;
        if (!current.metadata.hasOwnProperty('variables')) {
            continue;
        }
        render_cell(current);
    }
};
|
186 |
+
|
187 |
+
/**
 * Extension entry point: load the stylesheet, hook markdown rendering and
 * trust changes, add the trust indicator next to the save widget and show
 * cached values once kernel and notebook are available.
 */
var load_ipython_extension = function() {
    load_css('./main.css');

    var on_markdown_rendered = function (event, data) {
        render_cell(data.cell);
    };
    events.on("rendered.MarkdownCell", on_markdown_rendered);
    events.on("trust_changed.Notebook", set_trusted_indicator);

    $('#save_widget').append('<i id="notebook-trusted-indicator" class="fa fa-question notebook-trusted" />');
    set_trusted_indicator();

    /* Show values stored in metadata on reload */
    var on_kernel_ready = function () {
        var notebook_ready = Jupyter.notebook !== undefined && Jupyter.notebook._fully_loaded;
        if (notebook_ready) {
            update_md_cells();
            return;
        }
        // notebook still loading: wait for it before touching the cells
        events.on("notebook_loaded.Notebook", function () {
            update_md_cells();
        });
    };
    events.on("kernel_ready.Kernel", on_kernel_ready);
};
|
208 |
+
|
209 |
+
return {
|
210 |
+
load_ipython_extension : load_ipython_extension
|
211 |
+
};
|
212 |
+
});
|
.local/share/jupyter/nbextensions/qtconsole/qtconsole.yaml
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Type: IPython Notebook Extension
|
2 |
+
Name: Launch QTConsole
|
3 |
+
Link: README.md
|
4 |
+
Description: Launch a QTConsole attached to the running kernel
|
5 |
+
Main: qtconsole.js
|
6 |
+
Compatibility: 4.x
|
.local/share/jupyter/nbextensions/rubberband/main.css
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
.highlight-drag
|
2 |
+
{
|
3 |
+
background-color: transparent;
|
4 |
+
border: dashed #ff3333 3px;
|
5 |
+
position: absolute;
|
6 |
+
display: none;
|
7 |
+
}
|
8 |
+
|
9 |
+
.cell.selected
|
10 |
+
{
|
11 |
+
background-color: #fcfcfc;
|
12 |
+
}
|
.local/share/jupyter/nbextensions/rubberband/rubberband.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Type: IPython Notebook Extension
|
2 |
+
Name: Rubberband
|
3 |
+
Description: The rubberband extension allows selecting multiple cells
|
4 |
+
Link: readme.md
|
5 |
+
Icon: icon.png
|
6 |
+
Main: main.js
|
7 |
+
Compatibility: 4.x, 5.x
|
.local/share/jupyter/nbextensions/ruler/ruler.yaml
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Type: IPython Notebook Extension
|
2 |
+
Name: Ruler
|
3 |
+
Description: This extension enables the Ruler CodeMirror feature
|
4 |
+
Link: readme.md
|
5 |
+
Icon: icon.png
|
6 |
+
Main: main.js
|
7 |
+
Compatibility: 4.x, 5.x
|
8 |
+
Parameters:
|
9 |
+
|
10 |
+
- name: ruler_column
|
11 |
+
input_type: list
|
12 |
+
list_element:
|
13 |
+
input_type: number
|
14 |
+
description: Column where ruler is displayed
|
15 |
+
default: [78]
|
16 |
+
|
17 |
+
- name: ruler_color
|
18 |
+
input_type: list
|
19 |
+
list_element:
|
20 |
+
input_type: color
|
21 |
+
description: Ruler color
|
22 |
+
default: ["#ff0000"]
|
23 |
+
|
24 |
+
- name: ruler_linestyle
|
25 |
+
description: 'Ruler style, e.g. solid, dashed'
|
26 |
+
input_type: list
|
27 |
+
default: ['dashed']
|
28 |
+
|
29 |
+
- name: ruler_do_css_patch
|
30 |
+
description: apply css patch for ruler padding bug in notebook >= 4.3
|
31 |
+
input_type: checkbox
|
32 |
+
default: true
|
.local/share/jupyter/nbextensions/runtools/main.js
ADDED
@@ -0,0 +1,745 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Extended code execution commands and more
|
2 |
+
|
3 |
+
define([
|
4 |
+
'base/js/namespace',
|
5 |
+
'jquery',
|
6 |
+
'require',
|
7 |
+
'base/js/events',
|
8 |
+
'services/config',
|
9 |
+
'base/js/utils',
|
10 |
+
'notebook/js/codecell'
|
11 |
+
], function(Jupyter, $, requirejs, events, configmod, utils, codecell) {
|
12 |
+
"use strict";
|
13 |
+
|
14 |
+
var run_list = []; /* list of cells to be run */
|
15 |
+
|
16 |
+
// define default config parameter values
|
17 |
+
var params = {
|
18 |
+
run_cells_above: 'Alt-a',
|
19 |
+
run_cells_below: 'Alt-b',
|
20 |
+
toggle_marker: 'Alt-t',
|
21 |
+
mark_all_codecells: 'Alt-m',
|
22 |
+
unmark_all_codecells: 'Alt-u',
|
23 |
+
run_marked_cells: 'Alt-r',
|
24 |
+
run_all_cells: 'Alt-x',
|
25 |
+
run_all_cells_ignore_errors: 'Alt-f',
|
26 |
+
stop_execution: 'Ctrl-c',
|
27 |
+
marked_color: '#20f224',
|
28 |
+
scheduled_color: '#00def0',
|
29 |
+
run_color: '#f30a2d'
|
30 |
+
};
|
31 |
+
|
32 |
+
/**
 * Attach the gutter click handler to every code cell and colour the
 * gutter of cells that are already marked.
 */
function add_gutter_events() {
    var total = Jupyter.notebook.ncells();
    var all_cells = Jupyter.notebook.get_cells();
    for (var idx = 0; idx < total; idx++) {
        var current = all_cells[idx];
        if (current.cell_type !== "code") {
            continue;
        }
        current.code_mirror.on("gutterClick", changeEvent);
        if (!is_marked(current)) {
            continue;
        }
        var gutter = current.code_mirror.getGutterElement();
        $(gutter).css({ "background-color": params.marked_color });
    }
}
|
52 |
+
|
53 |
+
/*
 * Initialize toolbar and gutter after config was loaded: merge the user
 * configuration into `params`, hook the cell gutters, add the toolbar
 * toggle button, register the keyboard shortcuts for command and edit
 * mode, and listen for finished cell executions.
 */
function initialize() {
    $.extend(true, params, Jupyter.notebook.config.data.runtools);

    add_gutter_events();

    /* Add run control buttons to toolbar */
    var toggle_action = Jupyter.keyboard_manager.actions.register({
        help: 'Toggle Runtools Toolbar',
        icon: 'fa-cogs',
        handler: toggle_toolbar
    }, 'toggle-runtools-toolbar', 'runtools');
    $(Jupyter.toolbar.add_buttons_group([toggle_action]))
        .find('.btn')
        .attr('id', 'toggle_runtools')
        .css({ 'outline': 'none' });

    /* Add keyboard shortcuts */
    // One row per shortcut: [params key, help text, help index, action].
    // Actions are wrapped so the callee is looked up only when the key is pressed.
    var shortcut_table = [
        ['run_cells_above', 'Run cells above', 'xa', function() { execute_cells_above(); }],
        ['run_cells_below', 'Run cells below', 'aa', function() { execute_cells_below(); }],
        ['toggle_marker', 'Toggle marker', 'mt', function() { toggle_marker(); }],
        ['mark_all_codecells', 'Mark all codecells', 'ma', function() { mark_all(); }],
        ['unmark_all_codecells', 'Unmark all codecells', 'mu', function() { mark_none(); }],
        ['run_marked_cells', 'Run marked cells', 'rm', function() { run_marked_cells(); }],
        ['run_all_cells', 'Run all cells', 'ra', function() {
            // restore the scroll position after the run has been started
            var pos = Jupyter.notebook.element.scrollTop();
            execute_all_cells();
            Jupyter.notebook.element.animate({
                scrollTop: pos
            }, 100);
        }],
        ['run_all_cells_ignore_errors', 'Run all cells - ignore errors', 'rf', function() { run_all_cells_ignore_errors(); }]
    ];

    var add_command_shortcuts = {};
    shortcut_table.forEach(function(row) {
        var action = row[3];
        add_command_shortcuts[params[row[0]]] = {
            help: row[1],
            help_index: row[2],
            handler: function() {
                action();
                return false;
            }
        };
    });
    Jupyter.keyboard_manager.command_shortcuts.add_shortcuts(add_command_shortcuts);
    Jupyter.keyboard_manager.edit_shortcuts.add_shortcuts(add_command_shortcuts);

    events.on('finished_execute.CodeCell', finished_execute_event);
}
|
147 |
+
|
148 |
+
/**
 * Hide or show the input or output area of a cell and record the choice
 * in the cell metadata (`hide_input` / `hide_output`).
 *
 * @param cell
 * @param io 'i' for cell input, anything else selects the cell output
 * @param showme {Boolean} show (true) or hide (any other value) the area
 */
function showCell(cell, io, showme) {
    var is_input = (io === 'i');
    var area = cell.element.find(is_input ? "div.input" : 'div.output');
    var flag = is_input ? 'hide_input' : 'hide_output';

    if (showme === true) {
        area.show();
        cell.metadata[flag] = false;
        return;
    }
    area.hide();
    cell.metadata[flag] = true;
}
|
174 |
+
|
175 |
+
/**
 * Apply showCell to every marked cell of the notebook.
 *
 * @param show {Boolean} forwarded to showCell as `showme`
 * @param char forwarded to showCell as `io` ('i' input, 'o' output)
 */
function _show_input_output_of_marked(show, char) {
    Jupyter.notebook.get_cells().forEach(function(candidate) {
        if (is_marked(candidate)) {
            showCell(candidate, char, show);
        }
    });
}
|
184 |
+
|
185 |
+
/**
 * Hide or show the input area of all marked cells.
 *
 * @param show {Boolean} show (true) or hide (false) the input areas
 */
function show_input(show) {
    var INPUT_AREA = 'i';
    _show_input_output_of_marked(show, INPUT_AREA);
}
|
193 |
+
|
194 |
+
/**
 * Hide or show the output area of all marked cells.
 *
 * @param {Boolean} show show (true) or hide (false) the output areas
 */
function show_output(show) {
    var OUTPUT_AREA = 'o';
    _show_input_output_of_marked(show, OUTPUT_AREA);
}
|
202 |
+
|
203 |
+
|
204 |
+
/**
 * Execute the next cell of run_list that is still part of the notebook
 * and still marked. Entries failing either check are dropped from the
 * list; the executed cell gets the `run_color` gutter background.
 */
function execute_next_marked_cell() {
    var notebook_cells = Jupyter.notebook.get_cells();
    while (run_list.length > 0) {
        var candidate = run_list.shift();
        // skip cells that were deleted after being scheduled
        if (notebook_cells.indexOf(candidate) === -1) {
            continue;
        }
        var control = candidate.metadata.run_control;
        if (control === undefined || control.marked !== true) {
            continue;
        }
        var gutter = candidate.code_mirror.getGutterElement();
        $(gutter).css({ "background-color": params.run_color });
        candidate.execute();
        return;
    }
}
|
227 |
+
|
228 |
+
/**
 * Execute the cells in [idx_start, idx_end) without changing the current
 * selection (notebook.execute_cells alters selection, this doesn't).
 *
 * @param idx_start first index to run, defaults to 0 when undefined
 * @param idx_end index to stop before, defaults to the number of cells
 * @param stop_on_error passed through to each cell's execute()
 */
function _execute_without_selecting(idx_start, idx_end, stop_on_error) {
    var all_cells = Jupyter.notebook.get_cells();
    var first = (idx_start === undefined) ? 0 : idx_start;
    var limit = (idx_end === undefined) ? all_cells.length : idx_end;
    for (var pos = first; pos < limit; pos++) {
        all_cells[pos].execute(stop_on_error);
    }
}
|
237 |
+
|
238 |
+
/**
 * Run all cells located before the selected cell.
 */
function execute_cells_above() {
    var selected = Jupyter.notebook.get_selected_index();
    _execute_without_selecting(0, selected);
}
|
241 |
+
|
242 |
+
/**
 * Run the selected cell and all cells after it.
 */
function execute_cells_below() {
    var selected = Jupyter.notebook.get_selected_index();
    _execute_without_selecting(selected, undefined);
}
|
245 |
+
|
246 |
+
/**
 * Run every cell of the notebook.
 *
 * @param stop_on_error passed through to each cell's execute()
 */
function execute_all_cells(stop_on_error) {
    var FIRST_CELL = 0;
    _execute_without_selecting(FIRST_CELL, undefined, stop_on_error);
}
|
249 |
+
|
250 |
+
/**
 * Run code cells marked in metadata.
 *
 * Rebuilds run_list from all non-empty, marked code cells, shows them as
 * scheduled with the `scheduled_color` gutter background, then starts the
 * first one.
 */
function run_marked_cells() {
    var notebook_cells = Jupyter.notebook.get_cells();
    run_list = [];
    notebook_cells.forEach(function(candidate) {
        if (!(candidate instanceof codecell.CodeCell)) {
            return;
        }
        var final_line = candidate.code_mirror.lastLine();
        // a cell counts as empty when its only line is the empty string
        var is_blank = (final_line === 0 && candidate.code_mirror.getLine(final_line) === "");
        if (candidate.metadata.run_control === undefined || is_blank) {
            return;
        }
        if (candidate.metadata.run_control.marked !== true) {
            return;
        }
        var gutter = candidate.code_mirror.getGutterElement();
        $(gutter).css({ "background-color": params.scheduled_color });
        run_list.push(candidate);
    });
    execute_next_marked_cell();
}
|
277 |
+
|
278 |
+
/*
 * Handler for 'finished_execute.CodeCell': restore the gutter colour of
 * the finished cell and start the next cell in run_list.
 * @param evt Event
 * @param data event payload; data.cell is the cell that has finished executing
 */
var finished_execute_event = function(evt, data) {
    var finished = data.cell;
    /* Reset gutter color to non-queued state */
    if (is_marked(finished)) {
        $(finished.code_mirror.getGutterElement()).css({
            "background-color": params.marked_color
        });
    }
    execute_next_marked_cell();
};
|
294 |
+
|
295 |
+
/**
 * Set or toggle the run marker of a code cell and colour its gutter
 * accordingly. Cells that are not CodeCell instances are ignored.
 *
 * @param cell cell to update
 * @param value false unmarks the cell, undefined toggles the current
 *              state, any other value marks it
 */
function setCell(cell, value) {
    if (!(cell instanceof codecell.CodeCell)) return;

    if (cell.metadata.run_control === undefined) cell.metadata.run_control = {};
    var control = cell.metadata.run_control;
    if (control.marked === undefined) control.marked = false;

    var requested = (value === undefined) ? !control.marked : value;
    var gutter = cell.code_mirror.getGutterElement();
    var mark = (requested !== false);

    control.marked = mark;
    $(gutter).css({
        "background-color": mark ? params.marked_color : ""
    });
}
|
318 |
+
|
319 |
+
function setCellsMarked(cells, value) {
|
320 |
+
var ncells = cells.length;
|
321 |
+
for (var i = 0; i < ncells; i++) {
|
322 |
+
setCell(cells[i], value);
|
323 |
+
}
|
324 |
+
}
|
325 |
+
|
326 |
+
/**
|
327 |
+
* Toggle code cell marker
|
328 |
+
*/
|
329 |
+
function toggle_marker() {
|
330 |
+
setCellsMarked(Jupyter.notebook.get_selected_cells(), undefined);
|
331 |
+
}
|
332 |
+
|
333 |
+
/**
|
334 |
+
*
|
335 |
+
*/
|
336 |
+
function mark_all() {
|
337 |
+
setCellsMarked(Jupyter.notebook.get_cells(), true);
|
338 |
+
}
|
339 |
+
|
340 |
+
/**
|
341 |
+
*
|
342 |
+
*/
|
343 |
+
function mark_none() {
|
344 |
+
setCellsMarked(Jupyter.notebook.get_cells(), false);
|
345 |
+
}
|
346 |
+
|
347 |
+
/**
|
348 |
+
*
|
349 |
+
* @param cell notebook cell instance
|
350 |
+
* @param state {string} state to be displayed [ '', 'locked', 'executed', 'modified' ]
|
351 |
+
*/
|
352 |
+
function set_cell_state(cell, state) {
|
353 |
+
var icon = "";
|
354 |
+
if (state === 'locked') {
|
355 |
+
icon = '<div class="fa fa-lock" style="font-size:70%;" /div>'
|
356 |
+
}
|
357 |
+
cell.code_mirror.setGutterMarker(0, "CodeMirror-cellstate", celltypeMarker(icon))
|
358 |
+
}
|
359 |
+
|
360 |
+
/**
|
361 |
+
* Change event to mark/unmark cell
|
362 |
+
*
|
363 |
+
* @param cm codemirror instance
|
364 |
+
* @param line current line
|
365 |
+
* @param gutter not used
|
366 |
+
*/
|
367 |
+
function changeEvent(cm, line, gutter) {
|
368 |
+
if (gutter === "CodeMirror-foldgutter") return; /* Don't collide with codefolding extension */
|
369 |
+
|
370 |
+
var cmline = cm.doc.children[0].lines[line];
|
371 |
+
if (cmline === undefined) {
|
372 |
+
return;
|
373 |
+
}
|
374 |
+
var cell = $(cm.display.gutters).closest('.cell').data('cell');
|
375 |
+
if (cell.metadata.run_control === undefined)
|
376 |
+
cell.metadata.run_control = {};
|
377 |
+
setCell(cell, !cell.metadata.run_control.marked);
|
378 |
+
}
|
379 |
+
|
380 |
+
/**
|
381 |
+
*
|
382 |
+
* @param cell cell to be tested
|
383 |
+
* @returns {boolean} true if marked
|
384 |
+
*/
|
385 |
+
var is_marked = function(cell) {
|
386 |
+
return (cell instanceof codecell.CodeCell) &&
|
387 |
+
cell.metadata.run_control !== undefined &&
|
388 |
+
cell.metadata.run_control.marked;
|
389 |
+
};
|
390 |
+
|
391 |
+
/**
|
392 |
+
* Return div element to set in cellstate gutter
|
393 |
+
*
|
394 |
+
* @param val HTML string
|
395 |
+
* @returns {Element} div Element
|
396 |
+
*/
|
397 |
+
function celltypeMarker(val) {
|
398 |
+
var marker = document.createElement("div");
|
399 |
+
marker.style.color = "#822";
|
400 |
+
marker.innerHTML = val;
|
401 |
+
return marker;
|
402 |
+
}
|
403 |
+
|
404 |
+
/**
|
405 |
+
* Lock/Unlock marked code cells
|
406 |
+
* if (cell.metadata.run_control != undefined && cell.metadata.run_control.read_only) {
|
407 |
+
* cell.code_mirror.setOption('readOnly', cell.metadata.run_control.read_only);
|
408 |
+
*/
|
409 |
+
var lock_cell = function(locked) {
|
410 |
+
var ncells = Jupyter.notebook.ncells();
|
411 |
+
for (var i = ncells - 2; i >= 0; i--) {
|
412 |
+
var cells = Jupyter.notebook.get_cells();
|
413 |
+
if ((cells[i].cell_type === "code") && is_marked(cells[i])) {
|
414 |
+
if (locked === true) {
|
415 |
+
cells[i].metadata.editable = false;
|
416 |
+
set_cell_state(cells[i], 'locked')
|
417 |
+
} else {
|
418 |
+
cells[i].metadata.editable = true;
|
419 |
+
set_cell_state(cells[i], '')
|
420 |
+
}
|
421 |
+
}
|
422 |
+
}
|
423 |
+
};
|
424 |
+
|
425 |
+
/**
|
426 |
+
* Execute all cells and don't stop on errors
|
427 |
+
*
|
428 |
+
*/
|
429 |
+
var run_all_cells_ignore_errors = function() {
|
430 |
+
execute_all_cells(false);
|
431 |
+
};
|
432 |
+
|
433 |
+
/**
|
434 |
+
* Create floating toolbar
|
435 |
+
*
|
436 |
+
*/
|
437 |
+
var create_runtools_div = function() {
|
438 |
+
var btn = '<div class="btn-toolbar">\
|
439 |
+
<div class="btn-group">\
|
440 |
+
<button type="button" id="run_c" class="btn-primary fa fa-step-forward" title="Run current cell"></button>\
|
441 |
+
<button type="button" id="run_ca" class="btn-primary fa icon-run-to" title="' +
|
442 |
+
'Run cells above (' + params["run_cells_above"] + ')"</button>\
|
443 |
+
<button type="button" id="run_cb" class="btn-primary fa icon-run-from" title="' +
|
444 |
+
'Run cells below (' + params["run_cells_below"] + ')"</button>\
|
445 |
+
<button type="button" id="run_a" class="btn-primary fa icon-run-all" title="' +
|
446 |
+
'Run all cells (' + params["run_all_cells"] + ')"</button>\
|
447 |
+
<button type="button" id="run_af" class="btn-primary fa icon-run-all-forced" title="' +
|
448 |
+
'Run all - ignore errors (' + params["run_all_cells_ignore_errors"] + ')"</button>\
|
449 |
+
<button type="button" id="run_m" class="btn-primary fa icon-run-marked" title="' +
|
450 |
+
'Run marked codecells (' + params["run_marked_cells"] + ')"</button>\
|
451 |
+
<button type="button" id="interrupt_b" class="btn-primary fa fa-stop" title="' +
|
452 |
+
'Stop execution (' + params["stop_execution"] + ')"</button>\
|
453 |
+
</div>\
|
454 |
+
<div class="btn-group">\
|
455 |
+
<button type="button" id="mark_toggle" class="btn-primary fa icon-mark-toggle" title="Mark single code cell"></button>\
|
456 |
+
<button type="button" id="mark_all" class="btn-primary fa icon-mark-all" title="Mark all code cells"></button>\
|
457 |
+
<button type="button" id="mark_none" class="btn-primary fa icon-mark-none" title="Unmark all code cells"></button>\
|
458 |
+
</div>\
|
459 |
+
<div class="btn-group">\
|
460 |
+
<button type="button" id="show_input" class="btn-primary fa icon-show-input" title="Show input of code cell"></button>\
|
461 |
+
<button type="button" id="hide_input" class="btn-primary fa icon-hide-input" title="Hide input of code cell"></button>\
|
462 |
+
<button type="button" id="show_output" class="btn-primary fa icon-show-output" title="Show output of code cell"></button>\
|
463 |
+
<button type="button" id="hide_output" class="btn-primary fa icon-hide-output" title="Hide output of code cell"></button>\
|
464 |
+
<button type="button" id="lock_marked" class="btn-primary fa fa-lock" title="Lock marked cells"></button>\
|
465 |
+
<button type="button" id="unlock_marked" class="btn-primary fa fa-unlock" title="Unlock marked cells"></button>\
|
466 |
+
</div>\
|
467 |
+
</div>';
|
468 |
+
|
469 |
+
var runtools_wrapper = $('<div id="runtools-wrapper">')
|
470 |
+
.text("Runtools")
|
471 |
+
.append(btn)
|
472 |
+
.draggable()
|
473 |
+
.append("</div>");
|
474 |
+
|
475 |
+
$("#header").append(runtools_wrapper);
|
476 |
+
$("#runtools-wrapper").css({
|
477 |
+
'position': 'absolute'
|
478 |
+
});
|
479 |
+
$('#run_c').on('click', function(e) {
|
480 |
+
var idx = Jupyter.notebook.get_selected_index();
|
481 |
+
_execute_without_selecting(idx, idx + 1);
|
482 |
+
e.target.blur();
|
483 |
+
})
|
484 |
+
.tooltip({
|
485 |
+
delay: {
|
486 |
+
show: 500,
|
487 |
+
hide: 100
|
488 |
+
}
|
489 |
+
});
|
490 |
+
$('#run_ca').on('click', function(e) {
|
491 |
+
execute_cells_above();
|
492 |
+
e.target.blur();
|
493 |
+
})
|
494 |
+
.tooltip({
|
495 |
+
delay: {
|
496 |
+
show: 500,
|
497 |
+
hide: 100
|
498 |
+
}
|
499 |
+
});
|
500 |
+
$('#run_cb').on('click', function(e) {
|
501 |
+
execute_cells_below();
|
502 |
+
e.target.blur();
|
503 |
+
})
|
504 |
+
.tooltip({
|
505 |
+
delay: {
|
506 |
+
show: 500,
|
507 |
+
hide: 100
|
508 |
+
}
|
509 |
+
});
|
510 |
+
$('#run_a').on('click', function(e) {
|
511 |
+
execute_all_cells();
|
512 |
+
e.target.blur();
|
513 |
+
})
|
514 |
+
.tooltip({
|
515 |
+
delay: {
|
516 |
+
show: 500,
|
517 |
+
hide: 100
|
518 |
+
}
|
519 |
+
});
|
520 |
+
$('#run_af').on('click', function(e) {
|
521 |
+
run_all_cells_ignore_errors();
|
522 |
+
e.target.blur()
|
523 |
+
})
|
524 |
+
.tooltip({
|
525 |
+
delay: {
|
526 |
+
show: 500,
|
527 |
+
hide: 100
|
528 |
+
}
|
529 |
+
});
|
530 |
+
$('#run_m').on('click', function(e) {
|
531 |
+
run_marked_cells();
|
532 |
+
e.target.blur()
|
533 |
+
})
|
534 |
+
.tooltip({
|
535 |
+
delay: {
|
536 |
+
show: 500,
|
537 |
+
hide: 100
|
538 |
+
}
|
539 |
+
});
|
540 |
+
$('#interrupt_b').on('click', function(e) {
|
541 |
+
interrupt_execution();
|
542 |
+
e.target.blur()
|
543 |
+
})
|
544 |
+
.tooltip({
|
545 |
+
delay: {
|
546 |
+
show: 500,
|
547 |
+
hide: 100
|
548 |
+
}
|
549 |
+
});
|
550 |
+
$('#mark_toggle').on('click', function() {
|
551 |
+
toggle_marker()
|
552 |
+
})
|
553 |
+
.tooltip({
|
554 |
+
delay: {
|
555 |
+
show: 500,
|
556 |
+
hide: 100
|
557 |
+
}
|
558 |
+
});
|
559 |
+
$('#mark_all').on('click', function() {
|
560 |
+
mark_all()
|
561 |
+
})
|
562 |
+
.tooltip({
|
563 |
+
delay: {
|
564 |
+
show: 500,
|
565 |
+
hide: 100
|
566 |
+
}
|
567 |
+
});
|
568 |
+
$('#mark_none').on('click', function() {
|
569 |
+
mark_none()
|
570 |
+
})
|
571 |
+
.tooltip({
|
572 |
+
delay: {
|
573 |
+
show: 500,
|
574 |
+
hide: 100
|
575 |
+
}
|
576 |
+
});
|
577 |
+
$('#show_input').on('click', function() {
|
578 |
+
show_input(true);
|
579 |
+
this.blur()
|
580 |
+
})
|
581 |
+
.tooltip({
|
582 |
+
delay: {
|
583 |
+
show: 500,
|
584 |
+
hide: 100
|
585 |
+
}
|
586 |
+
});
|
587 |
+
$('#hide_input').on('click', function() {
|
588 |
+
show_input(false);
|
589 |
+
this.blur()
|
590 |
+
})
|
591 |
+
.tooltip({
|
592 |
+
delay: {
|
593 |
+
show: 500,
|
594 |
+
hide: 100
|
595 |
+
}
|
596 |
+
});
|
597 |
+
$('#show_output').on('click', function() {
|
598 |
+
show_output(true);
|
599 |
+
this.blur()
|
600 |
+
})
|
601 |
+
.tooltip({
|
602 |
+
delay: {
|
603 |
+
show: 500,
|
604 |
+
hide: 100
|
605 |
+
}
|
606 |
+
});
|
607 |
+
$('#hide_output').on('click', function() {
|
608 |
+
show_output(false);
|
609 |
+
this.blur()
|
610 |
+
})
|
611 |
+
.tooltip({
|
612 |
+
delay: {
|
613 |
+
show: 500,
|
614 |
+
hide: 100
|
615 |
+
}
|
616 |
+
});
|
617 |
+
$('#lock_marked').on('click', function() {
|
618 |
+
lock_cell(true);
|
619 |
+
this.blur()
|
620 |
+
})
|
621 |
+
.tooltip({
|
622 |
+
delay: {
|
623 |
+
show: 500,
|
624 |
+
hide: 100
|
625 |
+
}
|
626 |
+
});
|
627 |
+
$('#unlock_marked').on('click', function() {
|
628 |
+
lock_cell(false);
|
629 |
+
this.blur()
|
630 |
+
})
|
631 |
+
.tooltip({
|
632 |
+
delay: {
|
633 |
+
show: 500,
|
634 |
+
hide: 100
|
635 |
+
}
|
636 |
+
});
|
637 |
+
};
|
638 |
+
|
639 |
+
/**
|
640 |
+
* Show/hide toolbar
|
641 |
+
*
|
642 |
+
*/
|
643 |
+
var toggle_toolbar = function() {
|
644 |
+
var dom = $("#runtools-wrapper");
|
645 |
+
|
646 |
+
if (dom.is(':visible')) {
|
647 |
+
$('#toggle_runtools').removeClass('active').blur();
|
648 |
+
dom.hide();
|
649 |
+
} else {
|
650 |
+
$('#toggle_runtools').addClass('active');
|
651 |
+
dom.show();
|
652 |
+
}
|
653 |
+
|
654 |
+
if (dom.length === 0) {
|
655 |
+
create_runtools_div()
|
656 |
+
}
|
657 |
+
};
|
658 |
+
|
659 |
+
|
660 |
+
/**
|
661 |
+
* Add CSS file
|
662 |
+
*
|
663 |
+
* @param name filename
|
664 |
+
*/
|
665 |
+
var load_css = function(name) {
|
666 |
+
var link = document.createElement("link");
|
667 |
+
link.type = "text/css";
|
668 |
+
link.rel = "stylesheet";
|
669 |
+
link.href = requirejs.toUrl(name);
|
670 |
+
document.getElementsByTagName("head")[0].appendChild(link);
|
671 |
+
};
|
672 |
+
|
673 |
+
/**
|
674 |
+
* Add gutter to a new cell
|
675 |
+
*
|
676 |
+
* @param event
|
677 |
+
* @param nbcell
|
678 |
+
*
|
679 |
+
*/
|
680 |
+
var createCell = function(event, nbcell) {
|
681 |
+
var cell = nbcell.cell;
|
682 |
+
if (cell instanceof codecell.CodeCell) {
|
683 |
+
var gutters = cell.code_mirror.getOption('gutters').slice();
|
684 |
+
if ($.inArray("CodeMirror-cellstate", gutters) < 0) {
|
685 |
+
gutters.push('CodeMirror-cellstate');
|
686 |
+
cell.code_mirror.setOption('gutters', gutters);
|
687 |
+
cell.code_mirror.on("gutterClick", changeEvent);
|
688 |
+
|
689 |
+
}
|
690 |
+
}
|
691 |
+
};
|
692 |
+
|
693 |
+
|
694 |
+
/**
|
695 |
+
* Initialize all cells with new gutter
|
696 |
+
*/
|
697 |
+
var initGutter = function() {
|
698 |
+
var cells = Jupyter.notebook.get_cells();
|
699 |
+
var ncells = cells.length;
|
700 |
+
for (var i = 0; i < ncells; i++) {
|
701 |
+
var cell = cells[i];
|
702 |
+
if (cell instanceof codecell.CodeCell) {
|
703 |
+
var gutters = cell.code_mirror.getOption('gutters').slice();
|
704 |
+
if ($.inArray("CodeMirror-cellstate", gutters) < 0) {
|
705 |
+
gutters.push('CodeMirror-cellstate');
|
706 |
+
cell.code_mirror.setOption('gutters', gutters);
|
707 |
+
}
|
708 |
+
}
|
709 |
+
/**
|
710 |
+
* Restore hide/show status after reload
|
711 |
+
*/
|
712 |
+
if (cell.metadata.hasOwnProperty('hide_input') && cell.metadata.hide_input === true)
|
713 |
+
showCell(cell, 'i', false);
|
714 |
+
if (cell.metadata.hasOwnProperty('hide_output') && cell.metadata.hide_output === true)
|
715 |
+
showCell(cell, 'o', false);
|
716 |
+
if (cell.is_editable() === false) {
|
717 |
+
set_cell_state(cell, 'locked');
|
718 |
+
}
|
719 |
+
cell.code_mirror.refresh();
|
720 |
+
}
|
721 |
+
events.on('create.Cell', createCell);
|
722 |
+
};
|
723 |
+
|
724 |
+
/**
|
725 |
+
* Called from notebook after extension was loaded
|
726 |
+
*
|
727 |
+
*/
|
728 |
+
var load_extension = function() {
|
729 |
+
load_css('./main.css');
|
730 |
+
load_css('./gutter.css'); /* set gutter width */
|
731 |
+
requirejs(['./cellstate'], function() {
|
732 |
+
if (Jupyter.notebook._fully_loaded) {
|
733 |
+
initGutter();
|
734 |
+
} else {
|
735 |
+
events.one('notebook_loaded.Notebook', initGutter);
|
736 |
+
}
|
737 |
+
});
|
738 |
+
Jupyter.notebook.config.loaded.then(initialize);
|
739 |
+
};
|
740 |
+
|
741 |
+
return {
|
742 |
+
load_jupyter_extension: load_extension,
|
743 |
+
load_ipython_extension: load_extension
|
744 |
+
};
|
745 |
+
});
|
.local/share/jupyter/nbextensions/runtools/runtools_lock.png
ADDED
![]() |
.local/share/jupyter/nbextensions/scratchpad/README.md
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Scratchpad notebook extension
|
2 |
+
|
3 |
+
Adds a scratchpad cell to Jupyter notebook.
|
4 |
+
This is a cell in which you can execute code against the current kernel without modifying the notebook document.
|
5 |
+
|
6 |
+
Scratchpad cells can be executed using `Shift-Enter` (other shortcuts are applied to the notebook document). The scratchpad can be toggled by clicking the icon in the bottom-right, or via the keyboard shortcut `Ctrl-B`.
|
7 |
+
|
8 |
+
![demo](demo.gif)
|
9 |
+
|
10 |
+
|
11 |
+
## Credits
|
12 |
+
|
13 |
+
This extension is a copy of the extension from MinRK here:
|
14 |
+
`git clone git://github.com/minrk/nbextension-scratchpad`.
|
.local/share/jupyter/nbextensions/skill/README.md
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# SKILL for Codemirror
|
2 |
+
This extension provides a *SKILL* mode for CodeMirror editor.
|
3 |
+
|
4 |
+
The extension adds a MIME type `x-skill` and a mode `skill` that can be
|
5 |
+
used with CodeMirror.
|
6 |
+
|
7 |
+
## About SKILL
|
8 |
+
From [Wikipedia](https://en.wikipedia.org/wiki/Cadence_SKILL):
|
9 |
+
SKILL is a Lisp dialect used as a scripting language and PCell (parameterized
|
10 |
+
cells) description language used in many EDA software suites by Cadence Design
|
11 |
+
Systems (e.g. Cadence Allegro and Cadence Virtuoso)
|
12 |
+
|
13 |
+
## Notes
|
14 |
+
This extension was written to enhance the Virtuoso kernel for Jupyter
|
15 |
+
(https://github.com/benvarkey/JuVi).
|
.triton/cache/6e97c2a1f7a095255f6dd5de1807841d/cuda_utils.so
ADDED
Binary file (28 kB). View file
|
|
.triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.ptx
ADDED
@@ -0,0 +1,807 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2d3d4d5d6d7d8de9de
|
10 |
+
.extern .shared .align 1 .b8 global_smem[];
|
11 |
+
.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
|
12 |
+
|
13 |
+
.visible .entry triton__0d1d2d3d4d5d6d7d8de9de(
|
14 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_0,
|
15 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_1,
|
16 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_2,
|
17 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_3,
|
18 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_4,
|
19 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_5,
|
20 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_6,
|
21 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_7,
|
22 |
+
.param .u32 triton__0d1d2d3d4d5d6d7d8de9de_param_8,
|
23 |
+
.param .u32 triton__0d1d2d3d4d5d6d7d8de9de_param_9
|
24 |
+
)
|
25 |
+
.maxntid 64, 1, 1
|
26 |
+
{
|
27 |
+
.reg .pred %p<33>;
|
28 |
+
.reg .b16 %rs<21>;
|
29 |
+
.reg .b32 %r<112>;
|
30 |
+
.reg .f32 %f<94>;
|
31 |
+
.reg .b64 %rd<20>;
|
32 |
+
.loc 1 18 0
|
33 |
+
$L__func_begin0:
|
34 |
+
.loc 1 18 0
|
35 |
+
|
36 |
+
ld.param.u64 %rd9, [triton__0d1d2d3d4d5d6d7d8de9de_param_0];
|
37 |
+
ld.param.u64 %rd10, [triton__0d1d2d3d4d5d6d7d8de9de_param_1];
|
38 |
+
$L__tmp0:
|
39 |
+
.loc 1 26 26
|
40 |
+
mov.u32 %r78, %tid.x;
|
41 |
+
and.b32 %r79, %r78, 31;
|
42 |
+
ld.param.u64 %rd11, [triton__0d1d2d3d4d5d6d7d8de9de_param_2];
|
43 |
+
ld.param.u64 %rd12, [triton__0d1d2d3d4d5d6d7d8de9de_param_3];
|
44 |
+
ld.param.u64 %rd13, [triton__0d1d2d3d4d5d6d7d8de9de_param_4];
|
45 |
+
shl.b32 %r80, %r78, 2;
|
46 |
+
ld.param.u64 %rd14, [triton__0d1d2d3d4d5d6d7d8de9de_param_5];
|
47 |
+
and.b32 %r81, %r80, 252;
|
48 |
+
ld.param.u64 %rd15, [triton__0d1d2d3d4d5d6d7d8de9de_param_6];
|
49 |
+
ld.param.u64 %rd16, [triton__0d1d2d3d4d5d6d7d8de9de_param_7];
|
50 |
+
.loc 1 23 28
|
51 |
+
mov.u32 %r1, %ctaid.x;
|
52 |
+
.loc 1 30 40
|
53 |
+
shl.b32 %r82, %r1, 8;
|
54 |
+
.loc 1 30 36
|
55 |
+
or.b32 %r83, %r82, %r81;
|
56 |
+
.loc 1 30 30
|
57 |
+
mul.wide.s32 %rd17, %r83, 4;
|
58 |
+
add.s64 %rd1, %rd9, %rd17;
|
59 |
+
mov.b32 %r6, 0;
|
60 |
+
mov.pred %p1, -1;
|
61 |
+
.loc 1 30 46
|
62 |
+
mov.u32 %r2, 0x0;
|
63 |
+
mov.u32 %r3, 0x0;
|
64 |
+
mov.u32 %r4, 0x0;
|
65 |
+
mov.u32 %r5, 0x0;
|
66 |
+
@%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ];
|
67 |
+
@!%p1 mov.u32 %r2, %r6;
|
68 |
+
@!%p1 mov.u32 %r3, %r6;
|
69 |
+
@!%p1 mov.u32 %r4, %r6;
|
70 |
+
@!%p1 mov.u32 %r5, %r6;
|
71 |
+
mov.b32 %f1, %r2;
|
72 |
+
mov.b32 %f2, %r3;
|
73 |
+
mov.b32 %f3, %r4;
|
74 |
+
mov.b32 %f4, %r5;
|
75 |
+
.loc 1 31 30
|
76 |
+
mul.wide.s32 %rd18, %r83, 2;
|
77 |
+
add.s64 %rd2, %rd10, %rd18;
|
78 |
+
.loc 1 31 46
|
79 |
+
mov.u32 %r10, 0x0;
|
80 |
+
mov.u32 %r11, 0x0;
|
81 |
+
@%p1 ld.global.v2.b32 { %r10, %r11 }, [ %rd2 + 0 ];
|
82 |
+
@!%p1 mov.u32 %r10, %r6;
|
83 |
+
@!%p1 mov.u32 %r11, %r6;
|
84 |
+
cvt.u16.u32 %rs1, %r10;
|
85 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r10; }
|
86 |
+
cvt.u16.u32 %rs3, %r11;
|
87 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r11; }
|
88 |
+
.loc 1 31 67
|
89 |
+
cvt.f32.bf16 %r14, %rs1;
|
90 |
+
mov.b32 %f5, %r14;
|
91 |
+
cvt.f32.bf16 %r15, %rs2;
|
92 |
+
mov.b32 %f6, %r15;
|
93 |
+
cvt.f32.bf16 %r16, %rs3;
|
94 |
+
mov.b32 %f7, %r16;
|
95 |
+
cvt.f32.bf16 %r17, %rs4;
|
96 |
+
mov.b32 %f8, %r17;
|
97 |
+
.loc 1 32 30
|
98 |
+
add.s64 %rd3, %rd11, %rd18;
|
99 |
+
.loc 1 32 46
|
100 |
+
mov.u32 %r18, 0x0;
|
101 |
+
mov.u32 %r19, 0x0;
|
102 |
+
@%p1 ld.global.v2.b32 { %r18, %r19 }, [ %rd3 + 0 ];
|
103 |
+
@!%p1 mov.u32 %r18, %r6;
|
104 |
+
@!%p1 mov.u32 %r19, %r6;
|
105 |
+
cvt.u16.u32 %rs5, %r18;
|
106 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r18; }
|
107 |
+
cvt.u16.u32 %rs7, %r19;
|
108 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r19; }
|
109 |
+
.loc 1 32 67
|
110 |
+
cvt.f32.bf16 %r22, %rs5;
|
111 |
+
mov.b32 %f9, %r22;
|
112 |
+
cvt.f32.bf16 %r23, %rs6;
|
113 |
+
mov.b32 %f10, %r23;
|
114 |
+
cvt.f32.bf16 %r24, %rs7;
|
115 |
+
mov.b32 %f11, %r24;
|
116 |
+
cvt.f32.bf16 %r25, %rs8;
|
117 |
+
mov.b32 %f12, %r25;
|
118 |
+
.loc 1 33 30
|
119 |
+
add.s64 %rd4, %rd12, %rd18;
|
120 |
+
.loc 1 33 46
|
121 |
+
mov.u32 %r26, 0x0;
|
122 |
+
mov.u32 %r27, 0x0;
|
123 |
+
@%p1 ld.global.v2.b32 { %r26, %r27 }, [ %rd4 + 0 ];
|
124 |
+
@!%p1 mov.u32 %r26, %r6;
|
125 |
+
@!%p1 mov.u32 %r27, %r6;
|
126 |
+
cvt.u16.u32 %rs9, %r26;
|
127 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs10}, %r26; }
|
128 |
+
cvt.u16.u32 %rs11, %r27;
|
129 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs12}, %r27; }
|
130 |
+
.loc 1 33 67
|
131 |
+
cvt.f32.bf16 %r30, %rs9;
|
132 |
+
mov.b32 %f13, %r30;
|
133 |
+
cvt.f32.bf16 %r31, %rs10;
|
134 |
+
mov.b32 %f14, %r31;
|
135 |
+
cvt.f32.bf16 %r32, %rs11;
|
136 |
+
mov.b32 %f15, %r32;
|
137 |
+
cvt.f32.bf16 %r33, %rs12;
|
138 |
+
mov.b32 %f16, %r33;
|
139 |
+
.loc 1 34 31
|
140 |
+
add.s64 %rd5, %rd13, %rd18;
|
141 |
+
.loc 1 34 47
|
142 |
+
mov.u32 %r34, 0x0;
|
143 |
+
mov.u32 %r35, 0x0;
|
144 |
+
@%p1 ld.global.v2.b32 { %r34, %r35 }, [ %rd5 + 0 ];
|
145 |
+
@!%p1 mov.u32 %r34, %r6;
|
146 |
+
@!%p1 mov.u32 %r35, %r6;
|
147 |
+
cvt.u16.u32 %rs13, %r34;
|
148 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs14}, %r34; }
|
149 |
+
cvt.u16.u32 %rs15, %r35;
|
150 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs16}, %r35; }
|
151 |
+
.loc 1 34 68
|
152 |
+
cvt.f32.bf16 %r38, %rs13;
|
153 |
+
mov.b32 %f17, %r38;
|
154 |
+
cvt.f32.bf16 %r39, %rs14;
|
155 |
+
mov.b32 %f18, %r39;
|
156 |
+
cvt.f32.bf16 %r40, %rs15;
|
157 |
+
mov.b32 %f19, %r40;
|
158 |
+
cvt.f32.bf16 %r41, %rs16;
|
159 |
+
mov.b32 %f20, %r41;
|
160 |
+
.loc 1 35 31
|
161 |
+
mul.wide.u32 %rd19, %r81, 4;
|
162 |
+
add.s64 %rd6, %rd14, %rd19;
|
163 |
+
.loc 1 35 36
|
164 |
+
mov.u32 %r42, 0x0;
|
165 |
+
mov.u32 %r43, 0x0;
|
166 |
+
mov.u32 %r44, 0x0;
|
167 |
+
mov.u32 %r45, 0x0;
|
168 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r42, %r43, %r44, %r45 }, [ %rd6 + 0 ];
|
169 |
+
@!%p1 mov.u32 %r42, %r6;
|
170 |
+
@!%p1 mov.u32 %r43, %r6;
|
171 |
+
@!%p1 mov.u32 %r44, %r6;
|
172 |
+
@!%p1 mov.u32 %r45, %r6;
|
173 |
+
.loc 1 37 18
|
174 |
+
add.f32 %f21, %f5, %f1;
|
175 |
+
add.f32 %f22, %f6, %f2;
|
176 |
+
add.f32 %f23, %f7, %f3;
|
177 |
+
.loc 1 39 18
|
178 |
+
add.f32 %f24, %f21, %f9;
|
179 |
+
add.f32 %f25, %f22, %f10;
|
180 |
+
add.f32 %f26, %f23, %f11;
|
181 |
+
.loc 1 41 18
|
182 |
+
add.f32 %f27, %f25, %f14;
|
183 |
+
add.f32 %f28, %f26, %f15;
|
184 |
+
.loc 1 43 19
|
185 |
+
add.f32 %f29, %f27, %f18;
|
186 |
+
add.f32 %f30, %f28, %f19;
|
187 |
+
.loc 1 41 18
|
188 |
+
add.f32 %f31, %f24, %f13;
|
189 |
+
add.f32 %f32, %f8, %f4;
|
190 |
+
.loc 1 43 19
|
191 |
+
add.f32 %f33, %f32, %f12;
|
192 |
+
add.f32 %f34, %f31, %f17;
|
193 |
+
$L__tmp1:
|
194 |
+
.loc 2 233 15
|
195 |
+
add.f32 %f35, %f34, %f29;
|
196 |
+
add.f32 %f36, %f33, %f16;
|
197 |
+
add.f32 %f37, %f35, %f30;
|
198 |
+
add.f32 %f38, %f36, %f20;
|
199 |
+
mov.b32 %r71, %f38;
|
200 |
+
add.f32 %f39, %f37, %f38;
|
201 |
+
$L__tmp2:
|
202 |
+
.loc 2 243 36
|
203 |
+
mov.b32 %r84, %f39;
|
204 |
+
shfl.sync.bfly.b32 %r85, %r84, 16, 31, -1;
|
205 |
+
mov.b32 %f40, %r85;
|
206 |
+
$L__tmp3:
|
207 |
+
.loc 2 233 15
|
208 |
+
add.f32 %f41, %f39, %f40;
|
209 |
+
$L__tmp4:
|
210 |
+
.loc 2 243 36
|
211 |
+
mov.b32 %r86, %f41;
|
212 |
+
shfl.sync.bfly.b32 %r87, %r86, 8, 31, -1;
|
213 |
+
mov.b32 %f42, %r87;
|
214 |
+
$L__tmp5:
|
215 |
+
.loc 2 233 15
|
216 |
+
add.f32 %f43, %f41, %f42;
|
217 |
+
$L__tmp6:
|
218 |
+
.loc 2 243 36
|
219 |
+
mov.b32 %r88, %f43;
|
220 |
+
shfl.sync.bfly.b32 %r89, %r88, 4, 31, -1;
|
221 |
+
mov.b32 %f44, %r89;
|
222 |
+
$L__tmp7:
|
223 |
+
.loc 2 233 15
|
224 |
+
add.f32 %f45, %f43, %f44;
|
225 |
+
$L__tmp8:
|
226 |
+
.loc 2 243 36
|
227 |
+
mov.b32 %r90, %f45;
|
228 |
+
shfl.sync.bfly.b32 %r91, %r90, 2, 31, -1;
|
229 |
+
mov.b32 %f46, %r91;
|
230 |
+
$L__tmp9:
|
231 |
+
.loc 2 233 15
|
232 |
+
add.f32 %f47, %f45, %f46;
|
233 |
+
$L__tmp10:
|
234 |
+
.loc 2 243 36
|
235 |
+
mov.b32 %r92, %f47;
|
236 |
+
shfl.sync.bfly.b32 %r93, %r92, 1, 31, -1;
|
237 |
+
mov.b32 %f48, %r93;
|
238 |
+
$L__tmp11:
|
239 |
+
.loc 2 233 15
|
240 |
+
add.f32 %f49, %f47, %f48;
|
241 |
+
$L__tmp12:
|
242 |
+
.loc 2 243 36
|
243 |
+
setp.eq.s32 %p23, %r79, 0;
|
244 |
+
shr.u32 %r94, %r78, 3;
|
245 |
+
and.b32 %r95, %r94, 4;
|
246 |
+
mov.u32 %r96, global_smem;
|
247 |
+
add.s32 %r50, %r96, %r95;
|
248 |
+
mov.b32 %r51, %f49;
|
249 |
+
@%p23 st.shared.b32 [ %r50 + 0 ], %r51;
|
250 |
+
bar.sync 0;
|
251 |
+
setp.lt.s32 %p24, %r78, 2;
|
252 |
+
add.s32 %r53, %r96, %r80;
|
253 |
+
@%p24 ld.shared.b32 %r52, [ %r53 + 0 ];
|
254 |
+
mov.b32 %f50, %r52;
|
255 |
+
shfl.sync.bfly.b32 %r97, %r52, 1, 31, -1;
|
256 |
+
mov.b32 %f51, %r97;
|
257 |
+
$L__tmp13:
|
258 |
+
.loc 2 233 15
|
259 |
+
add.f32 %f52, %f50, %f51;
|
260 |
+
$L__tmp14:
|
261 |
+
.loc 2 243 36
|
262 |
+
and.b32 %r98, %r78, 1;
|
263 |
+
setp.eq.b32 %p31, %r98, 1;
|
264 |
+
not.pred %p32, %p31;
|
265 |
+
and.pred %p25, %p24, %p32;
|
266 |
+
mov.b32 %r55, %f52;
|
267 |
+
@%p25 st.shared.b32 [ %r53 + 0 ], %r55;
|
268 |
+
bar.sync 0;
|
269 |
+
ld.shared.f32 %f53, [global_smem];
|
270 |
+
$L__tmp15:
|
271 |
+
.loc 3 8 15
|
272 |
+
add.f32 %f54, %f53, 0f00000000;
|
273 |
+
$L__tmp16:
|
274 |
+
.loc 1 51 20
|
275 |
+
mov.b32 %r57, %f54;
|
276 |
+
mov.b32 %r58, 1132462080;
|
277 |
+
div.full.f32 %r56, %r57, %r58;
|
278 |
+
mov.b32 %f55, %r56;
|
279 |
+
.loc 1 52 20
|
280 |
+
sub.f32 %f56, %f34, %f55;
|
281 |
+
sub.f32 %f57, %f29, %f55;
|
282 |
+
sub.f32 %f58, %f30, %f55;
|
283 |
+
sub.f32 %f59, %f38, %f55;
|
284 |
+
.loc 1 53 20
|
285 |
+
mul.f32 %f60, %f57, %f57;
|
286 |
+
$L__tmp17:
|
287 |
+
.loc 2 243 36
|
288 |
+
bar.sync 0;
|
289 |
+
$L__tmp18:
|
290 |
+
.loc 2 233 15
|
291 |
+
fma.rn.f32 %f61, %f56, %f56, %f60;
|
292 |
+
fma.rn.f32 %f62, %f58, %f58, %f61;
|
293 |
+
fma.rn.f32 %f63, %f59, %f59, %f62;
|
294 |
+
$L__tmp19:
|
295 |
+
.loc 2 243 36
|
296 |
+
mov.b32 %r99, %f63;
|
297 |
+
shfl.sync.bfly.b32 %r100, %r99, 16, 31, -1;
|
298 |
+
mov.b32 %f64, %r100;
|
299 |
+
$L__tmp20:
|
300 |
+
.loc 2 233 15
|
301 |
+
add.f32 %f65, %f63, %f64;
|
302 |
+
$L__tmp21:
|
303 |
+
.loc 2 243 36
|
304 |
+
mov.b32 %r101, %f65;
|
305 |
+
shfl.sync.bfly.b32 %r102, %r101, 8, 31, -1;
|
306 |
+
mov.b32 %f66, %r102;
|
307 |
+
$L__tmp22:
|
308 |
+
.loc 2 233 15
|
309 |
+
add.f32 %f67, %f65, %f66;
|
310 |
+
$L__tmp23:
|
311 |
+
.loc 2 243 36
|
312 |
+
mov.b32 %r103, %f67;
|
313 |
+
shfl.sync.bfly.b32 %r104, %r103, 4, 31, -1;
|
314 |
+
mov.b32 %f68, %r104;
|
315 |
+
$L__tmp24:
|
316 |
+
.loc 2 233 15
|
317 |
+
add.f32 %f69, %f67, %f68;
|
318 |
+
$L__tmp25:
|
319 |
+
.loc 2 243 36
|
320 |
+
mov.b32 %r105, %f69;
|
321 |
+
shfl.sync.bfly.b32 %r106, %r105, 2, 31, -1;
|
322 |
+
mov.b32 %f70, %r106;
|
323 |
+
$L__tmp26:
|
324 |
+
.loc 2 233 15
|
325 |
+
add.f32 %f71, %f69, %f70;
|
326 |
+
$L__tmp27:
|
327 |
+
.loc 2 243 36
|
328 |
+
mov.b32 %r107, %f71;
|
329 |
+
shfl.sync.bfly.b32 %r108, %r107, 1, 31, -1;
|
330 |
+
mov.b32 %f72, %r108;
|
331 |
+
$L__tmp28:
|
332 |
+
.loc 2 233 15
|
333 |
+
add.f32 %f73, %f71, %f72;
|
334 |
+
$L__tmp29:
|
335 |
+
.loc 2 243 36
|
336 |
+
mov.b32 %r60, %f73;
|
337 |
+
@%p23 st.shared.b32 [ %r50 + 0 ], %r60;
|
338 |
+
bar.sync 0;
|
339 |
+
@%p24 ld.shared.b32 %r61, [ %r53 + 0 ];
|
340 |
+
mov.b32 %f74, %r61;
|
341 |
+
shfl.sync.bfly.b32 %r109, %r61, 1, 31, -1;
|
342 |
+
mov.b32 %f75, %r109;
|
343 |
+
$L__tmp30:
|
344 |
+
.loc 2 233 15
|
345 |
+
add.f32 %f76, %f74, %f75;
|
346 |
+
$L__tmp31:
|
347 |
+
.loc 2 243 36
|
348 |
+
mov.b32 %r64, %f76;
|
349 |
+
@%p25 st.shared.b32 [ %r53 + 0 ], %r64;
|
350 |
+
bar.sync 0;
|
351 |
+
ld.shared.f32 %f77, [global_smem];
|
352 |
+
$L__tmp32:
|
353 |
+
.loc 3 8 15
|
354 |
+
add.f32 %f78, %f77, 0f00000000;
|
355 |
+
$L__tmp33:
|
356 |
+
.loc 1 59 20
|
357 |
+
mov.b32 %r66, %f78;
|
358 |
+
div.full.f32 %r65, %r66, %r58;
|
359 |
+
mov.b32 %f79, %r65;
|
360 |
+
.loc 1 61 20
|
361 |
+
add.f32 %f80, %f79, 0f3727C5AC;
|
362 |
+
.loc 1 62 26
|
363 |
+
rsqrt.approx.ftz.f32 %f81, %f80;
|
364 |
+
.loc 1 35 36
|
365 |
+
mov.b32 %f82, %r45;
|
366 |
+
mov.b32 %f83, %r44;
|
367 |
+
mov.b32 %f84, %r43;
|
368 |
+
mov.b32 %f85, %r42;
|
369 |
+
.loc 1 63 20
|
370 |
+
mul.f32 %f86, %f56, %f81;
|
371 |
+
mul.f32 %f87, %f57, %f81;
|
372 |
+
mul.f32 %f88, %f58, %f81;
|
373 |
+
mul.f32 %f89, %f59, %f81;
|
374 |
+
.loc 1 64 20
|
375 |
+
mul.f32 %f90, %f86, %f85;
|
376 |
+
mul.f32 %f91, %f87, %f84;
|
377 |
+
mul.f32 %f92, %f88, %f83;
|
378 |
+
mul.f32 %f93, %f89, %f82;
|
379 |
+
.loc 1 66 25
|
380 |
+
add.s64 %rd7, %rd15, %rd17;
|
381 |
+
.loc 1 66 48
|
382 |
+
mov.b32 %r68, %f34;
|
383 |
+
mov.b32 %r69, %f29;
|
384 |
+
mov.b32 %r70, %f30;
|
385 |
+
@%p1 st.global.v4.b32 [ %rd7 + 0 ], { %r68, %r69, %r70, %r71 };
|
386 |
+
.loc 1 67 25
|
387 |
+
add.s64 %rd8, %rd16, %rd18;
|
388 |
+
.loc 1 67 48
|
389 |
+
mov.b32 %r72, %f90;
|
390 |
+
cvt.rn.bf16.f32 %rs17, %r72;
|
391 |
+
mov.b32 %r73, %f91;
|
392 |
+
cvt.rn.bf16.f32 %rs18, %r73;
|
393 |
+
mov.b32 %r74, %f92;
|
394 |
+
cvt.rn.bf16.f32 %rs19, %r74;
|
395 |
+
mov.b32 %r75, %f93;
|
396 |
+
cvt.rn.bf16.f32 %rs20, %r75;
|
397 |
+
mov.b32 %r110, {%rs17, %rs18};
|
398 |
+
mov.b32 %r111, {%rs19, %rs20};
|
399 |
+
@%p1 st.global.v2.b32 [ %rd8 + 0 ], { %r110, %r111 };
|
400 |
+
.loc 1 67 4
|
401 |
+
ret;
|
402 |
+
$L__tmp34:
|
403 |
+
$L__func_end0:
|
404 |
+
|
405 |
+
}
|
406 |
+
// .globl __nv_rsqrtf
|
407 |
+
.visible .func (.param .b32 func_retval0) __nv_rsqrtf(
|
408 |
+
.param .b32 __nv_rsqrtf_param_0
|
409 |
+
)
|
410 |
+
{
|
411 |
+
.reg .f32 %f<3>;
|
412 |
+
$L__func_begin1:
|
413 |
+
|
414 |
+
ld.param.f32 %f1, [__nv_rsqrtf_param_0];
|
415 |
+
rsqrt.approx.ftz.f32 %f2, %f1;
|
416 |
+
st.param.f32 [func_retval0+0], %f2;
|
417 |
+
ret;
|
418 |
+
$L__func_end1:
|
419 |
+
|
420 |
+
}
|
421 |
+
.file 1 "/tmp/torchinductor_root/jb/cjbnqg5u4sj7a4xstjer3a6tdgnnigb2iymd27gcs6o7oduhxy2v.py"
|
422 |
+
.file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
|
423 |
+
.file 3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
|
424 |
+
.section .debug_abbrev
|
425 |
+
{
|
426 |
+
.b8 1
|
427 |
+
.b8 17
|
428 |
+
.b8 1
|
429 |
+
.b8 37
|
430 |
+
.b8 8
|
431 |
+
.b8 19
|
432 |
+
.b8 5
|
433 |
+
.b8 3
|
434 |
+
.b8 8
|
435 |
+
.b8 16
|
436 |
+
.b8 6
|
437 |
+
.b8 27
|
438 |
+
.b8 8
|
439 |
+
.b8 180
|
440 |
+
.b8 66
|
441 |
+
.b8 12
|
442 |
+
.b8 17
|
443 |
+
.b8 1
|
444 |
+
.b8 18
|
445 |
+
.b8 1
|
446 |
+
.b8 0
|
447 |
+
.b8 0
|
448 |
+
.b8 2
|
449 |
+
.b8 46
|
450 |
+
.b8 0
|
451 |
+
.b8 135
|
452 |
+
.b8 64
|
453 |
+
.b8 8
|
454 |
+
.b8 3
|
455 |
+
.b8 8
|
456 |
+
.b8 58
|
457 |
+
.b8 11
|
458 |
+
.b8 59
|
459 |
+
.b8 11
|
460 |
+
.b8 63
|
461 |
+
.b8 12
|
462 |
+
.b8 32
|
463 |
+
.b8 11
|
464 |
+
.b8 0
|
465 |
+
.b8 0
|
466 |
+
.b8 3
|
467 |
+
.b8 46
|
468 |
+
.b8 1
|
469 |
+
.b8 17
|
470 |
+
.b8 1
|
471 |
+
.b8 18
|
472 |
+
.b8 1
|
473 |
+
.b8 64
|
474 |
+
.b8 10
|
475 |
+
.b8 49
|
476 |
+
.b8 19
|
477 |
+
.b8 0
|
478 |
+
.b8 0
|
479 |
+
.b8 4
|
480 |
+
.b8 29
|
481 |
+
.b8 1
|
482 |
+
.b8 49
|
483 |
+
.b8 19
|
484 |
+
.b8 17
|
485 |
+
.b8 1
|
486 |
+
.b8 18
|
487 |
+
.b8 1
|
488 |
+
.b8 88
|
489 |
+
.b8 11
|
490 |
+
.b8 89
|
491 |
+
.b8 11
|
492 |
+
.b8 87
|
493 |
+
.b8 11
|
494 |
+
.b8 0
|
495 |
+
.b8 0
|
496 |
+
.b8 5
|
497 |
+
.b8 29
|
498 |
+
.b8 0
|
499 |
+
.b8 49
|
500 |
+
.b8 19
|
501 |
+
.b8 17
|
502 |
+
.b8 1
|
503 |
+
.b8 18
|
504 |
+
.b8 1
|
505 |
+
.b8 88
|
506 |
+
.b8 11
|
507 |
+
.b8 89
|
508 |
+
.b8 11
|
509 |
+
.b8 87
|
510 |
+
.b8 11
|
511 |
+
.b8 0
|
512 |
+
.b8 0
|
513 |
+
.b8 0
|
514 |
+
}
|
515 |
+
.section .debug_info
|
516 |
+
{
|
517 |
+
.b32 407
|
518 |
+
.b8 2
|
519 |
+
.b8 0
|
520 |
+
.b32 .debug_abbrev
|
521 |
+
.b8 8
|
522 |
+
.b8 1
|
523 |
+
.b8 116
|
524 |
+
.b8 114
|
525 |
+
.b8 105
|
526 |
+
.b8 116
|
527 |
+
.b8 111
|
528 |
+
.b8 110
|
529 |
+
.b8 0
|
530 |
+
.b8 2
|
531 |
+
.b8 0
|
532 |
+
.b8 99
|
533 |
+
.b8 106
|
534 |
+
.b8 98
|
535 |
+
.b8 110
|
536 |
+
.b8 113
|
537 |
+
.b8 103
|
538 |
+
.b8 53
|
539 |
+
.b8 117
|
540 |
+
.b8 52
|
541 |
+
.b8 115
|
542 |
+
.b8 106
|
543 |
+
.b8 55
|
544 |
+
.b8 97
|
545 |
+
.b8 52
|
546 |
+
.b8 120
|
547 |
+
.b8 115
|
548 |
+
.b8 116
|
549 |
+
.b8 106
|
550 |
+
.b8 101
|
551 |
+
.b8 114
|
552 |
+
.b8 51
|
553 |
+
.b8 97
|
554 |
+
.b8 54
|
555 |
+
.b8 116
|
556 |
+
.b8 100
|
557 |
+
.b8 103
|
558 |
+
.b8 110
|
559 |
+
.b8 110
|
560 |
+
.b8 105
|
561 |
+
.b8 103
|
562 |
+
.b8 98
|
563 |
+
.b8 50
|
564 |
+
.b8 105
|
565 |
+
.b8 121
|
566 |
+
.b8 109
|
567 |
+
.b8 100
|
568 |
+
.b8 50
|
569 |
+
.b8 55
|
570 |
+
.b8 103
|
571 |
+
.b8 99
|
572 |
+
.b8 115
|
573 |
+
.b8 54
|
574 |
+
.b8 111
|
575 |
+
.b8 55
|
576 |
+
.b8 111
|
577 |
+
.b8 100
|
578 |
+
.b8 117
|
579 |
+
.b8 104
|
580 |
+
.b8 120
|
581 |
+
.b8 121
|
582 |
+
.b8 50
|
583 |
+
.b8 118
|
584 |
+
.b8 46
|
585 |
+
.b8 112
|
586 |
+
.b8 121
|
587 |
+
.b8 0
|
588 |
+
.b32 .debug_line
|
589 |
+
.b8 47
|
590 |
+
.b8 116
|
591 |
+
.b8 109
|
592 |
+
.b8 112
|
593 |
+
.b8 47
|
594 |
+
.b8 116
|
595 |
+
.b8 111
|
596 |
+
.b8 114
|
597 |
+
.b8 99
|
598 |
+
.b8 104
|
599 |
+
.b8 105
|
600 |
+
.b8 110
|
601 |
+
.b8 100
|
602 |
+
.b8 117
|
603 |
+
.b8 99
|
604 |
+
.b8 116
|
605 |
+
.b8 111
|
606 |
+
.b8 114
|
607 |
+
.b8 95
|
608 |
+
.b8 114
|
609 |
+
.b8 111
|
610 |
+
.b8 111
|
611 |
+
.b8 116
|
612 |
+
.b8 47
|
613 |
+
.b8 106
|
614 |
+
.b8 98
|
615 |
+
.b8 0
|
616 |
+
.b8 1
|
617 |
+
.b64 $L__func_begin0
|
618 |
+
.b64 $L__func_end0
|
619 |
+
.b8 2
|
620 |
+
.b8 116
|
621 |
+
.b8 114
|
622 |
+
.b8 105
|
623 |
+
.b8 116
|
624 |
+
.b8 111
|
625 |
+
.b8 110
|
626 |
+
.b8 95
|
627 |
+
.b8 95
|
628 |
+
.b8 48
|
629 |
+
.b8 100
|
630 |
+
.b8 49
|
631 |
+
.b8 100
|
632 |
+
.b8 50
|
633 |
+
.b8 100
|
634 |
+
.b8 51
|
635 |
+
.b8 100
|
636 |
+
.b8 52
|
637 |
+
.b8 100
|
638 |
+
.b8 53
|
639 |
+
.b8 100
|
640 |
+
.b8 54
|
641 |
+
.b8 100
|
642 |
+
.b8 55
|
643 |
+
.b8 100
|
644 |
+
.b8 56
|
645 |
+
.b8 100
|
646 |
+
.b8 101
|
647 |
+
.b8 57
|
648 |
+
.b8 100
|
649 |
+
.b8 101
|
650 |
+
.b8 0
|
651 |
+
.b8 116
|
652 |
+
.b8 114
|
653 |
+
.b8 105
|
654 |
+
.b8 116
|
655 |
+
.b8 111
|
656 |
+
.b8 110
|
657 |
+
.b8 95
|
658 |
+
.b8 95
|
659 |
+
.b8 48
|
660 |
+
.b8 100
|
661 |
+
.b8 49
|
662 |
+
.b8 100
|
663 |
+
.b8 50
|
664 |
+
.b8 100
|
665 |
+
.b8 51
|
666 |
+
.b8 100
|
667 |
+
.b8 52
|
668 |
+
.b8 100
|
669 |
+
.b8 53
|
670 |
+
.b8 100
|
671 |
+
.b8 54
|
672 |
+
.b8 100
|
673 |
+
.b8 55
|
674 |
+
.b8 100
|
675 |
+
.b8 56
|
676 |
+
.b8 100
|
677 |
+
.b8 101
|
678 |
+
.b8 57
|
679 |
+
.b8 100
|
680 |
+
.b8 101
|
681 |
+
.b8 0
|
682 |
+
.b8 1
|
683 |
+
.b8 18
|
684 |
+
.b8 1
|
685 |
+
.b8 1
|
686 |
+
.b8 3
|
687 |
+
.b64 $L__func_begin0
|
688 |
+
.b64 $L__func_end0
|
689 |
+
.b8 1
|
690 |
+
.b8 156
|
691 |
+
.b32 125
|
692 |
+
.b8 4
|
693 |
+
.b32 125
|
694 |
+
.b64 $L__tmp1
|
695 |
+
.b64 $L__tmp14
|
696 |
+
.b8 2
|
697 |
+
.b8 48
|
698 |
+
.b8 59
|
699 |
+
.b8 5
|
700 |
+
.b32 125
|
701 |
+
.b64 $L__tmp1
|
702 |
+
.b64 $L__tmp14
|
703 |
+
.b8 2
|
704 |
+
.b8 243
|
705 |
+
.b8 36
|
706 |
+
.b8 0
|
707 |
+
.b8 5
|
708 |
+
.b32 125
|
709 |
+
.b64 $L__tmp2
|
710 |
+
.b64 $L__tmp15
|
711 |
+
.b8 2
|
712 |
+
.b8 48
|
713 |
+
.b8 59
|
714 |
+
.b8 5
|
715 |
+
.b32 125
|
716 |
+
.b64 $L__tmp15
|
717 |
+
.b64 $L__tmp16
|
718 |
+
.b8 3
|
719 |
+
.b8 48
|
720 |
+
.b8 45
|
721 |
+
.b8 5
|
722 |
+
.b32 125
|
723 |
+
.b64 $L__tmp17
|
724 |
+
.b64 $L__tmp32
|
725 |
+
.b8 2
|
726 |
+
.b8 56
|
727 |
+
.b8 59
|
728 |
+
.b8 4
|
729 |
+
.b32 125
|
730 |
+
.b64 $L__tmp18
|
731 |
+
.b64 $L__tmp31
|
732 |
+
.b8 2
|
733 |
+
.b8 56
|
734 |
+
.b8 59
|
735 |
+
.b8 5
|
736 |
+
.b32 125
|
737 |
+
.b64 $L__tmp18
|
738 |
+
.b64 $L__tmp31
|
739 |
+
.b8 2
|
740 |
+
.b8 243
|
741 |
+
.b8 36
|
742 |
+
.b8 0
|
743 |
+
.b8 5
|
744 |
+
.b32 125
|
745 |
+
.b64 $L__tmp32
|
746 |
+
.b64 $L__tmp33
|
747 |
+
.b8 3
|
748 |
+
.b8 56
|
749 |
+
.b8 45
|
750 |
+
.b8 0
|
751 |
+
.b8 0
|
752 |
+
}
|
753 |
+
.section .debug_pubnames
|
754 |
+
{
|
755 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
756 |
+
$L__pubNames_start0:
|
757 |
+
.b8 2
|
758 |
+
.b8 0
|
759 |
+
.b32 .debug_info
|
760 |
+
.b32 411
|
761 |
+
.b32 125
|
762 |
+
.b8 116
|
763 |
+
.b8 114
|
764 |
+
.b8 105
|
765 |
+
.b8 116
|
766 |
+
.b8 111
|
767 |
+
.b8 110
|
768 |
+
.b8 95
|
769 |
+
.b8 95
|
770 |
+
.b8 48
|
771 |
+
.b8 100
|
772 |
+
.b8 49
|
773 |
+
.b8 100
|
774 |
+
.b8 50
|
775 |
+
.b8 100
|
776 |
+
.b8 51
|
777 |
+
.b8 100
|
778 |
+
.b8 52
|
779 |
+
.b8 100
|
780 |
+
.b8 53
|
781 |
+
.b8 100
|
782 |
+
.b8 54
|
783 |
+
.b8 100
|
784 |
+
.b8 55
|
785 |
+
.b8 100
|
786 |
+
.b8 56
|
787 |
+
.b8 100
|
788 |
+
.b8 101
|
789 |
+
.b8 57
|
790 |
+
.b8 100
|
791 |
+
.b8 101
|
792 |
+
.b8 0
|
793 |
+
.b32 0
|
794 |
+
$L__pubNames_end0:
|
795 |
+
}
|
796 |
+
.section .debug_pubtypes
|
797 |
+
{
|
798 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
799 |
+
$L__pubTypes_start0:
|
800 |
+
.b8 2
|
801 |
+
.b8 0
|
802 |
+
.b32 .debug_info
|
803 |
+
.b32 411
|
804 |
+
.b32 0
|
805 |
+
$L__pubTypes_end0:
|
806 |
+
}
|
807 |
+
.section .debug_loc { }
|
.triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.ttir
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1d2d3de4e(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} {
|
3 |
+
%cst = arith.constant dense<256> : tensor<32x1xi64>
|
4 |
+
%cst_0 = arith.constant dense<0> : tensor<32x1xi64>
|
5 |
+
%cst_1 = arith.constant dense<512> : tensor<32x1xi64>
|
6 |
+
%cst_2 = arith.constant dense<true> : tensor<32x1xi1>
|
7 |
+
%cst_3 = arith.constant dense<256> : tensor<32x1xi32>
|
8 |
+
%cst_4 = arith.constant dense<131072> : tensor<1x128xi32>
|
9 |
+
%cst_5 = arith.constant dense<120> : tensor<1x128xi32>
|
10 |
+
%cst_6 = arith.constant dense<0.000000e+00> : tensor<32x128xf32>
|
11 |
+
%c32_i32 = arith.constant 32 : i32
|
12 |
+
%0 = tt.get_program_id x : i32
|
13 |
+
%1 = arith.muli %0, %c32_i32 : i32
|
14 |
+
%2 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32>
|
15 |
+
%3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<32xi32>) -> tensor<32x1xi32>
|
16 |
+
%4 = tt.splat %1 : (i32) -> tensor<32x1xi32>
|
17 |
+
%5 = arith.addi %4, %3 : tensor<32x1xi32>
|
18 |
+
%6 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32>
|
19 |
+
%7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<128xi32>) -> tensor<1x128xi32>
|
20 |
+
%8 = arith.cmpi slt, %7, %cst_5 : tensor<1x128xi32>
|
21 |
+
%9 = arith.muli %7, %cst_4 : tensor<1x128xi32>
|
22 |
+
%10 = tt.broadcast %5 : (tensor<32x1xi32>) -> tensor<32x128xi32>
|
23 |
+
%11 = tt.broadcast %9 : (tensor<1x128xi32>) -> tensor<32x128xi32>
|
24 |
+
%12 = arith.addi %10, %11 : tensor<32x128xi32>
|
25 |
+
%13 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<32x128x!tt.ptr<f32, 1>>
|
26 |
+
%14 = tt.addptr %13, %12 : tensor<32x128x!tt.ptr<f32, 1>>, tensor<32x128xi32>
|
27 |
+
%15 = tt.broadcast %8 : (tensor<1x128xi1>) -> tensor<32x128xi1>
|
28 |
+
%16 = tt.load %14, %15, %cst_6 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<32x128xf32>
|
29 |
+
%17 = arith.addf %16, %cst_6 : tensor<32x128xf32>
|
30 |
+
%18 = arith.select %15, %17, %cst_6 : tensor<32x128xi1>, tensor<32x128xf32>
|
31 |
+
%19 = "tt.reduce"(%18) <{axis = 1 : i32}> ({
|
32 |
+
^bb0(%arg5: f32, %arg6: f32):
|
33 |
+
%35 = arith.addf %arg5, %arg6 : f32
|
34 |
+
tt.reduce.return %35 : f32
|
35 |
+
}) : (tensor<32x128xf32>) -> tensor<32xf32>
|
36 |
+
%20 = tt.expand_dims %19 {axis = 1 : i32} : (tensor<32xf32>) -> tensor<32x1xf32>
|
37 |
+
%21 = arith.divsi %5, %cst_3 : tensor<32x1xi32>
|
38 |
+
%22 = arith.remsi %5, %cst_3 : tensor<32x1xi32>
|
39 |
+
%23 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<32x1x!tt.ptr<i64, 1>>
|
40 |
+
%24 = tt.addptr %23, %21 : tensor<32x1x!tt.ptr<i64, 1>>, tensor<32x1xi32>
|
41 |
+
%25 = tt.load %24 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<32x1xi64>
|
42 |
+
%26 = arith.addi %25, %cst_1 : tensor<32x1xi64>
|
43 |
+
%27 = arith.cmpi slt, %25, %cst_0 : tensor<32x1xi64>
|
44 |
+
%28 = arith.select %27, %26, %25 : tensor<32x1xi1>, tensor<32x1xi64>
|
45 |
+
%29 = arith.muli %28, %cst : tensor<32x1xi64>
|
46 |
+
%30 = arith.extsi %22 : tensor<32x1xi32> to tensor<32x1xi64>
|
47 |
+
%31 = arith.addi %30, %29 : tensor<32x1xi64>
|
48 |
+
%32 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<32x1x!tt.ptr<f32, 1>>
|
49 |
+
%33 = tt.addptr %32, %31 : tensor<32x1x!tt.ptr<f32, 1>>, tensor<32x1xi64>
|
50 |
+
%34 = "tt.atomic_rmw"(%33, %20, %cst_2) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<32x1x!tt.ptr<f32, 1>>, tensor<32x1xf32>, tensor<32x1xi1>) -> tensor<32x1xf32>
|
51 |
+
tt.return
|
52 |
+
}
|
53 |
+
}
|
.triton/dump/305a9479aab997a3a16bfe46bb303a50/triton_.cubin
ADDED
Binary file (30.4 kB). View file
|
|
.triton/dump/305a9479aab997a3a16bfe46bb303a50/triton_.ptx
ADDED
@@ -0,0 +1,1041 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2d3d4d5d6de7de
|
10 |
+
.extern .func __assertfail
|
11 |
+
(
|
12 |
+
.param .b64 __assertfail_param_0,
|
13 |
+
.param .b64 __assertfail_param_1,
|
14 |
+
.param .b32 __assertfail_param_2,
|
15 |
+
.param .b64 __assertfail_param_3,
|
16 |
+
.param .b64 __assertfail_param_4
|
17 |
+
)
|
18 |
+
;
|
19 |
+
.global .align 1 .b8 assertFunc_1[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
|
20 |
+
.global .align 1 .b8 assertFile_1[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
|
21 |
+
.global .align 1 .b8 assertMessage_1[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 54, 32, 60, 32, 53, 48, 50, 53, 55};
|
22 |
+
.global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
|
23 |
+
.global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
|
24 |
+
.global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55};
|
25 |
+
.extern .shared .align 1 .b8 global_smem[];
|
26 |
+
.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
|
27 |
+
|
28 |
+
.visible .entry triton__0d1d2d3d4d5d6de7de(
|
29 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_0,
|
30 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_1,
|
31 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_2,
|
32 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_3,
|
33 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_4,
|
34 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_5,
|
35 |
+
.param .u32 triton__0d1d2d3d4d5d6de7de_param_6,
|
36 |
+
.param .u32 triton__0d1d2d3d4d5d6de7de_param_7
|
37 |
+
)
|
38 |
+
.maxntid 64, 1, 1
|
39 |
+
{
|
40 |
+
.reg .pred %p<59>;
|
41 |
+
.reg .b16 %rs<13>;
|
42 |
+
.reg .b32 %r<176>;
|
43 |
+
.reg .f32 %f<169>;
|
44 |
+
.reg .b64 %rd<58>;
|
45 |
+
.loc 1 18 0
|
46 |
+
$L__func_begin0:
|
47 |
+
.loc 1 18 0
|
48 |
+
|
49 |
+
ld.param.u64 %rd8, [triton__0d1d2d3d4d5d6de7de_param_4];
|
50 |
+
ld.param.u64 %rd7, [triton__0d1d2d3d4d5d6de7de_param_1];
|
51 |
+
ld.param.u64 %rd22, [triton__0d1d2d3d4d5d6de7de_param_0];
|
52 |
+
$L__tmp0:
|
53 |
+
.loc 1 24 33
|
54 |
+
mov.u32 %r1, %tid.x;
|
55 |
+
and.b32 %r2, %r1, 31;
|
56 |
+
ld.param.u64 %rd23, [triton__0d1d2d3d4d5d6de7de_param_2];
|
57 |
+
ld.param.u64 %rd24, [triton__0d1d2d3d4d5d6de7de_param_3];
|
58 |
+
bfe.u32 %r3, %r1, 5, 1;
|
59 |
+
shl.b32 %r30, %r1, 2;
|
60 |
+
and.b32 %r4, %r30, 252;
|
61 |
+
.loc 1 21 28
|
62 |
+
mov.u32 %r13, %ctaid.x;
|
63 |
+
.loc 1 26 30
|
64 |
+
mul.wide.s32 %rd25, %r13, 8;
|
65 |
+
add.s64 %rd11, %rd22, %rd25;
|
66 |
+
mov.pred %p53, -1;
|
67 |
+
.loc 1 26 35
|
68 |
+
mov.u64 %rd10, 0x0;
|
69 |
+
@%p53 ld.global.L1::evict_last.b64 { %rd10 }, [ %rd11 + 0 ];
|
70 |
+
mov.u64 %rd12, 0x0;
|
71 |
+
@%p53 ld.global.L1::evict_last.b64 { %rd12 }, [ %rd11 + 0 ];
|
72 |
+
mov.u64 %rd14, 0x0;
|
73 |
+
@%p53 ld.global.L1::evict_last.b64 { %rd14 }, [ %rd11 + 0 ];
|
74 |
+
mov.u64 %rd16, 0x0;
|
75 |
+
@%p53 ld.global.L1::evict_last.b64 { %rd16 }, [ %rd11 + 0 ];
|
76 |
+
mov.u64 %rd18, 0x0;
|
77 |
+
@%p53 ld.global.L1::evict_last.b64 { %rd18 }, [ %rd11 + 0 ];
|
78 |
+
.loc 1 27 18
|
79 |
+
shr.s32 %r31, %r13, 31;
|
80 |
+
shr.u32 %r32, %r31, 23;
|
81 |
+
add.s32 %r33, %r13, %r32;
|
82 |
+
and.b32 %r34, %r33, 16776704;
|
83 |
+
sub.s32 %r35, %r13, %r34;
|
84 |
+
.loc 1 35 44
|
85 |
+
shl.b32 %r36, %r35, 8;
|
86 |
+
.loc 1 35 40
|
87 |
+
or.b32 %r37, %r36, %r4;
|
88 |
+
.loc 1 35 34
|
89 |
+
mul.wide.s32 %rd26, %r37, 4;
|
90 |
+
add.s64 %rd37, %rd23, %rd26;
|
91 |
+
mov.b32 %r151, 0;
|
92 |
+
.loc 1 35 50
|
93 |
+
mov.u32 %r14, 0x0;
|
94 |
+
mov.u32 %r15, 0x0;
|
95 |
+
mov.u32 %r16, 0x0;
|
96 |
+
mov.u32 %r17, 0x0;
|
97 |
+
@%p53 ld.global.L1::evict_last.v4.b32 { %r14, %r15, %r16, %r17 }, [ %rd37 + 0 ];
|
98 |
+
@!%p53 mov.u32 %r14, %r151;
|
99 |
+
@!%p53 mov.u32 %r15, %r151;
|
100 |
+
@!%p53 mov.u32 %r16, %r151;
|
101 |
+
@!%p53 mov.u32 %r17, %r151;
|
102 |
+
mov.b32 %f2, %r14;
|
103 |
+
mov.b32 %f1, %r15;
|
104 |
+
mov.b32 %f3, %r16;
|
105 |
+
mov.b32 %f4, %r17;
|
106 |
+
.loc 1 36 44
|
107 |
+
shl.b32 %r38, %r13, 8;
|
108 |
+
.loc 1 36 40
|
109 |
+
or.b32 %r39, %r38, %r4;
|
110 |
+
.loc 1 36 34
|
111 |
+
mul.wide.s32 %rd27, %r39, 2;
|
112 |
+
add.s64 %rd38, %rd24, %rd27;
|
113 |
+
.loc 1 36 50
|
114 |
+
mov.u32 %r22, 0x0;
|
115 |
+
mov.u32 %r23, 0x0;
|
116 |
+
@%p53 ld.global.L1::evict_last.v2.b32 { %r22, %r23 }, [ %rd38 + 0 ];
|
117 |
+
@!%p53 mov.u32 %r22, %r151;
|
118 |
+
@!%p53 mov.u32 %r23, %r151;
|
119 |
+
cvt.u16.u32 %rs1, %r22;
|
120 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r22; }
|
121 |
+
cvt.u16.u32 %rs3, %r23;
|
122 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r23; }
|
123 |
+
.loc 1 36 101
|
124 |
+
cvt.f32.bf16 %r26, %rs1;
|
125 |
+
mov.b32 %f5, %r26;
|
126 |
+
cvt.f32.bf16 %r27, %rs2;
|
127 |
+
mov.b32 %f6, %r27;
|
128 |
+
cvt.f32.bf16 %r28, %rs3;
|
129 |
+
mov.b32 %f7, %r28;
|
130 |
+
cvt.f32.bf16 %r29, %rs4;
|
131 |
+
mov.b32 %f8, %r29;
|
132 |
+
.loc 1 37 22
|
133 |
+
add.s64 %rd28, %rd18, 50257;
|
134 |
+
.loc 1 38 22
|
135 |
+
setp.lt.s64 %p14, %rd18, 0;
|
136 |
+
.loc 1 39 36
|
137 |
+
selp.b64 %rd5, %rd28, %rd18, %p14;
|
138 |
+
.loc 1 40 40
|
139 |
+
setp.lt.u64 %p15, %rd5, 50257;
|
140 |
+
mov.b32 %r175, 883;
|
141 |
+
mov.u64 %rd57, 1;
|
142 |
+
.loc 1 40 55
|
143 |
+
@%p15 bra $L__BB0_2;
|
144 |
+
mov.u64 %rd29, assertMessage_0;
|
145 |
+
cvta.global.u64 %rd30, %rd29;
|
146 |
+
mov.u64 %rd31, assertFile_0;
|
147 |
+
cvta.global.u64 %rd32, %rd31;
|
148 |
+
mov.u64 %rd33, assertFunc_0;
|
149 |
+
cvta.global.u64 %rd34, %rd33;
|
150 |
+
{ // callseq 0, 0
|
151 |
+
.reg .b32 temp_param_reg;
|
152 |
+
.param .b64 param0;
|
153 |
+
st.param.b64 [param0+0], %rd30;
|
154 |
+
.param .b64 param1;
|
155 |
+
st.param.b64 [param1+0], %rd32;
|
156 |
+
.param .b32 param2;
|
157 |
+
st.param.b32 [param2+0], %r175;
|
158 |
+
.param .b64 param3;
|
159 |
+
st.param.b64 [param3+0], %rd34;
|
160 |
+
.param .b64 param4;
|
161 |
+
st.param.b64 [param4+0], %rd57;
|
162 |
+
call.uni
|
163 |
+
__assertfail,
|
164 |
+
(
|
165 |
+
param0,
|
166 |
+
param1,
|
167 |
+
param2,
|
168 |
+
param3,
|
169 |
+
param4
|
170 |
+
);
|
171 |
+
} // callseq 0
|
172 |
+
$L__BB0_2:
|
173 |
+
.loc 1 0 55
|
174 |
+
ld.param.u64 %rd9, [triton__0d1d2d3d4d5d6de7de_param_5];
|
175 |
+
cvt.s64.s32 %rd3, %r39;
|
176 |
+
.loc 1 38 22
|
177 |
+
setp.lt.s64 %p44, %rd10, 0;
|
178 |
+
.loc 1 41 44
|
179 |
+
shl.b64 %rd40, %rd10, 8;
|
180 |
+
add.s64 %rd41, %rd40, 12865792;
|
181 |
+
selp.b64 %rd42, %rd41, %rd40, %p44;
|
182 |
+
cvt.u64.u32 %rd43, %r4;
|
183 |
+
.loc 1 41 40
|
184 |
+
or.b64 %rd44, %rd42, %rd43;
|
185 |
+
.loc 1 41 34
|
186 |
+
shl.b64 %rd45, %rd44, 2;
|
187 |
+
add.s64 %rd54, %rd7, %rd45;
|
188 |
+
.loc 1 41 52
|
189 |
+
mov.u32 %r41, 0x0;
|
190 |
+
mov.u32 %r42, 0x0;
|
191 |
+
mov.u32 %r43, 0x0;
|
192 |
+
mov.u32 %r44, 0x0;
|
193 |
+
@%p53 ld.global.L1::evict_last.v4.b32 { %r41, %r42, %r43, %r44 }, [ %rd54 + 0 ];
|
194 |
+
@!%p53 mov.u32 %r41, %r151;
|
195 |
+
@!%p53 mov.u32 %r42, %r151;
|
196 |
+
@!%p53 mov.u32 %r43, %r151;
|
197 |
+
@!%p53 mov.u32 %r44, %r151;
|
198 |
+
mov.b32 %f15, %r43;
|
199 |
+
mov.b32 %f16, %r44;
|
200 |
+
.loc 1 42 22
|
201 |
+
add.f32 %f17, %f3, %f15;
|
202 |
+
add.f32 %f18, %f4, %f16;
|
203 |
+
.loc 1 44 22
|
204 |
+
add.f32 %f19, %f7, %f17;
|
205 |
+
add.f32 %f20, %f8, %f18;
|
206 |
+
.loc 1 41 52
|
207 |
+
mov.b32 %f21, %r41;
|
208 |
+
mov.b32 %f22, %r42;
|
209 |
+
.loc 1 42 22
|
210 |
+
add.f32 %f23, %f1, %f22;
|
211 |
+
add.f32 %f24, %f2, %f21;
|
212 |
+
.loc 1 44 22
|
213 |
+
add.f32 %f25, %f5, %f24;
|
214 |
+
add.f32 %f26, %f6, %f23;
|
215 |
+
$L__tmp1:
|
216 |
+
.loc 2 98 22
|
217 |
+
add.f32 %f27, %f26, 0f00000000;
|
218 |
+
add.f32 %f28, %f25, 0f00000000;
|
219 |
+
add.f32 %f29, %f19, 0f00000000;
|
220 |
+
add.f32 %f30, %f20, 0f00000000;
|
221 |
+
.loc 2 101 30
|
222 |
+
sub.f32 %f31, %f25, %f28;
|
223 |
+
sub.f32 %f32, %f26, %f27;
|
224 |
+
sub.f32 %f33, %f19, %f29;
|
225 |
+
sub.f32 %f34, %f20, %f30;
|
226 |
+
.loc 2 101 13
|
227 |
+
fma.rn.f32 %f35, %f25, %f31, 0f00000000;
|
228 |
+
fma.rn.f32 %f36, %f26, %f32, 0f00000000;
|
229 |
+
fma.rn.f32 %f37, %f19, %f33, 0f00000000;
|
230 |
+
fma.rn.f32 %f38, %f20, %f34, 0f00000000;
|
231 |
+
$L__tmp2:
|
232 |
+
.loc 2 108 21
|
233 |
+
sub.f32 %f39, %f27, %f28;
|
234 |
+
mov.b32 %r50, 1065353216;
|
235 |
+
mov.b32 %r51, 1073741824;
|
236 |
+
.loc 2 110 60
|
237 |
+
div.full.f32 %r49, %r50, %r51;
|
238 |
+
mov.b32 %f40, %r49;
|
239 |
+
.loc 2 112 17
|
240 |
+
fma.rn.f32 %f41, %f40, %f39, %f28;
|
241 |
+
.loc 2 113 15
|
242 |
+
add.f32 %f42, %f35, %f36;
|
243 |
+
.loc 2 113 30
|
244 |
+
mul.f32 %f43, %f39, %f39;
|
245 |
+
.loc 2 113 22
|
246 |
+
fma.rn.f32 %f44, %f40, %f43, %f42;
|
247 |
+
.loc 2 108 21
|
248 |
+
sub.f32 %f45, %f29, %f41;
|
249 |
+
mov.b32 %r54, 1077936128;
|
250 |
+
.loc 2 110 60
|
251 |
+
div.full.f32 %r52, %r50, %r54;
|
252 |
+
mov.b32 %f46, %r52;
|
253 |
+
.loc 2 112 17
|
254 |
+
fma.rn.f32 %f47, %f46, %f45, %f41;
|
255 |
+
.loc 2 113 15
|
256 |
+
add.f32 %f48, %f37, %f44;
|
257 |
+
.loc 2 113 30
|
258 |
+
mul.f32 %f49, %f45, %f45;
|
259 |
+
.loc 2 113 38
|
260 |
+
fma.rn.f32 %f50, %f45, %f45, %f49;
|
261 |
+
.loc 2 113 22
|
262 |
+
fma.rn.f32 %f51, %f46, %f50, %f48;
|
263 |
+
.loc 2 108 21
|
264 |
+
sub.f32 %f52, %f30, %f47;
|
265 |
+
mov.b32 %r57, 1082130432;
|
266 |
+
.loc 2 110 60
|
267 |
+
div.full.f32 %r55, %r50, %r57;
|
268 |
+
mov.b32 %f53, %r55;
|
269 |
+
.loc 2 112 17
|
270 |
+
fma.rn.f32 %f54, %f53, %f52, %f47;
|
271 |
+
.loc 2 113 15
|
272 |
+
add.f32 %f55, %f38, %f51;
|
273 |
+
.loc 2 113 30
|
274 |
+
mul.f32 %f56, %f52, %f52;
|
275 |
+
.loc 2 113 38
|
276 |
+
mul.f32 %f57, %f56, 0f40400000;
|
277 |
+
.loc 2 113 22
|
278 |
+
fma.rn.f32 %f58, %f53, %f57, %f55;
|
279 |
+
$L__tmp3:
|
280 |
+
.loc 2 120 46
|
281 |
+
mov.b32 %r118, %f54;
|
282 |
+
shfl.sync.bfly.b32 %r119, %r118, 16, 31, -1;
|
283 |
+
mov.b32 %f59, %r119;
|
284 |
+
mov.b32 %r120, %f58;
|
285 |
+
shfl.sync.bfly.b32 %r121, %r120, 16, 31, -1;
|
286 |
+
mov.b32 %f60, %r121;
|
287 |
+
shfl.sync.bfly.b32 %r59, %r57, 16, 31, -1;
|
288 |
+
mov.b32 %f61, %r59;
|
289 |
+
$L__tmp4:
|
290 |
+
.loc 2 108 21
|
291 |
+
sub.f32 %f62, %f59, %f54;
|
292 |
+
.loc 2 109 28
|
293 |
+
add.f32 %f63, %f61, 0f40800000;
|
294 |
+
.loc 2 110 39
|
295 |
+
setp.eq.f32 %p45, %f63, 0f00000000;
|
296 |
+
.loc 2 110 60
|
297 |
+
mov.b32 %r60, %f63;
|
298 |
+
div.full.f32 %r58, %r59, %r60;
|
299 |
+
mov.b32 %f64, %r58;
|
300 |
+
.loc 2 110 49
|
301 |
+
selp.f32 %f65, 0f00000000, %f64, %p45;
|
302 |
+
.loc 2 112 17
|
303 |
+
fma.rn.f32 %f66, %f65, %f62, %f54;
|
304 |
+
.loc 2 113 15
|
305 |
+
add.f32 %f67, %f58, %f60;
|
306 |
+
.loc 2 113 30
|
307 |
+
mul.f32 %f68, %f62, %f62;
|
308 |
+
.loc 2 113 38
|
309 |
+
mul.f32 %f69, %f68, 0f40800000;
|
310 |
+
.loc 2 113 22
|
311 |
+
fma.rn.f32 %f70, %f65, %f69, %f67;
|
312 |
+
$L__tmp5:
|
313 |
+
.loc 2 120 46
|
314 |
+
mov.b32 %r122, %f66;
|
315 |
+
shfl.sync.bfly.b32 %r123, %r122, 8, 31, -1;
|
316 |
+
mov.b32 %f71, %r123;
|
317 |
+
mov.b32 %r124, %f70;
|
318 |
+
shfl.sync.bfly.b32 %r125, %r124, 8, 31, -1;
|
319 |
+
mov.b32 %f72, %r125;
|
320 |
+
shfl.sync.bfly.b32 %r62, %r60, 8, 31, -1;
|
321 |
+
mov.b32 %f73, %r62;
|
322 |
+
$L__tmp6:
|
323 |
+
.loc 2 108 21
|
324 |
+
sub.f32 %f74, %f71, %f66;
|
325 |
+
.loc 2 109 28
|
326 |
+
add.f32 %f75, %f63, %f73;
|
327 |
+
.loc 2 110 39
|
328 |
+
setp.eq.f32 %p46, %f75, 0f00000000;
|
329 |
+
.loc 2 110 60
|
330 |
+
mov.b32 %r63, %f75;
|
331 |
+
div.full.f32 %r61, %r62, %r63;
|
332 |
+
mov.b32 %f76, %r61;
|
333 |
+
.loc 2 110 49
|
334 |
+
selp.f32 %f77, 0f00000000, %f76, %p46;
|
335 |
+
.loc 2 112 17
|
336 |
+
fma.rn.f32 %f78, %f77, %f74, %f66;
|
337 |
+
.loc 2 113 15
|
338 |
+
add.f32 %f79, %f70, %f72;
|
339 |
+
.loc 2 113 30
|
340 |
+
mul.f32 %f80, %f74, %f74;
|
341 |
+
.loc 2 113 38
|
342 |
+
mul.f32 %f81, %f63, %f80;
|
343 |
+
.loc 2 113 22
|
344 |
+
fma.rn.f32 %f82, %f77, %f81, %f79;
|
345 |
+
$L__tmp7:
|
346 |
+
.loc 2 120 46
|
347 |
+
mov.b32 %r126, %f78;
|
348 |
+
shfl.sync.bfly.b32 %r127, %r126, 4, 31, -1;
|
349 |
+
mov.b32 %f83, %r127;
|
350 |
+
mov.b32 %r128, %f82;
|
351 |
+
shfl.sync.bfly.b32 %r129, %r128, 4, 31, -1;
|
352 |
+
mov.b32 %f84, %r129;
|
353 |
+
shfl.sync.bfly.b32 %r65, %r63, 4, 31, -1;
|
354 |
+
mov.b32 %f85, %r65;
|
355 |
+
$L__tmp8:
|
356 |
+
.loc 2 108 21
|
357 |
+
sub.f32 %f86, %f83, %f78;
|
358 |
+
.loc 2 109 28
|
359 |
+
add.f32 %f87, %f75, %f85;
|
360 |
+
.loc 2 110 39
|
361 |
+
setp.eq.f32 %p47, %f87, 0f00000000;
|
362 |
+
.loc 2 110 60
|
363 |
+
mov.b32 %r66, %f87;
|
364 |
+
div.full.f32 %r64, %r65, %r66;
|
365 |
+
mov.b32 %f88, %r64;
|
366 |
+
.loc 2 110 49
|
367 |
+
selp.f32 %f89, 0f00000000, %f88, %p47;
|
368 |
+
.loc 2 112 17
|
369 |
+
fma.rn.f32 %f90, %f89, %f86, %f78;
|
370 |
+
.loc 2 113 15
|
371 |
+
add.f32 %f91, %f82, %f84;
|
372 |
+
.loc 2 113 30
|
373 |
+
mul.f32 %f92, %f86, %f86;
|
374 |
+
.loc 2 113 38
|
375 |
+
mul.f32 %f93, %f75, %f92;
|
376 |
+
.loc 2 113 22
|
377 |
+
fma.rn.f32 %f94, %f89, %f93, %f91;
|
378 |
+
$L__tmp9:
|
379 |
+
.loc 2 120 46
|
380 |
+
mov.b32 %r130, %f90;
|
381 |
+
shfl.sync.bfly.b32 %r131, %r130, 2, 31, -1;
|
382 |
+
mov.b32 %f95, %r131;
|
383 |
+
mov.b32 %r132, %f94;
|
384 |
+
shfl.sync.bfly.b32 %r133, %r132, 2, 31, -1;
|
385 |
+
mov.b32 %f96, %r133;
|
386 |
+
shfl.sync.bfly.b32 %r68, %r66, 2, 31, -1;
|
387 |
+
mov.b32 %f97, %r68;
|
388 |
+
$L__tmp10:
|
389 |
+
.loc 2 108 21
|
390 |
+
sub.f32 %f98, %f95, %f90;
|
391 |
+
.loc 2 109 28
|
392 |
+
add.f32 %f99, %f87, %f97;
|
393 |
+
.loc 2 110 39
|
394 |
+
setp.eq.f32 %p48, %f99, 0f00000000;
|
395 |
+
.loc 2 110 60
|
396 |
+
mov.b32 %r69, %f99;
|
397 |
+
div.full.f32 %r67, %r68, %r69;
|
398 |
+
mov.b32 %f100, %r67;
|
399 |
+
.loc 2 110 49
|
400 |
+
selp.f32 %f101, 0f00000000, %f100, %p48;
|
401 |
+
.loc 2 112 17
|
402 |
+
fma.rn.f32 %f102, %f101, %f98, %f90;
|
403 |
+
.loc 2 113 15
|
404 |
+
add.f32 %f103, %f94, %f96;
|
405 |
+
.loc 2 113 30
|
406 |
+
mul.f32 %f104, %f98, %f98;
|
407 |
+
.loc 2 113 38
|
408 |
+
mul.f32 %f105, %f87, %f104;
|
409 |
+
.loc 2 113 22
|
410 |
+
fma.rn.f32 %f106, %f101, %f105, %f103;
|
411 |
+
$L__tmp11:
|
412 |
+
.loc 2 120 46
|
413 |
+
mov.b32 %r134, %f102;
|
414 |
+
shfl.sync.bfly.b32 %r135, %r134, 1, 31, -1;
|
415 |
+
mov.b32 %f107, %r135;
|
416 |
+
mov.b32 %r136, %f106;
|
417 |
+
shfl.sync.bfly.b32 %r137, %r136, 1, 31, -1;
|
418 |
+
mov.b32 %f108, %r137;
|
419 |
+
shfl.sync.bfly.b32 %r71, %r69, 1, 31, -1;
|
420 |
+
mov.b32 %f109, %r71;
|
421 |
+
$L__tmp12:
|
422 |
+
.loc 2 108 21
|
423 |
+
sub.f32 %f110, %f107, %f102;
|
424 |
+
.loc 2 109 28
|
425 |
+
add.f32 %f111, %f99, %f109;
|
426 |
+
.loc 2 110 39
|
427 |
+
setp.eq.f32 %p49, %f111, 0f00000000;
|
428 |
+
.loc 2 110 60
|
429 |
+
mov.b32 %r72, %f111;
|
430 |
+
div.full.f32 %r70, %r71, %r72;
|
431 |
+
mov.b32 %f112, %r70;
|
432 |
+
.loc 2 110 49
|
433 |
+
selp.f32 %f113, 0f00000000, %f112, %p49;
|
434 |
+
.loc 2 112 17
|
435 |
+
fma.rn.f32 %f114, %f113, %f110, %f102;
|
436 |
+
.loc 2 113 15
|
437 |
+
add.f32 %f115, %f106, %f108;
|
438 |
+
.loc 2 113 30
|
439 |
+
mul.f32 %f116, %f110, %f110;
|
440 |
+
.loc 2 113 38
|
441 |
+
mul.f32 %f117, %f99, %f116;
|
442 |
+
.loc 2 113 22
|
443 |
+
fma.rn.f32 %f118, %f113, %f117, %f115;
|
444 |
+
$L__tmp13:
|
445 |
+
.loc 2 120 46
|
446 |
+
setp.eq.s32 %p21, %r2, 0;
|
447 |
+
shl.b32 %r138, %r3, 2;
|
448 |
+
mov.u32 %r139, global_smem;
|
449 |
+
add.s32 %r73, %r139, %r138;
|
450 |
+
mov.b32 %r74, %f114;
|
451 |
+
@%p21 st.shared.b32 [ %r73 + 0 ], %r74;
|
452 |
+
add.s32 %r140, %r139, 8;
|
453 |
+
add.s32 %r75, %r140, %r138;
|
454 |
+
mov.b32 %r76, %f118;
|
455 |
+
@%p21 st.shared.b32 [ %r75 + 0 ], %r76;
|
456 |
+
add.s32 %r141, %r139, 16;
|
457 |
+
add.s32 %r77, %r141, %r138;
|
458 |
+
@%p21 st.shared.b32 [ %r77 + 0 ], %r72;
|
459 |
+
bar.sync 0;
|
460 |
+
setp.lt.s32 %p24, %r1, 2;
|
461 |
+
add.s32 %r80, %r139, %r30;
|
462 |
+
@%p24 ld.shared.b32 %r79, [ %r80 + 0 ];
|
463 |
+
mov.b32 %f119, %r79;
|
464 |
+
add.s32 %r82, %r140, %r30;
|
465 |
+
@%p24 ld.shared.b32 %r81, [ %r82 + 0 ];
|
466 |
+
mov.b32 %f120, %r81;
|
467 |
+
add.s32 %r84, %r141, %r30;
|
468 |
+
@%p24 ld.shared.b32 %r83, [ %r84 + 0 ];
|
469 |
+
mov.b32 %f121, %r83;
|
470 |
+
shfl.sync.bfly.b32 %r143, %r79, 1, 31, -1;
|
471 |
+
mov.b32 %f122, %r143;
|
472 |
+
shfl.sync.bfly.b32 %r144, %r81, 1, 31, -1;
|
473 |
+
mov.b32 %f123, %r144;
|
474 |
+
shfl.sync.bfly.b32 %r86, %r83, 1, 31, -1;
|
475 |
+
mov.b32 %f124, %r86;
|
476 |
+
$L__tmp14:
|
477 |
+
.loc 2 108 21
|
478 |
+
sub.f32 %f125, %f122, %f119;
|
479 |
+
.loc 2 109 28
|
480 |
+
add.f32 %f126, %f121, %f124;
|
481 |
+
.loc 2 110 39
|
482 |
+
setp.eq.f32 %p50, %f126, 0f00000000;
|
483 |
+
.loc 2 110 60
|
484 |
+
mov.b32 %r87, %f126;
|
485 |
+
div.full.f32 %r85, %r86, %r87;
|
486 |
+
mov.b32 %f127, %r85;
|
487 |
+
.loc 2 110 49
|
488 |
+
selp.f32 %f128, 0f00000000, %f127, %p50;
|
489 |
+
.loc 2 112 17
|
490 |
+
fma.rn.f32 %f129, %f125, %f128, %f119;
|
491 |
+
.loc 2 113 15
|
492 |
+
add.f32 %f130, %f120, %f123;
|
493 |
+
.loc 2 113 30
|
494 |
+
mul.f32 %f131, %f125, %f125;
|
495 |
+
.loc 2 113 38
|
496 |
+
mul.f32 %f132, %f121, %f131;
|
497 |
+
.loc 2 113 22
|
498 |
+
fma.rn.f32 %f133, %f132, %f128, %f130;
|
499 |
+
$L__tmp15:
|
500 |
+
.loc 2 120 46
|
501 |
+
and.b32 %r145, %r1, 1;
|
502 |
+
setp.eq.b32 %p51, %r145, 1;
|
503 |
+
not.pred %p52, %p51;
|
504 |
+
and.pred %p27, %p24, %p52;
|
505 |
+
mov.b32 %r89, %f129;
|
506 |
+
@%p27 st.shared.b32 [ %r80 + 0 ], %r89;
|
507 |
+
mov.b32 %r91, %f133;
|
508 |
+
@%p27 st.shared.b32 [ %r82 + 0 ], %r91;
|
509 |
+
@%p27 st.shared.b32 [ %r84 + 0 ], %r87;
|
510 |
+
bar.sync 0;
|
511 |
+
ld.shared.f32 %f9, [global_smem];
|
512 |
+
ld.shared.f32 %f10, [global_smem+8];
|
513 |
+
$L__tmp16:
|
514 |
+
.loc 1 62 51
|
515 |
+
mov.u32 %r94, 0x0;
|
516 |
+
mov.u32 %r95, 0x0;
|
517 |
+
mov.u32 %r96, 0x0;
|
518 |
+
mov.u32 %r97, 0x0;
|
519 |
+
@%p53 ld.global.L1::evict_last.v4.b32 { %r94, %r95, %r96, %r97 }, [ %rd37 + 0 ];
|
520 |
+
@!%p53 mov.u32 %r94, %r151;
|
521 |
+
@!%p53 mov.u32 %r95, %r151;
|
522 |
+
@!%p53 mov.u32 %r96, %r151;
|
523 |
+
@!%p53 mov.u32 %r97, %r151;
|
524 |
+
.loc 1 63 51
|
525 |
+
mov.u32 %r102, 0x0;
|
526 |
+
mov.u32 %r103, 0x0;
|
527 |
+
@%p53 ld.global.L1::evict_first.v2.b32 { %r102, %r103 }, [ %rd38 + 0 ];
|
528 |
+
@!%p53 mov.u32 %r102, %r151;
|
529 |
+
@!%p53 mov.u32 %r103, %r151;
|
530 |
+
cvt.u16.u32 %rs5, %r102;
|
531 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r102; }
|
532 |
+
cvt.u16.u32 %rs7, %r103;
|
533 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r103; }
|
534 |
+
.loc 1 63 103
|
535 |
+
cvt.f32.bf16 %r106, %rs5;
|
536 |
+
mov.b32 %f11, %r106;
|
537 |
+
cvt.f32.bf16 %r107, %rs6;
|
538 |
+
mov.b32 %f12, %r107;
|
539 |
+
cvt.f32.bf16 %r108, %rs7;
|
540 |
+
mov.b32 %f13, %r108;
|
541 |
+
cvt.f32.bf16 %r109, %rs8;
|
542 |
+
mov.b32 %f14, %r109;
|
543 |
+
.loc 1 64 35
|
544 |
+
mul.wide.u32 %rd46, %r4, 4;
|
545 |
+
add.s64 %rd39, %rd8, %rd46;
|
546 |
+
.loc 1 64 40
|
547 |
+
mov.u32 %r110, 0x0;
|
548 |
+
mov.u32 %r111, 0x0;
|
549 |
+
mov.u32 %r112, 0x0;
|
550 |
+
mov.u32 %r113, 0x0;
|
551 |
+
@%p53 ld.global.L1::evict_last.v4.b32 { %r110, %r111, %r112, %r113 }, [ %rd39 + 0 ];
|
552 |
+
@!%p53 mov.u32 %r110, %r151;
|
553 |
+
@!%p53 mov.u32 %r111, %r151;
|
554 |
+
@!%p53 mov.u32 %r112, %r151;
|
555 |
+
@!%p53 mov.u32 %r113, %r151;
|
556 |
+
.loc 1 68 57
|
557 |
+
@%p15 bra $L__BB0_4;
|
558 |
+
mov.u64 %rd47, assertMessage_1;
|
559 |
+
cvta.global.u64 %rd48, %rd47;
|
560 |
+
mov.u64 %rd49, assertFile_1;
|
561 |
+
cvta.global.u64 %rd50, %rd49;
|
562 |
+
mov.u64 %rd51, assertFunc_1;
|
563 |
+
cvta.global.u64 %rd52, %rd51;
|
564 |
+
{ // callseq 1, 0
|
565 |
+
.reg .b32 temp_param_reg;
|
566 |
+
.param .b64 param0;
|
567 |
+
st.param.b64 [param0+0], %rd48;
|
568 |
+
.param .b64 param1;
|
569 |
+
st.param.b64 [param1+0], %rd50;
|
570 |
+
.param .b32 param2;
|
571 |
+
st.param.b32 [param2+0], %r175;
|
572 |
+
.param .b64 param3;
|
573 |
+
st.param.b64 [param3+0], %rd52;
|
574 |
+
.param .b64 param4;
|
575 |
+
st.param.b64 [param4+0], %rd57;
|
576 |
+
call.uni
|
577 |
+
__assertfail,
|
578 |
+
(
|
579 |
+
param0,
|
580 |
+
param1,
|
581 |
+
param2,
|
582 |
+
param3,
|
583 |
+
param4
|
584 |
+
);
|
585 |
+
} // callseq 1
|
586 |
+
$L__BB0_4:
|
587 |
+
.loc 1 69 54
|
588 |
+
mov.u32 %r147, 0x0;
|
589 |
+
mov.u32 %r148, 0x0;
|
590 |
+
mov.u32 %r149, 0x0;
|
591 |
+
mov.u32 %r150, 0x0;
|
592 |
+
@%p53 ld.global.L1::evict_first.v4.b32 { %r147, %r148, %r149, %r150 }, [ %rd54 + 0 ];
|
593 |
+
@!%p53 mov.u32 %r147, %r151;
|
594 |
+
@!%p53 mov.u32 %r148, %r151;
|
595 |
+
@!%p53 mov.u32 %r149, %r151;
|
596 |
+
@!%p53 mov.u32 %r150, %r151;
|
597 |
+
.loc 1 75 24
|
598 |
+
mov.b32 %r156, %f10;
|
599 |
+
mov.b32 %r157, 1132462080;
|
600 |
+
div.full.f32 %r155, %r156, %r157;
|
601 |
+
mov.b32 %f134, %r155;
|
602 |
+
.loc 1 77 24
|
603 |
+
add.f32 %f135, %f134, 0f3727C5AC;
|
604 |
+
.loc 1 78 30
|
605 |
+
rsqrt.approx.ftz.f32 %f136, %f135;
|
606 |
+
.loc 1 69 54
|
607 |
+
mov.b32 %f137, %r150;
|
608 |
+
.loc 1 62 51
|
609 |
+
mov.b32 %f138, %r97;
|
610 |
+
.loc 1 70 24
|
611 |
+
add.f32 %f139, %f138, %f137;
|
612 |
+
.loc 1 72 24
|
613 |
+
add.f32 %f140, %f14, %f139;
|
614 |
+
.loc 1 73 24
|
615 |
+
sub.f32 %f141, %f140, %f9;
|
616 |
+
.loc 1 69 54
|
617 |
+
mov.b32 %f142, %r149;
|
618 |
+
.loc 1 62 51
|
619 |
+
mov.b32 %f143, %r96;
|
620 |
+
.loc 1 70 24
|
621 |
+
add.f32 %f144, %f143, %f142;
|
622 |
+
.loc 1 72 24
|
623 |
+
add.f32 %f145, %f13, %f144;
|
624 |
+
.loc 1 73 24
|
625 |
+
sub.f32 %f146, %f145, %f9;
|
626 |
+
.loc 1 69 54
|
627 |
+
mov.b32 %f147, %r148;
|
628 |
+
.loc 1 62 51
|
629 |
+
mov.b32 %f148, %r95;
|
630 |
+
.loc 1 70 24
|
631 |
+
add.f32 %f149, %f148, %f147;
|
632 |
+
.loc 1 72 24
|
633 |
+
add.f32 %f150, %f12, %f149;
|
634 |
+
.loc 1 73 24
|
635 |
+
sub.f32 %f151, %f150, %f9;
|
636 |
+
.loc 1 69 54
|
637 |
+
mov.b32 %f152, %r147;
|
638 |
+
.loc 1 62 51
|
639 |
+
mov.b32 %f153, %r94;
|
640 |
+
.loc 1 70 24
|
641 |
+
add.f32 %f154, %f153, %f152;
|
642 |
+
.loc 1 72 24
|
643 |
+
add.f32 %f155, %f11, %f154;
|
644 |
+
.loc 1 73 24
|
645 |
+
sub.f32 %f156, %f155, %f9;
|
646 |
+
.loc 1 64 40
|
647 |
+
mov.b32 %f157, %r110;
|
648 |
+
mov.b32 %f158, %r111;
|
649 |
+
mov.b32 %f159, %r112;
|
650 |
+
mov.b32 %f160, %r113;
|
651 |
+
.loc 1 79 24
|
652 |
+
mul.f32 %f161, %f156, %f136;
|
653 |
+
mul.f32 %f162, %f151, %f136;
|
654 |
+
mul.f32 %f163, %f146, %f136;
|
655 |
+
mul.f32 %f164, %f141, %f136;
|
656 |
+
.loc 1 80 24
|
657 |
+
mul.f32 %f165, %f161, %f157;
|
658 |
+
mul.f32 %f166, %f162, %f158;
|
659 |
+
mul.f32 %f167, %f163, %f159;
|
660 |
+
mul.f32 %f168, %f164, %f160;
|
661 |
+
.loc 1 82 29
|
662 |
+
shl.b64 %rd56, %rd3, 1;
|
663 |
+
add.s64 %rd55, %rd9, %rd56;
|
664 |
+
.loc 1 82 52
|
665 |
+
mov.b32 %r167, %f165;
|
666 |
+
cvt.rn.bf16.f32 %rs9, %r167;
|
667 |
+
mov.b32 %r168, %f166;
|
668 |
+
cvt.rn.bf16.f32 %rs10, %r168;
|
669 |
+
mov.b32 %r169, %f167;
|
670 |
+
cvt.rn.bf16.f32 %rs11, %r169;
|
671 |
+
mov.b32 %r170, %f168;
|
672 |
+
cvt.rn.bf16.f32 %rs12, %r170;
|
673 |
+
mov.b32 %r173, {%rs9, %rs10};
|
674 |
+
mov.b32 %r174, {%rs11, %rs12};
|
675 |
+
@%p53 st.global.v2.b32 [ %rd55 + 0 ], { %r173, %r174 };
|
676 |
+
.loc 1 58 4
|
677 |
+
ret;
|
678 |
+
$L__tmp17:
|
679 |
+
$L__func_end0:
|
680 |
+
|
681 |
+
}
|
682 |
+
// .globl __nv_rsqrtf
|
683 |
+
.visible .func (.param .b32 func_retval0) __nv_rsqrtf(
|
684 |
+
.param .b32 __nv_rsqrtf_param_0
|
685 |
+
)
|
686 |
+
{
|
687 |
+
.reg .f32 %f<3>;
|
688 |
+
$L__func_begin1:
|
689 |
+
|
690 |
+
ld.param.f32 %f1, [__nv_rsqrtf_param_0];
|
691 |
+
rsqrt.approx.ftz.f32 %f2, %f1;
|
692 |
+
st.param.f32 [func_retval0+0], %f2;
|
693 |
+
ret;
|
694 |
+
$L__func_end1:
|
695 |
+
|
696 |
+
}
|
697 |
+
.file 1 "/tmp/torchinductor_root/pn/cpn3lawg65lpi63gv6c6pn4oikhg6qva2h2qjdpxe6qj4lvttwez.py"
|
698 |
+
.file 2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
|
699 |
+
.section .debug_abbrev
|
700 |
+
{
|
701 |
+
.b8 1
|
702 |
+
.b8 17
|
703 |
+
.b8 1
|
704 |
+
.b8 37
|
705 |
+
.b8 8
|
706 |
+
.b8 19
|
707 |
+
.b8 5
|
708 |
+
.b8 3
|
709 |
+
.b8 8
|
710 |
+
.b8 16
|
711 |
+
.b8 6
|
712 |
+
.b8 27
|
713 |
+
.b8 8
|
714 |
+
.b8 180
|
715 |
+
.b8 66
|
716 |
+
.b8 12
|
717 |
+
.b8 17
|
718 |
+
.b8 1
|
719 |
+
.b8 18
|
720 |
+
.b8 1
|
721 |
+
.b8 0
|
722 |
+
.b8 0
|
723 |
+
.b8 2
|
724 |
+
.b8 46
|
725 |
+
.b8 0
|
726 |
+
.b8 135
|
727 |
+
.b8 64
|
728 |
+
.b8 8
|
729 |
+
.b8 3
|
730 |
+
.b8 8
|
731 |
+
.b8 58
|
732 |
+
.b8 11
|
733 |
+
.b8 59
|
734 |
+
.b8 11
|
735 |
+
.b8 63
|
736 |
+
.b8 12
|
737 |
+
.b8 32
|
738 |
+
.b8 11
|
739 |
+
.b8 0
|
740 |
+
.b8 0
|
741 |
+
.b8 3
|
742 |
+
.b8 46
|
743 |
+
.b8 1
|
744 |
+
.b8 17
|
745 |
+
.b8 1
|
746 |
+
.b8 18
|
747 |
+
.b8 1
|
748 |
+
.b8 64
|
749 |
+
.b8 10
|
750 |
+
.b8 49
|
751 |
+
.b8 19
|
752 |
+
.b8 0
|
753 |
+
.b8 0
|
754 |
+
.b8 4
|
755 |
+
.b8 29
|
756 |
+
.b8 0
|
757 |
+
.b8 49
|
758 |
+
.b8 19
|
759 |
+
.b8 17
|
760 |
+
.b8 1
|
761 |
+
.b8 18
|
762 |
+
.b8 1
|
763 |
+
.b8 88
|
764 |
+
.b8 11
|
765 |
+
.b8 89
|
766 |
+
.b8 11
|
767 |
+
.b8 87
|
768 |
+
.b8 11
|
769 |
+
.b8 0
|
770 |
+
.b8 0
|
771 |
+
.b8 5
|
772 |
+
.b8 29
|
773 |
+
.b8 1
|
774 |
+
.b8 49
|
775 |
+
.b8 19
|
776 |
+
.b8 17
|
777 |
+
.b8 1
|
778 |
+
.b8 18
|
779 |
+
.b8 1
|
780 |
+
.b8 88
|
781 |
+
.b8 11
|
782 |
+
.b8 89
|
783 |
+
.b8 11
|
784 |
+
.b8 87
|
785 |
+
.b8 11
|
786 |
+
.b8 0
|
787 |
+
.b8 0
|
788 |
+
.b8 0
|
789 |
+
}
|
790 |
+
.section .debug_info
|
791 |
+
{
|
792 |
+
.b32 302
|
793 |
+
.b8 2
|
794 |
+
.b8 0
|
795 |
+
.b32 .debug_abbrev
|
796 |
+
.b8 8
|
797 |
+
.b8 1
|
798 |
+
.b8 116
|
799 |
+
.b8 114
|
800 |
+
.b8 105
|
801 |
+
.b8 116
|
802 |
+
.b8 111
|
803 |
+
.b8 110
|
804 |
+
.b8 0
|
805 |
+
.b8 2
|
806 |
+
.b8 0
|
807 |
+
.b8 99
|
808 |
+
.b8 112
|
809 |
+
.b8 110
|
810 |
+
.b8 51
|
811 |
+
.b8 108
|
812 |
+
.b8 97
|
813 |
+
.b8 119
|
814 |
+
.b8 103
|
815 |
+
.b8 54
|
816 |
+
.b8 53
|
817 |
+
.b8 108
|
818 |
+
.b8 112
|
819 |
+
.b8 105
|
820 |
+
.b8 54
|
821 |
+
.b8 51
|
822 |
+
.b8 103
|
823 |
+
.b8 118
|
824 |
+
.b8 54
|
825 |
+
.b8 99
|
826 |
+
.b8 54
|
827 |
+
.b8 112
|
828 |
+
.b8 110
|
829 |
+
.b8 52
|
830 |
+
.b8 111
|
831 |
+
.b8 105
|
832 |
+
.b8 107
|
833 |
+
.b8 104
|
834 |
+
.b8 103
|
835 |
+
.b8 54
|
836 |
+
.b8 113
|
837 |
+
.b8 118
|
838 |
+
.b8 97
|
839 |
+
.b8 50
|
840 |
+
.b8 104
|
841 |
+
.b8 50
|
842 |
+
.b8 113
|
843 |
+
.b8 106
|
844 |
+
.b8 100
|
845 |
+
.b8 112
|
846 |
+
.b8 120
|
847 |
+
.b8 101
|
848 |
+
.b8 54
|
849 |
+
.b8 113
|
850 |
+
.b8 106
|
851 |
+
.b8 52
|
852 |
+
.b8 108
|
853 |
+
.b8 118
|
854 |
+
.b8 116
|
855 |
+
.b8 116
|
856 |
+
.b8 119
|
857 |
+
.b8 101
|
858 |
+
.b8 122
|
859 |
+
.b8 46
|
860 |
+
.b8 112
|
861 |
+
.b8 121
|
862 |
+
.b8 0
|
863 |
+
.b32 .debug_line
|
864 |
+
.b8 47
|
865 |
+
.b8 116
|
866 |
+
.b8 109
|
867 |
+
.b8 112
|
868 |
+
.b8 47
|
869 |
+
.b8 116
|
870 |
+
.b8 111
|
871 |
+
.b8 114
|
872 |
+
.b8 99
|
873 |
+
.b8 104
|
874 |
+
.b8 105
|
875 |
+
.b8 110
|
876 |
+
.b8 100
|
877 |
+
.b8 117
|
878 |
+
.b8 99
|
879 |
+
.b8 116
|
880 |
+
.b8 111
|
881 |
+
.b8 114
|
882 |
+
.b8 95
|
883 |
+
.b8 114
|
884 |
+
.b8 111
|
885 |
+
.b8 111
|
886 |
+
.b8 116
|
887 |
+
.b8 47
|
888 |
+
.b8 112
|
889 |
+
.b8 110
|
890 |
+
.b8 0
|
891 |
+
.b8 1
|
892 |
+
.b64 $L__func_begin0
|
893 |
+
.b64 $L__func_end0
|
894 |
+
.b8 2
|
895 |
+
.b8 116
|
896 |
+
.b8 114
|
897 |
+
.b8 105
|
898 |
+
.b8 116
|
899 |
+
.b8 111
|
900 |
+
.b8 110
|
901 |
+
.b8 95
|
902 |
+
.b8 95
|
903 |
+
.b8 48
|
904 |
+
.b8 100
|
905 |
+
.b8 49
|
906 |
+
.b8 100
|
907 |
+
.b8 50
|
908 |
+
.b8 100
|
909 |
+
.b8 51
|
910 |
+
.b8 100
|
911 |
+
.b8 52
|
912 |
+
.b8 100
|
913 |
+
.b8 53
|
914 |
+
.b8 100
|
915 |
+
.b8 54
|
916 |
+
.b8 100
|
917 |
+
.b8 101
|
918 |
+
.b8 55
|
919 |
+
.b8 100
|
920 |
+
.b8 101
|
921 |
+
.b8 0
|
922 |
+
.b8 116
|
923 |
+
.b8 114
|
924 |
+
.b8 105
|
925 |
+
.b8 116
|
926 |
+
.b8 111
|
927 |
+
.b8 110
|
928 |
+
.b8 95
|
929 |
+
.b8 95
|
930 |
+
.b8 48
|
931 |
+
.b8 100
|
932 |
+
.b8 49
|
933 |
+
.b8 100
|
934 |
+
.b8 50
|
935 |
+
.b8 100
|
936 |
+
.b8 51
|
937 |
+
.b8 100
|
938 |
+
.b8 52
|
939 |
+
.b8 100
|
940 |
+
.b8 53
|
941 |
+
.b8 100
|
942 |
+
.b8 54
|
943 |
+
.b8 100
|
944 |
+
.b8 101
|
945 |
+
.b8 55
|
946 |
+
.b8 100
|
947 |
+
.b8 101
|
948 |
+
.b8 0
|
949 |
+
.b8 1
|
950 |
+
.b8 18
|
951 |
+
.b8 1
|
952 |
+
.b8 1
|
953 |
+
.b8 3
|
954 |
+
.b64 $L__func_begin0
|
955 |
+
.b64 $L__func_end0
|
956 |
+
.b8 1
|
957 |
+
.b8 156
|
958 |
+
.b32 125
|
959 |
+
.b8 4
|
960 |
+
.b32 125
|
961 |
+
.b64 $L__tmp1
|
962 |
+
.b64 $L__tmp2
|
963 |
+
.b8 2
|
964 |
+
.b8 47
|
965 |
+
.b8 41
|
966 |
+
.b8 5
|
967 |
+
.b32 125
|
968 |
+
.b64 $L__tmp2
|
969 |
+
.b64 $L__tmp15
|
970 |
+
.b8 2
|
971 |
+
.b8 53
|
972 |
+
.b8 44
|
973 |
+
.b8 4
|
974 |
+
.b32 125
|
975 |
+
.b64 $L__tmp2
|
976 |
+
.b64 $L__tmp15
|
977 |
+
.b8 2
|
978 |
+
.b8 120
|
979 |
+
.b8 46
|
980 |
+
.b8 0
|
981 |
+
.b8 4
|
982 |
+
.b32 125
|
983 |
+
.b64 $L__tmp3
|
984 |
+
.b64 $L__tmp16
|
985 |
+
.b8 2
|
986 |
+
.b8 53
|
987 |
+
.b8 44
|
988 |
+
.b8 0
|
989 |
+
.b8 0
|
990 |
+
}
|
991 |
+
.section .debug_pubnames
|
992 |
+
{
|
993 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
994 |
+
$L__pubNames_start0:
|
995 |
+
.b8 2
|
996 |
+
.b8 0
|
997 |
+
.b32 .debug_info
|
998 |
+
.b32 306
|
999 |
+
.b32 125
|
1000 |
+
.b8 116
|
1001 |
+
.b8 114
|
1002 |
+
.b8 105
|
1003 |
+
.b8 116
|
1004 |
+
.b8 111
|
1005 |
+
.b8 110
|
1006 |
+
.b8 95
|
1007 |
+
.b8 95
|
1008 |
+
.b8 48
|
1009 |
+
.b8 100
|
1010 |
+
.b8 49
|
1011 |
+
.b8 100
|
1012 |
+
.b8 50
|
1013 |
+
.b8 100
|
1014 |
+
.b8 51
|
1015 |
+
.b8 100
|
1016 |
+
.b8 52
|
1017 |
+
.b8 100
|
1018 |
+
.b8 53
|
1019 |
+
.b8 100
|
1020 |
+
.b8 54
|
1021 |
+
.b8 100
|
1022 |
+
.b8 101
|
1023 |
+
.b8 55
|
1024 |
+
.b8 100
|
1025 |
+
.b8 101
|
1026 |
+
.b8 0
|
1027 |
+
.b32 0
|
1028 |
+
$L__pubNames_end0:
|
1029 |
+
}
|
1030 |
+
.section .debug_pubtypes
|
1031 |
+
{
|
1032 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
1033 |
+
$L__pubTypes_start0:
|
1034 |
+
.b8 2
|
1035 |
+
.b8 0
|
1036 |
+
.b32 .debug_info
|
1037 |
+
.b32 306
|
1038 |
+
.b32 0
|
1039 |
+
$L__pubTypes_end0:
|
1040 |
+
}
|
1041 |
+
.section .debug_loc { }
|
.triton/dump/415aac87553b7d064f52694fa7254686/triton_.llir
ADDED
@@ -0,0 +1,860 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
|
5 |
+
|
6 |
+
define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !7 {
|
7 |
+
%4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
|
8 |
+
%5 = shl i32 %4, 3, !dbg !10
|
9 |
+
%6 = and i32 %5, 1016, !dbg !10
|
10 |
+
%7 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #4, !dbg !11
|
11 |
+
%8 = shl i32 %7, 10, !dbg !12
|
12 |
+
%9 = or i32 %8, %6, !dbg !13
|
13 |
+
%10 = sext i32 %9 to i64, !dbg !14
|
14 |
+
%11 = getelementptr i16, ptr addrspace(1) %0, i64 %10, !dbg !14
|
15 |
+
%12 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l,b"(ptr addrspace(1) %11, i1 true) #4, !dbg !15
|
16 |
+
%13 = extractvalue { i32, i32, i32, i32 } %12, 0, !dbg !15
|
17 |
+
%14 = extractvalue { i32, i32, i32, i32 } %12, 1, !dbg !15
|
18 |
+
%15 = extractvalue { i32, i32, i32, i32 } %12, 2, !dbg !15
|
19 |
+
%16 = extractvalue { i32, i32, i32, i32 } %12, 3, !dbg !15
|
20 |
+
%17 = trunc i32 %13 to i16, !dbg !15
|
21 |
+
%extelt.offset = lshr i32 %13, 16, !dbg !15
|
22 |
+
%18 = trunc i32 %extelt.offset to i16, !dbg !15
|
23 |
+
%19 = trunc i32 %14 to i16, !dbg !15
|
24 |
+
%extelt.offset1 = lshr i32 %14, 16, !dbg !15
|
25 |
+
%20 = trunc i32 %extelt.offset1 to i16, !dbg !15
|
26 |
+
%21 = trunc i32 %15 to i16, !dbg !15
|
27 |
+
%extelt.offset2 = lshr i32 %15, 16, !dbg !15
|
28 |
+
%22 = trunc i32 %extelt.offset2 to i16, !dbg !15
|
29 |
+
%23 = trunc i32 %16 to i16, !dbg !15
|
30 |
+
%extelt.offset3 = lshr i32 %16, 16, !dbg !15
|
31 |
+
%24 = trunc i32 %extelt.offset3 to i16, !dbg !15
|
32 |
+
%25 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %17) #4, !dbg !16
|
33 |
+
%26 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %18) #4, !dbg !16
|
34 |
+
%27 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %19) #4, !dbg !16
|
35 |
+
%28 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %20) #4, !dbg !16
|
36 |
+
%29 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %21) #4, !dbg !16
|
37 |
+
%30 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %22) #4, !dbg !16
|
38 |
+
%31 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %23) #4, !dbg !16
|
39 |
+
%32 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %24) #4, !dbg !16
|
40 |
+
%33 = fmul float %25, 0x3FE6A09E60000000, !dbg !17
|
41 |
+
%34 = fmul float %26, 0x3FE6A09E60000000, !dbg !17
|
42 |
+
%35 = fmul float %27, 0x3FE6A09E60000000, !dbg !17
|
43 |
+
%36 = fmul float %28, 0x3FE6A09E60000000, !dbg !17
|
44 |
+
%37 = fmul float %29, 0x3FE6A09E60000000, !dbg !17
|
45 |
+
%38 = fmul float %30, 0x3FE6A09E60000000, !dbg !17
|
46 |
+
%39 = fmul float %31, 0x3FE6A09E60000000, !dbg !17
|
47 |
+
%40 = fmul float %32, 0x3FE6A09E60000000, !dbg !17
|
48 |
+
%41 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
49 |
+
%.not.i = icmp eq i32 %41, 0, !dbg !18
|
50 |
+
%42 = tail call float @llvm.nvvm.fabs.ftz.f(float %33) #4, !dbg !18
|
51 |
+
%43 = tail call float @llvm.nvvm.fabs.f(float %33) #4, !dbg !18
|
52 |
+
%.0.i = select i1 %.not.i, float %43, float %42, !dbg !18
|
53 |
+
%44 = fcmp oge float %.0.i, 0x3FF00C1FC0000000, !dbg !18
|
54 |
+
br i1 %44, label %__nv_fabsf.exit1.i, label %46, !dbg !18
|
55 |
+
|
56 |
+
__nv_fabsf.exit1.i: ; preds = %3
|
57 |
+
%45 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
58 |
+
%.not1.i = icmp eq i32 %45, 0, !dbg !18
|
59 |
+
%.01.i = select i1 %.not1.i, float %43, float %42, !dbg !18
|
60 |
+
br label %__internal_fmad.exit.i, !dbg !18
|
61 |
+
|
62 |
+
46: ; preds = %3
|
63 |
+
%47 = fmul float %33, %33, !dbg !18
|
64 |
+
br label %__internal_fmad.exit.i, !dbg !18
|
65 |
+
|
66 |
+
__internal_fmad.exit.i: ; preds = %46, %__nv_fabsf.exit1.i
|
67 |
+
%48 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i ], [ 0x3FC06EBA60000000, %46 ], !dbg !18
|
68 |
+
%49 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i ], [ 0xBFD8127580000000, %46 ], !dbg !18
|
69 |
+
%50 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i ], [ 0x3FBCE315E0000000, %46 ], !dbg !18
|
70 |
+
%51 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i ], [ 0xBF9B837CE0000000, %46 ], !dbg !18
|
71 |
+
%52 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i ], [ 0x3F755ABD40000000, %46 ], !dbg !18
|
72 |
+
%53 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i ], [ 0xBF4AE9A400000000, %46 ], !dbg !18
|
73 |
+
%54 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i ], [ 0x3F163D2D40000000, %46 ], !dbg !18
|
74 |
+
%55 = phi float [ %.01.i, %__nv_fabsf.exit1.i ], [ %47, %46 ], !dbg !18
|
75 |
+
%56 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
76 |
+
%.not2.i = icmp eq i32 %56, 0, !dbg !18
|
77 |
+
%57 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %54, float %55, float %53) #4, !dbg !18
|
78 |
+
%58 = tail call float @llvm.nvvm.fma.rn.f(float %54, float %55, float %53) #4, !dbg !18
|
79 |
+
%.02.i = select i1 %.not2.i, float %58, float %57, !dbg !18
|
80 |
+
%59 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
81 |
+
%.not3.i = icmp eq i32 %59, 0, !dbg !18
|
82 |
+
%60 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i, float %55, float %52) #4, !dbg !18
|
83 |
+
%61 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i, float %55, float %52) #4, !dbg !18
|
84 |
+
%.03.i = select i1 %.not3.i, float %61, float %60, !dbg !18
|
85 |
+
%62 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
86 |
+
%.not4.i = icmp eq i32 %62, 0, !dbg !18
|
87 |
+
%63 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i, float %55, float %51) #4, !dbg !18
|
88 |
+
%64 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i, float %55, float %51) #4, !dbg !18
|
89 |
+
%.04.i = select i1 %.not4.i, float %64, float %63, !dbg !18
|
90 |
+
%65 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
91 |
+
%.not5.i = icmp eq i32 %65, 0, !dbg !18
|
92 |
+
%66 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i, float %55, float %50) #4, !dbg !18
|
93 |
+
%67 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i, float %55, float %50) #4, !dbg !18
|
94 |
+
%.05.i = select i1 %.not5.i, float %67, float %66, !dbg !18
|
95 |
+
%68 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
96 |
+
%.not6.i = icmp eq i32 %68, 0, !dbg !18
|
97 |
+
%69 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i, float %55, float %49) #4, !dbg !18
|
98 |
+
%70 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i, float %55, float %49) #4, !dbg !18
|
99 |
+
%.06.i = select i1 %.not6.i, float %70, float %69, !dbg !18
|
100 |
+
%71 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
101 |
+
%.not7.i = icmp eq i32 %71, 0, !dbg !18
|
102 |
+
%72 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i, float %55, float %48) #4, !dbg !18
|
103 |
+
%73 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i, float %55, float %48) #4, !dbg !18
|
104 |
+
%.07.i = select i1 %.not7.i, float %73, float %72, !dbg !18
|
105 |
+
%74 = fneg float %55, !dbg !18
|
106 |
+
%75 = select i1 %44, float %74, float %33, !dbg !18
|
107 |
+
%76 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
108 |
+
%.not8.i = icmp eq i32 %76, 0, !dbg !18
|
109 |
+
%77 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i, float %75, float %75) #4, !dbg !18
|
110 |
+
%78 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i, float %75, float %75) #4, !dbg !18
|
111 |
+
%.08.i = select i1 %.not8.i, float %78, float %77, !dbg !18
|
112 |
+
br i1 %44, label %79, label %__nv_erff.exit, !dbg !18
|
113 |
+
|
114 |
+
79: ; preds = %__internal_fmad.exit.i
|
115 |
+
%80 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i) #4, !dbg !18
|
116 |
+
%81 = fsub float 1.000000e+00, %80, !dbg !18
|
117 |
+
%82 = bitcast float %81 to i32, !dbg !18
|
118 |
+
%83 = bitcast float %33 to i32, !dbg !18
|
119 |
+
%84 = and i32 %83, -2147483648, !dbg !18
|
120 |
+
%85 = or i32 %84, %82, !dbg !18
|
121 |
+
%86 = bitcast i32 %85 to float, !dbg !18
|
122 |
+
br label %__nv_erff.exit, !dbg !18
|
123 |
+
|
124 |
+
__nv_erff.exit: ; preds = %__internal_fmad.exit.i, %79
|
125 |
+
%r.0.i = phi float [ %86, %79 ], [ %.08.i, %__internal_fmad.exit.i ], !dbg !18
|
126 |
+
%87 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
127 |
+
%.not.i4 = icmp eq i32 %87, 0, !dbg !18
|
128 |
+
%88 = tail call float @llvm.nvvm.fabs.ftz.f(float %34) #4, !dbg !18
|
129 |
+
%89 = tail call float @llvm.nvvm.fabs.f(float %34) #4, !dbg !18
|
130 |
+
%.0.i5 = select i1 %.not.i4, float %89, float %88, !dbg !18
|
131 |
+
%90 = fcmp oge float %.0.i5, 0x3FF00C1FC0000000, !dbg !18
|
132 |
+
br i1 %90, label %__nv_fabsf.exit1.i22, label %92, !dbg !18
|
133 |
+
|
134 |
+
__nv_fabsf.exit1.i22: ; preds = %__nv_erff.exit
|
135 |
+
%91 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
136 |
+
%.not1.i23 = icmp eq i32 %91, 0, !dbg !18
|
137 |
+
%.01.i24 = select i1 %.not1.i23, float %89, float %88, !dbg !18
|
138 |
+
br label %__internal_fmad.exit.i6, !dbg !18
|
139 |
+
|
140 |
+
92: ; preds = %__nv_erff.exit
|
141 |
+
%93 = fmul float %34, %34, !dbg !18
|
142 |
+
br label %__internal_fmad.exit.i6, !dbg !18
|
143 |
+
|
144 |
+
__internal_fmad.exit.i6: ; preds = %92, %__nv_fabsf.exit1.i22
|
145 |
+
%94 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i22 ], [ 0x3FC06EBA60000000, %92 ], !dbg !18
|
146 |
+
%95 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i22 ], [ 0xBFD8127580000000, %92 ], !dbg !18
|
147 |
+
%96 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i22 ], [ 0x3FBCE315E0000000, %92 ], !dbg !18
|
148 |
+
%97 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i22 ], [ 0xBF9B837CE0000000, %92 ], !dbg !18
|
149 |
+
%98 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i22 ], [ 0x3F755ABD40000000, %92 ], !dbg !18
|
150 |
+
%99 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i22 ], [ 0xBF4AE9A400000000, %92 ], !dbg !18
|
151 |
+
%100 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i22 ], [ 0x3F163D2D40000000, %92 ], !dbg !18
|
152 |
+
%101 = phi float [ %.01.i24, %__nv_fabsf.exit1.i22 ], [ %93, %92 ], !dbg !18
|
153 |
+
%102 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
154 |
+
%.not2.i7 = icmp eq i32 %102, 0, !dbg !18
|
155 |
+
%103 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %100, float %101, float %99) #4, !dbg !18
|
156 |
+
%104 = tail call float @llvm.nvvm.fma.rn.f(float %100, float %101, float %99) #4, !dbg !18
|
157 |
+
%.02.i8 = select i1 %.not2.i7, float %104, float %103, !dbg !18
|
158 |
+
%105 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
159 |
+
%.not3.i9 = icmp eq i32 %105, 0, !dbg !18
|
160 |
+
%106 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i8, float %101, float %98) #4, !dbg !18
|
161 |
+
%107 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i8, float %101, float %98) #4, !dbg !18
|
162 |
+
%.03.i10 = select i1 %.not3.i9, float %107, float %106, !dbg !18
|
163 |
+
%108 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
164 |
+
%.not4.i11 = icmp eq i32 %108, 0, !dbg !18
|
165 |
+
%109 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i10, float %101, float %97) #4, !dbg !18
|
166 |
+
%110 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i10, float %101, float %97) #4, !dbg !18
|
167 |
+
%.04.i12 = select i1 %.not4.i11, float %110, float %109, !dbg !18
|
168 |
+
%111 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
169 |
+
%.not5.i13 = icmp eq i32 %111, 0, !dbg !18
|
170 |
+
%112 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i12, float %101, float %96) #4, !dbg !18
|
171 |
+
%113 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i12, float %101, float %96) #4, !dbg !18
|
172 |
+
%.05.i14 = select i1 %.not5.i13, float %113, float %112, !dbg !18
|
173 |
+
%114 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
174 |
+
%.not6.i15 = icmp eq i32 %114, 0, !dbg !18
|
175 |
+
%115 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i14, float %101, float %95) #4, !dbg !18
|
176 |
+
%116 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i14, float %101, float %95) #4, !dbg !18
|
177 |
+
%.06.i16 = select i1 %.not6.i15, float %116, float %115, !dbg !18
|
178 |
+
%117 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
179 |
+
%.not7.i17 = icmp eq i32 %117, 0, !dbg !18
|
180 |
+
%118 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i16, float %101, float %94) #4, !dbg !18
|
181 |
+
%119 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i16, float %101, float %94) #4, !dbg !18
|
182 |
+
%.07.i18 = select i1 %.not7.i17, float %119, float %118, !dbg !18
|
183 |
+
%120 = fneg float %101, !dbg !18
|
184 |
+
%121 = select i1 %90, float %120, float %34, !dbg !18
|
185 |
+
%122 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
186 |
+
%.not8.i19 = icmp eq i32 %122, 0, !dbg !18
|
187 |
+
%123 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i18, float %121, float %121) #4, !dbg !18
|
188 |
+
%124 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i18, float %121, float %121) #4, !dbg !18
|
189 |
+
%.08.i20 = select i1 %.not8.i19, float %124, float %123, !dbg !18
|
190 |
+
br i1 %90, label %125, label %__nv_erff.exit25, !dbg !18
|
191 |
+
|
192 |
+
125: ; preds = %__internal_fmad.exit.i6
|
193 |
+
%126 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i20) #4, !dbg !18
|
194 |
+
%127 = fsub float 1.000000e+00, %126, !dbg !18
|
195 |
+
%128 = bitcast float %127 to i32, !dbg !18
|
196 |
+
%129 = bitcast float %34 to i32, !dbg !18
|
197 |
+
%130 = and i32 %129, -2147483648, !dbg !18
|
198 |
+
%131 = or i32 %130, %128, !dbg !18
|
199 |
+
%132 = bitcast i32 %131 to float, !dbg !18
|
200 |
+
br label %__nv_erff.exit25, !dbg !18
|
201 |
+
|
202 |
+
__nv_erff.exit25: ; preds = %__internal_fmad.exit.i6, %125
|
203 |
+
%r.0.i21 = phi float [ %132, %125 ], [ %.08.i20, %__internal_fmad.exit.i6 ], !dbg !18
|
204 |
+
%133 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
205 |
+
%.not.i26 = icmp eq i32 %133, 0, !dbg !18
|
206 |
+
%134 = tail call float @llvm.nvvm.fabs.ftz.f(float %35) #4, !dbg !18
|
207 |
+
%135 = tail call float @llvm.nvvm.fabs.f(float %35) #4, !dbg !18
|
208 |
+
%.0.i27 = select i1 %.not.i26, float %135, float %134, !dbg !18
|
209 |
+
%136 = fcmp oge float %.0.i27, 0x3FF00C1FC0000000, !dbg !18
|
210 |
+
br i1 %136, label %__nv_fabsf.exit1.i44, label %138, !dbg !18
|
211 |
+
|
212 |
+
__nv_fabsf.exit1.i44: ; preds = %__nv_erff.exit25
|
213 |
+
%137 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
214 |
+
%.not1.i45 = icmp eq i32 %137, 0, !dbg !18
|
215 |
+
%.01.i46 = select i1 %.not1.i45, float %135, float %134, !dbg !18
|
216 |
+
br label %__internal_fmad.exit.i28, !dbg !18
|
217 |
+
|
218 |
+
138: ; preds = %__nv_erff.exit25
|
219 |
+
%139 = fmul float %35, %35, !dbg !18
|
220 |
+
br label %__internal_fmad.exit.i28, !dbg !18
|
221 |
+
|
222 |
+
__internal_fmad.exit.i28: ; preds = %138, %__nv_fabsf.exit1.i44
|
223 |
+
%140 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i44 ], [ 0x3FC06EBA60000000, %138 ], !dbg !18
|
224 |
+
%141 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i44 ], [ 0xBFD8127580000000, %138 ], !dbg !18
|
225 |
+
%142 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i44 ], [ 0x3FBCE315E0000000, %138 ], !dbg !18
|
226 |
+
%143 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i44 ], [ 0xBF9B837CE0000000, %138 ], !dbg !18
|
227 |
+
%144 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i44 ], [ 0x3F755ABD40000000, %138 ], !dbg !18
|
228 |
+
%145 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i44 ], [ 0xBF4AE9A400000000, %138 ], !dbg !18
|
229 |
+
%146 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i44 ], [ 0x3F163D2D40000000, %138 ], !dbg !18
|
230 |
+
%147 = phi float [ %.01.i46, %__nv_fabsf.exit1.i44 ], [ %139, %138 ], !dbg !18
|
231 |
+
%148 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
232 |
+
%.not2.i29 = icmp eq i32 %148, 0, !dbg !18
|
233 |
+
%149 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %146, float %147, float %145) #4, !dbg !18
|
234 |
+
%150 = tail call float @llvm.nvvm.fma.rn.f(float %146, float %147, float %145) #4, !dbg !18
|
235 |
+
%.02.i30 = select i1 %.not2.i29, float %150, float %149, !dbg !18
|
236 |
+
%151 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
237 |
+
%.not3.i31 = icmp eq i32 %151, 0, !dbg !18
|
238 |
+
%152 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i30, float %147, float %144) #4, !dbg !18
|
239 |
+
%153 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i30, float %147, float %144) #4, !dbg !18
|
240 |
+
%.03.i32 = select i1 %.not3.i31, float %153, float %152, !dbg !18
|
241 |
+
%154 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
242 |
+
%.not4.i33 = icmp eq i32 %154, 0, !dbg !18
|
243 |
+
%155 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i32, float %147, float %143) #4, !dbg !18
|
244 |
+
%156 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i32, float %147, float %143) #4, !dbg !18
|
245 |
+
%.04.i34 = select i1 %.not4.i33, float %156, float %155, !dbg !18
|
246 |
+
%157 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
247 |
+
%.not5.i35 = icmp eq i32 %157, 0, !dbg !18
|
248 |
+
%158 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i34, float %147, float %142) #4, !dbg !18
|
249 |
+
%159 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i34, float %147, float %142) #4, !dbg !18
|
250 |
+
%.05.i36 = select i1 %.not5.i35, float %159, float %158, !dbg !18
|
251 |
+
%160 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
252 |
+
%.not6.i37 = icmp eq i32 %160, 0, !dbg !18
|
253 |
+
%161 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i36, float %147, float %141) #4, !dbg !18
|
254 |
+
%162 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i36, float %147, float %141) #4, !dbg !18
|
255 |
+
%.06.i38 = select i1 %.not6.i37, float %162, float %161, !dbg !18
|
256 |
+
%163 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
257 |
+
%.not7.i39 = icmp eq i32 %163, 0, !dbg !18
|
258 |
+
%164 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i38, float %147, float %140) #4, !dbg !18
|
259 |
+
%165 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i38, float %147, float %140) #4, !dbg !18
|
260 |
+
%.07.i40 = select i1 %.not7.i39, float %165, float %164, !dbg !18
|
261 |
+
%166 = fneg float %147, !dbg !18
|
262 |
+
%167 = select i1 %136, float %166, float %35, !dbg !18
|
263 |
+
%168 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
264 |
+
%.not8.i41 = icmp eq i32 %168, 0, !dbg !18
|
265 |
+
%169 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i40, float %167, float %167) #4, !dbg !18
|
266 |
+
%170 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i40, float %167, float %167) #4, !dbg !18
|
267 |
+
%.08.i42 = select i1 %.not8.i41, float %170, float %169, !dbg !18
|
268 |
+
br i1 %136, label %171, label %__nv_erff.exit47, !dbg !18
|
269 |
+
|
270 |
+
171: ; preds = %__internal_fmad.exit.i28
|
271 |
+
%172 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i42) #4, !dbg !18
|
272 |
+
%173 = fsub float 1.000000e+00, %172, !dbg !18
|
273 |
+
%174 = bitcast float %173 to i32, !dbg !18
|
274 |
+
%175 = bitcast float %35 to i32, !dbg !18
|
275 |
+
%176 = and i32 %175, -2147483648, !dbg !18
|
276 |
+
%177 = or i32 %176, %174, !dbg !18
|
277 |
+
%178 = bitcast i32 %177 to float, !dbg !18
|
278 |
+
br label %__nv_erff.exit47, !dbg !18
|
279 |
+
|
280 |
+
__nv_erff.exit47: ; preds = %__internal_fmad.exit.i28, %171
|
281 |
+
%r.0.i43 = phi float [ %178, %171 ], [ %.08.i42, %__internal_fmad.exit.i28 ], !dbg !18
|
282 |
+
%179 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
283 |
+
%.not.i48 = icmp eq i32 %179, 0, !dbg !18
|
284 |
+
%180 = tail call float @llvm.nvvm.fabs.ftz.f(float %36) #4, !dbg !18
|
285 |
+
%181 = tail call float @llvm.nvvm.fabs.f(float %36) #4, !dbg !18
|
286 |
+
%.0.i49 = select i1 %.not.i48, float %181, float %180, !dbg !18
|
287 |
+
%182 = fcmp oge float %.0.i49, 0x3FF00C1FC0000000, !dbg !18
|
288 |
+
br i1 %182, label %__nv_fabsf.exit1.i66, label %184, !dbg !18
|
289 |
+
|
290 |
+
__nv_fabsf.exit1.i66: ; preds = %__nv_erff.exit47
|
291 |
+
%183 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
292 |
+
%.not1.i67 = icmp eq i32 %183, 0, !dbg !18
|
293 |
+
%.01.i68 = select i1 %.not1.i67, float %181, float %180, !dbg !18
|
294 |
+
br label %__internal_fmad.exit.i50, !dbg !18
|
295 |
+
|
296 |
+
184: ; preds = %__nv_erff.exit47
|
297 |
+
%185 = fmul float %36, %36, !dbg !18
|
298 |
+
br label %__internal_fmad.exit.i50, !dbg !18
|
299 |
+
|
300 |
+
__internal_fmad.exit.i50: ; preds = %184, %__nv_fabsf.exit1.i66
|
301 |
+
%186 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i66 ], [ 0x3FC06EBA60000000, %184 ], !dbg !18
|
302 |
+
%187 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i66 ], [ 0xBFD8127580000000, %184 ], !dbg !18
|
303 |
+
%188 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i66 ], [ 0x3FBCE315E0000000, %184 ], !dbg !18
|
304 |
+
%189 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i66 ], [ 0xBF9B837CE0000000, %184 ], !dbg !18
|
305 |
+
%190 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i66 ], [ 0x3F755ABD40000000, %184 ], !dbg !18
|
306 |
+
%191 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i66 ], [ 0xBF4AE9A400000000, %184 ], !dbg !18
|
307 |
+
%192 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i66 ], [ 0x3F163D2D40000000, %184 ], !dbg !18
|
308 |
+
%193 = phi float [ %.01.i68, %__nv_fabsf.exit1.i66 ], [ %185, %184 ], !dbg !18
|
309 |
+
%194 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
310 |
+
%.not2.i51 = icmp eq i32 %194, 0, !dbg !18
|
311 |
+
%195 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %192, float %193, float %191) #4, !dbg !18
|
312 |
+
%196 = tail call float @llvm.nvvm.fma.rn.f(float %192, float %193, float %191) #4, !dbg !18
|
313 |
+
%.02.i52 = select i1 %.not2.i51, float %196, float %195, !dbg !18
|
314 |
+
%197 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
315 |
+
%.not3.i53 = icmp eq i32 %197, 0, !dbg !18
|
316 |
+
%198 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i52, float %193, float %190) #4, !dbg !18
|
317 |
+
%199 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i52, float %193, float %190) #4, !dbg !18
|
318 |
+
%.03.i54 = select i1 %.not3.i53, float %199, float %198, !dbg !18
|
319 |
+
%200 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
320 |
+
%.not4.i55 = icmp eq i32 %200, 0, !dbg !18
|
321 |
+
%201 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i54, float %193, float %189) #4, !dbg !18
|
322 |
+
%202 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i54, float %193, float %189) #4, !dbg !18
|
323 |
+
%.04.i56 = select i1 %.not4.i55, float %202, float %201, !dbg !18
|
324 |
+
%203 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
325 |
+
%.not5.i57 = icmp eq i32 %203, 0, !dbg !18
|
326 |
+
%204 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i56, float %193, float %188) #4, !dbg !18
|
327 |
+
%205 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i56, float %193, float %188) #4, !dbg !18
|
328 |
+
%.05.i58 = select i1 %.not5.i57, float %205, float %204, !dbg !18
|
329 |
+
%206 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
330 |
+
%.not6.i59 = icmp eq i32 %206, 0, !dbg !18
|
331 |
+
%207 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i58, float %193, float %187) #4, !dbg !18
|
332 |
+
%208 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i58, float %193, float %187) #4, !dbg !18
|
333 |
+
%.06.i60 = select i1 %.not6.i59, float %208, float %207, !dbg !18
|
334 |
+
%209 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
335 |
+
%.not7.i61 = icmp eq i32 %209, 0, !dbg !18
|
336 |
+
%210 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i60, float %193, float %186) #4, !dbg !18
|
337 |
+
%211 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i60, float %193, float %186) #4, !dbg !18
|
338 |
+
%.07.i62 = select i1 %.not7.i61, float %211, float %210, !dbg !18
|
339 |
+
%212 = fneg float %193, !dbg !18
|
340 |
+
%213 = select i1 %182, float %212, float %36, !dbg !18
|
341 |
+
%214 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
342 |
+
%.not8.i63 = icmp eq i32 %214, 0, !dbg !18
|
343 |
+
%215 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i62, float %213, float %213) #4, !dbg !18
|
344 |
+
%216 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i62, float %213, float %213) #4, !dbg !18
|
345 |
+
%.08.i64 = select i1 %.not8.i63, float %216, float %215, !dbg !18
|
346 |
+
br i1 %182, label %217, label %__nv_erff.exit69, !dbg !18
|
347 |
+
|
348 |
+
217: ; preds = %__internal_fmad.exit.i50
|
349 |
+
%218 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i64) #4, !dbg !18
|
350 |
+
%219 = fsub float 1.000000e+00, %218, !dbg !18
|
351 |
+
%220 = bitcast float %219 to i32, !dbg !18
|
352 |
+
%221 = bitcast float %36 to i32, !dbg !18
|
353 |
+
%222 = and i32 %221, -2147483648, !dbg !18
|
354 |
+
%223 = or i32 %222, %220, !dbg !18
|
355 |
+
%224 = bitcast i32 %223 to float, !dbg !18
|
356 |
+
br label %__nv_erff.exit69, !dbg !18
|
357 |
+
|
358 |
+
__nv_erff.exit69: ; preds = %__internal_fmad.exit.i50, %217
|
359 |
+
%r.0.i65 = phi float [ %224, %217 ], [ %.08.i64, %__internal_fmad.exit.i50 ], !dbg !18
|
360 |
+
%225 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
361 |
+
%.not.i70 = icmp eq i32 %225, 0, !dbg !18
|
362 |
+
%226 = tail call float @llvm.nvvm.fabs.ftz.f(float %37) #4, !dbg !18
|
363 |
+
%227 = tail call float @llvm.nvvm.fabs.f(float %37) #4, !dbg !18
|
364 |
+
%.0.i71 = select i1 %.not.i70, float %227, float %226, !dbg !18
|
365 |
+
%228 = fcmp oge float %.0.i71, 0x3FF00C1FC0000000, !dbg !18
|
366 |
+
br i1 %228, label %__nv_fabsf.exit1.i88, label %230, !dbg !18
|
367 |
+
|
368 |
+
__nv_fabsf.exit1.i88: ; preds = %__nv_erff.exit69
|
369 |
+
%229 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
370 |
+
%.not1.i89 = icmp eq i32 %229, 0, !dbg !18
|
371 |
+
%.01.i90 = select i1 %.not1.i89, float %227, float %226, !dbg !18
|
372 |
+
br label %__internal_fmad.exit.i72, !dbg !18
|
373 |
+
|
374 |
+
230: ; preds = %__nv_erff.exit69
|
375 |
+
%231 = fmul float %37, %37, !dbg !18
|
376 |
+
br label %__internal_fmad.exit.i72, !dbg !18
|
377 |
+
|
378 |
+
__internal_fmad.exit.i72: ; preds = %230, %__nv_fabsf.exit1.i88
|
379 |
+
%232 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i88 ], [ 0x3FC06EBA60000000, %230 ], !dbg !18
|
380 |
+
%233 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i88 ], [ 0xBFD8127580000000, %230 ], !dbg !18
|
381 |
+
%234 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i88 ], [ 0x3FBCE315E0000000, %230 ], !dbg !18
|
382 |
+
%235 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i88 ], [ 0xBF9B837CE0000000, %230 ], !dbg !18
|
383 |
+
%236 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i88 ], [ 0x3F755ABD40000000, %230 ], !dbg !18
|
384 |
+
%237 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i88 ], [ 0xBF4AE9A400000000, %230 ], !dbg !18
|
385 |
+
%238 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i88 ], [ 0x3F163D2D40000000, %230 ], !dbg !18
|
386 |
+
%239 = phi float [ %.01.i90, %__nv_fabsf.exit1.i88 ], [ %231, %230 ], !dbg !18
|
387 |
+
%240 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
388 |
+
%.not2.i73 = icmp eq i32 %240, 0, !dbg !18
|
389 |
+
%241 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %238, float %239, float %237) #4, !dbg !18
|
390 |
+
%242 = tail call float @llvm.nvvm.fma.rn.f(float %238, float %239, float %237) #4, !dbg !18
|
391 |
+
%.02.i74 = select i1 %.not2.i73, float %242, float %241, !dbg !18
|
392 |
+
%243 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
393 |
+
%.not3.i75 = icmp eq i32 %243, 0, !dbg !18
|
394 |
+
%244 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i74, float %239, float %236) #4, !dbg !18
|
395 |
+
%245 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i74, float %239, float %236) #4, !dbg !18
|
396 |
+
%.03.i76 = select i1 %.not3.i75, float %245, float %244, !dbg !18
|
397 |
+
%246 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
398 |
+
%.not4.i77 = icmp eq i32 %246, 0, !dbg !18
|
399 |
+
%247 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i76, float %239, float %235) #4, !dbg !18
|
400 |
+
%248 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i76, float %239, float %235) #4, !dbg !18
|
401 |
+
%.04.i78 = select i1 %.not4.i77, float %248, float %247, !dbg !18
|
402 |
+
%249 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
403 |
+
%.not5.i79 = icmp eq i32 %249, 0, !dbg !18
|
404 |
+
%250 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i78, float %239, float %234) #4, !dbg !18
|
405 |
+
%251 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i78, float %239, float %234) #4, !dbg !18
|
406 |
+
%.05.i80 = select i1 %.not5.i79, float %251, float %250, !dbg !18
|
407 |
+
%252 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
408 |
+
%.not6.i81 = icmp eq i32 %252, 0, !dbg !18
|
409 |
+
%253 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i80, float %239, float %233) #4, !dbg !18
|
410 |
+
%254 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i80, float %239, float %233) #4, !dbg !18
|
411 |
+
%.06.i82 = select i1 %.not6.i81, float %254, float %253, !dbg !18
|
412 |
+
%255 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
413 |
+
%.not7.i83 = icmp eq i32 %255, 0, !dbg !18
|
414 |
+
%256 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i82, float %239, float %232) #4, !dbg !18
|
415 |
+
%257 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i82, float %239, float %232) #4, !dbg !18
|
416 |
+
%.07.i84 = select i1 %.not7.i83, float %257, float %256, !dbg !18
|
417 |
+
%258 = fneg float %239, !dbg !18
|
418 |
+
%259 = select i1 %228, float %258, float %37, !dbg !18
|
419 |
+
%260 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
420 |
+
%.not8.i85 = icmp eq i32 %260, 0, !dbg !18
|
421 |
+
%261 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i84, float %259, float %259) #4, !dbg !18
|
422 |
+
%262 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i84, float %259, float %259) #4, !dbg !18
|
423 |
+
%.08.i86 = select i1 %.not8.i85, float %262, float %261, !dbg !18
|
424 |
+
br i1 %228, label %263, label %__nv_erff.exit91, !dbg !18
|
425 |
+
|
426 |
+
263: ; preds = %__internal_fmad.exit.i72
|
427 |
+
%264 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i86) #4, !dbg !18
|
428 |
+
%265 = fsub float 1.000000e+00, %264, !dbg !18
|
429 |
+
%266 = bitcast float %265 to i32, !dbg !18
|
430 |
+
%267 = bitcast float %37 to i32, !dbg !18
|
431 |
+
%268 = and i32 %267, -2147483648, !dbg !18
|
432 |
+
%269 = or i32 %268, %266, !dbg !18
|
433 |
+
%270 = bitcast i32 %269 to float, !dbg !18
|
434 |
+
br label %__nv_erff.exit91, !dbg !18
|
435 |
+
|
436 |
+
__nv_erff.exit91: ; preds = %__internal_fmad.exit.i72, %263
|
437 |
+
%r.0.i87 = phi float [ %270, %263 ], [ %.08.i86, %__internal_fmad.exit.i72 ], !dbg !18
|
438 |
+
%271 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
439 |
+
%.not.i92 = icmp eq i32 %271, 0, !dbg !18
|
440 |
+
%272 = tail call float @llvm.nvvm.fabs.ftz.f(float %38) #4, !dbg !18
|
441 |
+
%273 = tail call float @llvm.nvvm.fabs.f(float %38) #4, !dbg !18
|
442 |
+
%.0.i93 = select i1 %.not.i92, float %273, float %272, !dbg !18
|
443 |
+
%274 = fcmp oge float %.0.i93, 0x3FF00C1FC0000000, !dbg !18
|
444 |
+
br i1 %274, label %__nv_fabsf.exit1.i110, label %276, !dbg !18
|
445 |
+
|
446 |
+
__nv_fabsf.exit1.i110: ; preds = %__nv_erff.exit91
|
447 |
+
%275 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
448 |
+
%.not1.i111 = icmp eq i32 %275, 0, !dbg !18
|
449 |
+
%.01.i112 = select i1 %.not1.i111, float %273, float %272, !dbg !18
|
450 |
+
br label %__internal_fmad.exit.i94, !dbg !18
|
451 |
+
|
452 |
+
276: ; preds = %__nv_erff.exit91
|
453 |
+
%277 = fmul float %38, %38, !dbg !18
|
454 |
+
br label %__internal_fmad.exit.i94, !dbg !18
|
455 |
+
|
456 |
+
__internal_fmad.exit.i94: ; preds = %276, %__nv_fabsf.exit1.i110
|
457 |
+
%278 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i110 ], [ 0x3FC06EBA60000000, %276 ], !dbg !18
|
458 |
+
%279 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i110 ], [ 0xBFD8127580000000, %276 ], !dbg !18
|
459 |
+
%280 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i110 ], [ 0x3FBCE315E0000000, %276 ], !dbg !18
|
460 |
+
%281 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i110 ], [ 0xBF9B837CE0000000, %276 ], !dbg !18
|
461 |
+
%282 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i110 ], [ 0x3F755ABD40000000, %276 ], !dbg !18
|
462 |
+
%283 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i110 ], [ 0xBF4AE9A400000000, %276 ], !dbg !18
|
463 |
+
%284 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i110 ], [ 0x3F163D2D40000000, %276 ], !dbg !18
|
464 |
+
%285 = phi float [ %.01.i112, %__nv_fabsf.exit1.i110 ], [ %277, %276 ], !dbg !18
|
465 |
+
%286 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
466 |
+
%.not2.i95 = icmp eq i32 %286, 0, !dbg !18
|
467 |
+
%287 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %284, float %285, float %283) #4, !dbg !18
|
468 |
+
%288 = tail call float @llvm.nvvm.fma.rn.f(float %284, float %285, float %283) #4, !dbg !18
|
469 |
+
%.02.i96 = select i1 %.not2.i95, float %288, float %287, !dbg !18
|
470 |
+
%289 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
471 |
+
%.not3.i97 = icmp eq i32 %289, 0, !dbg !18
|
472 |
+
%290 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i96, float %285, float %282) #4, !dbg !18
|
473 |
+
%291 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i96, float %285, float %282) #4, !dbg !18
|
474 |
+
%.03.i98 = select i1 %.not3.i97, float %291, float %290, !dbg !18
|
475 |
+
%292 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
476 |
+
%.not4.i99 = icmp eq i32 %292, 0, !dbg !18
|
477 |
+
%293 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i98, float %285, float %281) #4, !dbg !18
|
478 |
+
%294 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i98, float %285, float %281) #4, !dbg !18
|
479 |
+
%.04.i100 = select i1 %.not4.i99, float %294, float %293, !dbg !18
|
480 |
+
%295 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
481 |
+
%.not5.i101 = icmp eq i32 %295, 0, !dbg !18
|
482 |
+
%296 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i100, float %285, float %280) #4, !dbg !18
|
483 |
+
%297 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i100, float %285, float %280) #4, !dbg !18
|
484 |
+
%.05.i102 = select i1 %.not5.i101, float %297, float %296, !dbg !18
|
485 |
+
%298 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
486 |
+
%.not6.i103 = icmp eq i32 %298, 0, !dbg !18
|
487 |
+
%299 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i102, float %285, float %279) #4, !dbg !18
|
488 |
+
%300 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i102, float %285, float %279) #4, !dbg !18
|
489 |
+
%.06.i104 = select i1 %.not6.i103, float %300, float %299, !dbg !18
|
490 |
+
%301 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
491 |
+
%.not7.i105 = icmp eq i32 %301, 0, !dbg !18
|
492 |
+
%302 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i104, float %285, float %278) #4, !dbg !18
|
493 |
+
%303 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i104, float %285, float %278) #4, !dbg !18
|
494 |
+
%.07.i106 = select i1 %.not7.i105, float %303, float %302, !dbg !18
|
495 |
+
%304 = fneg float %285, !dbg !18
|
496 |
+
%305 = select i1 %274, float %304, float %38, !dbg !18
|
497 |
+
%306 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
498 |
+
%.not8.i107 = icmp eq i32 %306, 0, !dbg !18
|
499 |
+
%307 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i106, float %305, float %305) #4, !dbg !18
|
500 |
+
%308 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i106, float %305, float %305) #4, !dbg !18
|
501 |
+
%.08.i108 = select i1 %.not8.i107, float %308, float %307, !dbg !18
|
502 |
+
br i1 %274, label %309, label %__nv_erff.exit113, !dbg !18
|
503 |
+
|
504 |
+
309: ; preds = %__internal_fmad.exit.i94
|
505 |
+
%310 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i108) #4, !dbg !18
|
506 |
+
%311 = fsub float 1.000000e+00, %310, !dbg !18
|
507 |
+
%312 = bitcast float %311 to i32, !dbg !18
|
508 |
+
%313 = bitcast float %38 to i32, !dbg !18
|
509 |
+
%314 = and i32 %313, -2147483648, !dbg !18
|
510 |
+
%315 = or i32 %314, %312, !dbg !18
|
511 |
+
%316 = bitcast i32 %315 to float, !dbg !18
|
512 |
+
br label %__nv_erff.exit113, !dbg !18
|
513 |
+
|
514 |
+
__nv_erff.exit113: ; preds = %__internal_fmad.exit.i94, %309
|
515 |
+
%r.0.i109 = phi float [ %316, %309 ], [ %.08.i108, %__internal_fmad.exit.i94 ], !dbg !18
|
516 |
+
%317 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
517 |
+
%.not.i114 = icmp eq i32 %317, 0, !dbg !18
|
518 |
+
%318 = tail call float @llvm.nvvm.fabs.ftz.f(float %39) #4, !dbg !18
|
519 |
+
%319 = tail call float @llvm.nvvm.fabs.f(float %39) #4, !dbg !18
|
520 |
+
%.0.i115 = select i1 %.not.i114, float %319, float %318, !dbg !18
|
521 |
+
%320 = fcmp oge float %.0.i115, 0x3FF00C1FC0000000, !dbg !18
|
522 |
+
br i1 %320, label %__nv_fabsf.exit1.i132, label %322, !dbg !18
|
523 |
+
|
524 |
+
__nv_fabsf.exit1.i132: ; preds = %__nv_erff.exit113
|
525 |
+
%321 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
526 |
+
%.not1.i133 = icmp eq i32 %321, 0, !dbg !18
|
527 |
+
%.01.i134 = select i1 %.not1.i133, float %319, float %318, !dbg !18
|
528 |
+
br label %__internal_fmad.exit.i116, !dbg !18
|
529 |
+
|
530 |
+
322: ; preds = %__nv_erff.exit113
|
531 |
+
%323 = fmul float %39, %39, !dbg !18
|
532 |
+
br label %__internal_fmad.exit.i116, !dbg !18
|
533 |
+
|
534 |
+
__internal_fmad.exit.i116: ; preds = %322, %__nv_fabsf.exit1.i132
|
535 |
+
%324 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i132 ], [ 0x3FC06EBA60000000, %322 ], !dbg !18
|
536 |
+
%325 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i132 ], [ 0xBFD8127580000000, %322 ], !dbg !18
|
537 |
+
%326 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i132 ], [ 0x3FBCE315E0000000, %322 ], !dbg !18
|
538 |
+
%327 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i132 ], [ 0xBF9B837CE0000000, %322 ], !dbg !18
|
539 |
+
%328 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i132 ], [ 0x3F755ABD40000000, %322 ], !dbg !18
|
540 |
+
%329 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i132 ], [ 0xBF4AE9A400000000, %322 ], !dbg !18
|
541 |
+
%330 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i132 ], [ 0x3F163D2D40000000, %322 ], !dbg !18
|
542 |
+
%331 = phi float [ %.01.i134, %__nv_fabsf.exit1.i132 ], [ %323, %322 ], !dbg !18
|
543 |
+
%332 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
544 |
+
%.not2.i117 = icmp eq i32 %332, 0, !dbg !18
|
545 |
+
%333 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %330, float %331, float %329) #4, !dbg !18
|
546 |
+
%334 = tail call float @llvm.nvvm.fma.rn.f(float %330, float %331, float %329) #4, !dbg !18
|
547 |
+
%.02.i118 = select i1 %.not2.i117, float %334, float %333, !dbg !18
|
548 |
+
%335 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
549 |
+
%.not3.i119 = icmp eq i32 %335, 0, !dbg !18
|
550 |
+
%336 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i118, float %331, float %328) #4, !dbg !18
|
551 |
+
%337 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i118, float %331, float %328) #4, !dbg !18
|
552 |
+
%.03.i120 = select i1 %.not3.i119, float %337, float %336, !dbg !18
|
553 |
+
%338 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
554 |
+
%.not4.i121 = icmp eq i32 %338, 0, !dbg !18
|
555 |
+
%339 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i120, float %331, float %327) #4, !dbg !18
|
556 |
+
%340 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i120, float %331, float %327) #4, !dbg !18
|
557 |
+
%.04.i122 = select i1 %.not4.i121, float %340, float %339, !dbg !18
|
558 |
+
%341 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
559 |
+
%.not5.i123 = icmp eq i32 %341, 0, !dbg !18
|
560 |
+
%342 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i122, float %331, float %326) #4, !dbg !18
|
561 |
+
%343 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i122, float %331, float %326) #4, !dbg !18
|
562 |
+
%.05.i124 = select i1 %.not5.i123, float %343, float %342, !dbg !18
|
563 |
+
%344 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
564 |
+
%.not6.i125 = icmp eq i32 %344, 0, !dbg !18
|
565 |
+
%345 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i124, float %331, float %325) #4, !dbg !18
|
566 |
+
%346 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i124, float %331, float %325) #4, !dbg !18
|
567 |
+
%.06.i126 = select i1 %.not6.i125, float %346, float %345, !dbg !18
|
568 |
+
%347 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
569 |
+
%.not7.i127 = icmp eq i32 %347, 0, !dbg !18
|
570 |
+
%348 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i126, float %331, float %324) #4, !dbg !18
|
571 |
+
%349 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i126, float %331, float %324) #4, !dbg !18
|
572 |
+
%.07.i128 = select i1 %.not7.i127, float %349, float %348, !dbg !18
|
573 |
+
%350 = fneg float %331, !dbg !18
|
574 |
+
%351 = select i1 %320, float %350, float %39, !dbg !18
|
575 |
+
%352 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
576 |
+
%.not8.i129 = icmp eq i32 %352, 0, !dbg !18
|
577 |
+
%353 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i128, float %351, float %351) #4, !dbg !18
|
578 |
+
%354 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i128, float %351, float %351) #4, !dbg !18
|
579 |
+
%.08.i130 = select i1 %.not8.i129, float %354, float %353, !dbg !18
|
580 |
+
br i1 %320, label %355, label %__nv_erff.exit135, !dbg !18
|
581 |
+
|
582 |
+
355: ; preds = %__internal_fmad.exit.i116
|
583 |
+
%356 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i130) #4, !dbg !18
|
584 |
+
%357 = fsub float 1.000000e+00, %356, !dbg !18
|
585 |
+
%358 = bitcast float %357 to i32, !dbg !18
|
586 |
+
%359 = bitcast float %39 to i32, !dbg !18
|
587 |
+
%360 = and i32 %359, -2147483648, !dbg !18
|
588 |
+
%361 = or i32 %360, %358, !dbg !18
|
589 |
+
%362 = bitcast i32 %361 to float, !dbg !18
|
590 |
+
br label %__nv_erff.exit135, !dbg !18
|
591 |
+
|
592 |
+
__nv_erff.exit135: ; preds = %__internal_fmad.exit.i116, %355
|
593 |
+
%r.0.i131 = phi float [ %362, %355 ], [ %.08.i130, %__internal_fmad.exit.i116 ], !dbg !18
|
594 |
+
%363 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
595 |
+
%.not.i136 = icmp eq i32 %363, 0, !dbg !18
|
596 |
+
%364 = tail call float @llvm.nvvm.fabs.ftz.f(float %40) #4, !dbg !18
|
597 |
+
%365 = tail call float @llvm.nvvm.fabs.f(float %40) #4, !dbg !18
|
598 |
+
%.0.i137 = select i1 %.not.i136, float %365, float %364, !dbg !18
|
599 |
+
%366 = fcmp oge float %.0.i137, 0x3FF00C1FC0000000, !dbg !18
|
600 |
+
br i1 %366, label %__nv_fabsf.exit1.i154, label %368, !dbg !18
|
601 |
+
|
602 |
+
__nv_fabsf.exit1.i154: ; preds = %__nv_erff.exit135
|
603 |
+
%367 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
604 |
+
%.not1.i155 = icmp eq i32 %367, 0, !dbg !18
|
605 |
+
%.01.i156 = select i1 %.not1.i155, float %365, float %364, !dbg !18
|
606 |
+
br label %__internal_fmad.exit.i138, !dbg !18
|
607 |
+
|
608 |
+
368: ; preds = %__nv_erff.exit135
|
609 |
+
%369 = fmul float %40, %40, !dbg !18
|
610 |
+
br label %__internal_fmad.exit.i138, !dbg !18
|
611 |
+
|
612 |
+
__internal_fmad.exit.i138: ; preds = %368, %__nv_fabsf.exit1.i154
|
613 |
+
%370 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i154 ], [ 0x3FC06EBA60000000, %368 ], !dbg !18
|
614 |
+
%371 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i154 ], [ 0xBFD8127580000000, %368 ], !dbg !18
|
615 |
+
%372 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i154 ], [ 0x3FBCE315E0000000, %368 ], !dbg !18
|
616 |
+
%373 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i154 ], [ 0xBF9B837CE0000000, %368 ], !dbg !18
|
617 |
+
%374 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i154 ], [ 0x3F755ABD40000000, %368 ], !dbg !18
|
618 |
+
%375 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i154 ], [ 0xBF4AE9A400000000, %368 ], !dbg !18
|
619 |
+
%376 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i154 ], [ 0x3F163D2D40000000, %368 ], !dbg !18
|
620 |
+
%377 = phi float [ %.01.i156, %__nv_fabsf.exit1.i154 ], [ %369, %368 ], !dbg !18
|
621 |
+
%378 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
622 |
+
%.not2.i139 = icmp eq i32 %378, 0, !dbg !18
|
623 |
+
%379 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %376, float %377, float %375) #4, !dbg !18
|
624 |
+
%380 = tail call float @llvm.nvvm.fma.rn.f(float %376, float %377, float %375) #4, !dbg !18
|
625 |
+
%.02.i140 = select i1 %.not2.i139, float %380, float %379, !dbg !18
|
626 |
+
%381 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
627 |
+
%.not3.i141 = icmp eq i32 %381, 0, !dbg !18
|
628 |
+
%382 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i140, float %377, float %374) #4, !dbg !18
|
629 |
+
%383 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i140, float %377, float %374) #4, !dbg !18
|
630 |
+
%.03.i142 = select i1 %.not3.i141, float %383, float %382, !dbg !18
|
631 |
+
%384 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
632 |
+
%.not4.i143 = icmp eq i32 %384, 0, !dbg !18
|
633 |
+
%385 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i142, float %377, float %373) #4, !dbg !18
|
634 |
+
%386 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i142, float %377, float %373) #4, !dbg !18
|
635 |
+
%.04.i144 = select i1 %.not4.i143, float %386, float %385, !dbg !18
|
636 |
+
%387 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
637 |
+
%.not5.i145 = icmp eq i32 %387, 0, !dbg !18
|
638 |
+
%388 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i144, float %377, float %372) #4, !dbg !18
|
639 |
+
%389 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i144, float %377, float %372) #4, !dbg !18
|
640 |
+
%.05.i146 = select i1 %.not5.i145, float %389, float %388, !dbg !18
|
641 |
+
%390 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
642 |
+
%.not6.i147 = icmp eq i32 %390, 0, !dbg !18
|
643 |
+
%391 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i146, float %377, float %371) #4, !dbg !18
|
644 |
+
%392 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i146, float %377, float %371) #4, !dbg !18
|
645 |
+
%.06.i148 = select i1 %.not6.i147, float %392, float %391, !dbg !18
|
646 |
+
%393 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
647 |
+
%.not7.i149 = icmp eq i32 %393, 0, !dbg !18
|
648 |
+
%394 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i148, float %377, float %370) #4, !dbg !18
|
649 |
+
%395 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i148, float %377, float %370) #4, !dbg !18
|
650 |
+
%.07.i150 = select i1 %.not7.i149, float %395, float %394, !dbg !18
|
651 |
+
%396 = fneg float %377, !dbg !18
|
652 |
+
%397 = select i1 %366, float %396, float %40, !dbg !18
|
653 |
+
%398 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
654 |
+
%.not8.i151 = icmp eq i32 %398, 0, !dbg !18
|
655 |
+
%399 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i150, float %397, float %397) #4, !dbg !18
|
656 |
+
%400 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i150, float %397, float %397) #4, !dbg !18
|
657 |
+
%.08.i152 = select i1 %.not8.i151, float %400, float %399, !dbg !18
|
658 |
+
br i1 %366, label %401, label %__nv_erff.exit157, !dbg !18
|
659 |
+
|
660 |
+
401: ; preds = %__internal_fmad.exit.i138
|
661 |
+
%402 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i152) #4, !dbg !18
|
662 |
+
%403 = fsub float 1.000000e+00, %402, !dbg !18
|
663 |
+
%404 = bitcast float %403 to i32, !dbg !18
|
664 |
+
%405 = bitcast float %40 to i32, !dbg !18
|
665 |
+
%406 = and i32 %405, -2147483648, !dbg !18
|
666 |
+
%407 = or i32 %406, %404, !dbg !18
|
667 |
+
%408 = bitcast i32 %407 to float, !dbg !18
|
668 |
+
br label %__nv_erff.exit157, !dbg !18
|
669 |
+
|
670 |
+
__nv_erff.exit157: ; preds = %__internal_fmad.exit.i138, %401
|
671 |
+
%r.0.i153 = phi float [ %408, %401 ], [ %.08.i152, %__internal_fmad.exit.i138 ], !dbg !18
|
672 |
+
%409 = fmul float %32, 5.000000e-01, !dbg !19
|
673 |
+
%410 = fmul float %31, 5.000000e-01, !dbg !19
|
674 |
+
%411 = fmul float %30, 5.000000e-01, !dbg !19
|
675 |
+
%412 = fmul float %29, 5.000000e-01, !dbg !19
|
676 |
+
%413 = fmul float %28, 5.000000e-01, !dbg !19
|
677 |
+
%414 = fmul float %27, 5.000000e-01, !dbg !19
|
678 |
+
%415 = fmul float %26, 5.000000e-01, !dbg !19
|
679 |
+
%416 = fmul float %25, 5.000000e-01, !dbg !19
|
680 |
+
%417 = fadd float %r.0.i, 1.000000e+00, !dbg !20
|
681 |
+
%418 = fadd float %r.0.i21, 1.000000e+00, !dbg !20
|
682 |
+
%419 = fadd float %r.0.i43, 1.000000e+00, !dbg !20
|
683 |
+
%420 = fadd float %r.0.i65, 1.000000e+00, !dbg !20
|
684 |
+
%421 = fadd float %r.0.i87, 1.000000e+00, !dbg !20
|
685 |
+
%422 = fadd float %r.0.i109, 1.000000e+00, !dbg !20
|
686 |
+
%423 = fadd float %r.0.i131, 1.000000e+00, !dbg !20
|
687 |
+
%424 = fadd float %r.0.i153, 1.000000e+00, !dbg !20
|
688 |
+
%425 = fmul float %416, %417, !dbg !21
|
689 |
+
%426 = fmul float %415, %418, !dbg !21
|
690 |
+
%427 = fmul float %414, %419, !dbg !21
|
691 |
+
%428 = fmul float %413, %420, !dbg !21
|
692 |
+
%429 = fmul float %412, %421, !dbg !21
|
693 |
+
%430 = fmul float %411, %422, !dbg !21
|
694 |
+
%431 = fmul float %410, %423, !dbg !21
|
695 |
+
%432 = fmul float %409, %424, !dbg !21
|
696 |
+
%433 = getelementptr i16, ptr addrspace(1) %1, i64 %10, !dbg !22
|
697 |
+
%434 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %425) #4, !dbg !23
|
698 |
+
%435 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %426) #4, !dbg !23
|
699 |
+
%436 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %427) #4, !dbg !23
|
700 |
+
%437 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %428) #4, !dbg !23
|
701 |
+
%438 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %429) #4, !dbg !23
|
702 |
+
%439 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %430) #4, !dbg !23
|
703 |
+
%440 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %431) #4, !dbg !23
|
704 |
+
%441 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %432) #4, !dbg !23
|
705 |
+
%442 = insertelement <2 x i16> undef, i16 %434, i64 0, !dbg !23
|
706 |
+
%443 = insertelement <2 x i16> %442, i16 %435, i64 1, !dbg !23
|
707 |
+
%444 = bitcast <2 x i16> %443 to i32, !dbg !23
|
708 |
+
%445 = insertelement <2 x i16> undef, i16 %436, i64 0, !dbg !23
|
709 |
+
%446 = insertelement <2 x i16> %445, i16 %437, i64 1, !dbg !23
|
710 |
+
%447 = bitcast <2 x i16> %446 to i32, !dbg !23
|
711 |
+
%448 = insertelement <2 x i16> undef, i16 %438, i64 0, !dbg !23
|
712 |
+
%449 = insertelement <2 x i16> %448, i16 %439, i64 1, !dbg !23
|
713 |
+
%450 = bitcast <2 x i16> %449 to i32, !dbg !23
|
714 |
+
%451 = insertelement <2 x i16> undef, i16 %440, i64 0, !dbg !23
|
715 |
+
%452 = insertelement <2 x i16> %451, i16 %441, i64 1, !dbg !23
|
716 |
+
%453 = bitcast <2 x i16> %452 to i32, !dbg !23
|
717 |
+
tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %444, i32 %447, i32 %450, i32 %453, ptr addrspace(1) %433, i1 true) #4, !dbg !23
|
718 |
+
ret void, !dbg !24
|
719 |
+
}
|
720 |
+
|
721 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
722 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
723 |
+
|
724 |
+
; Function Attrs: alwaysinline nounwind
|
725 |
+
define float @__nv_erff(float %a) local_unnamed_addr #1 {
|
726 |
+
__nv_fabsf.exit:
|
727 |
+
%0 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
|
728 |
+
%.not = icmp eq i32 %0, 0
|
729 |
+
%1 = tail call float @llvm.nvvm.fabs.ftz.f(float %a) #4
|
730 |
+
%2 = tail call float @llvm.nvvm.fabs.f(float %a) #4
|
731 |
+
%.0 = select i1 %.not, float %2, float %1
|
732 |
+
%3 = fcmp oge float %.0, 0x3FF00C1FC0000000
|
733 |
+
br i1 %3, label %__nv_fabsf.exit1, label %5
|
734 |
+
|
735 |
+
__nv_fabsf.exit1: ; preds = %__nv_fabsf.exit
|
736 |
+
%4 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
|
737 |
+
%.not1 = icmp eq i32 %4, 0
|
738 |
+
%.01 = select i1 %.not1, float %2, float %1
|
739 |
+
br label %__internal_fmad.exit
|
740 |
+
|
741 |
+
5: ; preds = %__nv_fabsf.exit
|
742 |
+
%6 = fmul float %a, %a
|
743 |
+
br label %__internal_fmad.exit
|
744 |
+
|
745 |
+
__internal_fmad.exit: ; preds = %5, %__nv_fabsf.exit1
|
746 |
+
%7 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1 ], [ 0x3FC06EBA60000000, %5 ]
|
747 |
+
%8 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1 ], [ 0xBFD8127580000000, %5 ]
|
748 |
+
%9 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1 ], [ 0x3FBCE315E0000000, %5 ]
|
749 |
+
%10 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1 ], [ 0xBF9B837CE0000000, %5 ]
|
750 |
+
%11 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1 ], [ 0x3F755ABD40000000, %5 ]
|
751 |
+
%12 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1 ], [ 0xBF4AE9A400000000, %5 ]
|
752 |
+
%13 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1 ], [ 0x3F163D2D40000000, %5 ]
|
753 |
+
%14 = phi float [ %.01, %__nv_fabsf.exit1 ], [ %6, %5 ]
|
754 |
+
%15 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
|
755 |
+
%.not2 = icmp eq i32 %15, 0
|
756 |
+
%16 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %13, float %14, float %12) #4
|
757 |
+
%17 = tail call float @llvm.nvvm.fma.rn.f(float %13, float %14, float %12) #4
|
758 |
+
%.02 = select i1 %.not2, float %17, float %16
|
759 |
+
%18 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
|
760 |
+
%.not3 = icmp eq i32 %18, 0
|
761 |
+
%19 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02, float %14, float %11) #4
|
762 |
+
%20 = tail call float @llvm.nvvm.fma.rn.f(float %.02, float %14, float %11) #4
|
763 |
+
%.03 = select i1 %.not3, float %20, float %19
|
764 |
+
%21 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
|
765 |
+
%.not4 = icmp eq i32 %21, 0
|
766 |
+
%22 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03, float %14, float %10) #4
|
767 |
+
%23 = tail call float @llvm.nvvm.fma.rn.f(float %.03, float %14, float %10) #4
|
768 |
+
%.04 = select i1 %.not4, float %23, float %22
|
769 |
+
%24 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
|
770 |
+
%.not5 = icmp eq i32 %24, 0
|
771 |
+
%25 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04, float %14, float %9) #4
|
772 |
+
%26 = tail call float @llvm.nvvm.fma.rn.f(float %.04, float %14, float %9) #4
|
773 |
+
%.05 = select i1 %.not5, float %26, float %25
|
774 |
+
%27 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
|
775 |
+
%.not6 = icmp eq i32 %27, 0
|
776 |
+
%28 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05, float %14, float %8) #4
|
777 |
+
%29 = tail call float @llvm.nvvm.fma.rn.f(float %.05, float %14, float %8) #4
|
778 |
+
%.06 = select i1 %.not6, float %29, float %28
|
779 |
+
%30 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
|
780 |
+
%.not7 = icmp eq i32 %30, 0
|
781 |
+
%31 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06, float %14, float %7) #4
|
782 |
+
%32 = tail call float @llvm.nvvm.fma.rn.f(float %.06, float %14, float %7) #4
|
783 |
+
%.07 = select i1 %.not7, float %32, float %31
|
784 |
+
%33 = fneg float %14
|
785 |
+
%34 = select i1 %3, float %33, float %a
|
786 |
+
%35 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
|
787 |
+
%.not8 = icmp eq i32 %35, 0
|
788 |
+
%36 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07, float %34, float %34) #4
|
789 |
+
%37 = tail call float @llvm.nvvm.fma.rn.f(float %.07, float %34, float %34) #4
|
790 |
+
%.08 = select i1 %.not8, float %37, float %36
|
791 |
+
br i1 %3, label %38, label %46
|
792 |
+
|
793 |
+
38: ; preds = %__internal_fmad.exit
|
794 |
+
%39 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08) #4
|
795 |
+
%40 = fsub float 1.000000e+00, %39
|
796 |
+
%41 = bitcast float %40 to i32
|
797 |
+
%42 = bitcast float %a to i32
|
798 |
+
%43 = and i32 %42, -2147483648
|
799 |
+
%44 = or i32 %43, %41
|
800 |
+
%45 = bitcast i32 %44 to float
|
801 |
+
br label %46
|
802 |
+
|
803 |
+
46: ; preds = %38, %__internal_fmad.exit
|
804 |
+
%r.0 = phi float [ %45, %38 ], [ %.08, %__internal_fmad.exit ]
|
805 |
+
ret float %r.0
|
806 |
+
}
|
807 |
+
|
808 |
+
declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #2
|
809 |
+
|
810 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
811 |
+
declare float @llvm.nvvm.fabs.ftz.f(float) #0
|
812 |
+
|
813 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
814 |
+
declare float @llvm.nvvm.fabs.f(float) #0
|
815 |
+
|
816 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
817 |
+
declare float @llvm.nvvm.fma.rn.ftz.f(float, float, float) #0
|
818 |
+
|
819 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
820 |
+
declare float @llvm.nvvm.fma.rn.f(float, float, float) #0
|
821 |
+
|
822 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
|
823 |
+
declare float @llvm.nvvm.ex2.approx.ftz.f(float) #3
|
824 |
+
|
825 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
826 |
+
attributes #1 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
827 |
+
attributes #2 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
828 |
+
attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
|
829 |
+
attributes #4 = { nounwind }
|
830 |
+
|
831 |
+
!llvm.module.flags = !{!0, !1}
|
832 |
+
!llvm.dbg.cu = !{!2}
|
833 |
+
!nvvm.annotations = !{!4, !5, !5, !4}
|
834 |
+
!llvm.ident = !{!6}
|
835 |
+
|
836 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
837 |
+
!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
|
838 |
+
!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
839 |
+
!3 = !DIFile(filename: "cjfoqo3nutni5cmtw4brla34cz45fusadehkxfkr2fie2qgo7vwt.py", directory: "/tmp/torchinductor_root/jf")
|
840 |
+
!4 = !{ptr @triton__0d1d2de, !"kernel", i32 1}
|
841 |
+
!5 = !{ptr @triton__0d1d2de, !"maxntidx", i32 128}
|
842 |
+
!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
|
843 |
+
!7 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
|
844 |
+
!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
|
845 |
+
!9 = !{}
|
846 |
+
!10 = !DILocation(line: 21, column: 36, scope: !7)
|
847 |
+
!11 = !DILocation(line: 20, column: 28, scope: !7)
|
848 |
+
!12 = !DILocation(line: 20, column: 33, scope: !7)
|
849 |
+
!13 = !DILocation(line: 21, column: 23, scope: !7)
|
850 |
+
!14 = !DILocation(line: 24, column: 30, scope: !7)
|
851 |
+
!15 = !DILocation(line: 24, column: 35, scope: !7)
|
852 |
+
!16 = !DILocation(line: 24, column: 44, scope: !7)
|
853 |
+
!17 = !DILocation(line: 29, column: 18, scope: !7)
|
854 |
+
!18 = !DILocation(line: 30, column: 23, scope: !7)
|
855 |
+
!19 = !DILocation(line: 27, column: 18, scope: !7)
|
856 |
+
!20 = !DILocation(line: 32, column: 18, scope: !7)
|
857 |
+
!21 = !DILocation(line: 33, column: 18, scope: !7)
|
858 |
+
!22 = !DILocation(line: 35, column: 25, scope: !7)
|
859 |
+
!23 = !DILocation(line: 35, column: 37, scope: !7)
|
860 |
+
!24 = !DILocation(line: 35, column: 4, scope: !7)
|
.triton/dump/44b225411009956bfbae22f8bac7d703/triton_.cubin
ADDED
Binary file (13.9 kB). View file
|
|
.triton/dump/44b225411009956bfbae22f8bac7d703/triton_.llir
ADDED
@@ -0,0 +1,304 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
@global_smem = external addrspace(3) global [0 x i8]
|
5 |
+
@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
|
6 |
+
|
7 |
+
define void @triton__0d1d2d3d4d5de6de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i32 %5, i32 %6) local_unnamed_addr !dbg !7 {
|
8 |
+
%8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
|
9 |
+
%9 = and i32 %8, 31, !dbg !10
|
10 |
+
%10 = lshr i32 %8, 5, !dbg !10
|
11 |
+
%11 = and i32 %10, 1, !dbg !10
|
12 |
+
%urem = shl i32 %8, 2, !dbg !10
|
13 |
+
%12 = and i32 %urem, 252, !dbg !10
|
14 |
+
%13 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !11
|
15 |
+
%14 = shl i32 %13, 8, !dbg !12
|
16 |
+
%15 = or i32 %14, %12, !dbg !13
|
17 |
+
%16 = sext i32 %15 to i64, !dbg !14
|
18 |
+
%17 = getelementptr float, ptr addrspace(1) %0, i64 %16, !dbg !14
|
19 |
+
%18 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %17, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !15
|
20 |
+
%19 = extractvalue { i32, i32, i32, i32 } %18, 0, !dbg !15
|
21 |
+
%20 = extractvalue { i32, i32, i32, i32 } %18, 1, !dbg !15
|
22 |
+
%21 = extractvalue { i32, i32, i32, i32 } %18, 2, !dbg !15
|
23 |
+
%22 = extractvalue { i32, i32, i32, i32 } %18, 3, !dbg !15
|
24 |
+
%23 = bitcast i32 %21 to float, !dbg !15
|
25 |
+
%24 = bitcast i32 %22 to float, !dbg !15
|
26 |
+
%25 = getelementptr i16, ptr addrspace(1) %1, i64 %16, !dbg !16
|
27 |
+
%26 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %25, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !17
|
28 |
+
%27 = extractvalue { i32, i32 } %26, 0, !dbg !17
|
29 |
+
%28 = extractvalue { i32, i32 } %26, 1, !dbg !17
|
30 |
+
%29 = trunc i32 %27 to i16, !dbg !17
|
31 |
+
%extelt.offset = lshr i32 %27, 16, !dbg !17
|
32 |
+
%30 = trunc i32 %extelt.offset to i16, !dbg !17
|
33 |
+
%31 = trunc i32 %28 to i16, !dbg !17
|
34 |
+
%extelt.offset1 = lshr i32 %28, 16, !dbg !17
|
35 |
+
%32 = trunc i32 %extelt.offset1 to i16, !dbg !17
|
36 |
+
%33 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %29) #6, !dbg !18
|
37 |
+
%34 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %30) #6, !dbg !18
|
38 |
+
%35 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %31) #6, !dbg !18
|
39 |
+
%36 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %32) #6, !dbg !18
|
40 |
+
%37 = getelementptr i16, ptr addrspace(1) %2, i64 %16, !dbg !19
|
41 |
+
%38 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %37, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !20
|
42 |
+
%39 = extractvalue { i32, i32 } %38, 0, !dbg !20
|
43 |
+
%40 = extractvalue { i32, i32 } %38, 1, !dbg !20
|
44 |
+
%41 = trunc i32 %39 to i16, !dbg !20
|
45 |
+
%extelt.offset2 = lshr i32 %39, 16, !dbg !20
|
46 |
+
%42 = trunc i32 %extelt.offset2 to i16, !dbg !20
|
47 |
+
%43 = trunc i32 %40 to i16, !dbg !20
|
48 |
+
%extelt.offset3 = lshr i32 %40, 16, !dbg !20
|
49 |
+
%44 = trunc i32 %extelt.offset3 to i16, !dbg !20
|
50 |
+
%45 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %41) #6, !dbg !21
|
51 |
+
%46 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %42) #6, !dbg !21
|
52 |
+
%47 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %43) #6, !dbg !21
|
53 |
+
%48 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %44) #6, !dbg !21
|
54 |
+
%49 = zext nneg i32 %12 to i64, !dbg !22
|
55 |
+
%50 = getelementptr float, ptr addrspace(1) %3, i64 %49, !dbg !22
|
56 |
+
%51 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %50, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !23
|
57 |
+
%52 = fadd float %35, %23, !dbg !24
|
58 |
+
%53 = fadd float %36, %24, !dbg !24
|
59 |
+
%54 = insertelement <2 x i32> poison, i32 %19, i64 0, !dbg !15
|
60 |
+
%55 = insertelement <2 x i32> %54, i32 %20, i64 1, !dbg !15
|
61 |
+
%56 = bitcast <2 x i32> %55 to <2 x float>, !dbg !15
|
62 |
+
%57 = insertelement <2 x float> poison, float %33, i64 0, !dbg !24
|
63 |
+
%58 = insertelement <2 x float> %57, float %34, i64 1, !dbg !24
|
64 |
+
%59 = fadd <2 x float> %58, %56, !dbg !24
|
65 |
+
%60 = insertelement <2 x float> poison, float %45, i64 0, !dbg !25
|
66 |
+
%61 = insertelement <2 x float> %60, float %46, i64 1, !dbg !25
|
67 |
+
%62 = fadd <2 x float> %59, %61, !dbg !25
|
68 |
+
%63 = fadd float %52, %47, !dbg !25
|
69 |
+
%64 = fadd float %53, %48, !dbg !25
|
70 |
+
%65 = extractelement <2 x float> %62, i64 0, !dbg !26
|
71 |
+
%66 = extractelement <2 x float> %62, i64 1, !dbg !26
|
72 |
+
%67 = fadd float %65, %66, !dbg !26
|
73 |
+
%68 = fadd float %67, %63, !dbg !26
|
74 |
+
%69 = fadd float %68, %64, !dbg !26
|
75 |
+
%70 = bitcast float %69 to i32, !dbg !32
|
76 |
+
%71 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %70, i32 16, i32 31), !dbg !32
|
77 |
+
%72 = bitcast i32 %71 to float, !dbg !32
|
78 |
+
%73 = fadd float %69, %72, !dbg !26
|
79 |
+
%74 = bitcast float %73 to i32, !dbg !32
|
80 |
+
%75 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %74, i32 8, i32 31), !dbg !32
|
81 |
+
%76 = bitcast i32 %75 to float, !dbg !32
|
82 |
+
%77 = fadd float %73, %76, !dbg !26
|
83 |
+
%78 = bitcast float %77 to i32, !dbg !32
|
84 |
+
%79 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %78, i32 4, i32 31), !dbg !32
|
85 |
+
%80 = bitcast i32 %79 to float, !dbg !32
|
86 |
+
%81 = fadd float %77, %80, !dbg !26
|
87 |
+
%82 = bitcast float %81 to i32, !dbg !32
|
88 |
+
%83 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %82, i32 2, i32 31), !dbg !32
|
89 |
+
%84 = bitcast i32 %83 to float, !dbg !32
|
90 |
+
%85 = fadd float %81, %84, !dbg !26
|
91 |
+
%86 = bitcast float %85 to i32, !dbg !32
|
92 |
+
%87 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %86, i32 1, i32 31), !dbg !32
|
93 |
+
%88 = bitcast i32 %87 to float, !dbg !32
|
94 |
+
%89 = fadd float %85, %88, !dbg !26
|
95 |
+
%90 = icmp eq i32 %9, 0, !dbg !32
|
96 |
+
%91 = zext nneg i32 %11 to i64, !dbg !32
|
97 |
+
%92 = getelementptr float, ptr addrspace(3) @global_smem, i64 %91, !dbg !32
|
98 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %92, float %89, i1 %90) #6, !dbg !32
|
99 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !32
|
100 |
+
%93 = icmp slt i32 %8, 2, !dbg !32
|
101 |
+
%94 = sext i32 %8 to i64, !dbg !32
|
102 |
+
%95 = getelementptr float, ptr addrspace(3) @global_smem, i64 %94, !dbg !32
|
103 |
+
%96 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %95, i1 %93) #6, !dbg !32
|
104 |
+
%97 = bitcast float %96 to i32, !dbg !32
|
105 |
+
%98 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %97, i32 1, i32 31), !dbg !32
|
106 |
+
%99 = bitcast i32 %98 to float, !dbg !32
|
107 |
+
%100 = fadd float %96, %99, !dbg !26
|
108 |
+
%101 = and i32 %8, 1, !dbg !32
|
109 |
+
%102 = icmp eq i32 %101, 0, !dbg !32
|
110 |
+
%103 = and i1 %93, %102, !dbg !32
|
111 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %95, float %100, i1 %103) #6, !dbg !32
|
112 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !32
|
113 |
+
%104 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !32
|
114 |
+
%105 = fadd float %104, 0.000000e+00, !dbg !34
|
115 |
+
%106 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %105, float 2.560000e+02) #6, !dbg !38
|
116 |
+
%107 = fsub float %65, %106, !dbg !39
|
117 |
+
%108 = fsub float %66, %106, !dbg !39
|
118 |
+
%109 = fsub float %63, %106, !dbg !39
|
119 |
+
%110 = fsub float %64, %106, !dbg !39
|
120 |
+
%111 = fmul float %107, %107, !dbg !40
|
121 |
+
%112 = fmul float %108, %108, !dbg !40
|
122 |
+
%113 = fmul float %109, %109, !dbg !40
|
123 |
+
%114 = fmul float %110, %110, !dbg !40
|
124 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !41
|
125 |
+
%115 = fadd float %111, %112, !dbg !43
|
126 |
+
%116 = fadd float %113, %115, !dbg !43
|
127 |
+
%117 = fadd float %114, %116, !dbg !43
|
128 |
+
%118 = bitcast float %117 to i32, !dbg !41
|
129 |
+
%119 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %118, i32 16, i32 31), !dbg !41
|
130 |
+
%120 = bitcast i32 %119 to float, !dbg !41
|
131 |
+
%121 = fadd float %117, %120, !dbg !43
|
132 |
+
%122 = bitcast float %121 to i32, !dbg !41
|
133 |
+
%123 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %122, i32 8, i32 31), !dbg !41
|
134 |
+
%124 = bitcast i32 %123 to float, !dbg !41
|
135 |
+
%125 = fadd float %121, %124, !dbg !43
|
136 |
+
%126 = bitcast float %125 to i32, !dbg !41
|
137 |
+
%127 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %126, i32 4, i32 31), !dbg !41
|
138 |
+
%128 = bitcast i32 %127 to float, !dbg !41
|
139 |
+
%129 = fadd float %125, %128, !dbg !43
|
140 |
+
%130 = bitcast float %129 to i32, !dbg !41
|
141 |
+
%131 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %130, i32 2, i32 31), !dbg !41
|
142 |
+
%132 = bitcast i32 %131 to float, !dbg !41
|
143 |
+
%133 = fadd float %129, %132, !dbg !43
|
144 |
+
%134 = bitcast float %133 to i32, !dbg !41
|
145 |
+
%135 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %134, i32 1, i32 31), !dbg !41
|
146 |
+
%136 = bitcast i32 %135 to float, !dbg !41
|
147 |
+
%137 = fadd float %133, %136, !dbg !43
|
148 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %92, float %137, i1 %90) #6, !dbg !41
|
149 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !41
|
150 |
+
%138 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %95, i1 %93) #6, !dbg !41
|
151 |
+
%139 = bitcast float %138 to i32, !dbg !41
|
152 |
+
%140 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %139, i32 1, i32 31), !dbg !41
|
153 |
+
%141 = bitcast i32 %140 to float, !dbg !41
|
154 |
+
%142 = fadd float %138, %141, !dbg !43
|
155 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %95, float %142, i1 %103) #6, !dbg !41
|
156 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !41
|
157 |
+
%143 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !41
|
158 |
+
%144 = fadd float %143, 0.000000e+00, !dbg !46
|
159 |
+
%145 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %144, float 2.560000e+02) #6, !dbg !48
|
160 |
+
%146 = fadd float %145, 0x3EE4F8B580000000, !dbg !49
|
161 |
+
%147 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !50
|
162 |
+
%.not.i = icmp eq i32 %147, 0, !dbg !50
|
163 |
+
br i1 %.not.i, label %150, label %148, !dbg !50
|
164 |
+
|
165 |
+
148: ; preds = %7
|
166 |
+
%149 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %146), !dbg !50
|
167 |
+
br label %__nv_rsqrtf.exit, !dbg !50
|
168 |
+
|
169 |
+
150: ; preds = %7
|
170 |
+
%151 = tail call float @llvm.nvvm.rsqrt.approx.f(float %146), !dbg !50
|
171 |
+
br label %__nv_rsqrtf.exit, !dbg !50
|
172 |
+
|
173 |
+
__nv_rsqrtf.exit: ; preds = %148, %150
|
174 |
+
%.0.i = phi float [ %149, %148 ], [ %151, %150 ], !dbg !50
|
175 |
+
%152 = extractvalue { i32, i32, i32, i32 } %51, 3, !dbg !23
|
176 |
+
%153 = bitcast i32 %152 to float, !dbg !23
|
177 |
+
%154 = extractvalue { i32, i32, i32, i32 } %51, 2, !dbg !23
|
178 |
+
%155 = bitcast i32 %154 to float, !dbg !23
|
179 |
+
%156 = extractvalue { i32, i32, i32, i32 } %51, 1, !dbg !23
|
180 |
+
%157 = bitcast i32 %156 to float, !dbg !23
|
181 |
+
%158 = extractvalue { i32, i32, i32, i32 } %51, 0, !dbg !23
|
182 |
+
%159 = bitcast i32 %158 to float, !dbg !23
|
183 |
+
%160 = fmul float %107, %.0.i, !dbg !51
|
184 |
+
%161 = fmul float %108, %.0.i, !dbg !51
|
185 |
+
%162 = fmul float %109, %.0.i, !dbg !51
|
186 |
+
%163 = fmul float %110, %.0.i, !dbg !51
|
187 |
+
%164 = fmul float %160, %159, !dbg !52
|
188 |
+
%165 = fmul float %161, %157, !dbg !52
|
189 |
+
%166 = fmul float %162, %155, !dbg !52
|
190 |
+
%167 = fmul float %163, %153, !dbg !52
|
191 |
+
%168 = getelementptr float, ptr addrspace(1) %4, i64 %16, !dbg !53
|
192 |
+
%169 = bitcast float %164 to i32, !dbg !54
|
193 |
+
%170 = bitcast float %165 to i32, !dbg !54
|
194 |
+
%171 = bitcast float %166 to i32, !dbg !54
|
195 |
+
%172 = bitcast float %167 to i32, !dbg !54
|
196 |
+
tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %169, i32 %170, i32 %171, i32 %172, ptr addrspace(1) %168, i1 true) #6, !dbg !54
|
197 |
+
ret void, !dbg !55
|
198 |
+
}
|
199 |
+
|
200 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
201 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
202 |
+
|
203 |
+
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
|
204 |
+
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
|
205 |
+
|
206 |
+
; Function Attrs: convergent nocallback nounwind
|
207 |
+
declare void @llvm.nvvm.barrier0() #2
|
208 |
+
|
209 |
+
; Function Attrs: alwaysinline nounwind
|
210 |
+
define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
|
211 |
+
%1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
|
212 |
+
%.not = icmp eq i32 %1, 0
|
213 |
+
br i1 %.not, label %4, label %2
|
214 |
+
|
215 |
+
2: ; preds = %0
|
216 |
+
%3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
|
217 |
+
br label %6
|
218 |
+
|
219 |
+
4: ; preds = %0
|
220 |
+
%5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
|
221 |
+
br label %6
|
222 |
+
|
223 |
+
6: ; preds = %4, %2
|
224 |
+
%.0 = phi float [ %3, %2 ], [ %5, %4 ]
|
225 |
+
ret float %.0
|
226 |
+
}
|
227 |
+
|
228 |
+
declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
|
229 |
+
|
230 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
|
231 |
+
declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
|
232 |
+
|
233 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
|
234 |
+
declare float @llvm.nvvm.rsqrt.approx.f(float) #5
|
235 |
+
|
236 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
237 |
+
attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
|
238 |
+
attributes #2 = { convergent nocallback nounwind }
|
239 |
+
attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
240 |
+
attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
241 |
+
attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
|
242 |
+
attributes #6 = { nounwind }
|
243 |
+
|
244 |
+
!llvm.module.flags = !{!0, !1}
|
245 |
+
!llvm.dbg.cu = !{!2}
|
246 |
+
!nvvm.annotations = !{!4, !5, !5, !4}
|
247 |
+
!llvm.ident = !{!6}
|
248 |
+
|
249 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
250 |
+
!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
|
251 |
+
!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
252 |
+
!3 = !DIFile(filename: "ctvr3xs46luhhbr7xomihgyropjaatss7yata4igaw6kvgwas7g2.py", directory: "/tmp/torchinductor_root/tv")
|
253 |
+
!4 = !{ptr @triton__0d1d2d3d4d5de6de, !"kernel", i32 1}
|
254 |
+
!5 = !{ptr @triton__0d1d2d3d4d5de6de, !"maxntidx", i32 64}
|
255 |
+
!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
|
256 |
+
!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5de6de", linkageName: "triton__0d1d2d3d4d5de6de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
|
257 |
+
!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
|
258 |
+
!9 = !{}
|
259 |
+
!10 = !DILocation(line: 26, column: 26, scope: !7)
|
260 |
+
!11 = !DILocation(line: 23, column: 28, scope: !7)
|
261 |
+
!12 = !DILocation(line: 30, column: 40, scope: !7)
|
262 |
+
!13 = !DILocation(line: 30, column: 36, scope: !7)
|
263 |
+
!14 = !DILocation(line: 30, column: 30, scope: !7)
|
264 |
+
!15 = !DILocation(line: 30, column: 46, scope: !7)
|
265 |
+
!16 = !DILocation(line: 31, column: 30, scope: !7)
|
266 |
+
!17 = !DILocation(line: 31, column: 46, scope: !7)
|
267 |
+
!18 = !DILocation(line: 31, column: 67, scope: !7)
|
268 |
+
!19 = !DILocation(line: 32, column: 30, scope: !7)
|
269 |
+
!20 = !DILocation(line: 32, column: 46, scope: !7)
|
270 |
+
!21 = !DILocation(line: 32, column: 67, scope: !7)
|
271 |
+
!22 = !DILocation(line: 33, column: 31, scope: !7)
|
272 |
+
!23 = !DILocation(line: 33, column: 36, scope: !7)
|
273 |
+
!24 = !DILocation(line: 35, column: 18, scope: !7)
|
274 |
+
!25 = !DILocation(line: 37, column: 18, scope: !7)
|
275 |
+
!26 = !DILocation(line: 233, column: 15, scope: !27, inlinedAt: !30)
|
276 |
+
!27 = distinct !DILexicalBlockFile(scope: !29, file: !28, discriminator: 0)
|
277 |
+
!28 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
|
278 |
+
!29 = distinct !DILexicalBlockFile(scope: !7, file: !28, discriminator: 0)
|
279 |
+
!30 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !31)
|
280 |
+
!31 = !DILocation(line: 42, column: 59, scope: !27)
|
281 |
+
!32 = !DILocation(line: 243, column: 36, scope: !29, inlinedAt: !33)
|
282 |
+
!33 = !DILocation(line: 42, column: 59, scope: !29)
|
283 |
+
!34 = !DILocation(line: 8, column: 15, scope: !35, inlinedAt: !37)
|
284 |
+
!35 = distinct !DILexicalBlockFile(scope: !7, file: !36, discriminator: 0)
|
285 |
+
!36 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
|
286 |
+
!37 = !DILocation(line: 42, column: 45, scope: !35)
|
287 |
+
!38 = !DILocation(line: 45, column: 20, scope: !7)
|
288 |
+
!39 = !DILocation(line: 46, column: 19, scope: !7)
|
289 |
+
!40 = !DILocation(line: 47, column: 20, scope: !7)
|
290 |
+
!41 = !DILocation(line: 243, column: 36, scope: !29, inlinedAt: !42)
|
291 |
+
!42 = !DILocation(line: 50, column: 59, scope: !29)
|
292 |
+
!43 = !DILocation(line: 233, column: 15, scope: !27, inlinedAt: !44)
|
293 |
+
!44 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !45)
|
294 |
+
!45 = !DILocation(line: 50, column: 59, scope: !27)
|
295 |
+
!46 = !DILocation(line: 8, column: 15, scope: !35, inlinedAt: !47)
|
296 |
+
!47 = !DILocation(line: 50, column: 45, scope: !35)
|
297 |
+
!48 = !DILocation(line: 53, column: 20, scope: !7)
|
298 |
+
!49 = !DILocation(line: 55, column: 20, scope: !7)
|
299 |
+
!50 = !DILocation(line: 56, column: 26, scope: !7)
|
300 |
+
!51 = !DILocation(line: 57, column: 20, scope: !7)
|
301 |
+
!52 = !DILocation(line: 58, column: 20, scope: !7)
|
302 |
+
!53 = !DILocation(line: 59, column: 25, scope: !7)
|
303 |
+
!54 = !DILocation(line: 59, column: 48, scope: !7)
|
304 |
+
!55 = !DILocation(line: 59, column: 4, scope: !7)
|
.triton/dump/44b225411009956bfbae22f8bac7d703/triton_.ttir
ADDED
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
3 |
+
%c256_i32 = arith.constant 256 : i32
|
4 |
+
%cst = arith.constant dense<0.000000e+00> : tensor<256xbf16>
|
5 |
+
%cst_0 = arith.constant 0.000000e+00 : f32
|
6 |
+
%cst_1 = arith.constant 2.560000e+02 : f32
|
7 |
+
%cst_2 = arith.constant 9.99999974E-6 : f32
|
8 |
+
%cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32>
|
9 |
+
%cst_4 = arith.constant dense<256> : tensor<256xi32>
|
10 |
+
%0 = tt.get_program_id x : i32
|
11 |
+
%1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
|
12 |
+
%2 = arith.cmpi slt, %1, %cst_4 : tensor<256xi32>
|
13 |
+
%3 = arith.muli %0, %c256_i32 : i32
|
14 |
+
%4 = tt.splat %3 : (i32) -> tensor<256xi32>
|
15 |
+
%5 = arith.addi %1, %4 : tensor<256xi32>
|
16 |
+
%6 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
|
17 |
+
%7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
|
18 |
+
%8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32>
|
19 |
+
%9 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
|
20 |
+
%10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
|
21 |
+
%11 = tt.load %10, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
|
22 |
+
%12 = arith.extf %11 : tensor<256xbf16> to tensor<256xf32>
|
23 |
+
%13 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
|
24 |
+
%14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
|
25 |
+
%15 = tt.load %14, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
|
26 |
+
%16 = arith.extf %15 : tensor<256xbf16> to tensor<256xf32>
|
27 |
+
%17 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
|
28 |
+
%18 = tt.addptr %17, %1 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
|
29 |
+
%19 = tt.load %18, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32>
|
30 |
+
%20 = arith.addf %8, %12 : tensor<256xf32>
|
31 |
+
%21 = arith.addf %20, %16 : tensor<256xf32>
|
32 |
+
%22 = arith.select %2, %21, %cst_3 : tensor<256xi1>, tensor<256xf32>
|
33 |
+
%23 = "tt.reduce"(%22) <{axis = 0 : i32}> ({
|
34 |
+
^bb0(%arg7: f32, %arg8: f32):
|
35 |
+
%40 = arith.addf %arg7, %arg8 : f32
|
36 |
+
tt.reduce.return %40 : f32
|
37 |
+
}) : (tensor<256xf32>) -> f32
|
38 |
+
%24 = arith.addf %23, %cst_0 : f32
|
39 |
+
%25 = arith.divf %24, %cst_1 : f32
|
40 |
+
%26 = tt.splat %25 : (f32) -> tensor<256xf32>
|
41 |
+
%27 = arith.subf %21, %26 : tensor<256xf32>
|
42 |
+
%28 = arith.mulf %27, %27 : tensor<256xf32>
|
43 |
+
%29 = arith.select %2, %28, %cst_3 : tensor<256xi1>, tensor<256xf32>
|
44 |
+
%30 = "tt.reduce"(%29) <{axis = 0 : i32}> ({
|
45 |
+
^bb0(%arg7: f32, %arg8: f32):
|
46 |
+
%40 = arith.addf %arg7, %arg8 : f32
|
47 |
+
tt.reduce.return %40 : f32
|
48 |
+
}) : (tensor<256xf32>) -> f32
|
49 |
+
%31 = arith.addf %30, %cst_0 : f32
|
50 |
+
%32 = arith.divf %31, %cst_1 : f32
|
51 |
+
%33 = arith.addf %32, %cst_2 : f32
|
52 |
+
%34 = tt.extern_elementwise %33 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
|
53 |
+
%35 = tt.splat %34 : (f32) -> tensor<256xf32>
|
54 |
+
%36 = arith.mulf %27, %35 : tensor<256xf32>
|
55 |
+
%37 = arith.mulf %36, %19 : tensor<256xf32>
|
56 |
+
%38 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
|
57 |
+
%39 = tt.addptr %38, %5 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
|
58 |
+
tt.store %39, %37, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32>
|
59 |
+
tt.return
|
60 |
+
}
|
61 |
+
}
|
.triton/dump/4c6ad48573c74d55ed79384f6b432d50/triton_.ttir
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1d2de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
3 |
+
%c1024_i32 = arith.constant 1024 : i32
|
4 |
+
%0 = tt.get_program_id x : i32
|
5 |
+
%1 = arith.muli %0, %c1024_i32 : i32
|
6 |
+
%2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32>
|
7 |
+
%3 = tt.splat %1 : (i32) -> tensor<1024xi32>
|
8 |
+
%4 = arith.addi %3, %2 : tensor<1024xi32>
|
9 |
+
%5 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>>
|
10 |
+
%6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr<f32, 1>>, tensor<1024xi32>
|
11 |
+
%7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xf32>
|
12 |
+
%8 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>>
|
13 |
+
%9 = tt.addptr %8, %4 : tensor<1024x!tt.ptr<bf16, 1>>, tensor<1024xi32>
|
14 |
+
%10 = arith.truncf %7 : tensor<1024xf32> to tensor<1024xbf16>
|
15 |
+
tt.store %9, %10 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xbf16>
|
16 |
+
tt.return
|
17 |
+
}
|
18 |
+
}
|
.triton/dump/4d7b96448927b8146af43cb9f39e0544/triton_.llir
ADDED
@@ -0,0 +1,362 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
|
5 |
+
|
6 |
+
define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !7 {
|
7 |
+
%4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
|
8 |
+
%5 = shl i32 %4, 1, !dbg !10
|
9 |
+
%6 = and i32 %5, 510, !dbg !10
|
10 |
+
%7 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #4, !dbg !11
|
11 |
+
%8 = shl i32 %7, 9, !dbg !12
|
12 |
+
%9 = or i32 %8, %6, !dbg !13
|
13 |
+
%10 = sext i32 %9 to i64, !dbg !14
|
14 |
+
%11 = getelementptr i16, ptr addrspace(1) %0, i64 %10, !dbg !14
|
15 |
+
%12 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %11, i1 true) #4, !dbg !15
|
16 |
+
%13 = trunc i32 %12 to i16, !dbg !15
|
17 |
+
%extelt.offset = lshr i32 %12, 16, !dbg !15
|
18 |
+
%14 = trunc i32 %extelt.offset to i16, !dbg !15
|
19 |
+
%15 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %13) #4, !dbg !16
|
20 |
+
%16 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %14) #4, !dbg !16
|
21 |
+
%17 = getelementptr i16, ptr addrspace(1) %1, i64 %10, !dbg !17
|
22 |
+
%18 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %17, i1 true) #4, !dbg !18
|
23 |
+
%19 = trunc i32 %18 to i16, !dbg !18
|
24 |
+
%extelt.offset1 = lshr i32 %18, 16, !dbg !18
|
25 |
+
%20 = trunc i32 %extelt.offset1 to i16, !dbg !18
|
26 |
+
%21 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %19) #4, !dbg !19
|
27 |
+
%22 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %20) #4, !dbg !19
|
28 |
+
%23 = fmul float %21, 0x3FE6A09E60000000, !dbg !20
|
29 |
+
%24 = fmul float %22, 0x3FE6A09E60000000, !dbg !20
|
30 |
+
%25 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
31 |
+
%.not.i = icmp eq i32 %25, 0, !dbg !21
|
32 |
+
%26 = tail call float @llvm.nvvm.fabs.ftz.f(float %23) #4, !dbg !21
|
33 |
+
%27 = tail call float @llvm.nvvm.fabs.f(float %23) #4, !dbg !21
|
34 |
+
%.0.i = select i1 %.not.i, float %27, float %26, !dbg !21
|
35 |
+
%28 = fcmp oge float %.0.i, 0x3FF00C1FC0000000, !dbg !21
|
36 |
+
br i1 %28, label %__nv_fabsf.exit1.i, label %30, !dbg !21
|
37 |
+
|
38 |
+
__nv_fabsf.exit1.i: ; preds = %3
|
39 |
+
%29 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
40 |
+
%.not1.i = icmp eq i32 %29, 0, !dbg !21
|
41 |
+
%.01.i = select i1 %.not1.i, float %27, float %26, !dbg !21
|
42 |
+
br label %__internal_fmad.exit.i, !dbg !21
|
43 |
+
|
44 |
+
30: ; preds = %3
|
45 |
+
%31 = fmul float %23, %23, !dbg !21
|
46 |
+
br label %__internal_fmad.exit.i, !dbg !21
|
47 |
+
|
48 |
+
__internal_fmad.exit.i: ; preds = %30, %__nv_fabsf.exit1.i
|
49 |
+
%32 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i ], [ 0x3FC06EBA60000000, %30 ], !dbg !21
|
50 |
+
%33 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i ], [ 0xBFD8127580000000, %30 ], !dbg !21
|
51 |
+
%34 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i ], [ 0x3FBCE315E0000000, %30 ], !dbg !21
|
52 |
+
%35 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i ], [ 0xBF9B837CE0000000, %30 ], !dbg !21
|
53 |
+
%36 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i ], [ 0x3F755ABD40000000, %30 ], !dbg !21
|
54 |
+
%37 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i ], [ 0xBF4AE9A400000000, %30 ], !dbg !21
|
55 |
+
%38 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i ], [ 0x3F163D2D40000000, %30 ], !dbg !21
|
56 |
+
%39 = phi float [ %.01.i, %__nv_fabsf.exit1.i ], [ %31, %30 ], !dbg !21
|
57 |
+
%40 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
58 |
+
%.not2.i = icmp eq i32 %40, 0, !dbg !21
|
59 |
+
%41 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %38, float %39, float %37) #4, !dbg !21
|
60 |
+
%42 = tail call float @llvm.nvvm.fma.rn.f(float %38, float %39, float %37) #4, !dbg !21
|
61 |
+
%.02.i = select i1 %.not2.i, float %42, float %41, !dbg !21
|
62 |
+
%43 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
63 |
+
%.not3.i = icmp eq i32 %43, 0, !dbg !21
|
64 |
+
%44 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i, float %39, float %36) #4, !dbg !21
|
65 |
+
%45 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i, float %39, float %36) #4, !dbg !21
|
66 |
+
%.03.i = select i1 %.not3.i, float %45, float %44, !dbg !21
|
67 |
+
%46 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
68 |
+
%.not4.i = icmp eq i32 %46, 0, !dbg !21
|
69 |
+
%47 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i, float %39, float %35) #4, !dbg !21
|
70 |
+
%48 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i, float %39, float %35) #4, !dbg !21
|
71 |
+
%.04.i = select i1 %.not4.i, float %48, float %47, !dbg !21
|
72 |
+
%49 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
73 |
+
%.not5.i = icmp eq i32 %49, 0, !dbg !21
|
74 |
+
%50 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i, float %39, float %34) #4, !dbg !21
|
75 |
+
%51 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i, float %39, float %34) #4, !dbg !21
|
76 |
+
%.05.i = select i1 %.not5.i, float %51, float %50, !dbg !21
|
77 |
+
%52 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
78 |
+
%.not6.i = icmp eq i32 %52, 0, !dbg !21
|
79 |
+
%53 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i, float %39, float %33) #4, !dbg !21
|
80 |
+
%54 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i, float %39, float %33) #4, !dbg !21
|
81 |
+
%.06.i = select i1 %.not6.i, float %54, float %53, !dbg !21
|
82 |
+
%55 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
83 |
+
%.not7.i = icmp eq i32 %55, 0, !dbg !21
|
84 |
+
%56 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i, float %39, float %32) #4, !dbg !21
|
85 |
+
%57 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i, float %39, float %32) #4, !dbg !21
|
86 |
+
%.07.i = select i1 %.not7.i, float %57, float %56, !dbg !21
|
87 |
+
%58 = fneg float %39, !dbg !21
|
88 |
+
%59 = select i1 %28, float %58, float %23, !dbg !21
|
89 |
+
%60 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
90 |
+
%.not8.i = icmp eq i32 %60, 0, !dbg !21
|
91 |
+
%61 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i, float %59, float %59) #4, !dbg !21
|
92 |
+
%62 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i, float %59, float %59) #4, !dbg !21
|
93 |
+
%.08.i = select i1 %.not8.i, float %62, float %61, !dbg !21
|
94 |
+
br i1 %28, label %63, label %__nv_erff.exit, !dbg !21
|
95 |
+
|
96 |
+
63: ; preds = %__internal_fmad.exit.i
|
97 |
+
%64 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i) #4, !dbg !21
|
98 |
+
%65 = fsub float 1.000000e+00, %64, !dbg !21
|
99 |
+
%66 = bitcast float %65 to i32, !dbg !21
|
100 |
+
%67 = bitcast float %23 to i32, !dbg !21
|
101 |
+
%68 = and i32 %67, -2147483648, !dbg !21
|
102 |
+
%69 = or i32 %68, %66, !dbg !21
|
103 |
+
%70 = bitcast i32 %69 to float, !dbg !21
|
104 |
+
br label %__nv_erff.exit, !dbg !21
|
105 |
+
|
106 |
+
__nv_erff.exit: ; preds = %__internal_fmad.exit.i, %63
|
107 |
+
%r.0.i = phi float [ %70, %63 ], [ %.08.i, %__internal_fmad.exit.i ], !dbg !21
|
108 |
+
%71 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
109 |
+
%.not.i2 = icmp eq i32 %71, 0, !dbg !21
|
110 |
+
%72 = tail call float @llvm.nvvm.fabs.ftz.f(float %24) #4, !dbg !21
|
111 |
+
%73 = tail call float @llvm.nvvm.fabs.f(float %24) #4, !dbg !21
|
112 |
+
%.0.i3 = select i1 %.not.i2, float %73, float %72, !dbg !21
|
113 |
+
%74 = fcmp oge float %.0.i3, 0x3FF00C1FC0000000, !dbg !21
|
114 |
+
br i1 %74, label %__nv_fabsf.exit1.i20, label %76, !dbg !21
|
115 |
+
|
116 |
+
__nv_fabsf.exit1.i20: ; preds = %__nv_erff.exit
|
117 |
+
%75 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
118 |
+
%.not1.i21 = icmp eq i32 %75, 0, !dbg !21
|
119 |
+
%.01.i22 = select i1 %.not1.i21, float %73, float %72, !dbg !21
|
120 |
+
br label %__internal_fmad.exit.i4, !dbg !21
|
121 |
+
|
122 |
+
76: ; preds = %__nv_erff.exit
|
123 |
+
%77 = fmul float %24, %24, !dbg !21
|
124 |
+
br label %__internal_fmad.exit.i4, !dbg !21
|
125 |
+
|
126 |
+
__internal_fmad.exit.i4: ; preds = %76, %__nv_fabsf.exit1.i20
|
127 |
+
%78 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i20 ], [ 0x3FC06EBA60000000, %76 ], !dbg !21
|
128 |
+
%79 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i20 ], [ 0xBFD8127580000000, %76 ], !dbg !21
|
129 |
+
%80 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i20 ], [ 0x3FBCE315E0000000, %76 ], !dbg !21
|
130 |
+
%81 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i20 ], [ 0xBF9B837CE0000000, %76 ], !dbg !21
|
131 |
+
%82 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i20 ], [ 0x3F755ABD40000000, %76 ], !dbg !21
|
132 |
+
%83 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i20 ], [ 0xBF4AE9A400000000, %76 ], !dbg !21
|
133 |
+
%84 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i20 ], [ 0x3F163D2D40000000, %76 ], !dbg !21
|
134 |
+
%85 = phi float [ %.01.i22, %__nv_fabsf.exit1.i20 ], [ %77, %76 ], !dbg !21
|
135 |
+
%86 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
136 |
+
%.not2.i5 = icmp eq i32 %86, 0, !dbg !21
|
137 |
+
%87 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %84, float %85, float %83) #4, !dbg !21
|
138 |
+
%88 = tail call float @llvm.nvvm.fma.rn.f(float %84, float %85, float %83) #4, !dbg !21
|
139 |
+
%.02.i6 = select i1 %.not2.i5, float %88, float %87, !dbg !21
|
140 |
+
%89 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
141 |
+
%.not3.i7 = icmp eq i32 %89, 0, !dbg !21
|
142 |
+
%90 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i6, float %85, float %82) #4, !dbg !21
|
143 |
+
%91 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i6, float %85, float %82) #4, !dbg !21
|
144 |
+
%.03.i8 = select i1 %.not3.i7, float %91, float %90, !dbg !21
|
145 |
+
%92 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
146 |
+
%.not4.i9 = icmp eq i32 %92, 0, !dbg !21
|
147 |
+
%93 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i8, float %85, float %81) #4, !dbg !21
|
148 |
+
%94 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i8, float %85, float %81) #4, !dbg !21
|
149 |
+
%.04.i10 = select i1 %.not4.i9, float %94, float %93, !dbg !21
|
150 |
+
%95 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
151 |
+
%.not5.i11 = icmp eq i32 %95, 0, !dbg !21
|
152 |
+
%96 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i10, float %85, float %80) #4, !dbg !21
|
153 |
+
%97 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i10, float %85, float %80) #4, !dbg !21
|
154 |
+
%.05.i12 = select i1 %.not5.i11, float %97, float %96, !dbg !21
|
155 |
+
%98 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
156 |
+
%.not6.i13 = icmp eq i32 %98, 0, !dbg !21
|
157 |
+
%99 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i12, float %85, float %79) #4, !dbg !21
|
158 |
+
%100 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i12, float %85, float %79) #4, !dbg !21
|
159 |
+
%.06.i14 = select i1 %.not6.i13, float %100, float %99, !dbg !21
|
160 |
+
%101 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
161 |
+
%.not7.i15 = icmp eq i32 %101, 0, !dbg !21
|
162 |
+
%102 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i14, float %85, float %78) #4, !dbg !21
|
163 |
+
%103 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i14, float %85, float %78) #4, !dbg !21
|
164 |
+
%.07.i16 = select i1 %.not7.i15, float %103, float %102, !dbg !21
|
165 |
+
%104 = fneg float %85, !dbg !21
|
166 |
+
%105 = select i1 %74, float %104, float %24, !dbg !21
|
167 |
+
%106 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
168 |
+
%.not8.i17 = icmp eq i32 %106, 0, !dbg !21
|
169 |
+
%107 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i16, float %105, float %105) #4, !dbg !21
|
170 |
+
%108 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i16, float %105, float %105) #4, !dbg !21
|
171 |
+
%.08.i18 = select i1 %.not8.i17, float %108, float %107, !dbg !21
|
172 |
+
br i1 %74, label %109, label %__nv_erff.exit23, !dbg !21
|
173 |
+
|
174 |
+
109: ; preds = %__internal_fmad.exit.i4
|
175 |
+
%110 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i18) #4, !dbg !21
|
176 |
+
%111 = fsub float 1.000000e+00, %110, !dbg !21
|
177 |
+
%112 = bitcast float %111 to i32, !dbg !21
|
178 |
+
%113 = bitcast float %24 to i32, !dbg !21
|
179 |
+
%114 = and i32 %113, -2147483648, !dbg !21
|
180 |
+
%115 = or i32 %114, %112, !dbg !21
|
181 |
+
%116 = bitcast i32 %115 to float, !dbg !21
|
182 |
+
br label %__nv_erff.exit23, !dbg !21
|
183 |
+
|
184 |
+
__nv_erff.exit23: ; preds = %__internal_fmad.exit.i4, %109
|
185 |
+
%r.0.i19 = phi float [ %116, %109 ], [ %.08.i18, %__internal_fmad.exit.i4 ], !dbg !21
|
186 |
+
%117 = fadd float %r.0.i, 1.000000e+00, !dbg !22
|
187 |
+
%118 = fadd float %r.0.i19, 1.000000e+00, !dbg !22
|
188 |
+
%119 = fmul float %117, 5.000000e-01, !dbg !23
|
189 |
+
%120 = fmul float %118, 5.000000e-01, !dbg !23
|
190 |
+
%121 = fmul float %21, %21, !dbg !24
|
191 |
+
%122 = fmul float %22, %22, !dbg !24
|
192 |
+
%123 = fmul float %121, -5.000000e-01, !dbg !25
|
193 |
+
%124 = fmul float %122, -5.000000e-01, !dbg !25
|
194 |
+
%125 = fmul float %123, 0x3FF7154760000000, !dbg !26
|
195 |
+
%126 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %125) #4, !dbg !26
|
196 |
+
%127 = fmul float %124, 0x3FF7154760000000, !dbg !26
|
197 |
+
%128 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %127) #4, !dbg !26
|
198 |
+
%129 = fmul float %126, 0x3FD9884540000000, !dbg !27
|
199 |
+
%130 = fmul float %128, 0x3FD9884540000000, !dbg !27
|
200 |
+
%131 = fmul float %21, %129, !dbg !28
|
201 |
+
%132 = fmul float %22, %130, !dbg !28
|
202 |
+
%133 = fadd float %119, %131, !dbg !29
|
203 |
+
%134 = fadd float %120, %132, !dbg !29
|
204 |
+
%135 = fmul float %15, %133, !dbg !30
|
205 |
+
%136 = fmul float %16, %134, !dbg !30
|
206 |
+
%137 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %135) #4, !dbg !31
|
207 |
+
%138 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %136) #4, !dbg !31
|
208 |
+
%139 = insertelement <2 x i16> undef, i16 %137, i64 0, !dbg !31
|
209 |
+
%140 = insertelement <2 x i16> %139, i16 %138, i64 1, !dbg !31
|
210 |
+
%141 = bitcast <2 x i16> %140 to i32, !dbg !31
|
211 |
+
tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %141, ptr addrspace(1) %11, i1 true) #4, !dbg !31
|
212 |
+
ret void, !dbg !32
|
213 |
+
}
|
214 |
+
|
215 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
216 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
217 |
+
|
218 |
+
; Function Attrs: alwaysinline nounwind
|
219 |
+
define float @__nv_erff(float %a) local_unnamed_addr #1 {
|
220 |
+
__nv_fabsf.exit:
|
221 |
+
%0 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
|
222 |
+
%.not = icmp eq i32 %0, 0
|
223 |
+
%1 = tail call float @llvm.nvvm.fabs.ftz.f(float %a) #4
|
224 |
+
%2 = tail call float @llvm.nvvm.fabs.f(float %a) #4
|
225 |
+
%.0 = select i1 %.not, float %2, float %1
|
226 |
+
%3 = fcmp oge float %.0, 0x3FF00C1FC0000000
|
227 |
+
br i1 %3, label %__nv_fabsf.exit1, label %5
|
228 |
+
|
229 |
+
__nv_fabsf.exit1: ; preds = %__nv_fabsf.exit
|
230 |
+
%4 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
|
231 |
+
%.not1 = icmp eq i32 %4, 0
|
232 |
+
%.01 = select i1 %.not1, float %2, float %1
|
233 |
+
br label %__internal_fmad.exit
|
234 |
+
|
235 |
+
5: ; preds = %__nv_fabsf.exit
|
236 |
+
%6 = fmul float %a, %a
|
237 |
+
br label %__internal_fmad.exit
|
238 |
+
|
239 |
+
__internal_fmad.exit: ; preds = %5, %__nv_fabsf.exit1
|
240 |
+
%7 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1 ], [ 0x3FC06EBA60000000, %5 ]
|
241 |
+
%8 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1 ], [ 0xBFD8127580000000, %5 ]
|
242 |
+
%9 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1 ], [ 0x3FBCE315E0000000, %5 ]
|
243 |
+
%10 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1 ], [ 0xBF9B837CE0000000, %5 ]
|
244 |
+
%11 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1 ], [ 0x3F755ABD40000000, %5 ]
|
245 |
+
%12 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1 ], [ 0xBF4AE9A400000000, %5 ]
|
246 |
+
%13 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1 ], [ 0x3F163D2D40000000, %5 ]
|
247 |
+
%14 = phi float [ %.01, %__nv_fabsf.exit1 ], [ %6, %5 ]
|
248 |
+
%15 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
|
249 |
+
%.not2 = icmp eq i32 %15, 0
|
250 |
+
%16 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %13, float %14, float %12) #4
|
251 |
+
%17 = tail call float @llvm.nvvm.fma.rn.f(float %13, float %14, float %12) #4
|
252 |
+
%.02 = select i1 %.not2, float %17, float %16
|
253 |
+
%18 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
|
254 |
+
%.not3 = icmp eq i32 %18, 0
|
255 |
+
%19 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02, float %14, float %11) #4
|
256 |
+
%20 = tail call float @llvm.nvvm.fma.rn.f(float %.02, float %14, float %11) #4
|
257 |
+
%.03 = select i1 %.not3, float %20, float %19
|
258 |
+
%21 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
|
259 |
+
%.not4 = icmp eq i32 %21, 0
|
260 |
+
%22 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03, float %14, float %10) #4
|
261 |
+
%23 = tail call float @llvm.nvvm.fma.rn.f(float %.03, float %14, float %10) #4
|
262 |
+
%.04 = select i1 %.not4, float %23, float %22
|
263 |
+
%24 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
|
264 |
+
%.not5 = icmp eq i32 %24, 0
|
265 |
+
%25 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04, float %14, float %9) #4
|
266 |
+
%26 = tail call float @llvm.nvvm.fma.rn.f(float %.04, float %14, float %9) #4
|
267 |
+
%.05 = select i1 %.not5, float %26, float %25
|
268 |
+
%27 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
|
269 |
+
%.not6 = icmp eq i32 %27, 0
|
270 |
+
%28 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05, float %14, float %8) #4
|
271 |
+
%29 = tail call float @llvm.nvvm.fma.rn.f(float %.05, float %14, float %8) #4
|
272 |
+
%.06 = select i1 %.not6, float %29, float %28
|
273 |
+
%30 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
|
274 |
+
%.not7 = icmp eq i32 %30, 0
|
275 |
+
%31 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06, float %14, float %7) #4
|
276 |
+
%32 = tail call float @llvm.nvvm.fma.rn.f(float %.06, float %14, float %7) #4
|
277 |
+
%.07 = select i1 %.not7, float %32, float %31
|
278 |
+
%33 = fneg float %14
|
279 |
+
%34 = select i1 %3, float %33, float %a
|
280 |
+
%35 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
|
281 |
+
%.not8 = icmp eq i32 %35, 0
|
282 |
+
%36 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07, float %34, float %34) #4
|
283 |
+
%37 = tail call float @llvm.nvvm.fma.rn.f(float %.07, float %34, float %34) #4
|
284 |
+
%.08 = select i1 %.not8, float %37, float %36
|
285 |
+
br i1 %3, label %38, label %46
|
286 |
+
|
287 |
+
38: ; preds = %__internal_fmad.exit
|
288 |
+
%39 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08) #4
|
289 |
+
%40 = fsub float 1.000000e+00, %39
|
290 |
+
%41 = bitcast float %40 to i32
|
291 |
+
%42 = bitcast float %a to i32
|
292 |
+
%43 = and i32 %42, -2147483648
|
293 |
+
%44 = or i32 %43, %41
|
294 |
+
%45 = bitcast i32 %44 to float
|
295 |
+
br label %46
|
296 |
+
|
297 |
+
46: ; preds = %38, %__internal_fmad.exit
|
298 |
+
%r.0 = phi float [ %45, %38 ], [ %.08, %__internal_fmad.exit ]
|
299 |
+
ret float %r.0
|
300 |
+
}
|
301 |
+
|
302 |
+
declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #2
|
303 |
+
|
304 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
305 |
+
declare float @llvm.nvvm.fabs.ftz.f(float) #0
|
306 |
+
|
307 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
308 |
+
declare float @llvm.nvvm.fabs.f(float) #0
|
309 |
+
|
310 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
311 |
+
declare float @llvm.nvvm.fma.rn.ftz.f(float, float, float) #0
|
312 |
+
|
313 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
314 |
+
declare float @llvm.nvvm.fma.rn.f(float, float, float) #0
|
315 |
+
|
316 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
|
317 |
+
declare float @llvm.nvvm.ex2.approx.ftz.f(float) #3
|
318 |
+
|
319 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
320 |
+
attributes #1 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
321 |
+
attributes #2 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
322 |
+
attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
|
323 |
+
attributes #4 = { nounwind }
|
324 |
+
|
325 |
+
!llvm.module.flags = !{!0, !1}
|
326 |
+
!llvm.dbg.cu = !{!2}
|
327 |
+
!nvvm.annotations = !{!4, !5, !5, !4}
|
328 |
+
!llvm.ident = !{!6}
|
329 |
+
|
330 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
331 |
+
!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
|
332 |
+
!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
333 |
+
!3 = !DIFile(filename: "c5jxaguxho3nhrlt5vcinnz5fevodumlpwn4wyb2vx3xrveicerl.py", directory: "/tmp/torchinductor_root/5j")
|
334 |
+
!4 = !{ptr @triton__0d1d2de, !"kernel", i32 1}
|
335 |
+
!5 = !{ptr @triton__0d1d2de, !"maxntidx", i32 256}
|
336 |
+
!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
|
337 |
+
!7 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
|
338 |
+
!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
|
339 |
+
!9 = !{}
|
340 |
+
!10 = !DILocation(line: 21, column: 36, scope: !7)
|
341 |
+
!11 = !DILocation(line: 20, column: 28, scope: !7)
|
342 |
+
!12 = !DILocation(line: 20, column: 33, scope: !7)
|
343 |
+
!13 = !DILocation(line: 21, column: 23, scope: !7)
|
344 |
+
!14 = !DILocation(line: 24, column: 34, scope: !7)
|
345 |
+
!15 = !DILocation(line: 24, column: 39, scope: !7)
|
346 |
+
!16 = !DILocation(line: 24, column: 48, scope: !7)
|
347 |
+
!17 = !DILocation(line: 25, column: 30, scope: !7)
|
348 |
+
!18 = !DILocation(line: 25, column: 35, scope: !7)
|
349 |
+
!19 = !DILocation(line: 25, column: 44, scope: !7)
|
350 |
+
!20 = !DILocation(line: 29, column: 18, scope: !7)
|
351 |
+
!21 = !DILocation(line: 30, column: 23, scope: !7)
|
352 |
+
!22 = !DILocation(line: 32, column: 18, scope: !7)
|
353 |
+
!23 = !DILocation(line: 34, column: 19, scope: !7)
|
354 |
+
!24 = !DILocation(line: 35, column: 19, scope: !7)
|
355 |
+
!25 = !DILocation(line: 37, column: 20, scope: !7)
|
356 |
+
!26 = !DILocation(line: 38, column: 19, scope: !7)
|
357 |
+
!27 = !DILocation(line: 40, column: 20, scope: !7)
|
358 |
+
!28 = !DILocation(line: 41, column: 19, scope: !7)
|
359 |
+
!29 = !DILocation(line: 42, column: 20, scope: !7)
|
360 |
+
!30 = !DILocation(line: 43, column: 19, scope: !7)
|
361 |
+
!31 = !DILocation(line: 45, column: 40, scope: !7)
|
362 |
+
!32 = !DILocation(line: 45, column: 4, scope: !7)
|
.triton/dump/4d7b96448927b8146af43cb9f39e0544/triton_.ptx
ADDED
@@ -0,0 +1,486 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2de
|
10 |
+
.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
|
11 |
+
|
12 |
+
.visible .entry triton__0d1d2de(
|
13 |
+
.param .u64 triton__0d1d2de_param_0,
|
14 |
+
.param .u64 triton__0d1d2de_param_1,
|
15 |
+
.param .u32 triton__0d1d2de_param_2
|
16 |
+
)
|
17 |
+
.maxntid 256, 1, 1
|
18 |
+
{
|
19 |
+
.reg .pred %p<10>;
|
20 |
+
.reg .b16 %rs<7>;
|
21 |
+
.reg .b32 %r<25>;
|
22 |
+
.reg .f32 %f<127>;
|
23 |
+
.reg .b64 %rd<8>;
|
24 |
+
.loc 1 18 0
|
25 |
+
$L__func_begin0:
|
26 |
+
.loc 1 18 0
|
27 |
+
|
28 |
+
ld.param.u64 %rd4, [triton__0d1d2de_param_0];
|
29 |
+
ld.param.u64 %rd5, [triton__0d1d2de_param_1];
|
30 |
+
$L__tmp0:
|
31 |
+
.loc 1 21 36
|
32 |
+
mov.u32 %r8, %tid.x;
|
33 |
+
shl.b32 %r9, %r8, 1;
|
34 |
+
and.b32 %r10, %r9, 510;
|
35 |
+
.loc 1 20 28
|
36 |
+
mov.u32 %r1, %ctaid.x;
|
37 |
+
.loc 1 20 33
|
38 |
+
shl.b32 %r11, %r1, 9;
|
39 |
+
.loc 1 21 23
|
40 |
+
or.b32 %r12, %r11, %r10;
|
41 |
+
.loc 1 24 34
|
42 |
+
mul.wide.s32 %rd6, %r12, 2;
|
43 |
+
add.s64 %rd7, %rd4, %rd6;
|
44 |
+
mov.pred %p1, -1;
|
45 |
+
.loc 1 24 39
|
46 |
+
mov.u32 %r2, 0x0;
|
47 |
+
@%p1 ld.global.b32 { %r2 }, [ %rd7 + 0 ];
|
48 |
+
.loc 1 25 30
|
49 |
+
add.s64 %rd3, %rd5, %rd6;
|
50 |
+
.loc 1 25 35
|
51 |
+
mov.u32 %r5, 0x0;
|
52 |
+
@%p1 ld.global.b32 { %r5 }, [ %rd3 + 0 ];
|
53 |
+
cvt.u16.u32 %rs3, %r5;
|
54 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r5; }
|
55 |
+
.loc 1 25 44
|
56 |
+
cvt.f32.bf16 %r6, %rs3;
|
57 |
+
mov.b32 %f3, %r6;
|
58 |
+
cvt.f32.bf16 %r7, %rs4;
|
59 |
+
mov.b32 %f4, %r7;
|
60 |
+
.loc 1 29 18
|
61 |
+
mul.f32 %f5, %f3, 0f3F3504F3;
|
62 |
+
.loc 1 30 23
|
63 |
+
abs.ftz.f32 %f7, %f5;
|
64 |
+
setp.ge.f32 %p3, %f7, 0f3F8060FE;
|
65 |
+
mov.f32 %f115, 0f3789CA3C;
|
66 |
+
mov.f32 %f114, 0fB9F560B9;
|
67 |
+
mov.f32 %f113, 0f3BAC840B;
|
68 |
+
mov.f32 %f112, 0fBD0C8162;
|
69 |
+
mov.f32 %f111, 0f3E1CF906;
|
70 |
+
mov.f32 %f110, 0f3F6A937E;
|
71 |
+
mov.f32 %f109, 0f3F20D842;
|
72 |
+
mov.f32 %f116, %f7;
|
73 |
+
@%p3 bra $L__BB0_2;
|
74 |
+
.loc 1 0 23
|
75 |
+
mov.f32 %f115, 0f38B1E96A;
|
76 |
+
mov.f32 %f114, 0fBA574D20;
|
77 |
+
mov.f32 %f113, 0f3BAAD5EA;
|
78 |
+
mov.f32 %f112, 0fBCDC1BE7;
|
79 |
+
mov.f32 %f111, 0f3DE718AF;
|
80 |
+
mov.f32 %f110, 0fBEC093AC;
|
81 |
+
mov.f32 %f109, 0f3E0375D3;
|
82 |
+
.loc 1 30 23
|
83 |
+
mul.f32 %f116, %f5, %f5;
|
84 |
+
$L__BB0_2:
|
85 |
+
.loc 1 0 0
|
86 |
+
cvt.u16.u32 %rs1, %r2;
|
87 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; }
|
88 |
+
mul.f32 %f6, %f4, 0f3F3504F3;
|
89 |
+
.loc 1 30 23
|
90 |
+
setp.ltu.f32 %p4, %f7, 0f3F8060FE;
|
91 |
+
fma.rn.ftz.f32 %f47, %f115, %f116, %f114;
|
92 |
+
fma.rn.ftz.f32 %f48, %f47, %f116, %f113;
|
93 |
+
fma.rn.ftz.f32 %f49, %f48, %f116, %f112;
|
94 |
+
fma.rn.ftz.f32 %f50, %f49, %f116, %f111;
|
95 |
+
fma.rn.ftz.f32 %f51, %f50, %f116, %f110;
|
96 |
+
fma.rn.ftz.f32 %f52, %f51, %f116, %f109;
|
97 |
+
neg.f32 %f53, %f116;
|
98 |
+
selp.f32 %f54, %f53, %f5, %p3;
|
99 |
+
fma.rn.ftz.f32 %f117, %f52, %f54, %f54;
|
100 |
+
mov.f32 %f108, 0f3F800000;
|
101 |
+
@%p4 bra $L__BB0_4;
|
102 |
+
ex2.approx.ftz.f32 %f55, %f117;
|
103 |
+
sub.f32 %f57, %f108, %f55;
|
104 |
+
mov.b32 %r13, %f57;
|
105 |
+
mov.b32 %r14, %f5;
|
106 |
+
and.b32 %r15, %r14, -2147483648;
|
107 |
+
or.b32 %r16, %r15, %r13;
|
108 |
+
mov.b32 %f117, %r16;
|
109 |
+
$L__BB0_4:
|
110 |
+
.loc 1 0 0
|
111 |
+
cvt.f32.bf16 %r3, %rs1;
|
112 |
+
cvt.f32.bf16 %r4, %rs2;
|
113 |
+
.loc 1 30 23
|
114 |
+
abs.ftz.f32 %f20, %f6;
|
115 |
+
setp.ge.f32 %p6, %f20, 0f3F8060FE;
|
116 |
+
mov.f32 %f124, 0f3789CA3C;
|
117 |
+
mov.f32 %f123, 0fB9F560B9;
|
118 |
+
mov.f32 %f122, 0f3BAC840B;
|
119 |
+
mov.f32 %f121, 0fBD0C8162;
|
120 |
+
mov.f32 %f120, 0f3E1CF906;
|
121 |
+
mov.f32 %f119, 0f3F6A937E;
|
122 |
+
mov.f32 %f118, 0f3F20D842;
|
123 |
+
mov.f32 %f125, %f20;
|
124 |
+
@%p6 bra $L__BB0_6;
|
125 |
+
mul.f32 %f125, %f6, %f6;
|
126 |
+
mov.f32 %f124, 0f38B1E96A;
|
127 |
+
mov.f32 %f123, 0fBA574D20;
|
128 |
+
mov.f32 %f122, 0f3BAAD5EA;
|
129 |
+
mov.f32 %f121, 0fBCDC1BE7;
|
130 |
+
mov.f32 %f120, 0f3DE718AF;
|
131 |
+
mov.f32 %f119, 0fBEC093AC;
|
132 |
+
mov.f32 %f118, 0f3E0375D3;
|
133 |
+
$L__BB0_6:
|
134 |
+
.loc 1 0 0
|
135 |
+
mov.b32 %f1, %r3;
|
136 |
+
mov.b32 %f2, %r4;
|
137 |
+
.loc 1 30 23
|
138 |
+
setp.ltu.f32 %p7, %f20, 0f3F8060FE;
|
139 |
+
fma.rn.ftz.f32 %f72, %f124, %f125, %f123;
|
140 |
+
fma.rn.ftz.f32 %f73, %f72, %f125, %f122;
|
141 |
+
fma.rn.ftz.f32 %f74, %f73, %f125, %f121;
|
142 |
+
fma.rn.ftz.f32 %f75, %f74, %f125, %f120;
|
143 |
+
fma.rn.ftz.f32 %f76, %f75, %f125, %f119;
|
144 |
+
fma.rn.ftz.f32 %f77, %f76, %f125, %f118;
|
145 |
+
neg.f32 %f78, %f125;
|
146 |
+
selp.f32 %f79, %f78, %f6, %p6;
|
147 |
+
fma.rn.ftz.f32 %f126, %f77, %f79, %f79;
|
148 |
+
@%p7 bra $L__BB0_8;
|
149 |
+
ex2.approx.ftz.f32 %f80, %f126;
|
150 |
+
sub.f32 %f82, %f108, %f80;
|
151 |
+
mov.b32 %r17, %f82;
|
152 |
+
mov.b32 %r18, %f6;
|
153 |
+
and.b32 %r19, %r18, -2147483648;
|
154 |
+
or.b32 %r20, %r19, %r17;
|
155 |
+
mov.b32 %f126, %r20;
|
156 |
+
$L__BB0_8:
|
157 |
+
.loc 1 32 18
|
158 |
+
add.f32 %f87, %f117, 0f3F800000;
|
159 |
+
add.f32 %f88, %f126, 0f3F800000;
|
160 |
+
.loc 1 35 19
|
161 |
+
mul.f32 %f89, %f3, %f3;
|
162 |
+
mul.f32 %f90, %f4, %f4;
|
163 |
+
.loc 1 37 20
|
164 |
+
mul.f32 %f91, %f89, 0fBF000000;
|
165 |
+
mul.f32 %f92, %f90, 0fBF000000;
|
166 |
+
.loc 1 38 19
|
167 |
+
mul.f32 %f84, %f91, 0f3FB8AA3B;
|
168 |
+
ex2.approx.f32 %f83, %f84;
|
169 |
+
mul.f32 %f86, %f92, 0f3FB8AA3B;
|
170 |
+
ex2.approx.f32 %f85, %f86;
|
171 |
+
.loc 1 40 20
|
172 |
+
mul.f32 %f93, %f83, 0f3ECC422A;
|
173 |
+
mul.f32 %f94, %f85, 0f3ECC422A;
|
174 |
+
.loc 1 41 19
|
175 |
+
mul.f32 %f95, %f3, %f93;
|
176 |
+
mul.f32 %f96, %f4, %f94;
|
177 |
+
.loc 1 42 20
|
178 |
+
fma.rn.f32 %f97, %f87, 0f3F000000, %f95;
|
179 |
+
fma.rn.f32 %f98, %f88, 0f3F000000, %f96;
|
180 |
+
.loc 1 43 19
|
181 |
+
mul.f32 %f99, %f1, %f97;
|
182 |
+
mul.f32 %f100, %f2, %f98;
|
183 |
+
.loc 1 45 40
|
184 |
+
mov.b32 %r21, %f99;
|
185 |
+
cvt.rn.bf16.f32 %rs5, %r21;
|
186 |
+
mov.b32 %r22, %f100;
|
187 |
+
cvt.rn.bf16.f32 %rs6, %r22;
|
188 |
+
mov.b32 %r24, {%rs5, %rs6};
|
189 |
+
@%p1 st.global.b32 [ %rd7 + 0 ], { %r24 };
|
190 |
+
.loc 1 45 4
|
191 |
+
ret;
|
192 |
+
$L__tmp1:
|
193 |
+
$L__func_end0:
|
194 |
+
|
195 |
+
}
|
196 |
+
// .globl __nv_erff
|
197 |
+
.visible .func (.param .b32 func_retval0) __nv_erff(
|
198 |
+
.param .b32 __nv_erff_param_0
|
199 |
+
)
|
200 |
+
{
|
201 |
+
.reg .pred %p<4>;
|
202 |
+
.reg .b32 %r<5>;
|
203 |
+
.reg .f32 %f<49>;
|
204 |
+
$L__func_begin1:
|
205 |
+
|
206 |
+
ld.param.f32 %f14, [__nv_erff_param_0];
|
207 |
+
abs.ftz.f32 %f1, %f14;
|
208 |
+
setp.ge.f32 %p1, %f1, 0f3F8060FE;
|
209 |
+
mov.f32 %f46, 0f3789CA3C;
|
210 |
+
mov.f32 %f45, 0fB9F560B9;
|
211 |
+
mov.f32 %f44, 0f3BAC840B;
|
212 |
+
mov.f32 %f43, 0fBD0C8162;
|
213 |
+
mov.f32 %f42, 0f3E1CF906;
|
214 |
+
mov.f32 %f41, 0f3F6A937E;
|
215 |
+
mov.f32 %f40, 0f3F20D842;
|
216 |
+
mov.f32 %f47, %f1;
|
217 |
+
@%p1 bra $L__BB1_2;
|
218 |
+
mul.f32 %f47, %f14, %f14;
|
219 |
+
mov.f32 %f46, 0f38B1E96A;
|
220 |
+
mov.f32 %f45, 0fBA574D20;
|
221 |
+
mov.f32 %f44, 0f3BAAD5EA;
|
222 |
+
mov.f32 %f43, 0fBCDC1BE7;
|
223 |
+
mov.f32 %f42, 0f3DE718AF;
|
224 |
+
mov.f32 %f41, 0fBEC093AC;
|
225 |
+
mov.f32 %f40, 0f3E0375D3;
|
226 |
+
$L__BB1_2:
|
227 |
+
setp.ltu.f32 %p2, %f1, 0f3F8060FE;
|
228 |
+
fma.rn.ftz.f32 %f29, %f46, %f47, %f45;
|
229 |
+
fma.rn.ftz.f32 %f30, %f29, %f47, %f44;
|
230 |
+
fma.rn.ftz.f32 %f31, %f30, %f47, %f43;
|
231 |
+
fma.rn.ftz.f32 %f32, %f31, %f47, %f42;
|
232 |
+
fma.rn.ftz.f32 %f33, %f32, %f47, %f41;
|
233 |
+
fma.rn.ftz.f32 %f34, %f33, %f47, %f40;
|
234 |
+
neg.f32 %f35, %f47;
|
235 |
+
selp.f32 %f36, %f35, %f14, %p1;
|
236 |
+
fma.rn.ftz.f32 %f48, %f34, %f36, %f36;
|
237 |
+
@%p2 bra $L__BB1_4;
|
238 |
+
ex2.approx.ftz.f32 %f37, %f48;
|
239 |
+
mov.f32 %f38, 0f3F800000;
|
240 |
+
sub.f32 %f39, %f38, %f37;
|
241 |
+
mov.b32 %r1, %f39;
|
242 |
+
mov.b32 %r2, %f14;
|
243 |
+
and.b32 %r3, %r2, -2147483648;
|
244 |
+
or.b32 %r4, %r3, %r1;
|
245 |
+
mov.b32 %f48, %r4;
|
246 |
+
$L__BB1_4:
|
247 |
+
st.param.f32 [func_retval0+0], %f48;
|
248 |
+
ret;
|
249 |
+
$L__func_end1:
|
250 |
+
|
251 |
+
}
|
252 |
+
.file 1 "/tmp/torchinductor_root/5j/c5jxaguxho3nhrlt5vcinnz5fevodumlpwn4wyb2vx3xrveicerl.py"
|
253 |
+
.section .debug_abbrev
|
254 |
+
{
|
255 |
+
.b8 1
|
256 |
+
.b8 17
|
257 |
+
.b8 1
|
258 |
+
.b8 37
|
259 |
+
.b8 8
|
260 |
+
.b8 19
|
261 |
+
.b8 5
|
262 |
+
.b8 3
|
263 |
+
.b8 8
|
264 |
+
.b8 16
|
265 |
+
.b8 6
|
266 |
+
.b8 27
|
267 |
+
.b8 8
|
268 |
+
.b8 180
|
269 |
+
.b8 66
|
270 |
+
.b8 12
|
271 |
+
.b8 17
|
272 |
+
.b8 1
|
273 |
+
.b8 18
|
274 |
+
.b8 1
|
275 |
+
.b8 0
|
276 |
+
.b8 0
|
277 |
+
.b8 2
|
278 |
+
.b8 46
|
279 |
+
.b8 0
|
280 |
+
.b8 17
|
281 |
+
.b8 1
|
282 |
+
.b8 18
|
283 |
+
.b8 1
|
284 |
+
.b8 64
|
285 |
+
.b8 10
|
286 |
+
.b8 135
|
287 |
+
.b8 64
|
288 |
+
.b8 8
|
289 |
+
.b8 3
|
290 |
+
.b8 8
|
291 |
+
.b8 58
|
292 |
+
.b8 11
|
293 |
+
.b8 59
|
294 |
+
.b8 11
|
295 |
+
.b8 63
|
296 |
+
.b8 12
|
297 |
+
.b8 0
|
298 |
+
.b8 0
|
299 |
+
.b8 0
|
300 |
+
}
|
301 |
+
.section .debug_info
|
302 |
+
{
|
303 |
+
.b32 176
|
304 |
+
.b8 2
|
305 |
+
.b8 0
|
306 |
+
.b32 .debug_abbrev
|
307 |
+
.b8 8
|
308 |
+
.b8 1
|
309 |
+
.b8 116
|
310 |
+
.b8 114
|
311 |
+
.b8 105
|
312 |
+
.b8 116
|
313 |
+
.b8 111
|
314 |
+
.b8 110
|
315 |
+
.b8 0
|
316 |
+
.b8 2
|
317 |
+
.b8 0
|
318 |
+
.b8 99
|
319 |
+
.b8 53
|
320 |
+
.b8 106
|
321 |
+
.b8 120
|
322 |
+
.b8 97
|
323 |
+
.b8 103
|
324 |
+
.b8 117
|
325 |
+
.b8 120
|
326 |
+
.b8 104
|
327 |
+
.b8 111
|
328 |
+
.b8 51
|
329 |
+
.b8 110
|
330 |
+
.b8 104
|
331 |
+
.b8 114
|
332 |
+
.b8 108
|
333 |
+
.b8 116
|
334 |
+
.b8 53
|
335 |
+
.b8 118
|
336 |
+
.b8 99
|
337 |
+
.b8 105
|
338 |
+
.b8 110
|
339 |
+
.b8 110
|
340 |
+
.b8 122
|
341 |
+
.b8 53
|
342 |
+
.b8 102
|
343 |
+
.b8 101
|
344 |
+
.b8 118
|
345 |
+
.b8 111
|
346 |
+
.b8 100
|
347 |
+
.b8 117
|
348 |
+
.b8 109
|
349 |
+
.b8 108
|
350 |
+
.b8 112
|
351 |
+
.b8 119
|
352 |
+
.b8 110
|
353 |
+
.b8 52
|
354 |
+
.b8 119
|
355 |
+
.b8 121
|
356 |
+
.b8 98
|
357 |
+
.b8 50
|
358 |
+
.b8 118
|
359 |
+
.b8 120
|
360 |
+
.b8 51
|
361 |
+
.b8 120
|
362 |
+
.b8 114
|
363 |
+
.b8 118
|
364 |
+
.b8 101
|
365 |
+
.b8 105
|
366 |
+
.b8 99
|
367 |
+
.b8 101
|
368 |
+
.b8 114
|
369 |
+
.b8 108
|
370 |
+
.b8 46
|
371 |
+
.b8 112
|
372 |
+
.b8 121
|
373 |
+
.b8 0
|
374 |
+
.b32 .debug_line
|
375 |
+
.b8 47
|
376 |
+
.b8 116
|
377 |
+
.b8 109
|
378 |
+
.b8 112
|
379 |
+
.b8 47
|
380 |
+
.b8 116
|
381 |
+
.b8 111
|
382 |
+
.b8 114
|
383 |
+
.b8 99
|
384 |
+
.b8 104
|
385 |
+
.b8 105
|
386 |
+
.b8 110
|
387 |
+
.b8 100
|
388 |
+
.b8 117
|
389 |
+
.b8 99
|
390 |
+
.b8 116
|
391 |
+
.b8 111
|
392 |
+
.b8 114
|
393 |
+
.b8 95
|
394 |
+
.b8 114
|
395 |
+
.b8 111
|
396 |
+
.b8 111
|
397 |
+
.b8 116
|
398 |
+
.b8 47
|
399 |
+
.b8 53
|
400 |
+
.b8 106
|
401 |
+
.b8 0
|
402 |
+
.b8 1
|
403 |
+
.b64 $L__func_begin0
|
404 |
+
.b64 $L__func_end0
|
405 |
+
.b8 2
|
406 |
+
.b64 $L__func_begin0
|
407 |
+
.b64 $L__func_end0
|
408 |
+
.b8 1
|
409 |
+
.b8 156
|
410 |
+
.b8 116
|
411 |
+
.b8 114
|
412 |
+
.b8 105
|
413 |
+
.b8 116
|
414 |
+
.b8 111
|
415 |
+
.b8 110
|
416 |
+
.b8 95
|
417 |
+
.b8 95
|
418 |
+
.b8 48
|
419 |
+
.b8 100
|
420 |
+
.b8 49
|
421 |
+
.b8 100
|
422 |
+
.b8 50
|
423 |
+
.b8 100
|
424 |
+
.b8 101
|
425 |
+
.b8 0
|
426 |
+
.b8 116
|
427 |
+
.b8 114
|
428 |
+
.b8 105
|
429 |
+
.b8 116
|
430 |
+
.b8 111
|
431 |
+
.b8 110
|
432 |
+
.b8 95
|
433 |
+
.b8 95
|
434 |
+
.b8 48
|
435 |
+
.b8 100
|
436 |
+
.b8 49
|
437 |
+
.b8 100
|
438 |
+
.b8 50
|
439 |
+
.b8 100
|
440 |
+
.b8 101
|
441 |
+
.b8 0
|
442 |
+
.b8 1
|
443 |
+
.b8 18
|
444 |
+
.b8 1
|
445 |
+
.b8 0
|
446 |
+
}
|
447 |
+
.section .debug_pubnames
|
448 |
+
{
|
449 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
450 |
+
$L__pubNames_start0:
|
451 |
+
.b8 2
|
452 |
+
.b8 0
|
453 |
+
.b32 .debug_info
|
454 |
+
.b32 180
|
455 |
+
.b32 125
|
456 |
+
.b8 116
|
457 |
+
.b8 114
|
458 |
+
.b8 105
|
459 |
+
.b8 116
|
460 |
+
.b8 111
|
461 |
+
.b8 110
|
462 |
+
.b8 95
|
463 |
+
.b8 95
|
464 |
+
.b8 48
|
465 |
+
.b8 100
|
466 |
+
.b8 49
|
467 |
+
.b8 100
|
468 |
+
.b8 50
|
469 |
+
.b8 100
|
470 |
+
.b8 101
|
471 |
+
.b8 0
|
472 |
+
.b32 0
|
473 |
+
$L__pubNames_end0:
|
474 |
+
}
|
475 |
+
.section .debug_pubtypes
|
476 |
+
{
|
477 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
478 |
+
$L__pubTypes_start0:
|
479 |
+
.b8 2
|
480 |
+
.b8 0
|
481 |
+
.b32 .debug_info
|
482 |
+
.b32 180
|
483 |
+
.b32 0
|
484 |
+
$L__pubTypes_end0:
|
485 |
+
}
|
486 |
+
.section .debug_loc { }
|
.triton/dump/4d7b96448927b8146af43cb9f39e0544/triton_.ttgir
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
2 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
3 |
+
tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
4 |
+
%cst = arith.constant dense<0.398942292> : tensor<512xf32, #blocked>
|
5 |
+
%cst_0 = arith.constant dense<-5.000000e-01> : tensor<512xf32, #blocked>
|
6 |
+
%cst_1 = arith.constant dense<5.000000e-01> : tensor<512xf32, #blocked>
|
7 |
+
%cst_2 = arith.constant dense<1.000000e+00> : tensor<512xf32, #blocked>
|
8 |
+
%cst_3 = arith.constant dense<0.707106769> : tensor<512xf32, #blocked>
|
9 |
+
%c512_i32 = arith.constant 512 : i32
|
10 |
+
%0 = tt.get_program_id x : i32
|
11 |
+
%1 = arith.muli %0, %c512_i32 : i32
|
12 |
+
%2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked>
|
13 |
+
%3 = tt.splat %1 : (i32) -> tensor<512xi32, #blocked>
|
14 |
+
%4 = arith.addi %3, %2 : tensor<512xi32, #blocked>
|
15 |
+
%5 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<512x!tt.ptr<bf16, 1>, #blocked>
|
16 |
+
%6 = tt.addptr %5, %4 : tensor<512x!tt.ptr<bf16, 1>, #blocked>, tensor<512xi32, #blocked>
|
17 |
+
%7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xbf16, #blocked>
|
18 |
+
%8 = arith.extf %7 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked>
|
19 |
+
%9 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<512x!tt.ptr<bf16, 1>, #blocked>
|
20 |
+
%10 = tt.addptr %9, %4 : tensor<512x!tt.ptr<bf16, 1>, #blocked>, tensor<512xi32, #blocked>
|
21 |
+
%11 = tt.load %10 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xbf16, #blocked>
|
22 |
+
%12 = arith.extf %11 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked>
|
23 |
+
%13 = arith.mulf %12, %cst_3 : tensor<512xf32, #blocked>
|
24 |
+
%14 = tt.extern_elementwise %13 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_erff"} : (tensor<512xf32, #blocked>) -> tensor<512xf32, #blocked>
|
25 |
+
%15 = arith.addf %14, %cst_2 : tensor<512xf32, #blocked>
|
26 |
+
%16 = arith.mulf %15, %cst_1 : tensor<512xf32, #blocked>
|
27 |
+
%17 = arith.mulf %12, %12 : tensor<512xf32, #blocked>
|
28 |
+
%18 = arith.mulf %17, %cst_0 : tensor<512xf32, #blocked>
|
29 |
+
%19 = math.exp %18 : tensor<512xf32, #blocked>
|
30 |
+
%20 = arith.mulf %19, %cst : tensor<512xf32, #blocked>
|
31 |
+
%21 = arith.mulf %12, %20 : tensor<512xf32, #blocked>
|
32 |
+
%22 = arith.addf %16, %21 : tensor<512xf32, #blocked>
|
33 |
+
%23 = arith.mulf %8, %22 : tensor<512xf32, #blocked>
|
34 |
+
%24 = arith.truncf %23 : tensor<512xf32, #blocked> to tensor<512xbf16, #blocked>
|
35 |
+
tt.store %6, %24 {cache = 1 : i32, evict = 1 : i32} : tensor<512xbf16, #blocked>
|
36 |
+
tt.return
|
37 |
+
}
|
38 |
+
}
|
.triton/dump/4d7b96448927b8146af43cb9f39e0544/triton_.ttir
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
3 |
+
%cst = arith.constant dense<0.398942292> : tensor<512xf32>
|
4 |
+
%cst_0 = arith.constant dense<-5.000000e-01> : tensor<512xf32>
|
5 |
+
%cst_1 = arith.constant dense<5.000000e-01> : tensor<512xf32>
|
6 |
+
%cst_2 = arith.constant dense<1.000000e+00> : tensor<512xf32>
|
7 |
+
%cst_3 = arith.constant dense<0.707106769> : tensor<512xf32>
|
8 |
+
%c512_i32 = arith.constant 512 : i32
|
9 |
+
%0 = tt.get_program_id x : i32
|
10 |
+
%1 = arith.muli %0, %c512_i32 : i32
|
11 |
+
%2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32>
|
12 |
+
%3 = tt.splat %1 : (i32) -> tensor<512xi32>
|
13 |
+
%4 = arith.addi %3, %2 : tensor<512xi32>
|
14 |
+
%5 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<512x!tt.ptr<bf16, 1>>
|
15 |
+
%6 = tt.addptr %5, %4 : tensor<512x!tt.ptr<bf16, 1>>, tensor<512xi32>
|
16 |
+
%7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xbf16>
|
17 |
+
%8 = arith.extf %7 : tensor<512xbf16> to tensor<512xf32>
|
18 |
+
%9 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<512x!tt.ptr<bf16, 1>>
|
19 |
+
%10 = tt.addptr %9, %4 : tensor<512x!tt.ptr<bf16, 1>>, tensor<512xi32>
|
20 |
+
%11 = tt.load %10 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xbf16>
|
21 |
+
%12 = arith.extf %11 : tensor<512xbf16> to tensor<512xf32>
|
22 |
+
%13 = arith.mulf %12, %cst_3 : tensor<512xf32>
|
23 |
+
%14 = tt.extern_elementwise %13 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_erff"} : (tensor<512xf32>) -> tensor<512xf32>
|
24 |
+
%15 = arith.addf %14, %cst_2 : tensor<512xf32>
|
25 |
+
%16 = arith.mulf %15, %cst_1 : tensor<512xf32>
|
26 |
+
%17 = arith.mulf %12, %12 : tensor<512xf32>
|
27 |
+
%18 = arith.mulf %17, %cst_0 : tensor<512xf32>
|
28 |
+
%19 = math.exp %18 : tensor<512xf32>
|
29 |
+
%20 = arith.mulf %19, %cst : tensor<512xf32>
|
30 |
+
%21 = arith.mulf %12, %20 : tensor<512xf32>
|
31 |
+
%22 = arith.addf %16, %21 : tensor<512xf32>
|
32 |
+
%23 = arith.mulf %8, %22 : tensor<512xf32>
|
33 |
+
%24 = arith.truncf %23 : tensor<512xf32> to tensor<512xbf16>
|
34 |
+
tt.store %6, %24 {cache = 1 : i32, evict = 1 : i32} : tensor<512xbf16>
|
35 |
+
tt.return
|
36 |
+
}
|
37 |
+
}
|
.triton/dump/645565eaba0a18dd23ef200fe9abb0c0/triton_.cubin
ADDED
Binary file (19.5 kB). View file
|
|
.triton/dump/645565eaba0a18dd23ef200fe9abb0c0/triton_.ptx
ADDED
@@ -0,0 +1,834 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2d3d4d5d6d7d8de9de
|
10 |
+
.extern .func __assertfail
|
11 |
+
(
|
12 |
+
.param .b64 __assertfail_param_0,
|
13 |
+
.param .b64 __assertfail_param_1,
|
14 |
+
.param .b32 __assertfail_param_2,
|
15 |
+
.param .b64 __assertfail_param_3,
|
16 |
+
.param .b64 __assertfail_param_4
|
17 |
+
)
|
18 |
+
;
|
19 |
+
.global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
|
20 |
+
.global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
|
21 |
+
.global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55};
|
22 |
+
.extern .shared .align 1 .b8 global_smem[];
|
23 |
+
.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
|
24 |
+
|
25 |
+
.visible .entry triton__0d1d2d3d4d5d6d7d8de9de(
|
26 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_0,
|
27 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_1,
|
28 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_2,
|
29 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_3,
|
30 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_4,
|
31 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_5,
|
32 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_6,
|
33 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_7,
|
34 |
+
.param .u32 triton__0d1d2d3d4d5d6d7d8de9de_param_8,
|
35 |
+
.param .u32 triton__0d1d2d3d4d5d6d7d8de9de_param_9
|
36 |
+
)
|
37 |
+
.maxntid 64, 1, 1
|
38 |
+
{
|
39 |
+
.reg .pred %p<36>;
|
40 |
+
.reg .b16 %rs<5>;
|
41 |
+
.reg .b32 %r<109>;
|
42 |
+
.reg .f32 %f<70>;
|
43 |
+
.reg .b64 %rd<49>;
|
44 |
+
.loc 1 18 0
|
45 |
+
$L__func_begin0:
|
46 |
+
.loc 1 18 0
|
47 |
+
|
48 |
+
ld.param.u64 %rd8, [triton__0d1d2d3d4d5d6d7d8de9de_param_7];
|
49 |
+
ld.param.u64 %rd7, [triton__0d1d2d3d4d5d6d7d8de9de_param_6];
|
50 |
+
ld.param.u64 %rd6, [triton__0d1d2d3d4d5d6d7d8de9de_param_5];
|
51 |
+
ld.param.u64 %rd5, [triton__0d1d2d3d4d5d6d7d8de9de_param_2];
|
52 |
+
ld.param.u64 %rd4, [triton__0d1d2d3d4d5d6d7d8de9de_param_0];
|
53 |
+
$L__tmp0:
|
54 |
+
.loc 1 26 26
|
55 |
+
mov.u32 %r1, %tid.x;
|
56 |
+
ld.param.u64 %rd21, [triton__0d1d2d3d4d5d6d7d8de9de_param_1];
|
57 |
+
and.b32 %r2, %r1, 63;
|
58 |
+
shl.b32 %r28, %r2, 2;
|
59 |
+
ld.param.u64 %rd22, [triton__0d1d2d3d4d5d6d7d8de9de_param_3];
|
60 |
+
ld.param.u64 %rd23, [triton__0d1d2d3d4d5d6d7d8de9de_param_4];
|
61 |
+
.loc 1 23 28
|
62 |
+
mov.u32 %r11, %ctaid.x;
|
63 |
+
.loc 1 30 18
|
64 |
+
shr.s32 %r29, %r11, 31;
|
65 |
+
shr.u32 %r30, %r29, 23;
|
66 |
+
add.s32 %r31, %r11, %r30;
|
67 |
+
and.b32 %r32, %r31, 16776704;
|
68 |
+
sub.s32 %r33, %r11, %r32;
|
69 |
+
.loc 1 31 30
|
70 |
+
cvt.s64.s32 %rd1, %r11;
|
71 |
+
mul.wide.s32 %rd24, %r11, 8;
|
72 |
+
add.s64 %rd10, %rd21, %rd24;
|
73 |
+
mov.pred %p18, -1;
|
74 |
+
.loc 1 31 35
|
75 |
+
mov.u64 %rd9, 0x0;
|
76 |
+
@%p18 ld.global.L1::evict_last.b64 { %rd9 }, [ %rd10 + 0 ];
|
77 |
+
mov.u64 %rd11, 0x0;
|
78 |
+
@%p18 ld.global.L1::evict_last.b64 { %rd11 }, [ %rd10 + 0 ];
|
79 |
+
mov.u64 %rd13, 0x0;
|
80 |
+
@%p18 ld.global.L1::evict_last.b64 { %rd13 }, [ %rd10 + 0 ];
|
81 |
+
mov.u64 %rd15, 0x0;
|
82 |
+
@%p18 ld.global.L1::evict_last.b64 { %rd15 }, [ %rd10 + 0 ];
|
83 |
+
mov.u64 %rd17, 0x0;
|
84 |
+
@%p18 ld.global.L1::evict_last.b64 { %rd17 }, [ %rd10 + 0 ];
|
85 |
+
.loc 1 32 40
|
86 |
+
shl.b32 %r34, %r33, 8;
|
87 |
+
.loc 1 32 36
|
88 |
+
or.b32 %r35, %r34, %r28;
|
89 |
+
.loc 1 32 30
|
90 |
+
mul.wide.s32 %rd25, %r35, 4;
|
91 |
+
add.s64 %rd19, %rd22, %rd25;
|
92 |
+
mov.b32 %r41, 0;
|
93 |
+
.loc 1 32 46
|
94 |
+
mov.u32 %r12, 0x0;
|
95 |
+
mov.u32 %r13, 0x0;
|
96 |
+
mov.u32 %r14, 0x0;
|
97 |
+
mov.u32 %r15, 0x0;
|
98 |
+
@%p18 ld.global.L1::evict_last.v4.b32 { %r12, %r13, %r14, %r15 }, [ %rd19 + 0 ];
|
99 |
+
@!%p18 mov.u32 %r12, %r41;
|
100 |
+
@!%p18 mov.u32 %r13, %r41;
|
101 |
+
@!%p18 mov.u32 %r14, %r41;
|
102 |
+
@!%p18 mov.u32 %r15, %r41;
|
103 |
+
.loc 1 33 31
|
104 |
+
cvt.u64.u32 %rd3, %r28;
|
105 |
+
mul.wide.u32 %rd26, %r28, 4;
|
106 |
+
add.s64 %rd20, %rd23, %rd26;
|
107 |
+
.loc 1 33 36
|
108 |
+
mov.u32 %r20, 0x0;
|
109 |
+
mov.u32 %r21, 0x0;
|
110 |
+
mov.u32 %r22, 0x0;
|
111 |
+
mov.u32 %r23, 0x0;
|
112 |
+
@%p18 ld.global.L1::evict_last.v4.b32 { %r20, %r21, %r22, %r23 }, [ %rd20 + 0 ];
|
113 |
+
@!%p18 mov.u32 %r20, %r41;
|
114 |
+
@!%p18 mov.u32 %r21, %r41;
|
115 |
+
@!%p18 mov.u32 %r22, %r41;
|
116 |
+
@!%p18 mov.u32 %r23, %r41;
|
117 |
+
.loc 1 34 18
|
118 |
+
add.s64 %rd27, %rd17, 50257;
|
119 |
+
.loc 1 35 18
|
120 |
+
setp.lt.s64 %p16, %rd17, 0;
|
121 |
+
.loc 1 36 32
|
122 |
+
selp.b64 %rd28, %rd27, %rd17, %p16;
|
123 |
+
.loc 1 37 36
|
124 |
+
setp.lt.u64 %p17, %rd28, 50257;
|
125 |
+
.loc 1 37 51
|
126 |
+
@%p17 bra $L__BB0_2;
|
127 |
+
mov.u64 %rd29, assertMessage_0;
|
128 |
+
cvta.global.u64 %rd30, %rd29;
|
129 |
+
mov.u64 %rd31, assertFile_0;
|
130 |
+
cvta.global.u64 %rd32, %rd31;
|
131 |
+
mov.u64 %rd33, assertFunc_0;
|
132 |
+
cvta.global.u64 %rd34, %rd33;
|
133 |
+
mov.b32 %r36, 883;
|
134 |
+
mov.u64 %rd35, 1;
|
135 |
+
{ // callseq 0, 0
|
136 |
+
.reg .b32 temp_param_reg;
|
137 |
+
.param .b64 param0;
|
138 |
+
st.param.b64 [param0+0], %rd30;
|
139 |
+
.param .b64 param1;
|
140 |
+
st.param.b64 [param1+0], %rd32;
|
141 |
+
.param .b32 param2;
|
142 |
+
st.param.b32 [param2+0], %r36;
|
143 |
+
.param .b64 param3;
|
144 |
+
st.param.b64 [param3+0], %rd34;
|
145 |
+
.param .b64 param4;
|
146 |
+
st.param.b64 [param4+0], %rd35;
|
147 |
+
call.uni
|
148 |
+
__assertfail,
|
149 |
+
(
|
150 |
+
param0,
|
151 |
+
param1,
|
152 |
+
param2,
|
153 |
+
param3,
|
154 |
+
param4
|
155 |
+
);
|
156 |
+
} // callseq 0
|
157 |
+
$L__BB0_2:
|
158 |
+
.loc 1 35 18
|
159 |
+
setp.lt.s64 %p33, %rd9, 0;
|
160 |
+
.loc 1 26 26
|
161 |
+
and.b32 %r75, %r1, 31;
|
162 |
+
.loc 1 38 40
|
163 |
+
shl.b64 %rd41, %rd9, 8;
|
164 |
+
add.s64 %rd42, %rd41, 12865792;
|
165 |
+
selp.b64 %rd43, %rd42, %rd41, %p33;
|
166 |
+
.loc 1 38 36
|
167 |
+
or.b64 %rd44, %rd43, %rd3;
|
168 |
+
.loc 1 38 30
|
169 |
+
shl.b64 %rd45, %rd44, 2;
|
170 |
+
add.s64 %rd36, %rd5, %rd45;
|
171 |
+
.loc 1 38 48
|
172 |
+
mov.u32 %r37, 0x0;
|
173 |
+
mov.u32 %r38, 0x0;
|
174 |
+
mov.u32 %r39, 0x0;
|
175 |
+
mov.u32 %r40, 0x0;
|
176 |
+
@%p18 ld.global.v4.b32 { %r37, %r38, %r39, %r40 }, [ %rd36 + 0 ];
|
177 |
+
@!%p18 mov.u32 %r37, %r41;
|
178 |
+
@!%p18 mov.u32 %r38, %r41;
|
179 |
+
@!%p18 mov.u32 %r39, %r41;
|
180 |
+
@!%p18 mov.u32 %r40, %r41;
|
181 |
+
.loc 1 32 46
|
182 |
+
mov.b32 %f1, %r12;
|
183 |
+
mov.b32 %f2, %r13;
|
184 |
+
.loc 1 38 48
|
185 |
+
mov.b32 %f3, %r37;
|
186 |
+
mov.b32 %f4, %r38;
|
187 |
+
.loc 1 39 18
|
188 |
+
add.f32 %f5, %f2, %f4;
|
189 |
+
mov.b32 %r64, %f5;
|
190 |
+
add.f32 %f6, %f1, %f3;
|
191 |
+
.loc 1 32 46
|
192 |
+
mov.b32 %f7, %r15;
|
193 |
+
mov.b32 %f8, %r14;
|
194 |
+
.loc 1 38 48
|
195 |
+
mov.b32 %f9, %r40;
|
196 |
+
mov.b32 %f10, %r39;
|
197 |
+
.loc 1 39 18
|
198 |
+
add.f32 %f11, %f8, %f10;
|
199 |
+
mov.b32 %r65, %f11;
|
200 |
+
add.f32 %f12, %f7, %f9;
|
201 |
+
$L__tmp1:
|
202 |
+
.loc 2 233 15
|
203 |
+
add.f32 %f13, %f6, %f5;
|
204 |
+
add.f32 %f14, %f11, %f13;
|
205 |
+
add.f32 %f15, %f12, %f14;
|
206 |
+
$L__tmp2:
|
207 |
+
.loc 2 243 36
|
208 |
+
mov.b32 %r76, %f15;
|
209 |
+
shfl.sync.bfly.b32 %r77, %r76, 16, 31, -1;
|
210 |
+
mov.b32 %f16, %r77;
|
211 |
+
$L__tmp3:
|
212 |
+
.loc 2 233 15
|
213 |
+
add.f32 %f17, %f15, %f16;
|
214 |
+
$L__tmp4:
|
215 |
+
.loc 2 243 36
|
216 |
+
mov.b32 %r78, %f17;
|
217 |
+
shfl.sync.bfly.b32 %r79, %r78, 8, 31, -1;
|
218 |
+
mov.b32 %f18, %r79;
|
219 |
+
$L__tmp5:
|
220 |
+
.loc 2 233 15
|
221 |
+
add.f32 %f19, %f17, %f18;
|
222 |
+
$L__tmp6:
|
223 |
+
.loc 2 243 36
|
224 |
+
mov.b32 %r80, %f19;
|
225 |
+
shfl.sync.bfly.b32 %r81, %r80, 4, 31, -1;
|
226 |
+
mov.b32 %f20, %r81;
|
227 |
+
$L__tmp7:
|
228 |
+
.loc 2 233 15
|
229 |
+
add.f32 %f21, %f19, %f20;
|
230 |
+
$L__tmp8:
|
231 |
+
.loc 2 243 36
|
232 |
+
mov.b32 %r82, %f21;
|
233 |
+
shfl.sync.bfly.b32 %r83, %r82, 2, 31, -1;
|
234 |
+
mov.b32 %f22, %r83;
|
235 |
+
$L__tmp9:
|
236 |
+
.loc 2 233 15
|
237 |
+
add.f32 %f23, %f21, %f22;
|
238 |
+
$L__tmp10:
|
239 |
+
.loc 2 243 36
|
240 |
+
mov.b32 %r84, %f23;
|
241 |
+
shfl.sync.bfly.b32 %r85, %r84, 1, 31, -1;
|
242 |
+
mov.b32 %f24, %r85;
|
243 |
+
$L__tmp11:
|
244 |
+
.loc 2 233 15
|
245 |
+
add.f32 %f25, %f23, %f24;
|
246 |
+
$L__tmp12:
|
247 |
+
.loc 2 243 36
|
248 |
+
setp.eq.s32 %p23, %r75, 0;
|
249 |
+
shr.u32 %r86, %r1, 3;
|
250 |
+
and.b32 %r87, %r86, 4;
|
251 |
+
mov.u32 %r88, global_smem;
|
252 |
+
add.s32 %r45, %r88, %r87;
|
253 |
+
mov.b32 %r46, %f25;
|
254 |
+
@%p23 st.shared.b32 [ %r45 + 0 ], %r46;
|
255 |
+
bar.sync 0;
|
256 |
+
setp.lt.s32 %p24, %r1, 2;
|
257 |
+
shl.b32 %r89, %r1, 2;
|
258 |
+
add.s32 %r48, %r88, %r89;
|
259 |
+
@%p24 ld.shared.b32 %r47, [ %r48 + 0 ];
|
260 |
+
mov.b32 %f26, %r47;
|
261 |
+
shfl.sync.bfly.b32 %r90, %r47, 1, 31, -1;
|
262 |
+
mov.b32 %f27, %r90;
|
263 |
+
$L__tmp13:
|
264 |
+
.loc 2 233 15
|
265 |
+
add.f32 %f28, %f26, %f27;
|
266 |
+
$L__tmp14:
|
267 |
+
.loc 2 243 36
|
268 |
+
and.b32 %r91, %r1, 1;
|
269 |
+
setp.eq.b32 %p34, %r91, 1;
|
270 |
+
not.pred %p35, %p34;
|
271 |
+
and.pred %p25, %p24, %p35;
|
272 |
+
mov.b32 %r50, %f28;
|
273 |
+
@%p25 st.shared.b32 [ %r48 + 0 ], %r50;
|
274 |
+
bar.sync 0;
|
275 |
+
ld.shared.f32 %f29, [global_smem];
|
276 |
+
$L__tmp15:
|
277 |
+
.loc 3 8 15
|
278 |
+
add.f32 %f30, %f29, 0f00000000;
|
279 |
+
$L__tmp16:
|
280 |
+
.loc 1 47 20
|
281 |
+
mov.b32 %r52, %f30;
|
282 |
+
mov.b32 %r53, 1132462080;
|
283 |
+
div.full.f32 %r74, %r52, %r53;
|
284 |
+
mov.b32 %f31, %r74;
|
285 |
+
.loc 1 48 19
|
286 |
+
sub.f32 %f32, %f6, %f31;
|
287 |
+
sub.f32 %f33, %f5, %f31;
|
288 |
+
sub.f32 %f34, %f11, %f31;
|
289 |
+
sub.f32 %f35, %f12, %f31;
|
290 |
+
.loc 1 49 20
|
291 |
+
mul.f32 %f36, %f33, %f33;
|
292 |
+
$L__tmp17:
|
293 |
+
.loc 2 243 36
|
294 |
+
bar.sync 0;
|
295 |
+
$L__tmp18:
|
296 |
+
.loc 2 233 15
|
297 |
+
fma.rn.f32 %f37, %f32, %f32, %f36;
|
298 |
+
fma.rn.f32 %f38, %f34, %f34, %f37;
|
299 |
+
fma.rn.f32 %f39, %f35, %f35, %f38;
|
300 |
+
$L__tmp19:
|
301 |
+
.loc 2 243 36
|
302 |
+
mov.b32 %r92, %f39;
|
303 |
+
shfl.sync.bfly.b32 %r93, %r92, 16, 31, -1;
|
304 |
+
mov.b32 %f40, %r93;
|
305 |
+
$L__tmp20:
|
306 |
+
.loc 2 233 15
|
307 |
+
add.f32 %f41, %f39, %f40;
|
308 |
+
$L__tmp21:
|
309 |
+
.loc 2 243 36
|
310 |
+
mov.b32 %r94, %f41;
|
311 |
+
shfl.sync.bfly.b32 %r95, %r94, 8, 31, -1;
|
312 |
+
mov.b32 %f42, %r95;
|
313 |
+
$L__tmp22:
|
314 |
+
.loc 2 233 15
|
315 |
+
add.f32 %f43, %f41, %f42;
|
316 |
+
$L__tmp23:
|
317 |
+
.loc 2 243 36
|
318 |
+
mov.b32 %r96, %f43;
|
319 |
+
shfl.sync.bfly.b32 %r97, %r96, 4, 31, -1;
|
320 |
+
mov.b32 %f44, %r97;
|
321 |
+
$L__tmp24:
|
322 |
+
.loc 2 233 15
|
323 |
+
add.f32 %f45, %f43, %f44;
|
324 |
+
$L__tmp25:
|
325 |
+
.loc 2 243 36
|
326 |
+
mov.b32 %r98, %f45;
|
327 |
+
shfl.sync.bfly.b32 %r99, %r98, 2, 31, -1;
|
328 |
+
mov.b32 %f46, %r99;
|
329 |
+
$L__tmp26:
|
330 |
+
.loc 2 233 15
|
331 |
+
add.f32 %f47, %f45, %f46;
|
332 |
+
$L__tmp27:
|
333 |
+
.loc 2 243 36
|
334 |
+
mov.b32 %r100, %f47;
|
335 |
+
shfl.sync.bfly.b32 %r101, %r100, 1, 31, -1;
|
336 |
+
mov.b32 %f48, %r101;
|
337 |
+
$L__tmp28:
|
338 |
+
.loc 2 233 15
|
339 |
+
add.f32 %f49, %f47, %f48;
|
340 |
+
$L__tmp29:
|
341 |
+
.loc 2 243 36
|
342 |
+
mov.b32 %r55, %f49;
|
343 |
+
@%p23 st.shared.b32 [ %r45 + 0 ], %r55;
|
344 |
+
bar.sync 0;
|
345 |
+
@%p24 ld.shared.b32 %r56, [ %r48 + 0 ];
|
346 |
+
mov.b32 %f50, %r56;
|
347 |
+
shfl.sync.bfly.b32 %r102, %r56, 1, 31, -1;
|
348 |
+
mov.b32 %f51, %r102;
|
349 |
+
$L__tmp30:
|
350 |
+
.loc 2 233 15
|
351 |
+
add.f32 %f52, %f50, %f51;
|
352 |
+
$L__tmp31:
|
353 |
+
.loc 2 243 36
|
354 |
+
mov.b32 %r59, %f52;
|
355 |
+
@%p25 st.shared.b32 [ %r48 + 0 ], %r59;
|
356 |
+
bar.sync 0;
|
357 |
+
ld.shared.f32 %f53, [global_smem];
|
358 |
+
$L__tmp32:
|
359 |
+
.loc 3 8 15
|
360 |
+
add.f32 %f54, %f53, 0f00000000;
|
361 |
+
$L__tmp33:
|
362 |
+
.loc 1 54 20
|
363 |
+
mov.b32 %r61, %f54;
|
364 |
+
div.full.f32 %r60, %r61, %r53;
|
365 |
+
mov.b32 %f55, %r60;
|
366 |
+
.loc 1 56 20
|
367 |
+
add.f32 %f56, %f55, 0f3727C5AC;
|
368 |
+
.loc 1 57 26
|
369 |
+
rsqrt.approx.ftz.f32 %f57, %f56;
|
370 |
+
cvt.u32.u64 %r103, %rd3;
|
371 |
+
cvt.u32.u64 %r104, %rd1;
|
372 |
+
.loc 1 33 36
|
373 |
+
mov.b32 %f58, %r20;
|
374 |
+
mov.b32 %f59, %r21;
|
375 |
+
mov.b32 %f60, %r22;
|
376 |
+
mov.b32 %f61, %r23;
|
377 |
+
.loc 1 59 20
|
378 |
+
mul.f32 %f62, %f32, %f57;
|
379 |
+
mul.f32 %f63, %f33, %f57;
|
380 |
+
mul.f32 %f64, %f34, %f57;
|
381 |
+
mul.f32 %f65, %f35, %f57;
|
382 |
+
.loc 1 60 20
|
383 |
+
mul.f32 %f66, %f62, %f58;
|
384 |
+
mul.f32 %f67, %f63, %f59;
|
385 |
+
mul.f32 %f68, %f64, %f60;
|
386 |
+
mul.f32 %f69, %f65, %f61;
|
387 |
+
.loc 1 62 35
|
388 |
+
shl.b32 %r105, %r104, 8;
|
389 |
+
.loc 1 62 31
|
390 |
+
or.b32 %r106, %r105, %r103;
|
391 |
+
.loc 1 62 25
|
392 |
+
mul.wide.s32 %rd46, %r106, 4;
|
393 |
+
add.s64 %rd37, %rd6, %rd46;
|
394 |
+
.loc 1 39 18
|
395 |
+
mov.b32 %r63, %f6;
|
396 |
+
mov.b32 %r66, %f12;
|
397 |
+
.loc 1 62 47
|
398 |
+
@%p18 st.global.v4.b32 [ %rd37 + 0 ], { %r63, %r64, %r65, %r66 };
|
399 |
+
.loc 1 63 4
|
400 |
+
bar.sync 0;
|
401 |
+
.loc 1 64 28
|
402 |
+
shl.b64 %rd47, %rd1, 2;
|
403 |
+
add.s64 %rd38, %rd4, %rd47;
|
404 |
+
.loc 1 64 40
|
405 |
+
setp.eq.s32 %p30, %r2, 0;
|
406 |
+
mov.b32 %r67, %f57;
|
407 |
+
@%p30 st.global.b32 [ %rd38 + 0 ], { %r67 };
|
408 |
+
.loc 1 65 25
|
409 |
+
mul.wide.s32 %rd48, %r106, 2;
|
410 |
+
add.s64 %rd39, %rd8, %rd48;
|
411 |
+
.loc 1 65 48
|
412 |
+
mov.b32 %r68, %f66;
|
413 |
+
cvt.rn.bf16.f32 %rs1, %r68;
|
414 |
+
mov.b32 %r69, %f67;
|
415 |
+
cvt.rn.bf16.f32 %rs2, %r69;
|
416 |
+
mov.b32 %r70, %f68;
|
417 |
+
cvt.rn.bf16.f32 %rs3, %r70;
|
418 |
+
mov.b32 %r71, %f69;
|
419 |
+
cvt.rn.bf16.f32 %rs4, %r71;
|
420 |
+
mov.b32 %r107, {%rs1, %rs2};
|
421 |
+
mov.b32 %r108, {%rs3, %rs4};
|
422 |
+
@%p18 st.global.v2.b32 [ %rd39 + 0 ], { %r107, %r108 };
|
423 |
+
.loc 1 66 25
|
424 |
+
add.s64 %rd40, %rd7, %rd47;
|
425 |
+
.loc 1 66 37
|
426 |
+
@%p30 st.global.b32 [ %rd40 + 0 ], { %r74 };
|
427 |
+
.loc 1 66 4
|
428 |
+
ret;
|
429 |
+
$L__tmp34:
|
430 |
+
$L__func_end0:
|
431 |
+
|
432 |
+
}
|
433 |
+
// .globl __nv_rsqrtf
|
434 |
+
.visible .func (.param .b32 func_retval0) __nv_rsqrtf(
|
435 |
+
.param .b32 __nv_rsqrtf_param_0
|
436 |
+
)
|
437 |
+
{
|
438 |
+
.reg .f32 %f<3>;
|
439 |
+
$L__func_begin1:
|
440 |
+
|
441 |
+
ld.param.f32 %f1, [__nv_rsqrtf_param_0];
|
442 |
+
rsqrt.approx.ftz.f32 %f2, %f1;
|
443 |
+
st.param.f32 [func_retval0+0], %f2;
|
444 |
+
ret;
|
445 |
+
$L__func_end1:
|
446 |
+
|
447 |
+
}
|
448 |
+
.file 1 "/tmp/torchinductor_root/pd/cpdqiwgwgnzx7tsvbieui7kffx5dt43uhgvg7z7egekxcsybpv34.py"
|
449 |
+
.file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
|
450 |
+
.file 3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
|
451 |
+
.section .debug_abbrev
|
452 |
+
{
|
453 |
+
.b8 1
|
454 |
+
.b8 17
|
455 |
+
.b8 1
|
456 |
+
.b8 37
|
457 |
+
.b8 8
|
458 |
+
.b8 19
|
459 |
+
.b8 5
|
460 |
+
.b8 3
|
461 |
+
.b8 8
|
462 |
+
.b8 16
|
463 |
+
.b8 6
|
464 |
+
.b8 27
|
465 |
+
.b8 8
|
466 |
+
.b8 180
|
467 |
+
.b8 66
|
468 |
+
.b8 12
|
469 |
+
.b8 17
|
470 |
+
.b8 1
|
471 |
+
.b8 18
|
472 |
+
.b8 1
|
473 |
+
.b8 0
|
474 |
+
.b8 0
|
475 |
+
.b8 2
|
476 |
+
.b8 46
|
477 |
+
.b8 0
|
478 |
+
.b8 135
|
479 |
+
.b8 64
|
480 |
+
.b8 8
|
481 |
+
.b8 3
|
482 |
+
.b8 8
|
483 |
+
.b8 58
|
484 |
+
.b8 11
|
485 |
+
.b8 59
|
486 |
+
.b8 11
|
487 |
+
.b8 63
|
488 |
+
.b8 12
|
489 |
+
.b8 32
|
490 |
+
.b8 11
|
491 |
+
.b8 0
|
492 |
+
.b8 0
|
493 |
+
.b8 3
|
494 |
+
.b8 46
|
495 |
+
.b8 1
|
496 |
+
.b8 17
|
497 |
+
.b8 1
|
498 |
+
.b8 18
|
499 |
+
.b8 1
|
500 |
+
.b8 64
|
501 |
+
.b8 10
|
502 |
+
.b8 49
|
503 |
+
.b8 19
|
504 |
+
.b8 0
|
505 |
+
.b8 0
|
506 |
+
.b8 4
|
507 |
+
.b8 29
|
508 |
+
.b8 1
|
509 |
+
.b8 49
|
510 |
+
.b8 19
|
511 |
+
.b8 17
|
512 |
+
.b8 1
|
513 |
+
.b8 18
|
514 |
+
.b8 1
|
515 |
+
.b8 88
|
516 |
+
.b8 11
|
517 |
+
.b8 89
|
518 |
+
.b8 11
|
519 |
+
.b8 87
|
520 |
+
.b8 11
|
521 |
+
.b8 0
|
522 |
+
.b8 0
|
523 |
+
.b8 5
|
524 |
+
.b8 29
|
525 |
+
.b8 0
|
526 |
+
.b8 49
|
527 |
+
.b8 19
|
528 |
+
.b8 17
|
529 |
+
.b8 1
|
530 |
+
.b8 18
|
531 |
+
.b8 1
|
532 |
+
.b8 88
|
533 |
+
.b8 11
|
534 |
+
.b8 89
|
535 |
+
.b8 11
|
536 |
+
.b8 87
|
537 |
+
.b8 11
|
538 |
+
.b8 0
|
539 |
+
.b8 0
|
540 |
+
.b8 0
|
541 |
+
}
|
542 |
+
.section .debug_info
|
543 |
+
{
|
544 |
+
.b32 407
|
545 |
+
.b8 2
|
546 |
+
.b8 0
|
547 |
+
.b32 .debug_abbrev
|
548 |
+
.b8 8
|
549 |
+
.b8 1
|
550 |
+
.b8 116
|
551 |
+
.b8 114
|
552 |
+
.b8 105
|
553 |
+
.b8 116
|
554 |
+
.b8 111
|
555 |
+
.b8 110
|
556 |
+
.b8 0
|
557 |
+
.b8 2
|
558 |
+
.b8 0
|
559 |
+
.b8 99
|
560 |
+
.b8 112
|
561 |
+
.b8 100
|
562 |
+
.b8 113
|
563 |
+
.b8 105
|
564 |
+
.b8 119
|
565 |
+
.b8 103
|
566 |
+
.b8 119
|
567 |
+
.b8 103
|
568 |
+
.b8 110
|
569 |
+
.b8 122
|
570 |
+
.b8 120
|
571 |
+
.b8 55
|
572 |
+
.b8 116
|
573 |
+
.b8 115
|
574 |
+
.b8 118
|
575 |
+
.b8 98
|
576 |
+
.b8 105
|
577 |
+
.b8 101
|
578 |
+
.b8 117
|
579 |
+
.b8 105
|
580 |
+
.b8 55
|
581 |
+
.b8 107
|
582 |
+
.b8 102
|
583 |
+
.b8 102
|
584 |
+
.b8 120
|
585 |
+
.b8 53
|
586 |
+
.b8 100
|
587 |
+
.b8 116
|
588 |
+
.b8 52
|
589 |
+
.b8 51
|
590 |
+
.b8 117
|
591 |
+
.b8 104
|
592 |
+
.b8 103
|
593 |
+
.b8 118
|
594 |
+
.b8 103
|
595 |
+
.b8 55
|
596 |
+
.b8 122
|
597 |
+
.b8 55
|
598 |
+
.b8 101
|
599 |
+
.b8 103
|
600 |
+
.b8 101
|
601 |
+
.b8 107
|
602 |
+
.b8 120
|
603 |
+
.b8 99
|
604 |
+
.b8 115
|
605 |
+
.b8 121
|
606 |
+
.b8 98
|
607 |
+
.b8 112
|
608 |
+
.b8 118
|
609 |
+
.b8 51
|
610 |
+
.b8 52
|
611 |
+
.b8 46
|
612 |
+
.b8 112
|
613 |
+
.b8 121
|
614 |
+
.b8 0
|
615 |
+
.b32 .debug_line
|
616 |
+
.b8 47
|
617 |
+
.b8 116
|
618 |
+
.b8 109
|
619 |
+
.b8 112
|
620 |
+
.b8 47
|
621 |
+
.b8 116
|
622 |
+
.b8 111
|
623 |
+
.b8 114
|
624 |
+
.b8 99
|
625 |
+
.b8 104
|
626 |
+
.b8 105
|
627 |
+
.b8 110
|
628 |
+
.b8 100
|
629 |
+
.b8 117
|
630 |
+
.b8 99
|
631 |
+
.b8 116
|
632 |
+
.b8 111
|
633 |
+
.b8 114
|
634 |
+
.b8 95
|
635 |
+
.b8 114
|
636 |
+
.b8 111
|
637 |
+
.b8 111
|
638 |
+
.b8 116
|
639 |
+
.b8 47
|
640 |
+
.b8 112
|
641 |
+
.b8 100
|
642 |
+
.b8 0
|
643 |
+
.b8 1
|
644 |
+
.b64 $L__func_begin0
|
645 |
+
.b64 $L__func_end0
|
646 |
+
.b8 2
|
647 |
+
.b8 116
|
648 |
+
.b8 114
|
649 |
+
.b8 105
|
650 |
+
.b8 116
|
651 |
+
.b8 111
|
652 |
+
.b8 110
|
653 |
+
.b8 95
|
654 |
+
.b8 95
|
655 |
+
.b8 48
|
656 |
+
.b8 100
|
657 |
+
.b8 49
|
658 |
+
.b8 100
|
659 |
+
.b8 50
|
660 |
+
.b8 100
|
661 |
+
.b8 51
|
662 |
+
.b8 100
|
663 |
+
.b8 52
|
664 |
+
.b8 100
|
665 |
+
.b8 53
|
666 |
+
.b8 100
|
667 |
+
.b8 54
|
668 |
+
.b8 100
|
669 |
+
.b8 55
|
670 |
+
.b8 100
|
671 |
+
.b8 56
|
672 |
+
.b8 100
|
673 |
+
.b8 101
|
674 |
+
.b8 57
|
675 |
+
.b8 100
|
676 |
+
.b8 101
|
677 |
+
.b8 0
|
678 |
+
.b8 116
|
679 |
+
.b8 114
|
680 |
+
.b8 105
|
681 |
+
.b8 116
|
682 |
+
.b8 111
|
683 |
+
.b8 110
|
684 |
+
.b8 95
|
685 |
+
.b8 95
|
686 |
+
.b8 48
|
687 |
+
.b8 100
|
688 |
+
.b8 49
|
689 |
+
.b8 100
|
690 |
+
.b8 50
|
691 |
+
.b8 100
|
692 |
+
.b8 51
|
693 |
+
.b8 100
|
694 |
+
.b8 52
|
695 |
+
.b8 100
|
696 |
+
.b8 53
|
697 |
+
.b8 100
|
698 |
+
.b8 54
|
699 |
+
.b8 100
|
700 |
+
.b8 55
|
701 |
+
.b8 100
|
702 |
+
.b8 56
|
703 |
+
.b8 100
|
704 |
+
.b8 101
|
705 |
+
.b8 57
|
706 |
+
.b8 100
|
707 |
+
.b8 101
|
708 |
+
.b8 0
|
709 |
+
.b8 1
|
710 |
+
.b8 18
|
711 |
+
.b8 1
|
712 |
+
.b8 1
|
713 |
+
.b8 3
|
714 |
+
.b64 $L__func_begin0
|
715 |
+
.b64 $L__func_end0
|
716 |
+
.b8 1
|
717 |
+
.b8 156
|
718 |
+
.b32 125
|
719 |
+
.b8 4
|
720 |
+
.b32 125
|
721 |
+
.b64 $L__tmp1
|
722 |
+
.b64 $L__tmp14
|
723 |
+
.b8 2
|
724 |
+
.b8 44
|
725 |
+
.b8 59
|
726 |
+
.b8 5
|
727 |
+
.b32 125
|
728 |
+
.b64 $L__tmp1
|
729 |
+
.b64 $L__tmp14
|
730 |
+
.b8 2
|
731 |
+
.b8 243
|
732 |
+
.b8 36
|
733 |
+
.b8 0
|
734 |
+
.b8 5
|
735 |
+
.b32 125
|
736 |
+
.b64 $L__tmp2
|
737 |
+
.b64 $L__tmp15
|
738 |
+
.b8 2
|
739 |
+
.b8 44
|
740 |
+
.b8 59
|
741 |
+
.b8 5
|
742 |
+
.b32 125
|
743 |
+
.b64 $L__tmp15
|
744 |
+
.b64 $L__tmp16
|
745 |
+
.b8 3
|
746 |
+
.b8 44
|
747 |
+
.b8 45
|
748 |
+
.b8 5
|
749 |
+
.b32 125
|
750 |
+
.b64 $L__tmp17
|
751 |
+
.b64 $L__tmp32
|
752 |
+
.b8 2
|
753 |
+
.b8 52
|
754 |
+
.b8 59
|
755 |
+
.b8 4
|
756 |
+
.b32 125
|
757 |
+
.b64 $L__tmp18
|
758 |
+
.b64 $L__tmp31
|
759 |
+
.b8 2
|
760 |
+
.b8 52
|
761 |
+
.b8 59
|
762 |
+
.b8 5
|
763 |
+
.b32 125
|
764 |
+
.b64 $L__tmp18
|
765 |
+
.b64 $L__tmp31
|
766 |
+
.b8 2
|
767 |
+
.b8 243
|
768 |
+
.b8 36
|
769 |
+
.b8 0
|
770 |
+
.b8 5
|
771 |
+
.b32 125
|
772 |
+
.b64 $L__tmp32
|
773 |
+
.b64 $L__tmp33
|
774 |
+
.b8 3
|
775 |
+
.b8 52
|
776 |
+
.b8 45
|
777 |
+
.b8 0
|
778 |
+
.b8 0
|
779 |
+
}
|
780 |
+
.section .debug_pubnames
|
781 |
+
{
|
782 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
783 |
+
$L__pubNames_start0:
|
784 |
+
.b8 2
|
785 |
+
.b8 0
|
786 |
+
.b32 .debug_info
|
787 |
+
.b32 411
|
788 |
+
.b32 125
|
789 |
+
.b8 116
|
790 |
+
.b8 114
|
791 |
+
.b8 105
|
792 |
+
.b8 116
|
793 |
+
.b8 111
|
794 |
+
.b8 110
|
795 |
+
.b8 95
|
796 |
+
.b8 95
|
797 |
+
.b8 48
|
798 |
+
.b8 100
|
799 |
+
.b8 49
|
800 |
+
.b8 100
|
801 |
+
.b8 50
|
802 |
+
.b8 100
|
803 |
+
.b8 51
|
804 |
+
.b8 100
|
805 |
+
.b8 52
|
806 |
+
.b8 100
|
807 |
+
.b8 53
|
808 |
+
.b8 100
|
809 |
+
.b8 54
|
810 |
+
.b8 100
|
811 |
+
.b8 55
|
812 |
+
.b8 100
|
813 |
+
.b8 56
|
814 |
+
.b8 100
|
815 |
+
.b8 101
|
816 |
+
.b8 57
|
817 |
+
.b8 100
|
818 |
+
.b8 101
|
819 |
+
.b8 0
|
820 |
+
.b32 0
|
821 |
+
$L__pubNames_end0:
|
822 |
+
}
|
823 |
+
.section .debug_pubtypes
|
824 |
+
{
|
825 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
826 |
+
$L__pubTypes_start0:
|
827 |
+
.b8 2
|
828 |
+
.b8 0
|
829 |
+
.b32 .debug_info
|
830 |
+
.b32 411
|
831 |
+
.b32 0
|
832 |
+
$L__pubTypes_end0:
|
833 |
+
}
|
834 |
+
.section .debug_loc { }
|
.triton/dump/645565eaba0a18dd23ef200fe9abb0c0/triton_.ttgir
ADDED
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
2 |
+
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
3 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
4 |
+
tt.func public @triton__0d1d2d3d4d5d6d7d8de9de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg7: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg9: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
5 |
+
%cst = arith.constant dense<256> : tensor<256xi32, #blocked>
|
6 |
+
%cst_0 = arith.constant dense<0> : tensor<1xi64, #blocked>
|
7 |
+
%cst_1 = arith.constant dense<50257> : tensor<1xi64, #blocked>
|
8 |
+
%cst_2 = arith.constant dense<256> : tensor<1xi64, #blocked>
|
9 |
+
%cst_3 = arith.constant 9.99999974E-6 : f32
|
10 |
+
%cst_4 = arith.constant 2.560000e+02 : f32
|
11 |
+
%cst_5 = arith.constant 0.000000e+00 : f32
|
12 |
+
%c256_i32 = arith.constant 256 : i32
|
13 |
+
%c512_i32 = arith.constant 512 : i32
|
14 |
+
%cst_6 = arith.constant dense<50257> : tensor<1xi64, #blocked1>
|
15 |
+
%cst_7 = arith.constant dense<0> : tensor<1xi64, #blocked1>
|
16 |
+
%cst_8 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked>
|
17 |
+
%0 = tt.get_program_id x : i32
|
18 |
+
%1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked>
|
19 |
+
%2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked>
|
20 |
+
%3 = arith.remsi %0, %c512_i32 : i32
|
21 |
+
%4 = tt.addptr %arg1, %0 : !tt.ptr<i64, 1>, i32
|
22 |
+
%5 = tt.splat %4 : (!tt.ptr<i64, 1>) -> tensor<1x!tt.ptr<i64, 1>, #blocked>
|
23 |
+
%6 = tt.splat %4 : (!tt.ptr<i64, 1>) -> tensor<1x!tt.ptr<i64, 1>, #blocked1>
|
24 |
+
%7 = tt.load %5 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xi64, #blocked>
|
25 |
+
%8 = tt.load %6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xi64, #blocked1>
|
26 |
+
%9 = arith.muli %3, %c256_i32 : i32
|
27 |
+
%10 = tt.splat %9 : (i32) -> tensor<256xi32, #blocked>
|
28 |
+
%11 = arith.addi %1, %10 : tensor<256xi32, #blocked>
|
29 |
+
%12 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
|
30 |
+
%13 = tt.addptr %12, %11 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
|
31 |
+
%14 = tt.load %13, %2, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked>
|
32 |
+
%15 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
|
33 |
+
%16 = tt.addptr %15, %1 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
|
34 |
+
%17 = tt.load %16, %2, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked>
|
35 |
+
%18 = arith.addi %7, %cst_1 : tensor<1xi64, #blocked>
|
36 |
+
%19 = arith.addi %8, %cst_6 : tensor<1xi64, #blocked1>
|
37 |
+
%20 = arith.cmpi slt, %7, %cst_0 : tensor<1xi64, #blocked>
|
38 |
+
%21 = arith.cmpi slt, %8, %cst_7 : tensor<1xi64, #blocked1>
|
39 |
+
%22 = arith.select %20, %18, %7 : tensor<1xi1, #blocked>, tensor<1xi64, #blocked>
|
40 |
+
%23 = arith.select %21, %19, %8 : tensor<1xi1, #blocked1>, tensor<1xi64, #blocked1>
|
41 |
+
%24 = arith.cmpi sge, %23, %cst_7 : tensor<1xi64, #blocked1>
|
42 |
+
%25 = arith.cmpi slt, %23, %cst_6 : tensor<1xi64, #blocked1>
|
43 |
+
%26 = arith.andi %24, %25 : tensor<1xi1, #blocked1>
|
44 |
+
tt.assert %26, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<1xi1, #blocked1>
|
45 |
+
%27 = arith.muli %22, %cst_2 : tensor<1xi64, #blocked>
|
46 |
+
%28 = tt.broadcast %27 : (tensor<1xi64, #blocked>) -> tensor<256xi64, #blocked>
|
47 |
+
%29 = arith.extsi %1 : tensor<256xi32, #blocked> to tensor<256xi64, #blocked>
|
48 |
+
%30 = arith.addi %29, %28 : tensor<256xi64, #blocked>
|
49 |
+
%31 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
|
50 |
+
%32 = tt.addptr %31, %30 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi64, #blocked>
|
51 |
+
%33 = tt.load %32, %2, %cst_8 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
|
52 |
+
%34 = arith.addf %33, %14 : tensor<256xf32, #blocked>
|
53 |
+
%35 = arith.select %2, %34, %cst_8 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
|
54 |
+
%36 = "tt.reduce"(%35) <{axis = 0 : i32}> ({
|
55 |
+
^bb0(%arg10: f32, %arg11: f32):
|
56 |
+
%65 = arith.addf %arg10, %arg11 : f32
|
57 |
+
tt.reduce.return %65 : f32
|
58 |
+
}) : (tensor<256xf32, #blocked>) -> f32
|
59 |
+
%37 = arith.addf %36, %cst_5 : f32
|
60 |
+
%38 = arith.divf %37, %cst_4 : f32
|
61 |
+
%39 = tt.splat %38 : (f32) -> tensor<1xf32, #blocked1>
|
62 |
+
%40 = tt.splat %38 : (f32) -> tensor<256xf32, #blocked>
|
63 |
+
%41 = arith.subf %34, %40 : tensor<256xf32, #blocked>
|
64 |
+
%42 = arith.mulf %41, %41 : tensor<256xf32, #blocked>
|
65 |
+
%43 = arith.select %2, %42, %cst_8 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
|
66 |
+
%44 = "tt.reduce"(%43) <{axis = 0 : i32}> ({
|
67 |
+
^bb0(%arg10: f32, %arg11: f32):
|
68 |
+
%65 = arith.addf %arg10, %arg11 : f32
|
69 |
+
tt.reduce.return %65 : f32
|
70 |
+
}) : (tensor<256xf32, #blocked>) -> f32
|
71 |
+
%45 = arith.addf %44, %cst_5 : f32
|
72 |
+
%46 = arith.divf %45, %cst_4 : f32
|
73 |
+
%47 = arith.addf %46, %cst_3 : f32
|
74 |
+
%48 = tt.extern_elementwise %47 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
|
75 |
+
%49 = tt.splat %48 : (f32) -> tensor<1xf32, #blocked1>
|
76 |
+
%50 = tt.splat %48 : (f32) -> tensor<256xf32, #blocked>
|
77 |
+
%51 = arith.mulf %41, %50 : tensor<256xf32, #blocked>
|
78 |
+
%52 = arith.mulf %51, %17 : tensor<256xf32, #blocked>
|
79 |
+
%53 = arith.muli %0, %c256_i32 : i32
|
80 |
+
%54 = tt.splat %53 : (i32) -> tensor<256xi32, #blocked>
|
81 |
+
%55 = arith.addi %1, %54 : tensor<256xi32, #blocked>
|
82 |
+
%56 = tt.splat %arg5 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
|
83 |
+
%57 = tt.addptr %56, %55 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
|
84 |
+
tt.store %57, %34, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked>
|
85 |
+
gpu.barrier
|
86 |
+
%58 = tt.addptr %arg0, %0 : !tt.ptr<f32, 1>, i32
|
87 |
+
%59 = tt.splat %58 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked1>
|
88 |
+
tt.store %59, %49 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32, #blocked1>
|
89 |
+
%60 = tt.splat %arg7 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
|
90 |
+
%61 = tt.addptr %60, %55 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
|
91 |
+
%62 = arith.truncf %52 : tensor<256xf32, #blocked> to tensor<256xbf16, #blocked>
|
92 |
+
tt.store %61, %62, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16, #blocked>
|
93 |
+
%63 = tt.addptr %arg6, %0 : !tt.ptr<f32, 1>, i32
|
94 |
+
%64 = tt.splat %63 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked1>
|
95 |
+
tt.store %64, %39 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32, #blocked1>
|
96 |
+
tt.return
|
97 |
+
}
|
98 |
+
}
|
.triton/dump/7dc5bb3e5c2bb99527fff34c6fba7810/triton_.llir
ADDED
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
define void @triton__0d1de(ptr addrspace(1) %0, i32 %1) local_unnamed_addr !dbg !5 {
|
5 |
+
%3 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
|
6 |
+
%4 = and i32 %3, 127, !dbg !8
|
7 |
+
%5 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9
|
8 |
+
%6 = shl i32 %5, 7, !dbg !10
|
9 |
+
%7 = or i32 %6, %4, !dbg !11
|
10 |
+
%8 = icmp slt i32 %7, 512, !dbg !12
|
11 |
+
%9 = sext i32 %7 to i64, !dbg !13
|
12 |
+
%10 = getelementptr i64, ptr addrspace(1) %0, i64 %9, !dbg !13
|
13 |
+
tail call void asm sideeffect "@$2 st.global.b64 [ $1 + 0 ], { $0 };", "l,l,b"(i64 %9, ptr addrspace(1) %10, i1 %8) #1, !dbg !14
|
14 |
+
ret void, !dbg !15
|
15 |
+
}
|
16 |
+
|
17 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
18 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
19 |
+
|
20 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
21 |
+
attributes #1 = { nounwind }
|
22 |
+
|
23 |
+
!llvm.module.flags = !{!0}
|
24 |
+
!llvm.dbg.cu = !{!1}
|
25 |
+
!nvvm.annotations = !{!3, !4, !4, !3}
|
26 |
+
|
27 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
28 |
+
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
29 |
+
!2 = !DIFile(filename: "cwxxgxdevnyc453z7hh4nxzgmvlhh6suwokktps3dw62btskgxt4.py", directory: "/tmp/torchinductor_root/wx")
|
30 |
+
!3 = !{ptr @triton__0d1de, !"kernel", i32 1}
|
31 |
+
!4 = !{ptr @triton__0d1de, !"maxntidx", i32 128}
|
32 |
+
!5 = distinct !DISubprogram(name: "triton__0d1de", linkageName: "triton__0d1de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
|
33 |
+
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
|
34 |
+
!7 = !{}
|
35 |
+
!8 = !DILocation(line: 21, column: 36, scope: !5)
|
36 |
+
!9 = !DILocation(line: 20, column: 28, scope: !5)
|
37 |
+
!10 = !DILocation(line: 20, column: 33, scope: !5)
|
38 |
+
!11 = !DILocation(line: 21, column: 23, scope: !5)
|
39 |
+
!12 = !DILocation(line: 22, column: 21, scope: !5)
|
40 |
+
!13 = !DILocation(line: 25, column: 25, scope: !5)
|
41 |
+
!14 = !DILocation(line: 25, column: 36, scope: !5)
|
42 |
+
!15 = !DILocation(line: 25, column: 4, scope: !5)
|
.triton/dump/7dc5bb3e5c2bb99527fff34c6fba7810/triton_.ttir
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
3 |
+
%cst = arith.constant dense<512> : tensor<128xi32>
|
4 |
+
%c128_i32 = arith.constant 128 : i32
|
5 |
+
%0 = tt.get_program_id x : i32
|
6 |
+
%1 = arith.muli %0, %c128_i32 : i32
|
7 |
+
%2 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32>
|
8 |
+
%3 = tt.splat %1 : (i32) -> tensor<128xi32>
|
9 |
+
%4 = arith.addi %3, %2 : tensor<128xi32>
|
10 |
+
%5 = arith.cmpi slt, %4, %cst : tensor<128xi32>
|
11 |
+
%6 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<128x!tt.ptr<i64, 1>>
|
12 |
+
%7 = tt.addptr %6, %4 : tensor<128x!tt.ptr<i64, 1>>, tensor<128xi32>
|
13 |
+
%8 = arith.extsi %4 : tensor<128xi32> to tensor<128xi64>
|
14 |
+
tt.store %7, %8, %5 {cache = 1 : i32, evict = 1 : i32} : tensor<128xi64>
|
15 |
+
tt.return
|
16 |
+
}
|
17 |
+
}
|
.triton/dump/99f0a4c15ca0aab38ccdae6c765f7333/triton_.cubin
ADDED
Binary file (15.2 kB). View file
|
|
.triton/dump/99f0a4c15ca0aab38ccdae6c765f7333/triton_.llir
ADDED
@@ -0,0 +1,333 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
@global_smem = external addrspace(3) global [0 x i8]
|
5 |
+
@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
|
6 |
+
|
7 |
+
define void @triton__0d1d2d3d4d5d6de7de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, i32 %6, i32 %7) local_unnamed_addr !dbg !7 {
|
8 |
+
%9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
|
9 |
+
%10 = and i32 %9, 31, !dbg !10
|
10 |
+
%11 = lshr i32 %9, 5, !dbg !10
|
11 |
+
%12 = and i32 %11, 1, !dbg !10
|
12 |
+
%urem = shl i32 %9, 2, !dbg !10
|
13 |
+
%13 = and i32 %urem, 252, !dbg !10
|
14 |
+
%14 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !11
|
15 |
+
%15 = shl i32 %14, 8, !dbg !12
|
16 |
+
%16 = or i32 %15, %13, !dbg !13
|
17 |
+
%17 = sext i32 %16 to i64, !dbg !14
|
18 |
+
%18 = getelementptr float, ptr addrspace(1) %0, i64 %17, !dbg !14
|
19 |
+
%19 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %18, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !15
|
20 |
+
%20 = extractvalue { i32, i32, i32, i32 } %19, 0, !dbg !15
|
21 |
+
%21 = extractvalue { i32, i32, i32, i32 } %19, 1, !dbg !15
|
22 |
+
%22 = extractvalue { i32, i32, i32, i32 } %19, 2, !dbg !15
|
23 |
+
%23 = extractvalue { i32, i32, i32, i32 } %19, 3, !dbg !15
|
24 |
+
%24 = bitcast i32 %22 to float, !dbg !15
|
25 |
+
%25 = bitcast i32 %23 to float, !dbg !15
|
26 |
+
%26 = getelementptr i16, ptr addrspace(1) %1, i64 %17, !dbg !16
|
27 |
+
%27 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %26, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !17
|
28 |
+
%28 = extractvalue { i32, i32 } %27, 0, !dbg !17
|
29 |
+
%29 = extractvalue { i32, i32 } %27, 1, !dbg !17
|
30 |
+
%30 = trunc i32 %28 to i16, !dbg !17
|
31 |
+
%extelt.offset = lshr i32 %28, 16, !dbg !17
|
32 |
+
%31 = trunc i32 %extelt.offset to i16, !dbg !17
|
33 |
+
%32 = trunc i32 %29 to i16, !dbg !17
|
34 |
+
%extelt.offset1 = lshr i32 %29, 16, !dbg !17
|
35 |
+
%33 = trunc i32 %extelt.offset1 to i16, !dbg !17
|
36 |
+
%34 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %30) #6, !dbg !18
|
37 |
+
%35 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %31) #6, !dbg !18
|
38 |
+
%36 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %32) #6, !dbg !18
|
39 |
+
%37 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %33) #6, !dbg !18
|
40 |
+
%38 = getelementptr i16, ptr addrspace(1) %2, i64 %17, !dbg !19
|
41 |
+
%39 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %38, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !20
|
42 |
+
%40 = extractvalue { i32, i32 } %39, 0, !dbg !20
|
43 |
+
%41 = extractvalue { i32, i32 } %39, 1, !dbg !20
|
44 |
+
%42 = trunc i32 %40 to i16, !dbg !20
|
45 |
+
%extelt.offset2 = lshr i32 %40, 16, !dbg !20
|
46 |
+
%43 = trunc i32 %extelt.offset2 to i16, !dbg !20
|
47 |
+
%44 = trunc i32 %41 to i16, !dbg !20
|
48 |
+
%extelt.offset3 = lshr i32 %41, 16, !dbg !20
|
49 |
+
%45 = trunc i32 %extelt.offset3 to i16, !dbg !20
|
50 |
+
%46 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %42) #6, !dbg !21
|
51 |
+
%47 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %43) #6, !dbg !21
|
52 |
+
%48 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %44) #6, !dbg !21
|
53 |
+
%49 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %45) #6, !dbg !21
|
54 |
+
%50 = getelementptr i16, ptr addrspace(1) %3, i64 %17, !dbg !22
|
55 |
+
%51 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %50, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !23
|
56 |
+
%52 = extractvalue { i32, i32 } %51, 0, !dbg !23
|
57 |
+
%53 = extractvalue { i32, i32 } %51, 1, !dbg !23
|
58 |
+
%54 = trunc i32 %52 to i16, !dbg !23
|
59 |
+
%extelt.offset4 = lshr i32 %52, 16, !dbg !23
|
60 |
+
%55 = trunc i32 %extelt.offset4 to i16, !dbg !23
|
61 |
+
%56 = trunc i32 %53 to i16, !dbg !23
|
62 |
+
%extelt.offset5 = lshr i32 %53, 16, !dbg !23
|
63 |
+
%57 = trunc i32 %extelt.offset5 to i16, !dbg !23
|
64 |
+
%58 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %54) #6, !dbg !24
|
65 |
+
%59 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %55) #6, !dbg !24
|
66 |
+
%60 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %56) #6, !dbg !24
|
67 |
+
%61 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %57) #6, !dbg !24
|
68 |
+
%62 = zext nneg i32 %13 to i64, !dbg !25
|
69 |
+
%63 = getelementptr float, ptr addrspace(1) %4, i64 %62, !dbg !25
|
70 |
+
%64 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %63, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !26
|
71 |
+
%65 = fadd float %36, %24, !dbg !27
|
72 |
+
%66 = fadd float %37, %25, !dbg !27
|
73 |
+
%67 = fadd float %65, %48, !dbg !28
|
74 |
+
%68 = fadd float %66, %49, !dbg !28
|
75 |
+
%69 = insertelement <2 x i32> poison, i32 %20, i64 0, !dbg !15
|
76 |
+
%70 = insertelement <2 x i32> %69, i32 %21, i64 1, !dbg !15
|
77 |
+
%71 = bitcast <2 x i32> %70 to <2 x float>, !dbg !15
|
78 |
+
%72 = insertelement <2 x float> poison, float %34, i64 0, !dbg !27
|
79 |
+
%73 = insertelement <2 x float> %72, float %35, i64 1, !dbg !27
|
80 |
+
%74 = fadd <2 x float> %73, %71, !dbg !27
|
81 |
+
%75 = insertelement <2 x float> poison, float %46, i64 0, !dbg !28
|
82 |
+
%76 = insertelement <2 x float> %75, float %47, i64 1, !dbg !28
|
83 |
+
%77 = fadd <2 x float> %74, %76, !dbg !28
|
84 |
+
%78 = insertelement <2 x float> poison, float %58, i64 0, !dbg !29
|
85 |
+
%79 = insertelement <2 x float> %78, float %59, i64 1, !dbg !29
|
86 |
+
%80 = fadd <2 x float> %77, %79, !dbg !29
|
87 |
+
%81 = fadd float %67, %60, !dbg !29
|
88 |
+
%82 = fadd float %68, %61, !dbg !29
|
89 |
+
%83 = extractelement <2 x float> %80, i64 0, !dbg !30
|
90 |
+
%84 = extractelement <2 x float> %80, i64 1, !dbg !30
|
91 |
+
%85 = fadd float %83, %84, !dbg !30
|
92 |
+
%86 = fadd float %85, %81, !dbg !30
|
93 |
+
%87 = fadd float %86, %82, !dbg !30
|
94 |
+
%88 = bitcast float %87 to i32, !dbg !36
|
95 |
+
%89 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %88, i32 16, i32 31), !dbg !36
|
96 |
+
%90 = bitcast i32 %89 to float, !dbg !36
|
97 |
+
%91 = fadd float %87, %90, !dbg !30
|
98 |
+
%92 = bitcast float %91 to i32, !dbg !36
|
99 |
+
%93 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %92, i32 8, i32 31), !dbg !36
|
100 |
+
%94 = bitcast i32 %93 to float, !dbg !36
|
101 |
+
%95 = fadd float %91, %94, !dbg !30
|
102 |
+
%96 = bitcast float %95 to i32, !dbg !36
|
103 |
+
%97 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %96, i32 4, i32 31), !dbg !36
|
104 |
+
%98 = bitcast i32 %97 to float, !dbg !36
|
105 |
+
%99 = fadd float %95, %98, !dbg !30
|
106 |
+
%100 = bitcast float %99 to i32, !dbg !36
|
107 |
+
%101 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %100, i32 2, i32 31), !dbg !36
|
108 |
+
%102 = bitcast i32 %101 to float, !dbg !36
|
109 |
+
%103 = fadd float %99, %102, !dbg !30
|
110 |
+
%104 = bitcast float %103 to i32, !dbg !36
|
111 |
+
%105 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %104, i32 1, i32 31), !dbg !36
|
112 |
+
%106 = bitcast i32 %105 to float, !dbg !36
|
113 |
+
%107 = fadd float %103, %106, !dbg !30
|
114 |
+
%108 = icmp eq i32 %10, 0, !dbg !36
|
115 |
+
%109 = zext nneg i32 %12 to i64, !dbg !36
|
116 |
+
%110 = getelementptr float, ptr addrspace(3) @global_smem, i64 %109, !dbg !36
|
117 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %110, float %107, i1 %108) #6, !dbg !36
|
118 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !36
|
119 |
+
%111 = icmp slt i32 %9, 2, !dbg !36
|
120 |
+
%112 = sext i32 %9 to i64, !dbg !36
|
121 |
+
%113 = getelementptr float, ptr addrspace(3) @global_smem, i64 %112, !dbg !36
|
122 |
+
%114 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %113, i1 %111) #6, !dbg !36
|
123 |
+
%115 = bitcast float %114 to i32, !dbg !36
|
124 |
+
%116 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %115, i32 1, i32 31), !dbg !36
|
125 |
+
%117 = bitcast i32 %116 to float, !dbg !36
|
126 |
+
%118 = fadd float %114, %117, !dbg !30
|
127 |
+
%119 = and i32 %9, 1, !dbg !36
|
128 |
+
%120 = icmp eq i32 %119, 0, !dbg !36
|
129 |
+
%121 = and i1 %111, %120, !dbg !36
|
130 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %113, float %118, i1 %121) #6, !dbg !36
|
131 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !36
|
132 |
+
%122 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !36
|
133 |
+
%123 = fadd float %122, 0.000000e+00, !dbg !38
|
134 |
+
%124 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %123, float 2.560000e+02) #6, !dbg !42
|
135 |
+
%125 = fsub float %83, %124, !dbg !43
|
136 |
+
%126 = fsub float %84, %124, !dbg !43
|
137 |
+
%127 = fsub float %81, %124, !dbg !43
|
138 |
+
%128 = fsub float %82, %124, !dbg !43
|
139 |
+
%129 = fmul float %125, %125, !dbg !44
|
140 |
+
%130 = fmul float %126, %126, !dbg !44
|
141 |
+
%131 = fmul float %127, %127, !dbg !44
|
142 |
+
%132 = fmul float %128, %128, !dbg !44
|
143 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !45
|
144 |
+
%133 = fadd float %129, %130, !dbg !47
|
145 |
+
%134 = fadd float %131, %133, !dbg !47
|
146 |
+
%135 = fadd float %132, %134, !dbg !47
|
147 |
+
%136 = bitcast float %135 to i32, !dbg !45
|
148 |
+
%137 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %136, i32 16, i32 31), !dbg !45
|
149 |
+
%138 = bitcast i32 %137 to float, !dbg !45
|
150 |
+
%139 = fadd float %135, %138, !dbg !47
|
151 |
+
%140 = bitcast float %139 to i32, !dbg !45
|
152 |
+
%141 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %140, i32 8, i32 31), !dbg !45
|
153 |
+
%142 = bitcast i32 %141 to float, !dbg !45
|
154 |
+
%143 = fadd float %139, %142, !dbg !47
|
155 |
+
%144 = bitcast float %143 to i32, !dbg !45
|
156 |
+
%145 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %144, i32 4, i32 31), !dbg !45
|
157 |
+
%146 = bitcast i32 %145 to float, !dbg !45
|
158 |
+
%147 = fadd float %143, %146, !dbg !47
|
159 |
+
%148 = bitcast float %147 to i32, !dbg !45
|
160 |
+
%149 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %148, i32 2, i32 31), !dbg !45
|
161 |
+
%150 = bitcast i32 %149 to float, !dbg !45
|
162 |
+
%151 = fadd float %147, %150, !dbg !47
|
163 |
+
%152 = bitcast float %151 to i32, !dbg !45
|
164 |
+
%153 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %152, i32 1, i32 31), !dbg !45
|
165 |
+
%154 = bitcast i32 %153 to float, !dbg !45
|
166 |
+
%155 = fadd float %151, %154, !dbg !47
|
167 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %110, float %155, i1 %108) #6, !dbg !45
|
168 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !45
|
169 |
+
%156 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %113, i1 %111) #6, !dbg !45
|
170 |
+
%157 = bitcast float %156 to i32, !dbg !45
|
171 |
+
%158 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %157, i32 1, i32 31), !dbg !45
|
172 |
+
%159 = bitcast i32 %158 to float, !dbg !45
|
173 |
+
%160 = fadd float %156, %159, !dbg !47
|
174 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %113, float %160, i1 %121) #6, !dbg !45
|
175 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !45
|
176 |
+
%161 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !45
|
177 |
+
%162 = fadd float %161, 0.000000e+00, !dbg !50
|
178 |
+
%163 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %162, float 2.560000e+02) #6, !dbg !52
|
179 |
+
%164 = fadd float %163, 0x3EE4F8B580000000, !dbg !53
|
180 |
+
%165 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !54
|
181 |
+
%.not.i = icmp eq i32 %165, 0, !dbg !54
|
182 |
+
br i1 %.not.i, label %168, label %166, !dbg !54
|
183 |
+
|
184 |
+
166: ; preds = %8
|
185 |
+
%167 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %164), !dbg !54
|
186 |
+
br label %__nv_rsqrtf.exit, !dbg !54
|
187 |
+
|
188 |
+
168: ; preds = %8
|
189 |
+
%169 = tail call float @llvm.nvvm.rsqrt.approx.f(float %164), !dbg !54
|
190 |
+
br label %__nv_rsqrtf.exit, !dbg !54
|
191 |
+
|
192 |
+
__nv_rsqrtf.exit: ; preds = %166, %168
|
193 |
+
%.0.i = phi float [ %167, %166 ], [ %169, %168 ], !dbg !54
|
194 |
+
%170 = extractvalue { i32, i32, i32, i32 } %64, 3, !dbg !26
|
195 |
+
%171 = bitcast i32 %170 to float, !dbg !26
|
196 |
+
%172 = extractvalue { i32, i32, i32, i32 } %64, 2, !dbg !26
|
197 |
+
%173 = bitcast i32 %172 to float, !dbg !26
|
198 |
+
%174 = extractvalue { i32, i32, i32, i32 } %64, 1, !dbg !26
|
199 |
+
%175 = bitcast i32 %174 to float, !dbg !26
|
200 |
+
%176 = extractvalue { i32, i32, i32, i32 } %64, 0, !dbg !26
|
201 |
+
%177 = bitcast i32 %176 to float, !dbg !26
|
202 |
+
%178 = fmul float %125, %.0.i, !dbg !55
|
203 |
+
%179 = fmul float %126, %.0.i, !dbg !55
|
204 |
+
%180 = fmul float %127, %.0.i, !dbg !55
|
205 |
+
%181 = fmul float %128, %.0.i, !dbg !55
|
206 |
+
%182 = fmul float %178, %177, !dbg !56
|
207 |
+
%183 = fmul float %179, %175, !dbg !56
|
208 |
+
%184 = fmul float %180, %173, !dbg !56
|
209 |
+
%185 = fmul float %181, %171, !dbg !56
|
210 |
+
%186 = getelementptr i16, ptr addrspace(1) %5, i64 %17, !dbg !57
|
211 |
+
%187 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %182) #6, !dbg !58
|
212 |
+
%188 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %183) #6, !dbg !58
|
213 |
+
%189 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %184) #6, !dbg !58
|
214 |
+
%190 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %185) #6, !dbg !58
|
215 |
+
%191 = insertelement <2 x i16> undef, i16 %187, i64 0, !dbg !58
|
216 |
+
%192 = insertelement <2 x i16> %191, i16 %188, i64 1, !dbg !58
|
217 |
+
%193 = bitcast <2 x i16> %192 to i32, !dbg !58
|
218 |
+
%194 = insertelement <2 x i16> undef, i16 %189, i64 0, !dbg !58
|
219 |
+
%195 = insertelement <2 x i16> %194, i16 %190, i64 1, !dbg !58
|
220 |
+
%196 = bitcast <2 x i16> %195 to i32, !dbg !58
|
221 |
+
tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %193, i32 %196, ptr addrspace(1) %186, i1 true) #6, !dbg !58
|
222 |
+
ret void, !dbg !59
|
223 |
+
}
|
224 |
+
|
225 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
226 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
227 |
+
|
228 |
+
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
|
229 |
+
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
|
230 |
+
|
231 |
+
; Function Attrs: convergent nocallback nounwind
|
232 |
+
declare void @llvm.nvvm.barrier0() #2
|
233 |
+
|
234 |
+
; Function Attrs: alwaysinline nounwind
|
235 |
+
define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
|
236 |
+
%1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
|
237 |
+
%.not = icmp eq i32 %1, 0
|
238 |
+
br i1 %.not, label %4, label %2
|
239 |
+
|
240 |
+
2: ; preds = %0
|
241 |
+
%3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
|
242 |
+
br label %6
|
243 |
+
|
244 |
+
4: ; preds = %0
|
245 |
+
%5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
|
246 |
+
br label %6
|
247 |
+
|
248 |
+
6: ; preds = %4, %2
|
249 |
+
%.0 = phi float [ %3, %2 ], [ %5, %4 ]
|
250 |
+
ret float %.0
|
251 |
+
}
|
252 |
+
|
253 |
+
declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
|
254 |
+
|
255 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
|
256 |
+
declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
|
257 |
+
|
258 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
|
259 |
+
declare float @llvm.nvvm.rsqrt.approx.f(float) #5
|
260 |
+
|
261 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
262 |
+
attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
|
263 |
+
attributes #2 = { convergent nocallback nounwind }
|
264 |
+
attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
265 |
+
attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
266 |
+
attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
|
267 |
+
attributes #6 = { nounwind }
|
268 |
+
|
269 |
+
!llvm.module.flags = !{!0, !1}
|
270 |
+
!llvm.dbg.cu = !{!2}
|
271 |
+
!nvvm.annotations = !{!4, !5, !5, !4}
|
272 |
+
!llvm.ident = !{!6}
|
273 |
+
|
274 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
275 |
+
!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
|
276 |
+
!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
277 |
+
!3 = !DIFile(filename: "cpwl4wgyi5spzbgbswrqxfrxlyk2m76a4bakbp6l5ltopjbkjadt.py", directory: "/tmp/torchinductor_root/pw")
|
278 |
+
!4 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"kernel", i32 1}
|
279 |
+
!5 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"maxntidx", i32 64}
|
280 |
+
!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
|
281 |
+
!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6de7de", linkageName: "triton__0d1d2d3d4d5d6de7de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
|
282 |
+
!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
|
283 |
+
!9 = !{}
|
284 |
+
!10 = !DILocation(line: 26, column: 26, scope: !7)
|
285 |
+
!11 = !DILocation(line: 23, column: 28, scope: !7)
|
286 |
+
!12 = !DILocation(line: 30, column: 40, scope: !7)
|
287 |
+
!13 = !DILocation(line: 30, column: 36, scope: !7)
|
288 |
+
!14 = !DILocation(line: 30, column: 30, scope: !7)
|
289 |
+
!15 = !DILocation(line: 30, column: 46, scope: !7)
|
290 |
+
!16 = !DILocation(line: 31, column: 30, scope: !7)
|
291 |
+
!17 = !DILocation(line: 31, column: 46, scope: !7)
|
292 |
+
!18 = !DILocation(line: 31, column: 67, scope: !7)
|
293 |
+
!19 = !DILocation(line: 32, column: 30, scope: !7)
|
294 |
+
!20 = !DILocation(line: 32, column: 46, scope: !7)
|
295 |
+
!21 = !DILocation(line: 32, column: 67, scope: !7)
|
296 |
+
!22 = !DILocation(line: 33, column: 30, scope: !7)
|
297 |
+
!23 = !DILocation(line: 33, column: 46, scope: !7)
|
298 |
+
!24 = !DILocation(line: 33, column: 67, scope: !7)
|
299 |
+
!25 = !DILocation(line: 34, column: 31, scope: !7)
|
300 |
+
!26 = !DILocation(line: 34, column: 36, scope: !7)
|
301 |
+
!27 = !DILocation(line: 36, column: 18, scope: !7)
|
302 |
+
!28 = !DILocation(line: 38, column: 18, scope: !7)
|
303 |
+
!29 = !DILocation(line: 40, column: 18, scope: !7)
|
304 |
+
!30 = !DILocation(line: 233, column: 15, scope: !31, inlinedAt: !34)
|
305 |
+
!31 = distinct !DILexicalBlockFile(scope: !33, file: !32, discriminator: 0)
|
306 |
+
!32 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
|
307 |
+
!33 = distinct !DILexicalBlockFile(scope: !7, file: !32, discriminator: 0)
|
308 |
+
!34 = !DILocation(line: 243, column: 36, scope: !31, inlinedAt: !35)
|
309 |
+
!35 = !DILocation(line: 45, column: 59, scope: !31)
|
310 |
+
!36 = !DILocation(line: 243, column: 36, scope: !33, inlinedAt: !37)
|
311 |
+
!37 = !DILocation(line: 45, column: 59, scope: !33)
|
312 |
+
!38 = !DILocation(line: 8, column: 15, scope: !39, inlinedAt: !41)
|
313 |
+
!39 = distinct !DILexicalBlockFile(scope: !7, file: !40, discriminator: 0)
|
314 |
+
!40 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
|
315 |
+
!41 = !DILocation(line: 45, column: 45, scope: !39)
|
316 |
+
!42 = !DILocation(line: 48, column: 20, scope: !7)
|
317 |
+
!43 = !DILocation(line: 49, column: 20, scope: !7)
|
318 |
+
!44 = !DILocation(line: 50, column: 20, scope: !7)
|
319 |
+
!45 = !DILocation(line: 243, column: 36, scope: !33, inlinedAt: !46)
|
320 |
+
!46 = !DILocation(line: 53, column: 59, scope: !33)
|
321 |
+
!47 = !DILocation(line: 233, column: 15, scope: !31, inlinedAt: !48)
|
322 |
+
!48 = !DILocation(line: 243, column: 36, scope: !31, inlinedAt: !49)
|
323 |
+
!49 = !DILocation(line: 53, column: 59, scope: !31)
|
324 |
+
!50 = !DILocation(line: 8, column: 15, scope: !39, inlinedAt: !51)
|
325 |
+
!51 = !DILocation(line: 53, column: 45, scope: !39)
|
326 |
+
!52 = !DILocation(line: 56, column: 20, scope: !7)
|
327 |
+
!53 = !DILocation(line: 58, column: 20, scope: !7)
|
328 |
+
!54 = !DILocation(line: 59, column: 26, scope: !7)
|
329 |
+
!55 = !DILocation(line: 60, column: 20, scope: !7)
|
330 |
+
!56 = !DILocation(line: 61, column: 20, scope: !7)
|
331 |
+
!57 = !DILocation(line: 63, column: 25, scope: !7)
|
332 |
+
!58 = !DILocation(line: 63, column: 48, scope: !7)
|
333 |
+
!59 = !DILocation(line: 63, column: 4, scope: !7)
|
.triton/dump/99f0a4c15ca0aab38ccdae6c765f7333/triton_.ttgir
ADDED
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
2 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
3 |
+
tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
4 |
+
%cst = arith.constant dense<256> : tensor<256xi32, #blocked>
|
5 |
+
%cst_0 = arith.constant 9.99999974E-6 : f32
|
6 |
+
%cst_1 = arith.constant 2.560000e+02 : f32
|
7 |
+
%cst_2 = arith.constant 0.000000e+00 : f32
|
8 |
+
%c256_i32 = arith.constant 256 : i32
|
9 |
+
%cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked>
|
10 |
+
%cst_4 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked>
|
11 |
+
%0 = tt.get_program_id x : i32
|
12 |
+
%1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked>
|
13 |
+
%2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked>
|
14 |
+
%3 = arith.muli %0, %c256_i32 : i32
|
15 |
+
%4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked>
|
16 |
+
%5 = arith.addi %1, %4 : tensor<256xi32, #blocked>
|
17 |
+
%6 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
|
18 |
+
%7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
|
19 |
+
%8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
|
20 |
+
%9 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
|
21 |
+
%10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
|
22 |
+
%11 = tt.load %10, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
|
23 |
+
%12 = arith.extf %11 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
|
24 |
+
%13 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
|
25 |
+
%14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
|
26 |
+
%15 = tt.load %14, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
|
27 |
+
%16 = arith.extf %15 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
|
28 |
+
%17 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
|
29 |
+
%18 = tt.addptr %17, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
|
30 |
+
%19 = tt.load %18, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
|
31 |
+
%20 = arith.extf %19 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
|
32 |
+
%21 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
|
33 |
+
%22 = tt.addptr %21, %1 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
|
34 |
+
%23 = tt.load %22, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked>
|
35 |
+
%24 = arith.addf %8, %12 : tensor<256xf32, #blocked>
|
36 |
+
%25 = arith.addf %24, %16 : tensor<256xf32, #blocked>
|
37 |
+
%26 = arith.addf %25, %20 : tensor<256xf32, #blocked>
|
38 |
+
%27 = arith.select %2, %26, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
|
39 |
+
%28 = "tt.reduce"(%27) <{axis = 0 : i32}> ({
|
40 |
+
^bb0(%arg8: f32, %arg9: f32):
|
41 |
+
%46 = arith.addf %arg8, %arg9 : f32
|
42 |
+
tt.reduce.return %46 : f32
|
43 |
+
}) : (tensor<256xf32, #blocked>) -> f32
|
44 |
+
%29 = arith.addf %28, %cst_2 : f32
|
45 |
+
%30 = arith.divf %29, %cst_1 : f32
|
46 |
+
%31 = tt.splat %30 : (f32) -> tensor<256xf32, #blocked>
|
47 |
+
%32 = arith.subf %26, %31 : tensor<256xf32, #blocked>
|
48 |
+
%33 = arith.mulf %32, %32 : tensor<256xf32, #blocked>
|
49 |
+
%34 = arith.select %2, %33, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
|
50 |
+
%35 = "tt.reduce"(%34) <{axis = 0 : i32}> ({
|
51 |
+
^bb0(%arg8: f32, %arg9: f32):
|
52 |
+
%46 = arith.addf %arg8, %arg9 : f32
|
53 |
+
tt.reduce.return %46 : f32
|
54 |
+
}) : (tensor<256xf32, #blocked>) -> f32
|
55 |
+
%36 = arith.addf %35, %cst_2 : f32
|
56 |
+
%37 = arith.divf %36, %cst_1 : f32
|
57 |
+
%38 = arith.addf %37, %cst_0 : f32
|
58 |
+
%39 = tt.extern_elementwise %38 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
|
59 |
+
%40 = tt.splat %39 : (f32) -> tensor<256xf32, #blocked>
|
60 |
+
%41 = arith.mulf %32, %40 : tensor<256xf32, #blocked>
|
61 |
+
%42 = arith.mulf %41, %23 : tensor<256xf32, #blocked>
|
62 |
+
%43 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
|
63 |
+
%44 = tt.addptr %43, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
|
64 |
+
%45 = arith.truncf %42 : tensor<256xf32, #blocked> to tensor<256xbf16, #blocked>
|
65 |
+
tt.store %44, %45, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16, #blocked>
|
66 |
+
tt.return
|
67 |
+
}
|
68 |
+
}
|
.triton/dump/9a2fb05196b13393bea452d08e9aaca8/triton_.llir
ADDED
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !5 {
|
5 |
+
%4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
|
6 |
+
%5 = shl i32 %4, 1, !dbg !8
|
7 |
+
%6 = and i32 %5, 510, !dbg !8
|
8 |
+
%7 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9
|
9 |
+
%8 = shl i32 %7, 9, !dbg !10
|
10 |
+
%9 = or i32 %8, %6, !dbg !11
|
11 |
+
%10 = sext i32 %9 to i64, !dbg !12
|
12 |
+
%11 = getelementptr float, ptr addrspace(1) %0, i64 %10, !dbg !12
|
13 |
+
%12 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];", "=r,=r,l,b"(ptr addrspace(1) %11, i1 true) #1, !dbg !13
|
14 |
+
%13 = extractvalue { i32, i32 } %12, 0, !dbg !13
|
15 |
+
%14 = extractvalue { i32, i32 } %12, 1, !dbg !13
|
16 |
+
%15 = bitcast i32 %13 to float, !dbg !13
|
17 |
+
%16 = bitcast i32 %14 to float, !dbg !13
|
18 |
+
%17 = getelementptr i16, ptr addrspace(1) %1, i64 %10, !dbg !14
|
19 |
+
%18 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %15) #1, !dbg !15
|
20 |
+
%19 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %16) #1, !dbg !15
|
21 |
+
%20 = insertelement <2 x i16> undef, i16 %18, i64 0, !dbg !15
|
22 |
+
%21 = insertelement <2 x i16> %20, i16 %19, i64 1, !dbg !15
|
23 |
+
%22 = bitcast <2 x i16> %21 to i32, !dbg !15
|
24 |
+
tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %22, ptr addrspace(1) %17, i1 true) #1, !dbg !15
|
25 |
+
ret void, !dbg !16
|
26 |
+
}
|
27 |
+
|
28 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
29 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
30 |
+
|
31 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
32 |
+
attributes #1 = { nounwind }
|
33 |
+
|
34 |
+
!llvm.module.flags = !{!0}
|
35 |
+
!llvm.dbg.cu = !{!1}
|
36 |
+
!nvvm.annotations = !{!3, !4, !4, !3}
|
37 |
+
|
38 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
39 |
+
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
40 |
+
!2 = !DIFile(filename: "c5tryp5qwkhreijk7s5x327wofz54lwj4kvctuqdzv2vrf2xyons.py", directory: "/tmp/torchinductor_root/5t")
|
41 |
+
!3 = !{ptr @triton__0d1d2de, !"kernel", i32 1}
|
42 |
+
!4 = !{ptr @triton__0d1d2de, !"maxntidx", i32 256}
|
43 |
+
!5 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
|
44 |
+
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
|
45 |
+
!7 = !{}
|
46 |
+
!8 = !DILocation(line: 21, column: 36, scope: !5)
|
47 |
+
!9 = !DILocation(line: 20, column: 28, scope: !5)
|
48 |
+
!10 = !DILocation(line: 20, column: 33, scope: !5)
|
49 |
+
!11 = !DILocation(line: 21, column: 23, scope: !5)
|
50 |
+
!12 = !DILocation(line: 24, column: 30, scope: !5)
|
51 |
+
!13 = !DILocation(line: 24, column: 35, scope: !5)
|
52 |
+
!14 = !DILocation(line: 26, column: 25, scope: !5)
|
53 |
+
!15 = !DILocation(line: 26, column: 36, scope: !5)
|
54 |
+
!16 = !DILocation(line: 26, column: 4, scope: !5)
|
.triton/dump/9a2fb05196b13393bea452d08e9aaca8/triton_.ttgir
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
2 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
3 |
+
tt.func public @triton__0d1d2de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
4 |
+
%c512_i32 = arith.constant 512 : i32
|
5 |
+
%0 = tt.get_program_id x : i32
|
6 |
+
%1 = arith.muli %0, %c512_i32 : i32
|
7 |
+
%2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked>
|
8 |
+
%3 = tt.splat %1 : (i32) -> tensor<512xi32, #blocked>
|
9 |
+
%4 = arith.addi %3, %2 : tensor<512xi32, #blocked>
|
10 |
+
%5 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<512x!tt.ptr<f32, 1>, #blocked>
|
11 |
+
%6 = tt.addptr %5, %4 : tensor<512x!tt.ptr<f32, 1>, #blocked>, tensor<512xi32, #blocked>
|
12 |
+
%7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xf32, #blocked>
|
13 |
+
%8 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<512x!tt.ptr<bf16, 1>, #blocked>
|
14 |
+
%9 = tt.addptr %8, %4 : tensor<512x!tt.ptr<bf16, 1>, #blocked>, tensor<512xi32, #blocked>
|
15 |
+
%10 = arith.truncf %7 : tensor<512xf32, #blocked> to tensor<512xbf16, #blocked>
|
16 |
+
tt.store %9, %10 {cache = 1 : i32, evict = 1 : i32} : tensor<512xbf16, #blocked>
|
17 |
+
tt.return
|
18 |
+
}
|
19 |
+
}
|
.triton/dump/a69784da01a97187168f22847465505f/triton_.cubin
ADDED
Binary file (15 kB). View file
|
|
.triton/dump/a69784da01a97187168f22847465505f/triton_.ttgir
ADDED
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
2 |
+
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
3 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
4 |
+
tt.func public @triton__0d1d2d3d4d5d6d7de8de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
5 |
+
%cst = arith.constant dense<256> : tensor<256xi32, #blocked>
|
6 |
+
%cst_0 = arith.constant 9.99999974E-6 : f32
|
7 |
+
%cst_1 = arith.constant 2.560000e+02 : f32
|
8 |
+
%cst_2 = arith.constant 0.000000e+00 : f32
|
9 |
+
%c256_i32 = arith.constant 256 : i32
|
10 |
+
%cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked>
|
11 |
+
%cst_4 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked>
|
12 |
+
%0 = tt.get_program_id x : i32
|
13 |
+
%1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked>
|
14 |
+
%2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked>
|
15 |
+
%3 = arith.muli %0, %c256_i32 : i32
|
16 |
+
%4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked>
|
17 |
+
%5 = arith.addi %1, %4 : tensor<256xi32, #blocked>
|
18 |
+
%6 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
|
19 |
+
%7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
|
20 |
+
%8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
|
21 |
+
%9 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
|
22 |
+
%10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
|
23 |
+
%11 = tt.load %10, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
|
24 |
+
%12 = arith.extf %11 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
|
25 |
+
%13 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
|
26 |
+
%14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
|
27 |
+
%15 = tt.load %14, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
|
28 |
+
%16 = arith.extf %15 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
|
29 |
+
%17 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
|
30 |
+
%18 = tt.addptr %17, %1 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
|
31 |
+
%19 = tt.load %18, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked>
|
32 |
+
%20 = arith.addf %8, %12 : tensor<256xf32, #blocked>
|
33 |
+
%21 = arith.addf %20, %16 : tensor<256xf32, #blocked>
|
34 |
+
%22 = arith.select %2, %21, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
|
35 |
+
%23 = "tt.reduce"(%22) <{axis = 0 : i32}> ({
|
36 |
+
^bb0(%arg9: f32, %arg10: f32):
|
37 |
+
%47 = arith.addf %arg9, %arg10 : f32
|
38 |
+
tt.reduce.return %47 : f32
|
39 |
+
}) : (tensor<256xf32, #blocked>) -> f32
|
40 |
+
%24 = arith.addf %23, %cst_2 : f32
|
41 |
+
%25 = arith.divf %24, %cst_1 : f32
|
42 |
+
%26 = tt.splat %25 : (f32) -> tensor<1xf32, #blocked1>
|
43 |
+
%27 = tt.splat %25 : (f32) -> tensor<256xf32, #blocked>
|
44 |
+
%28 = arith.subf %21, %27 : tensor<256xf32, #blocked>
|
45 |
+
%29 = arith.mulf %28, %28 : tensor<256xf32, #blocked>
|
46 |
+
%30 = arith.select %2, %29, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
|
47 |
+
%31 = "tt.reduce"(%30) <{axis = 0 : i32}> ({
|
48 |
+
^bb0(%arg9: f32, %arg10: f32):
|
49 |
+
%47 = arith.addf %arg9, %arg10 : f32
|
50 |
+
tt.reduce.return %47 : f32
|
51 |
+
}) : (tensor<256xf32, #blocked>) -> f32
|
52 |
+
%32 = arith.addf %31, %cst_2 : f32
|
53 |
+
%33 = arith.divf %32, %cst_1 : f32
|
54 |
+
%34 = arith.addf %33, %cst_0 : f32
|
55 |
+
%35 = tt.extern_elementwise %34 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
|
56 |
+
%36 = tt.splat %35 : (f32) -> tensor<1xf32, #blocked1>
|
57 |
+
%37 = tt.splat %35 : (f32) -> tensor<256xf32, #blocked>
|
58 |
+
%38 = arith.mulf %28, %37 : tensor<256xf32, #blocked>
|
59 |
+
%39 = arith.mulf %38, %19 : tensor<256xf32, #blocked>
|
60 |
+
gpu.barrier
|
61 |
+
%40 = tt.addptr %arg0, %0 : !tt.ptr<f32, 1>, i32
|
62 |
+
%41 = tt.splat %40 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked1>
|
63 |
+
tt.store %41, %36 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32, #blocked1>
|
64 |
+
%42 = tt.splat %arg6 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
|
65 |
+
%43 = tt.addptr %42, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
|
66 |
+
%44 = arith.truncf %39 : tensor<256xf32, #blocked> to tensor<256xbf16, #blocked>
|
67 |
+
tt.store %43, %44, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16, #blocked>
|
68 |
+
%45 = tt.addptr %arg5, %0 : !tt.ptr<f32, 1>, i32
|
69 |
+
%46 = tt.splat %45 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked1>
|
70 |
+
tt.store %46, %26 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32, #blocked1>
|
71 |
+
tt.return
|
72 |
+
}
|
73 |
+
}
|
.triton/dump/a75e14a8d2d1ec8471f1c7b615552f8c/triton_.cubin
ADDED
Binary file (4.9 kB). View file
|
|
.triton/dump/a75e14a8d2d1ec8471f1c7b615552f8c/triton_.ptx
ADDED
@@ -0,0 +1,295 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2de
|
10 |
+
|
11 |
+
.visible .entry triton__0d1d2de(
|
12 |
+
.param .u64 triton__0d1d2de_param_0,
|
13 |
+
.param .u64 triton__0d1d2de_param_1,
|
14 |
+
.param .u32 triton__0d1d2de_param_2
|
15 |
+
)
|
16 |
+
.maxntid 128, 1, 1
|
17 |
+
{
|
18 |
+
.reg .pred %p<3>;
|
19 |
+
.reg .b16 %rs<3>;
|
20 |
+
.reg .b32 %r<13>;
|
21 |
+
.reg .b64 %rd<7>;
|
22 |
+
.loc 1 18 0
|
23 |
+
$L__func_begin0:
|
24 |
+
.loc 1 18 0
|
25 |
+
|
26 |
+
ld.param.u64 %rd3, [triton__0d1d2de_param_0];
|
27 |
+
ld.param.u64 %rd4, [triton__0d1d2de_param_1];
|
28 |
+
$L__tmp0:
|
29 |
+
.loc 1 21 36
|
30 |
+
mov.u32 %r7, %tid.x;
|
31 |
+
shl.b32 %r8, %r7, 1;
|
32 |
+
and.b32 %r9, %r8, 254;
|
33 |
+
.loc 1 20 28
|
34 |
+
mov.u32 %r1, %ctaid.x;
|
35 |
+
.loc 1 20 33
|
36 |
+
shl.b32 %r10, %r1, 8;
|
37 |
+
.loc 1 21 23
|
38 |
+
or.b32 %r11, %r10, %r9;
|
39 |
+
.loc 1 24 30
|
40 |
+
mul.wide.s32 %rd5, %r11, 4;
|
41 |
+
add.s64 %rd1, %rd3, %rd5;
|
42 |
+
mov.pred %p1, -1;
|
43 |
+
.loc 1 24 35
|
44 |
+
mov.u32 %r4, 0x0;
|
45 |
+
mov.u32 %r5, 0x0;
|
46 |
+
@%p1 ld.global.v2.b32 { %r4, %r5 }, [ %rd1 + 0 ];
|
47 |
+
.loc 1 26 25
|
48 |
+
mul.wide.s32 %rd6, %r11, 2;
|
49 |
+
add.s64 %rd2, %rd4, %rd6;
|
50 |
+
.loc 1 26 36
|
51 |
+
cvt.rn.bf16.f32 %rs1, %r4;
|
52 |
+
cvt.rn.bf16.f32 %rs2, %r5;
|
53 |
+
mov.b32 %r12, {%rs1, %rs2};
|
54 |
+
@%p1 st.global.b32 [ %rd2 + 0 ], { %r12 };
|
55 |
+
.loc 1 26 4
|
56 |
+
ret;
|
57 |
+
$L__tmp1:
|
58 |
+
$L__func_end0:
|
59 |
+
|
60 |
+
}
|
61 |
+
.file 1 "/tmp/torchinductor_root/pq/cpqhcwm5bfrhuwddh4c4qks6bh7sovfbpfnmqhnm4h4w23icqnu6.py"
|
62 |
+
.section .debug_abbrev
|
63 |
+
{
|
64 |
+
.b8 1
|
65 |
+
.b8 17
|
66 |
+
.b8 1
|
67 |
+
.b8 37
|
68 |
+
.b8 8
|
69 |
+
.b8 19
|
70 |
+
.b8 5
|
71 |
+
.b8 3
|
72 |
+
.b8 8
|
73 |
+
.b8 16
|
74 |
+
.b8 6
|
75 |
+
.b8 27
|
76 |
+
.b8 8
|
77 |
+
.b8 180
|
78 |
+
.b8 66
|
79 |
+
.b8 12
|
80 |
+
.b8 17
|
81 |
+
.b8 1
|
82 |
+
.b8 18
|
83 |
+
.b8 1
|
84 |
+
.b8 0
|
85 |
+
.b8 0
|
86 |
+
.b8 2
|
87 |
+
.b8 46
|
88 |
+
.b8 0
|
89 |
+
.b8 17
|
90 |
+
.b8 1
|
91 |
+
.b8 18
|
92 |
+
.b8 1
|
93 |
+
.b8 64
|
94 |
+
.b8 10
|
95 |
+
.b8 135
|
96 |
+
.b8 64
|
97 |
+
.b8 8
|
98 |
+
.b8 3
|
99 |
+
.b8 8
|
100 |
+
.b8 58
|
101 |
+
.b8 11
|
102 |
+
.b8 59
|
103 |
+
.b8 11
|
104 |
+
.b8 63
|
105 |
+
.b8 12
|
106 |
+
.b8 0
|
107 |
+
.b8 0
|
108 |
+
.b8 0
|
109 |
+
}
|
110 |
+
.section .debug_info
|
111 |
+
{
|
112 |
+
.b32 176
|
113 |
+
.b8 2
|
114 |
+
.b8 0
|
115 |
+
.b32 .debug_abbrev
|
116 |
+
.b8 8
|
117 |
+
.b8 1
|
118 |
+
.b8 116
|
119 |
+
.b8 114
|
120 |
+
.b8 105
|
121 |
+
.b8 116
|
122 |
+
.b8 111
|
123 |
+
.b8 110
|
124 |
+
.b8 0
|
125 |
+
.b8 2
|
126 |
+
.b8 0
|
127 |
+
.b8 99
|
128 |
+
.b8 112
|
129 |
+
.b8 113
|
130 |
+
.b8 104
|
131 |
+
.b8 99
|
132 |
+
.b8 119
|
133 |
+
.b8 109
|
134 |
+
.b8 53
|
135 |
+
.b8 98
|
136 |
+
.b8 102
|
137 |
+
.b8 114
|
138 |
+
.b8 104
|
139 |
+
.b8 117
|
140 |
+
.b8 119
|
141 |
+
.b8 100
|
142 |
+
.b8 100
|
143 |
+
.b8 104
|
144 |
+
.b8 52
|
145 |
+
.b8 99
|
146 |
+
.b8 52
|
147 |
+
.b8 113
|
148 |
+
.b8 107
|
149 |
+
.b8 115
|
150 |
+
.b8 54
|
151 |
+
.b8 98
|
152 |
+
.b8 104
|
153 |
+
.b8 55
|
154 |
+
.b8 115
|
155 |
+
.b8 111
|
156 |
+
.b8 118
|
157 |
+
.b8 102
|
158 |
+
.b8 98
|
159 |
+
.b8 112
|
160 |
+
.b8 102
|
161 |
+
.b8 110
|
162 |
+
.b8 109
|
163 |
+
.b8 113
|
164 |
+
.b8 104
|
165 |
+
.b8 110
|
166 |
+
.b8 109
|
167 |
+
.b8 52
|
168 |
+
.b8 104
|
169 |
+
.b8 52
|
170 |
+
.b8 119
|
171 |
+
.b8 50
|
172 |
+
.b8 51
|
173 |
+
.b8 105
|
174 |
+
.b8 99
|
175 |
+
.b8 113
|
176 |
+
.b8 110
|
177 |
+
.b8 117
|
178 |
+
.b8 54
|
179 |
+
.b8 46
|
180 |
+
.b8 112
|
181 |
+
.b8 121
|
182 |
+
.b8 0
|
183 |
+
.b32 .debug_line
|
184 |
+
.b8 47
|
185 |
+
.b8 116
|
186 |
+
.b8 109
|
187 |
+
.b8 112
|
188 |
+
.b8 47
|
189 |
+
.b8 116
|
190 |
+
.b8 111
|
191 |
+
.b8 114
|
192 |
+
.b8 99
|
193 |
+
.b8 104
|
194 |
+
.b8 105
|
195 |
+
.b8 110
|
196 |
+
.b8 100
|
197 |
+
.b8 117
|
198 |
+
.b8 99
|
199 |
+
.b8 116
|
200 |
+
.b8 111
|
201 |
+
.b8 114
|
202 |
+
.b8 95
|
203 |
+
.b8 114
|
204 |
+
.b8 111
|
205 |
+
.b8 111
|
206 |
+
.b8 116
|
207 |
+
.b8 47
|
208 |
+
.b8 112
|
209 |
+
.b8 113
|
210 |
+
.b8 0
|
211 |
+
.b8 1
|
212 |
+
.b64 $L__func_begin0
|
213 |
+
.b64 $L__func_end0
|
214 |
+
.b8 2
|
215 |
+
.b64 $L__func_begin0
|
216 |
+
.b64 $L__func_end0
|
217 |
+
.b8 1
|
218 |
+
.b8 156
|
219 |
+
.b8 116
|
220 |
+
.b8 114
|
221 |
+
.b8 105
|
222 |
+
.b8 116
|
223 |
+
.b8 111
|
224 |
+
.b8 110
|
225 |
+
.b8 95
|
226 |
+
.b8 95
|
227 |
+
.b8 48
|
228 |
+
.b8 100
|
229 |
+
.b8 49
|
230 |
+
.b8 100
|
231 |
+
.b8 50
|
232 |
+
.b8 100
|
233 |
+
.b8 101
|
234 |
+
.b8 0
|
235 |
+
.b8 116
|
236 |
+
.b8 114
|
237 |
+
.b8 105
|
238 |
+
.b8 116
|
239 |
+
.b8 111
|
240 |
+
.b8 110
|
241 |
+
.b8 95
|
242 |
+
.b8 95
|
243 |
+
.b8 48
|
244 |
+
.b8 100
|
245 |
+
.b8 49
|
246 |
+
.b8 100
|
247 |
+
.b8 50
|
248 |
+
.b8 100
|
249 |
+
.b8 101
|
250 |
+
.b8 0
|
251 |
+
.b8 1
|
252 |
+
.b8 18
|
253 |
+
.b8 1
|
254 |
+
.b8 0
|
255 |
+
}
|
256 |
+
.section .debug_pubnames
|
257 |
+
{
|
258 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
259 |
+
$L__pubNames_start0:
|
260 |
+
.b8 2
|
261 |
+
.b8 0
|
262 |
+
.b32 .debug_info
|
263 |
+
.b32 180
|
264 |
+
.b32 125
|
265 |
+
.b8 116
|
266 |
+
.b8 114
|
267 |
+
.b8 105
|
268 |
+
.b8 116
|
269 |
+
.b8 111
|
270 |
+
.b8 110
|
271 |
+
.b8 95
|
272 |
+
.b8 95
|
273 |
+
.b8 48
|
274 |
+
.b8 100
|
275 |
+
.b8 49
|
276 |
+
.b8 100
|
277 |
+
.b8 50
|
278 |
+
.b8 100
|
279 |
+
.b8 101
|
280 |
+
.b8 0
|
281 |
+
.b32 0
|
282 |
+
$L__pubNames_end0:
|
283 |
+
}
|
284 |
+
.section .debug_pubtypes
|
285 |
+
{
|
286 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
287 |
+
$L__pubTypes_start0:
|
288 |
+
.b8 2
|
289 |
+
.b8 0
|
290 |
+
.b32 .debug_info
|
291 |
+
.b32 180
|
292 |
+
.b32 0
|
293 |
+
$L__pubTypes_end0:
|
294 |
+
}
|
295 |
+
.section .debug_loc { }
|
.triton/dump/a75e14a8d2d1ec8471f1c7b615552f8c/triton_.ttir
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1d2de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
3 |
+
%c256_i32 = arith.constant 256 : i32
|
4 |
+
%0 = tt.get_program_id x : i32
|
5 |
+
%1 = arith.muli %0, %c256_i32 : i32
|
6 |
+
%2 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
|
7 |
+
%3 = tt.splat %1 : (i32) -> tensor<256xi32>
|
8 |
+
%4 = arith.addi %3, %2 : tensor<256xi32>
|
9 |
+
%5 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
|
10 |
+
%6 = tt.addptr %5, %4 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
|
11 |
+
%7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32>
|
12 |
+
%8 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
|
13 |
+
%9 = tt.addptr %8, %4 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
|
14 |
+
%10 = arith.truncf %7 : tensor<256xf32> to tensor<256xbf16>
|
15 |
+
tt.store %9, %10 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16>
|
16 |
+
tt.return
|
17 |
+
}
|
18 |
+
}
|
.triton/dump/ab89f85e55d5ddd9676325b49df9419f/triton_.ptx
ADDED
@@ -0,0 +1,717 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2d3d4d5d6de7de
|
10 |
+
.extern .shared .align 1 .b8 global_smem[];
|
11 |
+
.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
|
12 |
+
|
13 |
+
.visible .entry triton__0d1d2d3d4d5d6de7de(
|
14 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_0,
|
15 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_1,
|
16 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_2,
|
17 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_3,
|
18 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_4,
|
19 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_5,
|
20 |
+
.param .u32 triton__0d1d2d3d4d5d6de7de_param_6,
|
21 |
+
.param .u32 triton__0d1d2d3d4d5d6de7de_param_7
|
22 |
+
)
|
23 |
+
.maxntid 64, 1, 1
|
24 |
+
{
|
25 |
+
.reg .pred %p<25>;
|
26 |
+
.reg .b16 %rs<9>;
|
27 |
+
.reg .b32 %r<87>;
|
28 |
+
.reg .f32 %f<70>;
|
29 |
+
.reg .b64 %rd<17>;
|
30 |
+
.loc 1 18 0
|
31 |
+
$L__func_begin0:
|
32 |
+
.loc 1 18 0
|
33 |
+
|
34 |
+
ld.param.u64 %rd7, [triton__0d1d2d3d4d5d6de7de_param_0];
|
35 |
+
ld.param.u64 %rd8, [triton__0d1d2d3d4d5d6de7de_param_1];
|
36 |
+
$L__tmp0:
|
37 |
+
.loc 1 26 26
|
38 |
+
mov.u32 %r52, %tid.x;
|
39 |
+
and.b32 %r53, %r52, 31;
|
40 |
+
ld.param.u64 %rd9, [triton__0d1d2d3d4d5d6de7de_param_2];
|
41 |
+
ld.param.u64 %rd10, [triton__0d1d2d3d4d5d6de7de_param_3];
|
42 |
+
ld.param.u64 %rd11, [triton__0d1d2d3d4d5d6de7de_param_4];
|
43 |
+
and.b32 %r54, %r52, 63;
|
44 |
+
ld.param.u64 %rd12, [triton__0d1d2d3d4d5d6de7de_param_5];
|
45 |
+
shl.b32 %r55, %r54, 2;
|
46 |
+
.loc 1 23 28
|
47 |
+
mov.u32 %r1, %ctaid.x;
|
48 |
+
.loc 1 30 40
|
49 |
+
shl.b32 %r56, %r1, 8;
|
50 |
+
.loc 1 30 36
|
51 |
+
or.b32 %r57, %r56, %r55;
|
52 |
+
.loc 1 30 30
|
53 |
+
mul.wide.s32 %rd13, %r57, 4;
|
54 |
+
add.s64 %rd1, %rd8, %rd13;
|
55 |
+
mov.b32 %r6, 0;
|
56 |
+
mov.pred %p1, -1;
|
57 |
+
.loc 1 30 46
|
58 |
+
mov.u32 %r2, 0x0;
|
59 |
+
mov.u32 %r3, 0x0;
|
60 |
+
mov.u32 %r4, 0x0;
|
61 |
+
mov.u32 %r5, 0x0;
|
62 |
+
@%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ];
|
63 |
+
@!%p1 mov.u32 %r2, %r6;
|
64 |
+
@!%p1 mov.u32 %r3, %r6;
|
65 |
+
@!%p1 mov.u32 %r4, %r6;
|
66 |
+
@!%p1 mov.u32 %r5, %r6;
|
67 |
+
mov.b32 %f1, %r2;
|
68 |
+
mov.b32 %f2, %r3;
|
69 |
+
mov.b32 %f3, %r4;
|
70 |
+
mov.b32 %f4, %r5;
|
71 |
+
.loc 1 31 30
|
72 |
+
mul.wide.s32 %rd14, %r57, 2;
|
73 |
+
add.s64 %rd2, %rd9, %rd14;
|
74 |
+
.loc 1 31 46
|
75 |
+
mov.u32 %r10, 0x0;
|
76 |
+
mov.u32 %r11, 0x0;
|
77 |
+
@%p1 ld.global.v2.b32 { %r10, %r11 }, [ %rd2 + 0 ];
|
78 |
+
@!%p1 mov.u32 %r10, %r6;
|
79 |
+
@!%p1 mov.u32 %r11, %r6;
|
80 |
+
cvt.u16.u32 %rs1, %r10;
|
81 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r10; }
|
82 |
+
cvt.u16.u32 %rs3, %r11;
|
83 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r11; }
|
84 |
+
.loc 1 31 67
|
85 |
+
cvt.f32.bf16 %r14, %rs1;
|
86 |
+
mov.b32 %f5, %r14;
|
87 |
+
cvt.f32.bf16 %r15, %rs2;
|
88 |
+
mov.b32 %f6, %r15;
|
89 |
+
cvt.f32.bf16 %r16, %rs3;
|
90 |
+
mov.b32 %f7, %r16;
|
91 |
+
cvt.f32.bf16 %r17, %rs4;
|
92 |
+
mov.b32 %f8, %r17;
|
93 |
+
.loc 1 32 31
|
94 |
+
mul.wide.u32 %rd15, %r55, 4;
|
95 |
+
add.s64 %rd3, %rd10, %rd15;
|
96 |
+
.loc 1 32 36
|
97 |
+
mov.u32 %r18, 0x0;
|
98 |
+
mov.u32 %r19, 0x0;
|
99 |
+
mov.u32 %r20, 0x0;
|
100 |
+
mov.u32 %r21, 0x0;
|
101 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r18, %r19, %r20, %r21 }, [ %rd3 + 0 ];
|
102 |
+
@!%p1 mov.u32 %r18, %r6;
|
103 |
+
@!%p1 mov.u32 %r19, %r6;
|
104 |
+
@!%p1 mov.u32 %r20, %r6;
|
105 |
+
@!%p1 mov.u32 %r21, %r6;
|
106 |
+
.loc 1 34 18
|
107 |
+
add.f32 %f9, %f5, %f1;
|
108 |
+
add.f32 %f10, %f6, %f2;
|
109 |
+
add.f32 %f11, %f7, %f3;
|
110 |
+
add.f32 %f12, %f8, %f4;
|
111 |
+
$L__tmp1:
|
112 |
+
.loc 2 233 15
|
113 |
+
add.f32 %f13, %f9, %f10;
|
114 |
+
add.f32 %f14, %f13, %f11;
|
115 |
+
add.f32 %f15, %f14, %f12;
|
116 |
+
$L__tmp2:
|
117 |
+
.loc 2 243 36
|
118 |
+
mov.b32 %r58, %f15;
|
119 |
+
shfl.sync.bfly.b32 %r59, %r58, 16, 31, -1;
|
120 |
+
mov.b32 %f16, %r59;
|
121 |
+
$L__tmp3:
|
122 |
+
.loc 2 233 15
|
123 |
+
add.f32 %f17, %f15, %f16;
|
124 |
+
$L__tmp4:
|
125 |
+
.loc 2 243 36
|
126 |
+
mov.b32 %r60, %f17;
|
127 |
+
shfl.sync.bfly.b32 %r61, %r60, 8, 31, -1;
|
128 |
+
mov.b32 %f18, %r61;
|
129 |
+
$L__tmp5:
|
130 |
+
.loc 2 233 15
|
131 |
+
add.f32 %f19, %f17, %f18;
|
132 |
+
$L__tmp6:
|
133 |
+
.loc 2 243 36
|
134 |
+
mov.b32 %r62, %f19;
|
135 |
+
shfl.sync.bfly.b32 %r63, %r62, 4, 31, -1;
|
136 |
+
mov.b32 %f20, %r63;
|
137 |
+
$L__tmp7:
|
138 |
+
.loc 2 233 15
|
139 |
+
add.f32 %f21, %f19, %f20;
|
140 |
+
$L__tmp8:
|
141 |
+
.loc 2 243 36
|
142 |
+
mov.b32 %r64, %f21;
|
143 |
+
shfl.sync.bfly.b32 %r65, %r64, 2, 31, -1;
|
144 |
+
mov.b32 %f22, %r65;
|
145 |
+
$L__tmp9:
|
146 |
+
.loc 2 233 15
|
147 |
+
add.f32 %f23, %f21, %f22;
|
148 |
+
$L__tmp10:
|
149 |
+
.loc 2 243 36
|
150 |
+
mov.b32 %r66, %f23;
|
151 |
+
shfl.sync.bfly.b32 %r67, %r66, 1, 31, -1;
|
152 |
+
mov.b32 %f24, %r67;
|
153 |
+
$L__tmp11:
|
154 |
+
.loc 2 233 15
|
155 |
+
add.f32 %f25, %f23, %f24;
|
156 |
+
$L__tmp12:
|
157 |
+
.loc 2 243 36
|
158 |
+
setp.eq.s32 %p14, %r53, 0;
|
159 |
+
shr.u32 %r68, %r52, 3;
|
160 |
+
and.b32 %r69, %r68, 4;
|
161 |
+
mov.u32 %r70, global_smem;
|
162 |
+
add.s32 %r26, %r70, %r69;
|
163 |
+
mov.b32 %r27, %f25;
|
164 |
+
@%p14 st.shared.b32 [ %r26 + 0 ], %r27;
|
165 |
+
bar.sync 0;
|
166 |
+
setp.lt.s32 %p15, %r52, 2;
|
167 |
+
shl.b32 %r71, %r52, 2;
|
168 |
+
add.s32 %r29, %r70, %r71;
|
169 |
+
@%p15 ld.shared.b32 %r28, [ %r29 + 0 ];
|
170 |
+
mov.b32 %f26, %r28;
|
171 |
+
shfl.sync.bfly.b32 %r72, %r28, 1, 31, -1;
|
172 |
+
mov.b32 %f27, %r72;
|
173 |
+
$L__tmp13:
|
174 |
+
.loc 2 233 15
|
175 |
+
add.f32 %f28, %f26, %f27;
|
176 |
+
$L__tmp14:
|
177 |
+
.loc 2 243 36
|
178 |
+
and.b32 %r73, %r52, 1;
|
179 |
+
setp.eq.b32 %p23, %r73, 1;
|
180 |
+
not.pred %p24, %p23;
|
181 |
+
and.pred %p16, %p15, %p24;
|
182 |
+
mov.b32 %r31, %f28;
|
183 |
+
@%p16 st.shared.b32 [ %r29 + 0 ], %r31;
|
184 |
+
bar.sync 0;
|
185 |
+
ld.shared.f32 %f29, [global_smem];
|
186 |
+
$L__tmp15:
|
187 |
+
.loc 3 8 15
|
188 |
+
add.f32 %f30, %f29, 0f00000000;
|
189 |
+
$L__tmp16:
|
190 |
+
.loc 1 42 20
|
191 |
+
mov.b32 %r33, %f30;
|
192 |
+
mov.b32 %r34, 1132462080;
|
193 |
+
div.full.f32 %r51, %r33, %r34;
|
194 |
+
mov.b32 %f31, %r51;
|
195 |
+
.loc 1 43 19
|
196 |
+
sub.f32 %f32, %f9, %f31;
|
197 |
+
sub.f32 %f33, %f10, %f31;
|
198 |
+
sub.f32 %f34, %f11, %f31;
|
199 |
+
sub.f32 %f35, %f12, %f31;
|
200 |
+
.loc 1 44 20
|
201 |
+
mul.f32 %f36, %f33, %f33;
|
202 |
+
$L__tmp17:
|
203 |
+
.loc 2 243 36
|
204 |
+
bar.sync 0;
|
205 |
+
$L__tmp18:
|
206 |
+
.loc 2 233 15
|
207 |
+
fma.rn.f32 %f37, %f32, %f32, %f36;
|
208 |
+
fma.rn.f32 %f38, %f34, %f34, %f37;
|
209 |
+
fma.rn.f32 %f39, %f35, %f35, %f38;
|
210 |
+
$L__tmp19:
|
211 |
+
.loc 2 243 36
|
212 |
+
mov.b32 %r74, %f39;
|
213 |
+
shfl.sync.bfly.b32 %r75, %r74, 16, 31, -1;
|
214 |
+
mov.b32 %f40, %r75;
|
215 |
+
$L__tmp20:
|
216 |
+
.loc 2 233 15
|
217 |
+
add.f32 %f41, %f39, %f40;
|
218 |
+
$L__tmp21:
|
219 |
+
.loc 2 243 36
|
220 |
+
mov.b32 %r76, %f41;
|
221 |
+
shfl.sync.bfly.b32 %r77, %r76, 8, 31, -1;
|
222 |
+
mov.b32 %f42, %r77;
|
223 |
+
$L__tmp22:
|
224 |
+
.loc 2 233 15
|
225 |
+
add.f32 %f43, %f41, %f42;
|
226 |
+
$L__tmp23:
|
227 |
+
.loc 2 243 36
|
228 |
+
mov.b32 %r78, %f43;
|
229 |
+
shfl.sync.bfly.b32 %r79, %r78, 4, 31, -1;
|
230 |
+
mov.b32 %f44, %r79;
|
231 |
+
$L__tmp24:
|
232 |
+
.loc 2 233 15
|
233 |
+
add.f32 %f45, %f43, %f44;
|
234 |
+
$L__tmp25:
|
235 |
+
.loc 2 243 36
|
236 |
+
mov.b32 %r80, %f45;
|
237 |
+
shfl.sync.bfly.b32 %r81, %r80, 2, 31, -1;
|
238 |
+
mov.b32 %f46, %r81;
|
239 |
+
$L__tmp26:
|
240 |
+
.loc 2 233 15
|
241 |
+
add.f32 %f47, %f45, %f46;
|
242 |
+
$L__tmp27:
|
243 |
+
.loc 2 243 36
|
244 |
+
mov.b32 %r82, %f47;
|
245 |
+
shfl.sync.bfly.b32 %r83, %r82, 1, 31, -1;
|
246 |
+
mov.b32 %f48, %r83;
|
247 |
+
$L__tmp28:
|
248 |
+
.loc 2 233 15
|
249 |
+
add.f32 %f49, %f47, %f48;
|
250 |
+
$L__tmp29:
|
251 |
+
.loc 2 243 36
|
252 |
+
mov.b32 %r36, %f49;
|
253 |
+
@%p14 st.shared.b32 [ %r26 + 0 ], %r36;
|
254 |
+
bar.sync 0;
|
255 |
+
@%p15 ld.shared.b32 %r37, [ %r29 + 0 ];
|
256 |
+
mov.b32 %f50, %r37;
|
257 |
+
shfl.sync.bfly.b32 %r84, %r37, 1, 31, -1;
|
258 |
+
mov.b32 %f51, %r84;
|
259 |
+
$L__tmp30:
|
260 |
+
.loc 2 233 15
|
261 |
+
add.f32 %f52, %f50, %f51;
|
262 |
+
$L__tmp31:
|
263 |
+
.loc 2 243 36
|
264 |
+
mov.b32 %r40, %f52;
|
265 |
+
@%p16 st.shared.b32 [ %r29 + 0 ], %r40;
|
266 |
+
bar.sync 0;
|
267 |
+
ld.shared.f32 %f53, [global_smem];
|
268 |
+
$L__tmp32:
|
269 |
+
.loc 3 8 15
|
270 |
+
add.f32 %f54, %f53, 0f00000000;
|
271 |
+
$L__tmp33:
|
272 |
+
.loc 1 49 20
|
273 |
+
mov.b32 %r42, %f54;
|
274 |
+
div.full.f32 %r41, %r42, %r34;
|
275 |
+
mov.b32 %f55, %r41;
|
276 |
+
.loc 1 51 20
|
277 |
+
add.f32 %f56, %f55, 0f3727C5AC;
|
278 |
+
.loc 1 52 26
|
279 |
+
rsqrt.approx.ftz.f32 %f57, %f56;
|
280 |
+
.loc 1 32 36
|
281 |
+
mov.b32 %f58, %r21;
|
282 |
+
mov.b32 %f59, %r20;
|
283 |
+
mov.b32 %f60, %r19;
|
284 |
+
mov.b32 %f61, %r18;
|
285 |
+
.loc 1 54 20
|
286 |
+
mul.f32 %f62, %f32, %f57;
|
287 |
+
mul.f32 %f63, %f33, %f57;
|
288 |
+
mul.f32 %f64, %f34, %f57;
|
289 |
+
mul.f32 %f65, %f35, %f57;
|
290 |
+
.loc 1 55 20
|
291 |
+
mul.f32 %f66, %f62, %f61;
|
292 |
+
mul.f32 %f67, %f63, %f60;
|
293 |
+
mul.f32 %f68, %f64, %f59;
|
294 |
+
mul.f32 %f69, %f65, %f58;
|
295 |
+
.loc 1 57 4
|
296 |
+
bar.sync 0;
|
297 |
+
.loc 1 58 28
|
298 |
+
mul.wide.s32 %rd16, %r1, 4;
|
299 |
+
add.s64 %rd4, %rd7, %rd16;
|
300 |
+
.loc 1 58 40
|
301 |
+
setp.eq.s32 %p20, %r54, 0;
|
302 |
+
mov.b32 %r44, %f57;
|
303 |
+
@%p20 st.global.b32 [ %rd4 + 0 ], { %r44 };
|
304 |
+
.loc 1 59 25
|
305 |
+
add.s64 %rd5, %rd12, %rd14;
|
306 |
+
.loc 1 59 48
|
307 |
+
mov.b32 %r45, %f66;
|
308 |
+
cvt.rn.bf16.f32 %rs5, %r45;
|
309 |
+
mov.b32 %r46, %f67;
|
310 |
+
cvt.rn.bf16.f32 %rs6, %r46;
|
311 |
+
mov.b32 %r47, %f68;
|
312 |
+
cvt.rn.bf16.f32 %rs7, %r47;
|
313 |
+
mov.b32 %r48, %f69;
|
314 |
+
cvt.rn.bf16.f32 %rs8, %r48;
|
315 |
+
mov.b32 %r85, {%rs5, %rs6};
|
316 |
+
mov.b32 %r86, {%rs7, %rs8};
|
317 |
+
@%p1 st.global.v2.b32 [ %rd5 + 0 ], { %r85, %r86 };
|
318 |
+
.loc 1 60 25
|
319 |
+
add.s64 %rd6, %rd11, %rd16;
|
320 |
+
.loc 1 60 37
|
321 |
+
@%p20 st.global.b32 [ %rd6 + 0 ], { %r51 };
|
322 |
+
.loc 1 60 4
|
323 |
+
ret;
|
324 |
+
$L__tmp34:
|
325 |
+
$L__func_end0:
|
326 |
+
|
327 |
+
}
|
328 |
+
// .globl __nv_rsqrtf
|
329 |
+
.visible .func (.param .b32 func_retval0) __nv_rsqrtf(
|
330 |
+
.param .b32 __nv_rsqrtf_param_0
|
331 |
+
)
|
332 |
+
{
|
333 |
+
.reg .f32 %f<3>;
|
334 |
+
$L__func_begin1:
|
335 |
+
|
336 |
+
ld.param.f32 %f1, [__nv_rsqrtf_param_0];
|
337 |
+
rsqrt.approx.ftz.f32 %f2, %f1;
|
338 |
+
st.param.f32 [func_retval0+0], %f2;
|
339 |
+
ret;
|
340 |
+
$L__func_end1:
|
341 |
+
|
342 |
+
}
|
343 |
+
.file 1 "/tmp/torchinductor_root/w3/cw35gljjtatzr2ztskwlxndj2nreiih7r3vg5rw4douyaxccqgij.py"
|
344 |
+
.file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
|
345 |
+
.file 3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
|
346 |
+
.section .debug_abbrev
|
347 |
+
{
|
348 |
+
.b8 1
|
349 |
+
.b8 17
|
350 |
+
.b8 1
|
351 |
+
.b8 37
|
352 |
+
.b8 8
|
353 |
+
.b8 19
|
354 |
+
.b8 5
|
355 |
+
.b8 3
|
356 |
+
.b8 8
|
357 |
+
.b8 16
|
358 |
+
.b8 6
|
359 |
+
.b8 27
|
360 |
+
.b8 8
|
361 |
+
.b8 180
|
362 |
+
.b8 66
|
363 |
+
.b8 12
|
364 |
+
.b8 17
|
365 |
+
.b8 1
|
366 |
+
.b8 18
|
367 |
+
.b8 1
|
368 |
+
.b8 0
|
369 |
+
.b8 0
|
370 |
+
.b8 2
|
371 |
+
.b8 46
|
372 |
+
.b8 0
|
373 |
+
.b8 135
|
374 |
+
.b8 64
|
375 |
+
.b8 8
|
376 |
+
.b8 3
|
377 |
+
.b8 8
|
378 |
+
.b8 58
|
379 |
+
.b8 11
|
380 |
+
.b8 59
|
381 |
+
.b8 11
|
382 |
+
.b8 63
|
383 |
+
.b8 12
|
384 |
+
.b8 32
|
385 |
+
.b8 11
|
386 |
+
.b8 0
|
387 |
+
.b8 0
|
388 |
+
.b8 3
|
389 |
+
.b8 46
|
390 |
+
.b8 1
|
391 |
+
.b8 17
|
392 |
+
.b8 1
|
393 |
+
.b8 18
|
394 |
+
.b8 1
|
395 |
+
.b8 64
|
396 |
+
.b8 10
|
397 |
+
.b8 49
|
398 |
+
.b8 19
|
399 |
+
.b8 0
|
400 |
+
.b8 0
|
401 |
+
.b8 4
|
402 |
+
.b8 29
|
403 |
+
.b8 1
|
404 |
+
.b8 49
|
405 |
+
.b8 19
|
406 |
+
.b8 17
|
407 |
+
.b8 1
|
408 |
+
.b8 18
|
409 |
+
.b8 1
|
410 |
+
.b8 88
|
411 |
+
.b8 11
|
412 |
+
.b8 89
|
413 |
+
.b8 11
|
414 |
+
.b8 87
|
415 |
+
.b8 11
|
416 |
+
.b8 0
|
417 |
+
.b8 0
|
418 |
+
.b8 5
|
419 |
+
.b8 29
|
420 |
+
.b8 0
|
421 |
+
.b8 49
|
422 |
+
.b8 19
|
423 |
+
.b8 17
|
424 |
+
.b8 1
|
425 |
+
.b8 18
|
426 |
+
.b8 1
|
427 |
+
.b8 88
|
428 |
+
.b8 11
|
429 |
+
.b8 89
|
430 |
+
.b8 11
|
431 |
+
.b8 87
|
432 |
+
.b8 11
|
433 |
+
.b8 0
|
434 |
+
.b8 0
|
435 |
+
.b8 0
|
436 |
+
}
|
437 |
+
.section .debug_info
|
438 |
+
{
|
439 |
+
.b32 399
|
440 |
+
.b8 2
|
441 |
+
.b8 0
|
442 |
+
.b32 .debug_abbrev
|
443 |
+
.b8 8
|
444 |
+
.b8 1
|
445 |
+
.b8 116
|
446 |
+
.b8 114
|
447 |
+
.b8 105
|
448 |
+
.b8 116
|
449 |
+
.b8 111
|
450 |
+
.b8 110
|
451 |
+
.b8 0
|
452 |
+
.b8 2
|
453 |
+
.b8 0
|
454 |
+
.b8 99
|
455 |
+
.b8 119
|
456 |
+
.b8 51
|
457 |
+
.b8 53
|
458 |
+
.b8 103
|
459 |
+
.b8 108
|
460 |
+
.b8 106
|
461 |
+
.b8 106
|
462 |
+
.b8 116
|
463 |
+
.b8 97
|
464 |
+
.b8 116
|
465 |
+
.b8 122
|
466 |
+
.b8 114
|
467 |
+
.b8 50
|
468 |
+
.b8 122
|
469 |
+
.b8 116
|
470 |
+
.b8 115
|
471 |
+
.b8 107
|
472 |
+
.b8 119
|
473 |
+
.b8 108
|
474 |
+
.b8 120
|
475 |
+
.b8 110
|
476 |
+
.b8 100
|
477 |
+
.b8 106
|
478 |
+
.b8 50
|
479 |
+
.b8 110
|
480 |
+
.b8 114
|
481 |
+
.b8 101
|
482 |
+
.b8 105
|
483 |
+
.b8 105
|
484 |
+
.b8 104
|
485 |
+
.b8 55
|
486 |
+
.b8 114
|
487 |
+
.b8 51
|
488 |
+
.b8 118
|
489 |
+
.b8 103
|
490 |
+
.b8 53
|
491 |
+
.b8 114
|
492 |
+
.b8 119
|
493 |
+
.b8 52
|
494 |
+
.b8 100
|
495 |
+
.b8 111
|
496 |
+
.b8 117
|
497 |
+
.b8 121
|
498 |
+
.b8 97
|
499 |
+
.b8 120
|
500 |
+
.b8 99
|
501 |
+
.b8 99
|
502 |
+
.b8 113
|
503 |
+
.b8 103
|
504 |
+
.b8 105
|
505 |
+
.b8 106
|
506 |
+
.b8 46
|
507 |
+
.b8 112
|
508 |
+
.b8 121
|
509 |
+
.b8 0
|
510 |
+
.b32 .debug_line
|
511 |
+
.b8 47
|
512 |
+
.b8 116
|
513 |
+
.b8 109
|
514 |
+
.b8 112
|
515 |
+
.b8 47
|
516 |
+
.b8 116
|
517 |
+
.b8 111
|
518 |
+
.b8 114
|
519 |
+
.b8 99
|
520 |
+
.b8 104
|
521 |
+
.b8 105
|
522 |
+
.b8 110
|
523 |
+
.b8 100
|
524 |
+
.b8 117
|
525 |
+
.b8 99
|
526 |
+
.b8 116
|
527 |
+
.b8 111
|
528 |
+
.b8 114
|
529 |
+
.b8 95
|
530 |
+
.b8 114
|
531 |
+
.b8 111
|
532 |
+
.b8 111
|
533 |
+
.b8 116
|
534 |
+
.b8 47
|
535 |
+
.b8 119
|
536 |
+
.b8 51
|
537 |
+
.b8 0
|
538 |
+
.b8 1
|
539 |
+
.b64 $L__func_begin0
|
540 |
+
.b64 $L__func_end0
|
541 |
+
.b8 2
|
542 |
+
.b8 116
|
543 |
+
.b8 114
|
544 |
+
.b8 105
|
545 |
+
.b8 116
|
546 |
+
.b8 111
|
547 |
+
.b8 110
|
548 |
+
.b8 95
|
549 |
+
.b8 95
|
550 |
+
.b8 48
|
551 |
+
.b8 100
|
552 |
+
.b8 49
|
553 |
+
.b8 100
|
554 |
+
.b8 50
|
555 |
+
.b8 100
|
556 |
+
.b8 51
|
557 |
+
.b8 100
|
558 |
+
.b8 52
|
559 |
+
.b8 100
|
560 |
+
.b8 53
|
561 |
+
.b8 100
|
562 |
+
.b8 54
|
563 |
+
.b8 100
|
564 |
+
.b8 101
|
565 |
+
.b8 55
|
566 |
+
.b8 100
|
567 |
+
.b8 101
|
568 |
+
.b8 0
|
569 |
+
.b8 116
|
570 |
+
.b8 114
|
571 |
+
.b8 105
|
572 |
+
.b8 116
|
573 |
+
.b8 111
|
574 |
+
.b8 110
|
575 |
+
.b8 95
|
576 |
+
.b8 95
|
577 |
+
.b8 48
|
578 |
+
.b8 100
|
579 |
+
.b8 49
|
580 |
+
.b8 100
|
581 |
+
.b8 50
|
582 |
+
.b8 100
|
583 |
+
.b8 51
|
584 |
+
.b8 100
|
585 |
+
.b8 52
|
586 |
+
.b8 100
|
587 |
+
.b8 53
|
588 |
+
.b8 100
|
589 |
+
.b8 54
|
590 |
+
.b8 100
|
591 |
+
.b8 101
|
592 |
+
.b8 55
|
593 |
+
.b8 100
|
594 |
+
.b8 101
|
595 |
+
.b8 0
|
596 |
+
.b8 1
|
597 |
+
.b8 18
|
598 |
+
.b8 1
|
599 |
+
.b8 1
|
600 |
+
.b8 3
|
601 |
+
.b64 $L__func_begin0
|
602 |
+
.b64 $L__func_end0
|
603 |
+
.b8 1
|
604 |
+
.b8 156
|
605 |
+
.b32 125
|
606 |
+
.b8 4
|
607 |
+
.b32 125
|
608 |
+
.b64 $L__tmp1
|
609 |
+
.b64 $L__tmp14
|
610 |
+
.b8 2
|
611 |
+
.b8 39
|
612 |
+
.b8 58
|
613 |
+
.b8 5
|
614 |
+
.b32 125
|
615 |
+
.b64 $L__tmp1
|
616 |
+
.b64 $L__tmp14
|
617 |
+
.b8 2
|
618 |
+
.b8 243
|
619 |
+
.b8 36
|
620 |
+
.b8 0
|
621 |
+
.b8 5
|
622 |
+
.b32 125
|
623 |
+
.b64 $L__tmp2
|
624 |
+
.b64 $L__tmp15
|
625 |
+
.b8 2
|
626 |
+
.b8 39
|
627 |
+
.b8 58
|
628 |
+
.b8 5
|
629 |
+
.b32 125
|
630 |
+
.b64 $L__tmp15
|
631 |
+
.b64 $L__tmp16
|
632 |
+
.b8 3
|
633 |
+
.b8 39
|
634 |
+
.b8 45
|
635 |
+
.b8 5
|
636 |
+
.b32 125
|
637 |
+
.b64 $L__tmp17
|
638 |
+
.b64 $L__tmp32
|
639 |
+
.b8 2
|
640 |
+
.b8 47
|
641 |
+
.b8 59
|
642 |
+
.b8 4
|
643 |
+
.b32 125
|
644 |
+
.b64 $L__tmp18
|
645 |
+
.b64 $L__tmp31
|
646 |
+
.b8 2
|
647 |
+
.b8 47
|
648 |
+
.b8 59
|
649 |
+
.b8 5
|
650 |
+
.b32 125
|
651 |
+
.b64 $L__tmp18
|
652 |
+
.b64 $L__tmp31
|
653 |
+
.b8 2
|
654 |
+
.b8 243
|
655 |
+
.b8 36
|
656 |
+
.b8 0
|
657 |
+
.b8 5
|
658 |
+
.b32 125
|
659 |
+
.b64 $L__tmp32
|
660 |
+
.b64 $L__tmp33
|
661 |
+
.b8 3
|
662 |
+
.b8 47
|
663 |
+
.b8 45
|
664 |
+
.b8 0
|
665 |
+
.b8 0
|
666 |
+
}
|
667 |
+
.section .debug_pubnames
|
668 |
+
{
|
669 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
670 |
+
$L__pubNames_start0:
|
671 |
+
.b8 2
|
672 |
+
.b8 0
|
673 |
+
.b32 .debug_info
|
674 |
+
.b32 403
|
675 |
+
.b32 125
|
676 |
+
.b8 116
|
677 |
+
.b8 114
|
678 |
+
.b8 105
|
679 |
+
.b8 116
|
680 |
+
.b8 111
|
681 |
+
.b8 110
|
682 |
+
.b8 95
|
683 |
+
.b8 95
|
684 |
+
.b8 48
|
685 |
+
.b8 100
|
686 |
+
.b8 49
|
687 |
+
.b8 100
|
688 |
+
.b8 50
|
689 |
+
.b8 100
|
690 |
+
.b8 51
|
691 |
+
.b8 100
|
692 |
+
.b8 52
|
693 |
+
.b8 100
|
694 |
+
.b8 53
|
695 |
+
.b8 100
|
696 |
+
.b8 54
|
697 |
+
.b8 100
|
698 |
+
.b8 101
|
699 |
+
.b8 55
|
700 |
+
.b8 100
|
701 |
+
.b8 101
|
702 |
+
.b8 0
|
703 |
+
.b32 0
|
704 |
+
$L__pubNames_end0:
|
705 |
+
}
|
706 |
+
.section .debug_pubtypes
|
707 |
+
{
|
708 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
709 |
+
$L__pubTypes_start0:
|
710 |
+
.b8 2
|
711 |
+
.b8 0
|
712 |
+
.b32 .debug_info
|
713 |
+
.b32 403
|
714 |
+
.b32 0
|
715 |
+
$L__pubTypes_end0:
|
716 |
+
}
|
717 |
+
.section .debug_loc { }
|
.triton/dump/ab89f85e55d5ddd9676325b49df9419f/triton_.ttgir
ADDED
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
2 |
+
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
3 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
4 |
+
tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
5 |
+
%cst = arith.constant dense<256> : tensor<256xi32, #blocked>
|
6 |
+
%cst_0 = arith.constant 9.99999974E-6 : f32
|
7 |
+
%cst_1 = arith.constant 2.560000e+02 : f32
|
8 |
+
%cst_2 = arith.constant 0.000000e+00 : f32
|
9 |
+
%c256_i32 = arith.constant 256 : i32
|
10 |
+
%cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked>
|
11 |
+
%cst_4 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked>
|
12 |
+
%0 = tt.get_program_id x : i32
|
13 |
+
%1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked>
|
14 |
+
%2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked>
|
15 |
+
%3 = arith.muli %0, %c256_i32 : i32
|
16 |
+
%4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked>
|
17 |
+
%5 = arith.addi %1, %4 : tensor<256xi32, #blocked>
|
18 |
+
%6 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
|
19 |
+
%7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
|
20 |
+
%8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
|
21 |
+
%9 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
|
22 |
+
%10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
|
23 |
+
%11 = tt.load %10, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
|
24 |
+
%12 = arith.extf %11 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
|
25 |
+
%13 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
|
26 |
+
%14 = tt.addptr %13, %1 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
|
27 |
+
%15 = tt.load %14, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked>
|
28 |
+
%16 = arith.addf %8, %12 : tensor<256xf32, #blocked>
|
29 |
+
%17 = arith.select %2, %16, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
|
30 |
+
%18 = "tt.reduce"(%17) <{axis = 0 : i32}> ({
|
31 |
+
^bb0(%arg8: f32, %arg9: f32):
|
32 |
+
%42 = arith.addf %arg8, %arg9 : f32
|
33 |
+
tt.reduce.return %42 : f32
|
34 |
+
}) : (tensor<256xf32, #blocked>) -> f32
|
35 |
+
%19 = arith.addf %18, %cst_2 : f32
|
36 |
+
%20 = arith.divf %19, %cst_1 : f32
|
37 |
+
%21 = tt.splat %20 : (f32) -> tensor<1xf32, #blocked1>
|
38 |
+
%22 = tt.splat %20 : (f32) -> tensor<256xf32, #blocked>
|
39 |
+
%23 = arith.subf %16, %22 : tensor<256xf32, #blocked>
|
40 |
+
%24 = arith.mulf %23, %23 : tensor<256xf32, #blocked>
|
41 |
+
%25 = arith.select %2, %24, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
|
42 |
+
%26 = "tt.reduce"(%25) <{axis = 0 : i32}> ({
|
43 |
+
^bb0(%arg8: f32, %arg9: f32):
|
44 |
+
%42 = arith.addf %arg8, %arg9 : f32
|
45 |
+
tt.reduce.return %42 : f32
|
46 |
+
}) : (tensor<256xf32, #blocked>) -> f32
|
47 |
+
%27 = arith.addf %26, %cst_2 : f32
|
48 |
+
%28 = arith.divf %27, %cst_1 : f32
|
49 |
+
%29 = arith.addf %28, %cst_0 : f32
|
50 |
+
%30 = tt.extern_elementwise %29 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
|
51 |
+
%31 = tt.splat %30 : (f32) -> tensor<1xf32, #blocked1>
|
52 |
+
%32 = tt.splat %30 : (f32) -> tensor<256xf32, #blocked>
|
53 |
+
%33 = arith.mulf %23, %32 : tensor<256xf32, #blocked>
|
54 |
+
%34 = arith.mulf %33, %15 : tensor<256xf32, #blocked>
|
55 |
+
gpu.barrier
|
56 |
+
%35 = tt.addptr %arg0, %0 : !tt.ptr<f32, 1>, i32
|
57 |
+
%36 = tt.splat %35 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked1>
|
58 |
+
tt.store %36, %31 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32, #blocked1>
|
59 |
+
%37 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
|
60 |
+
%38 = tt.addptr %37, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
|
61 |
+
%39 = arith.truncf %34 : tensor<256xf32, #blocked> to tensor<256xbf16, #blocked>
|
62 |
+
tt.store %38, %39, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16, #blocked>
|
63 |
+
%40 = tt.addptr %arg4, %0 : !tt.ptr<f32, 1>, i32
|
64 |
+
%41 = tt.splat %40 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked1>
|
65 |
+
tt.store %41, %21 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32, #blocked1>
|
66 |
+
tt.return
|
67 |
+
}
|
68 |
+
}
|
.triton/dump/be28ee3793470d9803546f03e49c3edf/triton_.cubin
ADDED
Binary file (17.5 kB). View file
|
|
.triton/dump/be28ee3793470d9803546f03e49c3edf/triton_.ttgir
ADDED
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
2 |
+
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
3 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
4 |
+
tt.func public @triton__0d1d2d3d4d5d6d7d8d9d10de11de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg7: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg8: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg9: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg10: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg11: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
5 |
+
%cst = arith.constant dense<256> : tensor<256xi32, #blocked>
|
6 |
+
%cst_0 = arith.constant 9.99999974E-6 : f32
|
7 |
+
%cst_1 = arith.constant 2.560000e+02 : f32
|
8 |
+
%cst_2 = arith.constant 0.000000e+00 : f32
|
9 |
+
%c256_i32 = arith.constant 256 : i32
|
10 |
+
%cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked>
|
11 |
+
%cst_4 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked>
|
12 |
+
%0 = tt.get_program_id x : i32
|
13 |
+
%1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked>
|
14 |
+
%2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked>
|
15 |
+
%3 = arith.muli %0, %c256_i32 : i32
|
16 |
+
%4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked>
|
17 |
+
%5 = arith.addi %1, %4 : tensor<256xi32, #blocked>
|
18 |
+
%6 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
|
19 |
+
%7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
|
20 |
+
%8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
|
21 |
+
%9 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
|
22 |
+
%10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
|
23 |
+
%11 = tt.load %10, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
|
24 |
+
%12 = arith.extf %11 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
|
25 |
+
%13 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
|
26 |
+
%14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
|
27 |
+
%15 = tt.load %14, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
|
28 |
+
%16 = arith.extf %15 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
|
29 |
+
%17 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
|
30 |
+
%18 = tt.addptr %17, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
|
31 |
+
%19 = tt.load %18, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
|
32 |
+
%20 = arith.extf %19 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
|
33 |
+
%21 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
|
34 |
+
%22 = tt.addptr %21, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
|
35 |
+
%23 = tt.load %22, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
|
36 |
+
%24 = arith.extf %23 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
|
37 |
+
%25 = tt.splat %arg6 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
|
38 |
+
%26 = tt.addptr %25, %1 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
|
39 |
+
%27 = tt.load %26, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked>
|
40 |
+
%28 = arith.addf %8, %12 : tensor<256xf32, #blocked>
|
41 |
+
%29 = arith.addf %28, %16 : tensor<256xf32, #blocked>
|
42 |
+
%30 = arith.addf %29, %20 : tensor<256xf32, #blocked>
|
43 |
+
%31 = arith.addf %30, %24 : tensor<256xf32, #blocked>
|
44 |
+
%32 = arith.select %2, %31, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
|
45 |
+
%33 = "tt.reduce"(%32) <{axis = 0 : i32}> ({
|
46 |
+
^bb0(%arg12: f32, %arg13: f32):
|
47 |
+
%58 = arith.addf %arg12, %arg13 : f32
|
48 |
+
tt.reduce.return %58 : f32
|
49 |
+
}) : (tensor<256xf32, #blocked>) -> f32
|
50 |
+
%34 = arith.addf %33, %cst_2 : f32
|
51 |
+
%35 = arith.divf %34, %cst_1 : f32
|
52 |
+
%36 = tt.splat %35 : (f32) -> tensor<1xf32, #blocked1>
|
53 |
+
%37 = tt.splat %35 : (f32) -> tensor<256xf32, #blocked>
|
54 |
+
%38 = arith.subf %31, %37 : tensor<256xf32, #blocked>
|
55 |
+
%39 = arith.mulf %38, %38 : tensor<256xf32, #blocked>
|
56 |
+
%40 = arith.select %2, %39, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
|
57 |
+
%41 = "tt.reduce"(%40) <{axis = 0 : i32}> ({
|
58 |
+
^bb0(%arg12: f32, %arg13: f32):
|
59 |
+
%58 = arith.addf %arg12, %arg13 : f32
|
60 |
+
tt.reduce.return %58 : f32
|
61 |
+
}) : (tensor<256xf32, #blocked>) -> f32
|
62 |
+
%42 = arith.addf %41, %cst_2 : f32
|
63 |
+
%43 = arith.divf %42, %cst_1 : f32
|
64 |
+
%44 = arith.addf %43, %cst_0 : f32
|
65 |
+
%45 = tt.extern_elementwise %44 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
|
66 |
+
%46 = tt.splat %45 : (f32) -> tensor<1xf32, #blocked1>
|
67 |
+
%47 = tt.splat %45 : (f32) -> tensor<256xf32, #blocked>
|
68 |
+
%48 = arith.mulf %38, %47 : tensor<256xf32, #blocked>
|
69 |
+
%49 = arith.mulf %48, %27 : tensor<256xf32, #blocked>
|
70 |
+
%50 = tt.splat %arg7 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
|
71 |
+
%51 = tt.addptr %50, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
|
72 |
+
tt.store %51, %31, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked>
|
73 |
+
gpu.barrier
|
74 |
+
%52 = tt.addptr %arg0, %0 : !tt.ptr<f32, 1>, i32
|
75 |
+
%53 = tt.splat %52 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked1>
|
76 |
+
tt.store %53, %46 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32, #blocked1>
|
77 |
+
%54 = tt.splat %arg9 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
|
78 |
+
%55 = tt.addptr %54, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
|
79 |
+
tt.store %55, %49, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked>
|
80 |
+
%56 = tt.addptr %arg8, %0 : !tt.ptr<f32, 1>, i32
|
81 |
+
%57 = tt.splat %56 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked1>
|
82 |
+
tt.store %57, %36 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32, #blocked1>
|
83 |
+
tt.return
|
84 |
+
}
|
85 |
+
}
|
.triton/dump/c0c34db8f5ff22b79fc971c63187477a/triton_.cubin
ADDED
Binary file (17.8 kB). View file
|
|
.triton/dump/c0c34db8f5ff22b79fc971c63187477a/triton_.ttir
ADDED
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1d2d3d4d5d6d7d8d9d10de11de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg7: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg8: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg9: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg10: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg11: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
3 |
+
%c256_i32 = arith.constant 256 : i32
|
4 |
+
%cst = arith.constant dense<0.000000e+00> : tensor<256xbf16>
|
5 |
+
%cst_0 = arith.constant 0.000000e+00 : f32
|
6 |
+
%cst_1 = arith.constant 2.560000e+02 : f32
|
7 |
+
%cst_2 = arith.constant 9.99999974E-6 : f32
|
8 |
+
%cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32>
|
9 |
+
%cst_4 = arith.constant dense<256> : tensor<256xi32>
|
10 |
+
%0 = tt.get_program_id x : i32
|
11 |
+
%1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
|
12 |
+
%2 = arith.cmpi slt, %1, %cst_4 : tensor<256xi32>
|
13 |
+
%3 = arith.muli %0, %c256_i32 : i32
|
14 |
+
%4 = tt.splat %3 : (i32) -> tensor<256xi32>
|
15 |
+
%5 = arith.addi %1, %4 : tensor<256xi32>
|
16 |
+
%6 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
|
17 |
+
%7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
|
18 |
+
%8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32>
|
19 |
+
%9 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
|
20 |
+
%10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
|
21 |
+
%11 = tt.load %10, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
|
22 |
+
%12 = arith.extf %11 : tensor<256xbf16> to tensor<256xf32>
|
23 |
+
%13 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
|
24 |
+
%14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
|
25 |
+
%15 = tt.load %14, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
|
26 |
+
%16 = arith.extf %15 : tensor<256xbf16> to tensor<256xf32>
|
27 |
+
%17 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
|
28 |
+
%18 = tt.addptr %17, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
|
29 |
+
%19 = tt.load %18, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
|
30 |
+
%20 = arith.extf %19 : tensor<256xbf16> to tensor<256xf32>
|
31 |
+
%21 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
|
32 |
+
%22 = tt.addptr %21, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
|
33 |
+
%23 = tt.load %22, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
|
34 |
+
%24 = arith.extf %23 : tensor<256xbf16> to tensor<256xf32>
|
35 |
+
%25 = tt.splat %arg6 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
|
36 |
+
%26 = tt.addptr %25, %1 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
|
37 |
+
%27 = tt.load %26, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32>
|
38 |
+
%28 = arith.addf %8, %12 : tensor<256xf32>
|
39 |
+
%29 = arith.addf %28, %16 : tensor<256xf32>
|
40 |
+
%30 = arith.addf %29, %20 : tensor<256xf32>
|
41 |
+
%31 = arith.addf %30, %24 : tensor<256xf32>
|
42 |
+
%32 = arith.select %2, %31, %cst_3 : tensor<256xi1>, tensor<256xf32>
|
43 |
+
%33 = "tt.reduce"(%32) <{axis = 0 : i32}> ({
|
44 |
+
^bb0(%arg12: f32, %arg13: f32):
|
45 |
+
%59 = arith.addf %arg12, %arg13 : f32
|
46 |
+
tt.reduce.return %59 : f32
|
47 |
+
}) : (tensor<256xf32>) -> f32
|
48 |
+
%34 = arith.addf %33, %cst_0 : f32
|
49 |
+
%35 = arith.divf %34, %cst_1 : f32
|
50 |
+
%36 = tt.splat %35 : (f32) -> tensor<1xf32>
|
51 |
+
%37 = tt.splat %35 : (f32) -> tensor<256xf32>
|
52 |
+
%38 = arith.subf %31, %37 : tensor<256xf32>
|
53 |
+
%39 = arith.mulf %38, %38 : tensor<256xf32>
|
54 |
+
%40 = arith.select %2, %39, %cst_3 : tensor<256xi1>, tensor<256xf32>
|
55 |
+
%41 = "tt.reduce"(%40) <{axis = 0 : i32}> ({
|
56 |
+
^bb0(%arg12: f32, %arg13: f32):
|
57 |
+
%59 = arith.addf %arg12, %arg13 : f32
|
58 |
+
tt.reduce.return %59 : f32
|
59 |
+
}) : (tensor<256xf32>) -> f32
|
60 |
+
%42 = arith.addf %41, %cst_0 : f32
|
61 |
+
%43 = arith.divf %42, %cst_1 : f32
|
62 |
+
%44 = arith.addf %43, %cst_2 : f32
|
63 |
+
%45 = tt.extern_elementwise %44 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
|
64 |
+
%46 = tt.splat %45 : (f32) -> tensor<1xf32>
|
65 |
+
%47 = tt.splat %45 : (f32) -> tensor<256xf32>
|
66 |
+
%48 = arith.mulf %38, %47 : tensor<256xf32>
|
67 |
+
%49 = arith.mulf %48, %27 : tensor<256xf32>
|
68 |
+
%50 = tt.splat %arg7 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
|
69 |
+
%51 = tt.addptr %50, %5 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
|
70 |
+
tt.store %51, %31, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32>
|
71 |
+
gpu.barrier
|
72 |
+
%52 = tt.addptr %arg0, %0 : !tt.ptr<f32, 1>, i32
|
73 |
+
%53 = tt.splat %52 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>>
|
74 |
+
tt.store %53, %46 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32>
|
75 |
+
%54 = tt.splat %arg9 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
|
76 |
+
%55 = tt.addptr %54, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
|
77 |
+
%56 = arith.truncf %49 : tensor<256xf32> to tensor<256xbf16>
|
78 |
+
tt.store %55, %56, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16>
|
79 |
+
%57 = tt.addptr %arg8, %0 : !tt.ptr<f32, 1>, i32
|
80 |
+
%58 = tt.splat %57 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>>
|
81 |
+
tt.store %58, %36 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32>
|
82 |
+
tt.return
|
83 |
+
}
|
84 |
+
}
|
.triton/dump/c0db4dd81e5aac83500e3ccf67d3896d/triton_.llir
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !5 {
|
5 |
+
%4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
|
6 |
+
%5 = shl i32 %4, 1, !dbg !8
|
7 |
+
%6 = and i32 %5, 510, !dbg !8
|
8 |
+
%7 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9
|
9 |
+
%8 = shl i32 %7, 9, !dbg !10
|
10 |
+
%9 = or i32 %8, %6, !dbg !11
|
11 |
+
%10 = sext i32 %9 to i64, !dbg !12
|
12 |
+
%11 = getelementptr i16, ptr addrspace(1) %0, i64 %10, !dbg !12
|
13 |
+
%12 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %11, i1 true) #1, !dbg !13
|
14 |
+
%13 = trunc i32 %12 to i16, !dbg !13
|
15 |
+
%extelt.offset = lshr i32 %12, 16, !dbg !13
|
16 |
+
%14 = trunc i32 %extelt.offset to i16, !dbg !13
|
17 |
+
%15 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %13) #1, !dbg !14
|
18 |
+
%16 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %14) #1, !dbg !14
|
19 |
+
%17 = getelementptr float, ptr addrspace(1) %1, i64 %10, !dbg !15
|
20 |
+
%18 = bitcast float %15 to i32, !dbg !16
|
21 |
+
%19 = bitcast float %16 to i32, !dbg !16
|
22 |
+
tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %18, i32 %19, ptr addrspace(1) %17, i1 true) #1, !dbg !16
|
23 |
+
ret void, !dbg !17
|
24 |
+
}
|
25 |
+
|
26 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
27 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
28 |
+
|
29 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
30 |
+
attributes #1 = { nounwind }
|
31 |
+
|
32 |
+
!llvm.module.flags = !{!0}
|
33 |
+
!llvm.dbg.cu = !{!1}
|
34 |
+
!nvvm.annotations = !{!3, !4, !4, !3}
|
35 |
+
|
36 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
37 |
+
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
38 |
+
!2 = !DIFile(filename: "cyamhdbxtmf4rgres6uo7orhfzw3ryhsvm5qzdvyqgggck2hqbyi.py", directory: "/tmp/torchinductor_root/ya")
|
39 |
+
!3 = !{ptr @triton__0d1d2de, !"kernel", i32 1}
|
40 |
+
!4 = !{ptr @triton__0d1d2de, !"maxntidx", i32 256}
|
41 |
+
!5 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
|
42 |
+
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
|
43 |
+
!7 = !{}
|
44 |
+
!8 = !DILocation(line: 21, column: 36, scope: !5)
|
45 |
+
!9 = !DILocation(line: 20, column: 28, scope: !5)
|
46 |
+
!10 = !DILocation(line: 20, column: 33, scope: !5)
|
47 |
+
!11 = !DILocation(line: 21, column: 23, scope: !5)
|
48 |
+
!12 = !DILocation(line: 24, column: 30, scope: !5)
|
49 |
+
!13 = !DILocation(line: 24, column: 35, scope: !5)
|
50 |
+
!14 = !DILocation(line: 24, column: 44, scope: !5)
|
51 |
+
!15 = !DILocation(line: 26, column: 25, scope: !5)
|
52 |
+
!16 = !DILocation(line: 26, column: 36, scope: !5)
|
53 |
+
!17 = !DILocation(line: 26, column: 4, scope: !5)
|
.triton/dump/d7a12c0ba96f8920b8147157303ee99f/triton_.ttir
ADDED
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
3 |
+
%c256_i32 = arith.constant 256 : i32
|
4 |
+
%cst = arith.constant dense<0.000000e+00> : tensor<256xbf16>
|
5 |
+
%cst_0 = arith.constant 0.000000e+00 : f32
|
6 |
+
%cst_1 = arith.constant 2.560000e+02 : f32
|
7 |
+
%cst_2 = arith.constant 9.99999974E-6 : f32
|
8 |
+
%cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32>
|
9 |
+
%cst_4 = arith.constant dense<256> : tensor<256xi32>
|
10 |
+
%0 = tt.get_program_id x : i32
|
11 |
+
%1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
|
12 |
+
%2 = arith.cmpi slt, %1, %cst_4 : tensor<256xi32>
|
13 |
+
%3 = arith.muli %0, %c256_i32 : i32
|
14 |
+
%4 = tt.splat %3 : (i32) -> tensor<256xi32>
|
15 |
+
%5 = arith.addi %1, %4 : tensor<256xi32>
|
16 |
+
%6 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
|
17 |
+
%7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
|
18 |
+
%8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32>
|
19 |
+
%9 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
|
20 |
+
%10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
|
21 |
+
%11 = tt.load %10, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
|
22 |
+
%12 = arith.extf %11 : tensor<256xbf16> to tensor<256xf32>
|
23 |
+
%13 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
|
24 |
+
%14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
|
25 |
+
%15 = tt.load %14, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
|
26 |
+
%16 = arith.extf %15 : tensor<256xbf16> to tensor<256xf32>
|
27 |
+
%17 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
|
28 |
+
%18 = tt.addptr %17, %1 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
|
29 |
+
%19 = tt.load %18, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32>
|
30 |
+
%20 = arith.addf %8, %12 : tensor<256xf32>
|
31 |
+
%21 = arith.addf %20, %16 : tensor<256xf32>
|
32 |
+
%22 = arith.select %2, %21, %cst_3 : tensor<256xi1>, tensor<256xf32>
|
33 |
+
%23 = "tt.reduce"(%22) <{axis = 0 : i32}> ({
|
34 |
+
^bb0(%arg7: f32, %arg8: f32):
|
35 |
+
%41 = arith.addf %arg7, %arg8 : f32
|
36 |
+
tt.reduce.return %41 : f32
|
37 |
+
}) : (tensor<256xf32>) -> f32
|
38 |
+
%24 = arith.addf %23, %cst_0 : f32
|
39 |
+
%25 = arith.divf %24, %cst_1 : f32
|
40 |
+
%26 = tt.splat %25 : (f32) -> tensor<256xf32>
|
41 |
+
%27 = arith.subf %21, %26 : tensor<256xf32>
|
42 |
+
%28 = arith.mulf %27, %27 : tensor<256xf32>
|
43 |
+
%29 = arith.select %2, %28, %cst_3 : tensor<256xi1>, tensor<256xf32>
|
44 |
+
%30 = "tt.reduce"(%29) <{axis = 0 : i32}> ({
|
45 |
+
^bb0(%arg7: f32, %arg8: f32):
|
46 |
+
%41 = arith.addf %arg7, %arg8 : f32
|
47 |
+
tt.reduce.return %41 : f32
|
48 |
+
}) : (tensor<256xf32>) -> f32
|
49 |
+
%31 = arith.addf %30, %cst_0 : f32
|
50 |
+
%32 = arith.divf %31, %cst_1 : f32
|
51 |
+
%33 = arith.addf %32, %cst_2 : f32
|
52 |
+
%34 = tt.extern_elementwise %33 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
|
53 |
+
%35 = tt.splat %34 : (f32) -> tensor<256xf32>
|
54 |
+
%36 = arith.mulf %27, %35 : tensor<256xf32>
|
55 |
+
%37 = arith.mulf %36, %19 : tensor<256xf32>
|
56 |
+
%38 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
|
57 |
+
%39 = tt.addptr %38, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
|
58 |
+
%40 = arith.truncf %37 : tensor<256xf32> to tensor<256xbf16>
|
59 |
+
tt.store %39, %40, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16>
|
60 |
+
tt.return
|
61 |
+
}
|
62 |
+
}
|
.triton/dump/f5088324dcdcf6814f6743553c1321c2/triton_.llir
ADDED
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !5 {
|
5 |
+
%4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
|
6 |
+
%5 = shl i32 %4, 2, !dbg !8
|
7 |
+
%6 = and i32 %5, 508, !dbg !8
|
8 |
+
%7 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9
|
9 |
+
%8 = shl i32 %7, 9, !dbg !10
|
10 |
+
%9 = or i32 %8, %6, !dbg !11
|
11 |
+
%10 = sext i32 %9 to i64, !dbg !12
|
12 |
+
%11 = getelementptr float, ptr addrspace(1) %0, i64 %10, !dbg !12
|
13 |
+
%12 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l,b"(ptr addrspace(1) %11, i1 true) #1, !dbg !13
|
14 |
+
%13 = extractvalue { i32, i32, i32, i32 } %12, 0, !dbg !13
|
15 |
+
%14 = extractvalue { i32, i32, i32, i32 } %12, 1, !dbg !13
|
16 |
+
%15 = extractvalue { i32, i32, i32, i32 } %12, 2, !dbg !13
|
17 |
+
%16 = extractvalue { i32, i32, i32, i32 } %12, 3, !dbg !13
|
18 |
+
%17 = bitcast i32 %13 to float, !dbg !13
|
19 |
+
%18 = bitcast i32 %14 to float, !dbg !13
|
20 |
+
%19 = bitcast i32 %15 to float, !dbg !13
|
21 |
+
%20 = bitcast i32 %16 to float, !dbg !13
|
22 |
+
%21 = getelementptr i16, ptr addrspace(1) %1, i64 %10, !dbg !14
|
23 |
+
%22 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %17) #1, !dbg !15
|
24 |
+
%23 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %18) #1, !dbg !15
|
25 |
+
%24 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %19) #1, !dbg !15
|
26 |
+
%25 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %20) #1, !dbg !15
|
27 |
+
%26 = insertelement <2 x i16> undef, i16 %22, i64 0, !dbg !15
|
28 |
+
%27 = insertelement <2 x i16> %26, i16 %23, i64 1, !dbg !15
|
29 |
+
%28 = bitcast <2 x i16> %27 to i32, !dbg !15
|
30 |
+
%29 = insertelement <2 x i16> undef, i16 %24, i64 0, !dbg !15
|
31 |
+
%30 = insertelement <2 x i16> %29, i16 %25, i64 1, !dbg !15
|
32 |
+
%31 = bitcast <2 x i16> %30 to i32, !dbg !15
|
33 |
+
tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %28, i32 %31, ptr addrspace(1) %21, i1 true) #1, !dbg !15
|
34 |
+
ret void, !dbg !16
|
35 |
+
}
|
36 |
+
|
37 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
38 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
39 |
+
|
40 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
41 |
+
attributes #1 = { nounwind }
|
42 |
+
|
43 |
+
!llvm.module.flags = !{!0}
|
44 |
+
!llvm.dbg.cu = !{!1}
|
45 |
+
!nvvm.annotations = !{!3, !4, !4, !3}
|
46 |
+
|
47 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
48 |
+
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
49 |
+
!2 = !DIFile(filename: "cpqhcwm5bfrhuwddh4c4qks6bh7sovfbpfnmqhnm4h4w23icqnu6.py", directory: "/tmp/torchinductor_root/pq")
|
50 |
+
!3 = !{ptr @triton__0d1d2de, !"kernel", i32 1}
|
51 |
+
!4 = !{ptr @triton__0d1d2de, !"maxntidx", i32 128}
|
52 |
+
!5 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
|
53 |
+
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
|
54 |
+
!7 = !{}
|
55 |
+
!8 = !DILocation(line: 21, column: 36, scope: !5)
|
56 |
+
!9 = !DILocation(line: 20, column: 28, scope: !5)
|
57 |
+
!10 = !DILocation(line: 20, column: 33, scope: !5)
|
58 |
+
!11 = !DILocation(line: 21, column: 23, scope: !5)
|
59 |
+
!12 = !DILocation(line: 24, column: 30, scope: !5)
|
60 |
+
!13 = !DILocation(line: 24, column: 35, scope: !5)
|
61 |
+
!14 = !DILocation(line: 26, column: 25, scope: !5)
|
62 |
+
!15 = !DILocation(line: 26, column: 36, scope: !5)
|
63 |
+
!16 = !DILocation(line: 26, column: 4, scope: !5)
|
.triton/dump/f5088324dcdcf6814f6743553c1321c2/triton_.ptx
ADDED
@@ -0,0 +1,300 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2de
|
10 |
+
|
11 |
+
.visible .entry triton__0d1d2de(
|
12 |
+
.param .u64 triton__0d1d2de_param_0,
|
13 |
+
.param .u64 triton__0d1d2de_param_1,
|
14 |
+
.param .u32 triton__0d1d2de_param_2
|
15 |
+
)
|
16 |
+
.maxntid 128, 1, 1
|
17 |
+
{
|
18 |
+
.reg .pred %p<3>;
|
19 |
+
.reg .b16 %rs<5>;
|
20 |
+
.reg .b32 %r<19>;
|
21 |
+
.reg .b64 %rd<7>;
|
22 |
+
.loc 1 18 0
|
23 |
+
$L__func_begin0:
|
24 |
+
.loc 1 18 0
|
25 |
+
|
26 |
+
ld.param.u64 %rd3, [triton__0d1d2de_param_0];
|
27 |
+
ld.param.u64 %rd4, [triton__0d1d2de_param_1];
|
28 |
+
$L__tmp0:
|
29 |
+
.loc 1 21 36
|
30 |
+
mov.u32 %r12, %tid.x;
|
31 |
+
shl.b32 %r13, %r12, 2;
|
32 |
+
and.b32 %r14, %r13, 508;
|
33 |
+
.loc 1 20 28
|
34 |
+
mov.u32 %r1, %ctaid.x;
|
35 |
+
.loc 1 20 33
|
36 |
+
shl.b32 %r15, %r1, 9;
|
37 |
+
.loc 1 21 23
|
38 |
+
or.b32 %r16, %r15, %r14;
|
39 |
+
.loc 1 24 30
|
40 |
+
mul.wide.s32 %rd5, %r16, 4;
|
41 |
+
add.s64 %rd1, %rd3, %rd5;
|
42 |
+
mov.pred %p1, -1;
|
43 |
+
.loc 1 24 35
|
44 |
+
mov.u32 %r6, 0x0;
|
45 |
+
mov.u32 %r7, 0x0;
|
46 |
+
mov.u32 %r8, 0x0;
|
47 |
+
mov.u32 %r9, 0x0;
|
48 |
+
@%p1 ld.global.v4.b32 { %r6, %r7, %r8, %r9 }, [ %rd1 + 0 ];
|
49 |
+
.loc 1 26 25
|
50 |
+
mul.wide.s32 %rd6, %r16, 2;
|
51 |
+
add.s64 %rd2, %rd4, %rd6;
|
52 |
+
.loc 1 26 36
|
53 |
+
cvt.rn.bf16.f32 %rs1, %r6;
|
54 |
+
cvt.rn.bf16.f32 %rs2, %r7;
|
55 |
+
cvt.rn.bf16.f32 %rs3, %r8;
|
56 |
+
cvt.rn.bf16.f32 %rs4, %r9;
|
57 |
+
mov.b32 %r17, {%rs1, %rs2};
|
58 |
+
mov.b32 %r18, {%rs3, %rs4};
|
59 |
+
@%p1 st.global.v2.b32 [ %rd2 + 0 ], { %r17, %r18 };
|
60 |
+
.loc 1 26 4
|
61 |
+
ret;
|
62 |
+
$L__tmp1:
|
63 |
+
$L__func_end0:
|
64 |
+
|
65 |
+
}
|
66 |
+
.file 1 "/tmp/torchinductor_root/pq/cpqhcwm5bfrhuwddh4c4qks6bh7sovfbpfnmqhnm4h4w23icqnu6.py"
|
67 |
+
.section .debug_abbrev
|
68 |
+
{
|
69 |
+
.b8 1
|
70 |
+
.b8 17
|
71 |
+
.b8 1
|
72 |
+
.b8 37
|
73 |
+
.b8 8
|
74 |
+
.b8 19
|
75 |
+
.b8 5
|
76 |
+
.b8 3
|
77 |
+
.b8 8
|
78 |
+
.b8 16
|
79 |
+
.b8 6
|
80 |
+
.b8 27
|
81 |
+
.b8 8
|
82 |
+
.b8 180
|
83 |
+
.b8 66
|
84 |
+
.b8 12
|
85 |
+
.b8 17
|
86 |
+
.b8 1
|
87 |
+
.b8 18
|
88 |
+
.b8 1
|
89 |
+
.b8 0
|
90 |
+
.b8 0
|
91 |
+
.b8 2
|
92 |
+
.b8 46
|
93 |
+
.b8 0
|
94 |
+
.b8 17
|
95 |
+
.b8 1
|
96 |
+
.b8 18
|
97 |
+
.b8 1
|
98 |
+
.b8 64
|
99 |
+
.b8 10
|
100 |
+
.b8 135
|
101 |
+
.b8 64
|
102 |
+
.b8 8
|
103 |
+
.b8 3
|
104 |
+
.b8 8
|
105 |
+
.b8 58
|
106 |
+
.b8 11
|
107 |
+
.b8 59
|
108 |
+
.b8 11
|
109 |
+
.b8 63
|
110 |
+
.b8 12
|
111 |
+
.b8 0
|
112 |
+
.b8 0
|
113 |
+
.b8 0
|
114 |
+
}
|
115 |
+
.section .debug_info
|
116 |
+
{
|
117 |
+
.b32 176
|
118 |
+
.b8 2
|
119 |
+
.b8 0
|
120 |
+
.b32 .debug_abbrev
|
121 |
+
.b8 8
|
122 |
+
.b8 1
|
123 |
+
.b8 116
|
124 |
+
.b8 114
|
125 |
+
.b8 105
|
126 |
+
.b8 116
|
127 |
+
.b8 111
|
128 |
+
.b8 110
|
129 |
+
.b8 0
|
130 |
+
.b8 2
|
131 |
+
.b8 0
|
132 |
+
.b8 99
|
133 |
+
.b8 112
|
134 |
+
.b8 113
|
135 |
+
.b8 104
|
136 |
+
.b8 99
|
137 |
+
.b8 119
|
138 |
+
.b8 109
|
139 |
+
.b8 53
|
140 |
+
.b8 98
|
141 |
+
.b8 102
|
142 |
+
.b8 114
|
143 |
+
.b8 104
|
144 |
+
.b8 117
|
145 |
+
.b8 119
|
146 |
+
.b8 100
|
147 |
+
.b8 100
|
148 |
+
.b8 104
|
149 |
+
.b8 52
|
150 |
+
.b8 99
|
151 |
+
.b8 52
|
152 |
+
.b8 113
|
153 |
+
.b8 107
|
154 |
+
.b8 115
|
155 |
+
.b8 54
|
156 |
+
.b8 98
|
157 |
+
.b8 104
|
158 |
+
.b8 55
|
159 |
+
.b8 115
|
160 |
+
.b8 111
|
161 |
+
.b8 118
|
162 |
+
.b8 102
|
163 |
+
.b8 98
|
164 |
+
.b8 112
|
165 |
+
.b8 102
|
166 |
+
.b8 110
|
167 |
+
.b8 109
|
168 |
+
.b8 113
|
169 |
+
.b8 104
|
170 |
+
.b8 110
|
171 |
+
.b8 109
|
172 |
+
.b8 52
|
173 |
+
.b8 104
|
174 |
+
.b8 52
|
175 |
+
.b8 119
|
176 |
+
.b8 50
|
177 |
+
.b8 51
|
178 |
+
.b8 105
|
179 |
+
.b8 99
|
180 |
+
.b8 113
|
181 |
+
.b8 110
|
182 |
+
.b8 117
|
183 |
+
.b8 54
|
184 |
+
.b8 46
|
185 |
+
.b8 112
|
186 |
+
.b8 121
|
187 |
+
.b8 0
|
188 |
+
.b32 .debug_line
|
189 |
+
.b8 47
|
190 |
+
.b8 116
|
191 |
+
.b8 109
|
192 |
+
.b8 112
|
193 |
+
.b8 47
|
194 |
+
.b8 116
|
195 |
+
.b8 111
|
196 |
+
.b8 114
|
197 |
+
.b8 99
|
198 |
+
.b8 104
|
199 |
+
.b8 105
|
200 |
+
.b8 110
|
201 |
+
.b8 100
|
202 |
+
.b8 117
|
203 |
+
.b8 99
|
204 |
+
.b8 116
|
205 |
+
.b8 111
|
206 |
+
.b8 114
|
207 |
+
.b8 95
|
208 |
+
.b8 114
|
209 |
+
.b8 111
|
210 |
+
.b8 111
|
211 |
+
.b8 116
|
212 |
+
.b8 47
|
213 |
+
.b8 112
|
214 |
+
.b8 113
|
215 |
+
.b8 0
|
216 |
+
.b8 1
|
217 |
+
.b64 $L__func_begin0
|
218 |
+
.b64 $L__func_end0
|
219 |
+
.b8 2
|
220 |
+
.b64 $L__func_begin0
|
221 |
+
.b64 $L__func_end0
|
222 |
+
.b8 1
|
223 |
+
.b8 156
|
224 |
+
.b8 116
|
225 |
+
.b8 114
|
226 |
+
.b8 105
|
227 |
+
.b8 116
|
228 |
+
.b8 111
|
229 |
+
.b8 110
|
230 |
+
.b8 95
|
231 |
+
.b8 95
|
232 |
+
.b8 48
|
233 |
+
.b8 100
|
234 |
+
.b8 49
|
235 |
+
.b8 100
|
236 |
+
.b8 50
|
237 |
+
.b8 100
|
238 |
+
.b8 101
|
239 |
+
.b8 0
|
240 |
+
.b8 116
|
241 |
+
.b8 114
|
242 |
+
.b8 105
|
243 |
+
.b8 116
|
244 |
+
.b8 111
|
245 |
+
.b8 110
|
246 |
+
.b8 95
|
247 |
+
.b8 95
|
248 |
+
.b8 48
|
249 |
+
.b8 100
|
250 |
+
.b8 49
|
251 |
+
.b8 100
|
252 |
+
.b8 50
|
253 |
+
.b8 100
|
254 |
+
.b8 101
|
255 |
+
.b8 0
|
256 |
+
.b8 1
|
257 |
+
.b8 18
|
258 |
+
.b8 1
|
259 |
+
.b8 0
|
260 |
+
}
|
261 |
+
.section .debug_pubnames
|
262 |
+
{
|
263 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
264 |
+
$L__pubNames_start0:
|
265 |
+
.b8 2
|
266 |
+
.b8 0
|
267 |
+
.b32 .debug_info
|
268 |
+
.b32 180
|
269 |
+
.b32 125
|
270 |
+
.b8 116
|
271 |
+
.b8 114
|
272 |
+
.b8 105
|
273 |
+
.b8 116
|
274 |
+
.b8 111
|
275 |
+
.b8 110
|
276 |
+
.b8 95
|
277 |
+
.b8 95
|
278 |
+
.b8 48
|
279 |
+
.b8 100
|
280 |
+
.b8 49
|
281 |
+
.b8 100
|
282 |
+
.b8 50
|
283 |
+
.b8 100
|
284 |
+
.b8 101
|
285 |
+
.b8 0
|
286 |
+
.b32 0
|
287 |
+
$L__pubNames_end0:
|
288 |
+
}
|
289 |
+
.section .debug_pubtypes
|
290 |
+
{
|
291 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
292 |
+
$L__pubTypes_start0:
|
293 |
+
.b8 2
|
294 |
+
.b8 0
|
295 |
+
.b32 .debug_info
|
296 |
+
.b32 180
|
297 |
+
.b32 0
|
298 |
+
$L__pubTypes_end0:
|
299 |
+
}
|
300 |
+
.section .debug_loc { }
|