0-hero
/

gpt2-pos-encoding-experiment-100B

Model card Files Files and versions Community

0-hero committed on Sep 27, 2024

Commit

934a9ba

verified ·

1 Parent(s): 71c6277

Add files using upload-large-folder tool

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.launchpadlib/api.launchpad.net/cache/api.launchpad.net,devel,-application,vnd.sun.wadl+xml,2f09acb494bdefdbf8ef0d1396a05e86 +0 -0
.local/share/jupyter/nbextensions/printview/main.js +75 -0
.local/share/jupyter/nbextensions/python-markdown/main.js +212 -0
.local/share/jupyter/nbextensions/qtconsole/qtconsole.yaml +6 -0
.local/share/jupyter/nbextensions/rubberband/main.css +12 -0
.local/share/jupyter/nbextensions/rubberband/rubberband.yaml +7 -0
.local/share/jupyter/nbextensions/ruler/ruler.yaml +32 -0
.local/share/jupyter/nbextensions/runtools/main.js +745 -0
.local/share/jupyter/nbextensions/runtools/runtools_lock.png +0 -0
.local/share/jupyter/nbextensions/scratchpad/README.md +14 -0
.local/share/jupyter/nbextensions/skill/README.md +15 -0
.triton/cache/6e97c2a1f7a095255f6dd5de1807841d/cuda_utils.so +0 -0
.triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.ptx +807 -0
.triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.ttir +53 -0
.triton/dump/305a9479aab997a3a16bfe46bb303a50/triton_.cubin +0 -0
.triton/dump/305a9479aab997a3a16bfe46bb303a50/triton_.ptx +1041 -0
.triton/dump/415aac87553b7d064f52694fa7254686/triton_.llir +860 -0
.triton/dump/44b225411009956bfbae22f8bac7d703/triton_.cubin +0 -0
.triton/dump/44b225411009956bfbae22f8bac7d703/triton_.llir +304 -0
.triton/dump/44b225411009956bfbae22f8bac7d703/triton_.ttir +61 -0
.triton/dump/4c6ad48573c74d55ed79384f6b432d50/triton_.ttir +18 -0
.triton/dump/4d7b96448927b8146af43cb9f39e0544/triton_.llir +362 -0
.triton/dump/4d7b96448927b8146af43cb9f39e0544/triton_.ptx +486 -0
.triton/dump/4d7b96448927b8146af43cb9f39e0544/triton_.ttgir +38 -0
.triton/dump/4d7b96448927b8146af43cb9f39e0544/triton_.ttir +37 -0
.triton/dump/645565eaba0a18dd23ef200fe9abb0c0/triton_.cubin +0 -0
.triton/dump/645565eaba0a18dd23ef200fe9abb0c0/triton_.ptx +834 -0
.triton/dump/645565eaba0a18dd23ef200fe9abb0c0/triton_.ttgir +98 -0
.triton/dump/7dc5bb3e5c2bb99527fff34c6fba7810/triton_.llir +42 -0
.triton/dump/7dc5bb3e5c2bb99527fff34c6fba7810/triton_.ttir +17 -0
.triton/dump/99f0a4c15ca0aab38ccdae6c765f7333/triton_.cubin +0 -0
.triton/dump/99f0a4c15ca0aab38ccdae6c765f7333/triton_.llir +333 -0
.triton/dump/99f0a4c15ca0aab38ccdae6c765f7333/triton_.ttgir +68 -0
.triton/dump/9a2fb05196b13393bea452d08e9aaca8/triton_.llir +54 -0
.triton/dump/9a2fb05196b13393bea452d08e9aaca8/triton_.ttgir +19 -0
.triton/dump/a69784da01a97187168f22847465505f/triton_.cubin +0 -0
.triton/dump/a69784da01a97187168f22847465505f/triton_.ttgir +73 -0
.triton/dump/a75e14a8d2d1ec8471f1c7b615552f8c/triton_.cubin +0 -0
.triton/dump/a75e14a8d2d1ec8471f1c7b615552f8c/triton_.ptx +295 -0
.triton/dump/a75e14a8d2d1ec8471f1c7b615552f8c/triton_.ttir +18 -0
.triton/dump/ab89f85e55d5ddd9676325b49df9419f/triton_.ptx +717 -0
.triton/dump/ab89f85e55d5ddd9676325b49df9419f/triton_.ttgir +68 -0
.triton/dump/be28ee3793470d9803546f03e49c3edf/triton_.cubin +0 -0
.triton/dump/be28ee3793470d9803546f03e49c3edf/triton_.ttgir +85 -0
.triton/dump/c0c34db8f5ff22b79fc971c63187477a/triton_.cubin +0 -0
.triton/dump/c0c34db8f5ff22b79fc971c63187477a/triton_.ttir +84 -0
.triton/dump/c0db4dd81e5aac83500e3ccf67d3896d/triton_.llir +53 -0
.triton/dump/d7a12c0ba96f8920b8147157303ee99f/triton_.ttir +62 -0
.triton/dump/f5088324dcdcf6814f6743553c1321c2/triton_.llir +63 -0
.triton/dump/f5088324dcdcf6814f6743553c1321c2/triton_.ptx +300 -0

.launchpadlib/api.launchpad.net/cache/api.launchpad.net,devel,-application,vnd.sun.wadl+xml,2f09acb494bdefdbf8ef0d1396a05e86 ADDED Viewed

The diff for this file is too large to render. See raw diff

.local/share/jupyter/nbextensions/printview/main.js ADDED Viewed

	@@ -0,0 +1,75 @@

+// call "jupyter nbconvert" and open generated html file in new tab
+define([
+    'base/js/namespace',
+    'jquery',
+    'base/js/events',
+    'base/js/utils'
+], function(
+    IPython,
+    $,
+    events,
+    utils
+) {
+    "use strict";
+    var nbconvert_options = '--to html';
+    var extension = '.html';
+	var open_tab = true;
+    /**
+     * Get option from config
+     */
+    var initialize = function () {
+        var config = IPython.notebook.config;
+        if (config.data.hasOwnProperty('printview_nbconvert_options') ) {
+            nbconvert_options = config.data.printview_nbconvert_options;
+            if (nbconvert_options.search('pdf') > 0) extension = '.pdf';
+            if (nbconvert_options.search('slides') > 0) extension = '.slides.html';
+        }
+        if (config.data.hasOwnProperty('printview_open_tab') ) {
+            if (typeof(config.data.printview_open_tab) === "boolean") {
+                open_tab = config.data.printview_open_tab;
+            }
+        }
+    };
+    /**
+     * Call nbconvert using the current notebook server profile
+     *
+     */
+	var callNbconvert = function () {
+        events.off('notebook_saved.Notebook');
+		var kernel = IPython.notebook.kernel;
+		var name = IPython.notebook.notebook_name;
+		var command = 'import os; os.system(\'jupyter nbconvert ' + nbconvert_options + ' \"' + name + '\"\')';
+		function callback() {
+			if (open_tab === true) {
+				var url = utils.splitext(name)[0] + extension;
+				window.open(url, '_blank');
+			}
+		}
+		kernel.execute(command, { shell: { reply : callback } });
+        $('#doPrintView').blur();
+	};
+    /**
+     * Handler of the print view toolbar action: bind callNbconvert to the
+     * 'notebook_saved.Notebook' event and then save the notebook.
+     */
+    var nbconvertPrintView = function () {
+        // presumably the save below is what fires 'notebook_saved.Notebook' — TODO confirm
+        events.on('notebook_saved.Notebook',callNbconvert);
+        IPython.notebook.save_notebook(false);
+	};
+	var load_ipython_extension = function() {
+		$(IPython.toolbar.add_buttons_group([
+			IPython.keyboard_manager.actions.register ({
+				help   : 'Create static print view',
+				icon   : 'fa-print',
+				handler: nbconvertPrintView
+			}, 'create-static-printview',  'printview'),
+		])).find('.btn').attr('id', 'doPrintView');
+        return IPython.notebook.config.loaded.then(initialize);
+	};
+	return {
+        load_ipython_extension : load_ipython_extension
+    };
+});

.local/share/jupyter/nbextensions/python-markdown/main.js ADDED Viewed

	@@ -0,0 +1,212 @@

+// Allow Python-code in markdown cells
+// Encapsulate using {{...}}
+// - You can also return html or markdown from your Python code
+// - You can embed images, however they will be sanitized on reload.
+// TODO: Markdown cells will only be reevaluated when a notebook is dirty
+//       (i.e. you have made changes). If you save it before reevaluating MD cells,
+//       they will show the old value.
+define([
+    'base/js/namespace',
+    'jquery',
+    'require',
+    'notebook/js/cell',
+    'base/js/security',
+    'components/marked/lib/marked',
+    'base/js/events',
+    'notebook/js/textcell'
+], function(IPython, $, requirejs, cell, security, marked, events, textcell) {
+    "use strict";
+    /*
+     * Find Python expression enclosed in {{ }}, execute and add to text as
+     * <span> tags. The actual content gets filled in later by a callback.
+     * Already executed expressions are cached in cell metadata.
+     *
+     * @method execute_python
+     * @param cell {Cell} notebook cell
+     * @param text {String} text in cell
+     */
+    var execute_python = function(cell,text) {
+        /* never execute code in untrusted notebooks */
+        if (IPython.notebook.trusted === false ) {
+            return undefined
+        }
+        /* always clear stored variables if notebook is dirty */
+        if (IPython.notebook.dirty === true ) delete cell.metadata.variables;
+        // search for code in double curly braces: {{}}
+        var found = false;
+        var newtext = text.replace(/{{(.*?)}}/g, function(match,tag,cha) {
+            found = true;
+            if (tag === "") return undefined;
+            var code = tag;
+            var id = 'python_'+cell.cell_id+'_'+cha; /* create an individual ID */
+            var thiscell = cell;
+            var thismatch = tag;
+            /* there a two possible options:
+               a) notebook dirty or variable not stored in metadata: evaluate variable
+               b) notebook clean and variable stored in metadata: display stored value
+            */
+            if (typeof cell.metadata.variables === "undefined") {
+                cell.metadata.variables = {}
+            }
+            var val = cell.metadata.variables[thismatch];
+            if (IPython.notebook.dirty === true || val === undefined || jQuery.isEmptyObject(val)) {
+                cell.metadata.variables[thismatch] = {};
+                var execute_callback = function (out_data)
+                        {
+                        var html;
+                        if (out_data.msg_type === "error") {
+                            var text = "**" + out_data.content.ename + "**: " +  out_data.content.evalue;
+                            html = marked(text);
+                        } else if (out_data.msg_type === "stream") {
+                            html = marked(out_data.content.text);
+                            var t = html.match(/^\s*<p>([\s\S]*?)<\/p>\s*$/); //strip <p> and </p> that marked (maybe) adds and we don't want
+                            html = t !== null ? t[1] : html;
+                            var q = html.match(/^&#39;([\s\S]*?)&#39;$/); // strip quotes from strings
+                            if (q !== null) html = q[1]
+                        } else if (out_data.msg_type === "execute_result" | out_data.msg_type === "display_data" ) {
+                            var ul = out_data.content.data;
+                            if (ul != undefined) {
+                                if (ul['text/latex'] != undefined) {
+                                    html = ul['text/latex'];
+                                } else if (ul['image/svg+xml'] != undefined) {
+                                    var svg = ul['image/svg+xml'];
+                                    /* embed SVG in an <img> tag, still get eaten by sanitizer... */
+                                    svg = btoa(svg);
+                                    html = '<img src="data:image/svg+xml;base64,' + svg + '"/>';
+                                } else if (ul['image/jpeg'] != undefined) {
+                                    var jpeg = ul['image/jpeg'];
+                                    html = '<img src="data:image/jpeg;base64,' + jpeg + '"/>';
+                                } else if (ul['image/png'] != undefined) {
+                                    var png = ul['image/png'];
+                                    html = '<img src="data:image/png;base64,' + png + '"/>';
+                                } else if (ul['text/markdown'] != undefined) {
+                                    html = marked(ul['text/markdown']);
+                                } else if (ul['text/html'] != undefined) {
+                                    html = ul['text/html'];
+                                } else {
+                                    html = marked(ul['text/plain']);
+                                    // [\s\S] is used to also catch newlines
+                                    var t = html.match(/^\s*<p>([\s\S]*?)<\/p>\s*$/); //strip <p> and </p> that marked adds and we don't want
+                                    html = t !== null ? t[1] : html;
+                                    var q = html.match(/^&#39;([\s\S]*?)&#39;$/); // strip quotes from strings
+                                    if (q !== null) html = q[1]
+                                }
+                            }
+                        } else {
+                            return;
+                        }
+                        thiscell.metadata.variables[thismatch] = html;
+                        var el = document.getElementById(id);
+                        el.innerHTML = el.innerHTML + html; // output result
+                    };
+                var callbacks = { iopub : { output: execute_callback } };
+                if (cell.notebook.kernel != null) {
+                    cell.notebook.kernel.execute(code, callbacks, {silent: false, store_history : false, stop_on_error: false });
+                    return "<span id='"+id+"'></span>"; // add HTML tag with ID where output will be placed
+                    }
+                return undefined;
+            } else {
+                /* Notebook not dirty: replace tags with metadata */
+                val = cell.metadata.variables[tag];
+                return "<span id='"+id+"'>"+val+"</span>"
+            }
+        });
+        if (found == true) return newtext;
+        return undefined
+    };
+    /*
+     * Render markdown cell and replace {{...}} with python code
+     *
+     */
+    var render_cell = function(cell) {
+        var element = cell.element.find('div.text_cell_render');
+        var text = execute_python(cell, element[0].innerHTML);
+        if (text !== undefined) {
+            element[0].innerHTML = text;
+            MathJax.Hub.Queue(["Typeset",MathJax.Hub,element[0]]);
+        }
+    };
+	/* force rendering of markdown cell if notebook is dirty */
+	var original_render = textcell.MarkdownCell.prototype.render;
+	textcell.MarkdownCell.prototype.render = function() {
+		if (IPython.notebook.dirty === true) {
+			this.rendered = false
+		}
+		return original_render.apply(this)
+	};
+    var set_trusted_indicator = function() {
+        var ind = $('.notebook-trusted');
+        if (IPython.notebook.trusted === true) {
+            ind.attr('title','Notebook is trusted');
+            ind.removeClass('fa-question');
+            ind.addClass('fa-check');
+        } else {
+            ind.attr('title','Notebook is not trusted');
+            ind.removeClass('fa-check');
+            ind.addClass('fa-question');
+        }
+    };
+   /**
+     * Add CSS file
+     *
+     * @param name filename
+     */
+    var load_css = function (name) {
+        var link = document.createElement("link");
+        link.type = "text/css";
+        link.rel = "stylesheet";
+        link.href = requirejs.toUrl(name);
+        document.getElementsByTagName("head")[0].appendChild(link);
+    };
+   /**
+     * Update all references variables in markdown cells
+     *
+     */
+   var update_md_cells = function () {
+       var ncells = IPython.notebook.ncells();
+       var cells = IPython.notebook.get_cells();
+       for (var i = 0; i < ncells; i++) {
+           var cell = cells[i];
+           if (cell.metadata.hasOwnProperty('variables')) {
+               render_cell(cell)
+           }
+       }
+   };
+    var load_ipython_extension = function() {
+        load_css('./main.css');
+        events.on("rendered.MarkdownCell", function (event, data) {
+            render_cell(data.cell);
+        });
+        events.on("trust_changed.Notebook", set_trusted_indicator);
+        $('#save_widget').append('<i id="notebook-trusted-indicator" class="fa fa-question notebook-trusted" />');
+        set_trusted_indicator();
+        /* Show values stored in metadata on reload */
+        events.on("kernel_ready.Kernel", function () {
+            if (Jupyter.notebook !== undefined && Jupyter.notebook._fully_loaded) {
+                update_md_cells()
+            } else {
+                events.on("notebook_loaded.Notebook", function () {
+                    update_md_cells()
+                })
+            }
+        });
+    };
+    return {
+        load_ipython_extension : load_ipython_extension
+    };
+});

.local/share/jupyter/nbextensions/qtconsole/qtconsole.yaml ADDED Viewed

	@@ -0,0 +1,6 @@

+Type: IPython Notebook Extension
+Name: Launch QTConsole
+Link: README.md
+Description: Launch a QTConsole attached to the running kernel
+Main: qtconsole.js
+Compatibility: 4.x

.local/share/jupyter/nbextensions/rubberband/main.css ADDED Viewed

	@@ -0,0 +1,12 @@

+/* Absolutely positioned box with a dashed red border and no fill; hidden
+   (display: none) by default. Presumably the rubberband selection
+   rectangle - confirm against main.js. */
+.highlight-drag
+{
+    background-color: transparent;
+    border: dashed #ff3333 3px;
+    position: absolute;
+    display: none;
+}
+/* Background colour of cells that carry the "selected" class */
+.cell.selected
+{
+    background-color: #fcfcfc;
+}

.local/share/jupyter/nbextensions/rubberband/rubberband.yaml ADDED Viewed

	@@ -0,0 +1,7 @@

+Type: IPython Notebook Extension
+Name: Rubberband
+Description: The rubberband extension allows selecting multiple cells
+Link: readme.md
+Icon: icon.png
+Main: main.js
+Compatibility: 4.x, 5.x

.local/share/jupyter/nbextensions/ruler/ruler.yaml ADDED Viewed

	@@ -0,0 +1,32 @@

+Type: IPython Notebook Extension
+Name: Ruler
+Description: This extension enables the Ruler CodeMirror feature
+Link: readme.md
+Icon: icon.png
+Main: main.js
+Compatibility: 4.x, 5.x
+Parameters:
+- name: ruler_column
+  input_type: list
+  list_element:
+    input_type: number
+  description: Column where ruler is displayed
+  default: [78]
+- name: ruler_color
+  input_type: list
+  list_element:
+    input_type: color
+  description: Ruler color
+  default: ["#ff0000"]
+- name: ruler_linestyle
+  description: 'Ruler style, e.g. solid, dashed'
+  input_type: list
+  default: ['dashed']
+- name: ruler_do_css_patch
+  description: apply css patch for ruler padding bug in notebook >= 4.3
+  input_type: checkbox
+  default: true

.local/share/jupyter/nbextensions/runtools/main.js ADDED Viewed

	@@ -0,0 +1,745 @@

+// Extended code execution commands and more
+define([
+    'base/js/namespace',
+    'jquery',
+    'require',
+    'base/js/events',
+    'services/config',
+    'base/js/utils',
+    'notebook/js/codecell'
+], function(Jupyter, $, requirejs, events, configmod, utils, codecell) {
+    "use strict";
+    var run_list = []; /* list of cells to be run */
+    // define default config parameter values
+    // (initialize() merges Jupyter.notebook.config.data.runtools over them)
+    var params = {
+        // keyboard shortcuts
+        run_cells_above: 'Alt-a',
+        run_cells_below: 'Alt-b',
+        toggle_marker: 'Alt-t',
+        mark_all_codecells: 'Alt-m',
+        unmark_all_codecells: 'Alt-u',
+        run_marked_cells: 'Alt-r',
+        run_all_cells: 'Alt-x',
+        run_all_cells_ignore_errors: 'Alt-f',
+        stop_execution: 'Ctrl-c',
+        // gutter background colours: marked cell, cell scheduled to run,
+        // cell handed to execute()
+        marked_color: '#20f224',
+        scheduled_color: '#00def0',
+        run_color: '#f30a2d'
+    };
+    /**
+     * Add event if user clicks on codemirror gutter
+     *
+     */
+    function add_gutter_events() {
+        var ncells = Jupyter.notebook.ncells();
+        var cells = Jupyter.notebook.get_cells();
+        for (var i = 0; i < ncells; i++) {
+            var cell = cells[i];
+            if ((cell.cell_type === "code")) {
+                cell.code_mirror.on("gutterClick", changeEvent);
+                if (is_marked(cell)) {
+                    var g = cell.code_mirror.getGutterElement();
+                    $(g).css({
+                        "background-color": params.marked_color
+                    });
+                }
+            }
+        }
+    }
+    /*
+     * Initialize toolbar and gutter after config was loaded
+     */
+    function initialize() {
+        $.extend(true, params, Jupyter.notebook.config.data.runtools);
+        add_gutter_events();
+        /* Add run control buttons to toolbar */
+        $(Jupyter.toolbar.add_buttons_group([
+            Jupyter.keyboard_manager.actions.register ({
+                help: 'Toggle Runtools Toolbar',
+                icon: 'fa-cogs',
+                handler: toggle_toolbar
+            }, 'toggle-runtools-toolbar', 'runtools')
+        ])).find('.btn').attr('id', 'toggle_runtools').css({
+            'outline': 'none'
+        });
+        /* Add keyboard shortcuts */
+        var add_command_shortcuts = {};
+        add_command_shortcuts[params["run_cells_above"]] = {
+            help: 'Run cells above',
+            help_index: 'xa',
+            handler: function() {
+                execute_cells_above();
+                return false;
+            }
+        };
+        add_command_shortcuts[params["run_cells_below"]] = {
+            help: 'Run cells below',
+            help_index: 'aa',
+            handler: function() {
+                execute_cells_below();
+                return false;
+            }
+        };
+        add_command_shortcuts[params["toggle_marker"]] = {
+            help: 'Toggle marker',
+            help_index: 'mt',
+            handler: function() {
+                toggle_marker();
+                return false;
+            }
+        };
+        add_command_shortcuts[params["mark_all_codecells"]] = {
+            help: 'Mark all codecells',
+            help_index: 'ma',
+            handler: function() {
+                mark_all();
+                return false;
+            }
+        };
+        add_command_shortcuts[params["unmark_all_codecells"]] = {
+            help: 'Unmark all codecells',
+            help_index: 'mu',
+            handler: function() {
+                mark_none();
+                return false;
+            }
+        };
+        add_command_shortcuts[params["run_marked_cells"]] = {
+            help: 'Run marked cells',
+            help_index: 'rm',
+            handler: function() {
+                run_marked_cells();
+                return false;
+            }
+        };
+        add_command_shortcuts[params["run_all_cells"]] = {
+            help: 'Run all cells',
+            help_index: 'ra',
+            handler: function() {
+                var pos = Jupyter.notebook.element.scrollTop();
+                execute_all_cells();
+                Jupyter.notebook.element.animate({
+                    scrollTop: pos
+                }, 100);
+                return false;
+            }
+        };
+        add_command_shortcuts[params["run_all_cells_ignore_errors"]] = {
+            help: 'Run all cells - ignore errors',
+            help_index: 'rf',
+            handler: function() {
+                run_all_cells_ignore_errors();
+                return false;
+            }
+        };
+        Jupyter.keyboard_manager.command_shortcuts.add_shortcuts(add_command_shortcuts);
+        Jupyter.keyboard_manager.edit_shortcuts.add_shortcuts(add_command_shortcuts);
+        events.on('finished_execute.CodeCell', finished_execute_event);
+    }
+    /**
+     * Hide or show a cell
+     *
+     * @param cell
+     * @param io 'i' for cell input, 'o' for cell output
+     * @param showme {Boolean} show (true) or hide (false) cell
+     */
+    function showCell(cell, io, showme) {
+        if (io === 'i') {
+            if (showme === true) {
+                cell.element.find("div.input").show();
+                cell.metadata.hide_input = false;
+            } else {
+                cell.element.find("div.input").hide();
+                cell.metadata.hide_input = true;
+            }
+        } else {
+            if (showme === true) {
+                cell.element.find('div.output').show();
+                cell.metadata.hide_output = false;
+            } else {
+                cell.element.find('div.output').hide();
+                cell.metadata.hide_output = true;
+            }
+        }
+    }
+    function _show_input_output_of_marked(show, char) {
+        var cells = Jupyter.notebook.get_cells();
+        var ncells = cells.length;
+        for (var i = 0; i < ncells; i++) {
+            var _cell = cells[i];
+            if (is_marked(_cell))
+                showCell(_cell, char, show);
+        }
+    }
+    /**
+     * Hide or show input of all marked code cells
+     * (delegates to _show_input_output_of_marked with 'i')
+     *
+     * @param show {Boolean} show (true) or hide (false) code cells
+     */
+    function show_input(show) {
+        _show_input_output_of_marked(show, 'i');
+    }
+    /**
+     * Hide or show output area of all marked code cells
+     * (delegates to _show_input_output_of_marked with 'o')
+     *
+     * @param {Boolean} show show (true) or hide (false)
+     */
+    function show_output(show) {
+        _show_input_output_of_marked(show, 'o');
+    }
+    /**
+     * Execute next cell in run list, if it is still marked
+     *
+     */
+    function execute_next_marked_cell() {
+        var cells = Jupyter.notebook.get_cells();
+        var end = cells.length;
+        while (run_list.length > 0) {
+            var runcell = run_list.shift();
+            for (var i = 0; i < end; i++) {
+                if (runcell === cells[i]) {
+                    if (runcell.metadata.run_control !== undefined && runcell.metadata.run_control.marked === true) {
+                        var g = runcell.code_mirror.getGutterElement();
+                        $(g).css({
+                            "background-color": params.run_color
+                        });
+                        runcell.execute();
+                        return;
+                    }
+                }
+            }
+        }
+    }
+    function _execute_without_selecting(idx_start, idx_end, stop_on_error) {
+        // notebook.execute_cells alters selection, this doesn't
+        var cells = Jupyter.notebook.get_cells();
+        idx_start = idx_start !== undefined ? idx_start : 0;
+        idx_end = idx_end !== undefined ? idx_end : cells.length;
+        for (var ii = idx_start; ii < idx_end; ii++) {
+            cells[ii].execute(stop_on_error);
+        }
+    }
+    /**
+     * Execute all cells before the selected one (the selected cell itself
+     * is not executed, its index is the exclusive end)
+     */
+    function execute_cells_above() {
+        _execute_without_selecting(0, Jupyter.notebook.get_selected_index());
+    }
+    /**
+     * Execute the selected cell and all cells after it (the selected index
+     * is the inclusive start, the end defaults to the last cell)
+     */
+    function execute_cells_below() {
+        _execute_without_selecting(Jupyter.notebook.get_selected_index(), undefined);
+    }
+    /**
+     * Execute every cell of the notebook
+     *
+     * @param stop_on_error handed through to each cell's execute()
+     */
+    function execute_all_cells(stop_on_error) {
+        _execute_without_selecting(0, undefined, stop_on_error);
+    }
+    /**
+     * Run code cells marked in metadata
+     *
+     */
+    function run_marked_cells() {
+        var cells = Jupyter.notebook.get_cells();
+        var end = cells.length;
+        run_list = [];
+        /* Show all marked cells as scheduled to be run with new gutter background color  */
+        for (var i = 0; i < end; i++) {
+            var cell = cells[i];
+            if (cell instanceof codecell.CodeCell) {
+                var last_line = cell.code_mirror.lastLine();
+                var cell_empty = ( last_line === 0 && cell.code_mirror.getLine(last_line) === "");
+                if (cell.metadata.run_control !== undefined && cell_empty === false) {
+                    if (cell.metadata.run_control.marked === true) {
+                        var g = cell.code_mirror.getGutterElement();
+                        $(g).css({
+                            "background-color": params.scheduled_color
+                        });
+                        run_list.push(cell);
+                    }
+                }
+            }
+        }
+        execute_next_marked_cell();
+    }
+    /*
+     * Execute next cell in run_list when notified execution of last cell has been finished
+     * @param evt Event
+     * @param data Cell that has finished executing
+     */
+    var finished_execute_event = function(evt, data) {
+        var cell = data.cell;
+        /* Reset gutter color no non-queued state */
+        if (is_marked(cell)) {
+            var g = cell.code_mirror.getGutterElement();
+            $(g).css({
+                "background-color": params.marked_color
+            });
+        }
+        execute_next_marked_cell();
+    };
+    /**
+     *
+     * @param cell
+     * @param value
+     */
+    function setCell(cell, value) {
+        if (!(cell instanceof codecell.CodeCell)) return;
+        if (cell.metadata.run_control === undefined) cell.metadata.run_control = {};
+        if (cell.metadata.run_control.marked === undefined) cell.metadata.run_control.marked = false;
+        if (value === undefined) value = !cell.metadata.run_control.marked;
+        var g = cell.code_mirror.getGutterElement();
+        if (value === false) {
+            cell.metadata.run_control.marked = false;
+            $(g).css({
+                "background-color": ""
+            });
+        } else {
+            cell.metadata.run_control.marked = true;
+            $(g).css({
+                "background-color": params.marked_color
+            });
+        }
+    }
+    function setCellsMarked(cells, value) {
+        var ncells = cells.length;
+        for (var i = 0; i < ncells; i++) {
+            setCell(cells[i], value);
+        }
+    }
+    /**
+     * Toggle code cell marker of all currently selected cells
+     * (value undefined makes setCell invert each cell's marker)
+     */
+    function toggle_marker() {
+        setCellsMarked(Jupyter.notebook.get_selected_cells(), undefined);
+    }
+    /**
+     * Mark all cells of the notebook (setCell skips non-code cells)
+     */
+    function mark_all() {
+        setCellsMarked(Jupyter.notebook.get_cells(), true);
+    }
+    /**
+     * Remove the marker from all cells of the notebook
+     */
+    function mark_none() {
+        setCellsMarked(Jupyter.notebook.get_cells(), false);
+    }
+    /**
+     *
+     * @param cell notebook cell instance
+     * @param state {string} state to be display [ '', 'locked', 'executed', 'modified' ]
+     */
+    function set_cell_state(cell, state) {
+        var icon = "";
+        if (state === 'locked') {
+            icon = '<div class="fa fa-lock" style="font-size:70%;" /div>'
+        }
+        cell.code_mirror.setGutterMarker(0, "CodeMirror-cellstate", celltypeMarker(icon))
+    }
+    /**
+     * Change event to mark/unmark cell
+     *
+     * @param cm codemirror instance
+     * @param line current line
+     * @param gutter not used
+     */
+    function changeEvent(cm, line, gutter) {
+        if (gutter === "CodeMirror-foldgutter") return; /* Don't collide with codefolding extension */
+        var cmline = cm.doc.children[0].lines[line];
+        if (cmline === undefined) {
+            return;
+        }
+        var cell = $(cm.display.gutters).closest('.cell').data('cell');
+        if (cell.metadata.run_control === undefined)
+            cell.metadata.run_control = {};
+        setCell(cell, !cell.metadata.run_control.marked);
+    }
+    /**
+     *
+     * @param cell cell to be tested
+     * @returns {boolean} true if marked
+     */
+    var is_marked = function(cell) {
+        return (cell instanceof codecell.CodeCell) &&
+            cell.metadata.run_control !== undefined &&
+            cell.metadata.run_control.marked;
+    };
+    /**
+     * Return div element to set in cellstate gutter
+     *
+     * @param val HTML string
+     * @returns {Element} div Element
+     */
+    function celltypeMarker(val) {
+        var marker = document.createElement("div");
+        marker.style.color = "#822";
+        marker.innerHTML = val;
+        return marker;
+    }
+    /**
+     * Lock/Unlock current code cell
+     *             if (cell.metadata.run_control != undefined && cell.metadata.run_control.read_only) {
+     *                     cell.code_mirror.setOption('readOnly', cell.metadata.run_control.read_only);
+     */
+    var lock_cell = function(locked) {
+        var ncells = Jupyter.notebook.ncells();
+        for (var i = ncells - 2; i >= 0; i--) {
+            var cells = Jupyter.notebook.get_cells();
+            if ((cells[i].cell_type === "code") && is_marked(cells[i])) {
+                if (locked === true) {
+                    cells[i].metadata.editable = false;
+                    set_cell_state(cells[i], 'locked')
+                } else {
+                    cells[i].metadata.editable = true;
+                    set_cell_state(cells[i], '')
+                }
+            }
+        }
+    };
+    /**
+     * Execute all cells and don't stop on errors
+     * (calls execute_all_cells with stop_on_error = false)
+     *
+     */
+    var run_all_cells_ignore_errors = function() {
+        execute_all_cells(false);
+    };
+    /**
+     * Create floating toolbar
+     *
+     */
+    var create_runtools_div = function() {
+        var btn = '<div class="btn-toolbar">\
+            <div class="btn-group">\
+                <button type="button" id="run_c" class="btn-primary fa fa-step-forward" title="Run current cell"></button>\
+                <button type="button" id="run_ca" class="btn-primary fa icon-run-to" title="' +
+            'Run cells above (' + params["run_cells_above"] + ')"</button>\
+                <button type="button" id="run_cb" class="btn-primary fa icon-run-from" title="' +
+            'Run cells below (' + params["run_cells_below"] + ')"</button>\
+                <button type="button" id="run_a" class="btn-primary fa icon-run-all" title="' +
+            'Run all cells (' + params["run_all_cells"] + ')"</button>\
+                <button type="button" id="run_af" class="btn-primary fa icon-run-all-forced" title="' +
+            'Run all - ignore errors (' + params["run_all_cells_ignore_errors"] + ')"</button>\
+                <button type="button" id="run_m" class="btn-primary fa icon-run-marked" title="' +
+            'Run marked codecells (' + params["run_marked_cells"] + ')"</button>\
+                <button type="button" id="interrupt_b" class="btn-primary fa fa-stop" title="' +
+            'Stop execution (' + params["stop_execution"] + ')"</button>\
+            </div>\
+            <div class="btn-group">\
+                <button type="button" id="mark_toggle" class="btn-primary fa icon-mark-toggle" title="Mark single code cell"></button>\
+                <button type="button" id="mark_all" class="btn-primary fa icon-mark-all" title="Mark all code cells"></button>\
+                <button type="button" id="mark_none" class="btn-primary fa icon-mark-none" title="Unmark all code cells"></button>\
+            </div>\
+            <div class="btn-group">\
+                <button type="button" id="show_input" class="btn-primary fa icon-show-input" title="Show input of code cell"></button>\
+                <button type="button" id="hide_input" class="btn-primary fa icon-hide-input" title="Hide input of code cell"></button>\
+                <button type="button" id="show_output" class="btn-primary fa icon-show-output" title="Show output of code cell"></button>\
+                <button type="button" id="hide_output" class="btn-primary fa icon-hide-output" title="Hide output of code cell"></button>\
+                <button type="button" id="lock_marked" class="btn-primary fa fa-lock" title="Lock marked cells"></button>\
+                <button type="button" id="unlock_marked" class="btn-primary fa fa-unlock" title="Unlock marked cells"></button>\
+            </div>\
+            </div>';
+        var runtools_wrapper = $('<div id="runtools-wrapper">')
+            .text("Runtools")
+            .append(btn)
+            .draggable()
+            .append("</div>");
+        $("#header").append(runtools_wrapper);
+        $("#runtools-wrapper").css({
+            'position': 'absolute'
+        });
+        $('#run_c').on('click', function(e) {
+                var idx = Jupyter.notebook.get_selected_index();
+                _execute_without_selecting(idx, idx + 1);
+                e.target.blur();
+            })
+            .tooltip({
+                delay: {
+                    show: 500,
+                    hide: 100
+                }
+            });
+        $('#run_ca').on('click', function(e) {
+                execute_cells_above();
+                e.target.blur();
+            })
+            .tooltip({
+                delay: {
+                    show: 500,
+                    hide: 100
+                }
+            });
+        $('#run_cb').on('click', function(e) {
+                execute_cells_below();
+                e.target.blur();
+            })
+            .tooltip({
+                delay: {
+                    show: 500,
+                    hide: 100
+                }
+            });
+        $('#run_a').on('click', function(e) {
+                execute_all_cells();
+                e.target.blur();
+            })
+            .tooltip({
+                delay: {
+                    show: 500,
+                    hide: 100
+                }
+            });
+        $('#run_af').on('click', function(e) {
+                run_all_cells_ignore_errors();
+                e.target.blur()
+            })
+            .tooltip({
+                delay: {
+                    show: 500,
+                    hide: 100
+                }
+            });
+        $('#run_m').on('click', function(e) {
+                run_marked_cells();
+                e.target.blur()
+            })
+            .tooltip({
+                delay: {
+                    show: 500,
+                    hide: 100
+                }
+            });
+        $('#interrupt_b').on('click', function(e) {
+                interrupt_execution();
+                e.target.blur()
+            })
+            .tooltip({
+                delay: {
+                    show: 500,
+                    hide: 100
+                }
+            });
+        $('#mark_toggle').on('click', function() {
+                toggle_marker()
+            })
+            .tooltip({
+                delay: {
+                    show: 500,
+                    hide: 100
+                }
+            });
+        $('#mark_all').on('click', function() {
+                mark_all()
+            })
+            .tooltip({
+                delay: {
+                    show: 500,
+                    hide: 100
+                }
+            });
+        $('#mark_none').on('click', function() {
+                mark_none()
+            })
+            .tooltip({
+                delay: {
+                    show: 500,
+                    hide: 100
+                }
+            });
+        $('#show_input').on('click', function() {
+                show_input(true);
+                this.blur()
+            })
+            .tooltip({
+                delay: {
+                    show: 500,
+                    hide: 100
+                }
+            });
+        $('#hide_input').on('click', function() {
+                show_input(false);
+                this.blur()
+            })
+            .tooltip({
+                delay: {
+                    show: 500,
+                    hide: 100
+                }
+            });
+        $('#show_output').on('click', function() {
+                show_output(true);
+                this.blur()
+            })
+            .tooltip({
+                delay: {
+                    show: 500,
+                    hide: 100
+                }
+            });
+        $('#hide_output').on('click', function() {
+                show_output(false);
+                this.blur()
+            })
+            .tooltip({
+                delay: {
+                    show: 500,
+                    hide: 100
+                }
+            });
+        $('#lock_marked').on('click', function() {
+                lock_cell(true);
+                this.blur()
+            })
+            .tooltip({
+                delay: {
+                    show: 500,
+                    hide: 100
+                }
+            });
+        $('#unlock_marked').on('click', function() {
+                lock_cell(false);
+                this.blur()
+            })
+            .tooltip({
+                delay: {
+                    show: 500,
+                    hide: 100
+                }
+            });
+    };
+    /**
+     * Show/hide toolbar
+     *
+     */
+    var toggle_toolbar = function() {
+        var dom = $("#runtools-wrapper");
+        if (dom.is(':visible')) {
+            $('#toggle_runtools').removeClass('active').blur();
+            dom.hide();
+        } else {
+            $('#toggle_runtools').addClass('active');
+            dom.show();
+        }
+        if (dom.length === 0) {
+            create_runtools_div()
+        }
+    };
+    /**
+     * Add CSS file
+     *
+     * @param name filename
+     */
+    var load_css = function(name) {
+        var link = document.createElement("link");
+        link.type = "text/css";
+        link.rel = "stylesheet";
+        link.href = requirejs.toUrl(name);
+        document.getElementsByTagName("head")[0].appendChild(link);
+    };
+    /**
+     * Add gutter to a new cell
+     *
+     * @param event
+     * @param nbcell
+     *
+     */
+    var createCell = function(event, nbcell) {
+        var cell = nbcell.cell;
+        if (cell instanceof codecell.CodeCell) {
+            var gutters = cell.code_mirror.getOption('gutters').slice();
+            if ($.inArray("CodeMirror-cellstate", gutters) < 0) {
+                gutters.push('CodeMirror-cellstate');
+                cell.code_mirror.setOption('gutters', gutters);
+                cell.code_mirror.on("gutterClick", changeEvent);
+            }
+        }
+    };
+    /**
+     * Initialize all cells with new gutter
+     */
+    var initGutter = function() {
+        var cells = Jupyter.notebook.get_cells();
+        var ncells = cells.length;
+        for (var i = 0; i < ncells; i++) {
+            var cell = cells[i];
+            if (cell instanceof codecell.CodeCell) {
+                var gutters = cell.code_mirror.getOption('gutters').slice();
+                if ($.inArray("CodeMirror-cellstate", gutters) < 0) {
+                    gutters.push('CodeMirror-cellstate');
+                    cell.code_mirror.setOption('gutters', gutters);
+                }
+            }
+            /**
+             * Restore hide/show status after reload
+             */
+            if (cell.metadata.hasOwnProperty('hide_input') && cell.metadata.hide_input === true)
+                showCell(cell, 'i', false);
+            if (cell.metadata.hasOwnProperty('hide_output') && cell.metadata.hide_output === true)
+                showCell(cell, 'o', false);
+            if (cell.is_editable() === false) {
+                set_cell_state(cell, 'locked');
+            }
+            cell.code_mirror.refresh();
+        }
+        events.on('create.Cell', createCell);
+    };
+    /**
+     * Called from notebook after extension was loaded
+     *
+     */
+    var load_extension = function() {
+        load_css('./main.css');
+        load_css('./gutter.css'); /* set gutter width */
+        requirejs(['./cellstate'], function() {
+            if (Jupyter.notebook._fully_loaded) {
+                initGutter();
+            } else {
+                events.one('notebook_loaded.Notebook', initGutter);
+            }
+        });
+        Jupyter.notebook.config.loaded.then(initialize);
+    };
+    return {
+        load_jupyter_extension: load_extension,
+        load_ipython_extension: load_extension
+    };
+});

.local/share/jupyter/nbextensions/runtools/runtools_lock.png ADDED Viewed

.local/share/jupyter/nbextensions/scratchpad/README.md ADDED Viewed

	@@ -0,0 +1,14 @@

+# Scratchpad notebook extension
+Adds a scratchpad cell to Jupyter notebook.
+This is a cell in which you can execute code against the current kernel without modifying the notebook document.
+Scratchpad cells can be executed using `Shift-Enter` (other shortcuts are applied to the notebook document). The scratchpad can be toggled by clicking the icon in the bottom-right, or via the keyboard shortcut `Ctrl-B`.
+![demo](demo.gif)
+## Credits
+This extension is a copy of the extension from MinRK here:
+`git clone git://github.com/minrk/nbextension-scratchpad`.

.local/share/jupyter/nbextensions/skill/README.md ADDED Viewed

	@@ -0,0 +1,15 @@

+# SKILL for CodeMirror
+This extension provides a *SKILL* mode for the CodeMirror editor.
+The extension adds a MIME type `x-skill` and a mode `skill` that can be
+used with CodeMirror.
+## About SKILL
+From [Wikipedia](https://en.wikipedia.org/wiki/Cadence_SKILL):
+SKILL is a Lisp dialect used as a scripting language and PCell (parameterized
+cells) description language used in many EDA software suites by Cadence Design
+Systems (e.g. Cadence Allegro and Cadence Virtuoso).
+## Notes
+This extension was written to enhance the Virtuoso kernel for Jupyter
+(https://github.com/benvarkey/JuVi).

.triton/cache/6e97c2a1f7a095255f6dd5de1807841d/cuda_utils.so ADDED Viewed

Binary file (28 kB). View file

.triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.ptx ADDED Viewed

	@@ -0,0 +1,807 @@

+//
+// Generated by LLVM NVPTX Back-End
+//
+.version 8.2
+.target sm_89
+.address_size 64
+	// .globl	triton__0d1d2d3d4d5d6d7d8de9de
+.extern .shared .align 1 .b8 global_smem[];
+.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
+.visible .entry triton__0d1d2d3d4d5d6d7d8de9de(
+	.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_0,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_1,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_2,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_3,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_4,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_5,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_6,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_7,
+	.param .u32 triton__0d1d2d3d4d5d6d7d8de9de_param_8,
+	.param .u32 triton__0d1d2d3d4d5d6d7d8de9de_param_9
+)
+.maxntid 64, 1, 1
+{
+	.reg .pred 	%p<33>;
+	.reg .b16 	%rs<21>;
+	.reg .b32 	%r<112>;
+	.reg .f32 	%f<94>;
+	.reg .b64 	%rd<20>;
+	.loc	1 18 0
+$L__func_begin0:
+	.loc	1 18 0
+	ld.param.u64 	%rd9, [triton__0d1d2d3d4d5d6d7d8de9de_param_0];
+	ld.param.u64 	%rd10, [triton__0d1d2d3d4d5d6d7d8de9de_param_1];
+$L__tmp0:
+	.loc	1 26 26
+	mov.u32 	%r78, %tid.x;
+	and.b32  	%r79, %r78, 31;
+	ld.param.u64 	%rd11, [triton__0d1d2d3d4d5d6d7d8de9de_param_2];
+	ld.param.u64 	%rd12, [triton__0d1d2d3d4d5d6d7d8de9de_param_3];
+	ld.param.u64 	%rd13, [triton__0d1d2d3d4d5d6d7d8de9de_param_4];
+	shl.b32 	%r80, %r78, 2;
+	ld.param.u64 	%rd14, [triton__0d1d2d3d4d5d6d7d8de9de_param_5];
+	and.b32  	%r81, %r80, 252;
+	ld.param.u64 	%rd15, [triton__0d1d2d3d4d5d6d7d8de9de_param_6];
+	ld.param.u64 	%rd16, [triton__0d1d2d3d4d5d6d7d8de9de_param_7];
+	.loc	1 23 28
+	mov.u32 %r1, %ctaid.x;
+	.loc	1 30 40
+	shl.b32 	%r82, %r1, 8;
+	.loc	1 30 36
+	or.b32  	%r83, %r82, %r81;
+	.loc	1 30 30
+	mul.wide.s32 	%rd17, %r83, 4;
+	add.s64 	%rd1, %rd9, %rd17;
+	mov.b32 	%r6, 0;
+	mov.pred 	%p1, -1;
+	.loc	1 30 46
+	mov.u32 %r2, 0x0;
+	mov.u32 %r3, 0x0;
+	mov.u32 %r4, 0x0;
+	mov.u32 %r5, 0x0;
+	@%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ];
+	@!%p1 mov.u32 %r2, %r6;
+	@!%p1 mov.u32 %r3, %r6;
+	@!%p1 mov.u32 %r4, %r6;
+	@!%p1 mov.u32 %r5, %r6;
+	mov.b32 	%f1, %r2;
+	mov.b32 	%f2, %r3;
+	mov.b32 	%f3, %r4;
+	mov.b32 	%f4, %r5;
+	.loc	1 31 30
+	mul.wide.s32 	%rd18, %r83, 2;
+	add.s64 	%rd2, %rd10, %rd18;
+	.loc	1 31 46
+	mov.u32 %r10, 0x0;
+	mov.u32 %r11, 0x0;
+	@%p1 ld.global.v2.b32 { %r10, %r11 }, [ %rd2 + 0 ];
+	@!%p1 mov.u32 %r10, %r6;
+	@!%p1 mov.u32 %r11, %r6;
+	cvt.u16.u32 	%rs1, %r10;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r10; }
+	cvt.u16.u32 	%rs3, %r11;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r11; }
+	.loc	1 31 67
+	cvt.f32.bf16 %r14, %rs1;
+	mov.b32 	%f5, %r14;
+	cvt.f32.bf16 %r15, %rs2;
+	mov.b32 	%f6, %r15;
+	cvt.f32.bf16 %r16, %rs3;
+	mov.b32 	%f7, %r16;
+	cvt.f32.bf16 %r17, %rs4;
+	mov.b32 	%f8, %r17;
+	.loc	1 32 30
+	add.s64 	%rd3, %rd11, %rd18;
+	.loc	1 32 46
+	mov.u32 %r18, 0x0;
+	mov.u32 %r19, 0x0;
+	@%p1 ld.global.v2.b32 { %r18, %r19 }, [ %rd3 + 0 ];
+	@!%p1 mov.u32 %r18, %r6;
+	@!%p1 mov.u32 %r19, %r6;
+	cvt.u16.u32 	%rs5, %r18;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r18; }
+	cvt.u16.u32 	%rs7, %r19;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r19; }
+	.loc	1 32 67
+	cvt.f32.bf16 %r22, %rs5;
+	mov.b32 	%f9, %r22;
+	cvt.f32.bf16 %r23, %rs6;
+	mov.b32 	%f10, %r23;
+	cvt.f32.bf16 %r24, %rs7;
+	mov.b32 	%f11, %r24;
+	cvt.f32.bf16 %r25, %rs8;
+	mov.b32 	%f12, %r25;
+	.loc	1 33 30
+	add.s64 	%rd4, %rd12, %rd18;
+	.loc	1 33 46
+	mov.u32 %r26, 0x0;
+	mov.u32 %r27, 0x0;
+	@%p1 ld.global.v2.b32 { %r26, %r27 }, [ %rd4 + 0 ];
+	@!%p1 mov.u32 %r26, %r6;
+	@!%p1 mov.u32 %r27, %r6;
+	cvt.u16.u32 	%rs9, %r26;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs10}, %r26; }
+	cvt.u16.u32 	%rs11, %r27;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs12}, %r27; }
+	.loc	1 33 67
+	cvt.f32.bf16 %r30, %rs9;
+	mov.b32 	%f13, %r30;
+	cvt.f32.bf16 %r31, %rs10;
+	mov.b32 	%f14, %r31;
+	cvt.f32.bf16 %r32, %rs11;
+	mov.b32 	%f15, %r32;
+	cvt.f32.bf16 %r33, %rs12;
+	mov.b32 	%f16, %r33;
+	.loc	1 34 31
+	add.s64 	%rd5, %rd13, %rd18;
+	.loc	1 34 47
+	mov.u32 %r34, 0x0;
+	mov.u32 %r35, 0x0;
+	@%p1 ld.global.v2.b32 { %r34, %r35 }, [ %rd5 + 0 ];
+	@!%p1 mov.u32 %r34, %r6;
+	@!%p1 mov.u32 %r35, %r6;
+	cvt.u16.u32 	%rs13, %r34;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs14}, %r34; }
+	cvt.u16.u32 	%rs15, %r35;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs16}, %r35; }
+	.loc	1 34 68
+	cvt.f32.bf16 %r38, %rs13;
+	mov.b32 	%f17, %r38;
+	cvt.f32.bf16 %r39, %rs14;
+	mov.b32 	%f18, %r39;
+	cvt.f32.bf16 %r40, %rs15;
+	mov.b32 	%f19, %r40;
+	cvt.f32.bf16 %r41, %rs16;
+	mov.b32 	%f20, %r41;
+	.loc	1 35 31
+	mul.wide.u32 	%rd19, %r81, 4;
+	add.s64 	%rd6, %rd14, %rd19;
+	.loc	1 35 36
+	mov.u32 %r42, 0x0;
+	mov.u32 %r43, 0x0;
+	mov.u32 %r44, 0x0;
+	mov.u32 %r45, 0x0;
+	@%p1 ld.global.L1::evict_last.v4.b32 { %r42, %r43, %r44, %r45 }, [ %rd6 + 0 ];
+	@!%p1 mov.u32 %r42, %r6;
+	@!%p1 mov.u32 %r43, %r6;
+	@!%p1 mov.u32 %r44, %r6;
+	@!%p1 mov.u32 %r45, %r6;
+	.loc	1 37 18
+	add.f32 	%f21, %f5, %f1;
+	add.f32 	%f22, %f6, %f2;
+	add.f32 	%f23, %f7, %f3;
+	.loc	1 39 18
+	add.f32 	%f24, %f21, %f9;
+	add.f32 	%f25, %f22, %f10;
+	add.f32 	%f26, %f23, %f11;
+	.loc	1 41 18
+	add.f32 	%f27, %f25, %f14;
+	add.f32 	%f28, %f26, %f15;
+	.loc	1 43 19
+	add.f32 	%f29, %f27, %f18;
+	add.f32 	%f30, %f28, %f19;
+	.loc	1 41 18
+	add.f32 	%f31, %f24, %f13;
+	add.f32 	%f32, %f8, %f4;
+	.loc	1 43 19
+	add.f32 	%f33, %f32, %f12;
+	add.f32 	%f34, %f31, %f17;
+$L__tmp1:
+	.loc	2 233 15
+	add.f32 	%f35, %f34, %f29;
+	add.f32 	%f36, %f33, %f16;
+	add.f32 	%f37, %f35, %f30;
+	add.f32 	%f38, %f36, %f20;
+	mov.b32 	%r71, %f38;
+	add.f32 	%f39, %f37, %f38;
+$L__tmp2:
+	.loc	2 243 36
+	mov.b32 	%r84, %f39;
+	shfl.sync.bfly.b32	%r85, %r84, 16, 31, -1;
+	mov.b32 	%f40, %r85;
+$L__tmp3:
+	.loc	2 233 15
+	add.f32 	%f41, %f39, %f40;
+$L__tmp4:
+	.loc	2 243 36
+	mov.b32 	%r86, %f41;
+	shfl.sync.bfly.b32	%r87, %r86, 8, 31, -1;
+	mov.b32 	%f42, %r87;
+$L__tmp5:
+	.loc	2 233 15
+	add.f32 	%f43, %f41, %f42;
+$L__tmp6:
+	.loc	2 243 36
+	mov.b32 	%r88, %f43;
+	shfl.sync.bfly.b32	%r89, %r88, 4, 31, -1;
+	mov.b32 	%f44, %r89;
+$L__tmp7:
+	.loc	2 233 15
+	add.f32 	%f45, %f43, %f44;
+$L__tmp8:
+	.loc	2 243 36
+	mov.b32 	%r90, %f45;
+	shfl.sync.bfly.b32	%r91, %r90, 2, 31, -1;
+	mov.b32 	%f46, %r91;
+$L__tmp9:
+	.loc	2 233 15
+	add.f32 	%f47, %f45, %f46;
+$L__tmp10:
+	.loc	2 243 36
+	mov.b32 	%r92, %f47;
+	shfl.sync.bfly.b32	%r93, %r92, 1, 31, -1;
+	mov.b32 	%f48, %r93;
+$L__tmp11:
+	.loc	2 233 15
+	add.f32 	%f49, %f47, %f48;
+$L__tmp12:
+	.loc	2 243 36
+	setp.eq.s32 	%p23, %r79, 0;
+	shr.u32 	%r94, %r78, 3;
+	and.b32  	%r95, %r94, 4;
+	mov.u32 	%r96, global_smem;
+	add.s32 	%r50, %r96, %r95;
+	mov.b32 	%r51, %f49;
+	@%p23 st.shared.b32 [ %r50 + 0 ], %r51;
+	bar.sync 	0;
+	setp.lt.s32 	%p24, %r78, 2;
+	add.s32 	%r53, %r96, %r80;
+	@%p24 ld.shared.b32 %r52, [ %r53 + 0 ];
+	mov.b32 	%f50, %r52;
+	shfl.sync.bfly.b32	%r97, %r52, 1, 31, -1;
+	mov.b32 	%f51, %r97;
+$L__tmp13:
+	.loc	2 233 15
+	add.f32 	%f52, %f50, %f51;
+$L__tmp14:
+	.loc	2 243 36
+	and.b32  	%r98, %r78, 1;
+	setp.eq.b32 	%p31, %r98, 1;
+	not.pred 	%p32, %p31;
+	and.pred  	%p25, %p24, %p32;
+	mov.b32 	%r55, %f52;
+	@%p25 st.shared.b32 [ %r53 + 0 ], %r55;
+	bar.sync 	0;
+	ld.shared.f32 	%f53, [global_smem];
+$L__tmp15:
+	.loc	3 8 15
+	add.f32 	%f54, %f53, 0f00000000;
+$L__tmp16:
+	.loc	1 51 20
+	mov.b32 	%r57, %f54;
+	mov.b32 	%r58, 1132462080;
+	div.full.f32 %r56, %r57, %r58;
+	mov.b32 	%f55, %r56;
+	.loc	1 52 20
+	sub.f32 	%f56, %f34, %f55;
+	sub.f32 	%f57, %f29, %f55;
+	sub.f32 	%f58, %f30, %f55;
+	sub.f32 	%f59, %f38, %f55;
+	.loc	1 53 20
+	mul.f32 	%f60, %f57, %f57;
+$L__tmp17:
+	.loc	2 243 36
+	bar.sync 	0;
+$L__tmp18:
+	.loc	2 233 15
+	fma.rn.f32 	%f61, %f56, %f56, %f60;
+	fma.rn.f32 	%f62, %f58, %f58, %f61;
+	fma.rn.f32 	%f63, %f59, %f59, %f62;
+$L__tmp19:
+	.loc	2 243 36
+	mov.b32 	%r99, %f63;
+	shfl.sync.bfly.b32	%r100, %r99, 16, 31, -1;
+	mov.b32 	%f64, %r100;
+$L__tmp20:
+	.loc	2 233 15
+	add.f32 	%f65, %f63, %f64;
+$L__tmp21:
+	.loc	2 243 36
+	mov.b32 	%r101, %f65;
+	shfl.sync.bfly.b32	%r102, %r101, 8, 31, -1;
+	mov.b32 	%f66, %r102;
+$L__tmp22:
+	.loc	2 233 15
+	add.f32 	%f67, %f65, %f66;
+$L__tmp23:
+	.loc	2 243 36
+	mov.b32 	%r103, %f67;
+	shfl.sync.bfly.b32	%r104, %r103, 4, 31, -1;
+	mov.b32 	%f68, %r104;
+$L__tmp24:
+	.loc	2 233 15
+	add.f32 	%f69, %f67, %f68;
+$L__tmp25:
+	.loc	2 243 36
+	mov.b32 	%r105, %f69;
+	shfl.sync.bfly.b32	%r106, %r105, 2, 31, -1;
+	mov.b32 	%f70, %r106;
+$L__tmp26:
+	.loc	2 233 15
+	add.f32 	%f71, %f69, %f70;
+$L__tmp27:
+	.loc	2 243 36
+	mov.b32 	%r107, %f71;
+	shfl.sync.bfly.b32	%r108, %r107, 1, 31, -1;
+	mov.b32 	%f72, %r108;
+$L__tmp28:
+	.loc	2 233 15
+	add.f32 	%f73, %f71, %f72;
+$L__tmp29:
+	.loc	2 243 36
+	mov.b32 	%r60, %f73;
+	@%p23 st.shared.b32 [ %r50 + 0 ], %r60;
+	bar.sync 	0;
+	@%p24 ld.shared.b32 %r61, [ %r53 + 0 ];
+	mov.b32 	%f74, %r61;
+	shfl.sync.bfly.b32	%r109, %r61, 1, 31, -1;
+	mov.b32 	%f75, %r109;
+$L__tmp30:
+	.loc	2 233 15
+	add.f32 	%f76, %f74, %f75;
+$L__tmp31:
+	.loc	2 243 36
+	mov.b32 	%r64, %f76;
+	@%p25 st.shared.b32 [ %r53 + 0 ], %r64;
+	bar.sync 	0;
+	ld.shared.f32 	%f77, [global_smem];
+$L__tmp32:
+	.loc	3 8 15
+	add.f32 	%f78, %f77, 0f00000000;
+$L__tmp33:
+	.loc	1 59 20
+	mov.b32 	%r66, %f78;
+	div.full.f32 %r65, %r66, %r58;
+	mov.b32 	%f79, %r65;
+	.loc	1 61 20
+	add.f32 	%f80, %f79, 0f3727C5AC;
+	.loc	1 62 26
+	rsqrt.approx.ftz.f32 	%f81, %f80;
+	.loc	1 35 36
+	mov.b32 	%f82, %r45;
+	mov.b32 	%f83, %r44;
+	mov.b32 	%f84, %r43;
+	mov.b32 	%f85, %r42;
+	.loc	1 63 20
+	mul.f32 	%f86, %f56, %f81;
+	mul.f32 	%f87, %f57, %f81;
+	mul.f32 	%f88, %f58, %f81;
+	mul.f32 	%f89, %f59, %f81;
+	.loc	1 64 20
+	mul.f32 	%f90, %f86, %f85;
+	mul.f32 	%f91, %f87, %f84;
+	mul.f32 	%f92, %f88, %f83;
+	mul.f32 	%f93, %f89, %f82;
+	.loc	1 66 25
+	add.s64 	%rd7, %rd15, %rd17;
+	.loc	1 66 48
+	mov.b32 	%r68, %f34;
+	mov.b32 	%r69, %f29;
+	mov.b32 	%r70, %f30;
+	@%p1 st.global.v4.b32 [ %rd7 + 0 ], { %r68, %r69, %r70, %r71 };
+	.loc	1 67 25
+	add.s64 	%rd8, %rd16, %rd18;
+	.loc	1 67 48
+	mov.b32 	%r72, %f90;
+	cvt.rn.bf16.f32 %rs17, %r72;
+	mov.b32 	%r73, %f91;
+	cvt.rn.bf16.f32 %rs18, %r73;
+	mov.b32 	%r74, %f92;
+	cvt.rn.bf16.f32 %rs19, %r74;
+	mov.b32 	%r75, %f93;
+	cvt.rn.bf16.f32 %rs20, %r75;
+	mov.b32 	%r110, {%rs17, %rs18};
+	mov.b32 	%r111, {%rs19, %rs20};
+	@%p1 st.global.v2.b32 [ %rd8 + 0 ], { %r110, %r111 };
+	.loc	1 67 4
+	ret;
+$L__tmp34:
+$L__func_end0:
+}
+	// .globl	__nv_rsqrtf
+.visible .func  (.param .b32 func_retval0) __nv_rsqrtf(
+	.param .b32 __nv_rsqrtf_param_0
+)
+{
+	.reg .f32 	%f<3>;
+$L__func_begin1:
+	ld.param.f32 	%f1, [__nv_rsqrtf_param_0];
+	rsqrt.approx.ftz.f32 	%f2, %f1;
+	st.param.f32 	[func_retval0+0], %f2;
+	ret;
+$L__func_end1:
+}
+	.file	1 "/tmp/torchinductor_root/jb/cjbnqg5u4sj7a4xstjer3a6tdgnnigb2iymd27gcs6o7oduhxy2v.py"
+	.file	2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
+	.file	3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
+	.section	.debug_abbrev
+	{
+.b8 1
+.b8 17
+.b8 1
+.b8 37
+.b8 8
+.b8 19
+.b8 5
+.b8 3
+.b8 8
+.b8 16
+.b8 6
+.b8 27
+.b8 8
+.b8 180
+.b8 66
+.b8 12
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+.b8 0
+.b8 2
+.b8 46
+.b8 0
+.b8 135
+.b8 64
+.b8 8
+.b8 3
+.b8 8
+.b8 58
+.b8 11
+.b8 59
+.b8 11
+.b8 63
+.b8 12
+.b8 32
+.b8 11
+.b8 0
+.b8 0
+.b8 3
+.b8 46
+.b8 1
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 64
+.b8 10
+.b8 49
+.b8 19
+.b8 0
+.b8 0
+.b8 4
+.b8 29
+.b8 1
+.b8 49
+.b8 19
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 88
+.b8 11
+.b8 89
+.b8 11
+.b8 87
+.b8 11
+.b8 0
+.b8 0
+.b8 5
+.b8 29
+.b8 0
+.b8 49
+.b8 19
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 88
+.b8 11
+.b8 89
+.b8 11
+.b8 87
+.b8 11
+.b8 0
+.b8 0
+.b8 0
+	}
+	.section	.debug_info
+	{
+.b32 407
+.b8 2
+.b8 0
+.b32 .debug_abbrev
+.b8 8
+.b8 1
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2
+.b8 0
+.b8 99
+.b8 106
+.b8 98
+.b8 110
+.b8 113
+.b8 103
+.b8 53
+.b8 117
+.b8 52
+.b8 115
+.b8 106
+.b8 55
+.b8 97
+.b8 52
+.b8 120
+.b8 115
+.b8 116
+.b8 106
+.b8 101
+.b8 114
+.b8 51
+.b8 97
+.b8 54
+.b8 116
+.b8 100
+.b8 103
+.b8 110
+.b8 110
+.b8 105
+.b8 103
+.b8 98
+.b8 50
+.b8 105
+.b8 121
+.b8 109
+.b8 100
+.b8 50
+.b8 55
+.b8 103
+.b8 99
+.b8 115
+.b8 54
+.b8 111
+.b8 55
+.b8 111
+.b8 100
+.b8 117
+.b8 104
+.b8 120
+.b8 121
+.b8 50
+.b8 118
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line
+.b8 47
+.b8 116
+.b8 109
+.b8 112
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 95
+.b8 114
+.b8 111
+.b8 111
+.b8 116
+.b8 47
+.b8 106
+.b8 98
+.b8 0
+.b8 1
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 2
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 51
+.b8 100
+.b8 52
+.b8 100
+.b8 53
+.b8 100
+.b8 54
+.b8 100
+.b8 55
+.b8 100
+.b8 56
+.b8 100
+.b8 101
+.b8 57
+.b8 100
+.b8 101
+.b8 0
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 51
+.b8 100
+.b8 52
+.b8 100
+.b8 53
+.b8 100
+.b8 54
+.b8 100
+.b8 55
+.b8 100
+.b8 56
+.b8 100
+.b8 101
+.b8 57
+.b8 100
+.b8 101
+.b8 0
+.b8 1
+.b8 18
+.b8 1
+.b8 1
+.b8 3
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 1
+.b8 156
+.b32 125
+.b8 4
+.b32 125
+.b64 $L__tmp1
+.b64 $L__tmp14
+.b8 2
+.b8 48
+.b8 59
+.b8 5
+.b32 125
+.b64 $L__tmp1
+.b64 $L__tmp14
+.b8 2
+.b8 243
+.b8 36
+.b8 0
+.b8 5
+.b32 125
+.b64 $L__tmp2
+.b64 $L__tmp15
+.b8 2
+.b8 48
+.b8 59
+.b8 5
+.b32 125
+.b64 $L__tmp15
+.b64 $L__tmp16
+.b8 3
+.b8 48
+.b8 45
+.b8 5
+.b32 125
+.b64 $L__tmp17
+.b64 $L__tmp32
+.b8 2
+.b8 56
+.b8 59
+.b8 4
+.b32 125
+.b64 $L__tmp18
+.b64 $L__tmp31
+.b8 2
+.b8 56
+.b8 59
+.b8 5
+.b32 125
+.b64 $L__tmp18
+.b64 $L__tmp31
+.b8 2
+.b8 243
+.b8 36
+.b8 0
+.b8 5
+.b32 125
+.b64 $L__tmp32
+.b64 $L__tmp33
+.b8 3
+.b8 56
+.b8 45
+.b8 0
+.b8 0
+	}
+	.section	.debug_pubnames
+	{
+.b32 $L__pubNames_end0-$L__pubNames_start0
+$L__pubNames_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 411
+.b32 125
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 51
+.b8 100
+.b8 52
+.b8 100
+.b8 53
+.b8 100
+.b8 54
+.b8 100
+.b8 55
+.b8 100
+.b8 56
+.b8 100
+.b8 101
+.b8 57
+.b8 100
+.b8 101
+.b8 0
+.b32 0
+$L__pubNames_end0:
+	}
+	.section	.debug_pubtypes
+	{
+.b32 $L__pubTypes_end0-$L__pubTypes_start0
+$L__pubTypes_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 411
+.b32 0
+$L__pubTypes_end0:
+	}
+	.section	.debug_loc	{	}

.triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.ttir ADDED Viewed

	@@ -0,0 +1,53 @@

+module {
+  tt.func public @triton__0d1d2d3de4e(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} {
+    %cst = arith.constant dense<256> : tensor<32x1xi64>
+    %cst_0 = arith.constant dense<0> : tensor<32x1xi64>
+    %cst_1 = arith.constant dense<512> : tensor<32x1xi64>
+    %cst_2 = arith.constant dense<true> : tensor<32x1xi1>
+    %cst_3 = arith.constant dense<256> : tensor<32x1xi32>
+    %cst_4 = arith.constant dense<131072> : tensor<1x128xi32>
+    %cst_5 = arith.constant dense<120> : tensor<1x128xi32>
+    %cst_6 = arith.constant dense<0.000000e+00> : tensor<32x128xf32>
+    %c32_i32 = arith.constant 32 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c32_i32 : i32
+    %2 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32>
+    %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<32xi32>) -> tensor<32x1xi32>
+    %4 = tt.splat %1 : (i32) -> tensor<32x1xi32>
+    %5 = arith.addi %4, %3 : tensor<32x1xi32>
+    %6 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32>
+    %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<128xi32>) -> tensor<1x128xi32>
+    %8 = arith.cmpi slt, %7, %cst_5 : tensor<1x128xi32>
+    %9 = arith.muli %7, %cst_4 : tensor<1x128xi32>
+    %10 = tt.broadcast %5 : (tensor<32x1xi32>) -> tensor<32x128xi32>
+    %11 = tt.broadcast %9 : (tensor<1x128xi32>) -> tensor<32x128xi32>
+    %12 = arith.addi %10, %11 : tensor<32x128xi32>
+    %13 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<32x128x!tt.ptr<f32, 1>>
+    %14 = tt.addptr %13, %12 : tensor<32x128x!tt.ptr<f32, 1>>, tensor<32x128xi32>
+    %15 = tt.broadcast %8 : (tensor<1x128xi1>) -> tensor<32x128xi1>
+    %16 = tt.load %14, %15, %cst_6 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<32x128xf32>
+    %17 = arith.addf %16, %cst_6 : tensor<32x128xf32>
+    %18 = arith.select %15, %17, %cst_6 : tensor<32x128xi1>, tensor<32x128xf32>
+    %19 = "tt.reduce"(%18) <{axis = 1 : i32}> ({
+    ^bb0(%arg5: f32, %arg6: f32):
+      %35 = arith.addf %arg5, %arg6 : f32
+      tt.reduce.return %35 : f32
+    }) : (tensor<32x128xf32>) -> tensor<32xf32>
+    %20 = tt.expand_dims %19 {axis = 1 : i32} : (tensor<32xf32>) -> tensor<32x1xf32>
+    %21 = arith.divsi %5, %cst_3 : tensor<32x1xi32>
+    %22 = arith.remsi %5, %cst_3 : tensor<32x1xi32>
+    %23 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<32x1x!tt.ptr<i64, 1>>
+    %24 = tt.addptr %23, %21 : tensor<32x1x!tt.ptr<i64, 1>>, tensor<32x1xi32>
+    %25 = tt.load %24 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<32x1xi64>
+    %26 = arith.addi %25, %cst_1 : tensor<32x1xi64>
+    %27 = arith.cmpi slt, %25, %cst_0 : tensor<32x1xi64>
+    %28 = arith.select %27, %26, %25 : tensor<32x1xi1>, tensor<32x1xi64>
+    %29 = arith.muli %28, %cst : tensor<32x1xi64>
+    %30 = arith.extsi %22 : tensor<32x1xi32> to tensor<32x1xi64>
+    %31 = arith.addi %30, %29 : tensor<32x1xi64>
+    %32 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<32x1x!tt.ptr<f32, 1>>
+    %33 = tt.addptr %32, %31 : tensor<32x1x!tt.ptr<f32, 1>>, tensor<32x1xi64>
+    %34 = "tt.atomic_rmw"(%33, %20, %cst_2) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<32x1x!tt.ptr<f32, 1>>, tensor<32x1xf32>, tensor<32x1xi1>) -> tensor<32x1xf32>
+    tt.return
+  }
+}

.triton/dump/305a9479aab997a3a16bfe46bb303a50/triton_.cubin ADDED Viewed

Binary file (30.4 kB). View file

.triton/dump/305a9479aab997a3a16bfe46bb303a50/triton_.ptx ADDED Viewed

	@@ -0,0 +1,1041 @@

+//
+// Generated by LLVM NVPTX Back-End
+//
+.version 8.2
+.target sm_89
+.address_size 64
+	// .globl	triton__0d1d2d3d4d5d6de7de
+.extern .func __assertfail
+(
+	.param .b64 __assertfail_param_0,
+	.param .b64 __assertfail_param_1,
+	.param .b32 __assertfail_param_2,
+	.param .b64 __assertfail_param_3,
+	.param .b64 __assertfail_param_4
+)
+;
+.global .align 1 .b8 assertFunc_1[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
+.global .align 1 .b8 assertFile_1[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
+.global .align 1 .b8 assertMessage_1[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 54, 32, 60, 32, 53, 48, 50, 53, 55};
+.global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
+.global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
+.global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55};
+.extern .shared .align 1 .b8 global_smem[];
+.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
+.visible .entry triton__0d1d2d3d4d5d6de7de(
+	.param .u64 triton__0d1d2d3d4d5d6de7de_param_0,
+	.param .u64 triton__0d1d2d3d4d5d6de7de_param_1,
+	.param .u64 triton__0d1d2d3d4d5d6de7de_param_2,
+	.param .u64 triton__0d1d2d3d4d5d6de7de_param_3,
+	.param .u64 triton__0d1d2d3d4d5d6de7de_param_4,
+	.param .u64 triton__0d1d2d3d4d5d6de7de_param_5,
+	.param .u32 triton__0d1d2d3d4d5d6de7de_param_6,
+	.param .u32 triton__0d1d2d3d4d5d6de7de_param_7
+)
+.maxntid 64, 1, 1
+{
+	.reg .pred 	%p<59>;
+	.reg .b16 	%rs<13>;
+	.reg .b32 	%r<176>;
+	.reg .f32 	%f<169>;
+	.reg .b64 	%rd<58>;
+	.loc	1 18 0
+$L__func_begin0:
+	.loc	1 18 0
+	ld.param.u64 	%rd8, [triton__0d1d2d3d4d5d6de7de_param_4];
+	ld.param.u64 	%rd7, [triton__0d1d2d3d4d5d6de7de_param_1];
+	ld.param.u64 	%rd22, [triton__0d1d2d3d4d5d6de7de_param_0];
+$L__tmp0:
+	.loc	1 24 33
+	mov.u32 	%r1, %tid.x;
+	and.b32  	%r2, %r1, 31;
+	ld.param.u64 	%rd23, [triton__0d1d2d3d4d5d6de7de_param_2];
+	ld.param.u64 	%rd24, [triton__0d1d2d3d4d5d6de7de_param_3];
+	bfe.u32 	%r3, %r1, 5, 1;
+	shl.b32 	%r30, %r1, 2;
+	and.b32  	%r4, %r30, 252;
+	.loc	1 21 28
+	mov.u32 %r13, %ctaid.x;
+	.loc	1 26 30
+	mul.wide.s32 	%rd25, %r13, 8;
+	add.s64 	%rd11, %rd22, %rd25;
+	mov.pred 	%p53, -1;
+	.loc	1 26 35
+	mov.u64 %rd10, 0x0;
+	@%p53 ld.global.L1::evict_last.b64 { %rd10 }, [ %rd11 + 0 ];
+	mov.u64 %rd12, 0x0;
+	@%p53 ld.global.L1::evict_last.b64 { %rd12 }, [ %rd11 + 0 ];
+	mov.u64 %rd14, 0x0;
+	@%p53 ld.global.L1::evict_last.b64 { %rd14 }, [ %rd11 + 0 ];
+	mov.u64 %rd16, 0x0;
+	@%p53 ld.global.L1::evict_last.b64 { %rd16 }, [ %rd11 + 0 ];
+	mov.u64 %rd18, 0x0;
+	@%p53 ld.global.L1::evict_last.b64 { %rd18 }, [ %rd11 + 0 ];
+	.loc	1 27 18
+	shr.s32 	%r31, %r13, 31;
+	shr.u32 	%r32, %r31, 23;
+	add.s32 	%r33, %r13, %r32;
+	and.b32  	%r34, %r33, 16776704;
+	sub.s32 	%r35, %r13, %r34;
+	.loc	1 35 44
+	shl.b32 	%r36, %r35, 8;
+	.loc	1 35 40
+	or.b32  	%r37, %r36, %r4;
+	.loc	1 35 34
+	mul.wide.s32 	%rd26, %r37, 4;
+	add.s64 	%rd37, %rd23, %rd26;
+	mov.b32 	%r151, 0;
+	.loc	1 35 50
+	mov.u32 %r14, 0x0;
+	mov.u32 %r15, 0x0;
+	mov.u32 %r16, 0x0;
+	mov.u32 %r17, 0x0;
+	@%p53 ld.global.L1::evict_last.v4.b32 { %r14, %r15, %r16, %r17 }, [ %rd37 + 0 ];
+	@!%p53 mov.u32 %r14, %r151;
+	@!%p53 mov.u32 %r15, %r151;
+	@!%p53 mov.u32 %r16, %r151;
+	@!%p53 mov.u32 %r17, %r151;
+	mov.b32 	%f2, %r14;
+	mov.b32 	%f1, %r15;
+	mov.b32 	%f3, %r16;
+	mov.b32 	%f4, %r17;
+	.loc	1 36 44
+	shl.b32 	%r38, %r13, 8;
+	.loc	1 36 40
+	or.b32  	%r39, %r38, %r4;
+	.loc	1 36 34
+	mul.wide.s32 	%rd27, %r39, 2;
+	add.s64 	%rd38, %rd24, %rd27;
+	.loc	1 36 50
+	mov.u32 %r22, 0x0;
+	mov.u32 %r23, 0x0;
+	@%p53 ld.global.L1::evict_last.v2.b32 { %r22, %r23 }, [ %rd38 + 0 ];
+	@!%p53 mov.u32 %r22, %r151;
+	@!%p53 mov.u32 %r23, %r151;
+	cvt.u16.u32 	%rs1, %r22;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r22; }
+	cvt.u16.u32 	%rs3, %r23;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r23; }
+	.loc	1 36 101
+	cvt.f32.bf16 %r26, %rs1;
+	mov.b32 	%f5, %r26;
+	cvt.f32.bf16 %r27, %rs2;
+	mov.b32 	%f6, %r27;
+	cvt.f32.bf16 %r28, %rs3;
+	mov.b32 	%f7, %r28;
+	cvt.f32.bf16 %r29, %rs4;
+	mov.b32 	%f8, %r29;
+	.loc	1 37 22
+	add.s64 	%rd28, %rd18, 50257;
+	.loc	1 38 22
+	setp.lt.s64 	%p14, %rd18, 0;
+	.loc	1 39 36
+	selp.b64 	%rd5, %rd28, %rd18, %p14;
+	.loc	1 40 40
+	setp.lt.u64 	%p15, %rd5, 50257;
+	mov.b32 	%r175, 883;
+	mov.u64 	%rd57, 1;
+	.loc	1 40 55
+	@%p15 bra 	$L__BB0_2;
+	mov.u64 	%rd29, assertMessage_0;
+	cvta.global.u64 	%rd30, %rd29;
+	mov.u64 	%rd31, assertFile_0;
+	cvta.global.u64 	%rd32, %rd31;
+	mov.u64 	%rd33, assertFunc_0;
+	cvta.global.u64 	%rd34, %rd33;
+	{ // callseq 0, 0
+	.reg .b32 temp_param_reg;
+	.param .b64 param0;
+	st.param.b64 	[param0+0], %rd30;
+	.param .b64 param1;
+	st.param.b64 	[param1+0], %rd32;
+	.param .b32 param2;
+	st.param.b32 	[param2+0], %r175;
+	.param .b64 param3;
+	st.param.b64 	[param3+0], %rd34;
+	.param .b64 param4;
+	st.param.b64 	[param4+0], %rd57;
+	call.uni
+	__assertfail,
+	(
+	param0,
+	param1,
+	param2,
+	param3,
+	param4
+	);
+	} // callseq 0
+$L__BB0_2:
+	.loc	1 0 55
+	ld.param.u64 	%rd9, [triton__0d1d2d3d4d5d6de7de_param_5];
+	cvt.s64.s32 	%rd3, %r39;
+	.loc	1 38 22
+	setp.lt.s64 	%p44, %rd10, 0;
+	.loc	1 41 44
+	shl.b64 	%rd40, %rd10, 8;
+	add.s64 	%rd41, %rd40, 12865792;
+	selp.b64 	%rd42, %rd41, %rd40, %p44;
+	cvt.u64.u32 	%rd43, %r4;
+	.loc	1 41 40
+	or.b64  	%rd44, %rd42, %rd43;
+	.loc	1 41 34
+	shl.b64 	%rd45, %rd44, 2;
+	add.s64 	%rd54, %rd7, %rd45;
+	.loc	1 41 52
+	mov.u32 %r41, 0x0;
+	mov.u32 %r42, 0x0;
+	mov.u32 %r43, 0x0;
+	mov.u32 %r44, 0x0;
+	@%p53 ld.global.L1::evict_last.v4.b32 { %r41, %r42, %r43, %r44 }, [ %rd54 + 0 ];
+	@!%p53 mov.u32 %r41, %r151;
+	@!%p53 mov.u32 %r42, %r151;
+	@!%p53 mov.u32 %r43, %r151;
+	@!%p53 mov.u32 %r44, %r151;
+	mov.b32 	%f15, %r43;
+	mov.b32 	%f16, %r44;
+	.loc	1 42 22
+	add.f32 	%f17, %f3, %f15;
+	add.f32 	%f18, %f4, %f16;
+	.loc	1 44 22
+	add.f32 	%f19, %f7, %f17;
+	add.f32 	%f20, %f8, %f18;
+	.loc	1 41 52
+	mov.b32 	%f21, %r41;
+	mov.b32 	%f22, %r42;
+	.loc	1 42 22
+	add.f32 	%f23, %f1, %f22;
+	add.f32 	%f24, %f2, %f21;
+	.loc	1 44 22
+	add.f32 	%f25, %f5, %f24;
+	add.f32 	%f26, %f6, %f23;
+$L__tmp1:
+	.loc	2 98 22
+	add.f32 	%f27, %f26, 0f00000000;
+	add.f32 	%f28, %f25, 0f00000000;
+	add.f32 	%f29, %f19, 0f00000000;
+	add.f32 	%f30, %f20, 0f00000000;
+	.loc	2 101 30
+	sub.f32 	%f31, %f25, %f28;
+	sub.f32 	%f32, %f26, %f27;
+	sub.f32 	%f33, %f19, %f29;
+	sub.f32 	%f34, %f20, %f30;
+	.loc	2 101 13
+	fma.rn.f32 	%f35, %f25, %f31, 0f00000000;
+	fma.rn.f32 	%f36, %f26, %f32, 0f00000000;
+	fma.rn.f32 	%f37, %f19, %f33, 0f00000000;
+	fma.rn.f32 	%f38, %f20, %f34, 0f00000000;
+$L__tmp2:
+	.loc	2 108 21
+	sub.f32 	%f39, %f27, %f28;
+	mov.b32 	%r50, 1065353216;
+	mov.b32 	%r51, 1073741824;
+	.loc	2 110 60
+	div.full.f32 %r49, %r50, %r51;
+	mov.b32 	%f40, %r49;
+	.loc	2 112 17
+	fma.rn.f32 	%f41, %f40, %f39, %f28;
+	.loc	2 113 15
+	add.f32 	%f42, %f35, %f36;
+	.loc	2 113 30
+	mul.f32 	%f43, %f39, %f39;
+	.loc	2 113 22
+	fma.rn.f32 	%f44, %f40, %f43, %f42;
+	.loc	2 108 21
+	sub.f32 	%f45, %f29, %f41;
+	mov.b32 	%r54, 1077936128;
+	.loc	2 110 60
+	div.full.f32 %r52, %r50, %r54;
+	mov.b32 	%f46, %r52;
+	.loc	2 112 17
+	fma.rn.f32 	%f47, %f46, %f45, %f41;
+	.loc	2 113 15
+	add.f32 	%f48, %f37, %f44;
+	.loc	2 113 30
+	mul.f32 	%f49, %f45, %f45;
+	.loc	2 113 38
+	fma.rn.f32 	%f50, %f45, %f45, %f49;
+	.loc	2 113 22
+	fma.rn.f32 	%f51, %f46, %f50, %f48;
+	.loc	2 108 21
+	sub.f32 	%f52, %f30, %f47;
+	mov.b32 	%r57, 1082130432;
+	.loc	2 110 60
+	div.full.f32 %r55, %r50, %r57;
+	mov.b32 	%f53, %r55;
+	.loc	2 112 17
+	fma.rn.f32 	%f54, %f53, %f52, %f47;
+	.loc	2 113 15
+	add.f32 	%f55, %f38, %f51;
+	.loc	2 113 30
+	mul.f32 	%f56, %f52, %f52;
+	.loc	2 113 38
+	mul.f32 	%f57, %f56, 0f40400000;
+	.loc	2 113 22
+	fma.rn.f32 	%f58, %f53, %f57, %f55;
+$L__tmp3:
+	.loc	2 120 46
+	mov.b32 	%r118, %f54;
+	shfl.sync.bfly.b32	%r119, %r118, 16, 31, -1;
+	mov.b32 	%f59, %r119;
+	mov.b32 	%r120, %f58;
+	shfl.sync.bfly.b32	%r121, %r120, 16, 31, -1;
+	mov.b32 	%f60, %r121;
+	shfl.sync.bfly.b32	%r59, %r57, 16, 31, -1;
+	mov.b32 	%f61, %r59;
+$L__tmp4:
+	.loc	2 108 21
+	sub.f32 	%f62, %f59, %f54;
+	.loc	2 109 28
+	add.f32 	%f63, %f61, 0f40800000;
+	.loc	2 110 39
+	setp.eq.f32 	%p45, %f63, 0f00000000;
+	.loc	2 110 60
+	mov.b32 	%r60, %f63;
+	div.full.f32 %r58, %r59, %r60;
+	mov.b32 	%f64, %r58;
+	.loc	2 110 49
+	selp.f32 	%f65, 0f00000000, %f64, %p45;
+	.loc	2 112 17
+	fma.rn.f32 	%f66, %f65, %f62, %f54;
+	.loc	2 113 15
+	add.f32 	%f67, %f58, %f60;
+	.loc	2 113 30
+	mul.f32 	%f68, %f62, %f62;
+	.loc	2 113 38
+	mul.f32 	%f69, %f68, 0f40800000;
+	.loc	2 113 22
+	fma.rn.f32 	%f70, %f65, %f69, %f67;
+$L__tmp5:
+	.loc	2 120 46
+	mov.b32 	%r122, %f66;
+	shfl.sync.bfly.b32	%r123, %r122, 8, 31, -1;
+	mov.b32 	%f71, %r123;
+	mov.b32 	%r124, %f70;
+	shfl.sync.bfly.b32	%r125, %r124, 8, 31, -1;
+	mov.b32 	%f72, %r125;
+	shfl.sync.bfly.b32	%r62, %r60, 8, 31, -1;
+	mov.b32 	%f73, %r62;
+$L__tmp6:
+	.loc	2 108 21
+	sub.f32 	%f74, %f71, %f66;
+	.loc	2 109 28
+	add.f32 	%f75, %f63, %f73;
+	.loc	2 110 39
+	setp.eq.f32 	%p46, %f75, 0f00000000;
+	.loc	2 110 60
+	mov.b32 	%r63, %f75;
+	div.full.f32 %r61, %r62, %r63;
+	mov.b32 	%f76, %r61;
+	.loc	2 110 49
+	selp.f32 	%f77, 0f00000000, %f76, %p46;
+	.loc	2 112 17
+	fma.rn.f32 	%f78, %f77, %f74, %f66;
+	.loc	2 113 15
+	add.f32 	%f79, %f70, %f72;
+	.loc	2 113 30
+	mul.f32 	%f80, %f74, %f74;
+	.loc	2 113 38
+	mul.f32 	%f81, %f63, %f80;
+	.loc	2 113 22
+	fma.rn.f32 	%f82, %f77, %f81, %f79;
+$L__tmp7:
+	.loc	2 120 46
+	mov.b32 	%r126, %f78;
+	shfl.sync.bfly.b32	%r127, %r126, 4, 31, -1;
+	mov.b32 	%f83, %r127;
+	mov.b32 	%r128, %f82;
+	shfl.sync.bfly.b32	%r129, %r128, 4, 31, -1;
+	mov.b32 	%f84, %r129;
+	shfl.sync.bfly.b32	%r65, %r63, 4, 31, -1;
+	mov.b32 	%f85, %r65;
+$L__tmp8:
+	.loc	2 108 21
+	sub.f32 	%f86, %f83, %f78;
+	.loc	2 109 28
+	add.f32 	%f87, %f75, %f85;
+	.loc	2 110 39
+	setp.eq.f32 	%p47, %f87, 0f00000000;
+	.loc	2 110 60
+	mov.b32 	%r66, %f87;
+	div.full.f32 %r64, %r65, %r66;
+	mov.b32 	%f88, %r64;
+	.loc	2 110 49
+	selp.f32 	%f89, 0f00000000, %f88, %p47;
+	.loc	2 112 17
+	fma.rn.f32 	%f90, %f89, %f86, %f78;
+	.loc	2 113 15
+	add.f32 	%f91, %f82, %f84;
+	.loc	2 113 30
+	mul.f32 	%f92, %f86, %f86;
+	.loc	2 113 38
+	mul.f32 	%f93, %f75, %f92;
+	.loc	2 113 22
+	fma.rn.f32 	%f94, %f89, %f93, %f91;
+$L__tmp9:
+	.loc	2 120 46
+	mov.b32 	%r130, %f90;
+	shfl.sync.bfly.b32	%r131, %r130, 2, 31, -1;
+	mov.b32 	%f95, %r131;
+	mov.b32 	%r132, %f94;
+	shfl.sync.bfly.b32	%r133, %r132, 2, 31, -1;
+	mov.b32 	%f96, %r133;
+	shfl.sync.bfly.b32	%r68, %r66, 2, 31, -1;
+	mov.b32 	%f97, %r68;
+$L__tmp10:
+	.loc	2 108 21
+	sub.f32 	%f98, %f95, %f90;
+	.loc	2 109 28
+	add.f32 	%f99, %f87, %f97;
+	.loc	2 110 39
+	setp.eq.f32 	%p48, %f99, 0f00000000;
+	.loc	2 110 60
+	mov.b32 	%r69, %f99;
+	div.full.f32 %r67, %r68, %r69;
+	mov.b32 	%f100, %r67;
+	.loc	2 110 49
+	selp.f32 	%f101, 0f00000000, %f100, %p48;
+	.loc	2 112 17
+	fma.rn.f32 	%f102, %f101, %f98, %f90;
+	.loc	2 113 15
+	add.f32 	%f103, %f94, %f96;
+	.loc	2 113 30
+	mul.f32 	%f104, %f98, %f98;
+	.loc	2 113 38
+	mul.f32 	%f105, %f87, %f104;
+	.loc	2 113 22
+	fma.rn.f32 	%f106, %f101, %f105, %f103;
+$L__tmp11:
+	.loc	2 120 46
+	mov.b32 	%r134, %f102;
+	shfl.sync.bfly.b32	%r135, %r134, 1, 31, -1;
+	mov.b32 	%f107, %r135;
+	mov.b32 	%r136, %f106;
+	shfl.sync.bfly.b32	%r137, %r136, 1, 31, -1;
+	mov.b32 	%f108, %r137;
+	shfl.sync.bfly.b32	%r71, %r69, 1, 31, -1;
+	mov.b32 	%f109, %r71;
+$L__tmp12:
+	.loc	2 108 21
+	sub.f32 	%f110, %f107, %f102;
+	.loc	2 109 28
+	add.f32 	%f111, %f99, %f109;
+	.loc	2 110 39
+	setp.eq.f32 	%p49, %f111, 0f00000000;
+	.loc	2 110 60
+	mov.b32 	%r72, %f111;
+	div.full.f32 %r70, %r71, %r72;
+	mov.b32 	%f112, %r70;
+	.loc	2 110 49
+	selp.f32 	%f113, 0f00000000, %f112, %p49;
+	.loc	2 112 17
+	fma.rn.f32 	%f114, %f113, %f110, %f102;
+	.loc	2 113 15
+	add.f32 	%f115, %f106, %f108;
+	.loc	2 113 30
+	mul.f32 	%f116, %f110, %f110;
+	.loc	2 113 38
+	mul.f32 	%f117, %f99, %f116;
+	.loc	2 113 22
+	fma.rn.f32 	%f118, %f113, %f117, %f115;
+$L__tmp13:
+	.loc	2 120 46
+	setp.eq.s32 	%p21, %r2, 0;
+	shl.b32 	%r138, %r3, 2;
+	mov.u32 	%r139, global_smem;
+	add.s32 	%r73, %r139, %r138;
+	mov.b32 	%r74, %f114;
+	@%p21 st.shared.b32 [ %r73 + 0 ], %r74;
+	add.s32 	%r140, %r139, 8;
+	add.s32 	%r75, %r140, %r138;
+	mov.b32 	%r76, %f118;
+	@%p21 st.shared.b32 [ %r75 + 0 ], %r76;
+	add.s32 	%r141, %r139, 16;
+	add.s32 	%r77, %r141, %r138;
+	@%p21 st.shared.b32 [ %r77 + 0 ], %r72;
+	bar.sync 	0;
+	setp.lt.s32 	%p24, %r1, 2;
+	add.s32 	%r80, %r139, %r30;
+	@%p24 ld.shared.b32 %r79, [ %r80 + 0 ];
+	mov.b32 	%f119, %r79;
+	add.s32 	%r82, %r140, %r30;
+	@%p24 ld.shared.b32 %r81, [ %r82 + 0 ];
+	mov.b32 	%f120, %r81;
+	add.s32 	%r84, %r141, %r30;
+	@%p24 ld.shared.b32 %r83, [ %r84 + 0 ];
+	mov.b32 	%f121, %r83;
+	shfl.sync.bfly.b32	%r143, %r79, 1, 31, -1;
+	mov.b32 	%f122, %r143;
+	shfl.sync.bfly.b32	%r144, %r81, 1, 31, -1;
+	mov.b32 	%f123, %r144;
+	shfl.sync.bfly.b32	%r86, %r83, 1, 31, -1;
+	mov.b32 	%f124, %r86;
+$L__tmp14:
+	.loc	2 108 21
+	sub.f32 	%f125, %f122, %f119;
+	.loc	2 109 28
+	add.f32 	%f126, %f121, %f124;
+	.loc	2 110 39
+	setp.eq.f32 	%p50, %f126, 0f00000000;
+	.loc	2 110 60
+	mov.b32 	%r87, %f126;
+	div.full.f32 %r85, %r86, %r87;
+	mov.b32 	%f127, %r85;
+	.loc	2 110 49
+	selp.f32 	%f128, 0f00000000, %f127, %p50;
+	.loc	2 112 17
+	fma.rn.f32 	%f129, %f125, %f128, %f119;
+	.loc	2 113 15
+	add.f32 	%f130, %f120, %f123;
+	.loc	2 113 30
+	mul.f32 	%f131, %f125, %f125;
+	.loc	2 113 38
+	mul.f32 	%f132, %f121, %f131;
+	.loc	2 113 22
+	fma.rn.f32 	%f133, %f132, %f128, %f130;
+$L__tmp15:
+	.loc	2 120 46
+	and.b32  	%r145, %r1, 1;
+	setp.eq.b32 	%p51, %r145, 1;
+	not.pred 	%p52, %p51;
+	and.pred  	%p27, %p24, %p52;
+	mov.b32 	%r89, %f129;
+	@%p27 st.shared.b32 [ %r80 + 0 ], %r89;
+	mov.b32 	%r91, %f133;
+	@%p27 st.shared.b32 [ %r82 + 0 ], %r91;
+	@%p27 st.shared.b32 [ %r84 + 0 ], %r87;
+	bar.sync 	0;
+	ld.shared.f32 	%f9, [global_smem];
+	ld.shared.f32 	%f10, [global_smem+8];
+$L__tmp16:
+	.loc	1 62 51
+	mov.u32 %r94, 0x0;
+	mov.u32 %r95, 0x0;
+	mov.u32 %r96, 0x0;
+	mov.u32 %r97, 0x0;
+	@%p53 ld.global.L1::evict_last.v4.b32 { %r94, %r95, %r96, %r97 }, [ %rd37 + 0 ];
+	@!%p53 mov.u32 %r94, %r151;
+	@!%p53 mov.u32 %r95, %r151;
+	@!%p53 mov.u32 %r96, %r151;
+	@!%p53 mov.u32 %r97, %r151;
+	.loc	1 63 51
+	mov.u32 %r102, 0x0;
+	mov.u32 %r103, 0x0;
+	@%p53 ld.global.L1::evict_first.v2.b32 { %r102, %r103 }, [ %rd38 + 0 ];
+	@!%p53 mov.u32 %r102, %r151;
+	@!%p53 mov.u32 %r103, %r151;
+	cvt.u16.u32 	%rs5, %r102;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r102; }
+	cvt.u16.u32 	%rs7, %r103;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r103; }
+	.loc	1 63 103
+	cvt.f32.bf16 %r106, %rs5;
+	mov.b32 	%f11, %r106;
+	cvt.f32.bf16 %r107, %rs6;
+	mov.b32 	%f12, %r107;
+	cvt.f32.bf16 %r108, %rs7;
+	mov.b32 	%f13, %r108;
+	cvt.f32.bf16 %r109, %rs8;
+	mov.b32 	%f14, %r109;
+	.loc	1 64 35
+	mul.wide.u32 	%rd46, %r4, 4;
+	add.s64 	%rd39, %rd8, %rd46;
+	.loc	1 64 40
+	mov.u32 %r110, 0x0;
+	mov.u32 %r111, 0x0;
+	mov.u32 %r112, 0x0;
+	mov.u32 %r113, 0x0;
+	@%p53 ld.global.L1::evict_last.v4.b32 { %r110, %r111, %r112, %r113 }, [ %rd39 + 0 ];
+	@!%p53 mov.u32 %r110, %r151;
+	@!%p53 mov.u32 %r111, %r151;
+	@!%p53 mov.u32 %r112, %r151;
+	@!%p53 mov.u32 %r113, %r151;
+	.loc	1 68 57
+	@%p15 bra 	$L__BB0_4;
+	mov.u64 	%rd47, assertMessage_1;
+	cvta.global.u64 	%rd48, %rd47;
+	mov.u64 	%rd49, assertFile_1;
+	cvta.global.u64 	%rd50, %rd49;
+	mov.u64 	%rd51, assertFunc_1;
+	cvta.global.u64 	%rd52, %rd51;
+	{ // callseq 1, 0
+	.reg .b32 temp_param_reg;
+	.param .b64 param0;
+	st.param.b64 	[param0+0], %rd48;
+	.param .b64 param1;
+	st.param.b64 	[param1+0], %rd50;
+	.param .b32 param2;
+	st.param.b32 	[param2+0], %r175;
+	.param .b64 param3;
+	st.param.b64 	[param3+0], %rd52;
+	.param .b64 param4;
+	st.param.b64 	[param4+0], %rd57;
+	call.uni
+	__assertfail,
+	(
+	param0,
+	param1,
+	param2,
+	param3,
+	param4
+	);
+	} // callseq 1
+$L__BB0_4:
+	.loc	1 69 54
+	mov.u32 %r147, 0x0;
+	mov.u32 %r148, 0x0;
+	mov.u32 %r149, 0x0;
+	mov.u32 %r150, 0x0;
+	@%p53 ld.global.L1::evict_first.v4.b32 { %r147, %r148, %r149, %r150 }, [ %rd54 + 0 ];
+	@!%p53 mov.u32 %r147, %r151;
+	@!%p53 mov.u32 %r148, %r151;
+	@!%p53 mov.u32 %r149, %r151;
+	@!%p53 mov.u32 %r150, %r151;
+	.loc	1 75 24
+	mov.b32 	%r156, %f10;
+	mov.b32 	%r157, 1132462080;
+	div.full.f32 %r155, %r156, %r157;
+	mov.b32 	%f134, %r155;
+	.loc	1 77 24
+	add.f32 	%f135, %f134, 0f3727C5AC;
+	.loc	1 78 30
+	rsqrt.approx.ftz.f32 	%f136, %f135;
+	.loc	1 69 54
+	mov.b32 	%f137, %r150;
+	.loc	1 62 51
+	mov.b32 	%f138, %r97;
+	.loc	1 70 24
+	add.f32 	%f139, %f138, %f137;
+	.loc	1 72 24
+	add.f32 	%f140, %f14, %f139;
+	.loc	1 73 24
+	sub.f32 	%f141, %f140, %f9;
+	.loc	1 69 54
+	mov.b32 	%f142, %r149;
+	.loc	1 62 51
+	mov.b32 	%f143, %r96;
+	.loc	1 70 24
+	add.f32 	%f144, %f143, %f142;
+	.loc	1 72 24
+	add.f32 	%f145, %f13, %f144;
+	.loc	1 73 24
+	sub.f32 	%f146, %f145, %f9;
+	.loc	1 69 54
+	mov.b32 	%f147, %r148;
+	.loc	1 62 51
+	mov.b32 	%f148, %r95;
+	.loc	1 70 24
+	add.f32 	%f149, %f148, %f147;
+	.loc	1 72 24
+	add.f32 	%f150, %f12, %f149;
+	.loc	1 73 24
+	sub.f32 	%f151, %f150, %f9;
+	.loc	1 69 54
+	mov.b32 	%f152, %r147;
+	.loc	1 62 51
+	mov.b32 	%f153, %r94;
+	.loc	1 70 24
+	add.f32 	%f154, %f153, %f152;
+	.loc	1 72 24
+	add.f32 	%f155, %f11, %f154;
+	.loc	1 73 24
+	sub.f32 	%f156, %f155, %f9;
+	.loc	1 64 40
+	mov.b32 	%f157, %r110;
+	mov.b32 	%f158, %r111;
+	mov.b32 	%f159, %r112;
+	mov.b32 	%f160, %r113;
+	.loc	1 79 24
+	mul.f32 	%f161, %f156, %f136;
+	mul.f32 	%f162, %f151, %f136;
+	mul.f32 	%f163, %f146, %f136;
+	mul.f32 	%f164, %f141, %f136;
+	.loc	1 80 24
+	mul.f32 	%f165, %f161, %f157;
+	mul.f32 	%f166, %f162, %f158;
+	mul.f32 	%f167, %f163, %f159;
+	mul.f32 	%f168, %f164, %f160;
+	.loc	1 82 29
+	shl.b64 	%rd56, %rd3, 1;
+	add.s64 	%rd55, %rd9, %rd56;
+	.loc	1 82 52
+	mov.b32 	%r167, %f165;
+	cvt.rn.bf16.f32 %rs9, %r167;
+	mov.b32 	%r168, %f166;
+	cvt.rn.bf16.f32 %rs10, %r168;
+	mov.b32 	%r169, %f167;
+	cvt.rn.bf16.f32 %rs11, %r169;
+	mov.b32 	%r170, %f168;
+	cvt.rn.bf16.f32 %rs12, %r170;
+	mov.b32 	%r173, {%rs9, %rs10};
+	mov.b32 	%r174, {%rs11, %rs12};
+	@%p53 st.global.v2.b32 [ %rd55 + 0 ], { %r173, %r174 };
+	.loc	1 58 4
+	ret;
+$L__tmp17:
+$L__func_end0:
+}
+	// .globl	__nv_rsqrtf
+.visible .func  (.param .b32 func_retval0) __nv_rsqrtf(
+	.param .b32 __nv_rsqrtf_param_0
+)
+{
+	.reg .f32 	%f<3>;
+$L__func_begin1:
+	ld.param.f32 	%f1, [__nv_rsqrtf_param_0];
+	rsqrt.approx.ftz.f32 	%f2, %f1;
+	st.param.f32 	[func_retval0+0], %f2;
+	ret;
+$L__func_end1:
+}
+	.file	1 "/tmp/torchinductor_root/pn/cpn3lawg65lpi63gv6c6pn4oikhg6qva2h2qjdpxe6qj4lvttwez.py"
+	.file	2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
+	.section	.debug_abbrev
+	{
+.b8 1
+.b8 17
+.b8 1
+.b8 37
+.b8 8
+.b8 19
+.b8 5
+.b8 3
+.b8 8
+.b8 16
+.b8 6
+.b8 27
+.b8 8
+.b8 180
+.b8 66
+.b8 12
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+.b8 0
+.b8 2
+.b8 46
+.b8 0
+.b8 135
+.b8 64
+.b8 8
+.b8 3
+.b8 8
+.b8 58
+.b8 11
+.b8 59
+.b8 11
+.b8 63
+.b8 12
+.b8 32
+.b8 11
+.b8 0
+.b8 0
+.b8 3
+.b8 46
+.b8 1
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 64
+.b8 10
+.b8 49
+.b8 19
+.b8 0
+.b8 0
+.b8 4
+.b8 29
+.b8 0
+.b8 49
+.b8 19
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 88
+.b8 11
+.b8 89
+.b8 11
+.b8 87
+.b8 11
+.b8 0
+.b8 0
+.b8 5
+.b8 29
+.b8 1
+.b8 49
+.b8 19
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 88
+.b8 11
+.b8 89
+.b8 11
+.b8 87
+.b8 11
+.b8 0
+.b8 0
+.b8 0
+	}
+	.section	.debug_info
+	{
+.b32 302
+.b8 2
+.b8 0
+.b32 .debug_abbrev
+.b8 8
+.b8 1
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2
+.b8 0
+.b8 99
+.b8 112
+.b8 110
+.b8 51
+.b8 108
+.b8 97
+.b8 119
+.b8 103
+.b8 54
+.b8 53
+.b8 108
+.b8 112
+.b8 105
+.b8 54
+.b8 51
+.b8 103
+.b8 118
+.b8 54
+.b8 99
+.b8 54
+.b8 112
+.b8 110
+.b8 52
+.b8 111
+.b8 105
+.b8 107
+.b8 104
+.b8 103
+.b8 54
+.b8 113
+.b8 118
+.b8 97
+.b8 50
+.b8 104
+.b8 50
+.b8 113
+.b8 106
+.b8 100
+.b8 112
+.b8 120
+.b8 101
+.b8 54
+.b8 113
+.b8 106
+.b8 52
+.b8 108
+.b8 118
+.b8 116
+.b8 116
+.b8 119
+.b8 101
+.b8 122
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line
+.b8 47
+.b8 116
+.b8 109
+.b8 112
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 95
+.b8 114
+.b8 111
+.b8 111
+.b8 116
+.b8 47
+.b8 112
+.b8 110
+.b8 0
+.b8 1
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 2
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 51
+.b8 100
+.b8 52
+.b8 100
+.b8 53
+.b8 100
+.b8 54
+.b8 100
+.b8 101
+.b8 55
+.b8 100
+.b8 101
+.b8 0
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 51
+.b8 100
+.b8 52
+.b8 100
+.b8 53
+.b8 100
+.b8 54
+.b8 100
+.b8 101
+.b8 55
+.b8 100
+.b8 101
+.b8 0
+.b8 1
+.b8 18
+.b8 1
+.b8 1
+.b8 3
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 1
+.b8 156
+.b32 125
+.b8 4
+.b32 125
+.b64 $L__tmp1
+.b64 $L__tmp2
+.b8 2
+.b8 47
+.b8 41
+.b8 5
+.b32 125
+.b64 $L__tmp2
+.b64 $L__tmp15
+.b8 2
+.b8 53
+.b8 44
+.b8 4
+.b32 125
+.b64 $L__tmp2
+.b64 $L__tmp15
+.b8 2
+.b8 120
+.b8 46
+.b8 0
+.b8 4
+.b32 125
+.b64 $L__tmp3
+.b64 $L__tmp16
+.b8 2
+.b8 53
+.b8 44
+.b8 0
+.b8 0
+	}
+	.section	.debug_pubnames
+	{
+.b32 $L__pubNames_end0-$L__pubNames_start0
+$L__pubNames_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 306
+.b32 125
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 51
+.b8 100
+.b8 52
+.b8 100
+.b8 53
+.b8 100
+.b8 54
+.b8 100
+.b8 101
+.b8 55
+.b8 100
+.b8 101
+.b8 0
+.b32 0
+$L__pubNames_end0:
+	}
+	.section	.debug_pubtypes
+	{
+.b32 $L__pubTypes_end0-$L__pubTypes_start0
+$L__pubTypes_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 306
+.b32 0
+$L__pubTypes_end0:
+	}
+	.section	.debug_loc	{	}

.triton/dump/415aac87553b7d064f52694fa7254686/triton_.llir ADDED Viewed

	@@ -0,0 +1,860 @@

+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
+define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !7 {
+  %4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
+  %5 = shl i32 %4, 3, !dbg !10
+  %6 = and i32 %5, 1016, !dbg !10
+  %7 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #4, !dbg !11
+  %8 = shl i32 %7, 10, !dbg !12
+  %9 = or i32 %8, %6, !dbg !13
+  %10 = sext i32 %9 to i64, !dbg !14
+  %11 = getelementptr i16, ptr addrspace(1) %0, i64 %10, !dbg !14
+  %12 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l,b"(ptr addrspace(1) %11, i1 true) #4, !dbg !15
+  %13 = extractvalue { i32, i32, i32, i32 } %12, 0, !dbg !15
+  %14 = extractvalue { i32, i32, i32, i32 } %12, 1, !dbg !15
+  %15 = extractvalue { i32, i32, i32, i32 } %12, 2, !dbg !15
+  %16 = extractvalue { i32, i32, i32, i32 } %12, 3, !dbg !15
+  %17 = trunc i32 %13 to i16, !dbg !15
+  %extelt.offset = lshr i32 %13, 16, !dbg !15
+  %18 = trunc i32 %extelt.offset to i16, !dbg !15
+  %19 = trunc i32 %14 to i16, !dbg !15
+  %extelt.offset1 = lshr i32 %14, 16, !dbg !15
+  %20 = trunc i32 %extelt.offset1 to i16, !dbg !15
+  %21 = trunc i32 %15 to i16, !dbg !15
+  %extelt.offset2 = lshr i32 %15, 16, !dbg !15
+  %22 = trunc i32 %extelt.offset2 to i16, !dbg !15
+  %23 = trunc i32 %16 to i16, !dbg !15
+  %extelt.offset3 = lshr i32 %16, 16, !dbg !15
+  %24 = trunc i32 %extelt.offset3 to i16, !dbg !15
+  %25 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %17) #4, !dbg !16
+  %26 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %18) #4, !dbg !16
+  %27 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %19) #4, !dbg !16
+  %28 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %20) #4, !dbg !16
+  %29 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %21) #4, !dbg !16
+  %30 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %22) #4, !dbg !16
+  %31 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %23) #4, !dbg !16
+  %32 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %24) #4, !dbg !16
+  %33 = fmul float %25, 0x3FE6A09E60000000, !dbg !17
+  %34 = fmul float %26, 0x3FE6A09E60000000, !dbg !17
+  %35 = fmul float %27, 0x3FE6A09E60000000, !dbg !17
+  %36 = fmul float %28, 0x3FE6A09E60000000, !dbg !17
+  %37 = fmul float %29, 0x3FE6A09E60000000, !dbg !17
+  %38 = fmul float %30, 0x3FE6A09E60000000, !dbg !17
+  %39 = fmul float %31, 0x3FE6A09E60000000, !dbg !17
+  %40 = fmul float %32, 0x3FE6A09E60000000, !dbg !17
+  %41 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not.i = icmp eq i32 %41, 0, !dbg !18
+  %42 = tail call float @llvm.nvvm.fabs.ftz.f(float %33) #4, !dbg !18
+  %43 = tail call float @llvm.nvvm.fabs.f(float %33) #4, !dbg !18
+  %.0.i = select i1 %.not.i, float %43, float %42, !dbg !18
+  %44 = fcmp oge float %.0.i, 0x3FF00C1FC0000000, !dbg !18
+  br i1 %44, label %__nv_fabsf.exit1.i, label %46, !dbg !18
+__nv_fabsf.exit1.i:                               ; preds = %3
+  %45 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not1.i = icmp eq i32 %45, 0, !dbg !18
+  %.01.i = select i1 %.not1.i, float %43, float %42, !dbg !18
+  br label %__internal_fmad.exit.i, !dbg !18
+46:                                               ; preds = %3
+  %47 = fmul float %33, %33, !dbg !18
+  br label %__internal_fmad.exit.i, !dbg !18
+__internal_fmad.exit.i:                           ; preds = %46, %__nv_fabsf.exit1.i
+  %48 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i ], [ 0x3FC06EBA60000000, %46 ], !dbg !18
+  %49 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i ], [ 0xBFD8127580000000, %46 ], !dbg !18
+  %50 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i ], [ 0x3FBCE315E0000000, %46 ], !dbg !18
+  %51 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i ], [ 0xBF9B837CE0000000, %46 ], !dbg !18
+  %52 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i ], [ 0x3F755ABD40000000, %46 ], !dbg !18
+  %53 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i ], [ 0xBF4AE9A400000000, %46 ], !dbg !18
+  %54 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i ], [ 0x3F163D2D40000000, %46 ], !dbg !18
+  %55 = phi float [ %.01.i, %__nv_fabsf.exit1.i ], [ %47, %46 ], !dbg !18
+  %56 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not2.i = icmp eq i32 %56, 0, !dbg !18
+  %57 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %54, float %55, float %53) #4, !dbg !18
+  %58 = tail call float @llvm.nvvm.fma.rn.f(float %54, float %55, float %53) #4, !dbg !18
+  %.02.i = select i1 %.not2.i, float %58, float %57, !dbg !18
+  %59 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not3.i = icmp eq i32 %59, 0, !dbg !18
+  %60 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i, float %55, float %52) #4, !dbg !18
+  %61 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i, float %55, float %52) #4, !dbg !18
+  %.03.i = select i1 %.not3.i, float %61, float %60, !dbg !18
+  %62 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not4.i = icmp eq i32 %62, 0, !dbg !18
+  %63 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i, float %55, float %51) #4, !dbg !18
+  %64 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i, float %55, float %51) #4, !dbg !18
+  %.04.i = select i1 %.not4.i, float %64, float %63, !dbg !18
+  %65 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not5.i = icmp eq i32 %65, 0, !dbg !18
+  %66 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i, float %55, float %50) #4, !dbg !18
+  %67 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i, float %55, float %50) #4, !dbg !18
+  %.05.i = select i1 %.not5.i, float %67, float %66, !dbg !18
+  %68 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not6.i = icmp eq i32 %68, 0, !dbg !18
+  %69 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i, float %55, float %49) #4, !dbg !18
+  %70 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i, float %55, float %49) #4, !dbg !18
+  %.06.i = select i1 %.not6.i, float %70, float %69, !dbg !18
+  %71 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not7.i = icmp eq i32 %71, 0, !dbg !18
+  %72 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i, float %55, float %48) #4, !dbg !18
+  %73 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i, float %55, float %48) #4, !dbg !18
+  %.07.i = select i1 %.not7.i, float %73, float %72, !dbg !18
+  %74 = fneg float %55, !dbg !18
+  %75 = select i1 %44, float %74, float %33, !dbg !18
+  %76 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not8.i = icmp eq i32 %76, 0, !dbg !18
+  %77 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i, float %75, float %75) #4, !dbg !18
+  %78 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i, float %75, float %75) #4, !dbg !18
+  %.08.i = select i1 %.not8.i, float %78, float %77, !dbg !18
+  br i1 %44, label %79, label %__nv_erff.exit, !dbg !18
+79:                                               ; preds = %__internal_fmad.exit.i
+  %80 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i) #4, !dbg !18
+  %81 = fsub float 1.000000e+00, %80, !dbg !18
+  %82 = bitcast float %81 to i32, !dbg !18
+  %83 = bitcast float %33 to i32, !dbg !18
+  %84 = and i32 %83, -2147483648, !dbg !18
+  %85 = or i32 %84, %82, !dbg !18
+  %86 = bitcast i32 %85 to float, !dbg !18
+  br label %__nv_erff.exit, !dbg !18
+__nv_erff.exit:                                   ; preds = %__internal_fmad.exit.i, %79
+  %r.0.i = phi float [ %86, %79 ], [ %.08.i, %__internal_fmad.exit.i ], !dbg !18
+  %87 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not.i4 = icmp eq i32 %87, 0, !dbg !18
+  %88 = tail call float @llvm.nvvm.fabs.ftz.f(float %34) #4, !dbg !18
+  %89 = tail call float @llvm.nvvm.fabs.f(float %34) #4, !dbg !18
+  %.0.i5 = select i1 %.not.i4, float %89, float %88, !dbg !18
+  %90 = fcmp oge float %.0.i5, 0x3FF00C1FC0000000, !dbg !18
+  br i1 %90, label %__nv_fabsf.exit1.i22, label %92, !dbg !18
+__nv_fabsf.exit1.i22:                             ; preds = %__nv_erff.exit
+  %91 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not1.i23 = icmp eq i32 %91, 0, !dbg !18
+  %.01.i24 = select i1 %.not1.i23, float %89, float %88, !dbg !18
+  br label %__internal_fmad.exit.i6, !dbg !18
+92:                                               ; preds = %__nv_erff.exit
+  %93 = fmul float %34, %34, !dbg !18
+  br label %__internal_fmad.exit.i6, !dbg !18
+__internal_fmad.exit.i6:                          ; preds = %92, %__nv_fabsf.exit1.i22
+  %94 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i22 ], [ 0x3FC06EBA60000000, %92 ], !dbg !18
+  %95 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i22 ], [ 0xBFD8127580000000, %92 ], !dbg !18
+  %96 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i22 ], [ 0x3FBCE315E0000000, %92 ], !dbg !18
+  %97 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i22 ], [ 0xBF9B837CE0000000, %92 ], !dbg !18
+  %98 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i22 ], [ 0x3F755ABD40000000, %92 ], !dbg !18
+  %99 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i22 ], [ 0xBF4AE9A400000000, %92 ], !dbg !18
+  %100 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i22 ], [ 0x3F163D2D40000000, %92 ], !dbg !18
+  %101 = phi float [ %.01.i24, %__nv_fabsf.exit1.i22 ], [ %93, %92 ], !dbg !18
+  %102 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not2.i7 = icmp eq i32 %102, 0, !dbg !18
+  %103 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %100, float %101, float %99) #4, !dbg !18
+  %104 = tail call float @llvm.nvvm.fma.rn.f(float %100, float %101, float %99) #4, !dbg !18
+  %.02.i8 = select i1 %.not2.i7, float %104, float %103, !dbg !18
+  %105 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not3.i9 = icmp eq i32 %105, 0, !dbg !18
+  %106 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i8, float %101, float %98) #4, !dbg !18
+  %107 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i8, float %101, float %98) #4, !dbg !18
+  %.03.i10 = select i1 %.not3.i9, float %107, float %106, !dbg !18
+  %108 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not4.i11 = icmp eq i32 %108, 0, !dbg !18
+  %109 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i10, float %101, float %97) #4, !dbg !18
+  %110 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i10, float %101, float %97) #4, !dbg !18
+  %.04.i12 = select i1 %.not4.i11, float %110, float %109, !dbg !18
+  %111 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not5.i13 = icmp eq i32 %111, 0, !dbg !18
+  %112 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i12, float %101, float %96) #4, !dbg !18
+  %113 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i12, float %101, float %96) #4, !dbg !18
+  %.05.i14 = select i1 %.not5.i13, float %113, float %112, !dbg !18
+  %114 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not6.i15 = icmp eq i32 %114, 0, !dbg !18
+  %115 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i14, float %101, float %95) #4, !dbg !18
+  %116 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i14, float %101, float %95) #4, !dbg !18
+  %.06.i16 = select i1 %.not6.i15, float %116, float %115, !dbg !18
+  %117 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not7.i17 = icmp eq i32 %117, 0, !dbg !18
+  %118 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i16, float %101, float %94) #4, !dbg !18
+  %119 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i16, float %101, float %94) #4, !dbg !18
+  %.07.i18 = select i1 %.not7.i17, float %119, float %118, !dbg !18
+  %120 = fneg float %101, !dbg !18
+  %121 = select i1 %90, float %120, float %34, !dbg !18
+  %122 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not8.i19 = icmp eq i32 %122, 0, !dbg !18
+  %123 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i18, float %121, float %121) #4, !dbg !18
+  %124 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i18, float %121, float %121) #4, !dbg !18
+  %.08.i20 = select i1 %.not8.i19, float %124, float %123, !dbg !18
+  br i1 %90, label %125, label %__nv_erff.exit25, !dbg !18
+125:                                              ; preds = %__internal_fmad.exit.i6
+  %126 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i20) #4, !dbg !18
+  %127 = fsub float 1.000000e+00, %126, !dbg !18
+  %128 = bitcast float %127 to i32, !dbg !18
+  %129 = bitcast float %34 to i32, !dbg !18
+  %130 = and i32 %129, -2147483648, !dbg !18
+  %131 = or i32 %130, %128, !dbg !18
+  %132 = bitcast i32 %131 to float, !dbg !18
+  br label %__nv_erff.exit25, !dbg !18
+__nv_erff.exit25:                                 ; preds = %__internal_fmad.exit.i6, %125
+  %r.0.i21 = phi float [ %132, %125 ], [ %.08.i20, %__internal_fmad.exit.i6 ], !dbg !18
+  %133 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not.i26 = icmp eq i32 %133, 0, !dbg !18
+  %134 = tail call float @llvm.nvvm.fabs.ftz.f(float %35) #4, !dbg !18
+  %135 = tail call float @llvm.nvvm.fabs.f(float %35) #4, !dbg !18
+  %.0.i27 = select i1 %.not.i26, float %135, float %134, !dbg !18
+  %136 = fcmp oge float %.0.i27, 0x3FF00C1FC0000000, !dbg !18
+  br i1 %136, label %__nv_fabsf.exit1.i44, label %138, !dbg !18
+__nv_fabsf.exit1.i44:                             ; preds = %__nv_erff.exit25
+  %137 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not1.i45 = icmp eq i32 %137, 0, !dbg !18
+  %.01.i46 = select i1 %.not1.i45, float %135, float %134, !dbg !18
+  br label %__internal_fmad.exit.i28, !dbg !18
+138:                                              ; preds = %__nv_erff.exit25
+  %139 = fmul float %35, %35, !dbg !18
+  br label %__internal_fmad.exit.i28, !dbg !18
+__internal_fmad.exit.i28:                         ; preds = %138, %__nv_fabsf.exit1.i44
+  %140 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i44 ], [ 0x3FC06EBA60000000, %138 ], !dbg !18
+  %141 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i44 ], [ 0xBFD8127580000000, %138 ], !dbg !18
+  %142 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i44 ], [ 0x3FBCE315E0000000, %138 ], !dbg !18
+  %143 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i44 ], [ 0xBF9B837CE0000000, %138 ], !dbg !18
+  %144 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i44 ], [ 0x3F755ABD40000000, %138 ], !dbg !18
+  %145 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i44 ], [ 0xBF4AE9A400000000, %138 ], !dbg !18
+  %146 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i44 ], [ 0x3F163D2D40000000, %138 ], !dbg !18
+  %147 = phi float [ %.01.i46, %__nv_fabsf.exit1.i44 ], [ %139, %138 ], !dbg !18
+  %148 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not2.i29 = icmp eq i32 %148, 0, !dbg !18
+  %149 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %146, float %147, float %145) #4, !dbg !18
+  %150 = tail call float @llvm.nvvm.fma.rn.f(float %146, float %147, float %145) #4, !dbg !18
+  %.02.i30 = select i1 %.not2.i29, float %150, float %149, !dbg !18
+  %151 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not3.i31 = icmp eq i32 %151, 0, !dbg !18
+  %152 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i30, float %147, float %144) #4, !dbg !18
+  %153 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i30, float %147, float %144) #4, !dbg !18
+  %.03.i32 = select i1 %.not3.i31, float %153, float %152, !dbg !18
+  %154 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not4.i33 = icmp eq i32 %154, 0, !dbg !18
+  %155 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i32, float %147, float %143) #4, !dbg !18
+  %156 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i32, float %147, float %143) #4, !dbg !18
+  %.04.i34 = select i1 %.not4.i33, float %156, float %155, !dbg !18
+  %157 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not5.i35 = icmp eq i32 %157, 0, !dbg !18
+  %158 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i34, float %147, float %142) #4, !dbg !18
+  %159 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i34, float %147, float %142) #4, !dbg !18
+  %.05.i36 = select i1 %.not5.i35, float %159, float %158, !dbg !18
+  %160 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not6.i37 = icmp eq i32 %160, 0, !dbg !18
+  %161 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i36, float %147, float %141) #4, !dbg !18
+  %162 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i36, float %147, float %141) #4, !dbg !18
+  %.06.i38 = select i1 %.not6.i37, float %162, float %161, !dbg !18
+  %163 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not7.i39 = icmp eq i32 %163, 0, !dbg !18
+  %164 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i38, float %147, float %140) #4, !dbg !18
+  %165 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i38, float %147, float %140) #4, !dbg !18
+  %.07.i40 = select i1 %.not7.i39, float %165, float %164, !dbg !18
+  %166 = fneg float %147, !dbg !18
+  %167 = select i1 %136, float %166, float %35, !dbg !18
+  %168 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not8.i41 = icmp eq i32 %168, 0, !dbg !18
+  %169 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i40, float %167, float %167) #4, !dbg !18
+  %170 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i40, float %167, float %167) #4, !dbg !18
+  %.08.i42 = select i1 %.not8.i41, float %170, float %169, !dbg !18
+  br i1 %136, label %171, label %__nv_erff.exit47, !dbg !18
+171:                                              ; preds = %__internal_fmad.exit.i28
+  %172 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i42) #4, !dbg !18
+  %173 = fsub float 1.000000e+00, %172, !dbg !18
+  %174 = bitcast float %173 to i32, !dbg !18
+  %175 = bitcast float %35 to i32, !dbg !18
+  %176 = and i32 %175, -2147483648, !dbg !18
+  %177 = or i32 %176, %174, !dbg !18
+  %178 = bitcast i32 %177 to float, !dbg !18
+  br label %__nv_erff.exit47, !dbg !18
+__nv_erff.exit47:                                 ; preds = %__internal_fmad.exit.i28, %171
+  %r.0.i43 = phi float [ %178, %171 ], [ %.08.i42, %__internal_fmad.exit.i28 ], !dbg !18
+  %179 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not.i48 = icmp eq i32 %179, 0, !dbg !18
+  %180 = tail call float @llvm.nvvm.fabs.ftz.f(float %36) #4, !dbg !18
+  %181 = tail call float @llvm.nvvm.fabs.f(float %36) #4, !dbg !18
+  %.0.i49 = select i1 %.not.i48, float %181, float %180, !dbg !18
+  %182 = fcmp oge float %.0.i49, 0x3FF00C1FC0000000, !dbg !18
+  br i1 %182, label %__nv_fabsf.exit1.i66, label %184, !dbg !18
+__nv_fabsf.exit1.i66:                             ; preds = %__nv_erff.exit47
+  %183 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not1.i67 = icmp eq i32 %183, 0, !dbg !18
+  %.01.i68 = select i1 %.not1.i67, float %181, float %180, !dbg !18
+  br label %__internal_fmad.exit.i50, !dbg !18
+184:                                              ; preds = %__nv_erff.exit47
+  %185 = fmul float %36, %36, !dbg !18
+  br label %__internal_fmad.exit.i50, !dbg !18
+__internal_fmad.exit.i50:                         ; preds = %184, %__nv_fabsf.exit1.i66
+  %186 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i66 ], [ 0x3FC06EBA60000000, %184 ], !dbg !18
+  %187 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i66 ], [ 0xBFD8127580000000, %184 ], !dbg !18
+  %188 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i66 ], [ 0x3FBCE315E0000000, %184 ], !dbg !18
+  %189 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i66 ], [ 0xBF9B837CE0000000, %184 ], !dbg !18
+  %190 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i66 ], [ 0x3F755ABD40000000, %184 ], !dbg !18
+  %191 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i66 ], [ 0xBF4AE9A400000000, %184 ], !dbg !18
+  %192 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i66 ], [ 0x3F163D2D40000000, %184 ], !dbg !18
+  %193 = phi float [ %.01.i68, %__nv_fabsf.exit1.i66 ], [ %185, %184 ], !dbg !18
+  %194 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not2.i51 = icmp eq i32 %194, 0, !dbg !18
+  %195 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %192, float %193, float %191) #4, !dbg !18
+  %196 = tail call float @llvm.nvvm.fma.rn.f(float %192, float %193, float %191) #4, !dbg !18
+  %.02.i52 = select i1 %.not2.i51, float %196, float %195, !dbg !18
+  %197 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not3.i53 = icmp eq i32 %197, 0, !dbg !18
+  %198 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i52, float %193, float %190) #4, !dbg !18
+  %199 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i52, float %193, float %190) #4, !dbg !18
+  %.03.i54 = select i1 %.not3.i53, float %199, float %198, !dbg !18
+  %200 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not4.i55 = icmp eq i32 %200, 0, !dbg !18
+  %201 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i54, float %193, float %189) #4, !dbg !18
+  %202 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i54, float %193, float %189) #4, !dbg !18
+  %.04.i56 = select i1 %.not4.i55, float %202, float %201, !dbg !18
+  %203 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not5.i57 = icmp eq i32 %203, 0, !dbg !18
+  %204 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i56, float %193, float %188) #4, !dbg !18
+  %205 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i56, float %193, float %188) #4, !dbg !18
+  %.05.i58 = select i1 %.not5.i57, float %205, float %204, !dbg !18
+  %206 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not6.i59 = icmp eq i32 %206, 0, !dbg !18
+  %207 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i58, float %193, float %187) #4, !dbg !18
+  %208 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i58, float %193, float %187) #4, !dbg !18
+  %.06.i60 = select i1 %.not6.i59, float %208, float %207, !dbg !18
+  %209 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not7.i61 = icmp eq i32 %209, 0, !dbg !18
+  %210 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i60, float %193, float %186) #4, !dbg !18
+  %211 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i60, float %193, float %186) #4, !dbg !18
+  %.07.i62 = select i1 %.not7.i61, float %211, float %210, !dbg !18
+  %212 = fneg float %193, !dbg !18
+  %213 = select i1 %182, float %212, float %36, !dbg !18
+  %214 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not8.i63 = icmp eq i32 %214, 0, !dbg !18
+  %215 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i62, float %213, float %213) #4, !dbg !18
+  %216 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i62, float %213, float %213) #4, !dbg !18
+  %.08.i64 = select i1 %.not8.i63, float %216, float %215, !dbg !18
+  br i1 %182, label %217, label %__nv_erff.exit69, !dbg !18
+217:                                              ; preds = %__internal_fmad.exit.i50
+  %218 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i64) #4, !dbg !18
+  %219 = fsub float 1.000000e+00, %218, !dbg !18
+  %220 = bitcast float %219 to i32, !dbg !18
+  %221 = bitcast float %36 to i32, !dbg !18
+  %222 = and i32 %221, -2147483648, !dbg !18
+  %223 = or i32 %222, %220, !dbg !18
+  %224 = bitcast i32 %223 to float, !dbg !18
+  br label %__nv_erff.exit69, !dbg !18
+__nv_erff.exit69:                                 ; preds = %__internal_fmad.exit.i50, %217
+  %r.0.i65 = phi float [ %224, %217 ], [ %.08.i64, %__internal_fmad.exit.i50 ], !dbg !18
+  %225 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not.i70 = icmp eq i32 %225, 0, !dbg !18
+  %226 = tail call float @llvm.nvvm.fabs.ftz.f(float %37) #4, !dbg !18
+  %227 = tail call float @llvm.nvvm.fabs.f(float %37) #4, !dbg !18
+  %.0.i71 = select i1 %.not.i70, float %227, float %226, !dbg !18
+  %228 = fcmp oge float %.0.i71, 0x3FF00C1FC0000000, !dbg !18
+  br i1 %228, label %__nv_fabsf.exit1.i88, label %230, !dbg !18
+__nv_fabsf.exit1.i88:                             ; preds = %__nv_erff.exit69
+  %229 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not1.i89 = icmp eq i32 %229, 0, !dbg !18
+  %.01.i90 = select i1 %.not1.i89, float %227, float %226, !dbg !18
+  br label %__internal_fmad.exit.i72, !dbg !18
+230:                                              ; preds = %__nv_erff.exit69
+  %231 = fmul float %37, %37, !dbg !18
+  br label %__internal_fmad.exit.i72, !dbg !18
+__internal_fmad.exit.i72:                         ; preds = %230, %__nv_fabsf.exit1.i88
+  %232 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i88 ], [ 0x3FC06EBA60000000, %230 ], !dbg !18
+  %233 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i88 ], [ 0xBFD8127580000000, %230 ], !dbg !18
+  %234 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i88 ], [ 0x3FBCE315E0000000, %230 ], !dbg !18
+  %235 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i88 ], [ 0xBF9B837CE0000000, %230 ], !dbg !18
+  %236 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i88 ], [ 0x3F755ABD40000000, %230 ], !dbg !18
+  %237 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i88 ], [ 0xBF4AE9A400000000, %230 ], !dbg !18
+  %238 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i88 ], [ 0x3F163D2D40000000, %230 ], !dbg !18
+  %239 = phi float [ %.01.i90, %__nv_fabsf.exit1.i88 ], [ %231, %230 ], !dbg !18
+  %240 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not2.i73 = icmp eq i32 %240, 0, !dbg !18
+  %241 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %238, float %239, float %237) #4, !dbg !18
+  %242 = tail call float @llvm.nvvm.fma.rn.f(float %238, float %239, float %237) #4, !dbg !18
+  %.02.i74 = select i1 %.not2.i73, float %242, float %241, !dbg !18
+  %243 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not3.i75 = icmp eq i32 %243, 0, !dbg !18
+  %244 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i74, float %239, float %236) #4, !dbg !18
+  %245 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i74, float %239, float %236) #4, !dbg !18
+  %.03.i76 = select i1 %.not3.i75, float %245, float %244, !dbg !18
+  %246 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not4.i77 = icmp eq i32 %246, 0, !dbg !18
+  %247 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i76, float %239, float %235) #4, !dbg !18
+  %248 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i76, float %239, float %235) #4, !dbg !18
+  %.04.i78 = select i1 %.not4.i77, float %248, float %247, !dbg !18
+  %249 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not5.i79 = icmp eq i32 %249, 0, !dbg !18
+  %250 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i78, float %239, float %234) #4, !dbg !18
+  %251 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i78, float %239, float %234) #4, !dbg !18
+  %.05.i80 = select i1 %.not5.i79, float %251, float %250, !dbg !18
+  %252 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not6.i81 = icmp eq i32 %252, 0, !dbg !18
+  %253 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i80, float %239, float %233) #4, !dbg !18
+  %254 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i80, float %239, float %233) #4, !dbg !18
+  %.06.i82 = select i1 %.not6.i81, float %254, float %253, !dbg !18
+  %255 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not7.i83 = icmp eq i32 %255, 0, !dbg !18
+  %256 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i82, float %239, float %232) #4, !dbg !18
+  %257 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i82, float %239, float %232) #4, !dbg !18
+  %.07.i84 = select i1 %.not7.i83, float %257, float %256, !dbg !18
+  %258 = fneg float %239, !dbg !18
+  %259 = select i1 %228, float %258, float %37, !dbg !18
+  %260 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not8.i85 = icmp eq i32 %260, 0, !dbg !18
+  %261 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i84, float %259, float %259) #4, !dbg !18
+  %262 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i84, float %259, float %259) #4, !dbg !18
+  %.08.i86 = select i1 %.not8.i85, float %262, float %261, !dbg !18
+  br i1 %228, label %263, label %__nv_erff.exit91, !dbg !18
+263:                                              ; preds = %__internal_fmad.exit.i72
+  %264 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i86) #4, !dbg !18
+  %265 = fsub float 1.000000e+00, %264, !dbg !18
+  %266 = bitcast float %265 to i32, !dbg !18
+  %267 = bitcast float %37 to i32, !dbg !18
+  %268 = and i32 %267, -2147483648, !dbg !18
+  %269 = or i32 %268, %266, !dbg !18
+  %270 = bitcast i32 %269 to float, !dbg !18
+  br label %__nv_erff.exit91, !dbg !18
+__nv_erff.exit91:                                 ; preds = %__internal_fmad.exit.i72, %263
+  %r.0.i87 = phi float [ %270, %263 ], [ %.08.i86, %__internal_fmad.exit.i72 ], !dbg !18
+  %271 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not.i92 = icmp eq i32 %271, 0, !dbg !18
+  %272 = tail call float @llvm.nvvm.fabs.ftz.f(float %38) #4, !dbg !18
+  %273 = tail call float @llvm.nvvm.fabs.f(float %38) #4, !dbg !18
+  %.0.i93 = select i1 %.not.i92, float %273, float %272, !dbg !18
+  %274 = fcmp oge float %.0.i93, 0x3FF00C1FC0000000, !dbg !18
+  br i1 %274, label %__nv_fabsf.exit1.i110, label %276, !dbg !18
+__nv_fabsf.exit1.i110:                            ; preds = %__nv_erff.exit91
+  %275 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not1.i111 = icmp eq i32 %275, 0, !dbg !18
+  %.01.i112 = select i1 %.not1.i111, float %273, float %272, !dbg !18
+  br label %__internal_fmad.exit.i94, !dbg !18
+276:                                              ; preds = %__nv_erff.exit91
+  %277 = fmul float %38, %38, !dbg !18
+  br label %__internal_fmad.exit.i94, !dbg !18
+__internal_fmad.exit.i94:                         ; preds = %276, %__nv_fabsf.exit1.i110
+  %278 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i110 ], [ 0x3FC06EBA60000000, %276 ], !dbg !18
+  %279 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i110 ], [ 0xBFD8127580000000, %276 ], !dbg !18
+  %280 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i110 ], [ 0x3FBCE315E0000000, %276 ], !dbg !18
+  %281 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i110 ], [ 0xBF9B837CE0000000, %276 ], !dbg !18
+  %282 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i110 ], [ 0x3F755ABD40000000, %276 ], !dbg !18
+  %283 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i110 ], [ 0xBF4AE9A400000000, %276 ], !dbg !18
+  %284 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i110 ], [ 0x3F163D2D40000000, %276 ], !dbg !18
+  %285 = phi float [ %.01.i112, %__nv_fabsf.exit1.i110 ], [ %277, %276 ], !dbg !18
+  %286 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not2.i95 = icmp eq i32 %286, 0, !dbg !18
+  %287 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %284, float %285, float %283) #4, !dbg !18
+  %288 = tail call float @llvm.nvvm.fma.rn.f(float %284, float %285, float %283) #4, !dbg !18
+  %.02.i96 = select i1 %.not2.i95, float %288, float %287, !dbg !18
+  %289 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not3.i97 = icmp eq i32 %289, 0, !dbg !18
+  %290 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i96, float %285, float %282) #4, !dbg !18
+  %291 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i96, float %285, float %282) #4, !dbg !18
+  %.03.i98 = select i1 %.not3.i97, float %291, float %290, !dbg !18
+  %292 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not4.i99 = icmp eq i32 %292, 0, !dbg !18
+  %293 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i98, float %285, float %281) #4, !dbg !18
+  %294 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i98, float %285, float %281) #4, !dbg !18
+  %.04.i100 = select i1 %.not4.i99, float %294, float %293, !dbg !18
+  %295 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not5.i101 = icmp eq i32 %295, 0, !dbg !18
+  %296 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i100, float %285, float %280) #4, !dbg !18
+  %297 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i100, float %285, float %280) #4, !dbg !18
+  %.05.i102 = select i1 %.not5.i101, float %297, float %296, !dbg !18
+  %298 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not6.i103 = icmp eq i32 %298, 0, !dbg !18
+  %299 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i102, float %285, float %279) #4, !dbg !18
+  %300 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i102, float %285, float %279) #4, !dbg !18
+  %.06.i104 = select i1 %.not6.i103, float %300, float %299, !dbg !18
+  %301 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not7.i105 = icmp eq i32 %301, 0, !dbg !18
+  %302 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i104, float %285, float %278) #4, !dbg !18
+  %303 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i104, float %285, float %278) #4, !dbg !18
+  %.07.i106 = select i1 %.not7.i105, float %303, float %302, !dbg !18
+  %304 = fneg float %285, !dbg !18
+  %305 = select i1 %274, float %304, float %38, !dbg !18
+  %306 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not8.i107 = icmp eq i32 %306, 0, !dbg !18
+  %307 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i106, float %305, float %305) #4, !dbg !18
+  %308 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i106, float %305, float %305) #4, !dbg !18
+  %.08.i108 = select i1 %.not8.i107, float %308, float %307, !dbg !18
+  br i1 %274, label %309, label %__nv_erff.exit113, !dbg !18
+309:                                              ; preds = %__internal_fmad.exit.i94
+  %310 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i108) #4, !dbg !18
+  %311 = fsub float 1.000000e+00, %310, !dbg !18
+  %312 = bitcast float %311 to i32, !dbg !18
+  %313 = bitcast float %38 to i32, !dbg !18
+  %314 = and i32 %313, -2147483648, !dbg !18
+  %315 = or i32 %314, %312, !dbg !18
+  %316 = bitcast i32 %315 to float, !dbg !18
+  br label %__nv_erff.exit113, !dbg !18
+__nv_erff.exit113:                                ; preds = %__internal_fmad.exit.i94, %309
+  %r.0.i109 = phi float [ %316, %309 ], [ %.08.i108, %__internal_fmad.exit.i94 ], !dbg !18
+  %317 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not.i114 = icmp eq i32 %317, 0, !dbg !18
+  %318 = tail call float @llvm.nvvm.fabs.ftz.f(float %39) #4, !dbg !18
+  %319 = tail call float @llvm.nvvm.fabs.f(float %39) #4, !dbg !18
+  %.0.i115 = select i1 %.not.i114, float %319, float %318, !dbg !18
+  %320 = fcmp oge float %.0.i115, 0x3FF00C1FC0000000, !dbg !18
+  br i1 %320, label %__nv_fabsf.exit1.i132, label %322, !dbg !18
+__nv_fabsf.exit1.i132:                            ; preds = %__nv_erff.exit113
+  %321 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not1.i133 = icmp eq i32 %321, 0, !dbg !18
+  %.01.i134 = select i1 %.not1.i133, float %319, float %318, !dbg !18
+  br label %__internal_fmad.exit.i116, !dbg !18
+322:                                              ; preds = %__nv_erff.exit113
+  %323 = fmul float %39, %39, !dbg !18
+  br label %__internal_fmad.exit.i116, !dbg !18
+__internal_fmad.exit.i116:                        ; preds = %322, %__nv_fabsf.exit1.i132
+  %324 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i132 ], [ 0x3FC06EBA60000000, %322 ], !dbg !18
+  %325 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i132 ], [ 0xBFD8127580000000, %322 ], !dbg !18
+  %326 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i132 ], [ 0x3FBCE315E0000000, %322 ], !dbg !18
+  %327 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i132 ], [ 0xBF9B837CE0000000, %322 ], !dbg !18
+  %328 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i132 ], [ 0x3F755ABD40000000, %322 ], !dbg !18
+  %329 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i132 ], [ 0xBF4AE9A400000000, %322 ], !dbg !18
+  %330 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i132 ], [ 0x3F163D2D40000000, %322 ], !dbg !18
+  %331 = phi float [ %.01.i134, %__nv_fabsf.exit1.i132 ], [ %323, %322 ], !dbg !18
+  %332 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not2.i117 = icmp eq i32 %332, 0, !dbg !18
+  %333 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %330, float %331, float %329) #4, !dbg !18
+  %334 = tail call float @llvm.nvvm.fma.rn.f(float %330, float %331, float %329) #4, !dbg !18
+  %.02.i118 = select i1 %.not2.i117, float %334, float %333, !dbg !18
+  %335 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not3.i119 = icmp eq i32 %335, 0, !dbg !18
+  %336 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i118, float %331, float %328) #4, !dbg !18
+  %337 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i118, float %331, float %328) #4, !dbg !18
+  %.03.i120 = select i1 %.not3.i119, float %337, float %336, !dbg !18
+  %338 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not4.i121 = icmp eq i32 %338, 0, !dbg !18
+  %339 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i120, float %331, float %327) #4, !dbg !18
+  %340 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i120, float %331, float %327) #4, !dbg !18
+  %.04.i122 = select i1 %.not4.i121, float %340, float %339, !dbg !18
+  %341 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not5.i123 = icmp eq i32 %341, 0, !dbg !18
+  %342 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i122, float %331, float %326) #4, !dbg !18
+  %343 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i122, float %331, float %326) #4, !dbg !18
+  %.05.i124 = select i1 %.not5.i123, float %343, float %342, !dbg !18
+  %344 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not6.i125 = icmp eq i32 %344, 0, !dbg !18
+  %345 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i124, float %331, float %325) #4, !dbg !18
+  %346 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i124, float %331, float %325) #4, !dbg !18
+  %.06.i126 = select i1 %.not6.i125, float %346, float %345, !dbg !18
+  %347 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not7.i127 = icmp eq i32 %347, 0, !dbg !18
+  %348 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i126, float %331, float %324) #4, !dbg !18
+  %349 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i126, float %331, float %324) #4, !dbg !18
+  %.07.i128 = select i1 %.not7.i127, float %349, float %348, !dbg !18
+  %350 = fneg float %331, !dbg !18
+  %351 = select i1 %320, float %350, float %39, !dbg !18
+  %352 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not8.i129 = icmp eq i32 %352, 0, !dbg !18
+  %353 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i128, float %351, float %351) #4, !dbg !18
+  %354 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i128, float %351, float %351) #4, !dbg !18
+  %.08.i130 = select i1 %.not8.i129, float %354, float %353, !dbg !18
+  br i1 %320, label %355, label %__nv_erff.exit135, !dbg !18
+355:                                              ; preds = %__internal_fmad.exit.i116
+  %356 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i130) #4, !dbg !18
+  %357 = fsub float 1.000000e+00, %356, !dbg !18
+  %358 = bitcast float %357 to i32, !dbg !18
+  %359 = bitcast float %39 to i32, !dbg !18
+  %360 = and i32 %359, -2147483648, !dbg !18
+  %361 = or i32 %360, %358, !dbg !18
+  %362 = bitcast i32 %361 to float, !dbg !18
+  br label %__nv_erff.exit135, !dbg !18
+__nv_erff.exit135:                                ; preds = %__internal_fmad.exit.i116, %355
+  %r.0.i131 = phi float [ %362, %355 ], [ %.08.i130, %__internal_fmad.exit.i116 ], !dbg !18
+  %363 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not.i136 = icmp eq i32 %363, 0, !dbg !18
+  %364 = tail call float @llvm.nvvm.fabs.ftz.f(float %40) #4, !dbg !18
+  %365 = tail call float @llvm.nvvm.fabs.f(float %40) #4, !dbg !18
+  %.0.i137 = select i1 %.not.i136, float %365, float %364, !dbg !18
+  %366 = fcmp oge float %.0.i137, 0x3FF00C1FC0000000, !dbg !18
+  br i1 %366, label %__nv_fabsf.exit1.i154, label %368, !dbg !18
+__nv_fabsf.exit1.i154:                            ; preds = %__nv_erff.exit135
+  %367 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not1.i155 = icmp eq i32 %367, 0, !dbg !18
+  %.01.i156 = select i1 %.not1.i155, float %365, float %364, !dbg !18
+  br label %__internal_fmad.exit.i138, !dbg !18
+368:                                              ; preds = %__nv_erff.exit135
+  %369 = fmul float %40, %40, !dbg !18
+  br label %__internal_fmad.exit.i138, !dbg !18
+__internal_fmad.exit.i138:                        ; preds = %368, %__nv_fabsf.exit1.i154
+  %370 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i154 ], [ 0x3FC06EBA60000000, %368 ], !dbg !18
+  %371 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i154 ], [ 0xBFD8127580000000, %368 ], !dbg !18
+  %372 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i154 ], [ 0x3FBCE315E0000000, %368 ], !dbg !18
+  %373 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i154 ], [ 0xBF9B837CE0000000, %368 ], !dbg !18
+  %374 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i154 ], [ 0x3F755ABD40000000, %368 ], !dbg !18
+  %375 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i154 ], [ 0xBF4AE9A400000000, %368 ], !dbg !18
+  %376 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i154 ], [ 0x3F163D2D40000000, %368 ], !dbg !18
+  %377 = phi float [ %.01.i156, %__nv_fabsf.exit1.i154 ], [ %369, %368 ], !dbg !18
+  %378 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not2.i139 = icmp eq i32 %378, 0, !dbg !18
+  %379 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %376, float %377, float %375) #4, !dbg !18
+  %380 = tail call float @llvm.nvvm.fma.rn.f(float %376, float %377, float %375) #4, !dbg !18
+  %.02.i140 = select i1 %.not2.i139, float %380, float %379, !dbg !18
+  %381 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not3.i141 = icmp eq i32 %381, 0, !dbg !18
+  %382 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i140, float %377, float %374) #4, !dbg !18
+  %383 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i140, float %377, float %374) #4, !dbg !18
+  %.03.i142 = select i1 %.not3.i141, float %383, float %382, !dbg !18
+  %384 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not4.i143 = icmp eq i32 %384, 0, !dbg !18
+  %385 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i142, float %377, float %373) #4, !dbg !18
+  %386 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i142, float %377, float %373) #4, !dbg !18
+  %.04.i144 = select i1 %.not4.i143, float %386, float %385, !dbg !18
+  %387 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not5.i145 = icmp eq i32 %387, 0, !dbg !18
+  %388 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i144, float %377, float %372) #4, !dbg !18
+  %389 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i144, float %377, float %372) #4, !dbg !18
+  %.05.i146 = select i1 %.not5.i145, float %389, float %388, !dbg !18
+  %390 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not6.i147 = icmp eq i32 %390, 0, !dbg !18
+  %391 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i146, float %377, float %371) #4, !dbg !18
+  %392 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i146, float %377, float %371) #4, !dbg !18
+  %.06.i148 = select i1 %.not6.i147, float %392, float %391, !dbg !18
+  %393 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not7.i149 = icmp eq i32 %393, 0, !dbg !18
+  %394 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i148, float %377, float %370) #4, !dbg !18
+  %395 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i148, float %377, float %370) #4, !dbg !18
+  %.07.i150 = select i1 %.not7.i149, float %395, float %394, !dbg !18
+  %396 = fneg float %377, !dbg !18
+  %397 = select i1 %366, float %396, float %40, !dbg !18
+  %398 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not8.i151 = icmp eq i32 %398, 0, !dbg !18
+  %399 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i150, float %397, float %397) #4, !dbg !18
+  %400 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i150, float %397, float %397) #4, !dbg !18
+  %.08.i152 = select i1 %.not8.i151, float %400, float %399, !dbg !18
+  br i1 %366, label %401, label %__nv_erff.exit157, !dbg !18
+401:                                              ; preds = %__internal_fmad.exit.i138
+  %402 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i152) #4, !dbg !18
+  %403 = fsub float 1.000000e+00, %402, !dbg !18
+  %404 = bitcast float %403 to i32, !dbg !18
+  %405 = bitcast float %40 to i32, !dbg !18
+  %406 = and i32 %405, -2147483648, !dbg !18
+  %407 = or i32 %406, %404, !dbg !18
+  %408 = bitcast i32 %407 to float, !dbg !18
+  br label %__nv_erff.exit157, !dbg !18
+__nv_erff.exit157:                                ; preds = %__internal_fmad.exit.i138, %401
+  %r.0.i153 = phi float [ %408, %401 ], [ %.08.i152, %__internal_fmad.exit.i138 ], !dbg !18
+  %409 = fmul float %32, 5.000000e-01, !dbg !19
+  %410 = fmul float %31, 5.000000e-01, !dbg !19
+  %411 = fmul float %30, 5.000000e-01, !dbg !19
+  %412 = fmul float %29, 5.000000e-01, !dbg !19
+  %413 = fmul float %28, 5.000000e-01, !dbg !19
+  %414 = fmul float %27, 5.000000e-01, !dbg !19
+  %415 = fmul float %26, 5.000000e-01, !dbg !19
+  %416 = fmul float %25, 5.000000e-01, !dbg !19
+  %417 = fadd float %r.0.i, 1.000000e+00, !dbg !20
+  %418 = fadd float %r.0.i21, 1.000000e+00, !dbg !20
+  %419 = fadd float %r.0.i43, 1.000000e+00, !dbg !20
+  %420 = fadd float %r.0.i65, 1.000000e+00, !dbg !20
+  %421 = fadd float %r.0.i87, 1.000000e+00, !dbg !20
+  %422 = fadd float %r.0.i109, 1.000000e+00, !dbg !20
+  %423 = fadd float %r.0.i131, 1.000000e+00, !dbg !20
+  %424 = fadd float %r.0.i153, 1.000000e+00, !dbg !20
+  %425 = fmul float %416, %417, !dbg !21
+  %426 = fmul float %415, %418, !dbg !21
+  %427 = fmul float %414, %419, !dbg !21
+  %428 = fmul float %413, %420, !dbg !21
+  %429 = fmul float %412, %421, !dbg !21
+  %430 = fmul float %411, %422, !dbg !21
+  %431 = fmul float %410, %423, !dbg !21
+  %432 = fmul float %409, %424, !dbg !21
+  %433 = getelementptr i16, ptr addrspace(1) %1, i64 %10, !dbg !22
+  %434 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %425) #4, !dbg !23
+  %435 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %426) #4, !dbg !23
+  %436 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %427) #4, !dbg !23
+  %437 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %428) #4, !dbg !23
+  %438 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %429) #4, !dbg !23
+  %439 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %430) #4, !dbg !23
+  %440 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %431) #4, !dbg !23
+  %441 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %432) #4, !dbg !23
+  %442 = insertelement <2 x i16> undef, i16 %434, i64 0, !dbg !23
+  %443 = insertelement <2 x i16> %442, i16 %435, i64 1, !dbg !23
+  %444 = bitcast <2 x i16> %443 to i32, !dbg !23
+  %445 = insertelement <2 x i16> undef, i16 %436, i64 0, !dbg !23
+  %446 = insertelement <2 x i16> %445, i16 %437, i64 1, !dbg !23
+  %447 = bitcast <2 x i16> %446 to i32, !dbg !23
+  %448 = insertelement <2 x i16> undef, i16 %438, i64 0, !dbg !23
+  %449 = insertelement <2 x i16> %448, i16 %439, i64 1, !dbg !23
+  %450 = bitcast <2 x i16> %449 to i32, !dbg !23
+  %451 = insertelement <2 x i16> undef, i16 %440, i64 0, !dbg !23
+  %452 = insertelement <2 x i16> %451, i16 %441, i64 1, !dbg !23
+  %453 = bitcast <2 x i16> %452 to i32, !dbg !23
+  tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %444, i32 %447, i32 %450, i32 %453, ptr addrspace(1) %433, i1 true) #4, !dbg !23
+  ret void, !dbg !24
+}
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
+; Function Attrs: alwaysinline nounwind
+; __nv_erff(a): single-precision helper; by its name an erf implementation
+; (presumably the libdevice routine -- confirm against the libdevice source).
+; Structure visible in the body:
+;   * |a| is compared against 0x3FF00C1FC0000000 (about 1.003).
+;   * Below the threshold a polynomial p in a*a is evaluated and the result
+;     is fma(p, a, a).
+;   * At or above it a polynomial p in |a| is evaluated, the result is
+;     1 - ex2.approx(fma(p, -|a|, -|a|)), and the sign bit of a is OR-ed in.
+; Each __nvvm_reflect(@.str) query chooses between the .ftz and the non-.ftz
+; form of the same intrinsic: a return value of 0 selects the non-.ftz result.
+define float @__nv_erff(float %a) local_unnamed_addr #1 {
+__nv_fabsf.exit:
+  ; %.0 = |a|, taken from fabs.f when reflect returns 0, else from fabs.ftz.f.
+  %0 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
+  %.not = icmp eq i32 %0, 0
+  %1 = tail call float @llvm.nvvm.fabs.ftz.f(float %a) #4
+  %2 = tail call float @llvm.nvvm.fabs.f(float %a) #4
+  %.0 = select i1 %.not, float %2, float %1
+  ; %3 is the range test; it is reused below to pick the polynomial argument
+  ; and to decide whether the ex2-based tail runs.
+  %3 = fcmp oge float %.0, 0x3FF00C1FC0000000
+  br i1 %3, label %__nv_fabsf.exit1, label %5
+__nv_fabsf.exit1:                                 ; preds = %__nv_fabsf.exit
+  ; Large-|a| path: the polynomial variable is |a| (selected again via reflect).
+  %4 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
+  %.not1 = icmp eq i32 %4, 0
+  %.01 = select i1 %.not1, float %2, float %1
+  br label %__internal_fmad.exit
+5:                                                ; preds = %__nv_fabsf.exit
+  ; Small-|a| path: the polynomial variable is a*a.
+  %6 = fmul float %a, %a
+  br label %__internal_fmad.exit
+__internal_fmad.exit:                             ; preds = %5, %__nv_fabsf.exit1
+  ; %7..%13 are the seven coefficients for the chosen path (first phi operand:
+  ; large-|a| path, second: small-|a| path). %13 is the leading coefficient of
+  ; the Horner chain below and %7 the last one added. %14 is the variable.
+  %7 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1 ], [ 0x3FC06EBA60000000, %5 ]
+  %8 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1 ], [ 0xBFD8127580000000, %5 ]
+  %9 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1 ], [ 0x3FBCE315E0000000, %5 ]
+  %10 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1 ], [ 0xBF9B837CE0000000, %5 ]
+  %11 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1 ], [ 0x3F755ABD40000000, %5 ]
+  %12 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1 ], [ 0xBF4AE9A400000000, %5 ]
+  %13 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1 ], [ 0x3F163D2D40000000, %5 ]
+  %14 = phi float [ %.01, %__nv_fabsf.exit1 ], [ %6, %5 ]
+  ; Horner evaluation: acc = fma(acc, %14, next coefficient), six steps.
+  ; Both fma variants are computed each step and one is kept via select.
+  %15 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
+  %.not2 = icmp eq i32 %15, 0
+  %16 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %13, float %14, float %12) #4
+  %17 = tail call float @llvm.nvvm.fma.rn.f(float %13, float %14, float %12) #4
+  %.02 = select i1 %.not2, float %17, float %16
+  %18 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
+  %.not3 = icmp eq i32 %18, 0
+  %19 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02, float %14, float %11) #4
+  %20 = tail call float @llvm.nvvm.fma.rn.f(float %.02, float %14, float %11) #4
+  %.03 = select i1 %.not3, float %20, float %19
+  %21 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
+  %.not4 = icmp eq i32 %21, 0
+  %22 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03, float %14, float %10) #4
+  %23 = tail call float @llvm.nvvm.fma.rn.f(float %.03, float %14, float %10) #4
+  %.04 = select i1 %.not4, float %23, float %22
+  %24 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
+  %.not5 = icmp eq i32 %24, 0
+  %25 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04, float %14, float %9) #4
+  %26 = tail call float @llvm.nvvm.fma.rn.f(float %.04, float %14, float %9) #4
+  %.05 = select i1 %.not5, float %26, float %25
+  %27 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
+  %.not6 = icmp eq i32 %27, 0
+  %28 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05, float %14, float %8) #4
+  %29 = tail call float @llvm.nvvm.fma.rn.f(float %.05, float %14, float %8) #4
+  %.06 = select i1 %.not6, float %29, float %28
+  %30 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
+  %.not7 = icmp eq i32 %30, 0
+  %31 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06, float %14, float %7) #4
+  %32 = tail call float @llvm.nvvm.fma.rn.f(float %.06, float %14, float %7) #4
+  %.07 = select i1 %.not7, float %32, float %31
+  ; Final step: %.08 = fma(p, t, t) with t = -|a| on the large path and
+  ; t = a on the small path.
+  %33 = fneg float %14
+  %34 = select i1 %3, float %33, float %a
+  %35 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
+  %.not8 = icmp eq i32 %35, 0
+  %36 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07, float %34, float %34) #4
+  %37 = tail call float @llvm.nvvm.fma.rn.f(float %.07, float %34, float %34) #4
+  %.08 = select i1 %.not8, float %37, float %36
+  ; The small path returns %.08 directly; the large path continues in %38.
+  br i1 %3, label %38, label %46
+38:                                               ; preds = %__internal_fmad.exit
+  ; Large path tail: 1 - ex2.approx(%.08). Note the ex2 call is always the
+  ; .ftz form; there is no reflect-based selection here.
+  %39 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08) #4
+  %40 = fsub float 1.000000e+00, %39
+  ; Copy the sign of a onto the result: -2147483648 is 0x80000000, the
+  ; sign-bit mask of a 32-bit float, OR-ed into the bits of %40.
+  %41 = bitcast float %40 to i32
+  %42 = bitcast float %a to i32
+  %43 = and i32 %42, -2147483648
+  %44 = or i32 %43, %41
+  %45 = bitcast i32 %44 to float
+  br label %46
+46:                                               ; preds = %38, %__internal_fmad.exit
+  ; Merge the two paths and return.
+  %r.0 = phi float [ %45, %38 ], [ %.08, %__internal_fmad.exit ]
+  ret float %r.0
+}
+declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #2
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare float @llvm.nvvm.fabs.ftz.f(float) #0
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare float @llvm.nvvm.fabs.f(float) #0
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare float @llvm.nvvm.fma.rn.ftz.f(float, float, float) #0
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare float @llvm.nvvm.fma.rn.f(float, float, float) #0
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.ex2.approx.ftz.f(float) #3
+attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #1 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #4 = { nounwind }
+!llvm.module.flags = !{!0, !1}
+!llvm.dbg.cu = !{!2}
+!nvvm.annotations = !{!4, !5, !5, !4}
+!llvm.ident = !{!6}
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!3 = !DIFile(filename: "cjfoqo3nutni5cmtw4brla34cz45fusadehkxfkr2fie2qgo7vwt.py", directory: "/tmp/torchinductor_root/jf")
+!4 = !{ptr @triton__0d1d2de, !"kernel", i32 1}
+!5 = !{ptr @triton__0d1d2de, !"maxntidx", i32 128}
+!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
+!7 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
+!9 = !{}
+!10 = !DILocation(line: 21, column: 36, scope: !7)
+!11 = !DILocation(line: 20, column: 28, scope: !7)
+!12 = !DILocation(line: 20, column: 33, scope: !7)
+!13 = !DILocation(line: 21, column: 23, scope: !7)
+!14 = !DILocation(line: 24, column: 30, scope: !7)
+!15 = !DILocation(line: 24, column: 35, scope: !7)
+!16 = !DILocation(line: 24, column: 44, scope: !7)
+!17 = !DILocation(line: 29, column: 18, scope: !7)
+!18 = !DILocation(line: 30, column: 23, scope: !7)
+!19 = !DILocation(line: 27, column: 18, scope: !7)
+!20 = !DILocation(line: 32, column: 18, scope: !7)
+!21 = !DILocation(line: 33, column: 18, scope: !7)
+!22 = !DILocation(line: 35, column: 25, scope: !7)
+!23 = !DILocation(line: 35, column: 37, scope: !7)
+!24 = !DILocation(line: 35, column: 4, scope: !7)

.triton/dump/44b225411009956bfbae22f8bac7d703/triton_.cubin ADDED Viewed

Binary file (13.9 kB). View file

.triton/dump/44b225411009956bfbae22f8bac7d703/triton_.llir ADDED Viewed

	@@ -0,0 +1,304 @@

+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+@global_smem = external addrspace(3) global [0 x i8]
+@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
+; Kernel entry point. From the code below, each program (CTA) handles 256
+; consecutive elements, 4 per thread:
+;   x   = %0[i] (f32) + %1[i] (bf16 -> f32) + %2[i] (bf16 -> f32)
+;   out = (x - mean) * rsqrt(var + eps) * %3[j]
+; where i = ctaid.x * 256 + j, j is the element offset inside the program,
+; mean = sum(x) / 256 and var = sum((x - mean)^2) / 256 over the program's
+; elements, and out is stored as f32 to %4.
+; NOTE(review): this looks like a fused add + normalization with a learned
+; scale -- confirm against the generating Python source.
+; The i32 arguments %5 and %6 are not referenced in the body.
+define void @triton__0d1d2d3d4d5de6de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i32 %5, i32 %6) local_unnamed_addr !dbg !7 {
+  ; --- Index computation ---------------------------------------------------
+  ; %9  = tid & 31         (low five bits of the thread id)
+  ; %11 = (tid >> 5) & 1   (used below as the shared-memory slot, 0 or 1)
+  ; %12 = (tid * 4) & 252  (first of the 4 elements this thread handles)
+  ; %15 = ctaid.x * 256 | %12, sign-extended into %16 as the global index.
+  %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
+  %9 = and i32 %8, 31, !dbg !10
+  %10 = lshr i32 %8, 5, !dbg !10
+  %11 = and i32 %10, 1, !dbg !10
+  %urem = shl i32 %8, 2, !dbg !10
+  %12 = and i32 %urem, 252, !dbg !10
+  %13 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !11
+  %14 = shl i32 %13, 8, !dbg !12
+  %15 = or i32 %14, %12, !dbg !13
+  %16 = sext i32 %15 to i64, !dbg !14
+  ; --- Load 4 x f32 from %0 ------------------------------------------------
+  ; The inline PTX zero-initialises the outputs, performs a predicated
+  ; ld.global.v4.b32 and then applies predicated fallback moves; all
+  ; predicates passed here are the constant true.
+  %17 = getelementptr float, ptr addrspace(1) %0, i64 %16, !dbg !14
+  %18 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %17, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !15
+  %19 = extractvalue { i32, i32, i32, i32 } %18, 0, !dbg !15
+  %20 = extractvalue { i32, i32, i32, i32 } %18, 1, !dbg !15
+  %21 = extractvalue { i32, i32, i32, i32 } %18, 2, !dbg !15
+  %22 = extractvalue { i32, i32, i32, i32 } %18, 3, !dbg !15
+  %23 = bitcast i32 %21 to float, !dbg !15
+  %24 = bitcast i32 %22 to float, !dbg !15
+  ; --- Load 4 x bf16 from %1 -----------------------------------------------
+  ; Two 32-bit words are loaded; each is split into its low and high 16-bit
+  ; halves and every half is widened with cvt.f32.bf16.
+  %25 = getelementptr i16, ptr addrspace(1) %1, i64 %16, !dbg !16
+  %26 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %25, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !17
+  %27 = extractvalue { i32, i32 } %26, 0, !dbg !17
+  %28 = extractvalue { i32, i32 } %26, 1, !dbg !17
+  %29 = trunc i32 %27 to i16, !dbg !17
+  %extelt.offset = lshr i32 %27, 16, !dbg !17
+  %30 = trunc i32 %extelt.offset to i16, !dbg !17
+  %31 = trunc i32 %28 to i16, !dbg !17
+  %extelt.offset1 = lshr i32 %28, 16, !dbg !17
+  %32 = trunc i32 %extelt.offset1 to i16, !dbg !17
+  %33 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %29) #6, !dbg !18
+  %34 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %30) #6, !dbg !18
+  %35 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %31) #6, !dbg !18
+  %36 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %32) #6, !dbg !18
+  ; --- Load 4 x bf16 from %2 (same pattern as for %1) ----------------------
+  %37 = getelementptr i16, ptr addrspace(1) %2, i64 %16, !dbg !19
+  %38 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %37, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !20
+  %39 = extractvalue { i32, i32 } %38, 0, !dbg !20
+  %40 = extractvalue { i32, i32 } %38, 1, !dbg !20
+  %41 = trunc i32 %39 to i16, !dbg !20
+  %extelt.offset2 = lshr i32 %39, 16, !dbg !20
+  %42 = trunc i32 %extelt.offset2 to i16, !dbg !20
+  %43 = trunc i32 %40 to i16, !dbg !20
+  %extelt.offset3 = lshr i32 %40, 16, !dbg !20
+  %44 = trunc i32 %extelt.offset3 to i16, !dbg !20
+  %45 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %41) #6, !dbg !21
+  %46 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %42) #6, !dbg !21
+  %47 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %43) #6, !dbg !21
+  %48 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %44) #6, !dbg !21
+  ; --- Load 4 x f32 from %3 ------------------------------------------------
+  ; Indexed by %12 only (no ctaid term), so every program reads the same 256
+  ; values; the load carries the L1::evict_last qualifier. The values are
+  ; extracted after the normalization factor has been computed.
+  %49 = zext nneg i32 %12 to i64, !dbg !22
+  %50 = getelementptr float, ptr addrspace(1) %3, i64 %49, !dbg !22
+  %51 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %50, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !23
+  ; --- x = %1 + %0 + %2, element-wise --------------------------------------
+  ; Elements 2 and 3 are added as scalars (%63, %64); elements 0 and 1 are
+  ; added as a <2 x float> vector (%62) and extracted as %65, %66.
+  %52 = fadd float %35, %23, !dbg !24
+  %53 = fadd float %36, %24, !dbg !24
+  %54 = insertelement <2 x i32> poison, i32 %19, i64 0, !dbg !15
+  %55 = insertelement <2 x i32> %54, i32 %20, i64 1, !dbg !15
+  %56 = bitcast <2 x i32> %55 to <2 x float>, !dbg !15
+  %57 = insertelement <2 x float> poison, float %33, i64 0, !dbg !24
+  %58 = insertelement <2 x float> %57, float %34, i64 1, !dbg !24
+  %59 = fadd <2 x float> %58, %56, !dbg !24
+  %60 = insertelement <2 x float> poison, float %45, i64 0, !dbg !25
+  %61 = insertelement <2 x float> %60, float %46, i64 1, !dbg !25
+  %62 = fadd <2 x float> %59, %61, !dbg !25
+  %63 = fadd float %52, %47, !dbg !25
+  %64 = fadd float %53, %48, !dbg !25
+  %65 = extractelement <2 x float> %62, i64 0, !dbg !26
+  %66 = extractelement <2 x float> %62, i64 1, !dbg !26
+  ; --- First reduction: sum of x -------------------------------------------
+  ; Per-thread sum of the 4 elements, then a butterfly shuffle reduction with
+  ; lane offsets 16, 8, 4, 2, 1 (full member mask -1).
+  %67 = fadd float %65, %66, !dbg !26
+  %68 = fadd float %67, %63, !dbg !26
+  %69 = fadd float %68, %64, !dbg !26
+  %70 = bitcast float %69 to i32, !dbg !32
+  %71 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %70, i32 16, i32 31), !dbg !32
+  %72 = bitcast i32 %71 to float, !dbg !32
+  %73 = fadd float %69, %72, !dbg !26
+  %74 = bitcast float %73 to i32, !dbg !32
+  %75 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %74, i32 8, i32 31), !dbg !32
+  %76 = bitcast i32 %75 to float, !dbg !32
+  %77 = fadd float %73, %76, !dbg !26
+  %78 = bitcast float %77 to i32, !dbg !32
+  %79 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %78, i32 4, i32 31), !dbg !32
+  %80 = bitcast i32 %79 to float, !dbg !32
+  %81 = fadd float %77, %80, !dbg !26
+  %82 = bitcast float %81 to i32, !dbg !32
+  %83 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %82, i32 2, i32 31), !dbg !32
+  %84 = bitcast i32 %83 to float, !dbg !32
+  %85 = fadd float %81, %84, !dbg !26
+  %86 = bitcast float %85 to i32, !dbg !32
+  %87 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %86, i32 1, i32 31), !dbg !32
+  %88 = bitcast i32 %87 to float, !dbg !32
+  %89 = fadd float %85, %88, !dbg !26
+  ; Threads with (tid & 31) == 0 store their partial sum to
+  ; @global_smem[(tid >> 5) & 1]; a barrier follows.
+  %90 = icmp eq i32 %9, 0, !dbg !32
+  %91 = zext nneg i32 %11 to i64, !dbg !32
+  %92 = getelementptr float, ptr addrspace(3) @global_smem, i64 %91, !dbg !32
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %92, float %89, i1 %90) #6, !dbg !32
+  tail call void @llvm.nvvm.barrier0(), !dbg !32
+  ; Threads with tid < 2 reload one partial each and combine them with a
+  ; single offset-1 shuffle. %103 (tid < 2 and bit 0 clear, i.e. thread 0)
+  ; gates the store of the total back to its own slot.
+  %93 = icmp slt i32 %8, 2, !dbg !32
+  %94 = sext i32 %8 to i64, !dbg !32
+  %95 = getelementptr float, ptr addrspace(3) @global_smem, i64 %94, !dbg !32
+  %96 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %95, i1 %93) #6, !dbg !32
+  %97 = bitcast float %96 to i32, !dbg !32
+  %98 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %97, i32 1, i32 31), !dbg !32
+  %99 = bitcast i32 %98 to float, !dbg !32
+  %100 = fadd float %96, %99, !dbg !26
+  %101 = and i32 %8, 1, !dbg !32
+  %102 = icmp eq i32 %101, 0, !dbg !32
+  %103 = and i1 %93, %102, !dbg !32
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %95, float %100, i1 %103) #6, !dbg !32
+  tail call void @llvm.nvvm.barrier0(), !dbg !32
+  ; Every thread reads the total from @global_smem[0]; %106 = total / 256 is
+  ; the mean.
+  %104 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !32
+  %105 = fadd float %104, 0.000000e+00, !dbg !34
+  %106 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %105, float 2.560000e+02) #6, !dbg !38
+  ; --- Centered values and their squares -----------------------------------
+  %107 = fsub float %65, %106, !dbg !39
+  %108 = fsub float %66, %106, !dbg !39
+  %109 = fsub float %63, %106, !dbg !39
+  %110 = fsub float %64, %106, !dbg !39
+  %111 = fmul float %107, %107, !dbg !40
+  %112 = fmul float %108, %108, !dbg !40
+  %113 = fmul float %109, %109, !dbg !40
+  %114 = fmul float %110, %110, !dbg !40
+  ; Barrier before the same shared-memory slots are written again
+  ; (presumably so that all threads have finished reading the first total).
+  tail call void @llvm.nvvm.barrier0(), !dbg !41
+  ; --- Second reduction: sum of squared deviations -------------------------
+  ; Same shape as the first reduction: per-thread sum, shuffle offsets
+  ; 16/8/4/2/1, shared-memory exchange, final offset-1 shuffle.
+  %115 = fadd float %111, %112, !dbg !43
+  %116 = fadd float %113, %115, !dbg !43
+  %117 = fadd float %114, %116, !dbg !43
+  %118 = bitcast float %117 to i32, !dbg !41
+  %119 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %118, i32 16, i32 31), !dbg !41
+  %120 = bitcast i32 %119 to float, !dbg !41
+  %121 = fadd float %117, %120, !dbg !43
+  %122 = bitcast float %121 to i32, !dbg !41
+  %123 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %122, i32 8, i32 31), !dbg !41
+  %124 = bitcast i32 %123 to float, !dbg !41
+  %125 = fadd float %121, %124, !dbg !43
+  %126 = bitcast float %125 to i32, !dbg !41
+  %127 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %126, i32 4, i32 31), !dbg !41
+  %128 = bitcast i32 %127 to float, !dbg !41
+  %129 = fadd float %125, %128, !dbg !43
+  %130 = bitcast float %129 to i32, !dbg !41
+  %131 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %130, i32 2, i32 31), !dbg !41
+  %132 = bitcast i32 %131 to float, !dbg !41
+  %133 = fadd float %129, %132, !dbg !43
+  %134 = bitcast float %133 to i32, !dbg !41
+  %135 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %134, i32 1, i32 31), !dbg !41
+  %136 = bitcast i32 %135 to float, !dbg !41
+  %137 = fadd float %133, %136, !dbg !43
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %92, float %137, i1 %90) #6, !dbg !41
+  tail call void @llvm.nvvm.barrier0(), !dbg !41
+  %138 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %95, i1 %93) #6, !dbg !41
+  %139 = bitcast float %138 to i32, !dbg !41
+  %140 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %139, i32 1, i32 31), !dbg !41
+  %141 = bitcast i32 %140 to float, !dbg !41
+  %142 = fadd float %138, %141, !dbg !43
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %95, float %142, i1 %103) #6, !dbg !41
+  tail call void @llvm.nvvm.barrier0(), !dbg !41
+  ; %145 = total / 256 is the variance; %146 adds 0x3EE4F8B580000000
+  ; (about 1e-5) before the reciprocal square root.
+  %143 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !41
+  %144 = fadd float %143, 0.000000e+00, !dbg !46
+  %145 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %144, float 2.560000e+02) #6, !dbg !48
+  %146 = fadd float %145, 0x3EE4F8B580000000, !dbg !49
+  ; --- rsqrt(var + eps) ----------------------------------------------------
+  ; Same branch structure as @__nv_rsqrtf in this module: the reflect query
+  ; on @.str ("__CUDA_FTZ") picks rsqrt.approx.ftz when nonzero, otherwise
+  ; rsqrt.approx.
+  %147 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !50
+  %.not.i = icmp eq i32 %147, 0, !dbg !50
+  br i1 %.not.i, label %150, label %148, !dbg !50
+148:                                              ; preds = %7
+  %149 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %146), !dbg !50
+  br label %__nv_rsqrtf.exit, !dbg !50
+150:                                              ; preds = %7
+  %151 = tail call float @llvm.nvvm.rsqrt.approx.f(float %146), !dbg !50
+  br label %__nv_rsqrtf.exit, !dbg !50
+__nv_rsqrtf.exit:                                 ; preds = %148, %150
+  %.0.i = phi float [ %149, %148 ], [ %151, %150 ], !dbg !50
+  ; Unpack the four values loaded from %3 (extracted in reverse order:
+  ; %153 is element 3, %159 is element 0).
+  %152 = extractvalue { i32, i32, i32, i32 } %51, 3, !dbg !23
+  %153 = bitcast i32 %152 to float, !dbg !23
+  %154 = extractvalue { i32, i32, i32, i32 } %51, 2, !dbg !23
+  %155 = bitcast i32 %154 to float, !dbg !23
+  %156 = extractvalue { i32, i32, i32, i32 } %51, 1, !dbg !23
+  %157 = bitcast i32 %156 to float, !dbg !23
+  %158 = extractvalue { i32, i32, i32, i32 } %51, 0, !dbg !23
+  %159 = bitcast i32 %158 to float, !dbg !23
+  ; --- Normalize and scale -------------------------------------------------
+  ; (x - mean) * rsqrt(var + eps), then multiplied by the matching %3 value.
+  %160 = fmul float %107, %.0.i, !dbg !51
+  %161 = fmul float %108, %.0.i, !dbg !51
+  %162 = fmul float %109, %.0.i, !dbg !51
+  %163 = fmul float %110, %.0.i, !dbg !51
+  %164 = fmul float %160, %159, !dbg !52
+  %165 = fmul float %161, %157, !dbg !52
+  %166 = fmul float %162, %155, !dbg !52
+  %167 = fmul float %163, %153, !dbg !52
+  ; --- Store 4 x f32 to %4 at the global index -----------------------------
+  %168 = getelementptr float, ptr addrspace(1) %4, i64 %16, !dbg !53
+  %169 = bitcast float %164 to i32, !dbg !54
+  %170 = bitcast float %165 to i32, !dbg !54
+  %171 = bitcast float %166 to i32, !dbg !54
+  %172 = bitcast float %167 to i32, !dbg !54
+  tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %169, i32 %170, i32 %171, i32 %172, ptr addrspace(1) %168, i1 true) #6, !dbg !54
+  ret void, !dbg !55
+}
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier0() #2
+; Function Attrs: alwaysinline nounwind
+; __nv_rsqrtf(x): approximate reciprocal square root of x.
+; Queries __nvvm_reflect with @.str ("__CUDA_FTZ"); a nonzero answer takes the
+; rsqrt.approx.ftz intrinsic, zero takes the plain rsqrt.approx intrinsic.
+define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
+entry:
+  %ftz.flag = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
+  %ftz.off = icmp eq i32 %ftz.flag, 0
+  br i1 %ftz.off, label %rsqrt.noftz, label %rsqrt.ftz
+rsqrt.ftz:                                        ; preds = %entry
+  %r.ftz = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
+  br label %done
+rsqrt.noftz:                                      ; preds = %entry
+  %r.noftz = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
+  br label %done
+done:                                             ; preds = %rsqrt.noftz, %rsqrt.ftz
+  ; Merge whichever variant was executed.
+  %result = phi float [ %r.ftz, %rsqrt.ftz ], [ %r.noftz, %rsqrt.noftz ]
+  ret float %result
+}
+declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.f(float) #5
+attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+attributes #2 = { convergent nocallback nounwind }
+attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #6 = { nounwind }
+!llvm.module.flags = !{!0, !1}
+!llvm.dbg.cu = !{!2}
+!nvvm.annotations = !{!4, !5, !5, !4}
+!llvm.ident = !{!6}
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!3 = !DIFile(filename: "ctvr3xs46luhhbr7xomihgyropjaatss7yata4igaw6kvgwas7g2.py", directory: "/tmp/torchinductor_root/tv")
+!4 = !{ptr @triton__0d1d2d3d4d5de6de, !"kernel", i32 1}
+!5 = !{ptr @triton__0d1d2d3d4d5de6de, !"maxntidx", i32 64}
+!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
+!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5de6de", linkageName: "triton__0d1d2d3d4d5de6de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
+!9 = !{}
+!10 = !DILocation(line: 26, column: 26, scope: !7)
+!11 = !DILocation(line: 23, column: 28, scope: !7)
+!12 = !DILocation(line: 30, column: 40, scope: !7)
+!13 = !DILocation(line: 30, column: 36, scope: !7)
+!14 = !DILocation(line: 30, column: 30, scope: !7)
+!15 = !DILocation(line: 30, column: 46, scope: !7)
+!16 = !DILocation(line: 31, column: 30, scope: !7)
+!17 = !DILocation(line: 31, column: 46, scope: !7)
+!18 = !DILocation(line: 31, column: 67, scope: !7)
+!19 = !DILocation(line: 32, column: 30, scope: !7)
+!20 = !DILocation(line: 32, column: 46, scope: !7)
+!21 = !DILocation(line: 32, column: 67, scope: !7)
+!22 = !DILocation(line: 33, column: 31, scope: !7)
+!23 = !DILocation(line: 33, column: 36, scope: !7)
+!24 = !DILocation(line: 35, column: 18, scope: !7)
+!25 = !DILocation(line: 37, column: 18, scope: !7)
+!26 = !DILocation(line: 233, column: 15, scope: !27, inlinedAt: !30)
+!27 = distinct !DILexicalBlockFile(scope: !29, file: !28, discriminator: 0)
+!28 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
+!29 = distinct !DILexicalBlockFile(scope: !7, file: !28, discriminator: 0)
+!30 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !31)
+!31 = !DILocation(line: 42, column: 59, scope: !27)
+!32 = !DILocation(line: 243, column: 36, scope: !29, inlinedAt: !33)
+!33 = !DILocation(line: 42, column: 59, scope: !29)
+!34 = !DILocation(line: 8, column: 15, scope: !35, inlinedAt: !37)
+!35 = distinct !DILexicalBlockFile(scope: !7, file: !36, discriminator: 0)
+!36 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
+!37 = !DILocation(line: 42, column: 45, scope: !35)
+!38 = !DILocation(line: 45, column: 20, scope: !7)
+!39 = !DILocation(line: 46, column: 19, scope: !7)
+!40 = !DILocation(line: 47, column: 20, scope: !7)
+!41 = !DILocation(line: 243, column: 36, scope: !29, inlinedAt: !42)
+!42 = !DILocation(line: 50, column: 59, scope: !29)
+!43 = !DILocation(line: 233, column: 15, scope: !27, inlinedAt: !44)
+!44 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !45)
+!45 = !DILocation(line: 50, column: 59, scope: !27)
+!46 = !DILocation(line: 8, column: 15, scope: !35, inlinedAt: !47)
+!47 = !DILocation(line: 50, column: 45, scope: !35)
+!48 = !DILocation(line: 53, column: 20, scope: !7)
+!49 = !DILocation(line: 55, column: 20, scope: !7)
+!50 = !DILocation(line: 56, column: 26, scope: !7)
+!51 = !DILocation(line: 57, column: 20, scope: !7)
+!52 = !DILocation(line: 58, column: 20, scope: !7)
+!53 = !DILocation(line: 59, column: 25, scope: !7)
+!54 = !DILocation(line: 59, column: 48, scope: !7)
+!55 = !DILocation(line: 59, column: 4, scope: !7)

.triton/dump/44b225411009956bfbae22f8bac7d703/triton_.ttir ADDED Viewed

	@@ -0,0 +1,61 @@

+module {
+  tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+    %c256_i32 = arith.constant 256 : i32
+    %cst = arith.constant dense<0.000000e+00> : tensor<256xbf16>
+    %cst_0 = arith.constant 0.000000e+00 : f32
+    %cst_1 = arith.constant 2.560000e+02 : f32
+    %cst_2 = arith.constant 9.99999974E-6 : f32
+    %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32>
+    %cst_4 = arith.constant dense<256> : tensor<256xi32>
+    %0 = tt.get_program_id x : i32
+    %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
+    %2 = arith.cmpi slt, %1, %cst_4 : tensor<256xi32>
+    %3 = arith.muli %0, %c256_i32 : i32
+    %4 = tt.splat %3 : (i32) -> tensor<256xi32>
+    %5 = arith.addi %1, %4 : tensor<256xi32>
+    %6 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
+    %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
+    %8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32>
+    %9 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
+    %10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
+    %11 = tt.load %10, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
+    %12 = arith.extf %11 : tensor<256xbf16> to tensor<256xf32>
+    %13 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
+    %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
+    %15 = tt.load %14, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
+    %16 = arith.extf %15 : tensor<256xbf16> to tensor<256xf32>
+    %17 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
+    %18 = tt.addptr %17, %1 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
+    %19 = tt.load %18, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32>
+    %20 = arith.addf %8, %12 : tensor<256xf32>
+    %21 = arith.addf %20, %16 : tensor<256xf32>
+    %22 = arith.select %2, %21, %cst_3 : tensor<256xi1>, tensor<256xf32>
+    %23 = "tt.reduce"(%22) <{axis = 0 : i32}> ({
+    ^bb0(%arg7: f32, %arg8: f32):
+      %40 = arith.addf %arg7, %arg8 : f32
+      tt.reduce.return %40 : f32
+    }) : (tensor<256xf32>) -> f32
+    %24 = arith.addf %23, %cst_0 : f32
+    %25 = arith.divf %24, %cst_1 : f32
+    %26 = tt.splat %25 : (f32) -> tensor<256xf32>
+    %27 = arith.subf %21, %26 : tensor<256xf32>
+    %28 = arith.mulf %27, %27 : tensor<256xf32>
+    %29 = arith.select %2, %28, %cst_3 : tensor<256xi1>, tensor<256xf32>
+    %30 = "tt.reduce"(%29) <{axis = 0 : i32}> ({
+    ^bb0(%arg7: f32, %arg8: f32):
+      %40 = arith.addf %arg7, %arg8 : f32
+      tt.reduce.return %40 : f32
+    }) : (tensor<256xf32>) -> f32
+    %31 = arith.addf %30, %cst_0 : f32
+    %32 = arith.divf %31, %cst_1 : f32
+    %33 = arith.addf %32, %cst_2 : f32
+    %34 = tt.extern_elementwise %33 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
+    %35 = tt.splat %34 : (f32) -> tensor<256xf32>
+    %36 = arith.mulf %27, %35 : tensor<256xf32>
+    %37 = arith.mulf %36, %19 : tensor<256xf32>
+    %38 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
+    %39 = tt.addptr %38, %5 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
+    tt.store %39, %37, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32>
+    tt.return
+  }
+}

.triton/dump/4c6ad48573c74d55ed79384f6b432d50/triton_.ttir ADDED Viewed

	@@ -0,0 +1,18 @@

+module {
+  tt.func public @triton__0d1d2de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+    %c1024_i32 = arith.constant 1024 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c1024_i32 : i32
+    %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32>
+    %3 = tt.splat %1 : (i32) -> tensor<1024xi32>
+    %4 = arith.addi %3, %2 : tensor<1024xi32>
+    %5 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>>
+    %6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr<f32, 1>>, tensor<1024xi32>
+    %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xf32>
+    %8 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>>
+    %9 = tt.addptr %8, %4 : tensor<1024x!tt.ptr<bf16, 1>>, tensor<1024xi32>
+    %10 = arith.truncf %7 : tensor<1024xf32> to tensor<1024xbf16>
+    tt.store %9, %10 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xbf16>
+    tt.return
+  }
+}

.triton/dump/4d7b96448927b8146af43cb9f39e0544/triton_.llir ADDED Viewed

	@@ -0,0 +1,362 @@

+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
+define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !7 {
+  %4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
+  %5 = shl i32 %4, 1, !dbg !10
+  %6 = and i32 %5, 510, !dbg !10
+  %7 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #4, !dbg !11
+  %8 = shl i32 %7, 9, !dbg !12
+  %9 = or i32 %8, %6, !dbg !13
+  %10 = sext i32 %9 to i64, !dbg !14
+  %11 = getelementptr i16, ptr addrspace(1) %0, i64 %10, !dbg !14
+  %12 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %11, i1 true) #4, !dbg !15
+  %13 = trunc i32 %12 to i16, !dbg !15
+  %extelt.offset = lshr i32 %12, 16, !dbg !15
+  %14 = trunc i32 %extelt.offset to i16, !dbg !15
+  %15 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %13) #4, !dbg !16
+  %16 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %14) #4, !dbg !16
+  %17 = getelementptr i16, ptr addrspace(1) %1, i64 %10, !dbg !17
+  %18 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %17, i1 true) #4, !dbg !18
+  %19 = trunc i32 %18 to i16, !dbg !18
+  %extelt.offset1 = lshr i32 %18, 16, !dbg !18
+  %20 = trunc i32 %extelt.offset1 to i16, !dbg !18
+  %21 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %19) #4, !dbg !19
+  %22 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %20) #4, !dbg !19
+  %23 = fmul float %21, 0x3FE6A09E60000000, !dbg !20
+  %24 = fmul float %22, 0x3FE6A09E60000000, !dbg !20
+  %25 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
+  %.not.i = icmp eq i32 %25, 0, !dbg !21
+  %26 = tail call float @llvm.nvvm.fabs.ftz.f(float %23) #4, !dbg !21
+  %27 = tail call float @llvm.nvvm.fabs.f(float %23) #4, !dbg !21
+  %.0.i = select i1 %.not.i, float %27, float %26, !dbg !21
+  %28 = fcmp oge float %.0.i, 0x3FF00C1FC0000000, !dbg !21
+  br i1 %28, label %__nv_fabsf.exit1.i, label %30, !dbg !21
+__nv_fabsf.exit1.i:                               ; preds = %3
+  %29 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
+  %.not1.i = icmp eq i32 %29, 0, !dbg !21
+  %.01.i = select i1 %.not1.i, float %27, float %26, !dbg !21
+  br label %__internal_fmad.exit.i, !dbg !21
+30:                                               ; preds = %3
+  %31 = fmul float %23, %23, !dbg !21
+  br label %__internal_fmad.exit.i, !dbg !21
+__internal_fmad.exit.i:                           ; preds = %30, %__nv_fabsf.exit1.i
+  %32 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i ], [ 0x3FC06EBA60000000, %30 ], !dbg !21
+  %33 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i ], [ 0xBFD8127580000000, %30 ], !dbg !21
+  %34 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i ], [ 0x3FBCE315E0000000, %30 ], !dbg !21
+  %35 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i ], [ 0xBF9B837CE0000000, %30 ], !dbg !21
+  %36 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i ], [ 0x3F755ABD40000000, %30 ], !dbg !21
+  %37 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i ], [ 0xBF4AE9A400000000, %30 ], !dbg !21
+  %38 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i ], [ 0x3F163D2D40000000, %30 ], !dbg !21
+  %39 = phi float [ %.01.i, %__nv_fabsf.exit1.i ], [ %31, %30 ], !dbg !21
+  %40 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
+  %.not2.i = icmp eq i32 %40, 0, !dbg !21
+  %41 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %38, float %39, float %37) #4, !dbg !21
+  %42 = tail call float @llvm.nvvm.fma.rn.f(float %38, float %39, float %37) #4, !dbg !21
+  %.02.i = select i1 %.not2.i, float %42, float %41, !dbg !21
+  %43 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
+  %.not3.i = icmp eq i32 %43, 0, !dbg !21
+  %44 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i, float %39, float %36) #4, !dbg !21
+  %45 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i, float %39, float %36) #4, !dbg !21
+  %.03.i = select i1 %.not3.i, float %45, float %44, !dbg !21
+  %46 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
+  %.not4.i = icmp eq i32 %46, 0, !dbg !21
+  %47 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i, float %39, float %35) #4, !dbg !21
+  %48 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i, float %39, float %35) #4, !dbg !21
+  %.04.i = select i1 %.not4.i, float %48, float %47, !dbg !21
+  %49 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
+  %.not5.i = icmp eq i32 %49, 0, !dbg !21
+  %50 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i, float %39, float %34) #4, !dbg !21
+  %51 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i, float %39, float %34) #4, !dbg !21
+  %.05.i = select i1 %.not5.i, float %51, float %50, !dbg !21
+  %52 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
+  %.not6.i = icmp eq i32 %52, 0, !dbg !21
+  %53 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i, float %39, float %33) #4, !dbg !21
+  %54 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i, float %39, float %33) #4, !dbg !21
+  %.06.i = select i1 %.not6.i, float %54, float %53, !dbg !21
+  %55 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
+  %.not7.i = icmp eq i32 %55, 0, !dbg !21
+  %56 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i, float %39, float %32) #4, !dbg !21
+  %57 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i, float %39, float %32) #4, !dbg !21
+  %.07.i = select i1 %.not7.i, float %57, float %56, !dbg !21
+  %58 = fneg float %39, !dbg !21
+  %59 = select i1 %28, float %58, float %23, !dbg !21
+  %60 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
+  %.not8.i = icmp eq i32 %60, 0, !dbg !21
+  %61 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i, float %59, float %59) #4, !dbg !21
+  %62 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i, float %59, float %59) #4, !dbg !21
+  %.08.i = select i1 %.not8.i, float %62, float %61, !dbg !21
+  br i1 %28, label %63, label %__nv_erff.exit, !dbg !21
+63:                                               ; preds = %__internal_fmad.exit.i
+  %64 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i) #4, !dbg !21
+  %65 = fsub float 1.000000e+00, %64, !dbg !21
+  %66 = bitcast float %65 to i32, !dbg !21
+  %67 = bitcast float %23 to i32, !dbg !21
+  %68 = and i32 %67, -2147483648, !dbg !21
+  %69 = or i32 %68, %66, !dbg !21
+  %70 = bitcast i32 %69 to float, !dbg !21
+  br label %__nv_erff.exit, !dbg !21
+__nv_erff.exit:                                   ; preds = %__internal_fmad.exit.i, %63
+  %r.0.i = phi float [ %70, %63 ], [ %.08.i, %__internal_fmad.exit.i ], !dbg !21
+  %71 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
+  %.not.i2 = icmp eq i32 %71, 0, !dbg !21
+  %72 = tail call float @llvm.nvvm.fabs.ftz.f(float %24) #4, !dbg !21
+  %73 = tail call float @llvm.nvvm.fabs.f(float %24) #4, !dbg !21
+  %.0.i3 = select i1 %.not.i2, float %73, float %72, !dbg !21
+  %74 = fcmp oge float %.0.i3, 0x3FF00C1FC0000000, !dbg !21
+  br i1 %74, label %__nv_fabsf.exit1.i20, label %76, !dbg !21
+__nv_fabsf.exit1.i20:                             ; preds = %__nv_erff.exit
+  %75 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
+  %.not1.i21 = icmp eq i32 %75, 0, !dbg !21
+  %.01.i22 = select i1 %.not1.i21, float %73, float %72, !dbg !21
+  br label %__internal_fmad.exit.i4, !dbg !21
+76:                                               ; preds = %__nv_erff.exit
+  %77 = fmul float %24, %24, !dbg !21
+  br label %__internal_fmad.exit.i4, !dbg !21
+__internal_fmad.exit.i4:                          ; preds = %76, %__nv_fabsf.exit1.i20
+  %78 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i20 ], [ 0x3FC06EBA60000000, %76 ], !dbg !21
+  %79 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i20 ], [ 0xBFD8127580000000, %76 ], !dbg !21
+  %80 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i20 ], [ 0x3FBCE315E0000000, %76 ], !dbg !21
+  %81 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i20 ], [ 0xBF9B837CE0000000, %76 ], !dbg !21
+  %82 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i20 ], [ 0x3F755ABD40000000, %76 ], !dbg !21
+  %83 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i20 ], [ 0xBF4AE9A400000000, %76 ], !dbg !21
+  %84 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i20 ], [ 0x3F163D2D40000000, %76 ], !dbg !21
+  %85 = phi float [ %.01.i22, %__nv_fabsf.exit1.i20 ], [ %77, %76 ], !dbg !21
+  %86 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
+  %.not2.i5 = icmp eq i32 %86, 0, !dbg !21
+  %87 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %84, float %85, float %83) #4, !dbg !21
+  %88 = tail call float @llvm.nvvm.fma.rn.f(float %84, float %85, float %83) #4, !dbg !21
+  %.02.i6 = select i1 %.not2.i5, float %88, float %87, !dbg !21
+  %89 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
+  %.not3.i7 = icmp eq i32 %89, 0, !dbg !21
+  %90 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i6, float %85, float %82) #4, !dbg !21
+  %91 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i6, float %85, float %82) #4, !dbg !21
+  %.03.i8 = select i1 %.not3.i7, float %91, float %90, !dbg !21
+  %92 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
+  %.not4.i9 = icmp eq i32 %92, 0, !dbg !21
+  %93 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i8, float %85, float %81) #4, !dbg !21
+  %94 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i8, float %85, float %81) #4, !dbg !21
+  %.04.i10 = select i1 %.not4.i9, float %94, float %93, !dbg !21
+  %95 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
+  %.not5.i11 = icmp eq i32 %95, 0, !dbg !21
+  %96 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i10, float %85, float %80) #4, !dbg !21
+  %97 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i10, float %85, float %80) #4, !dbg !21
+  %.05.i12 = select i1 %.not5.i11, float %97, float %96, !dbg !21
+  %98 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
+  %.not6.i13 = icmp eq i32 %98, 0, !dbg !21
+  %99 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i12, float %85, float %79) #4, !dbg !21
+  %100 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i12, float %85, float %79) #4, !dbg !21
+  %.06.i14 = select i1 %.not6.i13, float %100, float %99, !dbg !21
+  %101 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
+  %.not7.i15 = icmp eq i32 %101, 0, !dbg !21
+  %102 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i14, float %85, float %78) #4, !dbg !21
+  %103 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i14, float %85, float %78) #4, !dbg !21
+  %.07.i16 = select i1 %.not7.i15, float %103, float %102, !dbg !21
+  %104 = fneg float %85, !dbg !21
+  %105 = select i1 %74, float %104, float %24, !dbg !21
+  %106 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
+  %.not8.i17 = icmp eq i32 %106, 0, !dbg !21
+  %107 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i16, float %105, float %105) #4, !dbg !21
+  %108 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i16, float %105, float %105) #4, !dbg !21
+  %.08.i18 = select i1 %.not8.i17, float %108, float %107, !dbg !21
+  br i1 %74, label %109, label %__nv_erff.exit23, !dbg !21
+109:                                              ; preds = %__internal_fmad.exit.i4
+  %110 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i18) #4, !dbg !21
+  %111 = fsub float 1.000000e+00, %110, !dbg !21
+  %112 = bitcast float %111 to i32, !dbg !21
+  %113 = bitcast float %24 to i32, !dbg !21
+  %114 = and i32 %113, -2147483648, !dbg !21
+  %115 = or i32 %114, %112, !dbg !21
+  %116 = bitcast i32 %115 to float, !dbg !21
+  br label %__nv_erff.exit23, !dbg !21
+__nv_erff.exit23:                                 ; preds = %__internal_fmad.exit.i4, %109
+  %r.0.i19 = phi float [ %116, %109 ], [ %.08.i18, %__internal_fmad.exit.i4 ], !dbg !21
+  %117 = fadd float %r.0.i, 1.000000e+00, !dbg !22
+  %118 = fadd float %r.0.i19, 1.000000e+00, !dbg !22
+  %119 = fmul float %117, 5.000000e-01, !dbg !23
+  %120 = fmul float %118, 5.000000e-01, !dbg !23
+  %121 = fmul float %21, %21, !dbg !24
+  %122 = fmul float %22, %22, !dbg !24
+  %123 = fmul float %121, -5.000000e-01, !dbg !25
+  %124 = fmul float %122, -5.000000e-01, !dbg !25
+  %125 = fmul float %123, 0x3FF7154760000000, !dbg !26
+  %126 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %125) #4, !dbg !26
+  %127 = fmul float %124, 0x3FF7154760000000, !dbg !26
+  %128 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %127) #4, !dbg !26
+  %129 = fmul float %126, 0x3FD9884540000000, !dbg !27
+  %130 = fmul float %128, 0x3FD9884540000000, !dbg !27
+  %131 = fmul float %21, %129, !dbg !28
+  %132 = fmul float %22, %130, !dbg !28
+  %133 = fadd float %119, %131, !dbg !29
+  %134 = fadd float %120, %132, !dbg !29
+  %135 = fmul float %15, %133, !dbg !30
+  %136 = fmul float %16, %134, !dbg !30
+  %137 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %135) #4, !dbg !31
+  %138 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %136) #4, !dbg !31
+  %139 = insertelement <2 x i16> undef, i16 %137, i64 0, !dbg !31
+  %140 = insertelement <2 x i16> %139, i16 %138, i64 1, !dbg !31
+  %141 = bitcast <2 x i16> %140 to i32, !dbg !31
+  tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %141, ptr addrspace(1) %11, i1 true) #4, !dbg !31
+  ret void, !dbg !32
+}
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
+; Function Attrs: alwaysinline nounwind
+define float @__nv_erff(float %a) local_unnamed_addr #1 {
+__nv_fabsf.exit:
+  %0 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
+  %.not = icmp eq i32 %0, 0
+  %1 = tail call float @llvm.nvvm.fabs.ftz.f(float %a) #4
+  %2 = tail call float @llvm.nvvm.fabs.f(float %a) #4
+  %.0 = select i1 %.not, float %2, float %1
+  %3 = fcmp oge float %.0, 0x3FF00C1FC0000000
+  br i1 %3, label %__nv_fabsf.exit1, label %5
+__nv_fabsf.exit1:                                 ; preds = %__nv_fabsf.exit
+  %4 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
+  %.not1 = icmp eq i32 %4, 0
+  %.01 = select i1 %.not1, float %2, float %1
+  br label %__internal_fmad.exit
+5:                                                ; preds = %__nv_fabsf.exit
+  %6 = fmul float %a, %a
+  br label %__internal_fmad.exit
+__internal_fmad.exit:                             ; preds = %5, %__nv_fabsf.exit1
+  %7 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1 ], [ 0x3FC06EBA60000000, %5 ]
+  %8 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1 ], [ 0xBFD8127580000000, %5 ]
+  %9 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1 ], [ 0x3FBCE315E0000000, %5 ]
+  %10 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1 ], [ 0xBF9B837CE0000000, %5 ]
+  %11 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1 ], [ 0x3F755ABD40000000, %5 ]
+  %12 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1 ], [ 0xBF4AE9A400000000, %5 ]
+  %13 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1 ], [ 0x3F163D2D40000000, %5 ]
+  %14 = phi float [ %.01, %__nv_fabsf.exit1 ], [ %6, %5 ]
+  %15 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
+  %.not2 = icmp eq i32 %15, 0
+  %16 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %13, float %14, float %12) #4
+  %17 = tail call float @llvm.nvvm.fma.rn.f(float %13, float %14, float %12) #4
+  %.02 = select i1 %.not2, float %17, float %16
+  %18 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
+  %.not3 = icmp eq i32 %18, 0
+  %19 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02, float %14, float %11) #4
+  %20 = tail call float @llvm.nvvm.fma.rn.f(float %.02, float %14, float %11) #4
+  %.03 = select i1 %.not3, float %20, float %19
+  %21 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
+  %.not4 = icmp eq i32 %21, 0
+  %22 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03, float %14, float %10) #4
+  %23 = tail call float @llvm.nvvm.fma.rn.f(float %.03, float %14, float %10) #4
+  %.04 = select i1 %.not4, float %23, float %22
+  %24 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
+  %.not5 = icmp eq i32 %24, 0
+  %25 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04, float %14, float %9) #4
+  %26 = tail call float @llvm.nvvm.fma.rn.f(float %.04, float %14, float %9) #4
+  %.05 = select i1 %.not5, float %26, float %25
+  %27 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
+  %.not6 = icmp eq i32 %27, 0
+  %28 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05, float %14, float %8) #4
+  %29 = tail call float @llvm.nvvm.fma.rn.f(float %.05, float %14, float %8) #4
+  %.06 = select i1 %.not6, float %29, float %28
+  %30 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
+  %.not7 = icmp eq i32 %30, 0
+  %31 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06, float %14, float %7) #4
+  %32 = tail call float @llvm.nvvm.fma.rn.f(float %.06, float %14, float %7) #4
+  %.07 = select i1 %.not7, float %32, float %31
+  %33 = fneg float %14
+  %34 = select i1 %3, float %33, float %a
+  %35 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
+  %.not8 = icmp eq i32 %35, 0
+  %36 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07, float %34, float %34) #4
+  %37 = tail call float @llvm.nvvm.fma.rn.f(float %.07, float %34, float %34) #4
+  %.08 = select i1 %.not8, float %37, float %36
+  br i1 %3, label %38, label %46
+38:                                               ; preds = %__internal_fmad.exit
+  %39 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08) #4
+  %40 = fsub float 1.000000e+00, %39
+  %41 = bitcast float %40 to i32
+  %42 = bitcast float %a to i32
+  %43 = and i32 %42, -2147483648
+  %44 = or i32 %43, %41
+  %45 = bitcast i32 %44 to float
+  br label %46
+46:                                               ; preds = %38, %__internal_fmad.exit
+  %r.0 = phi float [ %45, %38 ], [ %.08, %__internal_fmad.exit ]
+  ret float %r.0
+}
+declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #2
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare float @llvm.nvvm.fabs.ftz.f(float) #0
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare float @llvm.nvvm.fabs.f(float) #0
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare float @llvm.nvvm.fma.rn.ftz.f(float, float, float) #0
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare float @llvm.nvvm.fma.rn.f(float, float, float) #0
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.ex2.approx.ftz.f(float) #3
+attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #1 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #4 = { nounwind }
+!llvm.module.flags = !{!0, !1}
+!llvm.dbg.cu = !{!2}
+!nvvm.annotations = !{!4, !5, !5, !4}
+!llvm.ident = !{!6}
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!3 = !DIFile(filename: "c5jxaguxho3nhrlt5vcinnz5fevodumlpwn4wyb2vx3xrveicerl.py", directory: "/tmp/torchinductor_root/5j")
+!4 = !{ptr @triton__0d1d2de, !"kernel", i32 1}
+!5 = !{ptr @triton__0d1d2de, !"maxntidx", i32 256}
+!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
+!7 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
+!9 = !{}
+!10 = !DILocation(line: 21, column: 36, scope: !7)
+!11 = !DILocation(line: 20, column: 28, scope: !7)
+!12 = !DILocation(line: 20, column: 33, scope: !7)
+!13 = !DILocation(line: 21, column: 23, scope: !7)
+!14 = !DILocation(line: 24, column: 34, scope: !7)
+!15 = !DILocation(line: 24, column: 39, scope: !7)
+!16 = !DILocation(line: 24, column: 48, scope: !7)
+!17 = !DILocation(line: 25, column: 30, scope: !7)
+!18 = !DILocation(line: 25, column: 35, scope: !7)
+!19 = !DILocation(line: 25, column: 44, scope: !7)
+!20 = !DILocation(line: 29, column: 18, scope: !7)
+!21 = !DILocation(line: 30, column: 23, scope: !7)
+!22 = !DILocation(line: 32, column: 18, scope: !7)
+!23 = !DILocation(line: 34, column: 19, scope: !7)
+!24 = !DILocation(line: 35, column: 19, scope: !7)
+!25 = !DILocation(line: 37, column: 20, scope: !7)
+!26 = !DILocation(line: 38, column: 19, scope: !7)
+!27 = !DILocation(line: 40, column: 20, scope: !7)
+!28 = !DILocation(line: 41, column: 19, scope: !7)
+!29 = !DILocation(line: 42, column: 20, scope: !7)
+!30 = !DILocation(line: 43, column: 19, scope: !7)
+!31 = !DILocation(line: 45, column: 40, scope: !7)
+!32 = !DILocation(line: 45, column: 4, scope: !7)

.triton/dump/4d7b96448927b8146af43cb9f39e0544/triton_.ptx ADDED Viewed

	@@ -0,0 +1,486 @@

+//
+// Generated by LLVM NVPTX Back-End
+//
+.version 8.2
+.target sm_89
+.address_size 64
+	// .globl	triton__0d1d2de
+.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
+.visible .entry triton__0d1d2de(
+	.param .u64 triton__0d1d2de_param_0,
+	.param .u64 triton__0d1d2de_param_1,
+	.param .u32 triton__0d1d2de_param_2
+)
+.maxntid 256, 1, 1
+{
+	.reg .pred 	%p<10>;
+	.reg .b16 	%rs<7>;
+	.reg .b32 	%r<25>;
+	.reg .f32 	%f<127>;
+	.reg .b64 	%rd<8>;
+	.loc	1 18 0
+$L__func_begin0:
+	.loc	1 18 0
+	ld.param.u64 	%rd4, [triton__0d1d2de_param_0];
+	ld.param.u64 	%rd5, [triton__0d1d2de_param_1];
+$L__tmp0:
+	.loc	1 21 36
+	mov.u32 	%r8, %tid.x;
+	shl.b32 	%r9, %r8, 1;
+	and.b32  	%r10, %r9, 510;
+	.loc	1 20 28
+	mov.u32 %r1, %ctaid.x;
+	.loc	1 20 33
+	shl.b32 	%r11, %r1, 9;
+	.loc	1 21 23
+	or.b32  	%r12, %r11, %r10;
+	.loc	1 24 34
+	mul.wide.s32 	%rd6, %r12, 2;
+	add.s64 	%rd7, %rd4, %rd6;
+	mov.pred 	%p1, -1;
+	.loc	1 24 39
+	mov.u32 %r2, 0x0;
+	@%p1 ld.global.b32 { %r2 }, [ %rd7 + 0 ];
+	.loc	1 25 30
+	add.s64 	%rd3, %rd5, %rd6;
+	.loc	1 25 35
+	mov.u32 %r5, 0x0;
+	@%p1 ld.global.b32 { %r5 }, [ %rd3 + 0 ];
+	cvt.u16.u32 	%rs3, %r5;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r5; }
+	.loc	1 25 44
+	cvt.f32.bf16 %r6, %rs3;
+	mov.b32 	%f3, %r6;
+	cvt.f32.bf16 %r7, %rs4;
+	mov.b32 	%f4, %r7;
+	.loc	1 29 18
+	mul.f32 	%f5, %f3, 0f3F3504F3;
+	.loc	1 30 23
+	abs.ftz.f32 	%f7, %f5;
+	setp.ge.f32 	%p3, %f7, 0f3F8060FE;
+	mov.f32 	%f115, 0f3789CA3C;
+	mov.f32 	%f114, 0fB9F560B9;
+	mov.f32 	%f113, 0f3BAC840B;
+	mov.f32 	%f112, 0fBD0C8162;
+	mov.f32 	%f111, 0f3E1CF906;
+	mov.f32 	%f110, 0f3F6A937E;
+	mov.f32 	%f109, 0f3F20D842;
+	mov.f32 	%f116, %f7;
+	@%p3 bra 	$L__BB0_2;
+	.loc	1 0 23
+	mov.f32 	%f115, 0f38B1E96A;
+	mov.f32 	%f114, 0fBA574D20;
+	mov.f32 	%f113, 0f3BAAD5EA;
+	mov.f32 	%f112, 0fBCDC1BE7;
+	mov.f32 	%f111, 0f3DE718AF;
+	mov.f32 	%f110, 0fBEC093AC;
+	mov.f32 	%f109, 0f3E0375D3;
+	.loc	1 30 23
+	mul.f32 	%f116, %f5, %f5;
+$L__BB0_2:
+	.loc	1 0 0
+	cvt.u16.u32 	%rs1, %r2;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; }
+	mul.f32 	%f6, %f4, 0f3F3504F3;
+	.loc	1 30 23
+	setp.ltu.f32 	%p4, %f7, 0f3F8060FE;
+	fma.rn.ftz.f32 	%f47, %f115, %f116, %f114;
+	fma.rn.ftz.f32 	%f48, %f47, %f116, %f113;
+	fma.rn.ftz.f32 	%f49, %f48, %f116, %f112;
+	fma.rn.ftz.f32 	%f50, %f49, %f116, %f111;
+	fma.rn.ftz.f32 	%f51, %f50, %f116, %f110;
+	fma.rn.ftz.f32 	%f52, %f51, %f116, %f109;
+	neg.f32 	%f53, %f116;
+	selp.f32 	%f54, %f53, %f5, %p3;
+	fma.rn.ftz.f32 	%f117, %f52, %f54, %f54;
+	mov.f32 	%f108, 0f3F800000;
+	@%p4 bra 	$L__BB0_4;
+	ex2.approx.ftz.f32 	%f55, %f117;
+	sub.f32 	%f57, %f108, %f55;
+	mov.b32 	%r13, %f57;
+	mov.b32 	%r14, %f5;
+	and.b32  	%r15, %r14, -2147483648;
+	or.b32  	%r16, %r15, %r13;
+	mov.b32 	%f117, %r16;
+$L__BB0_4:
+	.loc	1 0 0
+	cvt.f32.bf16 %r3, %rs1;
+	cvt.f32.bf16 %r4, %rs2;
+	.loc	1 30 23
+	abs.ftz.f32 	%f20, %f6;
+	setp.ge.f32 	%p6, %f20, 0f3F8060FE;
+	mov.f32 	%f124, 0f3789CA3C;
+	mov.f32 	%f123, 0fB9F560B9;
+	mov.f32 	%f122, 0f3BAC840B;
+	mov.f32 	%f121, 0fBD0C8162;
+	mov.f32 	%f120, 0f3E1CF906;
+	mov.f32 	%f119, 0f3F6A937E;
+	mov.f32 	%f118, 0f3F20D842;
+	mov.f32 	%f125, %f20;
+	@%p6 bra 	$L__BB0_6;
+	mul.f32 	%f125, %f6, %f6;
+	mov.f32 	%f124, 0f38B1E96A;
+	mov.f32 	%f123, 0fBA574D20;
+	mov.f32 	%f122, 0f3BAAD5EA;
+	mov.f32 	%f121, 0fBCDC1BE7;
+	mov.f32 	%f120, 0f3DE718AF;
+	mov.f32 	%f119, 0fBEC093AC;
+	mov.f32 	%f118, 0f3E0375D3;
+$L__BB0_6:
+	.loc	1 0 0
+	mov.b32 	%f1, %r3;
+	mov.b32 	%f2, %r4;
+	.loc	1 30 23
+	setp.ltu.f32 	%p7, %f20, 0f3F8060FE;
+	fma.rn.ftz.f32 	%f72, %f124, %f125, %f123;
+	fma.rn.ftz.f32 	%f73, %f72, %f125, %f122;
+	fma.rn.ftz.f32 	%f74, %f73, %f125, %f121;
+	fma.rn.ftz.f32 	%f75, %f74, %f125, %f120;
+	fma.rn.ftz.f32 	%f76, %f75, %f125, %f119;
+	fma.rn.ftz.f32 	%f77, %f76, %f125, %f118;
+	neg.f32 	%f78, %f125;
+	selp.f32 	%f79, %f78, %f6, %p6;
+	fma.rn.ftz.f32 	%f126, %f77, %f79, %f79;
+	@%p7 bra 	$L__BB0_8;
+	ex2.approx.ftz.f32 	%f80, %f126;
+	sub.f32 	%f82, %f108, %f80;
+	mov.b32 	%r17, %f82;
+	mov.b32 	%r18, %f6;
+	and.b32  	%r19, %r18, -2147483648;
+	or.b32  	%r20, %r19, %r17;
+	mov.b32 	%f126, %r20;
+$L__BB0_8:
+	.loc	1 32 18
+	add.f32 	%f87, %f117, 0f3F800000;
+	add.f32 	%f88, %f126, 0f3F800000;
+	.loc	1 35 19
+	mul.f32 	%f89, %f3, %f3;
+	mul.f32 	%f90, %f4, %f4;
+	.loc	1 37 20
+	mul.f32 	%f91, %f89, 0fBF000000;
+	mul.f32 	%f92, %f90, 0fBF000000;
+	.loc	1 38 19
+	mul.f32 	%f84, %f91, 0f3FB8AA3B;
+	ex2.approx.f32 %f83, %f84;
+	mul.f32 	%f86, %f92, 0f3FB8AA3B;
+	ex2.approx.f32 %f85, %f86;
+	.loc	1 40 20
+	mul.f32 	%f93, %f83, 0f3ECC422A;
+	mul.f32 	%f94, %f85, 0f3ECC422A;
+	.loc	1 41 19
+	mul.f32 	%f95, %f3, %f93;
+	mul.f32 	%f96, %f4, %f94;
+	.loc	1 42 20
+	fma.rn.f32 	%f97, %f87, 0f3F000000, %f95;
+	fma.rn.f32 	%f98, %f88, 0f3F000000, %f96;
+	.loc	1 43 19
+	mul.f32 	%f99, %f1, %f97;
+	mul.f32 	%f100, %f2, %f98;
+	.loc	1 45 40
+	mov.b32 	%r21, %f99;
+	cvt.rn.bf16.f32 %rs5, %r21;
+	mov.b32 	%r22, %f100;
+	cvt.rn.bf16.f32 %rs6, %r22;
+	mov.b32 	%r24, {%rs5, %rs6};
+	@%p1 st.global.b32 [ %rd7 + 0 ], { %r24 };
+	.loc	1 45 4
+	ret;
+$L__tmp1:
+$L__func_end0:
+}
+	// .globl	__nv_erff
+.visible .func  (.param .b32 func_retval0) __nv_erff( // device function: one 32-bit argument in, one 32-bit value out, both handled as f32 below. NOTE(review): by its symbol name this is libdevice's single-precision erf -- confirm
+	.param .b32 __nv_erff_param_0
+)
+{
+	.reg .pred 	%p<4>;
+	.reg .b32 	%r<5>;
+	.reg .f32 	%f<49>;
+$L__func_begin1:
+	ld.param.f32 	%f14, [__nv_erff_param_0]; // %f14 = x, the argument
+	abs.ftz.f32 	%f1, %f14; // %f1 = |x|
+	setp.ge.f32 	%p1, %f1, 0f3F8060FE; // %p1 = (|x| >= 0f3F8060FE, about 1.003); selects which coefficient set is used
+	mov.f32 	%f46, 0f3789CA3C; // %f40..%f46 on this and the next six lines: coefficients kept when %p1 is true
+	mov.f32 	%f45, 0fB9F560B9;
+	mov.f32 	%f44, 0f3BAC840B;
+	mov.f32 	%f43, 0fBD0C8162;
+	mov.f32 	%f42, 0f3E1CF906;
+	mov.f32 	%f41, 0f3F6A937E;
+	mov.f32 	%f40, 0f3F20D842;
+	mov.f32 	%f47, %f1; // polynomial variable for the %p1 path is |x|
+	@%p1 bra 	$L__BB1_2; // %p1 true: keep the values loaded above
+	mul.f32 	%f47, %f14, %f14; // %p1 false: polynomial variable becomes x*x
+	mov.f32 	%f46, 0f38B1E96A; // ...and %f40..%f46 are overwritten with the second coefficient set
+	mov.f32 	%f45, 0fBA574D20;
+	mov.f32 	%f44, 0f3BAAD5EA;
+	mov.f32 	%f43, 0fBCDC1BE7;
+	mov.f32 	%f42, 0f3DE718AF;
+	mov.f32 	%f41, 0fBEC093AC;
+	mov.f32 	%f40, 0f3E0375D3;
+$L__BB1_2:
+	setp.ltu.f32 	%p2, %f1, 0f3F8060FE; // %p2 = |x| below the same threshold, or unordered (NaN)
+	fma.rn.ftz.f32 	%f29, %f46, %f47, %f45; // six chained fma steps: Horner evaluation in %f47 with coefficients %f46 down to %f40
+	fma.rn.ftz.f32 	%f30, %f29, %f47, %f44;
+	fma.rn.ftz.f32 	%f31, %f30, %f47, %f43;
+	fma.rn.ftz.f32 	%f32, %f31, %f47, %f42;
+	fma.rn.ftz.f32 	%f33, %f32, %f47, %f41;
+	fma.rn.ftz.f32 	%f34, %f33, %f47, %f40; // %f34 = polynomial value
+	neg.f32 	%f35, %f47; // %f35 = -(polynomial variable)
+	selp.f32 	%f36, %f35, %f14, %p1; // %f36 = %p1 ? -|x| : x
+	fma.rn.ftz.f32 	%f48, %f34, %f36, %f36; // %f48 = %f34 * %f36 + %f36
+	@%p2 bra 	$L__BB1_4; // %p2 true: %f48 is returned as is
+	ex2.approx.ftz.f32 	%f37, %f48; // otherwise: %f37 = 2^%f48 (approximate)
+	mov.f32 	%f38, 0f3F800000; // 1.0
+	sub.f32 	%f39, %f38, %f37; // %f39 = 1.0 - 2^%f48
+	mov.b32 	%r1, %f39;
+	mov.b32 	%r2, %f14;
+	and.b32  	%r3, %r2, -2147483648; // isolate bit 31 (the sign bit) of x
+	or.b32  	%r4, %r3, %r1; // OR the sign of x into the result bits
+	mov.b32 	%f48, %r4;
+$L__BB1_4:
+	st.param.f32 	[func_retval0+0], %f48; // return %f48
+	ret;
+$L__func_end1:
+}
+	.file	1 "/tmp/torchinductor_root/5j/c5jxaguxho3nhrlt5vcinnz5fevodumlpwn4wyb2vx3xrveicerl.py"
+	.section	.debug_abbrev
+	{
+.b8 1
+.b8 17
+.b8 1
+.b8 37
+.b8 8
+.b8 19
+.b8 5
+.b8 3
+.b8 8
+.b8 16
+.b8 6
+.b8 27
+.b8 8
+.b8 180
+.b8 66
+.b8 12
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+.b8 0
+.b8 2
+.b8 46
+.b8 0
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 64
+.b8 10
+.b8 135
+.b8 64
+.b8 8
+.b8 3
+.b8 8
+.b8 58
+.b8 11
+.b8 59
+.b8 11
+.b8 63
+.b8 12
+.b8 0
+.b8 0
+.b8 0
+	}
+	.section	.debug_info
+	{
+.b32 176
+.b8 2
+.b8 0
+.b32 .debug_abbrev
+.b8 8
+.b8 1
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2
+.b8 0
+.b8 99
+.b8 53
+.b8 106
+.b8 120
+.b8 97
+.b8 103
+.b8 117
+.b8 120
+.b8 104
+.b8 111
+.b8 51
+.b8 110
+.b8 104
+.b8 114
+.b8 108
+.b8 116
+.b8 53
+.b8 118
+.b8 99
+.b8 105
+.b8 110
+.b8 110
+.b8 122
+.b8 53
+.b8 102
+.b8 101
+.b8 118
+.b8 111
+.b8 100
+.b8 117
+.b8 109
+.b8 108
+.b8 112
+.b8 119
+.b8 110
+.b8 52
+.b8 119
+.b8 121
+.b8 98
+.b8 50
+.b8 118
+.b8 120
+.b8 51
+.b8 120
+.b8 114
+.b8 118
+.b8 101
+.b8 105
+.b8 99
+.b8 101
+.b8 114
+.b8 108
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line
+.b8 47
+.b8 116
+.b8 109
+.b8 112
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 95
+.b8 114
+.b8 111
+.b8 111
+.b8 116
+.b8 47
+.b8 53
+.b8 106
+.b8 0
+.b8 1
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 2
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 1
+.b8 156
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 101
+.b8 0
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 101
+.b8 0
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+	}
+	.section	.debug_pubnames
+	{
+.b32 $L__pubNames_end0-$L__pubNames_start0
+$L__pubNames_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 180
+.b32 125
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 101
+.b8 0
+.b32 0
+$L__pubNames_end0:
+	}
+	.section	.debug_pubtypes
+	{
+.b32 $L__pubTypes_end0-$L__pubTypes_start0
+$L__pubTypes_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 180
+.b32 0
+$L__pubTypes_end0:
+	}
+	.section	.debug_loc	{	}

.triton/dump/4d7b96448927b8146af43cb9f39e0544/triton_.ttgir ADDED Viewed

	@@ -0,0 +1,38 @@

+#blocked = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> // 1-D layout: 2 elements per thread x 32 threads per warp x 8 warps = 512 elements per CTA
+module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
+  tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { // elementwise kernel: reads bf16 from %arg0 and %arg1, writes bf16 back through %arg0; %arg2 is not referenced in the body
+    %cst = arith.constant dense<0.398942292> : tensor<512xf32, #blocked> // NOTE(review): numerically 1/sqrt(2*pi) -- confirm intent against the generating Python
+    %cst_0 = arith.constant dense<-5.000000e-01> : tensor<512xf32, #blocked>
+    %cst_1 = arith.constant dense<5.000000e-01> : tensor<512xf32, #blocked>
+    %cst_2 = arith.constant dense<1.000000e+00> : tensor<512xf32, #blocked>
+    %cst_3 = arith.constant dense<0.707106769> : tensor<512xf32, #blocked> // NOTE(review): numerically 1/sqrt(2) -- confirm intent
+    %c512_i32 = arith.constant 512 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c512_i32 : i32 // first element index handled by this program: pid * 512
+    %2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked>
+    %3 = tt.splat %1 : (i32) -> tensor<512xi32, #blocked>
+    %4 = arith.addi %3, %2 : tensor<512xi32, #blocked> // %4 = pid*512 + [0, 512): element indices
+    %5 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<512x!tt.ptr<bf16, 1>, #blocked>
+    %6 = tt.addptr %5, %4 : tensor<512x!tt.ptr<bf16, 1>, #blocked>, tensor<512xi32, #blocked> // %6 = &arg0[index]; reused as the store address at the end
+    %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xbf16, #blocked> // no mask operand: every one of the 512 elements is loaded
+    %8 = arith.extf %7 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> // %8 = arg0 values widened to f32
+    %9 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<512x!tt.ptr<bf16, 1>, #blocked>
+    %10 = tt.addptr %9, %4 : tensor<512x!tt.ptr<bf16, 1>, #blocked>, tensor<512xi32, #blocked>
+    %11 = tt.load %10 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xbf16, #blocked>
+    %12 = arith.extf %11 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> // %12 = arg1 values widened to f32 (called x in the comments below)
+    %13 = arith.mulf %12, %cst_3 : tensor<512xf32, #blocked> // x * 0.707106769
+    %14 = tt.extern_elementwise %13 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_erff"} : (tensor<512xf32, #blocked>) -> tensor<512xf32, #blocked> // elementwise call into libdevice symbol __nv_erff
+    %15 = arith.addf %14, %cst_2 : tensor<512xf32, #blocked> // + 1.0
+    %16 = arith.mulf %15, %cst_1 : tensor<512xf32, #blocked> // * 0.5  ->  %16 = 0.5 * (1 + __nv_erff(x * 0.7071...))
+    %17 = arith.mulf %12, %12 : tensor<512xf32, #blocked> // x*x
+    %18 = arith.mulf %17, %cst_0 : tensor<512xf32, #blocked> // -0.5 * x*x
+    %19 = math.exp %18 : tensor<512xf32, #blocked>
+    %20 = arith.mulf %19, %cst : tensor<512xf32, #blocked> // 0.398942292 * exp(-0.5 * x*x)
+    %21 = arith.mulf %12, %20 : tensor<512xf32, #blocked> // x * %20
+    %22 = arith.addf %16, %21 : tensor<512xf32, #blocked> // %22 = %16 + %21. NOTE(review): has the form of the derivative of erf-based GELU -- presumably a GELU backward; confirm
+    %23 = arith.mulf %8, %22 : tensor<512xf32, #blocked> // scale by the value loaded from arg0
+    %24 = arith.truncf %23 : tensor<512xf32, #blocked> to tensor<512xbf16, #blocked>
+    tt.store %6, %24 {cache = 1 : i32, evict = 1 : i32} : tensor<512xbf16, #blocked> // result overwrites arg0[index] in place (unmasked)
+    tt.return
+  }
+}

.triton/dump/4d7b96448927b8146af43cb9f39e0544/triton_.ttir ADDED Viewed

	@@ -0,0 +1,37 @@

+module {
+  tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { // layout-free form of the kernel: 512 elements per program, bf16 in via %arg0/%arg1, bf16 out via %arg0; %arg2 is unused in the body
+    %cst = arith.constant dense<0.398942292> : tensor<512xf32> // NOTE(review): numerically 1/sqrt(2*pi) -- confirm intent
+    %cst_0 = arith.constant dense<-5.000000e-01> : tensor<512xf32>
+    %cst_1 = arith.constant dense<5.000000e-01> : tensor<512xf32>
+    %cst_2 = arith.constant dense<1.000000e+00> : tensor<512xf32>
+    %cst_3 = arith.constant dense<0.707106769> : tensor<512xf32> // NOTE(review): numerically 1/sqrt(2) -- confirm intent
+    %c512_i32 = arith.constant 512 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c512_i32 : i32 // block start = pid * 512
+    %2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32>
+    %3 = tt.splat %1 : (i32) -> tensor<512xi32>
+    %4 = arith.addi %3, %2 : tensor<512xi32> // per-element index = pid*512 + offset
+    %5 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<512x!tt.ptr<bf16, 1>>
+    %6 = tt.addptr %5, %4 : tensor<512x!tt.ptr<bf16, 1>>, tensor<512xi32> // address into arg0, used for both the load and the final store
+    %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xbf16> // unmasked load
+    %8 = arith.extf %7 : tensor<512xbf16> to tensor<512xf32> // arg0 values as f32
+    %9 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<512x!tt.ptr<bf16, 1>>
+    %10 = tt.addptr %9, %4 : tensor<512x!tt.ptr<bf16, 1>>, tensor<512xi32>
+    %11 = tt.load %10 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xbf16>
+    %12 = arith.extf %11 : tensor<512xbf16> to tensor<512xf32> // arg1 values as f32 (x below)
+    %13 = arith.mulf %12, %cst_3 : tensor<512xf32> // x * 0.707106769
+    %14 = tt.extern_elementwise %13 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_erff"} : (tensor<512xf32>) -> tensor<512xf32> // libdevice __nv_erff applied elementwise
+    %15 = arith.addf %14, %cst_2 : tensor<512xf32> // + 1.0
+    %16 = arith.mulf %15, %cst_1 : tensor<512xf32> // * 0.5
+    %17 = arith.mulf %12, %12 : tensor<512xf32> // x*x
+    %18 = arith.mulf %17, %cst_0 : tensor<512xf32> // * -0.5
+    %19 = math.exp %18 : tensor<512xf32> // exp(-0.5 * x*x)
+    %20 = arith.mulf %19, %cst : tensor<512xf32> // * 0.398942292
+    %21 = arith.mulf %12, %20 : tensor<512xf32> // x * %20
+    %22 = arith.addf %16, %21 : tensor<512xf32> // sum of the erf term and the exp term. NOTE(review): looks like d/dx of erf-based GELU -- confirm
+    %23 = arith.mulf %8, %22 : tensor<512xf32> // multiplied by the arg0 value
+    %24 = arith.truncf %23 : tensor<512xf32> to tensor<512xbf16>
+    tt.store %6, %24 {cache = 1 : i32, evict = 1 : i32} : tensor<512xbf16> // written back over arg0 (unmasked)
+    tt.return
+  }
+}

.triton/dump/645565eaba0a18dd23ef200fe9abb0c0/triton_.cubin ADDED Viewed

Binary file (19.5 kB). View file

.triton/dump/645565eaba0a18dd23ef200fe9abb0c0/triton_.ptx ADDED Viewed

	@@ -0,0 +1,834 @@

+//
+// Generated by LLVM NVPTX Back-End
+//
+.version 8.2
+.target sm_89
+.address_size 64
+	// .globl	triton__0d1d2d3d4d5d6d7d8de9de
+.extern .func __assertfail // external function declaration; the kernel below calls it when its index bounds check fails
+(
+	.param .b64 __assertfail_param_0,
+	.param .b64 __assertfail_param_1,
+	.param .b32 __assertfail_param_2,
+	.param .b64 __assertfail_param_3,
+	.param .b64 __assertfail_param_4
+)
+;
+.global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100}; // ASCII "_call_with_frames_removed" (no terminating NUL in the array)
+.global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62}; // ASCII "<frozen importlib._bootstrap_external>"
+.global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55}; // ASCII "index out of bounds: 0 <= tmp3 < 50257"
+.extern .shared .align 1 .b8 global_smem[]; // externally sized shared-memory array; the kernel uses it as scratch for its reductions
+.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0}; // ASCII "__CUDA_FTZ" plus NUL
+.visible .entry triton__0d1d2d3d4d5d6d7d8de9de( // kernel entry. Per CTA: index-selected row load + add, mean / variance / rsqrt over 256 values, scaled bf16 output. Params 0-7 are used as global addresses; params 8-9 are never loaded in this body
+	.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_0,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_1,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_2,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_3,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_4,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_5,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_6,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_7,
+	.param .u32 triton__0d1d2d3d4d5d6d7d8de9de_param_8,
+	.param .u32 triton__0d1d2d3d4d5d6d7d8de9de_param_9
+)
+.maxntid 64, 1, 1 // at most 64 threads per CTA in x
+{
+	.reg .pred 	%p<36>;
+	.reg .b16 	%rs<5>;
+	.reg .b32 	%r<109>;
+	.reg .f32 	%f<70>;
+	.reg .b64 	%rd<49>;
+	.loc	1 18 0
+$L__func_begin0:
+	.loc	1 18 0
+	ld.param.u64 	%rd8, [triton__0d1d2d3d4d5d6d7d8de9de_param_7]; // base of the final bf16 store
+	ld.param.u64 	%rd7, [triton__0d1d2d3d4d5d6d7d8de9de_param_6]; // base of the per-CTA store of the mean (%r74)
+	ld.param.u64 	%rd6, [triton__0d1d2d3d4d5d6d7d8de9de_param_5]; // base of the f32 store of the summed inputs
+	ld.param.u64 	%rd5, [triton__0d1d2d3d4d5d6d7d8de9de_param_2]; // base of the index-selected v4 load
+	ld.param.u64 	%rd4, [triton__0d1d2d3d4d5d6d7d8de9de_param_0]; // base of the per-CTA store of the rsqrt value
+$L__tmp0:
+	.loc	1 26 26
+	mov.u32 	%r1, %tid.x;
+	ld.param.u64 	%rd21, [triton__0d1d2d3d4d5d6d7d8de9de_param_1]; // base of the 64-bit index load
+	and.b32  	%r2, %r1, 63; // %r2 = tid.x & 63
+	shl.b32 	%r28, %r2, 2; // %r28 = 4 * %r2: first of the 4 consecutive elements this thread handles (see the v4 loads below)
+	ld.param.u64 	%rd22, [triton__0d1d2d3d4d5d6d7d8de9de_param_3]; // base of the v4 load addressed by (ctaid.x rem 512) and thread
+	ld.param.u64 	%rd23, [triton__0d1d2d3d4d5d6d7d8de9de_param_4]; // base of the v4 load addressed by thread only
+	.loc	1 23 28
+	mov.u32 %r11, %ctaid.x;
+	.loc	1 30 18
+	shr.s32 	%r29, %r11, 31; // this and the next four lines: remainder-by-512 idiom on ctaid.x (sign-derived bias, mask, subtract)
+	shr.u32 	%r30, %r29, 23; // bias = 511 when %r11 is negative, else 0
+	add.s32 	%r31, %r11, %r30;
+	and.b32  	%r32, %r31, 16776704; // 16776704 = 0xFFFE00: clears the low 9 bits (and the top 8 bits)
+	sub.s32 	%r33, %r11, %r32; // %r33 = ctaid.x rem 512
+	.loc	1 31 30
+	cvt.s64.s32 	%rd1, %r11;
+	mul.wide.s32 	%rd24, %r11, 8;
+	add.s64 	%rd10, %rd21, %rd24; // %rd10 = param_1 + ctaid.x * 8: address of this CTA's 64-bit index
+	mov.pred 	%p18, -1; // %p18 is a constant predicate and is never reassigned, so the @%p18 operations below are unconditional
+	.loc	1 31 35
+	mov.u64 %rd9, 0x0;
+	@%p18 ld.global.L1::evict_last.b64 { %rd9 }, [ %rd10 + 0 ]; // the same address is loaded five times (into %rd9/%rd11/%rd13/%rd15/%rd17); only %rd9 and %rd17 are read later in this kernel
+	mov.u64 %rd11, 0x0;
+	@%p18 ld.global.L1::evict_last.b64 { %rd11 }, [ %rd10 + 0 ];
+	mov.u64 %rd13, 0x0;
+	@%p18 ld.global.L1::evict_last.b64 { %rd13 }, [ %rd10 + 0 ];
+	mov.u64 %rd15, 0x0;
+	@%p18 ld.global.L1::evict_last.b64 { %rd15 }, [ %rd10 + 0 ];
+	mov.u64 %rd17, 0x0;
+	@%p18 ld.global.L1::evict_last.b64 { %rd17 }, [ %rd10 + 0 ];
+	.loc	1 32 40
+	shl.b32 	%r34, %r33, 8; // (ctaid.x rem 512) * 256
+	.loc	1 32 36
+	or.b32  	%r35, %r34, %r28; // | element offset (low 8 bits of %r34 are zero, so OR acts as add)
+	.loc	1 32 30
+	mul.wide.s32 	%rd25, %r35, 4;
+	add.s64 	%rd19, %rd22, %rd25; // %rd19 = param_3 + 4 bytes * %r35
+	mov.b32 	%r41, 0; // fallback value for the masked-off case of the v4 loads
+	.loc	1 32 46
+	mov.u32 %r12, 0x0;
+	mov.u32 %r13, 0x0;
+	mov.u32 %r14, 0x0;
+	mov.u32 %r15, 0x0;
+	@%p18 ld.global.L1::evict_last.v4.b32 { %r12, %r13, %r14, %r15 }, [ %rd19 + 0 ]; // four 32-bit values from param_3
+	@!%p18 mov.u32 %r12, %r41; // the four @!%p18 moves are the mask-false fallback; with the constant %p18 they do not execute
+	@!%p18 mov.u32 %r13, %r41;
+	@!%p18 mov.u32 %r14, %r41;
+	@!%p18 mov.u32 %r15, %r41;
+	.loc	1 33 31
+	cvt.u64.u32 	%rd3, %r28;
+	mul.wide.u32 	%rd26, %r28, 4;
+	add.s64 	%rd20, %rd23, %rd26; // %rd20 = param_4 + 4 bytes * element offset (no CTA term)
+	.loc	1 33 36
+	mov.u32 %r20, 0x0;
+	mov.u32 %r21, 0x0;
+	mov.u32 %r22, 0x0;
+	mov.u32 %r23, 0x0;
+	@%p18 ld.global.L1::evict_last.v4.b32 { %r20, %r21, %r22, %r23 }, [ %rd20 + 0 ]; // four 32-bit values from param_4, used as the final multiplier
+	@!%p18 mov.u32 %r20, %r41;
+	@!%p18 mov.u32 %r21, %r41;
+	@!%p18 mov.u32 %r22, %r41;
+	@!%p18 mov.u32 %r23, %r41;
+	.loc	1 34 18
+	add.s64 	%rd27, %rd17, 50257; // index + 50257
+	.loc	1 35 18
+	setp.lt.s64 	%p16, %rd17, 0; // index < 0 ?
+	.loc	1 36 32
+	selp.b64 	%rd28, %rd27, %rd17, %p16; // negative indices are wrapped by adding 50257
+	.loc	1 37 36
+	setp.lt.u64 	%p17, %rd28, 50257; // unsigned compare: true exactly when 0 <= wrapped index < 50257
+	.loc	1 37 51
+	@%p17 bra 	$L__BB0_2; // in bounds: skip the assert call
+	mov.u64 	%rd29, assertMessage_0;
+	cvta.global.u64 	%rd30, %rd29;
+	mov.u64 	%rd31, assertFile_0;
+	cvta.global.u64 	%rd32, %rd31;
+	mov.u64 	%rd33, assertFunc_0;
+	cvta.global.u64 	%rd34, %rd33;
+	mov.b32 	%r36, 883;
+	mov.u64 	%rd35, 1;
+	{ // callseq 0, 0
+	.reg .b32 temp_param_reg;
+	.param .b64 param0;
+	st.param.b64 	[param0+0], %rd30; // argument 0: address of assertMessage_0
+	.param .b64 param1;
+	st.param.b64 	[param1+0], %rd32; // argument 1: address of assertFile_0
+	.param .b32 param2;
+	st.param.b32 	[param2+0], %r36; // argument 2: 883
+	.param .b64 param3;
+	st.param.b64 	[param3+0], %rd34; // argument 3: address of assertFunc_0
+	.param .b64 param4;
+	st.param.b64 	[param4+0], %rd35; // argument 4: 1
+	call.uni
+	__assertfail,
+	(
+	param0,
+	param1,
+	param2,
+	param3,
+	param4
+	);
+	} // callseq 0
+$L__BB0_2:
+	.loc	1 35 18
+	setp.lt.s64 	%p33, %rd9, 0; // same negative-index test, on the %rd9 copy of the index
+	.loc	1 26 26
+	and.b32  	%r75, %r1, 31; // %r75 = tid.x & 31
+	.loc	1 38 40
+	shl.b64 	%rd41, %rd9, 8; // index * 256
+	add.s64 	%rd42, %rd41, 12865792; // 12865792 = 50257 * 256: the wrapped form of a negative index
+	selp.b64 	%rd43, %rd42, %rd41, %p33;
+	.loc	1 38 36
+	or.b64  	%rd44, %rd43, %rd3; // | element offset
+	.loc	1 38 30
+	shl.b64 	%rd45, %rd44, 2;
+	add.s64 	%rd36, %rd5, %rd45; // %rd36 = param_2 + 4 bytes * (wrapped index * 256 + element offset)
+	.loc	1 38 48
+	mov.u32 %r37, 0x0;
+	mov.u32 %r38, 0x0;
+	mov.u32 %r39, 0x0;
+	mov.u32 %r40, 0x0;
+	@%p18 ld.global.v4.b32 { %r37, %r38, %r39, %r40 }, [ %rd36 + 0 ]; // four 32-bit values from the index-selected row of param_2
+	@!%p18 mov.u32 %r37, %r41;
+	@!%p18 mov.u32 %r38, %r41;
+	@!%p18 mov.u32 %r39, %r41;
+	@!%p18 mov.u32 %r40, %r41;
+	.loc	1 32 46
+	mov.b32 	%f1, %r12;
+	mov.b32 	%f2, %r13;
+	.loc	1 38 48
+	mov.b32 	%f3, %r37;
+	mov.b32 	%f4, %r38;
+	.loc	1 39 18
+	add.f32 	%f5, %f2, %f4; // %f6, %f5, %f11, %f12 = elementwise sums of the param_3 and param_2 vectors (elements 0..3)
+	mov.b32 	%r64, %f5;
+	add.f32 	%f6, %f1, %f3;
+	.loc	1 32 46
+	mov.b32 	%f7, %r15;
+	mov.b32 	%f8, %r14;
+	.loc	1 38 48
+	mov.b32 	%f9, %r40;
+	mov.b32 	%f10, %r39;
+	.loc	1 39 18
+	add.f32 	%f11, %f8, %f10;
+	mov.b32 	%r65, %f11;
+	add.f32 	%f12, %f7, %f9;
+$L__tmp1:
+	.loc	2 233 15
+	add.f32 	%f13, %f6, %f5; // per-thread partial sum of its four values -> %f15
+	add.f32 	%f14, %f11, %f13;
+	add.f32 	%f15, %f12, %f14;
+$L__tmp2:
+	.loc	2 243 36
+	mov.b32 	%r76, %f15;
+	shfl.sync.bfly.b32	%r77, %r76, 16, 31, -1; // butterfly shuffles with offsets 16, 8, 4, 2, 1, each followed by an add: sum across the 32 lanes of the warp -> %f25
+	mov.b32 	%f16, %r77;
+$L__tmp3:
+	.loc	2 233 15
+	add.f32 	%f17, %f15, %f16;
+$L__tmp4:
+	.loc	2 243 36
+	mov.b32 	%r78, %f17;
+	shfl.sync.bfly.b32	%r79, %r78, 8, 31, -1;
+	mov.b32 	%f18, %r79;
+$L__tmp5:
+	.loc	2 233 15
+	add.f32 	%f19, %f17, %f18;
+$L__tmp6:
+	.loc	2 243 36
+	mov.b32 	%r80, %f19;
+	shfl.sync.bfly.b32	%r81, %r80, 4, 31, -1;
+	mov.b32 	%f20, %r81;
+$L__tmp7:
+	.loc	2 233 15
+	add.f32 	%f21, %f19, %f20;
+$L__tmp8:
+	.loc	2 243 36
+	mov.b32 	%r82, %f21;
+	shfl.sync.bfly.b32	%r83, %r82, 2, 31, -1;
+	mov.b32 	%f22, %r83;
+$L__tmp9:
+	.loc	2 233 15
+	add.f32 	%f23, %f21, %f22;
+$L__tmp10:
+	.loc	2 243 36
+	mov.b32 	%r84, %f23;
+	shfl.sync.bfly.b32	%r85, %r84, 1, 31, -1;
+	mov.b32 	%f24, %r85;
+$L__tmp11:
+	.loc	2 233 15
+	add.f32 	%f25, %f23, %f24;
+$L__tmp12:
+	.loc	2 243 36
+	setp.eq.s32 	%p23, %r75, 0; // %p23 = (tid.x & 31) == 0: one writer per group of 32 threads
+	shr.u32 	%r86, %r1, 3;
+	and.b32  	%r87, %r86, 4; // byte offset 0 or 4, taken from bit 5 of tid.x
+	mov.u32 	%r88, global_smem;
+	add.s32 	%r45, %r88, %r87; // %r45 = shared scratch slot for this thread's group
+	mov.b32 	%r46, %f25;
+	@%p23 st.shared.b32 [ %r45 + 0 ], %r46; // publish the warp-level sum
+	bar.sync 	0;
+	setp.lt.s32 	%p24, %r1, 2; // %p24 = tid.x < 2
+	shl.b32 	%r89, %r1, 2;
+	add.s32 	%r48, %r88, %r89; // %r48 = global_smem + 4 * tid.x
+	@%p24 ld.shared.b32 %r47, [ %r48 + 0 ]; // threads 0 and 1 read one partial each
+	mov.b32 	%f26, %r47;
+	shfl.sync.bfly.b32	%r90, %r47, 1, 31, -1; // exchange with the neighbouring lane
+	mov.b32 	%f27, %r90;
+$L__tmp13:
+	.loc	2 233 15
+	add.f32 	%f28, %f26, %f27; // sum of the two partials
+$L__tmp14:
+	.loc	2 243 36
+	and.b32  	%r91, %r1, 1;
+	setp.eq.b32 	%p34, %r91, 1;
+	not.pred 	%p35, %p34;
+	and.pred  	%p25, %p24, %p35; // %p25 = tid.x < 2 and tid.x even
+	mov.b32 	%r50, %f28;
+	@%p25 st.shared.b32 [ %r48 + 0 ], %r50; // write the combined sum back to shared scratch
+	bar.sync 	0;
+	ld.shared.f32 	%f29, [global_smem]; // every thread reads the total from the first scratch slot
+$L__tmp15:
+	.loc	3 8 15
+	add.f32 	%f30, %f29, 0f00000000; // + 0.0
+$L__tmp16:
+	.loc	1 47 20
+	mov.b32 	%r52, %f30;
+	mov.b32 	%r53, 1132462080; // 1132462080 = 0x43800000 = 256.0f
+	div.full.f32 %r74, %r52, %r53; // %r74 = sum / 256: the mean
+	mov.b32 	%f31, %r74;
+	.loc	1 48 19
+	sub.f32 	%f32, %f6, %f31; // %f32..%f35 = the four values minus the mean
+	sub.f32 	%f33, %f5, %f31;
+	sub.f32 	%f34, %f11, %f31;
+	sub.f32 	%f35, %f12, %f31;
+	.loc	1 49 20
+	mul.f32 	%f36, %f33, %f33;
+$L__tmp17:
+	.loc	2 243 36
+	bar.sync 	0; // barrier before the second reduction (presumably so the shared scratch can be reused -- confirm)
+$L__tmp18:
+	.loc	2 233 15
+	fma.rn.f32 	%f37, %f32, %f32, %f36; // per-thread sum of squared deviations -> %f39
+	fma.rn.f32 	%f38, %f34, %f34, %f37;
+	fma.rn.f32 	%f39, %f35, %f35, %f38;
+$L__tmp19:
+	.loc	2 243 36
+	mov.b32 	%r92, %f39;
+	shfl.sync.bfly.b32	%r93, %r92, 16, 31, -1; // second reduction: same shuffle/add sequence (offsets 16, 8, 4, 2, 1) -> %f49
+	mov.b32 	%f40, %r93;
+$L__tmp20:
+	.loc	2 233 15
+	add.f32 	%f41, %f39, %f40;
+$L__tmp21:
+	.loc	2 243 36
+	mov.b32 	%r94, %f41;
+	shfl.sync.bfly.b32	%r95, %r94, 8, 31, -1;
+	mov.b32 	%f42, %r95;
+$L__tmp22:
+	.loc	2 233 15
+	add.f32 	%f43, %f41, %f42;
+$L__tmp23:
+	.loc	2 243 36
+	mov.b32 	%r96, %f43;
+	shfl.sync.bfly.b32	%r97, %r96, 4, 31, -1;
+	mov.b32 	%f44, %r97;
+$L__tmp24:
+	.loc	2 233 15
+	add.f32 	%f45, %f43, %f44;
+$L__tmp25:
+	.loc	2 243 36
+	mov.b32 	%r98, %f45;
+	shfl.sync.bfly.b32	%r99, %r98, 2, 31, -1;
+	mov.b32 	%f46, %r99;
+$L__tmp26:
+	.loc	2 233 15
+	add.f32 	%f47, %f45, %f46;
+$L__tmp27:
+	.loc	2 243 36
+	mov.b32 	%r100, %f47;
+	shfl.sync.bfly.b32	%r101, %r100, 1, 31, -1;
+	mov.b32 	%f48, %r101;
+$L__tmp28:
+	.loc	2 233 15
+	add.f32 	%f49, %f47, %f48;
+$L__tmp29:
+	.loc	2 243 36
+	mov.b32 	%r55, %f49;
+	@%p23 st.shared.b32 [ %r45 + 0 ], %r55; // publish the warp-level sum of squares to the same scratch slots
+	bar.sync 	0;
+	@%p24 ld.shared.b32 %r56, [ %r48 + 0 ];
+	mov.b32 	%f50, %r56;
+	shfl.sync.bfly.b32	%r102, %r56, 1, 31, -1;
+	mov.b32 	%f51, %r102;
+$L__tmp30:
+	.loc	2 233 15
+	add.f32 	%f52, %f50, %f51;
+$L__tmp31:
+	.loc	2 243 36
+	mov.b32 	%r59, %f52;
+	@%p25 st.shared.b32 [ %r48 + 0 ], %r59;
+	bar.sync 	0;
+	ld.shared.f32 	%f53, [global_smem]; // total sum of squared deviations
+$L__tmp32:
+	.loc	3 8 15
+	add.f32 	%f54, %f53, 0f00000000;
+$L__tmp33:
+	.loc	1 54 20
+	mov.b32 	%r61, %f54;
+	div.full.f32 %r60, %r61, %r53; // / 256: mean squared deviation
+	mov.b32 	%f55, %r60;
+	.loc	1 56 20
+	add.f32 	%f56, %f55, 0f3727C5AC; // + 0f3727C5AC (about 1e-5)
+	.loc	1 57 26
+	rsqrt.approx.ftz.f32 	%f57, %f56; // %f57 = approximate 1/sqrt(%f56)
+	cvt.u32.u64 	%r103, %rd3;
+	cvt.u32.u64 	%r104, %rd1;
+	.loc	1 33 36
+	mov.b32 	%f58, %r20;
+	mov.b32 	%f59, %r21;
+	mov.b32 	%f60, %r22;
+	mov.b32 	%f61, %r23;
+	.loc	1 59 20
+	mul.f32 	%f62, %f32, %f57; // centred values * rsqrt
+	mul.f32 	%f63, %f33, %f57;
+	mul.f32 	%f64, %f34, %f57;
+	mul.f32 	%f65, %f35, %f57;
+	.loc	1 60 20
+	mul.f32 	%f66, %f62, %f58; // ...times the per-element values loaded from param_4
+	mul.f32 	%f67, %f63, %f59;
+	mul.f32 	%f68, %f64, %f60;
+	mul.f32 	%f69, %f65, %f61;
+	.loc	1 62 35
+	shl.b32 	%r105, %r104, 8; // ctaid.x * 256
+	.loc	1 62 31
+	or.b32  	%r106, %r105, %r103; // %r106 = ctaid.x * 256 + element offset: output element index
+	.loc	1 62 25
+	mul.wide.s32 	%rd46, %r106, 4;
+	add.s64 	%rd37, %rd6, %rd46;
+	.loc	1 39 18
+	mov.b32 	%r63, %f6;
+	mov.b32 	%r66, %f12;
+	.loc	1 62 47
+	@%p18 st.global.v4.b32 [ %rd37 + 0 ], { %r63, %r64, %r65, %r66 }; // store the four un-normalised sums as f32 to param_5
+	.loc	1 63 4
+	bar.sync 	0;
+	.loc	1 64 28
+	shl.b64 	%rd47, %rd1, 2; // ctaid.x * 4 bytes
+	add.s64 	%rd38, %rd4, %rd47;
+	.loc	1 64 40
+	setp.eq.s32 	%p30, %r2, 0; // %p30 = (tid.x & 63) == 0: a single thread performs the per-CTA scalar stores
+	mov.b32 	%r67, %f57;
+	@%p30 st.global.b32 [ %rd38 + 0 ], { %r67 }; // param_0[ctaid.x] = rsqrt value
+	.loc	1 65 25
+	mul.wide.s32 	%rd48, %r106, 2; // 2 bytes per bf16 element
+	add.s64 	%rd39, %rd8, %rd48;
+	.loc	1 65 48
+	mov.b32 	%r68, %f66;
+	cvt.rn.bf16.f32 %rs1, %r68; // round the four scaled results to bf16
+	mov.b32 	%r69, %f67;
+	cvt.rn.bf16.f32 %rs2, %r69;
+	mov.b32 	%r70, %f68;
+	cvt.rn.bf16.f32 %rs3, %r70;
+	mov.b32 	%r71, %f69;
+	cvt.rn.bf16.f32 %rs4, %r71;
+	mov.b32 	%r107, {%rs1, %rs2}; // pack two bf16 values per 32-bit register
+	mov.b32 	%r108, {%rs3, %rs4};
+	@%p18 st.global.v2.b32 [ %rd39 + 0 ], { %r107, %r108 }; // store four bf16 results to param_7
+	.loc	1 66 25
+	add.s64 	%rd40, %rd7, %rd47;
+	.loc	1 66 37
+	@%p30 st.global.b32 [ %rd40 + 0 ], { %r74 }; // param_6[ctaid.x] = mean
+	.loc	1 66 4
+	ret;
+$L__tmp34:
+$L__func_end0:
+}
+	// .globl	__nv_rsqrtf
+.visible .func  (.param .b32 func_retval0) __nv_rsqrtf( // device function: returns the approximate reciprocal square root of its single f32 argument
+	.param .b32 __nv_rsqrtf_param_0
+)
+{
+	.reg .f32 	%f<3>;
+$L__func_begin1:
+	ld.param.f32 	%f1, [__nv_rsqrtf_param_0]; // %f1 = argument
+	rsqrt.approx.ftz.f32 	%f2, %f1; // single approximate, flush-to-zero rsqrt instruction
+	st.param.f32 	[func_retval0+0], %f2; // return %f2
+	ret;
+$L__func_end1:
+}
+	.file	1 "/tmp/torchinductor_root/pd/cpdqiwgwgnzx7tsvbieui7kffx5dt43uhgvg7z7egekxcsybpv34.py"
+	.file	2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
+	.file	3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
+	.section	.debug_abbrev
+	{
+.b8 1
+.b8 17
+.b8 1
+.b8 37
+.b8 8
+.b8 19
+.b8 5
+.b8 3
+.b8 8
+.b8 16
+.b8 6
+.b8 27
+.b8 8
+.b8 180
+.b8 66
+.b8 12
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+.b8 0
+.b8 2
+.b8 46
+.b8 0
+.b8 135
+.b8 64
+.b8 8
+.b8 3
+.b8 8
+.b8 58
+.b8 11
+.b8 59
+.b8 11
+.b8 63
+.b8 12
+.b8 32
+.b8 11
+.b8 0
+.b8 0
+.b8 3
+.b8 46
+.b8 1
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 64
+.b8 10
+.b8 49
+.b8 19
+.b8 0
+.b8 0
+.b8 4
+.b8 29
+.b8 1
+.b8 49
+.b8 19
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 88
+.b8 11
+.b8 89
+.b8 11
+.b8 87
+.b8 11
+.b8 0
+.b8 0
+.b8 5
+.b8 29
+.b8 0
+.b8 49
+.b8 19
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 88
+.b8 11
+.b8 89
+.b8 11
+.b8 87
+.b8 11
+.b8 0
+.b8 0
+.b8 0
+	}
+	.section	.debug_info
+	{
+.b32 407
+.b8 2
+.b8 0
+.b32 .debug_abbrev
+.b8 8
+.b8 1
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2
+.b8 0
+.b8 99
+.b8 112
+.b8 100
+.b8 113
+.b8 105
+.b8 119
+.b8 103
+.b8 119
+.b8 103
+.b8 110
+.b8 122
+.b8 120
+.b8 55
+.b8 116
+.b8 115
+.b8 118
+.b8 98
+.b8 105
+.b8 101
+.b8 117
+.b8 105
+.b8 55
+.b8 107
+.b8 102
+.b8 102
+.b8 120
+.b8 53
+.b8 100
+.b8 116
+.b8 52
+.b8 51
+.b8 117
+.b8 104
+.b8 103
+.b8 118
+.b8 103
+.b8 55
+.b8 122
+.b8 55
+.b8 101
+.b8 103
+.b8 101
+.b8 107
+.b8 120
+.b8 99
+.b8 115
+.b8 121
+.b8 98
+.b8 112
+.b8 118
+.b8 51
+.b8 52
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line
+.b8 47
+.b8 116
+.b8 109
+.b8 112
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 95
+.b8 114
+.b8 111
+.b8 111
+.b8 116
+.b8 47
+.b8 112
+.b8 100
+.b8 0
+.b8 1
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 2
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 51
+.b8 100
+.b8 52
+.b8 100
+.b8 53
+.b8 100
+.b8 54
+.b8 100
+.b8 55
+.b8 100
+.b8 56
+.b8 100
+.b8 101
+.b8 57
+.b8 100
+.b8 101
+.b8 0
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 51
+.b8 100
+.b8 52
+.b8 100
+.b8 53
+.b8 100
+.b8 54
+.b8 100
+.b8 55
+.b8 100
+.b8 56
+.b8 100
+.b8 101
+.b8 57
+.b8 100
+.b8 101
+.b8 0
+.b8 1
+.b8 18
+.b8 1
+.b8 1
+.b8 3
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 1
+.b8 156
+.b32 125
+.b8 4
+.b32 125
+.b64 $L__tmp1
+.b64 $L__tmp14
+.b8 2
+.b8 44
+.b8 59
+.b8 5
+.b32 125
+.b64 $L__tmp1
+.b64 $L__tmp14
+.b8 2
+.b8 243
+.b8 36
+.b8 0
+.b8 5
+.b32 125
+.b64 $L__tmp2
+.b64 $L__tmp15
+.b8 2
+.b8 44
+.b8 59
+.b8 5
+.b32 125
+.b64 $L__tmp15
+.b64 $L__tmp16
+.b8 3
+.b8 44
+.b8 45
+.b8 5
+.b32 125
+.b64 $L__tmp17
+.b64 $L__tmp32
+.b8 2
+.b8 52
+.b8 59
+.b8 4
+.b32 125
+.b64 $L__tmp18
+.b64 $L__tmp31
+.b8 2
+.b8 52
+.b8 59
+.b8 5
+.b32 125
+.b64 $L__tmp18
+.b64 $L__tmp31
+.b8 2
+.b8 243
+.b8 36
+.b8 0
+.b8 5
+.b32 125
+.b64 $L__tmp32
+.b64 $L__tmp33
+.b8 3
+.b8 52
+.b8 45
+.b8 0
+.b8 0
+	}
+	.section	.debug_pubnames
+	{
+.b32 $L__pubNames_end0-$L__pubNames_start0
+$L__pubNames_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 411
+.b32 125
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 51
+.b8 100
+.b8 52
+.b8 100
+.b8 53
+.b8 100
+.b8 54
+.b8 100
+.b8 55
+.b8 100
+.b8 56
+.b8 100
+.b8 101
+.b8 57
+.b8 100
+.b8 101
+.b8 0
+.b32 0
+$L__pubNames_end0:
+	}
+	.section	.debug_pubtypes
+	{
+.b32 $L__pubTypes_end0-$L__pubTypes_start0
+$L__pubTypes_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 411
+.b32 0
+$L__pubTypes_end0:
+	}
+	.section	.debug_loc	{	}

.triton/dump/645565eaba0a18dd23ef200fe9abb0c0/triton_.ttgir ADDED Viewed

	@@ -0,0 +1,98 @@

+#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> // layout for the 256-wide tensors: 4 elements per thread x 32 threads x 2 warps
+#blocked1 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> // layout with 1 element per thread, used for the single-element tensors
+module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
+  tt.func public @triton__0d1d2d3d4d5d6d7d8de9de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg7: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg9: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { // one program per row of 256 values: reads %arg1..%arg4, writes %arg0, %arg5, %arg6, %arg7; %arg8 and %arg9 are not referenced in the body
+    %cst = arith.constant dense<256> : tensor<256xi32, #blocked>
+    %cst_0 = arith.constant dense<0> : tensor<1xi64, #blocked>
+    %cst_1 = arith.constant dense<50257> : tensor<1xi64, #blocked>
+    %cst_2 = arith.constant dense<256> : tensor<1xi64, #blocked>
+    %cst_3 = arith.constant 9.99999974E-6 : f32
+    %cst_4 = arith.constant 2.560000e+02 : f32
+    %cst_5 = arith.constant 0.000000e+00 : f32
+    %c256_i32 = arith.constant 256 : i32
+    %c512_i32 = arith.constant 512 : i32
+    %cst_6 = arith.constant dense<50257> : tensor<1xi64, #blocked1>
+    %cst_7 = arith.constant dense<0> : tensor<1xi64, #blocked1>
+    %cst_8 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked>
+    %0 = tt.get_program_id x : i32
+    %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked> // element offsets 0..255 within the row
+    %2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked> // mask: offset < 256 (true for every element of the 0..255 range)
+    %3 = arith.remsi %0, %c512_i32 : i32 // pid rem 512
+    %4 = tt.addptr %arg1, %0 : !tt.ptr<i64, 1>, i32 // &arg1[pid]
+    %5 = tt.splat %4 : (!tt.ptr<i64, 1>) -> tensor<1x!tt.ptr<i64, 1>, #blocked>
+    %6 = tt.splat %4 : (!tt.ptr<i64, 1>) -> tensor<1x!tt.ptr<i64, 1>, #blocked1>
+    %7 = tt.load %5 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xi64, #blocked> // the same i64 index is loaded twice, once per layout (%7 and %8)
+    %8 = tt.load %6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xi64, #blocked1>
+    %9 = arith.muli %3, %c256_i32 : i32
+    %10 = tt.splat %9 : (i32) -> tensor<256xi32, #blocked>
+    %11 = arith.addi %1, %10 : tensor<256xi32, #blocked> // (pid rem 512) * 256 + offset
+    %12 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
+    %13 = tt.addptr %12, %11 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
+    %14 = tt.load %13, %2, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked> // %14 = arg3[(pid rem 512) * 256 + offset]
+    %15 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
+    %16 = tt.addptr %15, %1 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
+    %17 = tt.load %16, %2, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked> // %17 = arg4[offset]: the same 256 values for every program
+    %18 = arith.addi %7, %cst_1 : tensor<1xi64, #blocked> // index + 50257
+    %19 = arith.addi %8, %cst_6 : tensor<1xi64, #blocked1>
+    %20 = arith.cmpi slt, %7, %cst_0 : tensor<1xi64, #blocked> // index < 0 ?
+    %21 = arith.cmpi slt, %8, %cst_7 : tensor<1xi64, #blocked1>
+    %22 = arith.select %20, %18, %7 : tensor<1xi1, #blocked>, tensor<1xi64, #blocked> // negative indices wrap by 50257
+    %23 = arith.select %21, %19, %8 : tensor<1xi1, #blocked1>, tensor<1xi64, #blocked1>
+    %24 = arith.cmpi sge, %23, %cst_7 : tensor<1xi64, #blocked1>
+    %25 = arith.cmpi slt, %23, %cst_6 : tensor<1xi64, #blocked1>
+    %26 = arith.andi %24, %25 : tensor<1xi1, #blocked1> // 0 <= wrapped index < 50257
+    tt.assert %26, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<1xi1, #blocked1>
+    %27 = arith.muli %22, %cst_2 : tensor<1xi64, #blocked> // wrapped index * 256: start of the selected row
+    %28 = tt.broadcast %27 : (tensor<1xi64, #blocked>) -> tensor<256xi64, #blocked>
+    %29 = arith.extsi %1 : tensor<256xi32, #blocked> to tensor<256xi64, #blocked>
+    %30 = arith.addi %29, %28 : tensor<256xi64, #blocked>
+    %31 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
+    %32 = tt.addptr %31, %30 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi64, #blocked>
+    %33 = tt.load %32, %2, %cst_8 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked> // %33 = arg2[wrapped index * 256 + offset]
+    %34 = arith.addf %33, %14 : tensor<256xf32, #blocked> // %34 = selected arg2 row + arg3 row. NOTE(review): presumably token embedding + position embedding -- confirm
+    %35 = arith.select %2, %34, %cst_8 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
+    %36 = "tt.reduce"(%35) <{axis = 0 : i32}> ({ // sum of the 256 values
+    ^bb0(%arg10: f32, %arg11: f32):
+      %65 = arith.addf %arg10, %arg11 : f32
+      tt.reduce.return %65 : f32
+    }) : (tensor<256xf32, #blocked>) -> f32
+    %37 = arith.addf %36, %cst_5 : f32
+    %38 = arith.divf %37, %cst_4 : f32 // %38 = sum / 256: the mean
+    %39 = tt.splat %38 : (f32) -> tensor<1xf32, #blocked1>
+    %40 = tt.splat %38 : (f32) -> tensor<256xf32, #blocked>
+    %41 = arith.subf %34, %40 : tensor<256xf32, #blocked> // values minus the mean
+    %42 = arith.mulf %41, %41 : tensor<256xf32, #blocked> // squared deviations
+    %43 = arith.select %2, %42, %cst_8 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
+    %44 = "tt.reduce"(%43) <{axis = 0 : i32}> ({ // sum of the squared deviations
+    ^bb0(%arg10: f32, %arg11: f32):
+      %65 = arith.addf %arg10, %arg11 : f32
+      tt.reduce.return %65 : f32
+    }) : (tensor<256xf32, #blocked>) -> f32
+    %45 = arith.addf %44, %cst_5 : f32
+    %46 = arith.divf %45, %cst_4 : f32 // / 256: mean squared deviation
+    %47 = arith.addf %46, %cst_3 : f32 // + 9.99999974E-6
+    %48 = tt.extern_elementwise %47 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32 // libdevice __nv_rsqrtf of the value above
+    %49 = tt.splat %48 : (f32) -> tensor<1xf32, #blocked1>
+    %50 = tt.splat %48 : (f32) -> tensor<256xf32, #blocked>
+    %51 = arith.mulf %41, %50 : tensor<256xf32, #blocked> // (value - mean) * rsqrt
+    %52 = arith.mulf %51, %17 : tensor<256xf32, #blocked> // ...times the arg4 values
+    %53 = arith.muli %0, %c256_i32 : i32
+    %54 = tt.splat %53 : (i32) -> tensor<256xi32, #blocked>
+    %55 = arith.addi %1, %54 : tensor<256xi32, #blocked> // output index = pid * 256 + offset
+    %56 = tt.splat %arg5 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
+    %57 = tt.addptr %56, %55 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
+    tt.store %57, %34, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked> // arg5[pid*256 + offset] = un-normalised sum %34
+    gpu.barrier
+    %58 = tt.addptr %arg0, %0 : !tt.ptr<f32, 1>, i32
+    %59 = tt.splat %58 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked1>
+    tt.store %59, %49 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32, #blocked1> // arg0[pid] = rsqrt value
+    %60 = tt.splat %arg7 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
+    %61 = tt.addptr %60, %55 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
+    %62 = arith.truncf %52 : tensor<256xf32, #blocked> to tensor<256xbf16, #blocked>
+    tt.store %61, %62, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16, #blocked> // arg7[pid*256 + offset] = scaled, normalised result as bf16
+    %63 = tt.addptr %arg6, %0 : !tt.ptr<f32, 1>, i32
+    %64 = tt.splat %63 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked1>
+    tt.store %64, %39 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32, #blocked1> // arg6[pid] = mean
+    tt.return
+  }
+}

.triton/dump/7dc5bb3e5c2bb99527fff34c6fba7810/triton_.llir ADDED Viewed

	@@ -0,0 +1,42 @@

+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+define void @triton__0d1de(ptr addrspace(1) %0, i32 %1) local_unnamed_addr !dbg !5 {
+  %3 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
+  %4 = and i32 %3, 127, !dbg !8
+  %5 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9
+  %6 = shl i32 %5, 7, !dbg !10
+  %7 = or i32 %6, %4, !dbg !11
+  %8 = icmp slt i32 %7, 512, !dbg !12
+  %9 = sext i32 %7 to i64, !dbg !13
+  %10 = getelementptr i64, ptr addrspace(1) %0, i64 %9, !dbg !13
+  tail call void asm sideeffect "@$2 st.global.b64 [ $1 + 0 ], { $0 };", "l,l,b"(i64 %9, ptr addrspace(1) %10, i1 %8) #1, !dbg !14
+  ret void, !dbg !15
+}
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
+attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #1 = { nounwind }
+!llvm.module.flags = !{!0}
+!llvm.dbg.cu = !{!1}
+!nvvm.annotations = !{!3, !4, !4, !3}
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!2 = !DIFile(filename: "cwxxgxdevnyc453z7hh4nxzgmvlhh6suwokktps3dw62btskgxt4.py", directory: "/tmp/torchinductor_root/wx")
+!3 = !{ptr @triton__0d1de, !"kernel", i32 1}
+!4 = !{ptr @triton__0d1de, !"maxntidx", i32 128}
+!5 = distinct !DISubprogram(name: "triton__0d1de", linkageName: "triton__0d1de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
+!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
+!7 = !{}
+!8 = !DILocation(line: 21, column: 36, scope: !5)
+!9 = !DILocation(line: 20, column: 28, scope: !5)
+!10 = !DILocation(line: 20, column: 33, scope: !5)
+!11 = !DILocation(line: 21, column: 23, scope: !5)
+!12 = !DILocation(line: 22, column: 21, scope: !5)
+!13 = !DILocation(line: 25, column: 25, scope: !5)
+!14 = !DILocation(line: 25, column: 36, scope: !5)
+!15 = !DILocation(line: 25, column: 4, scope: !5)

.triton/dump/7dc5bb3e5c2bb99527fff34c6fba7810/triton_.ttir ADDED Viewed

	@@ -0,0 +1,17 @@

+module {
+  tt.func public @triton__0d1de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+    %cst = arith.constant dense<512> : tensor<128xi32>
+    %c128_i32 = arith.constant 128 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c128_i32 : i32
+    %2 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32>
+    %3 = tt.splat %1 : (i32) -> tensor<128xi32>
+    %4 = arith.addi %3, %2 : tensor<128xi32>
+    %5 = arith.cmpi slt, %4, %cst : tensor<128xi32>
+    %6 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<128x!tt.ptr<i64, 1>>
+    %7 = tt.addptr %6, %4 : tensor<128x!tt.ptr<i64, 1>>, tensor<128xi32>
+    %8 = arith.extsi %4 : tensor<128xi32> to tensor<128xi64>
+    tt.store %7, %8, %5 {cache = 1 : i32, evict = 1 : i32} : tensor<128xi64>
+    tt.return
+  }
+}

.triton/dump/99f0a4c15ca0aab38ccdae6c765f7333/triton_.cubin ADDED Viewed

Binary file (15.2 kB). View file

.triton/dump/99f0a4c15ca0aab38ccdae6c765f7333/triton_.llir ADDED Viewed

	@@ -0,0 +1,333 @@

+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+@global_smem = external addrspace(3) global [0 x i8]
+@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
+define void @triton__0d1d2d3d4d5d6de7de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, i32 %6, i32 %7) local_unnamed_addr !dbg !7 {
+  %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
+  %10 = and i32 %9, 31, !dbg !10
+  %11 = lshr i32 %9, 5, !dbg !10
+  %12 = and i32 %11, 1, !dbg !10
+  %urem = shl i32 %9, 2, !dbg !10
+  %13 = and i32 %urem, 252, !dbg !10
+  %14 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !11
+  %15 = shl i32 %14, 8, !dbg !12
+  %16 = or i32 %15, %13, !dbg !13
+  %17 = sext i32 %16 to i64, !dbg !14
+  %18 = getelementptr float, ptr addrspace(1) %0, i64 %17, !dbg !14
+  %19 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %18, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !15
+  %20 = extractvalue { i32, i32, i32, i32 } %19, 0, !dbg !15
+  %21 = extractvalue { i32, i32, i32, i32 } %19, 1, !dbg !15
+  %22 = extractvalue { i32, i32, i32, i32 } %19, 2, !dbg !15
+  %23 = extractvalue { i32, i32, i32, i32 } %19, 3, !dbg !15
+  %24 = bitcast i32 %22 to float, !dbg !15
+  %25 = bitcast i32 %23 to float, !dbg !15
+  %26 = getelementptr i16, ptr addrspace(1) %1, i64 %17, !dbg !16
+  %27 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %26, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !17
+  %28 = extractvalue { i32, i32 } %27, 0, !dbg !17
+  %29 = extractvalue { i32, i32 } %27, 1, !dbg !17
+  %30 = trunc i32 %28 to i16, !dbg !17
+  %extelt.offset = lshr i32 %28, 16, !dbg !17
+  %31 = trunc i32 %extelt.offset to i16, !dbg !17
+  %32 = trunc i32 %29 to i16, !dbg !17
+  %extelt.offset1 = lshr i32 %29, 16, !dbg !17
+  %33 = trunc i32 %extelt.offset1 to i16, !dbg !17
+  %34 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %30) #6, !dbg !18
+  %35 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %31) #6, !dbg !18
+  %36 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %32) #6, !dbg !18
+  %37 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %33) #6, !dbg !18
+  %38 = getelementptr i16, ptr addrspace(1) %2, i64 %17, !dbg !19
+  %39 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %38, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !20
+  %40 = extractvalue { i32, i32 } %39, 0, !dbg !20
+  %41 = extractvalue { i32, i32 } %39, 1, !dbg !20
+  %42 = trunc i32 %40 to i16, !dbg !20
+  %extelt.offset2 = lshr i32 %40, 16, !dbg !20
+  %43 = trunc i32 %extelt.offset2 to i16, !dbg !20
+  %44 = trunc i32 %41 to i16, !dbg !20
+  %extelt.offset3 = lshr i32 %41, 16, !dbg !20
+  %45 = trunc i32 %extelt.offset3 to i16, !dbg !20
+  %46 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %42) #6, !dbg !21
+  %47 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %43) #6, !dbg !21
+  %48 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %44) #6, !dbg !21
+  %49 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %45) #6, !dbg !21
+  %50 = getelementptr i16, ptr addrspace(1) %3, i64 %17, !dbg !22
+  %51 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %50, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !23
+  %52 = extractvalue { i32, i32 } %51, 0, !dbg !23
+  %53 = extractvalue { i32, i32 } %51, 1, !dbg !23
+  %54 = trunc i32 %52 to i16, !dbg !23
+  %extelt.offset4 = lshr i32 %52, 16, !dbg !23
+  %55 = trunc i32 %extelt.offset4 to i16, !dbg !23
+  %56 = trunc i32 %53 to i16, !dbg !23
+  %extelt.offset5 = lshr i32 %53, 16, !dbg !23
+  %57 = trunc i32 %extelt.offset5 to i16, !dbg !23
+  %58 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %54) #6, !dbg !24
+  %59 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %55) #6, !dbg !24
+  %60 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %56) #6, !dbg !24
+  %61 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %57) #6, !dbg !24
+  %62 = zext nneg i32 %13 to i64, !dbg !25
+  %63 = getelementptr float, ptr addrspace(1) %4, i64 %62, !dbg !25
+  %64 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %63, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !26
+  %65 = fadd float %36, %24, !dbg !27
+  %66 = fadd float %37, %25, !dbg !27
+  %67 = fadd float %65, %48, !dbg !28
+  %68 = fadd float %66, %49, !dbg !28
+  %69 = insertelement <2 x i32> poison, i32 %20, i64 0, !dbg !15
+  %70 = insertelement <2 x i32> %69, i32 %21, i64 1, !dbg !15
+  %71 = bitcast <2 x i32> %70 to <2 x float>, !dbg !15
+  %72 = insertelement <2 x float> poison, float %34, i64 0, !dbg !27
+  %73 = insertelement <2 x float> %72, float %35, i64 1, !dbg !27
+  %74 = fadd <2 x float> %73, %71, !dbg !27
+  %75 = insertelement <2 x float> poison, float %46, i64 0, !dbg !28
+  %76 = insertelement <2 x float> %75, float %47, i64 1, !dbg !28
+  %77 = fadd <2 x float> %74, %76, !dbg !28
+  %78 = insertelement <2 x float> poison, float %58, i64 0, !dbg !29
+  %79 = insertelement <2 x float> %78, float %59, i64 1, !dbg !29
+  %80 = fadd <2 x float> %77, %79, !dbg !29
+  %81 = fadd float %67, %60, !dbg !29
+  %82 = fadd float %68, %61, !dbg !29
+  %83 = extractelement <2 x float> %80, i64 0, !dbg !30
+  %84 = extractelement <2 x float> %80, i64 1, !dbg !30
+  %85 = fadd float %83, %84, !dbg !30
+  %86 = fadd float %85, %81, !dbg !30
+  %87 = fadd float %86, %82, !dbg !30
+  %88 = bitcast float %87 to i32, !dbg !36
+  %89 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %88, i32 16, i32 31), !dbg !36
+  %90 = bitcast i32 %89 to float, !dbg !36
+  %91 = fadd float %87, %90, !dbg !30
+  %92 = bitcast float %91 to i32, !dbg !36
+  %93 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %92, i32 8, i32 31), !dbg !36
+  %94 = bitcast i32 %93 to float, !dbg !36
+  %95 = fadd float %91, %94, !dbg !30
+  %96 = bitcast float %95 to i32, !dbg !36
+  %97 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %96, i32 4, i32 31), !dbg !36
+  %98 = bitcast i32 %97 to float, !dbg !36
+  %99 = fadd float %95, %98, !dbg !30
+  %100 = bitcast float %99 to i32, !dbg !36
+  %101 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %100, i32 2, i32 31), !dbg !36
+  %102 = bitcast i32 %101 to float, !dbg !36
+  %103 = fadd float %99, %102, !dbg !30
+  %104 = bitcast float %103 to i32, !dbg !36
+  %105 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %104, i32 1, i32 31), !dbg !36
+  %106 = bitcast i32 %105 to float, !dbg !36
+  %107 = fadd float %103, %106, !dbg !30
+  %108 = icmp eq i32 %10, 0, !dbg !36
+  %109 = zext nneg i32 %12 to i64, !dbg !36
+  %110 = getelementptr float, ptr addrspace(3) @global_smem, i64 %109, !dbg !36
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %110, float %107, i1 %108) #6, !dbg !36
+  tail call void @llvm.nvvm.barrier0(), !dbg !36
+  %111 = icmp slt i32 %9, 2, !dbg !36
+  %112 = sext i32 %9 to i64, !dbg !36
+  %113 = getelementptr float, ptr addrspace(3) @global_smem, i64 %112, !dbg !36
+  %114 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %113, i1 %111) #6, !dbg !36
+  %115 = bitcast float %114 to i32, !dbg !36
+  %116 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %115, i32 1, i32 31), !dbg !36
+  %117 = bitcast i32 %116 to float, !dbg !36
+  %118 = fadd float %114, %117, !dbg !30
+  %119 = and i32 %9, 1, !dbg !36
+  %120 = icmp eq i32 %119, 0, !dbg !36
+  %121 = and i1 %111, %120, !dbg !36
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %113, float %118, i1 %121) #6, !dbg !36
+  tail call void @llvm.nvvm.barrier0(), !dbg !36
+  %122 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !36
+  %123 = fadd float %122, 0.000000e+00, !dbg !38
+  %124 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %123, float 2.560000e+02) #6, !dbg !42
+  %125 = fsub float %83, %124, !dbg !43
+  %126 = fsub float %84, %124, !dbg !43
+  %127 = fsub float %81, %124, !dbg !43
+  %128 = fsub float %82, %124, !dbg !43
+  %129 = fmul float %125, %125, !dbg !44
+  %130 = fmul float %126, %126, !dbg !44
+  %131 = fmul float %127, %127, !dbg !44
+  %132 = fmul float %128, %128, !dbg !44
+  tail call void @llvm.nvvm.barrier0(), !dbg !45
+  %133 = fadd float %129, %130, !dbg !47
+  %134 = fadd float %131, %133, !dbg !47
+  %135 = fadd float %132, %134, !dbg !47
+  %136 = bitcast float %135 to i32, !dbg !45
+  %137 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %136, i32 16, i32 31), !dbg !45
+  %138 = bitcast i32 %137 to float, !dbg !45
+  %139 = fadd float %135, %138, !dbg !47
+  %140 = bitcast float %139 to i32, !dbg !45
+  %141 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %140, i32 8, i32 31), !dbg !45
+  %142 = bitcast i32 %141 to float, !dbg !45
+  %143 = fadd float %139, %142, !dbg !47
+  %144 = bitcast float %143 to i32, !dbg !45
+  %145 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %144, i32 4, i32 31), !dbg !45
+  %146 = bitcast i32 %145 to float, !dbg !45
+  %147 = fadd float %143, %146, !dbg !47
+  %148 = bitcast float %147 to i32, !dbg !45
+  %149 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %148, i32 2, i32 31), !dbg !45
+  %150 = bitcast i32 %149 to float, !dbg !45
+  %151 = fadd float %147, %150, !dbg !47
+  %152 = bitcast float %151 to i32, !dbg !45
+  %153 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %152, i32 1, i32 31), !dbg !45
+  %154 = bitcast i32 %153 to float, !dbg !45
+  %155 = fadd float %151, %154, !dbg !47
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %110, float %155, i1 %108) #6, !dbg !45
+  tail call void @llvm.nvvm.barrier0(), !dbg !45
+  %156 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %113, i1 %111) #6, !dbg !45
+  %157 = bitcast float %156 to i32, !dbg !45
+  %158 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %157, i32 1, i32 31), !dbg !45
+  %159 = bitcast i32 %158 to float, !dbg !45
+  %160 = fadd float %156, %159, !dbg !47
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %113, float %160, i1 %121) #6, !dbg !45
+  tail call void @llvm.nvvm.barrier0(), !dbg !45
+  %161 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !45
+  %162 = fadd float %161, 0.000000e+00, !dbg !50
+  %163 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %162, float 2.560000e+02) #6, !dbg !52
+  %164 = fadd float %163, 0x3EE4F8B580000000, !dbg !53
+  %165 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !54
+  %.not.i = icmp eq i32 %165, 0, !dbg !54
+  br i1 %.not.i, label %168, label %166, !dbg !54
+166:                                              ; preds = %8
+  %167 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %164), !dbg !54
+  br label %__nv_rsqrtf.exit, !dbg !54
+168:                                              ; preds = %8
+  %169 = tail call float @llvm.nvvm.rsqrt.approx.f(float %164), !dbg !54
+  br label %__nv_rsqrtf.exit, !dbg !54
+__nv_rsqrtf.exit:                                 ; preds = %166, %168
+  %.0.i = phi float [ %167, %166 ], [ %169, %168 ], !dbg !54
+  %170 = extractvalue { i32, i32, i32, i32 } %64, 3, !dbg !26
+  %171 = bitcast i32 %170 to float, !dbg !26
+  %172 = extractvalue { i32, i32, i32, i32 } %64, 2, !dbg !26
+  %173 = bitcast i32 %172 to float, !dbg !26
+  %174 = extractvalue { i32, i32, i32, i32 } %64, 1, !dbg !26
+  %175 = bitcast i32 %174 to float, !dbg !26
+  %176 = extractvalue { i32, i32, i32, i32 } %64, 0, !dbg !26
+  %177 = bitcast i32 %176 to float, !dbg !26
+  %178 = fmul float %125, %.0.i, !dbg !55
+  %179 = fmul float %126, %.0.i, !dbg !55
+  %180 = fmul float %127, %.0.i, !dbg !55
+  %181 = fmul float %128, %.0.i, !dbg !55
+  %182 = fmul float %178, %177, !dbg !56
+  %183 = fmul float %179, %175, !dbg !56
+  %184 = fmul float %180, %173, !dbg !56
+  %185 = fmul float %181, %171, !dbg !56
+  %186 = getelementptr i16, ptr addrspace(1) %5, i64 %17, !dbg !57
+  %187 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %182) #6, !dbg !58
+  %188 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %183) #6, !dbg !58
+  %189 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %184) #6, !dbg !58
+  %190 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %185) #6, !dbg !58
+  %191 = insertelement <2 x i16> undef, i16 %187, i64 0, !dbg !58
+  %192 = insertelement <2 x i16> %191, i16 %188, i64 1, !dbg !58
+  %193 = bitcast <2 x i16> %192 to i32, !dbg !58
+  %194 = insertelement <2 x i16> undef, i16 %189, i64 0, !dbg !58
+  %195 = insertelement <2 x i16> %194, i16 %190, i64 1, !dbg !58
+  %196 = bitcast <2 x i16> %195 to i32, !dbg !58
+  tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %193, i32 %196, ptr addrspace(1) %186, i1 true) #6, !dbg !58
+  ret void, !dbg !59
+}
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier0() #2
+; Function Attrs: alwaysinline nounwind
+define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
+  %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
+  %.not = icmp eq i32 %1, 0
+  br i1 %.not, label %4, label %2
+2:                                                ; preds = %0
+  %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
+  br label %6
+4:                                                ; preds = %0
+  %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
+  br label %6
+6:                                                ; preds = %4, %2
+  %.0 = phi float [ %3, %2 ], [ %5, %4 ]
+  ret float %.0
+}
+declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.f(float) #5
+attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+attributes #2 = { convergent nocallback nounwind }
+attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #6 = { nounwind }
+!llvm.module.flags = !{!0, !1}
+!llvm.dbg.cu = !{!2}
+!nvvm.annotations = !{!4, !5, !5, !4}
+!llvm.ident = !{!6}
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!3 = !DIFile(filename: "cpwl4wgyi5spzbgbswrqxfrxlyk2m76a4bakbp6l5ltopjbkjadt.py", directory: "/tmp/torchinductor_root/pw")
+!4 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"kernel", i32 1}
+!5 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"maxntidx", i32 64}
+!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
+!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6de7de", linkageName: "triton__0d1d2d3d4d5d6de7de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
+!9 = !{}
+!10 = !DILocation(line: 26, column: 26, scope: !7)
+!11 = !DILocation(line: 23, column: 28, scope: !7)
+!12 = !DILocation(line: 30, column: 40, scope: !7)
+!13 = !DILocation(line: 30, column: 36, scope: !7)
+!14 = !DILocation(line: 30, column: 30, scope: !7)
+!15 = !DILocation(line: 30, column: 46, scope: !7)
+!16 = !DILocation(line: 31, column: 30, scope: !7)
+!17 = !DILocation(line: 31, column: 46, scope: !7)
+!18 = !DILocation(line: 31, column: 67, scope: !7)
+!19 = !DILocation(line: 32, column: 30, scope: !7)
+!20 = !DILocation(line: 32, column: 46, scope: !7)
+!21 = !DILocation(line: 32, column: 67, scope: !7)
+!22 = !DILocation(line: 33, column: 30, scope: !7)
+!23 = !DILocation(line: 33, column: 46, scope: !7)
+!24 = !DILocation(line: 33, column: 67, scope: !7)
+!25 = !DILocation(line: 34, column: 31, scope: !7)
+!26 = !DILocation(line: 34, column: 36, scope: !7)
+!27 = !DILocation(line: 36, column: 18, scope: !7)
+!28 = !DILocation(line: 38, column: 18, scope: !7)
+!29 = !DILocation(line: 40, column: 18, scope: !7)
+!30 = !DILocation(line: 233, column: 15, scope: !31, inlinedAt: !34)
+!31 = distinct !DILexicalBlockFile(scope: !33, file: !32, discriminator: 0)
+!32 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
+!33 = distinct !DILexicalBlockFile(scope: !7, file: !32, discriminator: 0)
+!34 = !DILocation(line: 243, column: 36, scope: !31, inlinedAt: !35)
+!35 = !DILocation(line: 45, column: 59, scope: !31)
+!36 = !DILocation(line: 243, column: 36, scope: !33, inlinedAt: !37)
+!37 = !DILocation(line: 45, column: 59, scope: !33)
+!38 = !DILocation(line: 8, column: 15, scope: !39, inlinedAt: !41)
+!39 = distinct !DILexicalBlockFile(scope: !7, file: !40, discriminator: 0)
+!40 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
+!41 = !DILocation(line: 45, column: 45, scope: !39)
+!42 = !DILocation(line: 48, column: 20, scope: !7)
+!43 = !DILocation(line: 49, column: 20, scope: !7)
+!44 = !DILocation(line: 50, column: 20, scope: !7)
+!45 = !DILocation(line: 243, column: 36, scope: !33, inlinedAt: !46)
+!46 = !DILocation(line: 53, column: 59, scope: !33)
+!47 = !DILocation(line: 233, column: 15, scope: !31, inlinedAt: !48)
+!48 = !DILocation(line: 243, column: 36, scope: !31, inlinedAt: !49)
+!49 = !DILocation(line: 53, column: 59, scope: !31)
+!50 = !DILocation(line: 8, column: 15, scope: !39, inlinedAt: !51)
+!51 = !DILocation(line: 53, column: 45, scope: !39)
+!52 = !DILocation(line: 56, column: 20, scope: !7)
+!53 = !DILocation(line: 58, column: 20, scope: !7)
+!54 = !DILocation(line: 59, column: 26, scope: !7)
+!55 = !DILocation(line: 60, column: 20, scope: !7)
+!56 = !DILocation(line: 61, column: 20, scope: !7)
+!57 = !DILocation(line: 63, column: 25, scope: !7)
+!58 = !DILocation(line: 63, column: 48, scope: !7)
+!59 = !DILocation(line: 63, column: 4, scope: !7)

.triton/dump/99f0a4c15ca0aab38ccdae6c765f7333/triton_.ttgir ADDED Viewed

	@@ -0,0 +1,68 @@

+#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
+module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
+  tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+    %cst = arith.constant dense<256> : tensor<256xi32, #blocked>
+    %cst_0 = arith.constant 9.99999974E-6 : f32
+    %cst_1 = arith.constant 2.560000e+02 : f32
+    %cst_2 = arith.constant 0.000000e+00 : f32
+    %c256_i32 = arith.constant 256 : i32
+    %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked>
+    %cst_4 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked>
+    %0 = tt.get_program_id x : i32
+    %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked>
+    %2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked>
+    %3 = arith.muli %0, %c256_i32 : i32
+    %4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked>
+    %5 = arith.addi %1, %4 : tensor<256xi32, #blocked>
+    %6 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
+    %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
+    %8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
+    %9 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
+    %10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
+    %11 = tt.load %10, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
+    %12 = arith.extf %11 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
+    %13 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
+    %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
+    %15 = tt.load %14, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
+    %16 = arith.extf %15 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
+    %17 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
+    %18 = tt.addptr %17, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
+    %19 = tt.load %18, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
+    %20 = arith.extf %19 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
+    %21 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
+    %22 = tt.addptr %21, %1 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
+    %23 = tt.load %22, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked>
+    %24 = arith.addf %8, %12 : tensor<256xf32, #blocked>
+    %25 = arith.addf %24, %16 : tensor<256xf32, #blocked>
+    %26 = arith.addf %25, %20 : tensor<256xf32, #blocked>
+    %27 = arith.select %2, %26, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
+    %28 = "tt.reduce"(%27) <{axis = 0 : i32}> ({
+    ^bb0(%arg8: f32, %arg9: f32):
+      %46 = arith.addf %arg8, %arg9 : f32
+      tt.reduce.return %46 : f32
+    }) : (tensor<256xf32, #blocked>) -> f32
+    %29 = arith.addf %28, %cst_2 : f32
+    %30 = arith.divf %29, %cst_1 : f32
+    %31 = tt.splat %30 : (f32) -> tensor<256xf32, #blocked>
+    %32 = arith.subf %26, %31 : tensor<256xf32, #blocked>
+    %33 = arith.mulf %32, %32 : tensor<256xf32, #blocked>
+    %34 = arith.select %2, %33, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
+    %35 = "tt.reduce"(%34) <{axis = 0 : i32}> ({
+    ^bb0(%arg8: f32, %arg9: f32):
+      %46 = arith.addf %arg8, %arg9 : f32
+      tt.reduce.return %46 : f32
+    }) : (tensor<256xf32, #blocked>) -> f32
+    %36 = arith.addf %35, %cst_2 : f32
+    %37 = arith.divf %36, %cst_1 : f32
+    %38 = arith.addf %37, %cst_0 : f32
+    %39 = tt.extern_elementwise %38 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
+    %40 = tt.splat %39 : (f32) -> tensor<256xf32, #blocked>
+    %41 = arith.mulf %32, %40 : tensor<256xf32, #blocked>
+    %42 = arith.mulf %41, %23 : tensor<256xf32, #blocked>
+    %43 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
+    %44 = tt.addptr %43, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
+    %45 = arith.truncf %42 : tensor<256xf32, #blocked> to tensor<256xbf16, #blocked>
+    tt.store %44, %45, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16, #blocked>
+    tt.return
+  }
+}

.triton/dump/9a2fb05196b13393bea452d08e9aaca8/triton_.llir ADDED Viewed

	@@ -0,0 +1,54 @@

+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !5 {
+  %4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
+  %5 = shl i32 %4, 1, !dbg !8
+  %6 = and i32 %5, 510, !dbg !8
+  %7 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9
+  %8 = shl i32 %7, 9, !dbg !10
+  %9 = or i32 %8, %6, !dbg !11
+  %10 = sext i32 %9 to i64, !dbg !12
+  %11 = getelementptr float, ptr addrspace(1) %0, i64 %10, !dbg !12
+  %12 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];", "=r,=r,l,b"(ptr addrspace(1) %11, i1 true) #1, !dbg !13
+  %13 = extractvalue { i32, i32 } %12, 0, !dbg !13
+  %14 = extractvalue { i32, i32 } %12, 1, !dbg !13
+  %15 = bitcast i32 %13 to float, !dbg !13
+  %16 = bitcast i32 %14 to float, !dbg !13
+  %17 = getelementptr i16, ptr addrspace(1) %1, i64 %10, !dbg !14
+  %18 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %15) #1, !dbg !15
+  %19 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %16) #1, !dbg !15
+  %20 = insertelement <2 x i16> undef, i16 %18, i64 0, !dbg !15
+  %21 = insertelement <2 x i16> %20, i16 %19, i64 1, !dbg !15
+  %22 = bitcast <2 x i16> %21 to i32, !dbg !15
+  tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %22, ptr addrspace(1) %17, i1 true) #1, !dbg !15
+  ret void, !dbg !16
+}
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
+attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #1 = { nounwind }
+!llvm.module.flags = !{!0}
+!llvm.dbg.cu = !{!1}
+!nvvm.annotations = !{!3, !4, !4, !3}
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!2 = !DIFile(filename: "c5tryp5qwkhreijk7s5x327wofz54lwj4kvctuqdzv2vrf2xyons.py", directory: "/tmp/torchinductor_root/5t")
+!3 = !{ptr @triton__0d1d2de, !"kernel", i32 1}
+!4 = !{ptr @triton__0d1d2de, !"maxntidx", i32 256}
+!5 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
+!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
+!7 = !{}
+!8 = !DILocation(line: 21, column: 36, scope: !5)
+!9 = !DILocation(line: 20, column: 28, scope: !5)
+!10 = !DILocation(line: 20, column: 33, scope: !5)
+!11 = !DILocation(line: 21, column: 23, scope: !5)
+!12 = !DILocation(line: 24, column: 30, scope: !5)
+!13 = !DILocation(line: 24, column: 35, scope: !5)
+!14 = !DILocation(line: 26, column: 25, scope: !5)
+!15 = !DILocation(line: 26, column: 36, scope: !5)
+!16 = !DILocation(line: 26, column: 4, scope: !5)

.triton/dump/9a2fb05196b13393bea452d08e9aaca8/triton_.ttgir ADDED Viewed

	@@ -0,0 +1,19 @@

+// Layout used by every tensor below: 2 contiguous elements per thread,
+// 32 threads per warp, 8 warps per CTA.
+#blocked = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
+module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
+  // Element-wise f32 -> bf16 conversion: reads 512 values from %arg0 and writes
+  // the truncated values to %arg1. %arg2 is not used in the body.
+  tt.func public @triton__0d1d2de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+    %c512_i32 = arith.constant 512 : i32
+    // Element indices handled by this program: program_id * 512 + [0, 512).
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c512_i32 : i32
+    %2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked>
+    %3 = tt.splat %1 : (i32) -> tensor<512xi32, #blocked>
+    %4 = arith.addi %3, %2 : tensor<512xi32, #blocked>
+    // The load carries no mask operand.
+    %5 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<512x!tt.ptr<f32, 1>, #blocked>
+    %6 = tt.addptr %5, %4 : tensor<512x!tt.ptr<f32, 1>, #blocked>, tensor<512xi32, #blocked>
+    %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xf32, #blocked>
+    // Same indices are reused for the bf16 output pointer; the store is unmasked too.
+    %8 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<512x!tt.ptr<bf16, 1>, #blocked>
+    %9 = tt.addptr %8, %4 : tensor<512x!tt.ptr<bf16, 1>, #blocked>, tensor<512xi32, #blocked>
+    %10 = arith.truncf %7 : tensor<512xf32, #blocked> to tensor<512xbf16, #blocked>
+    tt.store %9, %10 {cache = 1 : i32, evict = 1 : i32} : tensor<512xbf16, #blocked>
+    tt.return
+  }
+}

.triton/dump/a69784da01a97187168f22847465505f/triton_.cubin ADDED Viewed

Binary file (15 kB). View file

.triton/dump/a69784da01a97187168f22847465505f/triton_.ttgir ADDED Viewed

	@@ -0,0 +1,73 @@

+// Layouts: #blocked gives each thread 4 contiguous elements, #blocked1 gives
+// each thread 1 element; both use 32 threads per warp and 2 warps per CTA.
+#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
+#blocked1 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
+module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
+  // Per program: sums three 256-element inputs (%arg1 f32, %arg2 and %arg3 bf16),
+  // subtracts the mean, scales by rsqrt(variance + %cst_0) and by the f32 vector
+  // at %arg4, then writes the bf16 result to %arg6. The rsqrt value goes to
+  // %arg0[program_id] and the mean to %arg5[program_id].
+  // %arg7 and %arg8 are not used in the body.
+  // NOTE(review): the arithmetic looks like a fused add + layer normalisation
+  // without a bias term; this is inferred from the ops only, confirm against the
+  // generating Python kernel.
+  tt.func public @triton__0d1d2d3d4d5d6d7de8de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+    %cst = arith.constant dense<256> : tensor<256xi32, #blocked>
+    %cst_0 = arith.constant 9.99999974E-6 : f32
+    %cst_1 = arith.constant 2.560000e+02 : f32
+    %cst_2 = arith.constant 0.000000e+00 : f32
+    %c256_i32 = arith.constant 256 : i32
+    %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked>
+    %cst_4 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked>
+    %0 = tt.get_program_id x : i32
+    %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked>
+    // Mask: range value < 256. The range is [0, 256), so it holds for every lane.
+    %2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked>
+    // %5 = in-row index + program_id * 256: offsets into the per-program inputs.
+    %3 = arith.muli %0, %c256_i32 : i32
+    %4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked>
+    %5 = arith.addi %1, %4 : tensor<256xi32, #blocked>
+    %6 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
+    %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
+    %8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
+    // The two bf16 inputs are widened to f32 before any arithmetic.
+    %9 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
+    %10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
+    %11 = tt.load %10, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
+    %12 = arith.extf %11 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
+    %13 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
+    %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
+    %15 = tt.load %14, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
+    %16 = arith.extf %15 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
+    // %arg4 is indexed by %1 only (no program offset), so every program reads
+    // the same 256 values; this load uses evict = 3 instead of 1.
+    %17 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
+    %18 = tt.addptr %17, %1 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
+    %19 = tt.load %18, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked>
+    // %21 = sum of the three inputs.
+    %20 = arith.addf %8, %12 : tensor<256xf32, #blocked>
+    %21 = arith.addf %20, %16 : tensor<256xf32, #blocked>
+    // Mean: add-reduce %21 over the 256 elements, then divide by 256.0.
+    %22 = arith.select %2, %21, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
+    %23 = "tt.reduce"(%22) <{axis = 0 : i32}> ({
+    ^bb0(%arg9: f32, %arg10: f32):
+      %47 = arith.addf %arg9, %arg10 : f32
+      tt.reduce.return %47 : f32
+    }) : (tensor<256xf32, #blocked>) -> f32
+    %24 = arith.addf %23, %cst_2 : f32
+    %25 = arith.divf %24, %cst_1 : f32
+    %26 = tt.splat %25 : (f32) -> tensor<1xf32, #blocked1>
+    %27 = tt.splat %25 : (f32) -> tensor<256xf32, #blocked>
+    // Variance: add-reduce the squared deviations from the mean, divide by 256.0.
+    %28 = arith.subf %21, %27 : tensor<256xf32, #blocked>
+    %29 = arith.mulf %28, %28 : tensor<256xf32, #blocked>
+    %30 = arith.select %2, %29, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
+    %31 = "tt.reduce"(%30) <{axis = 0 : i32}> ({
+    ^bb0(%arg9: f32, %arg10: f32):
+      %47 = arith.addf %arg9, %arg10 : f32
+      tt.reduce.return %47 : f32
+    }) : (tensor<256xf32, #blocked>) -> f32
+    %32 = arith.addf %31, %cst_2 : f32
+    %33 = arith.divf %32, %cst_1 : f32
+    // %35 = rsqrt(variance + %cst_0), computed by libdevice's __nv_rsqrtf.
+    %34 = arith.addf %33, %cst_0 : f32
+    %35 = tt.extern_elementwise %34 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
+    %36 = tt.splat %35 : (f32) -> tensor<1xf32, #blocked1>
+    %37 = tt.splat %35 : (f32) -> tensor<256xf32, #blocked>
+    // Normalise the deviations and scale them element-wise by the %arg4 values.
+    %38 = arith.mulf %28, %37 : tensor<256xf32, #blocked>
+    %39 = arith.mulf %38, %19 : tensor<256xf32, #blocked>
+    gpu.barrier
+    // Outputs: rsqrt -> %arg0[program_id], bf16 result -> %arg6[%5],
+    // mean -> %arg5[program_id].
+    %40 = tt.addptr %arg0, %0 : !tt.ptr<f32, 1>, i32
+    %41 = tt.splat %40 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked1>
+    tt.store %41, %36 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32, #blocked1>
+    %42 = tt.splat %arg6 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
+    %43 = tt.addptr %42, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
+    %44 = arith.truncf %39 : tensor<256xf32, #blocked> to tensor<256xbf16, #blocked>
+    tt.store %43, %44, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16, #blocked>
+    %45 = tt.addptr %arg5, %0 : !tt.ptr<f32, 1>, i32
+    %46 = tt.splat %45 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked1>
+    tt.store %46, %26 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32, #blocked1>
+    tt.return
+  }
+}

.triton/dump/a75e14a8d2d1ec8471f1c7b615552f8c/triton_.cubin ADDED Viewed

Binary file (4.9 kB). View file

.triton/dump/a75e14a8d2d1ec8471f1c7b615552f8c/triton_.ptx ADDED Viewed

	@@ -0,0 +1,295 @@

+//
+// Generated by LLVM NVPTX Back-End
+//
+.version 8.2
+.target sm_89
+.address_size 64
+	// .globl	triton__0d1d2de
+	// Kernel: each thread loads two adjacent 32-bit floats from param_0,
+	// converts both to bf16 and stores the packed pair to param_1.
+	// param_2 is declared but never loaded. Launch bound: 128 threads in x.
+.visible .entry triton__0d1d2de(
+	.param .u64 triton__0d1d2de_param_0,
+	.param .u64 triton__0d1d2de_param_1,
+	.param .u32 triton__0d1d2de_param_2
+)
+.maxntid 128, 1, 1
+{
+	.reg .pred 	%p<3>;
+	.reg .b16 	%rs<3>;
+	.reg .b32 	%r<13>;
+	.reg .b64 	%rd<7>;
+	.loc	1 18 0
+$L__func_begin0:
+	.loc	1 18 0
+	ld.param.u64 	%rd3, [triton__0d1d2de_param_0];
+	ld.param.u64 	%rd4, [triton__0d1d2de_param_1];
+$L__tmp0:
+	.loc	1 21 36
+	// %r9 = (tid.x << 1) & 254: even in-block element offset in 0..254.
+	mov.u32 	%r7, %tid.x;
+	shl.b32 	%r8, %r7, 1;
+	and.b32  	%r9, %r8, 254;
+	.loc	1 20 28
+	mov.u32 %r1, %ctaid.x;
+	.loc	1 20 33
+	// %r10 = ctaid.x << 8: 256 elements per CTA.
+	shl.b32 	%r10, %r1, 8;
+	.loc	1 21 23
+	or.b32  	%r11, %r10, %r9;
+	.loc	1 24 30
+	// Input address = param_0 + index * 4 bytes.
+	mul.wide.s32 	%rd5, %r11, 4;
+	add.s64 	%rd1, %rd3, %rd5;
+	// %p1 is set to constant true, so the guarded load and store always run.
+	mov.pred 	%p1, -1;
+	.loc	1 24 35
+	mov.u32 %r4, 0x0;
+	mov.u32 %r5, 0x0;
+	@%p1 ld.global.v2.b32 { %r4, %r5 }, [ %rd1 + 0 ];
+	.loc	1 26 25
+	// Output address = param_1 + index * 2 bytes.
+	mul.wide.s32 	%rd6, %r11, 2;
+	add.s64 	%rd2, %rd4, %rd6;
+	.loc	1 26 36
+	// Convert both values to bf16 and pack them into one 32-bit register.
+	cvt.rn.bf16.f32 %rs1, %r4;
+	cvt.rn.bf16.f32 %rs2, %r5;
+	mov.b32 	%r12, {%rs1, %rs2};
+	@%p1 st.global.b32 [ %rd2 + 0 ], { %r12 };
+	.loc	1 26 4
+	ret;
+$L__tmp1:
+$L__func_end0:
+}
+	.file	1 "/tmp/torchinductor_root/pq/cpqhcwm5bfrhuwddh4c4qks6bh7sovfbpfnmqhnm4h4w23icqnu6.py"
+	.section	.debug_abbrev
+	{
+.b8 1
+.b8 17
+.b8 1
+.b8 37
+.b8 8
+.b8 19
+.b8 5
+.b8 3
+.b8 8
+.b8 16
+.b8 6
+.b8 27
+.b8 8
+.b8 180
+.b8 66
+.b8 12
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+.b8 0
+.b8 2
+.b8 46
+.b8 0
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 64
+.b8 10
+.b8 135
+.b8 64
+.b8 8
+.b8 3
+.b8 8
+.b8 58
+.b8 11
+.b8 59
+.b8 11
+.b8 63
+.b8 12
+.b8 0
+.b8 0
+.b8 0
+	}
+	.section	.debug_info
+	{
+.b32 176
+.b8 2
+.b8 0
+.b32 .debug_abbrev
+.b8 8
+.b8 1
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2
+.b8 0
+.b8 99
+.b8 112
+.b8 113
+.b8 104
+.b8 99
+.b8 119
+.b8 109
+.b8 53
+.b8 98
+.b8 102
+.b8 114
+.b8 104
+.b8 117
+.b8 119
+.b8 100
+.b8 100
+.b8 104
+.b8 52
+.b8 99
+.b8 52
+.b8 113
+.b8 107
+.b8 115
+.b8 54
+.b8 98
+.b8 104
+.b8 55
+.b8 115
+.b8 111
+.b8 118
+.b8 102
+.b8 98
+.b8 112
+.b8 102
+.b8 110
+.b8 109
+.b8 113
+.b8 104
+.b8 110
+.b8 109
+.b8 52
+.b8 104
+.b8 52
+.b8 119
+.b8 50
+.b8 51
+.b8 105
+.b8 99
+.b8 113
+.b8 110
+.b8 117
+.b8 54
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line
+.b8 47
+.b8 116
+.b8 109
+.b8 112
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 95
+.b8 114
+.b8 111
+.b8 111
+.b8 116
+.b8 47
+.b8 112
+.b8 113
+.b8 0
+.b8 1
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 2
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 1
+.b8 156
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 101
+.b8 0
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 101
+.b8 0
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+	}
+	.section	.debug_pubnames
+	{
+.b32 $L__pubNames_end0-$L__pubNames_start0
+$L__pubNames_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 180
+.b32 125
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 101
+.b8 0
+.b32 0
+$L__pubNames_end0:
+	}
+	.section	.debug_pubtypes
+	{
+.b32 $L__pubTypes_end0-$L__pubTypes_start0
+$L__pubTypes_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 180
+.b32 0
+$L__pubTypes_end0:
+	}
+	.section	.debug_loc	{	}

.triton/dump/a75e14a8d2d1ec8471f1c7b615552f8c/triton_.ttir ADDED Viewed

	@@ -0,0 +1,18 @@

+module {
+  // Element-wise f32 -> bf16 conversion: reads 256 values from %arg0 and writes
+  // the truncated values to %arg1. %arg2 is not used in the body.
+  tt.func public @triton__0d1d2de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+    %c256_i32 = arith.constant 256 : i32
+    // Element indices handled by this program: program_id * 256 + [0, 256).
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c256_i32 : i32
+    %2 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
+    %3 = tt.splat %1 : (i32) -> tensor<256xi32>
+    %4 = arith.addi %3, %2 : tensor<256xi32>
+    // The load carries no mask operand.
+    %5 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
+    %6 = tt.addptr %5, %4 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
+    %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32>
+    // Same indices are reused for the bf16 output pointer; the store is unmasked too.
+    %8 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
+    %9 = tt.addptr %8, %4 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
+    %10 = arith.truncf %7 : tensor<256xf32> to tensor<256xbf16>
+    tt.store %9, %10 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16>
+    tt.return
+  }
+}

.triton/dump/ab89f85e55d5ddd9676325b49df9419f/triton_.ptx ADDED Viewed

	@@ -0,0 +1,717 @@

+//
+// Generated by LLVM NVPTX Back-End
+//
+.version 8.2
+.target sm_89
+.address_size 64
+	// .globl	triton__0d1d2d3d4d5d6de7de
+.extern .shared .align 1 .b8 global_smem[];
+.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
+	// Kernel: per CTA, adds 256 f32 values (param_1) to 256 bf16 values
+	// (param_2), computes the mean and the mean squared deviation of the sums,
+	// scales the deviations by rsqrt(mean squared deviation + constant) and by
+	// the f32 vector at param_3, and stores the result as bf16 to param_5.
+	// The rsqrt value is stored to param_0[ctaid.x] and the mean to
+	// param_4[ctaid.x]. param_6 and param_7 are declared but never loaded.
+	// Launch bound: 64 threads in x, each handling 4 elements.
+.visible .entry triton__0d1d2d3d4d5d6de7de(
+	.param .u64 triton__0d1d2d3d4d5d6de7de_param_0,
+	.param .u64 triton__0d1d2d3d4d5d6de7de_param_1,
+	.param .u64 triton__0d1d2d3d4d5d6de7de_param_2,
+	.param .u64 triton__0d1d2d3d4d5d6de7de_param_3,
+	.param .u64 triton__0d1d2d3d4d5d6de7de_param_4,
+	.param .u64 triton__0d1d2d3d4d5d6de7de_param_5,
+	.param .u32 triton__0d1d2d3d4d5d6de7de_param_6,
+	.param .u32 triton__0d1d2d3d4d5d6de7de_param_7
+)
+.maxntid 64, 1, 1
+{
+	.reg .pred 	%p<25>;
+	.reg .b16 	%rs<9>;
+	.reg .b32 	%r<87>;
+	.reg .f32 	%f<70>;
+	.reg .b64 	%rd<17>;
+	.loc	1 18 0
+$L__func_begin0:
+	.loc	1 18 0
+	ld.param.u64 	%rd7, [triton__0d1d2d3d4d5d6de7de_param_0];
+	ld.param.u64 	%rd8, [triton__0d1d2d3d4d5d6de7de_param_1];
+$L__tmp0:
+	.loc	1 26 26
+	// %r53 = tid & 31 (lane within the warp); %r55 = (tid & 63) << 2 is this
+	// thread's first element offset inside the 256-element row.
+	mov.u32 	%r52, %tid.x;
+	and.b32  	%r53, %r52, 31;
+	ld.param.u64 	%rd9, [triton__0d1d2d3d4d5d6de7de_param_2];
+	ld.param.u64 	%rd10, [triton__0d1d2d3d4d5d6de7de_param_3];
+	ld.param.u64 	%rd11, [triton__0d1d2d3d4d5d6de7de_param_4];
+	and.b32  	%r54, %r52, 63;
+	ld.param.u64 	%rd12, [triton__0d1d2d3d4d5d6de7de_param_5];
+	shl.b32 	%r55, %r54, 2;
+	.loc	1 23 28
+	mov.u32 %r1, %ctaid.x;
+	.loc	1 30 40
+	// %r57 = (ctaid.x << 8) | in-row offset: global element index.
+	shl.b32 	%r56, %r1, 8;
+	.loc	1 30 36
+	or.b32  	%r57, %r56, %r55;
+	.loc	1 30 30
+	mul.wide.s32 	%rd13, %r57, 4;
+	add.s64 	%rd1, %rd8, %rd13;
+	mov.b32 	%r6, 0;
+	// %p1 is constant true, so every "@!%p1" fallback move below is never taken.
+	mov.pred 	%p1, -1;
+	.loc	1 30 46
+	// Load 4 f32 values from param_1.
+	mov.u32 %r2, 0x0;
+	mov.u32 %r3, 0x0;
+	mov.u32 %r4, 0x0;
+	mov.u32 %r5, 0x0;
+	@%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ];
+	@!%p1 mov.u32 %r2, %r6;
+	@!%p1 mov.u32 %r3, %r6;
+	@!%p1 mov.u32 %r4, %r6;
+	@!%p1 mov.u32 %r5, %r6;
+	mov.b32 	%f1, %r2;
+	mov.b32 	%f2, %r3;
+	mov.b32 	%f3, %r4;
+	mov.b32 	%f4, %r5;
+	.loc	1 31 30
+	mul.wide.s32 	%rd14, %r57, 2;
+	add.s64 	%rd2, %rd9, %rd14;
+	.loc	1 31 46
+	// Load 4 bf16 values from param_2 as two 32-bit words, then split each
+	// word into its low and high 16-bit halves.
+	mov.u32 %r10, 0x0;
+	mov.u32 %r11, 0x0;
+	@%p1 ld.global.v2.b32 { %r10, %r11 }, [ %rd2 + 0 ];
+	@!%p1 mov.u32 %r10, %r6;
+	@!%p1 mov.u32 %r11, %r6;
+	cvt.u16.u32 	%rs1, %r10;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r10; }
+	cvt.u16.u32 	%rs3, %r11;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r11; }
+	.loc	1 31 67
+	// Widen the four bf16 values to f32.
+	cvt.f32.bf16 %r14, %rs1;
+	mov.b32 	%f5, %r14;
+	cvt.f32.bf16 %r15, %rs2;
+	mov.b32 	%f6, %r15;
+	cvt.f32.bf16 %r16, %rs3;
+	mov.b32 	%f7, %r16;
+	cvt.f32.bf16 %r17, %rs4;
+	mov.b32 	%f8, %r17;
+	.loc	1 32 31
+	// param_3 is indexed by the in-row offset only (no CTA offset).
+	mul.wide.u32 	%rd15, %r55, 4;
+	add.s64 	%rd3, %rd10, %rd15;
+	.loc	1 32 36
+	mov.u32 %r18, 0x0;
+	mov.u32 %r19, 0x0;
+	mov.u32 %r20, 0x0;
+	mov.u32 %r21, 0x0;
+	@%p1 ld.global.L1::evict_last.v4.b32 { %r18, %r19, %r20, %r21 }, [ %rd3 + 0 ];
+	@!%p1 mov.u32 %r18, %r6;
+	@!%p1 mov.u32 %r19, %r6;
+	@!%p1 mov.u32 %r20, %r6;
+	@!%p1 mov.u32 %r21, %r6;
+	.loc	1 34 18
+	// %f9..%f12 = element-wise sum of the two inputs.
+	add.f32 	%f9, %f5, %f1;
+	add.f32 	%f10, %f6, %f2;
+	add.f32 	%f11, %f7, %f3;
+	add.f32 	%f12, %f8, %f4;
+$L__tmp1:
+	.loc	2 233 15
+	// First reduction (sum): per-thread partial sum of its 4 elements.
+	add.f32 	%f13, %f9, %f10;
+	add.f32 	%f14, %f13, %f11;
+	add.f32 	%f15, %f14, %f12;
+$L__tmp2:
+	.loc	2 243 36
+	// Intra-warp butterfly shuffles with offsets 16, 8, 4, 2, 1.
+	mov.b32 	%r58, %f15;
+	shfl.sync.bfly.b32	%r59, %r58, 16, 31, -1;
+	mov.b32 	%f16, %r59;
+$L__tmp3:
+	.loc	2 233 15
+	add.f32 	%f17, %f15, %f16;
+$L__tmp4:
+	.loc	2 243 36
+	mov.b32 	%r60, %f17;
+	shfl.sync.bfly.b32	%r61, %r60, 8, 31, -1;
+	mov.b32 	%f18, %r61;
+$L__tmp5:
+	.loc	2 233 15
+	add.f32 	%f19, %f17, %f18;
+$L__tmp6:
+	.loc	2 243 36
+	mov.b32 	%r62, %f19;
+	shfl.sync.bfly.b32	%r63, %r62, 4, 31, -1;
+	mov.b32 	%f20, %r63;
+$L__tmp7:
+	.loc	2 233 15
+	add.f32 	%f21, %f19, %f20;
+$L__tmp8:
+	.loc	2 243 36
+	mov.b32 	%r64, %f21;
+	shfl.sync.bfly.b32	%r65, %r64, 2, 31, -1;
+	mov.b32 	%f22, %r65;
+$L__tmp9:
+	.loc	2 233 15
+	add.f32 	%f23, %f21, %f22;
+$L__tmp10:
+	.loc	2 243 36
+	mov.b32 	%r66, %f23;
+	shfl.sync.bfly.b32	%r67, %r66, 1, 31, -1;
+	mov.b32 	%f24, %r67;
+$L__tmp11:
+	.loc	2 233 15
+	add.f32 	%f25, %f23, %f24;
+$L__tmp12:
+	.loc	2 243 36
+	// Lane 0 of each warp (%p14) writes its warp total to shared memory at
+	// byte offset (tid >> 3) & 4, i.e. 0 for threads 0-31 and 4 for 32-63.
+	setp.eq.s32 	%p14, %r53, 0;
+	shr.u32 	%r68, %r52, 3;
+	and.b32  	%r69, %r68, 4;
+	mov.u32 	%r70, global_smem;
+	add.s32 	%r26, %r70, %r69;
+	mov.b32 	%r27, %f25;
+	@%p14 st.shared.b32 [ %r26 + 0 ], %r27;
+	bar.sync 	0;
+	// Threads with tid < 2 (%p15) read the two warp totals back and combine
+	// them with one more shuffle.
+	setp.lt.s32 	%p15, %r52, 2;
+	shl.b32 	%r71, %r52, 2;
+	add.s32 	%r29, %r70, %r71;
+	@%p15 ld.shared.b32 %r28, [ %r29 + 0 ];
+	mov.b32 	%f26, %r28;
+	shfl.sync.bfly.b32	%r72, %r28, 1, 31, -1;
+	mov.b32 	%f27, %r72;
+$L__tmp13:
+	.loc	2 233 15
+	add.f32 	%f28, %f26, %f27;
+$L__tmp14:
+	.loc	2 243 36
+	// %p16 = (tid < 2) && (tid & 1) == 0: only thread 0 writes the final sum.
+	and.b32  	%r73, %r52, 1;
+	setp.eq.b32 	%p23, %r73, 1;
+	not.pred 	%p24, %p23;
+	and.pred  	%p16, %p15, %p24;
+	mov.b32 	%r31, %f28;
+	@%p16 st.shared.b32 [ %r29 + 0 ], %r31;
+	bar.sync 	0;
+	ld.shared.f32 	%f29, [global_smem];
+$L__tmp15:
+	.loc	3 8 15
+	add.f32 	%f30, %f29, 0f00000000;
+$L__tmp16:
+	.loc	1 42 20
+	// Mean = sum / 256.0 (1132462080 == 0x43800000, the f32 bit pattern of 256.0).
+	mov.b32 	%r33, %f30;
+	mov.b32 	%r34, 1132462080;
+	div.full.f32 %r51, %r33, %r34;
+	mov.b32 	%f31, %r51;
+	.loc	1 43 19
+	// Deviations from the mean.
+	sub.f32 	%f32, %f9, %f31;
+	sub.f32 	%f33, %f10, %f31;
+	sub.f32 	%f34, %f11, %f31;
+	sub.f32 	%f35, %f12, %f31;
+	.loc	1 44 20
+	mul.f32 	%f36, %f33, %f33;
+$L__tmp17:
+	.loc	2 243 36
+	bar.sync 	0;
+$L__tmp18:
+	.loc	2 233 15
+	// Second reduction (sum of squared deviations), same shape as the first;
+	// the per-thread partial is accumulated with fused multiply-adds.
+	fma.rn.f32 	%f37, %f32, %f32, %f36;
+	fma.rn.f32 	%f38, %f34, %f34, %f37;
+	fma.rn.f32 	%f39, %f35, %f35, %f38;
+$L__tmp19:
+	.loc	2 243 36
+	mov.b32 	%r74, %f39;
+	shfl.sync.bfly.b32	%r75, %r74, 16, 31, -1;
+	mov.b32 	%f40, %r75;
+$L__tmp20:
+	.loc	2 233 15
+	add.f32 	%f41, %f39, %f40;
+$L__tmp21:
+	.loc	2 243 36
+	mov.b32 	%r76, %f41;
+	shfl.sync.bfly.b32	%r77, %r76, 8, 31, -1;
+	mov.b32 	%f42, %r77;
+$L__tmp22:
+	.loc	2 233 15
+	add.f32 	%f43, %f41, %f42;
+$L__tmp23:
+	.loc	2 243 36
+	mov.b32 	%r78, %f43;
+	shfl.sync.bfly.b32	%r79, %r78, 4, 31, -1;
+	mov.b32 	%f44, %r79;
+$L__tmp24:
+	.loc	2 233 15
+	add.f32 	%f45, %f43, %f44;
+$L__tmp25:
+	.loc	2 243 36
+	mov.b32 	%r80, %f45;
+	shfl.sync.bfly.b32	%r81, %r80, 2, 31, -1;
+	mov.b32 	%f46, %r81;
+$L__tmp26:
+	.loc	2 233 15
+	add.f32 	%f47, %f45, %f46;
+$L__tmp27:
+	.loc	2 243 36
+	mov.b32 	%r82, %f47;
+	shfl.sync.bfly.b32	%r83, %r82, 1, 31, -1;
+	mov.b32 	%f48, %r83;
+$L__tmp28:
+	.loc	2 233 15
+	add.f32 	%f49, %f47, %f48;
+$L__tmp29:
+	.loc	2 243 36
+	mov.b32 	%r36, %f49;
+	@%p14 st.shared.b32 [ %r26 + 0 ], %r36;
+	bar.sync 	0;
+	@%p15 ld.shared.b32 %r37, [ %r29 + 0 ];
+	mov.b32 	%f50, %r37;
+	shfl.sync.bfly.b32	%r84, %r37, 1, 31, -1;
+	mov.b32 	%f51, %r84;
+$L__tmp30:
+	.loc	2 233 15
+	add.f32 	%f52, %f50, %f51;
+$L__tmp31:
+	.loc	2 243 36
+	mov.b32 	%r40, %f52;
+	@%p16 st.shared.b32 [ %r29 + 0 ], %r40;
+	bar.sync 	0;
+	ld.shared.f32 	%f53, [global_smem];
+$L__tmp32:
+	.loc	3 8 15
+	add.f32 	%f54, %f53, 0f00000000;
+$L__tmp33:
+	.loc	1 49 20
+	// Divide the sum of squared deviations by the same 256.0 constant.
+	mov.b32 	%r42, %f54;
+	div.full.f32 %r41, %r42, %r34;
+	mov.b32 	%f55, %r41;
+	.loc	1 51 20
+	// Add the constant 0f3727C5AC before taking the reciprocal square root.
+	// NOTE(review): presumably the f32 encoding of 1e-5 (an epsilon); confirm.
+	add.f32 	%f56, %f55, 0f3727C5AC;
+	.loc	1 52 26
+	rsqrt.approx.ftz.f32 	%f57, %f56;
+	.loc	1 32 36
+	mov.b32 	%f58, %r21;
+	mov.b32 	%f59, %r20;
+	mov.b32 	%f60, %r19;
+	mov.b32 	%f61, %r18;
+	.loc	1 54 20
+	// Scale the deviations by the rsqrt value ...
+	mul.f32 	%f62, %f32, %f57;
+	mul.f32 	%f63, %f33, %f57;
+	mul.f32 	%f64, %f34, %f57;
+	mul.f32 	%f65, %f35, %f57;
+	.loc	1 55 20
+	// ... and then by the values loaded from param_3.
+	mul.f32 	%f66, %f62, %f61;
+	mul.f32 	%f67, %f63, %f60;
+	mul.f32 	%f68, %f64, %f59;
+	mul.f32 	%f69, %f65, %f58;
+	.loc	1 57 4
+	bar.sync 	0;
+	.loc	1 58 28
+	mul.wide.s32 	%rd16, %r1, 4;
+	add.s64 	%rd4, %rd7, %rd16;
+	.loc	1 58 40
+	// Only the thread with (tid & 63) == 0 (%p20) writes the per-CTA scalars.
+	setp.eq.s32 	%p20, %r54, 0;
+	mov.b32 	%r44, %f57;
+	@%p20 st.global.b32 [ %rd4 + 0 ], { %r44 };
+	.loc	1 59 25
+	add.s64 	%rd5, %rd12, %rd14;
+	.loc	1 59 48
+	// Convert the 4 results to bf16, pack them into two 32-bit words and
+	// store them to param_5 with one vector store.
+	mov.b32 	%r45, %f66;
+	cvt.rn.bf16.f32 %rs5, %r45;
+	mov.b32 	%r46, %f67;
+	cvt.rn.bf16.f32 %rs6, %r46;
+	mov.b32 	%r47, %f68;
+	cvt.rn.bf16.f32 %rs7, %r47;
+	mov.b32 	%r48, %f69;
+	cvt.rn.bf16.f32 %rs8, %r48;
+	mov.b32 	%r85, {%rs5, %rs6};
+	mov.b32 	%r86, {%rs7, %rs8};
+	@%p1 st.global.v2.b32 [ %rd5 + 0 ], { %r85, %r86 };
+	.loc	1 60 25
+	add.s64 	%rd6, %rd11, %rd16;
+	.loc	1 60 37
+	// %r51 still holds the mean computed by the first division.
+	@%p20 st.global.b32 [ %rd6 + 0 ], { %r51 };
+	.loc	1 60 4
+	ret;
+$L__tmp34:
+$L__func_end0:
+}
+	// .globl	__nv_rsqrtf
+	// Device function: returns rsqrt.approx.ftz.f32 of its single f32 parameter.
+.visible .func  (.param .b32 func_retval0) __nv_rsqrtf(
+	.param .b32 __nv_rsqrtf_param_0
+)
+{
+	.reg .f32 	%f<3>;
+$L__func_begin1:
+	ld.param.f32 	%f1, [__nv_rsqrtf_param_0];
+	// Approximate reciprocal square root with the .ftz modifier.
+	rsqrt.approx.ftz.f32 	%f2, %f1;
+	st.param.f32 	[func_retval0+0], %f2;
+	ret;
+$L__func_end1:
+}
+	.file	1 "/tmp/torchinductor_root/w3/cw35gljjtatzr2ztskwlxndj2nreiih7r3vg5rw4douyaxccqgij.py"
+	.file	2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
+	.file	3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
+	.section	.debug_abbrev
+	{
+.b8 1
+.b8 17
+.b8 1
+.b8 37
+.b8 8
+.b8 19
+.b8 5
+.b8 3
+.b8 8
+.b8 16
+.b8 6
+.b8 27
+.b8 8
+.b8 180
+.b8 66
+.b8 12
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+.b8 0
+.b8 2
+.b8 46
+.b8 0
+.b8 135
+.b8 64
+.b8 8
+.b8 3
+.b8 8
+.b8 58
+.b8 11
+.b8 59
+.b8 11
+.b8 63
+.b8 12
+.b8 32
+.b8 11
+.b8 0
+.b8 0
+.b8 3
+.b8 46
+.b8 1
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 64
+.b8 10
+.b8 49
+.b8 19
+.b8 0
+.b8 0
+.b8 4
+.b8 29
+.b8 1
+.b8 49
+.b8 19
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 88
+.b8 11
+.b8 89
+.b8 11
+.b8 87
+.b8 11
+.b8 0
+.b8 0
+.b8 5
+.b8 29
+.b8 0
+.b8 49
+.b8 19
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 88
+.b8 11
+.b8 89
+.b8 11
+.b8 87
+.b8 11
+.b8 0
+.b8 0
+.b8 0
+	}
+	.section	.debug_info
+	{
+.b32 399
+.b8 2
+.b8 0
+.b32 .debug_abbrev
+.b8 8
+.b8 1
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2
+.b8 0
+.b8 99
+.b8 119
+.b8 51
+.b8 53
+.b8 103
+.b8 108
+.b8 106
+.b8 106
+.b8 116
+.b8 97
+.b8 116
+.b8 122
+.b8 114
+.b8 50
+.b8 122
+.b8 116
+.b8 115
+.b8 107
+.b8 119
+.b8 108
+.b8 120
+.b8 110
+.b8 100
+.b8 106
+.b8 50
+.b8 110
+.b8 114
+.b8 101
+.b8 105
+.b8 105
+.b8 104
+.b8 55
+.b8 114
+.b8 51
+.b8 118
+.b8 103
+.b8 53
+.b8 114
+.b8 119
+.b8 52
+.b8 100
+.b8 111
+.b8 117
+.b8 121
+.b8 97
+.b8 120
+.b8 99
+.b8 99
+.b8 113
+.b8 103
+.b8 105
+.b8 106
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line
+.b8 47
+.b8 116
+.b8 109
+.b8 112
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 95
+.b8 114
+.b8 111
+.b8 111
+.b8 116
+.b8 47
+.b8 119
+.b8 51
+.b8 0
+.b8 1
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 2
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 51
+.b8 100
+.b8 52
+.b8 100
+.b8 53
+.b8 100
+.b8 54
+.b8 100
+.b8 101
+.b8 55
+.b8 100
+.b8 101
+.b8 0
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 51
+.b8 100
+.b8 52
+.b8 100
+.b8 53
+.b8 100
+.b8 54
+.b8 100
+.b8 101
+.b8 55
+.b8 100
+.b8 101
+.b8 0
+.b8 1
+.b8 18
+.b8 1
+.b8 1
+.b8 3
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 1
+.b8 156
+.b32 125
+.b8 4
+.b32 125
+.b64 $L__tmp1
+.b64 $L__tmp14
+.b8 2
+.b8 39
+.b8 58
+.b8 5
+.b32 125
+.b64 $L__tmp1
+.b64 $L__tmp14
+.b8 2
+.b8 243
+.b8 36
+.b8 0
+.b8 5
+.b32 125
+.b64 $L__tmp2
+.b64 $L__tmp15
+.b8 2
+.b8 39
+.b8 58
+.b8 5
+.b32 125
+.b64 $L__tmp15
+.b64 $L__tmp16
+.b8 3
+.b8 39
+.b8 45
+.b8 5
+.b32 125
+.b64 $L__tmp17
+.b64 $L__tmp32
+.b8 2
+.b8 47
+.b8 59
+.b8 4
+.b32 125
+.b64 $L__tmp18
+.b64 $L__tmp31
+.b8 2
+.b8 47
+.b8 59
+.b8 5
+.b32 125
+.b64 $L__tmp18
+.b64 $L__tmp31
+.b8 2
+.b8 243
+.b8 36
+.b8 0
+.b8 5
+.b32 125
+.b64 $L__tmp32
+.b64 $L__tmp33
+.b8 3
+.b8 47
+.b8 45
+.b8 0
+.b8 0
+	}
+	.section	.debug_pubnames
+	{
+.b32 $L__pubNames_end0-$L__pubNames_start0
+$L__pubNames_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 403
+.b32 125
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 51
+.b8 100
+.b8 52
+.b8 100
+.b8 53
+.b8 100
+.b8 54
+.b8 100
+.b8 101
+.b8 55
+.b8 100
+.b8 101
+.b8 0
+.b32 0
+$L__pubNames_end0:
+	}
+	.section	.debug_pubtypes
+	{
+.b32 $L__pubTypes_end0-$L__pubTypes_start0
+$L__pubTypes_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 403
+.b32 0
+$L__pubTypes_end0:
+	}
+	.section	.debug_loc	{	}

.triton/dump/ab89f85e55d5ddd9676325b49df9419f/triton_.ttgir ADDED Viewed

	@@ -0,0 +1,68 @@

+// Layouts: #blocked gives each thread 4 contiguous elements, #blocked1 gives
+// each thread 1 element; both use 32 threads per warp and 2 warps per CTA.
+#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
+#blocked1 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
+module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
+  // Per program: sums two 256-element inputs (%arg1 f32, %arg2 bf16), subtracts
+  // the mean, scales by rsqrt(variance + %cst_0) and by the f32 vector at
+  // %arg3, then writes the bf16 result to %arg5. The rsqrt value goes to
+  // %arg0[program_id] and the mean to %arg4[program_id].
+  // %arg6 and %arg7 are not used in the body.
+  // NOTE(review): the arithmetic looks like a fused add + layer normalisation
+  // without a bias term; this is inferred from the ops only, confirm against the
+  // generating Python kernel.
+  tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+    %cst = arith.constant dense<256> : tensor<256xi32, #blocked>
+    %cst_0 = arith.constant 9.99999974E-6 : f32
+    %cst_1 = arith.constant 2.560000e+02 : f32
+    %cst_2 = arith.constant 0.000000e+00 : f32
+    %c256_i32 = arith.constant 256 : i32
+    %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked>
+    %cst_4 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked>
+    %0 = tt.get_program_id x : i32
+    %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked>
+    // Mask: range value < 256. The range is [0, 256), so it holds for every lane.
+    %2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked>
+    // %5 = in-row index + program_id * 256: offsets into the per-program inputs.
+    %3 = arith.muli %0, %c256_i32 : i32
+    %4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked>
+    %5 = arith.addi %1, %4 : tensor<256xi32, #blocked>
+    %6 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
+    %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
+    %8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
+    // The bf16 input is widened to f32 before any arithmetic.
+    %9 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
+    %10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
+    %11 = tt.load %10, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
+    %12 = arith.extf %11 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
+    // %arg3 is indexed by %1 only (no program offset), so every program reads
+    // the same 256 values; this load uses evict = 3 instead of 1.
+    %13 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
+    %14 = tt.addptr %13, %1 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
+    %15 = tt.load %14, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked>
+    // %16 = sum of the two inputs.
+    %16 = arith.addf %8, %12 : tensor<256xf32, #blocked>
+    // Mean: add-reduce %16 over the 256 elements, then divide by 256.0.
+    %17 = arith.select %2, %16, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
+    %18 = "tt.reduce"(%17) <{axis = 0 : i32}> ({
+    ^bb0(%arg8: f32, %arg9: f32):
+      %42 = arith.addf %arg8, %arg9 : f32
+      tt.reduce.return %42 : f32
+    }) : (tensor<256xf32, #blocked>) -> f32
+    %19 = arith.addf %18, %cst_2 : f32
+    %20 = arith.divf %19, %cst_1 : f32
+    %21 = tt.splat %20 : (f32) -> tensor<1xf32, #blocked1>
+    %22 = tt.splat %20 : (f32) -> tensor<256xf32, #blocked>
+    // Variance: add-reduce the squared deviations from the mean, divide by 256.0.
+    %23 = arith.subf %16, %22 : tensor<256xf32, #blocked>
+    %24 = arith.mulf %23, %23 : tensor<256xf32, #blocked>
+    %25 = arith.select %2, %24, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
+    %26 = "tt.reduce"(%25) <{axis = 0 : i32}> ({
+    ^bb0(%arg8: f32, %arg9: f32):
+      %42 = arith.addf %arg8, %arg9 : f32
+      tt.reduce.return %42 : f32
+    }) : (tensor<256xf32, #blocked>) -> f32
+    %27 = arith.addf %26, %cst_2 : f32
+    %28 = arith.divf %27, %cst_1 : f32
+    // %30 = rsqrt(variance + %cst_0), computed by libdevice's __nv_rsqrtf.
+    %29 = arith.addf %28, %cst_0 : f32
+    %30 = tt.extern_elementwise %29 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
+    %31 = tt.splat %30 : (f32) -> tensor<1xf32, #blocked1>
+    %32 = tt.splat %30 : (f32) -> tensor<256xf32, #blocked>
+    // Normalise the deviations and scale them element-wise by the %arg3 values.
+    %33 = arith.mulf %23, %32 : tensor<256xf32, #blocked>
+    %34 = arith.mulf %33, %15 : tensor<256xf32, #blocked>
+    gpu.barrier
+    // Outputs: rsqrt -> %arg0[program_id], bf16 result -> %arg5[%5],
+    // mean -> %arg4[program_id].
+    %35 = tt.addptr %arg0, %0 : !tt.ptr<f32, 1>, i32
+    %36 = tt.splat %35 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked1>
+    tt.store %36, %31 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32, #blocked1>
+    %37 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
+    %38 = tt.addptr %37, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
+    %39 = arith.truncf %34 : tensor<256xf32, #blocked> to tensor<256xbf16, #blocked>
+    tt.store %38, %39, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16, #blocked>
+    %40 = tt.addptr %arg4, %0 : !tt.ptr<f32, 1>, i32
+    %41 = tt.splat %40 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked1>
+    tt.store %41, %21 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32, #blocked1>
+    tt.return
+  }
+}

.triton/dump/be28ee3793470d9803546f03e49c3edf/triton_.cubin ADDED Viewed

Binary file (17.5 kB). View file

.triton/dump/be28ee3793470d9803546f03e49c3edf/triton_.ttgir ADDED Viewed

	@@ -0,0 +1,85 @@

+#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
+#blocked1 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
+module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
+  tt.func public @triton__0d1d2d3d4d5d6d7d8d9d10de11de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg7: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg8: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg9: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg10: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg11: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+    %cst = arith.constant dense<256> : tensor<256xi32, #blocked>
+    %cst_0 = arith.constant 9.99999974E-6 : f32
+    %cst_1 = arith.constant 2.560000e+02 : f32
+    %cst_2 = arith.constant 0.000000e+00 : f32
+    %c256_i32 = arith.constant 256 : i32
+    %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked>
+    %cst_4 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked>
+    %0 = tt.get_program_id x : i32
+    %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked>
+    %2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked>
+    %3 = arith.muli %0, %c256_i32 : i32
+    %4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked>
+    %5 = arith.addi %1, %4 : tensor<256xi32, #blocked>
+    %6 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
+    %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
+    %8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
+    %9 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
+    %10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
+    %11 = tt.load %10, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
+    %12 = arith.extf %11 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
+    %13 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
+    %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
+    %15 = tt.load %14, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
+    %16 = arith.extf %15 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
+    %17 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
+    %18 = tt.addptr %17, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
+    %19 = tt.load %18, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
+    %20 = arith.extf %19 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
+    %21 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
+    %22 = tt.addptr %21, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
+    %23 = tt.load %22, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
+    %24 = arith.extf %23 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
+    %25 = tt.splat %arg6 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
+    %26 = tt.addptr %25, %1 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
+    %27 = tt.load %26, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked>
+    %28 = arith.addf %8, %12 : tensor<256xf32, #blocked>
+    %29 = arith.addf %28, %16 : tensor<256xf32, #blocked>
+    %30 = arith.addf %29, %20 : tensor<256xf32, #blocked>
+    %31 = arith.addf %30, %24 : tensor<256xf32, #blocked>
+    %32 = arith.select %2, %31, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
+    %33 = "tt.reduce"(%32) <{axis = 0 : i32}> ({
+    ^bb0(%arg12: f32, %arg13: f32):
+      %58 = arith.addf %arg12, %arg13 : f32
+      tt.reduce.return %58 : f32
+    }) : (tensor<256xf32, #blocked>) -> f32
+    %34 = arith.addf %33, %cst_2 : f32
+    %35 = arith.divf %34, %cst_1 : f32
+    %36 = tt.splat %35 : (f32) -> tensor<1xf32, #blocked1>
+    %37 = tt.splat %35 : (f32) -> tensor<256xf32, #blocked>
+    %38 = arith.subf %31, %37 : tensor<256xf32, #blocked>
+    %39 = arith.mulf %38, %38 : tensor<256xf32, #blocked>
+    %40 = arith.select %2, %39, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
+    %41 = "tt.reduce"(%40) <{axis = 0 : i32}> ({
+    ^bb0(%arg12: f32, %arg13: f32):
+      %58 = arith.addf %arg12, %arg13 : f32
+      tt.reduce.return %58 : f32
+    }) : (tensor<256xf32, #blocked>) -> f32
+    %42 = arith.addf %41, %cst_2 : f32
+    %43 = arith.divf %42, %cst_1 : f32
+    %44 = arith.addf %43, %cst_0 : f32
+    %45 = tt.extern_elementwise %44 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
+    %46 = tt.splat %45 : (f32) -> tensor<1xf32, #blocked1>
+    %47 = tt.splat %45 : (f32) -> tensor<256xf32, #blocked>
+    %48 = arith.mulf %38, %47 : tensor<256xf32, #blocked>
+    %49 = arith.mulf %48, %27 : tensor<256xf32, #blocked>
+    %50 = tt.splat %arg7 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
+    %51 = tt.addptr %50, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
+    tt.store %51, %31, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked>
+    gpu.barrier
+    %52 = tt.addptr %arg0, %0 : !tt.ptr<f32, 1>, i32
+    %53 = tt.splat %52 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked1>
+    tt.store %53, %46 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32, #blocked1>
+    %54 = tt.splat %arg9 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
+    %55 = tt.addptr %54, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
+    tt.store %55, %49, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked>
+    %56 = tt.addptr %arg8, %0 : !tt.ptr<f32, 1>, i32
+    %57 = tt.splat %56 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked1>
+    tt.store %57, %36 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32, #blocked1>
+    tt.return
+  }
+}

.triton/dump/c0c34db8f5ff22b79fc971c63187477a/triton_.cubin ADDED Viewed

Binary file (17.8 kB). View file

.triton/dump/c0c34db8f5ff22b79fc971c63187477a/triton_.ttir ADDED Viewed

	@@ -0,0 +1,84 @@

+module {
+  tt.func public @triton__0d1d2d3d4d5d6d7d8d9d10de11de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg7: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg8: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg9: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg10: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg11: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+    %c256_i32 = arith.constant 256 : i32
+    %cst = arith.constant dense<0.000000e+00> : tensor<256xbf16>
+    %cst_0 = arith.constant 0.000000e+00 : f32
+    %cst_1 = arith.constant 2.560000e+02 : f32
+    %cst_2 = arith.constant 9.99999974E-6 : f32
+    %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32>
+    %cst_4 = arith.constant dense<256> : tensor<256xi32>
+    %0 = tt.get_program_id x : i32
+    %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
+    %2 = arith.cmpi slt, %1, %cst_4 : tensor<256xi32>
+    %3 = arith.muli %0, %c256_i32 : i32
+    %4 = tt.splat %3 : (i32) -> tensor<256xi32>
+    %5 = arith.addi %1, %4 : tensor<256xi32>
+    %6 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
+    %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
+    %8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32>
+    %9 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
+    %10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
+    %11 = tt.load %10, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
+    %12 = arith.extf %11 : tensor<256xbf16> to tensor<256xf32>
+    %13 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
+    %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
+    %15 = tt.load %14, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
+    %16 = arith.extf %15 : tensor<256xbf16> to tensor<256xf32>
+    %17 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
+    %18 = tt.addptr %17, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
+    %19 = tt.load %18, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
+    %20 = arith.extf %19 : tensor<256xbf16> to tensor<256xf32>
+    %21 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
+    %22 = tt.addptr %21, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
+    %23 = tt.load %22, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
+    %24 = arith.extf %23 : tensor<256xbf16> to tensor<256xf32>
+    %25 = tt.splat %arg6 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
+    %26 = tt.addptr %25, %1 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
+    %27 = tt.load %26, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32>
+    %28 = arith.addf %8, %12 : tensor<256xf32>
+    %29 = arith.addf %28, %16 : tensor<256xf32>
+    %30 = arith.addf %29, %20 : tensor<256xf32>
+    %31 = arith.addf %30, %24 : tensor<256xf32>
+    %32 = arith.select %2, %31, %cst_3 : tensor<256xi1>, tensor<256xf32>
+    %33 = "tt.reduce"(%32) <{axis = 0 : i32}> ({
+    ^bb0(%arg12: f32, %arg13: f32):
+      %59 = arith.addf %arg12, %arg13 : f32
+      tt.reduce.return %59 : f32
+    }) : (tensor<256xf32>) -> f32
+    %34 = arith.addf %33, %cst_0 : f32
+    %35 = arith.divf %34, %cst_1 : f32
+    %36 = tt.splat %35 : (f32) -> tensor<1xf32>
+    %37 = tt.splat %35 : (f32) -> tensor<256xf32>
+    %38 = arith.subf %31, %37 : tensor<256xf32>
+    %39 = arith.mulf %38, %38 : tensor<256xf32>
+    %40 = arith.select %2, %39, %cst_3 : tensor<256xi1>, tensor<256xf32>
+    %41 = "tt.reduce"(%40) <{axis = 0 : i32}> ({
+    ^bb0(%arg12: f32, %arg13: f32):
+      %59 = arith.addf %arg12, %arg13 : f32
+      tt.reduce.return %59 : f32
+    }) : (tensor<256xf32>) -> f32
+    %42 = arith.addf %41, %cst_0 : f32
+    %43 = arith.divf %42, %cst_1 : f32
+    %44 = arith.addf %43, %cst_2 : f32
+    %45 = tt.extern_elementwise %44 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
+    %46 = tt.splat %45 : (f32) -> tensor<1xf32>
+    %47 = tt.splat %45 : (f32) -> tensor<256xf32>
+    %48 = arith.mulf %38, %47 : tensor<256xf32>
+    %49 = arith.mulf %48, %27 : tensor<256xf32>
+    %50 = tt.splat %arg7 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
+    %51 = tt.addptr %50, %5 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
+    tt.store %51, %31, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32>
+    gpu.barrier
+    %52 = tt.addptr %arg0, %0 : !tt.ptr<f32, 1>, i32
+    %53 = tt.splat %52 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>>
+    tt.store %53, %46 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32>
+    %54 = tt.splat %arg9 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
+    %55 = tt.addptr %54, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
+    %56 = arith.truncf %49 : tensor<256xf32> to tensor<256xbf16>
+    tt.store %55, %56, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16>
+    %57 = tt.addptr %arg8, %0 : !tt.ptr<f32, 1>, i32
+    %58 = tt.splat %57 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>>
+    tt.store %58, %36 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32>
+    tt.return
+  }
+}

.triton/dump/c0db4dd81e5aac83500e3ccf67d3896d/triton_.llir ADDED Viewed

	@@ -0,0 +1,53 @@

+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !5 {
+  %4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
+  %5 = shl i32 %4, 1, !dbg !8
+  %6 = and i32 %5, 510, !dbg !8
+  %7 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9
+  %8 = shl i32 %7, 9, !dbg !10
+  %9 = or i32 %8, %6, !dbg !11
+  %10 = sext i32 %9 to i64, !dbg !12
+  %11 = getelementptr i16, ptr addrspace(1) %0, i64 %10, !dbg !12
+  %12 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %11, i1 true) #1, !dbg !13
+  %13 = trunc i32 %12 to i16, !dbg !13
+  %extelt.offset = lshr i32 %12, 16, !dbg !13
+  %14 = trunc i32 %extelt.offset to i16, !dbg !13
+  %15 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %13) #1, !dbg !14
+  %16 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %14) #1, !dbg !14
+  %17 = getelementptr float, ptr addrspace(1) %1, i64 %10, !dbg !15
+  %18 = bitcast float %15 to i32, !dbg !16
+  %19 = bitcast float %16 to i32, !dbg !16
+  tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %18, i32 %19, ptr addrspace(1) %17, i1 true) #1, !dbg !16
+  ret void, !dbg !17
+}
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
+attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #1 = { nounwind }
+!llvm.module.flags = !{!0}
+!llvm.dbg.cu = !{!1}
+!nvvm.annotations = !{!3, !4, !4, !3}
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!2 = !DIFile(filename: "cyamhdbxtmf4rgres6uo7orhfzw3ryhsvm5qzdvyqgggck2hqbyi.py", directory: "/tmp/torchinductor_root/ya")
+!3 = !{ptr @triton__0d1d2de, !"kernel", i32 1}
+!4 = !{ptr @triton__0d1d2de, !"maxntidx", i32 256}
+!5 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
+!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
+!7 = !{}
+!8 = !DILocation(line: 21, column: 36, scope: !5)
+!9 = !DILocation(line: 20, column: 28, scope: !5)
+!10 = !DILocation(line: 20, column: 33, scope: !5)
+!11 = !DILocation(line: 21, column: 23, scope: !5)
+!12 = !DILocation(line: 24, column: 30, scope: !5)
+!13 = !DILocation(line: 24, column: 35, scope: !5)
+!14 = !DILocation(line: 24, column: 44, scope: !5)
+!15 = !DILocation(line: 26, column: 25, scope: !5)
+!16 = !DILocation(line: 26, column: 36, scope: !5)
+!17 = !DILocation(line: 26, column: 4, scope: !5)

.triton/dump/d7a12c0ba96f8920b8147157303ee99f/triton_.ttir ADDED Viewed

	@@ -0,0 +1,62 @@

+module {
+  tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+    %c256_i32 = arith.constant 256 : i32
+    %cst = arith.constant dense<0.000000e+00> : tensor<256xbf16>
+    %cst_0 = arith.constant 0.000000e+00 : f32
+    %cst_1 = arith.constant 2.560000e+02 : f32
+    %cst_2 = arith.constant 9.99999974E-6 : f32
+    %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32>
+    %cst_4 = arith.constant dense<256> : tensor<256xi32>
+    %0 = tt.get_program_id x : i32
+    %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
+    %2 = arith.cmpi slt, %1, %cst_4 : tensor<256xi32>
+    %3 = arith.muli %0, %c256_i32 : i32
+    %4 = tt.splat %3 : (i32) -> tensor<256xi32>
+    %5 = arith.addi %1, %4 : tensor<256xi32>
+    %6 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
+    %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
+    %8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32>
+    %9 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
+    %10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
+    %11 = tt.load %10, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
+    %12 = arith.extf %11 : tensor<256xbf16> to tensor<256xf32>
+    %13 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
+    %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
+    %15 = tt.load %14, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
+    %16 = arith.extf %15 : tensor<256xbf16> to tensor<256xf32>
+    %17 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
+    %18 = tt.addptr %17, %1 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
+    %19 = tt.load %18, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32>
+    %20 = arith.addf %8, %12 : tensor<256xf32>
+    %21 = arith.addf %20, %16 : tensor<256xf32>
+    %22 = arith.select %2, %21, %cst_3 : tensor<256xi1>, tensor<256xf32>
+    %23 = "tt.reduce"(%22) <{axis = 0 : i32}> ({
+    ^bb0(%arg7: f32, %arg8: f32):
+      %41 = arith.addf %arg7, %arg8 : f32
+      tt.reduce.return %41 : f32
+    }) : (tensor<256xf32>) -> f32
+    %24 = arith.addf %23, %cst_0 : f32
+    %25 = arith.divf %24, %cst_1 : f32
+    %26 = tt.splat %25 : (f32) -> tensor<256xf32>
+    %27 = arith.subf %21, %26 : tensor<256xf32>
+    %28 = arith.mulf %27, %27 : tensor<256xf32>
+    %29 = arith.select %2, %28, %cst_3 : tensor<256xi1>, tensor<256xf32>
+    %30 = "tt.reduce"(%29) <{axis = 0 : i32}> ({
+    ^bb0(%arg7: f32, %arg8: f32):
+      %41 = arith.addf %arg7, %arg8 : f32
+      tt.reduce.return %41 : f32
+    }) : (tensor<256xf32>) -> f32
+    %31 = arith.addf %30, %cst_0 : f32
+    %32 = arith.divf %31, %cst_1 : f32
+    %33 = arith.addf %32, %cst_2 : f32
+    %34 = tt.extern_elementwise %33 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
+    %35 = tt.splat %34 : (f32) -> tensor<256xf32>
+    %36 = arith.mulf %27, %35 : tensor<256xf32>
+    %37 = arith.mulf %36, %19 : tensor<256xf32>
+    %38 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
+    %39 = tt.addptr %38, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
+    %40 = arith.truncf %37 : tensor<256xf32> to tensor<256xbf16>
+    tt.store %39, %40, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16>
+    tt.return
+  }
+}

.triton/dump/f5088324dcdcf6814f6743553c1321c2/triton_.llir ADDED Viewed

	@@ -0,0 +1,63 @@

+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !5 {
+  %4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
+  %5 = shl i32 %4, 2, !dbg !8
+  %6 = and i32 %5, 508, !dbg !8
+  %7 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9
+  %8 = shl i32 %7, 9, !dbg !10
+  %9 = or i32 %8, %6, !dbg !11
+  %10 = sext i32 %9 to i64, !dbg !12
+  %11 = getelementptr float, ptr addrspace(1) %0, i64 %10, !dbg !12
+  %12 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l,b"(ptr addrspace(1) %11, i1 true) #1, !dbg !13
+  %13 = extractvalue { i32, i32, i32, i32 } %12, 0, !dbg !13
+  %14 = extractvalue { i32, i32, i32, i32 } %12, 1, !dbg !13
+  %15 = extractvalue { i32, i32, i32, i32 } %12, 2, !dbg !13
+  %16 = extractvalue { i32, i32, i32, i32 } %12, 3, !dbg !13
+  %17 = bitcast i32 %13 to float, !dbg !13
+  %18 = bitcast i32 %14 to float, !dbg !13
+  %19 = bitcast i32 %15 to float, !dbg !13
+  %20 = bitcast i32 %16 to float, !dbg !13
+  %21 = getelementptr i16, ptr addrspace(1) %1, i64 %10, !dbg !14
+  %22 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %17) #1, !dbg !15
+  %23 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %18) #1, !dbg !15
+  %24 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %19) #1, !dbg !15
+  %25 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %20) #1, !dbg !15
+  %26 = insertelement <2 x i16> undef, i16 %22, i64 0, !dbg !15
+  %27 = insertelement <2 x i16> %26, i16 %23, i64 1, !dbg !15
+  %28 = bitcast <2 x i16> %27 to i32, !dbg !15
+  %29 = insertelement <2 x i16> undef, i16 %24, i64 0, !dbg !15
+  %30 = insertelement <2 x i16> %29, i16 %25, i64 1, !dbg !15
+  %31 = bitcast <2 x i16> %30 to i32, !dbg !15
+  tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %28, i32 %31, ptr addrspace(1) %21, i1 true) #1, !dbg !15
+  ret void, !dbg !16
+}
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
+attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #1 = { nounwind }
+!llvm.module.flags = !{!0}
+!llvm.dbg.cu = !{!1}
+!nvvm.annotations = !{!3, !4, !4, !3}
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!2 = !DIFile(filename: "cpqhcwm5bfrhuwddh4c4qks6bh7sovfbpfnmqhnm4h4w23icqnu6.py", directory: "/tmp/torchinductor_root/pq")
+!3 = !{ptr @triton__0d1d2de, !"kernel", i32 1}
+!4 = !{ptr @triton__0d1d2de, !"maxntidx", i32 128}
+!5 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
+!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
+!7 = !{}
+!8 = !DILocation(line: 21, column: 36, scope: !5)
+!9 = !DILocation(line: 20, column: 28, scope: !5)
+!10 = !DILocation(line: 20, column: 33, scope: !5)
+!11 = !DILocation(line: 21, column: 23, scope: !5)
+!12 = !DILocation(line: 24, column: 30, scope: !5)
+!13 = !DILocation(line: 24, column: 35, scope: !5)
+!14 = !DILocation(line: 26, column: 25, scope: !5)
+!15 = !DILocation(line: 26, column: 36, scope: !5)
+!16 = !DILocation(line: 26, column: 4, scope: !5)

.triton/dump/f5088324dcdcf6814f6743553c1321c2/triton_.ptx ADDED Viewed

	@@ -0,0 +1,300 @@

+//
+// Generated by LLVM NVPTX Back-End
+//
+.version 8.2
+.target sm_89
+.address_size 64
+	// .globl	triton__0d1d2de
+.visible .entry triton__0d1d2de(
+	.param .u64 triton__0d1d2de_param_0,
+	.param .u64 triton__0d1d2de_param_1,
+	.param .u32 triton__0d1d2de_param_2
+)
+.maxntid 128, 1, 1
+{
+	.reg .pred 	%p<3>;
+	.reg .b16 	%rs<5>;
+	.reg .b32 	%r<19>;
+	.reg .b64 	%rd<7>;
+	.loc	1 18 0
+$L__func_begin0:
+	.loc	1 18 0
+	ld.param.u64 	%rd3, [triton__0d1d2de_param_0];
+	ld.param.u64 	%rd4, [triton__0d1d2de_param_1];
+$L__tmp0:
+	.loc	1 21 36
+	mov.u32 	%r12, %tid.x;
+	shl.b32 	%r13, %r12, 2;
+	and.b32  	%r14, %r13, 508;
+	.loc	1 20 28
+	mov.u32 %r1, %ctaid.x;
+	.loc	1 20 33
+	shl.b32 	%r15, %r1, 9;
+	.loc	1 21 23
+	or.b32  	%r16, %r15, %r14;
+	.loc	1 24 30
+	mul.wide.s32 	%rd5, %r16, 4;
+	add.s64 	%rd1, %rd3, %rd5;
+	mov.pred 	%p1, -1;
+	.loc	1 24 35
+	mov.u32 %r6, 0x0;
+	mov.u32 %r7, 0x0;
+	mov.u32 %r8, 0x0;
+	mov.u32 %r9, 0x0;
+	@%p1 ld.global.v4.b32 { %r6, %r7, %r8, %r9 }, [ %rd1 + 0 ];
+	.loc	1 26 25
+	mul.wide.s32 	%rd6, %r16, 2;
+	add.s64 	%rd2, %rd4, %rd6;
+	.loc	1 26 36
+	cvt.rn.bf16.f32 %rs1, %r6;
+	cvt.rn.bf16.f32 %rs2, %r7;
+	cvt.rn.bf16.f32 %rs3, %r8;
+	cvt.rn.bf16.f32 %rs4, %r9;
+	mov.b32 	%r17, {%rs1, %rs2};
+	mov.b32 	%r18, {%rs3, %rs4};
+	@%p1 st.global.v2.b32 [ %rd2 + 0 ], { %r17, %r18 };
+	.loc	1 26 4
+	ret;
+$L__tmp1:
+$L__func_end0:
+}
+	.file	1 "/tmp/torchinductor_root/pq/cpqhcwm5bfrhuwddh4c4qks6bh7sovfbpfnmqhnm4h4w23icqnu6.py"
+	.section	.debug_abbrev
+	{
+.b8 1
+.b8 17
+.b8 1
+.b8 37
+.b8 8
+.b8 19
+.b8 5
+.b8 3
+.b8 8
+.b8 16
+.b8 6
+.b8 27
+.b8 8
+.b8 180
+.b8 66
+.b8 12
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+.b8 0
+.b8 2
+.b8 46
+.b8 0
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 64
+.b8 10
+.b8 135
+.b8 64
+.b8 8
+.b8 3
+.b8 8
+.b8 58
+.b8 11
+.b8 59
+.b8 11
+.b8 63
+.b8 12
+.b8 0
+.b8 0
+.b8 0
+	}
+	.section	.debug_info
+	{
+.b32 176
+.b8 2
+.b8 0
+.b32 .debug_abbrev
+.b8 8
+.b8 1
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2
+.b8 0
+.b8 99
+.b8 112
+.b8 113
+.b8 104
+.b8 99
+.b8 119
+.b8 109
+.b8 53
+.b8 98
+.b8 102
+.b8 114
+.b8 104
+.b8 117
+.b8 119
+.b8 100
+.b8 100
+.b8 104
+.b8 52
+.b8 99
+.b8 52
+.b8 113
+.b8 107
+.b8 115
+.b8 54
+.b8 98
+.b8 104
+.b8 55
+.b8 115
+.b8 111
+.b8 118
+.b8 102
+.b8 98
+.b8 112
+.b8 102
+.b8 110
+.b8 109
+.b8 113
+.b8 104
+.b8 110
+.b8 109
+.b8 52
+.b8 104
+.b8 52
+.b8 119
+.b8 50
+.b8 51
+.b8 105
+.b8 99
+.b8 113
+.b8 110
+.b8 117
+.b8 54
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line
+.b8 47
+.b8 116
+.b8 109
+.b8 112
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 95
+.b8 114
+.b8 111
+.b8 111
+.b8 116
+.b8 47
+.b8 112
+.b8 113
+.b8 0
+.b8 1
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 2
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 1
+.b8 156
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 101
+.b8 0
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 101
+.b8 0
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+	}
+	.section	.debug_pubnames
+	{
+.b32 $L__pubNames_end0-$L__pubNames_start0
+$L__pubNames_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 180
+.b32 125
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 101
+.b8 0
+.b32 0
+$L__pubNames_end0:
+	}
+	.section	.debug_pubtypes
+	{
+.b32 $L__pubTypes_end0-$L__pubTypes_start0
+$L__pubTypes_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 180
+.b32 0
+$L__pubTypes_end0:
+	}
+	.section	.debug_loc	{	}